1 diff -Naur linux-2002-09-30/drivers/evms/AIXlvm_vge.c evms-2002-09-30/drivers/evms/AIXlvm_vge.c
2 --- linux-2002-09-30/drivers/evms/AIXlvm_vge.c Wed Dec 31 18:00:00 1969
3 +++ evms-2002-09-30/drivers/evms/AIXlvm_vge.c Fri Sep 27 14:55:45 2002
10 + * Copyright (c) International Business Machines Corp., 2000
12 + * This program is free software; you can redistribute it and/or modify
13 + * it under the terms of the GNU General Public License as published by
14 + * the Free Software Foundation; either version 2 of the License, or
15 + * (at your option) any later version.
17 + * This program is distributed in the hope that it will be useful,
18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
20 + * the GNU General Public License for more details.
22 + * You should have received a copy of the GNU General Public License
23 + * along with this program; if not, write to the Free Software
24 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 + * linux/drivers/evms/AIXlvm_vge.c
31 + * EVMS AIX LVM Volume Group Emulator
37 +#define EVMS_AIX_DEBUG 1
39 +#define AIX_COMMON_SERVICES_MAJOR 0 // Required common services levels for the AIX kernel plugin
40 +#define AIX_COMMON_SERVICES_MINOR 5 // These must be incremented if new function is added to common
41 +#define AIX_COMMON_SERVICES_PATCHLEVEL 0 // services and the AIX kernel plugin uses the new function.
42 +#define AIX_INCREMENT_REQUEST 1
43 +#define AIX_DECREMENT_REQUEST -1
44 +#define AIX_RESYNC_BLOCKSIZE 512
45 +#define AIX_SYNC_INCOMPLETE 0x01
46 +#define AIX_SYNC_COMPLETE 0x00
48 +#define AIX_SLAVE_1 1
49 +#define AIX_SLAVE_2 2
51 +#include <linux/module.h>
52 +#include <linux/kernel.h>
53 +#include <linux/config.h>
55 +#include <linux/genhd.h>
56 +#include <linux/string.h>
57 +#include <linux/blk.h>
58 +#include <linux/init.h>
59 +#include <linux/slab.h>
61 +#include <linux/evms/evms.h>
62 +#include <linux/evms/evms_aix.h>
63 +#include <asm/system.h>
64 +#include <asm/uaccess.h>
66 +#include <linux/sched.h>
67 +#include <linux/smp_lock.h>
68 +#include <linux/locks.h>
69 +#include <linux/delay.h>
70 +#include <linux/reboot.h>
71 +#include <linux/completion.h>
72 +#include <linux/vmalloc.h>
74 +#ifdef EVMS_AIX_DEBUG
75 +static int AIX_volume_group_dump(void);
78 +static struct aix_volume_group *AIXVolumeGroupList = NULL;
79 +static struct evms_thread *AIX_mirror_read_retry_thread;
80 +static struct evms_thread *AIX_mirror_resync_thread;
81 +static struct evms_pool_mgmt *AIX_BH_list_pool = NULL;
82 +static struct aix_mirror_bh *AIX_retry_list = NULL;
83 +static struct aix_mirror_bh **AIX_retry_tail = NULL;
84 +static spinlock_t AIX_retry_list_lock = SPIN_LOCK_UNLOCKED;
85 +static spinlock_t AIX_resync_list_lock = SPIN_LOCK_UNLOCKED;
86 +static spinlock_t AIX_resync_pp_lock = SPIN_LOCK_UNLOCKED;
87 +static int AIXResyncInProgress = FALSE;
88 +static struct aix_resync_struct *AIX_resync_list = NULL;
90 +// Plugin API prototypes
92 +static void AIXiod(void *data);
93 +static void AIXresync(void *data);
94 +static int discover_aix(struct evms_logical_node **evms_logical_disk_head);
95 +static int discover_volume_groups(struct evms_logical_node **);
96 +static int discover_logical_volumes(void);
97 +static int end_discover_aix(struct evms_logical_node **evms_logical_disk_head);
98 +static void read_aix(struct evms_logical_node *node, struct buffer_head *bh);
99 +static void write_aix(struct evms_logical_node *node, struct buffer_head *bh);
100 +static int ioctl_aix(struct evms_logical_node *logical_node,
101 + struct inode *inode,
102 + struct file *file, unsigned int cmd, unsigned long arg);
104 +static int aix_direct_ioctl(struct inode *inode,
106 + unsigned int cmd, unsigned long args);
108 +static int AIX_remap_sector(struct evms_logical_node *node, u64 org_sector, // logical sector to remap
109 + u64 size, // size (in sectors) of request to remap
110 + u64 * new_sector, // remapped sector
111 + u64 * new_size, // new size (in sectors)
112 + struct partition_list_entry **partition, // new node for which new_sector is relative
113 + u32 * le, u32 * offset_in_le);
115 +static int validate_build_volume_group_disk_info(struct evms_logical_node
117 + struct AIXlvm_rec *AIXlvm);
119 +static int add_VG_data_to_VG_list(struct evms_logical_node *logical_node,
120 + struct aix_volume_group *new_group,
122 +static int add_PV_to_volume_group(struct aix_volume_group *group,
123 + struct evms_logical_node *evms_partition,
125 +static struct aix_volume_group *AIX_create_volume_group(struct evms_logical_node
130 +static int AIX_update_volume_group(struct aix_volume_group *AIXVGLptr,
131 + struct evms_logical_node *logical_node,
132 + struct AIXlvm_rec *AIXlvm);
134 +static int AIX_evms_cs_notify_lv_io_error(struct evms_logical_node *node);
136 +static int AIX_pvh_data_posn(u32 vgda_psn, u32 * pvh_posn, struct partition_list_entry *partition, u32 numpvs);
138 +static int AIX_resync_lv_mirrors(struct aix_logical_volume *volume, int force);
140 +static int AIX_copy_on_read(struct aix_logical_volume *volume,
141 + struct partition_list_entry *master_part,
142 + struct partition_list_entry *slave1_part,
143 + struct partition_list_entry *slave2_part,
146 + u64 slave2_offset, u32 pe_size, int le);
148 +static int export_volumes(struct evms_logical_node **evms_logical_disk_head);
149 +static int lvm_cleanup(void);
150 +static int AIX_copy_header_info(struct vg_header *AIXvgh,
151 + struct vg_header *AIXvgh2);
152 +static int build_pe_maps(struct aix_volume_group *volume_group);
154 +static struct aix_logical_volume *new_logical_volume(struct lv_entries
156 + struct aix_volume_group
157 + *group, char *lv_name,
160 +static int check_log_volume_and_pe_maps(struct aix_volume_group *group);
161 +static int check_volume_groups(void);
162 +static int init_io_aix(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */
163 + u64 sect_nr, /* disk LBA */
164 + u64 num_sects, /* # of sectors */
165 + void *buf_addr); /* buffer address */
167 +static int delete_logical_volume(struct aix_logical_volume *volume);
168 +static int delete_aix_node(struct evms_logical_node *logical_node);
169 +static int deallocate_volume_group(struct aix_volume_group *group);
171 +static void AIX_handle_read_mirror_drives(struct buffer_head *bh, int uptodate);
173 +static void AIX_handle_write_mirror_drives(struct buffer_head *bh,
176 +static void aix_notify_cache_ctor(void *foo, kmem_cache_t * cachep,
177 + unsigned long flags);
179 +static void AIX_schedule_resync(struct aix_logical_volume *resync_volume,
181 +static struct aix_logical_volume *AIX_get_volume_data(char *object_name);
183 +static void AIX_sync_mirrored_partitions(struct buffer_head *bh, int uptodate);
185 +static int AIX_get_set_mirror_offset(struct aix_mirror_bh *tmp_bh,
186 + int index, int offset);
188 +static struct aix_mirror_bh *AIX_alloc_rbh(struct evms_logical_node *node,
189 + struct buffer_head *bh,
191 + u32 le, u64 org_sector, int cmd);
193 +static struct aix_mirror_bh *AIX_alloc_wbh(struct evms_logical_node *node,
194 + struct evms_logical_node *node2,
195 + struct evms_logical_node *node3,
196 + struct buffer_head *bh,
199 + u64 new_sector2, u64 new_sector3);
201 +static struct aix_mirror_bh *AIX_alloc_sbh(struct aix_logical_volume *volume,
202 + struct partition_list_entry
204 + struct partition_list_entry
206 + struct partition_list_entry
207 + *slave2_part, u64 master_offset,
208 + u64 slave1_offset, u64 slave2_offset,
211 +static void AIX_free_headers(struct vg_header *AIXvgh,
212 + struct vg_header *AIXvgh2,
213 + struct vg_trailer *AIXvgt,
214 + struct vg_trailer *AIXvgt2);
216 +static int remove_group_from_list(struct aix_volume_group *group);
218 +//****************************************************************************************************
220 +/* END of PROTOTYPES */
222 +#define GET_PHYSICAL_PART_SIZE(v1) (1 << v1)
224 +#define COMPARE_TIMESTAMPS(t1, t2) ( (t1).tv_sec == (t2).tv_sec && \
225 + (t1).tv_nsec == (t2).tv_nsec )
227 +#define COMPARE_UNIQUE_IDS(id1, id2) ( (id1).word1 == (id2).word1 && \
228 + (id1).word2 == (id2).word2 && \
229 + (id1).word3 == (id2).word3 && \
230 + (id1).word4 == (id2).word4 )
232 +#define SECTOR_IN_RANGE(s1, s2) ((s2 > s1) && (s2 < s1 + AIX_RESYNC_BLOCKSIZE))
234 +#define AIX_PV_STATE_VALID 0 // Both VGDAs are valid and match.
235 +#define AIX_PV_STATE_FIRST_VGDA 1 // Only the first VGDA is valid.
236 +#define AIX_PV_STATE_SECOND_VGDA 2 // Only the second VGDA is valid.
237 +#define AIX_PV_STATE_EITHER_VGDA -1 // Both VGDAs are valid, but do not match each other.
238 +#define AIX_PV_STATE_INVALID -2 // We're in an invalid state but there's more PVs in this group
240 +#ifndef EVMS_AIX_DEBUG
241 +#define AIX_VOLUME_GROUP_DUMP()
243 +#define AIX_VOLUME_GROUP_DUMP() LOG_DEBUG("Called line:%d \n",__LINE__); \
244 + AIX_volume_group_dump()
247 +// Global LVM data structures
249 +static struct evms_plugin_fops AIXlvm_fops = {
250 + .discover = discover_aix,
251 + .end_discover = end_discover_aix,
252 + .delete = delete_aix_node,
254 + .write = write_aix,
255 + .init_io = init_io_aix,
256 + .ioctl = ioctl_aix,
257 + .direct_ioctl = aix_direct_ioctl
260 +static struct evms_plugin_header plugin_header = {
261 + .id = SetPluginID(IBM_OEM_ID,
262 + EVMS_REGION_MANAGER,
263 + EVMS_AIX_FEATURE_ID),
268 + .required_services_version = {
269 + .major = AIX_COMMON_SERVICES_MAJOR,
270 + .minor = AIX_COMMON_SERVICES_MINOR,
272 + AIX_COMMON_SERVICES_PATCHLEVEL},
273 + .fops = &AIXlvm_fops
277 + * Function: remap sector
278 + * Common function to remap volume lba to partition lba in appropriate PE
281 +AIX_remap_sector(struct evms_logical_node *node, u64 org_sector, // logical sector to remap
282 + u64 size, // size (in sectors) of request to remap
283 + u64 * new_sector, // remapped sector
284 + u64 * new_size, // new size (in sectors)
285 + struct partition_list_entry **partition, // new node for which new_sector is relative
286 + u32 * le, u32 * offset_in_le)
288 + struct aix_logical_volume *volume;
290 + u32 sectors_per_stripe;
291 + u32 partition_to_use;
293 + u32 stripe_in_column;
295 + u32 org_sector32; // Until striping is 64-bit enabled.
297 + volume = (struct aix_logical_volume *) node->private;
300 + LOG_DEBUG("-- %s volume:%p lv:%d size:" PFU64 " Name:%s\n",
301 + __FUNCTION__, volume, volume->lv_number, size, volume->name);
302 + LOG_DEBUG(" node %p node_name [%s] org_sector:" PFU64 "\n", node,
303 + node->name, org_sector);
304 + LOG_DEBUG(" mirror_copies:%d volume->lv_size:" PFU64 "\n",
305 + volume->mirror_copies, volume->lv_size);
308 + org_sector32 = org_sector;
310 + *(new_size) = size;
312 + // Check if volume is striped. Reset the size if the request
313 + // crosses a stripe boundary.
314 + if (volume->stripes > 1) {
316 + LOG_DEBUG(" *** STRIPED ***\n");
317 + LOG_DEBUG(" ------- volume->stripe_size:%d org_sector:%d volume_stripes:%d\n",
318 + volume->stripe_size, org_sector32, volume->stripes);
321 + *(le) = org_sector >> volume->pe_size_shift; // 64-bit safe
322 + *(offset_in_le) = org_sector & (volume->pe_size - 1); // 64-bit safe
325 + LOG_DEBUG("OLD - le:%d -- offset_in_le:%d \n", *(le),
329 + sectors_per_stripe = volume->stripe_size / AIX_SECTOR_SIZE;
331 + (org_sector32 / sectors_per_stripe) % volume->stripes;
333 + ((((org_sector32 / volume->stripe_size) / volume->stripes) *
334 + volume->stripe_size) +
335 + (org_sector32 % sectors_per_stripe));
337 + ((org_sector32 / sectors_per_stripe) / volume->stripes) *
338 + sectors_per_stripe;
341 + LOG_DEBUG("offset_in_le:%d org_sector:" PFU64
342 + " pe_shift:%d stripe_shift:%d\n", *(offset_in_le),
343 + org_sector, volume->pe_size_shift,
344 + volume->stripe_size_shift);
346 + LOG_DEBUG(" org_sector:%d sectors_per_stripe:%d partition_to_use:%d stripe_in_column:%d column:%d\n",
347 + org_sector32, sectors_per_stripe, partition_to_use,
348 + stripe_in_column, column);
349 + LOG_DEBUG(" offset_in_le + size:" PFU64
350 + " volume->pe_size:%d volume->lv_size:" PFU64 "\n",
351 + (*(offset_in_le) + size), volume->pe_size,
355 + if (*(offset_in_le) + size > volume->pe_size) {
356 + *new_size = volume->pe_size - *(offset_in_le);
357 + LOG_DEBUG(" new_size " PFU64 "\n", *new_size);
361 + // Non-striped volume. Just find LE and offset. Reset the size
362 + // if the request crosses an LE boundary.
365 + LOG_DEBUG(" *** NON-STRIPED ***\n");
368 + *(le) = org_sector >> volume->pe_size_shift; // 64-bit safe
369 + *(offset_in_le) = org_sector & (volume->pe_size - 1); // 64-bit safe
374 + LOG_DEBUG(" offset_in_le:%d org_sector:" PFU64 " shift:%d\n",
375 + *(offset_in_le), org_sector, volume->pe_size_shift);
377 + if (*(le) >= volume->num_le) {
378 + LOG_DEBUG(" le Memory Overwrite !! le:%d vs volume->num_le:%d\n",
379 + *(le), volume->num_le);
384 + *(new_sector) = volume->le_to_pe_map[*(le)].pe_sector_offset + *(offset_in_le);
385 + *(partition) = volume->le_to_pe_map[*(le)].owning_pv;
388 + LOG_DEBUG(" new_sector:" PFU64 "\n", *(new_sector));
389 + LOG_DEBUG(" Owning Part %p\n", *(partition));
390 + LOG_DEBUG(" End %s\n", __FUNCTION__);
397 + * Function: read_aix
400 +read_aix(struct evms_logical_node *node, struct buffer_head *bh)
402 + struct partition_list_entry *partition;
406 + struct aix_logical_volume *volume;
407 + struct aix_mirror_bh *tmp_bh;
408 + u32 le, offset_in_le, count;
411 + volume = (struct aix_logical_volume *) node->private;
413 +// LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n", __FUNCTION__, bh,
414 +// volume->mirror_iterations);
418 + LOG_DEBUG(" node->total_vsectors:" PFU64 "\n", node->total_vsectors);
419 + LOG_DEBUG(" rsector:%lu rsize:%u node_flags:%u\n", bh->b_rsector,
420 + bh->b_size, node->flags);
423 + // Check if I/O goes past end of logical volume.
424 + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
425 + node->total_vsectors) {
426 + LOG_CRITICAL(" read_aix ERROR %d\n", __LINE__);
427 + buffer_IO_error(bh);
431 + // Logical-to-physical remapping.
432 + if (AIX_remap_sector
433 + (node, bh->b_rsector, (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT),
434 + &new_sector, &new_size, &partition, &le, &offset_in_le)
435 + || (!partition || !new_sector)) {
436 + LOG_CRITICAL(" read_aix bh: ERROR %d\n", __LINE__);
437 + buffer_IO_error(bh);
441 + org_sector = bh->b_rsector;
442 + bh->b_rsector = new_sector;
443 + //bh->b_size = new_size;
446 + LOG_DEBUG(" read_aix Mirror_Copies:%d\n", volume->mirror_copies);
449 + if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
452 + AIX_alloc_rbh(node, bh, 1, le, new_sector, AIX_LV_READ);
455 + buffer_IO_error(bh);
459 + if (volume->le_to_pe_map_mir1) {
460 + tmp_bh->mir_node1 =
461 + volume->le_to_pe_map_mir1[le].owning_pv->
463 + tmp_bh->mir_sector1 =
464 + volume->le_to_pe_map_mir1[le].pe_sector_offset +
468 + if (volume->mirror_copies == AIX_MAX_MIRRORS) {
469 + tmp_bh->mir_node2 =
470 + volume->le_to_pe_map_mir2[le].owning_pv->
472 + tmp_bh->mir_sector2 =
473 + volume->le_to_pe_map_mir2[le].pe_sector_offset +
477 + if (evms_cs_volume_request_in_progress
478 + (tmp_bh->bh_req.b_rdev, AIX_INCREMENT_REQUEST, &count)) {
479 + buffer_IO_error(bh);
483 + if (AIXResyncInProgress) {
484 + if (SECTOR_IN_RANGE
485 + (tmp_bh->bh_req.b_rsector,
486 + AIX_resync_list->master_offset)) {
487 + spin_lock_irqsave(&AIX_resync_list_lock, flags);
491 + R_IO(partition->logical_node, &tmp_bh->bh_req);
493 + if (AIXResyncInProgress) {
494 + if (SECTOR_IN_RANGE
495 + (tmp_bh->bh_req.b_rsector,
496 + AIX_resync_list->master_offset)) {
497 + spin_unlock_irqrestore(&AIX_resync_list_lock,
504 + R_IO(partition->logical_node, bh);
508 + LOG_DEBUG(" ***** %s ***** returning\n", __FUNCTION__);
514 + * Function: write_aix
517 +write_aix(struct evms_logical_node *node, struct buffer_head *bh)
519 + struct partition_list_entry *partition;
520 + u64 new_sector, new_sector2 = 0, new_sector3 = 0;
523 + struct aix_logical_volume *volume;
524 + struct aix_mirror_bh *tmp_bh;
525 + struct evms_logical_node *node2 = NULL, *node3 = NULL;
526 + u32 le, offset_in_le, count;
529 + volume = (struct aix_logical_volume *) node->private;
532 +// LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n", __FUNCTION__, bh,
533 +// volume->mirror_iterations);
534 + LOG_DEBUG(" write_aix rsector:%lu rsize:%u\n", bh->b_rsector,
536 + LOG_DEBUG(" write_aix total_sectors:" PFU64 "\n", node->total_vsectors);
539 + if (volume->lv_access & EVMS_LV_INCOMPLETE) { //No writes allowed on incomplete volumes
540 + LOG_CRITICAL(" write_aix incomplete volume ERROR %d\n",
542 + buffer_IO_error(bh);
546 + // Check if I/O goes past end of logical volume.
547 + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
548 + node->total_vsectors) {
549 + LOG_CRITICAL(" write_aix ERROR %d\n", __LINE__);
550 + buffer_IO_error(bh);
553 + // Logical-to-Physical remapping
554 + if (AIX_remap_sector
555 + (node, bh->b_rsector, (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT),
556 + &new_sector, &new_size, &partition, &le, &offset_in_le)
557 + || (!new_sector || !partition)) {
558 + LOG_CRITICAL(" write_aix ERROR %d\n", __LINE__);
559 + buffer_IO_error(bh);
563 + org_sector = bh->b_rsector;
564 + bh->b_rsector = new_sector;
565 + //bh->b_size = new_size;
568 + LOG_DEBUG(" write_aix Mirror_Copies:%d\n", volume->mirror_copies);
571 + if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
573 + if (volume->le_to_pe_map_mir1) {
575 + volume->le_to_pe_map_mir1[le].pe_sector_offset +
578 + volume->le_to_pe_map_mir1[le].owning_pv->
582 + if (volume->mirror_copies == AIX_MAX_MIRRORS) {
585 + volume->le_to_pe_map_mir2[le].pe_sector_offset +
588 + volume->le_to_pe_map_mir2[le].owning_pv->
593 + AIX_alloc_wbh(partition->logical_node, node2, node3, bh, le,
594 + volume->mirror_copies, new_sector2,
598 + buffer_IO_error(bh);
601 + tmp_bh->node = node;
603 + tmp_bh = tmp_bh->mirror_bh_list;
605 + if (evms_cs_volume_request_in_progress
606 + (tmp_bh->bh_req.b_rdev, AIX_INCREMENT_REQUEST, &count)) {
607 + buffer_IO_error(bh);
608 + // free memory here
612 + if (AIXResyncInProgress) {
613 + if (SECTOR_IN_RANGE
614 + (tmp_bh->bh_req.b_rsector,
615 + AIX_resync_list->master_offset)) {
616 + spin_lock_irqsave(&AIX_resync_list_lock, flags);
620 + W_IO(tmp_bh->node, &tmp_bh->bh_req);
622 + if (AIXResyncInProgress) {
623 + if (SECTOR_IN_RANGE
624 + (tmp_bh->bh_req.b_rsector,
625 + AIX_resync_list->master_offset)) {
626 + spin_unlock_irqrestore(&AIX_resync_list_lock,
631 + tmp_bh = tmp_bh->next_r1;
634 + W_IO(tmp_bh->node, &tmp_bh->bh_req);
635 + tmp_bh = tmp_bh->next_r1;
639 + W_IO(tmp_bh->node, &tmp_bh->bh_req);
644 + W_IO(partition->logical_node, bh);
648 + LOG_DEBUG(" ***** %s returning *****\n", __FUNCTION__);
654 + * Function: ioctl_aix
658 +ioctl_aix(struct evms_logical_node *logical_node,
659 + struct inode *inode,
660 + struct file *file, unsigned int cmd, unsigned long arg)
662 + struct aix_logical_volume *volume =
663 + (struct aix_logical_volume *) (logical_node->private);
666 + LOG_EXTRA(" Ioctl %u\n", cmd);
672 + // Fixed geometry for all LVM volumes
673 + unsigned char heads = 64;
674 + unsigned char sectors = 32;
676 + struct hd_geometry *hd = (struct hd_geometry *) arg;
678 + cylinders = logical_node->total_vsectors;
679 + cylinders = (cylinders / heads) / sectors;
686 + ((char *) (&hd->heads), &heads, sizeof (heads)) != 0
687 + || copy_to_user((char *) (&hd->sectors), &sectors,
688 + sizeof (sectors)) != 0
689 + || copy_to_user((short *) (&hd->cylinders),
690 + &cylinders, sizeof (cylinders)) != 0
691 + || copy_to_user((long *) (&hd->start), &start,
692 + sizeof (start)) != 0) {
698 + case EVMS_QUIESCE_VOLUME:
701 + case EVMS_GET_DISK_LIST:
702 + case EVMS_CHECK_MEDIA_CHANGE:
703 + case EVMS_REVALIDATE_DISK:
704 + case EVMS_OPEN_VOLUME:
705 + case EVMS_CLOSE_VOLUME:
706 + case EVMS_CHECK_DEVICE_STATUS:
708 + // These five ioctl all need to be broadcast to all PVs.
709 + struct aix_volume_group *group = volume->group;
710 + struct partition_list_entry *partition;
711 + for (partition = group->partition_list; partition;
712 + partition = partition->next) {
714 + IOCTL(partition->logical_node, inode, file,
721 + // Currently the VGE does not send any ioctl's down to the
722 + // partitions. Which partition would they go to?
729 +/* Function: aix_direct_ioctl
731 + * This function provides a method for user-space to communicate directly
732 + * with a plugin in the kernel.
735 +aix_direct_ioctl(struct inode *inode,
736 + struct file *file, unsigned int cmd, unsigned long args)
738 + struct aix_logical_volume *volume = NULL;
739 + struct evms_plugin_ioctl_pkt argument;
743 + LOG_DEBUG(" Function:%s cmd:%d \n", __FUNCTION__, cmd);
745 + // Copy user's parameters to kernel space
747 + (&argument, (struct evms_plugin_ioctl *) args, sizeof (argument))) {
751 + // Make sure this is supposed to be our ioctl.
752 + if (argument.feature_id != plugin_header.id) {
757 + argument.feature_command = 1;
759 + switch (argument.feature_command) {
761 + case EVMS_AIX_RESYNC_MIRRORS:
763 + struct aix_volume_resync_ioctl aix_lv_resync;
767 + (struct aix_volume_resync_ioctl *) argument.
768 + feature_ioctl_data, sizeof (aix_lv_resync))) {
773 + volume = AIX_get_volume_data(aix_lv_resync.object_name);
776 + AIX_schedule_resync(volume, FALSE);
779 + (" Function:%s object_name:%s -- no match found\n",
780 + __FUNCTION__, aix_lv_resync.object_name);
792 + argument.status = rc;
793 + copy_to_user((struct evms_plugin_ioctl *) args, &argument,
794 + sizeof (argument));
799 +/* Function: AIX_get_volume_data
801 + * Search the global volume group list for a logical volume whose
802 + * name matches the given object name; returns NULL if not found.
804 +static struct aix_logical_volume *
805 +AIX_get_volume_data(char *object_name)
808 + struct aix_volume_group *VG_ptr;
809 + struct aix_logical_volume *volume = NULL;
812 + LOG_DEBUG(" Function:%s object_name:%s \n", __FUNCTION__, object_name);
814 + if (!object_name || !strlen(object_name)) {
818 + for (VG_ptr = AIXVolumeGroupList; VG_ptr; VG_ptr = VG_ptr->next) {
819 + for (i = 0; VG_ptr->volume_list[i]; i++) {
820 + if (!strcmp(VG_ptr->volume_list[i]->name, object_name)) {
822 + (" Function:%s FOUND!! volume_name:%s \n",
824 + VG_ptr->volume_list[i]->name);
825 + volume = VG_ptr->volume_list[i];
832 + LOG_DEBUG(" Function:%s object_name:%s NOT FOUND !! volume:%p \n",
833 + __FUNCTION__, object_name, volume);
840 + * Function: init_io_aix
844 +init_io_aix(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */
845 + u64 sect_nr, /* disk LBA */
846 + u64 num_sects, /* # of sectors */
848 +{ /* buffer address */
849 + struct partition_list_entry *partition;
850 + u64 new_sector = 0;
855 + LOG_DEBUG(" ************ init_io_aix() num_sects:" PFU64
856 + " node:%p sect_nr:" PFU64 "\n", num_sects, node, sect_nr);
858 + // Init IO needs to deal with the possibility that a request can come
859 + // in that spans PEs or stripes. This is possible because there is no
860 + // limit on num_sects. To fix this, we loop through AIX_remap_sector and
861 + // INIT_IO until num_sects reaches zero.
863 + while (num_sects > 0) {
865 + if (AIX_remap_sector(node, sect_nr, num_sects, &new_sector, &new_size,
866 + &partition, &le, &offset) || (!new_sector || !partition)) {
867 + LOG_CRITICAL("--- Error returned from AIX_remap_sector %d\n",
872 + LOG_DEBUG(" init_io_aix() line:%d logical_node:%p io_flag:%d new_sector:"
873 + PFU64 " new_size:" PFU64 "\n", __LINE__,
874 + partition->logical_node, io_flag, new_sector, new_size);
876 + rc = INIT_IO(partition->logical_node, io_flag, new_sector,
877 + new_size, buf_addr);
878 + num_sects -= new_size;
879 + sect_nr += new_size;
880 + buf_addr = (void *) (((unsigned long) buf_addr) +
881 + (unsigned long) (new_size << EVMS_VSECTOR_SIZE_SHIFT));
888 + * Function: AIXlvm_vge_init
892 +AIXlvm_vge_init(void)
895 + LOG_DEBUG(" %s --------\n", __FUNCTION__);
898 + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
901 +module_init(AIXlvm_vge_init);
903 +/********** Required Plugin Functions **********/
906 + * Function: discover_aix
908 + * This is the entry point into the LVM discovery process.
911 +discover_aix(struct evms_logical_node **evms_logical_disk_head)
913 + int rc = 0, count = 0;
916 + LOG_DEBUG("[%s] discover_volume_groups\n", __FUNCTION__);
918 + rc = discover_volume_groups(evms_logical_disk_head);
921 + LOG_ERROR("[%s] discover_volume_groups rc=%d\n", __FUNCTION__,rc);
924 + if (AIXVolumeGroupList && !rc) {
926 + LOG_DEBUG("[%s] discover_logical_volumes\n", __FUNCTION__);
928 + rc = discover_logical_volumes();
931 + LOG_ERROR("[%s] discover_logical_volumes rc=%d\n",
935 + LOG_DEBUG("[%s] export_volumes\n", __FUNCTION__);
937 + count = export_volumes(evms_logical_disk_head);
939 + LOG_DEBUG("[%s] export_volumes count=%d\n", __FUNCTION__,
948 +discover_volume_groups(struct evms_logical_node **evms_logical_disk_head)
950 + struct evms_logical_node *logical_node;
951 + struct evms_logical_node *next_node;
952 + struct aix_ipl_rec_area *AIXpv;
953 + struct AIXlvm_rec *AIXlvm; // Temp holder for the LVM on disk rec
955 + LOG_DEBUG(" Begin %s\n", __FUNCTION__);
957 + AIXpv = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
962 + // We'll create at least one volume entry, if we don't find any AIX volumes we'll clean it up later
964 + AIXlvm = kmalloc(sizeof (struct AIXlvm_rec), GFP_KERNEL);
970 + for (logical_node = *evms_logical_disk_head; logical_node;
971 + logical_node = next_node) {
973 + // Grab the next list item in case we remove this partition from the global list.
974 + next_node = logical_node->next;
976 + // Read the first sector and see if it has a valid AIX PV signature.
978 + if (INIT_IO(logical_node, 0, 0, 1, AIXpv)) {
979 + // On an I/O error, continue on to the next
980 + // partition. The group that this partition
981 + // belongs to will be incomplete, but we still
982 + // need to discover any other groups.
984 + LOG_ERROR(" Error reading PV [%p]\n", logical_node);
988 + if (AIXpv->IPL_record_id == IPLRECID) {
990 + // This partition is definitely a PV,
991 + // but is it part of a valid VG?
992 + LOG_DEBUG(" DVG removing node from list logical_node %p\n",
995 + if (INIT_IO(logical_node, 0, PSN_LVM_REC, 1, AIXlvm)) {
996 + LOG_ERROR(" Error reading PV [%p]\n",logical_node);
1000 + if (AIXlvm->lvm_id == AIX_LVM_LVMID) {
1002 + if (validate_build_volume_group_disk_info(
1003 + logical_node, AIXlvm)) {
1004 + // Again, continue on and we'll
1005 + // clean up later.
1009 + evms_cs_remove_logical_node_from_list(
1010 + evms_logical_disk_head, logical_node);
1013 + LOG_DEBUG(" Found an AIX PV with no parent LVM (LVM ID: %d)\n",
1018 + LOG_DEBUG(" Found a PV not belonging to AIX [%p]\n",
1023 + AIX_VOLUME_GROUP_DUMP();
1025 + if (check_volume_groups()) {
1036 + * Function: validate_build_volume_group_disk_info
1038 + * Creates and validates the volume groups found on the disk structures.
1042 +validate_build_volume_group_disk_info(struct evms_logical_node *logical_node,
1043 + struct AIXlvm_rec *AIXlvm)
1046 + struct aix_volume_group *AIXVGLptr = AIXVolumeGroupList;
1048 + LOG_DEBUG(" VBVGDI pv_num:%d\n", AIXlvm->pv_num);
1050 + while (AIXVGLptr) {
1051 + if (COMPARE_UNIQUE_IDS(AIXlvm->vg_id, AIXVGLptr->vg_id)) {
1054 + AIXVGLptr = AIXVGLptr->next; // There is more than one so walk the list
1058 + LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
1059 + AIXVGLptr = AIX_create_volume_group(logical_node, AIXlvm);
1061 + AIXVGLptr->next = AIXVolumeGroupList;
1062 + AIXVolumeGroupList = AIXVGLptr;
1065 + LOG_DEBUG(" VBVGDI Rediscover AIXVGLptr:%p line:%d\n",
1066 + AIXVGLptr, __LINE__);
1068 + if (AIX_update_volume_group(AIXVGLptr, logical_node, AIXlvm)) {
1070 + (" VBVGDI ERROR on Rediscover AIXVGLptr:%p line:%d\n",
1071 + AIXVGLptr, __LINE__);
1077 + LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,
1079 + LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags);
1080 + LOG_CRITICAL("Unable to allocate volume group data struct Volume Group Corruption !!\n");
1084 + LOG_DEBUG(" VBVGDI AIXVolumeGroupList:%p line:%d\n",
1085 + AIXVolumeGroupList, __LINE__);
1086 + LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,
1088 + LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags);
1090 + if (add_PV_to_volume_group(AIXVGLptr, logical_node, AIXlvm->pv_num)) {
1099 + * Function: add_VG_data_to_VG_list
1101 + * Allocate space for a new LVM volume group and all of its sub-fields.
1102 + * Initialize the appropriate fields.
1106 +add_VG_data_to_VG_list(struct evms_logical_node *logical_node,
1107 + struct aix_volume_group *new_group, short int pvNum)
1111 +// struct pv_header *AIXpvh;
1113 + // The array of pointer to the logical volumes.
1114 + // Leave this allocation at the max permitted, the lv numbering may not be sequential so you may have gaps
1115 + // in the array allocation i.e. 1,2,3,4,5,6,7,8,11,15,21,33 etc. even though you only have 12 LVs.
1117 + LOG_DEBUG(" AVGDVGL Entering pvNum:%d vgda_PSN:%d\n", pvNum,
1118 + new_group->vgda_psn);
1120 +// pvh_pos = AIX_PVH_DATA_PSN(new_group->vgda_psn, pvNum);
1122 +/* AIXpvh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1127 + memset(AIXpvh, 0, AIX_SECTOR_SIZE);
1129 + LOG_DEBUG(" AVGDVGL pvh_pos:%d\n", pvh_pos);
1131 + if (INIT_IO(logical_node, 0, pvh_pos, 1, AIXpvh)) {
1135 + LOG_DEBUG(" AVGDVGL AIXpvh->pv_num:%d\n", pvNum);
1137 + if (!new_group->volume_list) {
1138 + new_group->volume_list =
1139 + kmalloc(LVM_MAXLVS * sizeof (struct aix_logical_volume *),
1141 + if (!new_group->volume_list) {
1145 + memset(new_group->volume_list, 0,
1146 + (LVM_MAXLVS * sizeof (struct aix_logical_volume *)));
1149 + new_group->vg_id.word1 = new_group->AIXvgh->vg_id.word1;
1150 + new_group->vg_id.word2 = new_group->AIXvgh->vg_id.word2;
1151 + new_group->vg_id.word3 = new_group->AIXvgh->vg_id.word3;
1152 + new_group->vg_id.word4 = new_group->AIXvgh->vg_id.word4;
1153 +// new_group->numpvs = new_group->AIXvgh->numpvs;
1154 +// new_group->numlvs = new_group->AIXvgh->numlvs;
1155 +// new_group->lv_max = new_group->AIXvgh->maxlvs;
1156 + new_group->pe_size = GET_PHYSICAL_PART_SIZE(new_group->AIXvgh->pp_size) /
1159 +// new_group->block_size = 0;
1160 +// new_group->hard_sect_size = 0;
1161 + new_group->flags |= AIX_VG_DIRTY;
1165 + LOG_DEBUG(" AVGDVGL Vol Group ID %x\n", new_group->vg_id.word2);
1171 + * Function: add_PV_to_volume_group
1173 + * Create a new partition_list_entry for the specified volume group.
1174 + * Initialize the new partition with the evms node and lvm pv information,
1175 + * and add the new partition to the group's list.
1179 +add_PV_to_volume_group(struct aix_volume_group *group,
1180 + struct evms_logical_node *evms_partition, int pvNum)
1182 + struct partition_list_entry *new_partition;
1184 + LOG_DEBUG(" APVVG Entering pvNum:%d\n", pvNum);
1186 + group->flags |= AIX_VG_DIRTY;
1188 + for (new_partition = group->partition_list; new_partition != NULL;
1189 + new_partition = new_partition->next) {
1190 + if (new_partition->logical_node == evms_partition) {
1196 + kmalloc(sizeof (struct partition_list_entry), GFP_KERNEL);
1197 + if (!new_partition) {
1201 + memset(new_partition, 0, sizeof (struct partition_list_entry));
1203 + // Add this partition to this group's list.
1204 + new_partition->logical_node = evms_partition;
1205 + new_partition->pv_number = pvNum;
1207 + if (evms_partition->hardsector_size > group->hard_sect_size) {
1208 + group->hard_sect_size = evms_partition->hardsector_size;
1210 + if (evms_partition->block_size > group->block_size) {
1211 + group->block_size = evms_partition->block_size;
1214 + // Add this partition to the beginning of its group's list.
1215 + new_partition->next = group->partition_list;
1216 + group->partition_list = new_partition;
1217 + group->partition_count++;
1219 + LOG_DEBUG(" APVVG partition_count:%d pv_num:%d\n",
1220 + group->partition_count, pvNum);
1225 +/****************************************************
1229 +*****************************************************/
1230 +static struct aix_volume_group *
1231 +AIX_create_volume_group(struct evms_logical_node *logical_node,
1232 + struct AIXlvm_rec *AIXlvm)
1234 + struct vg_header *AIXvgh = NULL, *AIXvgh2 = NULL;
1235 + struct vg_trailer *AIXvgt = NULL, *AIXvgt2 = NULL;
1236 + struct aix_volume_group *AIXVGLptr;
1238 + AIXvgh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1243 + AIXvgh2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1245 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1249 + AIXvgt = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1251 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1255 + AIXvgt2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1257 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1261 + memset(AIXvgh, 0, AIX_SECTOR_SIZE);
1262 + memset(AIXvgh2, 0, AIX_SECTOR_SIZE);
1263 + memset(AIXvgt, 0, AIX_SECTOR_SIZE);
1264 + memset(AIXvgt2, 0, AIX_SECTOR_SIZE);
1266 + // First time thru we want to read this in, we may only have one PV in this group, all others
1267 + // may be corrupt, etc. If the info is clean we shouldn't get here.
1269 + if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) {
1270 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1274 + if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) {
1275 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1279 + if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1), 1,
1281 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1285 + if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1), 1,
1287 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1291 + LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]);
1292 + LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]);
1293 + LOG_DEBUG("CVG AIXvgt psn[%d]:%d\n", 0,(AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1));
1294 + LOG_DEBUG("CVG AIXvgt psn[%d]:%d\n", 1,(AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1));
1295 + LOG_DEBUG("CVG Allocating AIXVGLptr:size:%d \n",(int) sizeof (struct aix_volume_group));
1297 + AIXVGLptr = kmalloc(sizeof (struct aix_volume_group), GFP_KERNEL);
1299 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1302 + memset(AIXVGLptr, 0, sizeof (struct aix_volume_group));
1304 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1305 + AIXVGLptr->flags |= AIX_VG_DIRTY;
1307 + LOG_DEBUG("CVG AIXVGLptr:%p line %d\n", AIXVGLptr, __LINE__);
1309 + AIXVGLptr->AIXvgh = kmalloc(sizeof (struct vg_header), GFP_KERNEL);
1310 + if (!AIXVGLptr->AIXvgh) {
1312 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1315 + memset(AIXVGLptr->AIXvgh, 0, sizeof (struct vg_header));
1317 + LOG_DEBUG("CVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",
1318 + AIXVGLptr->CleanVGInfo);
1320 + if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) {
1321 + if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp, AIXvgt->timestamp)) {
1322 + if (COMPARE_TIMESTAMPS
1323 + (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) {
1324 + if (COMPARE_TIMESTAMPS
1325 + (AIXvgh->vg_timestamp,
1326 + AIXvgh2->vg_timestamp)) {
1327 + // All timestamps match. Yea!
1328 + AIXVGLptr->CleanVGInfo =
1329 + AIX_PV_STATE_VALID;
1331 + // Both VGDAs are good, but timestamps are
1332 + // different. Can't tell yet which one is
1334 + AIXVGLptr->CleanVGInfo =
1335 + AIX_PV_STATE_EITHER_VGDA;
1338 + // First VGDA is good, second is bad.
1339 + AIXVGLptr->CleanVGInfo =
1340 + AIX_PV_STATE_FIRST_VGDA;
1343 + if (COMPARE_TIMESTAMPS
1344 + (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) {
1345 + // First VGDA is bad, second is good.
1346 + AIXVGLptr->CleanVGInfo =
1347 + AIX_PV_STATE_SECOND_VGDA;
1348 + } else if (AIXvgh->numpvs == 1) { // We only have 1 PV in this group, mismatch or not this will have to do
1349 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1351 + // This should never happen.
1352 + LOG_DEBUG("All four VG timestamps for %d are different. What happened?!?\n",
1353 + AIXVGLptr->vg_id.word2);
1354 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1359 + LOG_DEBUG("CVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n",
1360 + AIXVGLptr->CleanVGInfo);
1362 + switch (AIXVGLptr->CleanVGInfo) {
1363 + case AIX_PV_STATE_VALID:
1364 + case AIX_PV_STATE_FIRST_VGDA:
1366 + LOG_DEBUG("CVG SWITCH VALID %d size:%d\n",
1367 + AIXVGLptr->CleanVGInfo,
1368 + (int) sizeof (struct vg_header));
1370 + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1372 + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1373 + AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1376 + case AIX_PV_STATE_SECOND_VGDA:
1377 + LOG_DEBUG("CVG SWITCH SECOND VGDA %d size:%d\n",
1378 + AIXVGLptr->CleanVGInfo,
1379 + (int) sizeof (struct vg_header));
1381 + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2); // Get the info. we need
1383 + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1];
1384 + AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1387 + case AIX_PV_STATE_EITHER_VGDA:
1388 + LOG_DEBUG("CVG SWITCH EITHER VGDA %d size:%d\n",
1389 + AIXVGLptr->CleanVGInfo,(int) sizeof (struct vg_header));
1390 + if (COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id)) {
1392 + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1394 + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1395 + AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1397 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1398 + // Not sure where this PV belongs. It thinks it is
1399 + // supposed to be in two different containers. We will
1400 + // probably need to put this on a separate, temporary
1401 + // list, and determine later which container is missing
1407 + LOG_ERROR("Invalid PV state (%d) for %d\n",
1408 + AIXVGLptr->CleanVGInfo,
1409 + AIXVGLptr->vg_id.word2);
1410 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1416 + // Currently AIX Big VGDA is not supported - cleanup and return NULL so this VG doesn't get added
1418 + if (AIXVGLptr->AIXvgh->bigvg != 0) {
1419 + LOG_SERIOUS("Error creating Volume Group AIX Big VGDA is not currently supported\n");
1420 + if (AIXVGLptr->AIXvgh) {
1421 + kfree(AIXVGLptr->AIXvgh);
1422 + AIXVGLptr->AIXvgh = NULL;
1430 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1434 + add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num);
1436 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1438 + LOG_DEBUG("CVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo);
1443 +/****************************************************
1447 +*****************************************************/
1449 +AIX_update_volume_group(struct aix_volume_group *AIXVGLptr,
1450 + struct evms_logical_node *logical_node,
1451 + struct AIXlvm_rec *AIXlvm)
1453 + struct vg_header *AIXvgh = NULL, *AIXvgh2 = NULL;
1454 + struct vg_trailer *AIXvgt = NULL, *AIXvgt2 = NULL;
1456 + AIXvgh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1461 + AIXvgh2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1463 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1467 + AIXvgt = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1469 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1473 + AIXvgt2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1475 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1479 + // First time thru we want to read this in, we may only have one PV in this group, all others
1480 + // may be corrupt, etc. If the info is clean we shouldn't get here.
1482 + if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) {
1483 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1487 + if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) {
1488 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1492 + if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1), 1,
1494 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1498 + if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1), 1,
1500 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1504 + LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]);
1505 + LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]);
1506 + LOG_DEBUG("UVG AIXvgt psn[%d]:%d\n", 0,(AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1));
1507 + LOG_DEBUG("UVG AIXvgt psn[%d]:%d\n", 1,(AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1));
1509 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1510 + AIXVGLptr->flags |= AIX_VG_DIRTY;
1512 + LOG_DEBUG("UVG AIXVGLptr:%p line %d\n", AIXVGLptr, __LINE__);
1514 + AIXVGLptr->AIXvgh = kmalloc(sizeof (struct vg_header), GFP_KERNEL);
1515 + if (!AIXVGLptr->AIXvgh) {
1516 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1519 + memset(AIXVGLptr->AIXvgh, 0, sizeof (struct vg_header));
1521 + LOG_DEBUG("UVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1523 + if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) {
1524 + if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp, AIXvgt->timestamp)) {
1525 + if (COMPARE_TIMESTAMPS
1526 + (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) {
1527 + if (COMPARE_TIMESTAMPS
1528 + (AIXvgh->vg_timestamp,
1529 + AIXvgh2->vg_timestamp)) {
1530 + // All timestamps match. Yea!
1531 + AIXVGLptr->CleanVGInfo =
1532 + AIX_PV_STATE_VALID;
1534 + // Both VGDAs are good, but timestamps are
1535 + // different. Can't tell yet which one is
1537 + AIXVGLptr->CleanVGInfo =
1538 + AIX_PV_STATE_EITHER_VGDA;
1541 + // First VGDA is good, second is bad.
1542 + AIXVGLptr->CleanVGInfo =
1543 + AIX_PV_STATE_FIRST_VGDA;
1546 + if (COMPARE_TIMESTAMPS
1547 + (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) {
1548 + // First VGDA is bad, second is good.
1549 + AIXVGLptr->CleanVGInfo =
1550 + AIX_PV_STATE_SECOND_VGDA;
1551 + } else if (AIXvgh->numpvs == 1) { // We only have 1 PV in this group, mismatch or not this will have to do
1552 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1554 + // This should never happen.
1556 + ("All four VG timestamps for %d are different. What happened?!?\n",
1557 + AIXVGLptr->vg_id.word2);
1558 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1563 + LOG_DEBUG("UVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n",
1564 + AIXVGLptr->CleanVGInfo);
1566 + switch (AIXVGLptr->CleanVGInfo) {
1567 + case AIX_PV_STATE_VALID:
1568 + case AIX_PV_STATE_FIRST_VGDA:
1570 + LOG_DEBUG("UVG SWITCH VALID %d size:%d\n",
1571 + AIXVGLptr->CleanVGInfo,
1572 + (int) sizeof (struct vg_header));
1574 + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1576 + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1577 + AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1580 + case AIX_PV_STATE_SECOND_VGDA:
1581 + LOG_DEBUG("UVG SWITCH SECOND VGDA %d size:%d\n",
1582 + AIXVGLptr->CleanVGInfo,
1583 + (int) sizeof (struct vg_header));
1585 + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2); // Get the info. we need
1587 + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1];
1588 + AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1591 + case AIX_PV_STATE_EITHER_VGDA:
1592 + LOG_DEBUG("UVG SWITCH EITHER VGDA %d size:%d\n",
1593 + AIXVGLptr->CleanVGInfo,
1594 + (int) sizeof (struct vg_header));
1595 + if (COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id)) {
1597 + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1599 + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1600 + AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1602 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1603 + // Not sure where this PV belongs. It thinks it is
1604 + // supposed to be in two different containers. We will
1605 + // probably need to put this on a separate, temporary
1606 + // list, and determine later which container is missing
1612 + LOG_ERROR("UVG Invalid PV state (%d) for %d\n",
1613 + AIXVGLptr->CleanVGInfo,
1614 + AIXVGLptr->vg_id.word2);
1615 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1621 +// add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num);
1622 + AIXVGLptr->flags |= AIX_VG_DIRTY;
1624 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1626 + LOG_DEBUG("UVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo);
1631 +/****************************************************
1632 +* Function: check_volume_groups
1634 +* We just want to make sure the volume groups have found
1635 +* all their drives.
1637 +* If not, we'll continue and build what we can
1638 +*****************************************************/
1640 +check_volume_groups(void)
1642 + struct aix_volume_group *group;
1643 + struct aix_volume_group *next_group;
1644 +// struct partition_list_entry *partitions;
1647 + LOG_DEBUG("CHVG Checking volume groups:\n");
1650 + for (group = AIXVolumeGroupList; group; group = next_group) {
1651 + next_group = group->next;
1653 + if (group->flags & AIX_VG_DIRTY){
1654 + if (group->AIXvgh->numlvs == 0) {
1655 + remove_group_from_list(group);
1656 + deallocate_volume_group(group);
1658 + if (group->partition_count != group->AIXvgh->numpvs) {
1659 + group->flags |= AIX_VG_INCOMPLETE;
1660 + LOG_ERROR("CHVG Found incomplete VG !! flags:%x\n",
1662 + LOG_ERROR("CHVG Found %d PVs should have %d PVs\n",
1663 + group->partition_count, group->AIXvgh->numpvs);
1669 + LOG_DEBUG("CHVG Finished Checking volume groups:\n");
1674 +/************************************************************************
1675 + * Function: discover_logical_volumes
1677 + * After all PVs have been claimed and added to the appropriate VG list,
1678 + * the volumes for each VG must be constructed.
1683 +discover_logical_volumes(void)
1686 + struct aix_volume_group *AIXVGLPtr;
1687 + struct aix_logical_volume *new_LV;
1688 + struct partition_list_entry *partition;
1689 + struct evms_logical_node *node;
1690 + struct lv_entries *AIXlvent, *AIXlventHead;
1691 + int j, lv_found, all_lvs_found, rc;
1692 + struct namelist *AIXnamelist;
1696 + kmalloc(MAX_SECTORS_LV_ENTRIES * AIX_SECTOR_SIZE, GFP_KERNEL);
1697 + if (!AIXlventHead) {
1701 + memset(AIXlventHead, 0, (MAX_SECTORS_LV_ENTRIES * AIX_SECTOR_SIZE));
1704 + kmalloc(MAX_SECTORS_NAMELIST * AIX_SECTOR_SIZE, GFP_KERNEL);
1705 + if (!NameBuffer) {
1706 + kfree(AIXlventHead);
1710 + memset(NameBuffer, 0, (MAX_SECTORS_NAMELIST * AIX_SECTOR_SIZE));
1712 + for (AIXVGLPtr = AIXVolumeGroupList; AIXVGLPtr;
1713 + AIXVGLPtr = AIXVGLPtr->next ) {
1715 + partition = AIXVGLPtr->partition_list;
1717 + if (!(AIXVGLPtr->flags & AIX_VG_DIRTY)) {
1721 + if (partition == NULL) {
1725 + node = partition->logical_node;
1727 + if (node == NULL) {
1731 + LOG_DEBUG("DLV INIT_IO AIXNameList position:%d\n",
1732 + ((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 -
1733 + MAX_SECTORS_NAMELIST));
1734 + LOG_DEBUG("AIXVGLPTR:%p partition:%p node:%p \n", AIXVGLPtr,
1737 + if (INIT_IO(node, 0,
1738 + ((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 -
1739 + MAX_SECTORS_NAMELIST), MAX_SECTORS_NAMELIST,
1744 + LOG_DEBUG("DLV INIT_IO AIXNameList\n");
1746 + if (INIT_IO(node, 0, AIXVGLPtr->vgda_psn + PSN_LVE_REC,
1747 + MAX_SECTORS_LV_ENTRIES, AIXlventHead)) {
1750 + AIXlvent = AIXlventHead;
1751 + AIXnamelist = (struct namelist *) NameBuffer;
1753 + LOG_DEBUG("DLV INIT_IO AIXlvent\n");
1754 + // Search through the LV structs for valid LV entries
1755 + // We're just going to search until all valid LVs are found
1756 + // The max. allowable LVs is 256 and we don't want to
1757 + // search for 255 if only 8 are defined 1-8 however, there
1758 + // could be gaps in the LV numbering. i.e 1,2,3,4,5,6,7,8, 27,43, etc.
1760 + for (j = 0, lv_found = 0, all_lvs_found = FALSE;
1761 + !all_lvs_found && j < LVM_MAXLVS; j++, AIXlvent++) {
1763 + LOG_DEBUG(" ** DVIG:lv_size:%d lvname:[%s] j:%d lv_number:%d ** \n",
1764 + AIXlvent->num_lps, AIXnamelist->name[j], j,
1765 + AIXlvent->lvname);
1766 + LOG_DEBUG(" DVIG:stripe_exp:%u stripesize:%u lv_status:%d\n",
1767 + AIXlvent->striping_width,
1768 + GET_PHYSICAL_PART_SIZE(AIXlvent->stripe_exp),
1769 + AIXlvent->lv_state);
1770 + LOG_DEBUG(" DVIG Group:%x.Access:%x\n",
1771 + (unsigned int) AIXVGLPtr->vg_id.word2,
1772 + AIXlvent->permissions);
1773 + LOG_DEBUG(" DVIG mirror:%d mirror_policy:%d mirwrt:%d \n",
1774 + AIXlvent->mirror, AIXlvent->mirror_policy,
1775 + AIXlvent->mirwrt_consist);
1777 + // This is the same check we used in "diskedit" and "readdisk"
1778 + if (AIXlvent->lv_state == 0 ||
1779 + AIXlvent->permissions > 0x10) {
1784 + if (lv_found == AIXVGLPtr->AIXvgh->numlvs) {
1785 + all_lvs_found = TRUE;
1788 + LOG_DEBUG(" DVIG lv_found:%d all_lvs_found:%d \n",
1789 + lv_found, all_lvs_found);
1791 + // Create a new logical volume and place it in the appropriate
1792 + // spot in this VG's volume list. For re-discovery, make sure
1793 + // this volume does not already exist.
1794 + if (!AIXVGLPtr->volume_list[AIXlvent->lvname]) {
1796 + new_logical_volume(AIXlvent,
1800 + GET_PHYSICAL_PART_SIZE
1806 + LOG_DEBUG(" DVIG Adding new logical volume %d to group:%x \n",
1807 + new_LV->lv_number,AIXVGLPtr->vg_id.word2);
1809 + AIXVGLPtr->volume_list[new_LV->lv_number] = new_LV;
1811 + LOG_DEBUG("DVIG Updating Vol Exists\n");
1815 + // Build the le_to_pe_map for each volume that was discovered above.
1816 + // This has to be done after all volumes in the group are discovered
1817 + if ((rc = build_pe_maps(AIXVGLPtr))) {
1821 + check_log_volume_and_pe_maps(AIXVGLPtr);
1824 + kfree(NameBuffer);
1825 + kfree(AIXlventHead);
1831 + * Function: new_logical_volume
1833 + * Allocate space for a new LVM logical volume, including space for the
1836 +static struct aix_logical_volume *
1837 +new_logical_volume(struct lv_entries *AIXlvent,
1838 + struct aix_volume_group *volume_group,
1839 + char *lv_name, u32 stripesize)
1842 + struct aix_logical_volume *new_volume;
1843 + const char *name = "evms_AIXiod";
1844 + const char *resync_name = "evms_AIXresync";
1846 + LOG_DEBUG(" NLV: lv_number:%d lv_allocated_le:%d lv_size:%d\n",
1847 + AIXlvent->lvname, AIXlvent->num_lps,
1848 + AIXlvent->num_lps * volume_group->pe_size);
1850 + // Allocate space for the new logical volume.
1851 + new_volume = kmalloc(sizeof (struct aix_logical_volume), GFP_KERNEL);
1852 + if (!new_volume) {
1855 + memset(new_volume, 0, sizeof (struct aix_logical_volume));
1857 + // Allocate space for the LE to PE mapping table
1858 + // We add 1 for the allocated le to ease mapping later on, all AIX le are 1 based
1859 + new_volume->le_to_pe_map =
1860 + kmalloc((AIXlvent->num_lps + 1) * sizeof (struct pe_table_entry),
1862 + if (!new_volume->le_to_pe_map) {
1863 + delete_logical_volume(new_volume);
1867 + memset(new_volume->le_to_pe_map, 0,
1868 + (AIXlvent->num_lps + 1) * sizeof (struct pe_table_entry));
1870 + if (AIXlvent->mirror > AIX_DEFAULT_MIRRORING) {
1871 + new_volume->le_to_pe_map_mir1 =
1872 + kmalloc((AIXlvent->num_lps +
1873 + 1) * sizeof (struct pe_table_entry), GFP_KERNEL);
1874 + if (!new_volume->le_to_pe_map_mir1) {
1875 + delete_logical_volume(new_volume);
1878 + memset(new_volume->le_to_pe_map_mir1, 0,
1879 + (AIXlvent->num_lps +
1880 + 1) * sizeof (struct pe_table_entry));
1883 + if (AIXlvent->mirror == AIX_MAX_MIRRORS) {
1884 + new_volume->le_to_pe_map_mir2 =
1885 + kmalloc((AIXlvent->num_lps + 1)
1886 + * sizeof (struct pe_table_entry), GFP_KERNEL);
1887 + if (!new_volume->le_to_pe_map_mir2) {
1888 + delete_logical_volume(new_volume);
1891 + memset(new_volume->le_to_pe_map_mir2, 0,
1892 + (AIXlvent->num_lps +1)
1893 + * sizeof (struct pe_table_entry));
1896 + // Initialize the rest of the new volume.
1897 + new_volume->lv_number = AIXlvent->lvname;
1898 + new_volume->lv_size = AIXlvent->num_lps * (volume_group->pe_size);
1899 + new_volume->lv_access = AIXlvent->permissions | EVMS_LV_NEW; // All volumes start new.
1900 + new_volume->lv_status = AIXlvent->lv_state;
1901 + //new_volume->lv_minor = MINOR(1);
1902 + new_volume->mirror_copies = AIXlvent->mirror;
1903 +// new_volume->mirror_iterations = AIX_DEFAULT_MIRRORING;
1904 + new_volume->stripes = AIXlvent->striping_width;
1905 + new_volume->stripe_size = stripesize;
1906 + new_volume->stripe_size_shift = evms_cs_log2(stripesize);
1907 + new_volume->pe_size = volume_group->pe_size;
1908 + new_volume->pe_size_shift = evms_cs_log2(volume_group->pe_size);
1909 + new_volume->num_le = AIXlvent->num_lps;
1910 +// new_volume->new_volume = TRUE;
1911 + new_volume->group = volume_group;
1913 + volume_group->numlvs++;
1915 + sprintf(new_volume->name, "aix/%s", lv_name);
1917 + if (!AIX_BH_list_pool
1918 + && new_volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
1920 + // We only need the ReSync thread if we have at least one mirrored LV.
1921 + // You can't ReSync a non-mirrored drive
1923 + AIX_BH_list_pool =
1924 + evms_cs_create_pool(sizeof (struct aix_mirror_bh),
1925 + "EVMS_AIX_BH", aix_notify_cache_ctor,
1927 + if (!AIX_BH_list_pool) {
1930 + AIX_mirror_read_retry_thread =
1931 + evms_cs_register_thread(AIXiod, NULL, name);
1933 + AIX_mirror_resync_thread =
1934 + evms_cs_register_thread(AIXresync, NULL,
1939 + LOG_DEBUG("NLV lv_number:%d name:%s lv_size " PFU64 " \n",
1940 + new_volume->lv_number, new_volume->name, new_volume->lv_size);
1941 + LOG_DEBUG("NLV stripe_size:%d stripe_size_shift:%d\n",
1942 + new_volume->stripe_size, new_volume->stripe_size_shift);
1944 + return new_volume;
1948 + * Function: aix_notify_cache_ctor
1949 + * this function initializes the b_wait field in the buffer heads
1950 + * in our private buffer head pool.
1953 +aix_notify_cache_ctor(void *foo, kmem_cache_t * cachep, unsigned long flags)
1955 + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
1956 + SLAB_CTOR_CONSTRUCTOR) {
1957 + struct aix_mirror_bh *rbh = (struct aix_mirror_bh *) foo;
1958 + memset(rbh, 0, sizeof (struct aix_mirror_bh));
1959 + rbh->remaining = (atomic_t) ATOMIC_INIT(0);
1960 + init_waitqueue_head(&rbh->bh_req.b_wait);
1965 + * Function: build_pe_maps
1967 + * After all logical volumes have been discovered, the mappings from
1968 + * logical extents to physical extents must be constructed. Each PV
1969 + * contains a map on-disk of its PEs. Each PE map entry contains the
1970 + * logical volume number and the logical extent number on that volume.
1971 + * Our internal map is the reverse of this map for each volume, listing
1972 + * the PV node and sector offset for every logical extent on the volume.
1975 + build_pe_maps(struct aix_volume_group *volume_group)
1977 + struct partition_list_entry *partition;
1978 + struct partition_list_entry *mirror_partition;
1979 + struct pp_entries *AIXppent, *AIXppent_buff;
1980 + struct pv_header *AIXpvh;
1983 + u32 j, pp_count, pvh_pos;
1985 + u32 pvh_posn[LVM_MAXPVS];
1987 +#ifdef EVMS_DEBUG_MIRRORS
1988 + u32 lv_found, all_lvs_found;
1992 + LOG_DEBUG(" *** BPEM ***\n");
1993 + // For every partition in this VG
1995 + AIXppent_buff = kmalloc(AIX_SECTOR_SIZE * PHYS_VOL_OFFSET, GFP_KERNEL);
1996 + if (!AIXppent_buff) {
2000 + memset(AIXppent_buff, 0, AIX_SECTOR_SIZE * PHYS_VOL_OFFSET);
2001 + memset(pvh_posn, 0, LVM_MAXPVS);
2003 + AIXpvh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
2005 + kfree(AIXppent_buff);
2009 + memset(AIXpvh, 0, AIX_SECTOR_SIZE);
2011 + LOG_DEBUG(" BPEM AIXppent_buff:%d \n",
2012 + (AIX_SECTOR_SIZE * PHYS_VOL_OFFSET));
2014 + // This next section is to calculate the sector spacing between PV info for the VG
2015 + // AIX doesn't always space the info the same. It could be 17 or 34 sectors apart
2016 + // depending on the PE size selected.
2018 + rc = AIX_pvh_data_posn(volume_group->vgda_psn, pvh_posn, volume_group->partition_list, volume_group->AIXvgh->numpvs);
2021 + kfree(AIXppent_buff);
2026 + for (partition = volume_group->partition_list; partition;
2027 + partition = partition->next) {
2029 + LOG_DEBUG(" BPEM partition:%p next:%p\n", partition,
2032 + pvh_pos = pvh_posn[partition->pv_number];
2034 + LOG_DEBUG(" BPEM pvh_pos:%d pv_number:%d\n", pvh_pos, partition->pv_number);
2036 + if (INIT_IO(partition->logical_node, 0, pvh_pos, 1, AIXpvh)) {
2037 + kfree(AIXppent_buff);
2041 + // For every entry in the PE map, calculate the PE's sector offset
2042 + // and update the correct LV's PE map. LV number of 0 marks an unused PE.
2043 + // For re-discovery, only compute entries for new volumes.
2045 + if (INIT_IO(partition->logical_node, 0, pvh_pos, AIX_PVHPP_LENGTH,
2047 + kfree(AIXppent_buff);
2052 + AIXppent = AIXppent_buff;
2055 + pp_count = AIXpvh->pp_count;
2057 + LOG_DEBUG("BPEM AIXpvh data: pp_count:%d psn_part1:%d pv_id1:%d pv_id2:%d pv_id3:%d pv_id4:%d pv_num:%d pv_state:%d vgdas:%d res1:%d res2:%d\n", AIXpvh->pp_count,
2058 + AIXpvh->psn_part1,
2059 + AIXpvh->pv_id.word1,
2060 + AIXpvh->pv_id.word2,
2061 + AIXpvh->pv_id.word3,
2062 + AIXpvh->pv_id.word4,
2064 + AIXpvh->pv_state, AIXpvh->pvnum_vgdas, AIXpvh->res1, AIXpvh->res2);
2066 + LOG_DEBUG(" PE Map: volgrp:%x AIXpvh->pv_num:%d partition:%p next:%p lv_index:%d pp_count:%d\n",
2067 + volume_group->vg_id.word2, AIXpvh->pv_num, partition,
2068 + partition->next, AIXppent->lv_index, pp_count);
2070 + for (j = 0; j < pp_count; j++,AIXppent++) {
2071 + if (!AIXppent->lv_index || AIXppent->pp_state == AIX_LVM_LVUNDEF) {
2075 + LOG_EXTRA(" -- pv:%x pp:%d st:%d nm:%s lv:%d lp:%d cp:%d fst v:%d fst p:%d snd v:%d snd p:%d \n",
2076 + volume_group->vg_id.word2, j + 1,
2077 + AIXppent->pp_state,
2078 + volume_group->volume_list[AIXppent->lv_index -1]->name,
2079 + AIXppent->lv_index, AIXppent->lp_num,
2080 + AIXppent->copy, AIXppent->fst_alt_vol,
2081 + AIXppent->fst_alt_part,
2082 + AIXppent->snd_alt_vol,
2083 + AIXppent->snd_alt_part);
2085 + le_number = AIXppent->lp_num - 1; // AIX lp's start @ 1, we want a 0 index
2086 + offset = ((j * (volume_group->pe_size)) + AIXpvh->psn_part1);
2088 + LOG_DEBUG(" PE Map: le_number:%d partition:%p lv_index:%d lv_name:%s\n",
2089 + le_number, partition, AIXppent->lv_index,
2090 + volume_group->volume_list[AIXppent->lv_index -1]->name);
2092 + if (!volume_group->volume_list[AIXppent->lv_index - 1]) {
2093 + LOG_SERIOUS("Failed attempt to access volume without memory allocation lv:%d\n",
2094 + AIXppent->lv_index - 1);
2098 + if (volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map
2099 + && le_number <= volume_group->volume_list[AIXppent->lv_index - 1]->num_le) {
2101 + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map[le_number].owning_pv = partition;
2102 + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map[le_number].pe_sector_offset = offset;
2103 + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map[le_number].pp_state = AIXppent->pp_state;
2106 + if (volume_group->volume_list[AIXppent->lv_index -1]->mirror_copies >
2107 + AIX_DEFAULT_MIRRORING) {
2109 + LOG_EXTRA(" PE Map: Mirror found lv:%d -- \n",
2110 + AIXppent->lv_index);
2112 + for (mirror_partition = volume_group->partition_list,
2113 + MirrorFound = FALSE;
2114 + mirror_partition && !MirrorFound;
2115 + mirror_partition = mirror_partition->next) {
2117 + if (mirror_partition->pv_number == AIXppent->fst_alt_vol) {
2119 + offset = (((AIXppent->fst_alt_part - 1) * (volume_group->pe_size)) + AIXpvh->psn_part1);
2121 + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map_mir1[le_number].owning_pv = mirror_partition;
2122 + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map_mir1[le_number].pe_sector_offset = offset;
2123 + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map_mir1[le_number].pp_state = AIXppent->pp_state;
2125 + LOG_EXTRA(" PE Map: mirror_partition:%p \n",
2126 + mirror_partition);
2127 + LOG_EXTRA(" PE Map: mirror_sector_offet:%d\n",
2128 + AIXppent->fst_alt_part);
2130 + MirrorFound = TRUE;
2134 + if (volume_group->volume_list[AIXppent->lv_index -1]->mirror_copies == AIX_MAX_MIRRORS) {
2136 + for (mirror_partition = volume_group->partition_list,
2137 + MirrorFound = FALSE;
2138 + mirror_partition && !MirrorFound;
2139 + mirror_partition = mirror_partition->next) {
2141 + if (mirror_partition->pv_number == AIXppent->snd_alt_vol) {
2143 + offset = (((AIXppent->snd_alt_part - 1) * (volume_group->pe_size)) + AIXpvh->psn_part1);
2145 + volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].owning_pv = mirror_partition;
2146 + volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].pe_sector_offset = offset;
2147 + volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].pp_state = AIXppent->pp_state;
2149 + LOG_EXTRA(" PE Map: mirror_partition2:%p \n",
2150 + mirror_partition);
2151 + LOG_EXTRA(" PE Map: mirror_sector_offet2:%d\n",
2152 + AIXppent->snd_alt_part);
2154 + MirrorFound = TRUE;
2159 + } // End of if mirroring is enabled
2163 +// LOG_EXTRA(" PE Map: PE maps:%d Mirror count:%d -- \n", lvs, mirs);
2165 +#ifdef EVMS_DEBUG_MIRRORS
2166 + for (mirs = 0, lv_found = 0, all_lvs_found = FALSE;
2167 + !all_lvs_found && mirs < LVM_MAXLVS; mirs++) {
2169 + if (volume_group->volume_list[mirs] != NULL) {
2170 + if (volume_group->volume_list[mirs]->lv_status ==
2175 + LOG_DEBUG(" PE Map: owning part lv %d -- %p\n",
2177 + volume_group->volume_list[mirs]->
2178 + le_to_pe_map[0].owning_pv);
2179 + if (volume_group->volume_list[mirs]->
2180 + mirror_copies > AIX_DEFAULT_MIRRORING) {
2181 + LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n",
2183 + volume_group->volume_list[mirs]->
2184 + le_to_pe_map_mir1[0].owning_pv);
2186 + if (volume_group->volume_list[mirs]->
2187 + mirror_copies == AIX_MAX_MIRRORS) {
2188 + LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n",
2190 + volume_group->volume_list[mirs]->
2191 + le_to_pe_map_mir2[0].owning_pv);
2194 + if (lv_found == volume_group->AIXvgh->numlvs) {
2195 + all_lvs_found = TRUE;
2196 + LOG_DEBUG(" PE Map: all_lvs_found\n");
2203 + kfree(AIXppent_buff);
2209 + * Function: check_log_volume_and_pe_maps
2211 + * Make sure all volumes in this group have valid LE-to-PE maps.
2212 + * Any volume that doesn't is deleted. This is safe for re-discovery
2213 + * because only new volumes could have corrupted PE maps.
2216 +check_log_volume_and_pe_maps(struct aix_volume_group *group)
2218 + struct aix_logical_volume *volume;
2219 + int i, j, lv_found, all_lvs_found;
2221 + LOG_DEBUG(" check_pe_map.\n");
2223 + for (i = 0, all_lvs_found = FALSE, lv_found = 0;
2224 + !all_lvs_found && i < LVM_MAXLVS; i++) {
2225 + if (!group->volume_list[i]) {
2226 + LOG_DEBUG(" CPEM No Volume %d found \n", i);
2230 + volume = group->volume_list[i];
2231 + if (!volume->le_to_pe_map) {
2232 + LOG_DEBUG(" CPEM Volume %s has no PE map.\n",
2234 + delete_logical_volume(volume);
2238 + LOG_DEBUG(" CPEM volume %s num_le: %d \n", volume->name,
2243 + if (lv_found == group->AIXvgh->numlvs) {
2244 + all_lvs_found = TRUE;
2247 + for (j = 0; j < volume->num_le; j++) {
2248 + if (!volume->le_to_pe_map[j].owning_pv ||
2249 + !volume->le_to_pe_map[j].pe_sector_offset) {
2250 + LOG_SERIOUS(" CPEM Volume (%s) incomplete PE map (LE %d) \n",
2252 + volume->lv_access |= EVMS_LV_INCOMPLETE;
2255 + if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
2256 + if (!volume->le_to_pe_map_mir1[j].owning_pv ||
2257 + !volume->le_to_pe_map_mir1[j].
2258 + pe_sector_offset) {
2259 + LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 1 (LE %d) \n",
2261 + volume->lv_access |= EVMS_LV_INCOMPLETE;
2264 + if (volume->mirror_copies == AIX_MAX_MIRRORS) {
2265 + if (!volume->le_to_pe_map_mir2[j].
2267 + || !volume->le_to_pe_map_mir2[j].
2268 + pe_sector_offset) {
2269 + LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 2 (LE %d) \n",
2271 + volume->lv_access |= EVMS_LV_INCOMPLETE;
2278 + LOG_EXTRA(" Leaving check_pe_map.\n");
2283 + * Function: export_volumes
2285 + * The last thing this VGE must do is take each constructed volume and
2286 + * place it back on the evms logical partition list.
2289 +export_volumes(struct evms_logical_node **evms_partition_list)
2291 + struct aix_volume_group *AIXVGLPtr;
2292 + struct evms_logical_node *new_node;
2293 + struct aix_logical_volume *volume;
2294 + int j, lv_found, all_lvs_found;
2297 + for (AIXVGLPtr = AIXVolumeGroupList; AIXVGLPtr; AIXVGLPtr = AIXVGLPtr->next) {
2299 + if (!(AIXVGLPtr->flags & AIX_VG_DIRTY)) {
2300 + LOG_DEBUG(" EV Existing group(%d), not dirty, skipping\n",
2301 + AIXVGLPtr->vg_id.word2);
2304 + LOG_DEBUG(" Exporting all new volumes numpvs:%d numlvs:%d \n",
2305 + AIXVGLPtr->AIXvgh->numpvs, AIXVGLPtr->numlvs);
2307 + // Export every valid volume in the group. For re-discovery,
2308 + // make sure we are only exporting "new" volumes.
2310 + for (j = 0, all_lvs_found = FALSE, lv_found = 0;
2311 + !all_lvs_found && j < LVM_MAXLVS; j++) {
2312 + if (AIXVGLPtr->volume_list[j] != NULL) {
2313 + if (AIXVGLPtr->volume_list[j]->lv_access & EVMS_LV_NEW) {
2315 + LOG_DEBUG(" EV Checking LV:[%d] volume:%p\n",
2316 + j,AIXVGLPtr->volume_list[j]);
2318 + volume = AIXVGLPtr->volume_list[j];
2321 + if (lv_found == AIXVGLPtr->AIXvgh->numlvs) {
2322 + all_lvs_found = TRUE;
2324 + // For new volumes, create a new EVMS node and
2325 + // initialize the appropriate fields.
2326 + if (evms_cs_allocate_logical_node(&new_node)) {
2327 + LOG_DEBUG(" Export Vol Error allocating node !!\n");
2330 + LOG_DEBUG(" EV Node allocated OK\n");
2333 +// volume->new_volume = 0;
2334 + volume->volume_node = new_node;
2335 + volume->lv_access &= (~EVMS_LV_NEW);
2336 + new_node->hardsector_size = AIXVGLPtr->hard_sect_size;
2337 + new_node->block_size = AIXVGLPtr->block_size;
2338 + new_node->plugin = &plugin_header;
2339 + new_node->private = volume;
2340 + new_node->total_vsectors = volume->lv_size;
2342 + LOG_DEBUG(" EV volume->name:[%s]\n",
2345 + strncpy(new_node->name,volume->name,
2346 + EVMS_VOLUME_NAME_SIZE + 1);
2348 + // Is the volume read-only?
2349 + if (!(volume->lv_access & AIX_LV_WRITE)
2350 + || volume->lv_access & EVMS_LV_INCOMPLETE)
2352 + new_node->flags |= EVMS_VOLUME_SET_READ_ONLY;
2353 + LOG_DEBUG(" EV Read Only volume->lv_access:%d\n",
2354 + volume->lv_access);
2357 + evms_cs_add_logical_node_to_list(evms_partition_list,
2361 + LOG_DEBUG(" Exporting LVM volume %p new_node:%p ESD->volume_name[%s]\n",
2362 + volume, new_node,new_node->name);
2364 + evms_cs_add_logical_node_to_list(evms_partition_list,
2365 + AIXVGLPtr->volume_list[j]->volume_node);
2367 + LOG_DEBUG(" ELV vol_list[%d]%p\n", j,
2368 + AIXVGLPtr->volume_list[j]);
2371 + LOG_DEBUG(" EV Checking LV:[%d] == NULL\n",j);
2373 + } // end checking all lvs
2375 + AIXVGLPtr->flags &= ~AIX_VG_DIRTY;
2382 + * Function: delete_logical_volume
2384 + * This function deletes the in-memory representation of a single LVM
2385 + * logical volume, including its PE map and any snapshot data. It does
2386 + * not alter the parent volume group, except to remove this volume from
2387 + * its volume list.
2390 +delete_logical_volume(struct aix_logical_volume *volume)
2392 + struct aix_volume_group *group = volume->group;
2394 + LOG_DEBUG(" Deleting volume %s\n", volume->name);
2396 + // Now free up all the memory. This includes the LE-to-PE map, any
2397 + // mirror PEs, etc.
2398 + if (volume->le_to_pe_map) {
2399 + kfree(volume->le_to_pe_map);
2400 + volume->le_to_pe_map = NULL;
2403 + if (volume->le_to_pe_map_mir1) {
2404 + kfree(volume->le_to_pe_map_mir1);
2405 + volume->le_to_pe_map_mir1 = NULL;
2408 + if (volume->le_to_pe_map_mir2) {
2409 + kfree(volume->le_to_pe_map_mir2);
2410 + volume->le_to_pe_map_mir2 = NULL;
2412 + // Remove this volume from the volume-group's list.
2413 + if (group && group->volume_list[volume->lv_number] == volume) {
2414 + group->volume_list[volume->lv_number] = NULL;
2423 +/* Function: remove_group_from_list
2425 + * Remove an LVM volume group from the global LVM list.
2428 +remove_group_from_list(struct aix_volume_group *group)
2430 + struct aix_volume_group **p_group;
2432 + for (p_group = &AIXVolumeGroupList; *p_group;
2433 + p_group = &(*p_group)->next) {
2434 + if (*p_group == group) {
2435 + *p_group = (*p_group)->next;
2436 + group->next = NULL;
2444 + * Function: delete_aix_node
2446 + * This function deletes the in-memory representation of an LVM
2447 + * logical volume. Right now it makes a lot of assumptions about
2448 + * the data in the group not being corrupted. It would be possible
2449 + * to put in a lot of consistency checks before deleting everything
2450 + * to indicate if problems have occurred during the lifetime of the
2451 + * volume and its volume group.
2454 +delete_aix_node(struct evms_logical_node *logical_node)
2456 + struct aix_logical_volume *volume =
2457 + (struct aix_logical_volume *) (logical_node->private);
2458 + struct aix_volume_group *group = volume->group;
2460 + if (delete_logical_volume(volume)) {
2463 + // If we just removed the last volume from this group, the entire group
2464 + // can also be deleted.
2465 + if (group && group->numlvs == 0) {
2466 + remove_group_from_list(group);
2467 + deallocate_volume_group(group);
2469 + // Free the logical node.
2470 + evms_cs_deallocate_logical_node(logical_node);
2475 +/* Function: deallocate_volume_group
2477 + * This function deletes the entire in-memory representation of an LVM
2478 + * volume group, including all partitions and logical volumes. If this
2479 + * group is on the VGE's volume group list, it is removed.
2482 +deallocate_volume_group(struct aix_volume_group *group)
2484 + struct partition_list_entry *partition;
2485 + struct partition_list_entry *next_part;
2488 + LOG_DEBUG(" Deleting volume group %x\n", group->vg_id.word2);
2490 + // Delete all partitions from the group's list.
2491 + for (partition = group->partition_list; partition;
2492 + partition = next_part) {
2494 + next_part = partition->next;
2496 + if (partition->logical_node) {
2497 + // Send a delete command down to the partition manager.
2498 + LOG_DEBUG(" Deleting PV %d from group %x\n",
2499 + partition->pv_number, group->vg_id.word2);
2500 + DELETE(partition->logical_node);
2505 + // Delete all logical volumes, and the array of pointers.
2506 + for (i = 0; i < LVM_MAXLVS; i++) {
2507 + if (group->volume_list[i]) {
2508 + delete_logical_volume(group->volume_list[i]);
2517 +/* Function: end_discover_aix
2519 + * The discovery process at the region-manager level is now iterative,
2520 + * much like the EVMS feature level. To accomplish this correctly, and
2521 + * also to accomplish partial volume discovery, a second discover
2522 + * entry point is needed, so EVMS can tell the region managers that
2523 + * discovery is over, and to finish up any discovery that is not yet
2524 + * complete. When this function is called, it should be assumed that
2525 + * the node list has had nothing new added to it since the last call
2526 + * of the regular discover function. Therefore, when this function is
2527 + * called, we do not need to try to discover any additional volume
2528 + * groups. We will, however, look for logical volumes once more. This
2529 + * gives us the ability to export (read-only) volumes that have
2530 + * partially corrupted LE maps due to missing PVs in their VG.
2533 +end_discover_aix(struct evms_logical_node **evms_logical_disk_head)
2538 + MOD_INC_USE_COUNT;
2539 + LOG_DEBUG("Final Discovery:\n");
2541 + rc = discover_logical_volumes();
2544 + rc = export_volumes(evms_logical_disk_head);
2549 + MOD_DEC_USE_COUNT;
2553 +/****************************************************
2554 +* Function: AIX_alloc_wbh
2556 +* Alloc any buffer heads from the pool and return a linked list
2559 +*****************************************************/
2560 +static struct aix_mirror_bh *
2561 +AIX_alloc_wbh(struct evms_logical_node *node,
2562 + struct evms_logical_node *node2,
2563 + struct evms_logical_node *node3,
2564 + struct buffer_head *bh,
2565 + u32 mirror_copies, u32 le, u64 new_sector2, u64 new_sector3)
2567 + struct aix_mirror_bh *tmp_bh = NULL, *head_bh = NULL;
2570 + head_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2573 + LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",
2578 + head_bh->master_bh = bh;
2579 + head_bh->mirror_bh_list = NULL;
2580 + head_bh->remaining = (atomic_t) ATOMIC_INIT(0);
2582 + for (i = AIX_DEFAULT_MIRRORING; i <= mirror_copies; i++) {
2585 + evms_cs_allocate_from_pool(AIX_BH_list_pool,
2588 + LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",
2593 + tmp_bh->next_r1 = head_bh->mirror_bh_list;
2594 + head_bh->mirror_bh_list = tmp_bh;
2595 + atomic_inc(&head_bh->remaining);
2597 + memcpy(&tmp_bh->bh_req, bh, sizeof (struct buffer_head));
2598 + tmp_bh->remaining = (atomic_t) ATOMIC_INIT(0);
2599 + init_waitqueue_head(&tmp_bh->bh_req.b_wait);
2600 + //tmp_bh->bh_req.b_size = bh->b_size;
2604 + case AIX_DEFAULT_MIRRORING:
2605 + tmp_bh->node = node;
2606 + tmp_bh->bh_req.b_rsector = bh->b_rsector;
2609 + case AIX_FIRST_MIRROR:
2610 + tmp_bh->node = node2;
2611 + tmp_bh->bh_req.b_rsector = new_sector2;
2614 + case AIX_MAX_MIRRORS:
2615 + tmp_bh->node = node3;
2616 + tmp_bh->bh_req.b_rsector = new_sector3;
2620 + tmp_bh->bh_req.b_end_io = AIX_handle_write_mirror_drives; //setup callback routine
2621 + tmp_bh->bh_req.b_private = (void *) head_bh;
2629 +/****************************************************
2630 +* Function: AIX_handle_write_mirror_drives
2632 +* Handles a write from a set of mirrored AIX LVs
2636 +*****************************************************/
2638 +AIX_handle_write_mirror_drives(struct buffer_head *bh, int uptodate)
2640 + struct aix_logical_volume *volume;
2641 + struct evms_logical_node *node;
2642 + struct aix_mirror_bh *tmp_bh = NULL, *tmp_bh2 = NULL;
2643 + kdev_t tmp_b_rdev;
2644 + u32 count, le = 0;
2646 + tmp_bh = (struct aix_mirror_bh *) bh->b_private;
2647 + tmp_b_rdev = tmp_bh->master_bh->b_rdev;
2648 + node = tmp_bh->node;
2649 + volume = (struct aix_logical_volume *) node->private;
2651 + LOG_DEBUG("AHWMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n",
2652 + node, bh->b_state, uptodate, volume->mirror_copies);
2657 + switch (tmp_bh->iteration) {
2658 + case AIX_DEFAULT_MIRRORING:
2659 + volume->le_to_pe_map[le].pp_state += AIX_LVM_LVSTALE;
2662 + case AIX_FIRST_MIRROR:
2663 + volume->le_to_pe_map_mir1[le].pp_state +=
2667 + case AIX_MAX_MIRRORS:
2668 + volume->le_to_pe_map_mir2[le].pp_state +=
2673 + AIX_evms_cs_notify_lv_io_error(node);
2676 + if (atomic_dec_and_test(&tmp_bh->remaining)) {
2677 + tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate);
2678 + tmp_bh2 = tmp_bh->mirror_bh_list;
2679 + evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
2682 + tmp_bh = tmp_bh2->next_r1;
2683 + evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh2);
2687 + evms_cs_volume_request_in_progress(tmp_b_rdev,
2688 + AIX_DECREMENT_REQUEST,
2695 +/****************************************************
2696 +* Function: AIX_alloc_rbh
2698 +* Alloc any buffer heads from the pool and return a linked list
2701 +*****************************************************/
2702 +static struct aix_mirror_bh *
2703 +AIX_alloc_rbh(struct evms_logical_node *node,
2704 + struct buffer_head *bh,
2705 + u32 mirror_copies, u32 le, u64 org_sector, int cmd)
2707 + struct aix_mirror_bh *tmp_bh = NULL;
2709 + tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2713 + ("Unable to allocate memory for mirror pool line:%d\n",
2718 + memcpy(&tmp_bh->bh_req, bh, sizeof (struct buffer_head));
2719 + tmp_bh->remaining = (atomic_t) ATOMIC_INIT(0);
2720 + tmp_bh->node = node;
2721 + tmp_bh->master_bh = bh;
2722 + tmp_bh->iteration = AIX_FIRST_MIRROR;
2723 + //tmp_bh->eio.rsector = eio->rsector;
2724 + //tmp_bh->eio.rsize = eio->rsize;
2726 + //tmp_bh->eio.bh = &tmp_bh->bh_req;
2728 + if (cmd == AIX_LV_READ) {
2729 + tmp_bh->bh_req.b_end_io = AIX_handle_read_mirror_drives; //setup callback routine
2731 + tmp_bh->bh_req.b_end_io = AIX_sync_mirrored_partitions; //setup callback routine
2734 + tmp_bh->bh_req.b_private = (void *) tmp_bh;
2736 + tmp_bh->cmd = cmd;
2737 + tmp_bh->next_r1 = NULL;
2738 + tmp_bh->node = node;
2744 +/****************************************************
2745 +* Function: AIX_reschedule_retry
2747 +* reschedule a read of one of our mirror copies
2750 +*****************************************************/
2752 +AIX_reschedule_retry(struct aix_mirror_bh *aix_bh)
2754 + unsigned long flags;
2756 + spin_lock_irqsave(&AIX_retry_list_lock, flags);
2757 + if (AIX_retry_list == NULL)
2758 + AIX_retry_tail = &AIX_retry_list;
2759 + *AIX_retry_tail = aix_bh;
2760 + AIX_retry_tail = &aix_bh->next_r1;
2761 + aix_bh->next_r1 = NULL;
2762 + spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2763 + evms_cs_wakeup_thread(AIX_mirror_read_retry_thread);
2766 +/****************************************************
2767 +* Function: AIX_handle_read_mirror_drives
2769 +* Handles a read from a set of mirrored AIX LVs
2773 +*****************************************************/
2775 +AIX_handle_read_mirror_drives(struct buffer_head *bh, int uptodate)
2777 + struct aix_logical_volume *volume;
2778 + struct evms_logical_node *node;
2779 + struct aix_mirror_bh *tmp_bh;
2780 + kdev_t tmp_b_rdev;
2781 + u32 count, le = 0;
2783 + tmp_bh = (struct aix_mirror_bh *) bh->b_private;
2784 + tmp_b_rdev = tmp_bh->master_bh->b_rdev;
2785 + volume = (struct aix_logical_volume *) tmp_bh->node->private;
2786 + node = tmp_bh->node;
2789 + LOG_DEBUG("AHRMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n",
2790 + node, bh->b_state, uptodate, volume->mirror_copies);
2792 + switch (tmp_bh->iteration) {
2793 + case AIX_DEFAULT_MIRRORING:
2794 + count = volume->le_to_pe_map[le].pp_state;
2797 + case AIX_FIRST_MIRROR:
2798 + count = volume->le_to_pe_map[le].pp_state;
2801 + case AIX_MAX_MIRRORS:
2802 + count = volume->le_to_pe_map[le].pp_state;
2806 + if (count == (AIX_LVM_LVSTALE + AIX_LVM_LVDEFINED)) {
2811 + if (!uptodate && tmp_bh->iteration < volume->mirror_copies) {
2812 + AIX_evms_cs_notify_lv_io_error(node);
2813 + AIX_reschedule_retry(tmp_bh);
2815 + tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate);
2816 + evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
2817 + evms_cs_volume_request_in_progress(tmp_b_rdev,
2818 + AIX_DECREMENT_REQUEST,
2826 +/****************************************************
2827 +* This is a temporary function until a common EVMS
2828 +* notification function can be created.
2830 +*****************************************************/
2832 +AIX_evms_cs_notify_lv_io_error(struct evms_logical_node *node)
2834 + struct aix_logical_volume *volume;
2836 + volume = (struct aix_logical_volume *) node->private;
2838 + LOG_CRITICAL("Notify_ERROR !! node:%p volume->lv_status:%d volume->name:[%s]\n",
2839 + node, volume->lv_status, volume->name);
2844 +/* Function: lvm_cleanup
2846 + * This function runs through the entire lvm data structure, removing
2847 + * all items that are not needed at runtime. Currently, this is just the
2848 + * vg_disk_t structure and the pv_disk_t structure for each PV. Also, any
2849 + * groups that don't contain any volumes are deleted. All of the other
2850 + * volume_group, logical_volume and evms_logical_node structures will be
2851 + * kept around at run-time.
2856 + struct aix_volume_group *group;
2858 + group = AIXVolumeGroupList;
2862 + if (group->AIXvgh) {
2863 + kfree(group->AIXvgh);
2864 + group->AIXvgh = NULL;
2867 + group = group->next;
2873 +/****************************************************
2874 +* Function: AIX_copy_header_info
2876 +* Copy the disk header info into the volume struct
2877 +* so we can use it later.
2881 +*****************************************************/
2883 +AIX_copy_header_info(struct vg_header *AIXvgh, struct vg_header *AIXvgh2)
2886 + LOG_DEBUG("CHI AIXvgh:%p AIXvgh2:%p\n", AIXvgh, AIXvgh2);
2890 + AIXvgh->vg_timestamp.tv_sec = AIXvgh2->vg_timestamp.tv_sec;
2891 + AIXvgh->vg_timestamp.tv_nsec = AIXvgh2->vg_timestamp.tv_nsec;
2892 + AIXvgh->vg_id.word1 = AIXvgh2->vg_id.word1;
2893 + AIXvgh->vg_id.word2 = AIXvgh2->vg_id.word2;
2894 + AIXvgh->vg_id.word3 = AIXvgh2->vg_id.word3;
2895 + AIXvgh->vg_id.word4 = AIXvgh2->vg_id.word4;
2896 + AIXvgh->numlvs = AIXvgh2->numlvs;
2897 + AIXvgh->maxlvs = AIXvgh2->maxlvs;
2898 + AIXvgh->pp_size = AIXvgh2->pp_size;
2899 + AIXvgh->numpvs = AIXvgh2->numpvs;
2900 + AIXvgh->total_vgdas = AIXvgh2->total_vgdas;
2901 + AIXvgh->vgda_size = AIXvgh2->vgda_size;
2902 + AIXvgh->bigvg = AIXvgh2->bigvg;
2903 + AIXvgh->quorum = AIXvgh2->quorum;
2904 + AIXvgh->auto_varyon = AIXvgh2->auto_varyon;
2905 + AIXvgh->checksum = AIXvgh2->checksum;
2906 + AIXvgh->bigda_size = AIXvgh2->bigda_size;
2912 + LOG_DEBUG("Returning CHI AIXvgh:%p AIXvgh2:%p\n", AIXvgh, AIXvgh2);
2917 +/****************************************************
2918 +* Function: AIX_free_headers
2924 +*****************************************************/
2926 +AIX_free_headers(struct vg_header *AIXvgh, struct vg_header *AIXvgh2,
2927 + struct vg_trailer *AIXvgt, struct vg_trailer *AIXvgt2)
2952 +/****************************************************
2955 +* This is a kernel thread that handles read of mirrors
2956 +* This shouldn't ever run on a non-mirrored LV read
2959 +*****************************************************/
2963 + struct aix_mirror_bh *r1_bh;
2964 + struct evms_logical_node *node;
2965 + unsigned long flags;
2969 + spin_lock_irqsave(&AIX_retry_list_lock, flags);
2970 + if (AIX_retry_list == NULL) {
2971 + spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2974 + r1_bh = AIX_retry_list;
2975 + AIX_retry_list = r1_bh->next_r1;
2976 + spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2977 + r1_bh->next_r1 = NULL; // for mark
2979 + switch (r1_bh->cmd) {
2982 + r1_bh->iteration++;
2983 + LOG_DEBUG("Report from thread AIXiod READ\n");
2985 + if (r1_bh->iteration == AIX_FIRST_MIRROR) {
2986 + node = r1_bh->mir_node1;
2987 + r1_bh->bh_req.b_rsector = r1_bh->mir_sector1;
2989 + node = r1_bh->mir_node2;
2990 + r1_bh->bh_req.b_rsector = r1_bh->mir_sector2;
2993 + R_IO(node, &r1_bh->bh_req);
2998 + LOG_DEBUG("AIXiod unknown cmd passed to thread:%d\n",
3007 +/****************************************************
3008 +* Function: AIX_schedule_resync
3010 +* schedule a resync of one of our lv mirror copies
3013 +*****************************************************/
3015 +AIX_schedule_resync(struct aix_logical_volume *resync_volume, int force)
3017 + unsigned long flags;
3019 + LOG_DEBUG("Function %s volume: %s \n", __FUNCTION__,
3020 + resync_volume->name);
3022 + spin_lock_irqsave(&AIX_resync_list_lock, flags);
3024 + if (!AIX_resync_list) {
3026 + kmalloc(sizeof (struct aix_resync_struct), GFP_ATOMIC);
3027 + if (!AIX_resync_list) {
3030 + memset(AIX_resync_list, 0, sizeof (struct aix_resync_struct));
3033 + AIX_resync_list->resync_vol = resync_volume;
3034 + AIX_resync_list->next_resync_vol = NULL;
3036 + spin_unlock_irqrestore(&AIX_resync_list_lock, flags);
3037 + evms_cs_wakeup_thread(AIX_mirror_resync_thread);
3040 +/****************************************************
3041 +* Function: AIXresync
3043 +* This is a kernel thread that handles resync of mirrors
3044 +* This shouldn't ever run on a non-mirrored LV
3047 +*****************************************************/
3049 +AIXresync(void *data)
3052 + struct aix_logical_volume *volume = NULL;
3053 + int force = FALSE; // Currently we don't force a resync of non-stale pe's
3055 + if (AIX_resync_list == NULL) {
3056 + LOG_ERROR("No Volumes on list to resync\n");
3060 + volume = AIX_resync_list->resync_vol;
3061 + LOG_DEBUG("Function %s volume: %s \n", __FUNCTION__, volume->name);
3064 + LOG_ERROR("Invalid volume passed to sync\n");
3068 + if (AIXResyncInProgress) {
3069 + LOG_ERROR("Unable to resync multiple LVs concurrently %s\n",
3074 + if (volume->mirror_copies == AIX_DEFAULT_MIRRORING) {
3075 + LOG_ERROR("Unable to resync non-mirrored LV %s \n",
3080 + AIXResyncInProgress = TRUE;
3082 + AIX_resync_lv_mirrors(volume, force);
3087 +/****************************************************
3088 +* Function: AIX_resync_lv_mirrors
3094 +*****************************************************/
3096 +AIX_resync_lv_mirrors(struct aix_logical_volume *volume, int force)
3100 + char pp_stale = FALSE;
3102 + struct partition_list_entry *master_part = NULL;
3103 + struct partition_list_entry *slave1_part = NULL;
3104 + struct partition_list_entry *slave2_part = NULL;
3106 + u64 master_offset = 0;
3107 + u64 slave1_offset = 0;
3108 + u64 slave2_offset = 0;
3110 + LOG_DEBUG("Function %s volume: %s \n", __FUNCTION__, volume->name);
3112 + for (i = 0; i < volume->num_le; i++, pp_stale = FALSE) {
3114 + // We need to see which mirror has a valid non-stale copy.
3115 + // The first non-stale copy will be our master and we'll
3116 + // copy to the slave(s).
3118 + if ((volume->le_to_pe_map[i].pp_state & AIX_LVM_LVSTALE)) {
3122 + if (volume->le_to_pe_map_mir1 != NULL) {
3123 + if ((volume->le_to_pe_map_mir1[i].
3124 + pp_state & AIX_LVM_LVSTALE)) {
3129 + if (volume->le_to_pe_map_mir2 != NULL) {
3130 + if ((volume->le_to_pe_map_mir2[i].
3131 + pp_state & AIX_LVM_LVSTALE)) {
3136 + LOG_DEBUG("Function %s pp_stale:%d force:%d \n", __FUNCTION__,
3139 + if (pp_stale || force) {
3140 + if (!(volume->le_to_pe_map[i].pp_state & AIX_LVM_LVSTALE)) {
3142 + master_part = volume->le_to_pe_map[i].owning_pv;
3143 + master_offset = volume->le_to_pe_map[i].pe_sector_offset;
3145 + if (volume->le_to_pe_map_mir1 != NULL) {
3146 + slave1_part = volume->le_to_pe_map_mir1[i].owning_pv;
3147 + slave1_offset = volume->le_to_pe_map_mir1[i].pe_sector_offset;
3150 + if (volume->le_to_pe_map_mir2 != NULL) {
3151 + slave2_part = volume->le_to_pe_map_mir2[i].owning_pv;
3152 + slave2_offset = volume->le_to_pe_map_mir2[i].pe_sector_offset;
3155 + if (!(volume->le_to_pe_map_mir1[i].pp_state & AIX_LVM_LVSTALE)) {
3156 + master_part = volume->le_to_pe_map_mir1[i].owning_pv;
3157 + master_offset = volume->le_to_pe_map_mir1[i].pe_sector_offset;
3159 + if (volume->le_to_pe_map != NULL) {
3160 + slave1_part = volume->le_to_pe_map[i].owning_pv;
3161 + slave1_offset = volume->le_to_pe_map[i].pe_sector_offset;
3164 + if (volume->le_to_pe_map_mir2 != NULL) {
3165 + slave2_part = volume->le_to_pe_map_mir2[i].owning_pv;
3166 + slave2_offset = volume->le_to_pe_map_mir2[i].pe_sector_offset;
3169 + if (!(volume->le_to_pe_map_mir2[i].pp_state & AIX_LVM_LVSTALE)) {
3170 + master_part = volume->le_to_pe_map_mir2[i].owning_pv;
3171 + master_offset = volume->le_to_pe_map_mir2[i].pe_sector_offset;
3173 + if (volume->le_to_pe_map != NULL) {
3174 + slave1_part = volume->le_to_pe_map[i].owning_pv;
3175 + slave1_offset = volume->le_to_pe_map[i].pe_sector_offset;
3178 + if (volume->le_to_pe_map_mir1 != NULL) {
3179 + slave2_part = volume->le_to_pe_map_mir1[i].owning_pv;
3180 + slave2_offset = volume->le_to_pe_map_mir1[i].pe_sector_offset;
3184 + if (AIX_copy_on_read(volume, master_part, slave1_part, slave2_part,
3185 + master_offset, slave1_offset, slave2_offset,
3186 + volume->pe_size, i)) {
3188 + LOG_CRITICAL("ReSync of logical Volume %s FAILED !!\n",
3190 + AIX_evms_cs_notify_lv_io_error(volume->
3202 +/****************************************************
3203 +* Function: AIX_copy_on_read
3209 +*****************************************************/
3211 +AIX_copy_on_read(struct aix_logical_volume *volume,
3212 + struct partition_list_entry *master_part,
3213 + struct partition_list_entry *slave1_part,
3214 + struct partition_list_entry *slave2_part,
3215 + u64 master_offset,
3216 + u64 slave1_offset, u64 slave2_offset, u32 pe_size, int le)
3218 + unsigned long flags;
3219 + struct aix_mirror_bh *tmp_bh = NULL;
3221 + // Check for valid partitions; we need at least 2 good partitions, so slave2 doesn't have to be valid
3223 + if (!master_part || !slave1_part) {
3224 + LOG_ERROR("Invalid partitions for resync master part:%p slave1_part:%p slave2_part:%p\n",
3225 + master_part, slave1_part, slave2_part);
3229 + LOG_DEBUG("Function %s volume:%s master_part:%d, slave1_part:%d, slave2_part:%d master_offset:"
3230 + PFU64 ", slave1_offset:" PFU64 " slave2_offset:" PFU64 ", \n",
3231 + __FUNCTION__, volume->name, master_part->pv_number,
3232 + slave1_part->pv_number, slave2_part->pv_number, master_offset,
3233 + slave1_offset, slave2_offset);
3235 + LOG_DEBUG("pe_size:%d le:%d\n", pe_size, le);
3238 + AIX_alloc_sbh(volume, master_part, slave1_part, slave2_part,
3239 + master_offset, slave1_offset, slave2_offset, pe_size);
3242 + buffer_IO_error(&tmp_bh->bh_req);
3246 +/* if (evms_cs_volume_request_in_progress
3247 + (tmp_bh->bh_req.b_rdev, AIX_INCREMENT_REQUEST, &count)) {
3248 + buffer_IO_error(&tmp_bh->bh_req);
3252 + spin_lock_irqsave(&AIX_resync_pp_lock, flags);
3254 + LOG_DEBUG("Function:%s kicking off read node:%p\n", __FUNCTION__,
3255 + master_part->logical_node);
3257 + R_IO(master_part->logical_node, &tmp_bh->bh_req);
3259 + spin_unlock_irqrestore(&AIX_resync_pp_lock, flags);
3264 +/****************************************************
3265 +* Function: AIX_alloc_sbh
3267 +* Alloc any buffer heads from the pool and return a linked list
3270 +*****************************************************/
3271 +static struct aix_mirror_bh *
3272 +AIX_alloc_sbh(struct aix_logical_volume *volume,
3273 + struct partition_list_entry *master_part,
3274 + struct partition_list_entry *slave1_part,
3275 + struct partition_list_entry *slave2_part,
3276 + u64 master_offset,
3277 + u64 slave1_offset, u64 slave2_offset, u32 pe_size)
3279 + struct aix_mirror_bh *tmp_bh = NULL, *head_bh = NULL;
3280 + unsigned long flags;
3282 + LOG_DEBUG("Function:%s Enter\n", __FUNCTION__);
3284 + head_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
3287 + ("Unable to allocate memory for mirror pool line:%d\n",
3291 + // Update buffer so we block on a read/write on the normal IO path
3292 + // if we're trying to sync the same sector on the disk
3293 + // We don't want to block if it's different sectors
3295 + spin_lock_irqsave(&AIX_resync_list_lock, flags);
3297 + AIX_resync_list->master_part = master_part;
3298 + AIX_resync_list->slave1_part = slave1_part;
3299 + AIX_resync_list->slave2_part = slave2_part;
3300 + AIX_resync_list->master_offset = master_offset;
3301 + AIX_resync_list->slave1_offset = slave1_offset;
3302 + AIX_resync_list->slave2_offset = slave2_offset;
3304 + head_bh->bh_req.b_data = kmalloc(AIX_RESYNC_BLOCKSIZE + 1, GFP_NOIO);
3305 + if (!head_bh->bh_req.b_data) {
3306 + evms_cs_deallocate_to_pool(AIX_BH_list_pool, head_bh);
3308 + ("Unable to allocate memory for mirror pool line:%d\n",
3313 + memset(head_bh->bh_req.b_data, 0, AIX_RESYNC_BLOCKSIZE + 1);
3315 + head_bh->remaining = (atomic_t) ATOMIC_INIT(0);
3316 + head_bh->bh_req.b_rsector = master_offset;
3317 + head_bh->bh_req.b_size = AIX_RESYNC_BLOCKSIZE;
3318 + head_bh->sync_flag = AIX_SYNC_INCOMPLETE;
3319 + head_bh->bh_req.b_end_io = AIX_sync_mirrored_partitions;
3320 + head_bh->bh_req.b_page = virt_to_page(head_bh->bh_req.b_data);
3321 + head_bh->bh_req.b_state = 0;
3322 + set_bit(BH_Dirty, &head_bh->bh_req.b_state);
3323 + set_bit(BH_Lock, &head_bh->bh_req.b_state);
3324 + set_bit(BH_Req, &head_bh->bh_req.b_state);
3325 + set_bit(BH_Mapped, &head_bh->bh_req.b_state);
3326 + head_bh->master_bh = NULL;
3327 + head_bh->mirror_bh_list = NULL;
3329 + tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
3332 + ("Unable to allocate memory for mirror pool line:%d\n",
3337 + head_bh->next_r1 = tmp_bh;
3338 + memcpy(&tmp_bh->bh_req, head_bh, sizeof (struct buffer_head));
3339 + tmp_bh->remaining = (atomic_t) ATOMIC_INIT(0);
3340 + tmp_bh->bh_req.b_end_io = NULL;
3342 + if (volume->mirror_copies == AIX_MAX_MIRRORS) {
3344 + evms_cs_allocate_from_pool(AIX_BH_list_pool,
3346 + if (!tmp_bh->next_r1) {
3348 + ("Unable to allocate memory for mirror pool line:%d\n",
3353 + memcpy(&tmp_bh->next_r1->bh_req, head_bh,
3354 + sizeof (struct buffer_head));
3355 + tmp_bh->next_r1->bh_req.b_end_io = NULL;
3356 + tmp_bh->next_r1->remaining = (atomic_t) ATOMIC_INIT(0);
3359 + init_waitqueue_head(&head_bh->bh_req.b_wait);
3361 + spin_unlock_irqrestore(&AIX_resync_list_lock, flags);
3363 + LOG_DEBUG("Function:%s Exit head_bh:%p\n", __FUNCTION__, head_bh);
3368 +/****************************************************
3369 +* Function: AIX_sync_mirrored_partitions
3375 +*****************************************************/
3377 +AIX_sync_mirrored_partitions(struct buffer_head *bh, int uptodate)
3379 + struct aix_logical_volume *volume = NULL;
3380 + struct aix_mirror_bh *tmp_bh, *head_bh;
3382 + head_bh = tmp_bh = (struct aix_mirror_bh *) bh->b_private;
3383 + volume = (struct aix_logical_volume *) tmp_bh->node->private;
3385 + LOG_DEBUG("Function:%s Enter uptodate:%d\n", __FUNCTION__, uptodate);
3389 + AIX_evms_cs_notify_lv_io_error(tmp_bh->node);
3392 + tmp_bh = head_bh->next_r1;
3394 + LOG_DEBUG("Function:%s line:%d write to mirror:%p\n", __FUNCTION__,
3395 + __LINE__, tmp_bh);
3398 + W_IO(tmp_bh->node, &tmp_bh->bh_req);
3399 + AIX_get_set_mirror_offset(tmp_bh, AIX_SLAVE_1,
3400 + AIX_RESYNC_BLOCKSIZE);
3403 + tmp_bh = tmp_bh->next_r1;
3404 + LOG_DEBUG("Function:%s line:%d write to mirror:%p\n", __FUNCTION__,
3405 + __LINE__, tmp_bh);
3408 + W_IO(tmp_bh->node, &tmp_bh->bh_req);
3409 + AIX_get_set_mirror_offset(tmp_bh, AIX_SLAVE_2,
3410 + AIX_RESYNC_BLOCKSIZE);
3413 + LOG_DEBUG("Function:%s line:%d read from master:%p\n", __FUNCTION__,
3414 + __LINE__, head_bh);
3416 + if (head_bh && head_bh->sync_flag) {
3417 + AIX_get_set_mirror_offset(head_bh, AIX_MASTER,
3418 + AIX_RESYNC_BLOCKSIZE);
3419 + if (head_bh->sync_flag == AIX_SYNC_INCOMPLETE) {
3420 + R_IO(head_bh->node, &head_bh->bh_req);
3424 + LOG_DEBUG("Function:%s line:%d head_bh->sync_flag:%d\n", __FUNCTION__,
3425 + __LINE__, head_bh->sync_flag);
3427 + if (!head_bh->sync_flag) {
3429 + head_bh = head_bh->next_r1;
3431 + while (tmp_bh != NULL) {
3432 + evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
3436 + AIXResyncInProgress = FALSE;
3437 +/* evms_cs_volume_request_in_progress(tmp_bh->bh_req.b_rdev,
3438 + AIX_DECREMENT_REQUEST,
3441 + if (AIX_resync_list) {
3442 + kfree(AIX_resync_list);
3449 +/****************************************************
3450 +* Function: AIX_get_set_mirror_offset
3456 +*****************************************************/
3458 +AIX_get_set_mirror_offset(struct aix_mirror_bh *tmp_bh, int index, int offset)
3466 + LOG_DEBUG("Function:%s Enter offset:%d\n", __FUNCTION__, offset);
3468 + tmp_bh->bh_req.b_rsector += tmp_bh->bh_req.b_rsector + offset;
3470 + if (tmp_bh->bh_req.b_rsector > tmp_bh->node->total_vsectors) {
3471 + tmp_bh->sync_flag = AIX_SYNC_COMPLETE;
3474 + // Update buffer so we block on a read/write on the normal IO path
3475 + // if we're trying to sync the same sector on the disk
3476 + // We don't want to block if it's different sectors
3478 + spin_lock_irqsave(&AIX_resync_list_lock, flags);
3480 + if (AIX_resync_list->master_part->logical_node == tmp_bh->node) {
3481 + AIX_resync_list->master_offset += offset;
3484 + if (AIX_resync_list->slave1_part->logical_node == tmp_bh->node) {
3485 + AIX_resync_list->slave1_offset += offset;
3488 + if (AIX_resync_list->slave2_part->logical_node == tmp_bh->node) {
3489 + AIX_resync_list->slave2_offset += offset;
3492 + spin_unlock_irqrestore(&AIX_resync_list_lock, flags);
3498 +static int AIX_pvh_data_posn(u32 vgda_psn, u32 * pvh_posn, struct partition_list_entry *partition, u32 numpvs)
3500 + struct partition_list_entry * pv;
3501 + struct pv_header * AIXpvh;
3506 + LOG_DEBUG("APDP - vgda_psn:%d numpvs:%d \n", vgda_psn, numpvs);
3508 + AIXpvh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
3513 + memset(AIXpvh, 0 , sizeof(struct pv_header));
3515 + // Adjust this because when AIX VGs/Volumes are created on Intel platforms, the
3516 + // pp_count could be anything since we don't give up the entire physical drive.
3517 + // This is for calculation purposes only.
3522 + for (i = 1; i <= numpvs; i++) {
3523 + for (pv = partition; pv->pv_number != i; pv = pv->next );
3525 + LOG_DEBUG("APDP line:%d pp_count:%d \n", __LINE__, AIXpvh->pp_count);
3527 + num_pps = AIXpvh->pp_count;
3528 + num_pps++; // Account for the pv_header on the front
3530 + while ((num_pps * sizeof(struct pp_entries)) % AIX_SECTOR_SIZE) {
3531 + LOG_EXTRA("num_pps:%d \n", num_pps);
3535 + tmp = (num_pps * sizeof(struct pp_entries)) / AIX_SECTOR_SIZE;
3537 + LOG_DEBUG("APDP tmp:%d num_pps:%d \n", tmp,num_pps);
3539 + posn = ((vgda_psn + PSN_PPH_OFFSET) + ((pv->pv_number -1) * tmp));
3541 + pvh_posn[pv->pv_number] = posn;
3543 + if (INIT_IO(pv->logical_node, 0, posn, 1, AIXpvh)) {
3556 +/****************************************************
3557 +* Function: AIX_volume_group_dump
3559 +* This is for debug purposes and will walk the volume group list
3560 +* and LV's within the volume groups
3562 + * It can be called at any time; however, the output to the display is large
3564 +*****************************************************/
3565 +#ifdef EVMS_AIX_DEBUG
3567 +AIX_volume_group_dump(void)
3569 + struct aix_volume_group *AIXVGLDebugPtr;
3570 + struct partition_list_entry *DebugPartitionList;
3571 + struct aix_logical_volume *DebugLVList;
3574 + AIXVGLDebugPtr = AIXVolumeGroupList;
3576 + if (!AIXVGLDebugPtr) {
3577 + LOG_DEBUG("***********************************************\n");
3578 + LOG_DEBUG("ERROR Nothing built in the list to check !!! \n");
3579 + LOG_DEBUG("***********************************************\n");
3583 + LOG_DEBUG("*********************************************** \n");
3584 + LOG_DEBUG("Begin Volume Group Dump \n");
3585 + LOG_DEBUG("*********************************************** \n");
3587 + while (AIXVGLDebugPtr) {
3589 + LOG_DEBUG("vg_number %x\n", AIXVGLDebugPtr->vg_id.word2);
3590 + LOG_DEBUG("numpsrtitions %d\n", AIXVGLDebugPtr->partition_count);
3591 + LOG_DEBUG("numlvs %d\n", AIXVGLDebugPtr->numlvs);
3592 + LOG_DEBUG("hard_sect_size %d\n", AIXVGLDebugPtr->hard_sect_size);
3593 + LOG_DEBUG("block_size %d\n", AIXVGLDebugPtr->block_size);
3594 + LOG_DEBUG("flags %d\n", AIXVGLDebugPtr->flags);
3595 +// LOG_DEBUG("lv_max %d\n", AIXVGLDebugPtr->lv_max);
3596 + LOG_DEBUG("pe_size %d\n", AIXVGLDebugPtr->pe_size);
3597 + LOG_DEBUG("CleanVGInfo %d\n", AIXVGLDebugPtr->CleanVGInfo);
3599 + DebugPartitionList = AIXVGLDebugPtr->partition_list;
3601 + LOG_DEBUG("********* Begin Volume Partition Dump ********* \n");
3603 + if (!DebugPartitionList) {
3604 + LOG_DEBUG("No partitions to check !! \n");
3607 + while (DebugPartitionList) {
3608 + LOG_DEBUG("logical_node %p\n",
3609 + DebugPartitionList->logical_node);
3610 + LOG_DEBUG("pv_number %d\n",
3611 + DebugPartitionList->pv_number);
3612 + LOG_DEBUG("block_size %d\n",
3613 + DebugPartitionList->block_size);
3614 + LOG_DEBUG("hard_sect_size %d\n",
3615 + DebugPartitionList->hard_sect_size);
3616 + LOG_DEBUG("-------------------------------------------------------------\n");
3617 + DebugPartitionList = DebugPartitionList->next;
3620 + LOG_DEBUG("********* End Volume Partition Dump **********\n");
3622 + LOG_DEBUG("********** Begin Logical Volume Partition Dump **********\n");
3624 + DebugLVList = AIXVGLDebugPtr->volume_list[0];
3626 + if (!DebugLVList) {
3627 + LOG_DEBUG("No logical volumes to check !! \n");
3630 + for (i = 0; i < LVM_MAXLVS && DebugLVList; i++) {
3632 + DebugLVList = AIXVGLDebugPtr->volume_list[i];
3634 + if (DebugLVList) {
3635 + LOG_DEBUG("volume_list # %d \n", i);
3636 + LOG_DEBUG("lv_number %d \n",
3637 + DebugLVList->lv_number);
3638 + LOG_DEBUG("LV name %s \n",
3639 + DebugLVList->name);
3640 + LOG_DEBUG("lv_size " PFU64 " \n",
3641 + DebugLVList->lv_size);
3642 + LOG_DEBUG("lv_access %d \n",
3643 + DebugLVList->lv_access);
3644 + LOG_DEBUG("lv_status %d \n",
3645 + DebugLVList->lv_status);
3646 +// LOG_DEBUG("lv_minor %d \n",
3647 +// DebugLVList->lv_minor);
3648 + LOG_DEBUG("mirror_copies %d \n",
3649 + DebugLVList->mirror_copies);
3650 +// LOG_DEBUG("mirror_number %d \n",
3651 +// DebugLVList->mirror_number);
3652 + LOG_DEBUG("stripes %d \n",
3653 + DebugLVList->stripes);
3654 + LOG_DEBUG("stripe_size %d \n",
3655 + DebugLVList->stripe_size);
3656 + LOG_DEBUG("stripe_size_shift%d \n",
3657 + DebugLVList->stripe_size_shift);
3658 + LOG_DEBUG("pe_size %d \n",
3659 + DebugLVList->pe_size);
3660 + LOG_DEBUG("pe_size_shift %d \n",
3661 + DebugLVList->pe_size_shift);
3662 + LOG_DEBUG("num_le %d \n",
3663 + DebugLVList->num_le);
3664 +// LOG_DEBUG("new_volume %d \n",
3665 +// DebugLVList->new_volume);
3666 + LOG_DEBUG("group %p \n",
3667 + DebugLVList->group);
3672 + AIXVGLDebugPtr = AIXVGLDebugPtr->next;
3674 + LOG_DEBUG("********** End Logical Volume Partition Dump **********\n");
3678 + LOG_DEBUG("***********************************************\n");
3679 + LOG_DEBUG("End Volume Group Dump \n");
3680 + LOG_DEBUG("***********************************************\n");
3686 diff -Naur linux-2002-09-30/drivers/evms/Config.in evms-2002-09-30/drivers/evms/Config.in
3687 --- linux-2002-09-30/drivers/evms/Config.in Wed Dec 31 18:00:00 1969
3688 +++ evms-2002-09-30/drivers/evms/Config.in Mon Sep 16 15:55:24 2002
3691 +# Copyright (c) International Business Machines Corp., 2000
3693 +# This program is free software; you can redistribute it and/or modify
3694 +# it under the terms of the GNU General Public License as published by
3695 +# the Free Software Foundation; either version 2 of the License, or
3696 +# (at your option) any later version.
3698 +# This program is distributed in the hope that it will be useful,
3699 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
3700 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
3701 +# the GNU General Public License for more details.
3703 +# You should have received a copy of the GNU General Public License
3704 +# along with this program; if not, write to the Free Software
3705 +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
3708 +# EVMS driver configuration
3711 +mainmenu_option next_comment
3712 +comment 'Enterprise Volume Management System'
3714 +tristate 'EVMS Kernel Runtime' CONFIG_EVMS
3715 +dep_tristate ' EVMS Local Device Manager' CONFIG_EVMS_LOCAL_DEV_MGR $CONFIG_EVMS
3716 +dep_tristate ' EVMS DOS Segment Manager' CONFIG_EVMS_DOS_SEGMENT_MGR $CONFIG_EVMS
3717 +dep_tristate ' EVMS GPT Segment Manager' CONFIG_EVMS_GPT_SEGMENT_MGR $CONFIG_EVMS
3718 +if [ "$CONFIG_ARCH_S390" = "y" ]; then
3719 +dep_tristate ' EVMS S/390 Segment Manager' CONFIG_EVMS_S390_SEGMENT_MGR $CONFIG_EVMS
3721 +dep_tristate ' EVMS SnapShot Feature' CONFIG_EVMS_SNAPSHOT $CONFIG_EVMS
3722 +dep_tristate ' EVMS DriveLink Feature' CONFIG_EVMS_DRIVELINK $CONFIG_EVMS
3723 +dep_tristate ' EVMS Bad Block Relocation (BBR) Feature' CONFIG_EVMS_BBR $CONFIG_EVMS
3724 +dep_tristate ' EVMS Linux LVM Package' CONFIG_EVMS_LVM $CONFIG_EVMS
3725 +dep_tristate ' EVMS Linux MD Package' CONFIG_EVMS_MD $CONFIG_EVMS
3726 +dep_tristate ' EVMS MD Linear (append) mode' CONFIG_EVMS_MD_LINEAR $CONFIG_EVMS_MD
3727 +dep_tristate ' EVMS MD RAID-0 (stripe) mode' CONFIG_EVMS_MD_RAID0 $CONFIG_EVMS_MD
3728 +dep_tristate ' EVMS MD RAID-1 (mirroring) mode' CONFIG_EVMS_MD_RAID1 $CONFIG_EVMS_MD
3729 +dep_tristate ' EVMS MD RAID-4/RAID-5 mode' CONFIG_EVMS_MD_RAID5 $CONFIG_EVMS_MD
3730 +dep_tristate ' EVMS AIX LVM Package' CONFIG_EVMS_AIX $CONFIG_EVMS
3731 +dep_tristate ' EVMS OS/2 LVM Package' CONFIG_EVMS_OS2 $CONFIG_EVMS
3732 +#dep_tristate ' EVMS Clustering Package' CONFIG_EVMS_ECR $CONFIG_EVMS
3734 +if [ "$CONFIG_EVMS" != "n" ]; then
3735 + choice ' EVMS Debug Level' \
3736 + "Critical CONFIG_EVMS_INFO_CRITICAL \
3737 + Serious CONFIG_EVMS_INFO_SERIOUS \
3738 + Error CONFIG_EVMS_INFO_ERROR \
3739 + Warning CONFIG_EVMS_INFO_WARNING \
3740 + Default CONFIG_EVMS_INFO_DEFAULT \
3741 + Details CONFIG_EVMS_INFO_DETAILS \
3742 + Debug CONFIG_EVMS_INFO_DEBUG \
3743 + Extra CONFIG_EVMS_INFO_EXTRA \
3744 + Entry_Exit CONFIG_EVMS_INFO_ENTRY_EXIT \
3745 + Everything CONFIG_EVMS_INFO_EVERYTHING" Default
3750 diff -Naur linux-2002-09-30/drivers/evms/Makefile evms-2002-09-30/drivers/evms/Makefile
3751 --- linux-2002-09-30/drivers/evms/Makefile Wed Dec 31 18:00:00 1969
3752 +++ evms-2002-09-30/drivers/evms/Makefile Mon Sep 16 15:55:24 2002
3755 +# Makefile for the kernel EVMS driver and modules.
3757 +# 08 March 2001, Mark Peloquin <peloquin@us.ibm.com>
3760 +O_TARGET := evmsdrvr.o
3762 +export-objs := evms.o evms_passthru.o ldev_mgr.o dos_part.o lvm_vge.o \
3763 + snapshot.o evms_drivelink.o evms_bbr.o AIXlvm_vge.o \
3764 + os2lvm_vge.o evms_ecr.o md_core.o md_linear.o md_raid0.o \
3765 + md_raid1.o md_raid5.o md_xor.o s390_part.o gpt_part.o
3767 +# Link order is important! Plugins must come first, then the EVMS core.
3769 +obj-$(CONFIG_EVMS_LOCAL_DEV_MGR) += ldev_mgr.o
3770 +obj-$(CONFIG_EVMS_DOS_SEGMENT_MGR) += dos_part.o
3771 +obj-$(CONFIG_EVMS_GPT_SEGMENT_MGR) += gpt_part.o
3772 +obj-$(CONFIG_EVMS_S390_SEGMENT_MGR) += s390_part.o
3773 +obj-$(CONFIG_EVMS_MD) += md_core.o
3774 +obj-$(CONFIG_EVMS_MD_LINEAR) += md_linear.o
3775 +obj-$(CONFIG_EVMS_MD_RAID0) += md_raid0.o
3776 +obj-$(CONFIG_EVMS_MD_RAID1) += md_raid1.o
3777 +obj-$(CONFIG_EVMS_MD_RAID5) += md_raid5.o md_xor.o
3778 +obj-$(CONFIG_EVMS_LVM) += lvm_vge.o
3779 +obj-$(CONFIG_EVMS_AIX) += AIXlvm_vge.o
3780 +obj-$(CONFIG_EVMS_OS2) += os2lvm_vge.o
3781 +obj-$(CONFIG_EVMS_DRIVELINK) += evms_drivelink.o
3782 +obj-$(CONFIG_EVMS_BBR) += evms_bbr.o
3783 +obj-$(CONFIG_EVMS_SNAPSHOT) += snapshot.o
3784 +obj-$(CONFIG_EVMS_ECR) += evms_ecr.o
3785 +obj-$(CONFIG_EVMS) += evms_passthru.o evms.o
3787 +EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEFAULT
3788 +ifeq ($(CONFIG_EVMS_INFO_CRITICAL),y)
3789 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_CRITICAL
3791 +ifeq ($(CONFIG_EVMS_INFO_SERIOUS),y)
3792 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_SERIOUS
3794 +ifeq ($(CONFIG_EVMS_INFO_ERROR),y)
3795 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ERROR
3797 +ifeq ($(CONFIG_EVMS_INFO_WARNING),y)
3798 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_WARNING
3800 +ifeq ($(CONFIG_EVMS_INFO_DETAILS),y)
3801 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DETAILS
3803 +ifeq ($(CONFIG_EVMS_INFO_DEBUG),y)
3804 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEBUG
3806 +ifeq ($(CONFIG_EVMS_INFO_EXTRA),y)
3807 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EXTRA
3809 +ifeq ($(CONFIG_EVMS_INFO_ENTRY_EXIT),y)
3810 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ENTRY_EXIT
3812 +ifeq ($(CONFIG_EVMS_INFO_EVERYTHING),y)
3813 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EVERYTHING
3816 +include $(TOPDIR)/Rules.make
3818 diff -Naur linux-2002-09-30/drivers/evms/dos_part.c evms-2002-09-30/drivers/evms/dos_part.c
3819 --- linux-2002-09-30/drivers/evms/dos_part.c Wed Dec 31 18:00:00 1969
3820 +++ evms-2002-09-30/drivers/evms/dos_part.c Fri Sep 13 16:09:55 2002
3822 +/* -*- linux-c -*- */
3826 + * Copyright (c) International Business Machines Corp., 2000
3828 + * This program is free software; you can redistribute it and/or modify
3829 + * it under the terms of the GNU General Public License as published by
3830 + * the Free Software Foundation; either version 2 of the License, or
3831 + * (at your option) any later version.
3833 + * This program is distributed in the hope that it will be useful,
3834 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
3835 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
3836 + * the GNU General Public License for more details.
3838 + * You should have received a copy of the GNU General Public License
3839 + * along with this program; if not, write to the Free Software
3840 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
3845 + * linux/drivers/evms/dos_part.c
3847 + * EVMS DOS partition manager
3849 + * Partial code extracted from
3851 + * linux/fs/partitions/msdos.c
3855 +#include <linux/config.h>
3856 +#include <linux/module.h>
3857 +#include <linux/kernel.h>
3858 +#include <linux/config.h>
3859 +#include <linux/fs.h>
3860 +#include <linux/genhd.h>
3861 +#include <linux/string.h>
3862 +#include <linux/blk.h>
3863 +#include <linux/init.h>
3864 +#include <linux/iobuf.h> /* for kiobuf stuffs */
3866 +#ifdef CONFIG_BLK_DEV_IDE
3867 +#include <linux/ide.h> /* IDE xlate */
3868 +#endif /* CONFIG_BLK_DEV_IDE */
3870 +#include <linux/evms/evms.h>
3871 +#include <linux/evms/evms_os2.h>
3873 +#include <asm/system.h>
3874 +#include <asm/uaccess.h>
3876 +/* prefix used in logging messages */
3877 +#define LOG_PREFIX "dos_part: "
3879 +/* #include "msdos.h" */
3880 +#define MSDOS_LABEL_MAGIC 0xAA55
3881 +#define GPT_ENTIRE_DISK_INDICATOR 0xEE
3882 +#define GPT_ESP_INDICATOR 0xEF
3885 + * struct mbr_ebr - Skeletal MBR/EBR structure useful for our purposes
3886 + * @unused1: skip IPL record code
3887 + * @partitions: partition table
3888 + * @signature: DOS magic
3890 + * skeletal access to partition table in MBR/EBR
3893 + u8 unused1[0x1be];
3894 + struct partition partitions[4];
3899 + * struct dos_private - Private data structure for this plugin
3900 + * @source_object: object this IO will get remapped to
3901 + * @start_sect: source object relative starting address in 512 byte units
3902 + * @nr_sect: partition size in 512 bytes units
3903 + * @type: partition type or filesystem format indicator
3905 + * private copy of the just the fields we require to remap IO requests
3906 + * to the underlying object.
3908 +struct dos_private {
3909 + struct evms_logical_node *source_disk;
3912 + unsigned char type;
3916 + * struct extended_part - Structure used to track progress traversing an EBR chain
3917 + * @extended: partition table in the extended boot record
3918 + * @start_sect: address of the extended boot record in 512 byte units
3919 + * @next_ebr_start: address of next ebr in the chain
3920 + * @done: progress flag
3922 + * struct used to track extended boot record chain traversals.
3924 +struct extended_part {
3925 + struct partition *extended;
3927 + u64 next_ebr_start;
3931 +/* Global variables */
3932 +static int cur_comp_part_num; /* used to track non-primary
3933 + * partition numbers
3935 +static int exported_nodes; /* total # of exported segments
3936 + * produced during this discovery.
3939 +/* External references */
3940 +#if CONFIG_BLK_DEV_MD && CONFIG_AUTODETECT_RAID
3941 +extern void md_autodetect_dev(kdev_t dev);
3945 +static int mbr_ebr_partition_discover(struct evms_logical_node **);
3946 +static int mbr_ebr_partition_delete(struct evms_logical_node *);
3947 +static void mbr_ebr_partition_read(struct evms_logical_node *,
3948 + struct buffer_head *);
3949 +static void mbr_ebr_partition_write(struct evms_logical_node *,
3950 + struct buffer_head *);
3951 +static int mbr_ebr_partition_ioctl(struct evms_logical_node *, struct inode *,
3952 + struct file *, unsigned int, unsigned long);
3953 +static int mbr_ebr_partition_init_io(struct evms_logical_node *,
3954 + int, u64, u64, void *);
3956 +static struct evms_plugin_fops fops = {
3957 + .discover = mbr_ebr_partition_discover,
3958 + .delete = mbr_ebr_partition_delete,
3959 + .read = mbr_ebr_partition_read,
3960 + .write = mbr_ebr_partition_write,
3961 + .init_io = mbr_ebr_partition_init_io,
3962 + .ioctl = mbr_ebr_partition_ioctl
3965 +#define EVMS_MSDOS_PARTITION_MANAGER_ID 1
3967 +static struct evms_plugin_header plugin_header = {
3968 + .id = SetPluginID(IBM_OEM_ID,
3969 + EVMS_SEGMENT_MANAGER,
3970 + EVMS_MSDOS_PARTITION_MANAGER_ID),
3976 + .required_services_version = {
3985 + * Many architectures don't like unaligned accesses, which is
3986 + * frequently the case with the nr_sects and start_sect partition
3989 +#include <asm/unaligned.h>
3991 +#define SYS_IND(p) (get_unaligned(&p->sys_ind))
3992 +#define NR_SECTS(p) (u64)({ __typeof__(p->nr_sects) __a = \
3993 + get_unaligned(&p->nr_sects); \
3994 + le32_to_cpu(__a); \
3997 +#define START_SECT(p) (u64)({ __typeof__(p->start_sect) __a = \
3998 + get_unaligned(&p->start_sect); \
3999 + le32_to_cpu(__a); \
4002 +/******************************************/
4003 +/* List Support - Variables, & Functions */
4004 +/******************************************/
4008 +struct segment_list_node {
4009 + struct evms_logical_node *segment;
4010 + struct segment_list_node *next;
4013 +struct disk_list_node {
4014 + struct evms_logical_node *disk;
4015 + struct segment_list_node *segment_list;
4016 + struct disk_list_node *next;
4021 +static struct disk_list_node *my_disk_list;
4025 +static struct disk_list_node **
4026 +lookup_disk(struct evms_logical_node *disk)
4028 + struct disk_list_node **ldln;
4030 + ldln = &my_disk_list;
4032 + if ((*ldln)->disk == disk)
4034 + ldln = &(*ldln)->next;
4039 +static struct segment_list_node **
4040 +lookup_segment(struct disk_list_node *disk, struct evms_logical_node *segment)
4042 + struct segment_list_node **lsln;
4044 + lsln = &disk->segment_list;
4046 + if ((*lsln)->segment == segment)
4048 + lsln = &(*lsln)->next;
4053 +static struct evms_logical_node *
4054 +find_segment_on_disk(struct evms_logical_node *disk,
4055 + u64 start_sect, u64 nr_sects)
4057 + struct evms_logical_node *rc = NULL;
4058 + struct disk_list_node **ldln;
4059 + struct segment_list_node **lsln;
4060 + struct dos_private *dos_prv;
4062 + ldln = lookup_disk(disk);
4064 + /* disk found in list */
4065 + /* attempt to find segment */
4067 + lsln = &(*ldln)->segment_list;
4069 + dos_prv = (*lsln)->segment->private;
4070 + if (dos_prv->start_sect == start_sect)
4071 + if (dos_prv->nr_sects == nr_sects)
4073 + lsln = &(*lsln)->next;
4076 + rc = (*lsln)->segment;
4081 +/* function description: add_segment_to_disk
4083 + * this function attempts to add a segment to the segment
4084 + * list of a disk. if the specified disk is not found, it
4085 + * will be added to the global disk list. this function will
4086 + * return a pointer to the matching segment in the disk's
4087 + * segment list. the caller must compare the returned pointer
4088 + * to the specified segment to see if the
4089 + * specified segment was already present in the disk's segment
4090 + * list. if the return pointer matches the specified segment,
4091 + * then the specified segment was added to the list. if the
4092 + * return segment pointer does not match the specified
4093 + * segment pointer, then the specified segment pointer was
4094 + * a duplicate and can be thrown away.
4097 +add_segment_to_disk(struct evms_logical_node *disk,
4098 + struct evms_logical_node *segment)
4101 + struct disk_list_node **ldln, *new_disk;
4102 + struct segment_list_node **lsln, *new_segment;
4104 + ldln = lookup_disk(disk);
4105 + if (*ldln == NULL) {
4106 + /* disk not in list, add disk */
4107 + new_disk = kmalloc(sizeof (*new_disk), GFP_KERNEL);
4109 + memset(new_disk, 0, sizeof (*new_disk));
4110 + new_disk->disk = disk;
4117 + /* attempt to add segment */
4118 + lsln = lookup_segment(*ldln, segment);
4119 + if (*lsln == NULL) {
4120 + /* segment not in list, add segment */
4122 + kmalloc(sizeof (*new_segment), GFP_KERNEL);
4123 + if (new_segment) {
4124 + memset(new_segment, 0, sizeof (*new_segment));
4125 + new_segment->segment = segment;
4126 + *lsln = new_segment;
4137 +remove_segment_from_disk(struct evms_logical_node *disk,
4138 + struct evms_logical_node *segment,
4139 + struct evms_logical_node **empty_disk)
4142 + struct disk_list_node **ldln, *tmp_disk_node;
4143 + struct segment_list_node **lsln, *tmp_segment_node;
4145 + *empty_disk = NULL;
4146 + ldln = lookup_disk(disk);
4147 + if (*ldln == NULL) {
4150 + /* disk found in list */
4151 + /* attempt to find segment */
4152 + lsln = lookup_segment(*ldln, segment);
4153 + if (*lsln == NULL) {
4156 + tmp_segment_node = *lsln;
4157 + /* remove segment from list */
4158 + *lsln = (*lsln)->next;
4159 + /* free the segment list node */
4160 + kfree(tmp_segment_node);
4162 + if ((*ldln)->segment_list == NULL) {
4163 + tmp_disk_node = *ldln;
4164 + *empty_disk = tmp_disk_node->disk;
4165 + /* remove disk from list */
4166 + *ldln = (*ldln)->next;
4167 + /* free the disk list node */
4168 + kfree(tmp_disk_node);
4176 +is_extended_partition(struct partition *p)
4178 + return (SYS_IND(p) == DOS_EXTENDED_PARTITION ||
4179 + SYS_IND(p) == WIN98_EXTENDED_PARTITION ||
4180 + SYS_IND(p) == LINUX_EXTENDED_PARTITION);
4184 +part_start(struct partition *part, u64 ext_start, u64 ebr_start)
4186 + u64 pstart = START_SECT(part);
4187 + pstart += (is_extended_partition(part)) ? ext_start : ebr_start;
4192 +validate_mbr_ebr(struct evms_logical_node *node,
4193 + struct mbr_ebr *mbr_ebr, u64 ext_start,
4196 + int valid_mbr_ebr, i, j, mbr_flag;
4197 + struct partition *pi, *pj;
4198 + u64 pi_start, pi_end, pj_start, pj_end;
4200 + /* assume an MBR */
4203 + /* assume its valid */
4204 + valid_mbr_ebr = TRUE;
4206 + /* check for valid signature */
4207 + if (mbr_ebr->signature != cpu_to_le16(MSDOS_LABEL_MAGIC)) {
4208 + LOG_DEBUG("%s: invalid signature on '%s'!\n",
4209 + __FUNCTION__, node->name);
4210 + valid_mbr_ebr = FALSE;
4213 + /* check for an AIX IPL signature */
4214 +#define IPLRECID 0xc9c2d4c1 /* Value is EBCDIC 'IBMA' */
4215 + if (*(unsigned int *) mbr_ebr == IPLRECID) {
4216 + LOG_DEBUG("%s: found an AIX IPL signature on '%s'\n",
4217 + __FUNCTION__, node->name);
4218 + valid_mbr_ebr = FALSE;
4221 + /* check for boot sector fields */
4223 +#if 0 //Remove checking of the first byte
4225 + /* attempt to make some initial assumptions about
4226 + * what type of data structure this could be. we
4227 + * start by checking the 1st byte. we can tell a
4228 + * few things based on what is or isn't there.
4230 + if (valid_mbr_ebr == TRUE)
4231 + switch (*(u_char *) mbr_ebr) {
4232 + /* check for JMP as 1st instruction
4233 + * if found, assume (for now), that
4234 + * this is a boot sector.
4236 + /* Removed the JMP opcode check because it's not enough to determine
4237 + * that this sector does not have a valid MBR.
4238 + * Note: To avoid going thru validation process of partition table,
4239 + * it's necessary to have a better boot sector check
4240 + * (eg. JMP opcode && other conditions) */
4243 + LOG_DEBUG("%s: boot sector detected!\n", __FUNCTION__);
4244 + valid_mbr_ebr = FALSE;
4246 + /* let this fall thru to pick up the
4247 + * mbr_flag == FALSE.
4250 + /* the MBR should contain boot strap
4251 + * code, so we don't expect the 1st
4252 + * byte to be a 0x0. If the 1st byte
4253 + * IS 0x0, its assumed (for now) to
4260 +#endif //Remove checking of the first byte
4262 + if (valid_mbr_ebr == TRUE) {
4263 + /* dump the partition table entries in debug mode */
4265 + ("%s: disk relative starts: ext_part("PFU64"), ebr("PFU64").\n",
4266 + __FUNCTION__, ext_start, ebr_start);
4267 + for (i = 0; i < 4; i++) {
4268 + pi = &mbr_ebr->partitions[i];
4270 + ("%s: Partition: index(%d), start("PFU64"), size("PFU64"), sys(0x%x).\n",
4271 + __FUNCTION__, i, START_SECT(pi), NR_SECTS(pi),
4275 + /* check for PMBR (Protected Master Boot Record)
4276 + * and skip this node if found
4278 + for (i = 0; i < 4; i++) {
4279 + pi = &mbr_ebr->partitions[i];
4281 + if (SYS_IND(pi) == 0xEE) {
4282 + valid_mbr_ebr = FALSE;
4284 + ("%s: detected PMBR on '%s', skipping.\n",
4285 + __FUNCTION__, node->name);
4290 + /* check if this segment is marked as non-dividable
4291 + * and skip if found
4293 + if (node->iflags & EVMS_TOP_SEGMENT) {
4294 + valid_mbr_ebr = FALSE;
4298 + if (valid_mbr_ebr == TRUE) {
4299 + /* check for mbr/ebr partition table validity */
4300 + for (i = 0; i < 4; i++) {
4301 + pi = &mbr_ebr->partitions[i];
4302 + if (NR_SECTS(pi)) {
4303 + /* check for partition extending past end of node */
4304 + pi_start = part_start(pi, ext_start, ebr_start);
4305 + pi_end = pi_start + NR_SECTS(pi) - 1;
4306 + if (pi_end >= node->total_vsectors) {
4308 + ("%s: partition(%d) ends("PFU64") beyond the end of the disk(%s,"PFU64")!\n",
4309 + __FUNCTION__, i, pi_end,
4310 + node->name, node->total_vsectors);
4311 + valid_mbr_ebr = FALSE;
4313 + if (valid_mbr_ebr == FALSE)
4316 + /* check for partition overlap */
4317 + for (j = i + 1; j < 4; j++) {
4318 + pj = &mbr_ebr->partitions[j];
4319 + if (NR_SECTS(pj)) {
4321 + part_start(pj, ext_start,
4324 + pj_start + NR_SECTS(pj) - 1;
4325 + if (pi_start == pj_start) {
4326 + valid_mbr_ebr = FALSE;
4327 + } else if (pi_start < pj_start) {
4328 + if (pi_end >= pj_start)
4331 + } else if (pi_start <= pj_end)
4332 + valid_mbr_ebr = FALSE;
4334 + if (valid_mbr_ebr == FALSE) {
4336 + ("%s: overlapping partitions(%d,%d) detected on '%s'!\n",
4337 + __FUNCTION__, i, j,
4343 + if (valid_mbr_ebr == FALSE)
4348 + if (valid_mbr_ebr == TRUE) {
4349 + LOG_DEBUG("%s: valid %cBR detected on '%s'!\n", __FUNCTION__,
4350 + (mbr_flag == TRUE) ? 'M' : 'E', node->name);
4352 + LOG_DEBUG("%s: no valid MBR/EBR detected on '%s'!\n",
4353 + __FUNCTION__, node->name);
4355 + return (valid_mbr_ebr);
4359 + * Function: add_segment
4362 +mbr_ebr_process_segment(struct evms_logical_node **discover_list,
4363 + struct evms_logical_node *node,
4366 + unsigned char type, int part_num, char *partition_name)
4368 + struct dos_private *dos_prv = NULL;
4369 + struct evms_logical_node *segment;
4372 + segment = find_segment_on_disk(node, start_sect, nr_sects);
4374 + LOG_DETAILS("exporting segment '%s'.\n", segment->name);
4376 + dos_prv = kmalloc(sizeof (*dos_prv), GFP_KERNEL);
4378 + memset(dos_prv, 0, sizeof (*dos_prv));
4379 + dos_prv->source_disk = node;
4380 + dos_prv->start_sect = start_sect;
4381 + dos_prv->nr_sects = nr_sects;
4382 + dos_prv->type = type;
4383 + rc = evms_cs_allocate_logical_node(&segment);
4388 + segment->plugin = &plugin_header;
4389 + segment->system_id = (unsigned int) type;
4390 + segment->total_vsectors = nr_sects;
4391 + segment->block_size = node->block_size;
4392 + segment->hardsector_size = node->hardsector_size;
4393 + segment->private = dos_prv;
4394 + segment->flags = node->flags;
4395 + if (partition_name)
4396 + strcpy(segment->name, partition_name);
4398 + strcpy(segment->name, node->name);
4399 + if (GetPluginType(node->plugin->id) ==
4400 + EVMS_SEGMENT_MANAGER) {
4401 + strcat(segment->name, ".");
4403 + sprintf(segment->name + strlen(segment->name),
4406 + /* watch for super floppy format gpt system partition
4407 + * and don't let it be subdivided
4409 + if (segment->system_id == GPT_ESP_INDICATOR) {
4410 + node->iflags |= EVMS_TOP_SEGMENT;
4412 + LOG_DETAILS("creating segment '%s'.\n", segment->name);
4413 + rc = add_segment_to_disk(node, segment);
4416 + ("%s: error(%d) adding segment '%s'!\n",
4417 + __FUNCTION__, rc, segment->name);
4420 + MOD_INC_USE_COUNT;
4427 + evms_cs_deallocate_logical_node(segment);
4431 + evms_cs_add_logical_node_to_list(discover_list, segment);
4438 +print_partition_info(char *leading_comment, struct partition *p)
4441 + ("%s: boot_ind(0x%02x), sys_ind(0x%02x), startCHS(%u,%u,%u), endCHS(%u,%u,%u), startLBA("PFU64"), sizeLBA("PFU64")\n",
4442 + leading_comment, p->boot_ind, p->sys_ind, p->cyl, p->head,
4443 + p->sector, p->end_cyl, p->end_head, p->end_sector, START_SECT(p),
4447 +#ifdef CONFIG_BSD_DISKLABEL
4448 +#define BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET 1
4450 +print_bsd_partition_info(char *leading_comment, struct bsd_partition *p)
4453 + ("%s: p_size(%u), p_offset(%u), p_fsize(%u), p_fstype(0x%02X), p_frag(0x%02X), p_cpg(%u)\n",
4454 + leading_comment, p->p_size, p->p_offset, p->p_fsize, p->p_fstype,
4455 + p->p_frag, p->p_cpg);
4459 + * bsd_disklabel_partition
4462 + * - 0 for 0 partition
4463 + * - (positive) number for number of BSD partitions found
4464 + * - (negative) error code
4467 +bsd_disklabel_partition(struct evms_logical_node **discover_list,
4468 + struct evms_logical_node *node, struct partition *bsd)
4470 + struct bsd_disklabel *l;
4471 + struct bsd_partition *p;
4472 + int max_partitions;
4477 + data = kmalloc(node->hardsector_size, GFP_KERNEL);
4479 + rc = INIT_IO(node,
4482 + BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET, 1, data);
4487 + l = (struct bsd_disklabel *) data;
4488 + if (l->d_magic == BSD_DISKMAGIC) {
4492 + OPENBSD_PARTITION) ? OPENBSD_MAXPARTITIONS :
4493 + BSD_MAXPARTITIONS);
4494 + if (l->d_npartitions < max_partitions)
4495 + max_partitions = l->d_npartitions;
4496 + for (p = l->d_partitions;
4497 + p - l->d_partitions < max_partitions; p++) {
4498 + if (p->p_fstype != BSD_FS_UNUSED) {
4499 + evmsLOG2(EVMS_INFO_EXTRA,
4500 + (print_bsd_partition_info
4501 + (__FUNCTION__, p)));
4502 + rc = mbr_ebr_process_segment
4503 + (discover_list, node,
4504 + (u64) p->p_offset,
4505 + (u64) p->p_size, p->p_fstype,
4506 + cur_comp_part_num++, NULL);
4518 + LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc);
4523 +#ifdef CONFIG_UNIXWARE_DISKLABEL
4524 +#define UNIXWARE_PART_TABLE_SECTOR_OFFSET 29
4527 + * unixware_partition
4530 + * - 0 for 0 partition
4531 + * - (positive) number for number of UNIXWARE partitions found
4532 + * - (negative) error code
4535 +unixware_partition(struct evms_logical_node **discover_list,
4536 + struct evms_logical_node *node,
4537 + struct partition *unixware_part)
4539 + struct unixware_disklabel *l;
4540 + struct unixware_slice *p;
4541 + char *data = NULL;
4545 + data = kmalloc(node->hardsector_size, GFP_KERNEL);
4547 + rc = INIT_IO(node,
4549 + START_SECT(unixware_part) +
4550 + UNIXWARE_PART_TABLE_SECTOR_OFFSET, 1, data);
4554 + l = (struct unixware_disklabel *) data;
4555 + if (le32_to_cpu(l->d_magic) == UNIXWARE_DISKMAGIC &&
4556 + le32_to_cpu(l->vtoc.v_magic) == UNIXWARE_DISKMAGIC2) {
4557 + p = &l->vtoc.v_slice[1]; /* The 0th slice is the same as whole disk. */
4558 + while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
4559 + if (p->s_label != UNIXWARE_FS_UNUSED) {
4560 + rc = mbr_ebr_process_segment
4561 + (discover_list, node, START_SECT(p),
4562 + NR_SECTS(p), UNIXWARE_PARTITION,
4563 + cur_comp_part_num++, NULL);
4576 + LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc);
4581 +#ifdef CONFIG_SOLARIS_X86_PARTITION
4582 +#define SOLARIS_X86_PART_TABLE_SECTOR_OFFSET 1
4584 + * solaris_x86_partition
4587 + * - 0 for 0 partition
4588 + * - (positive) number for number of solaris partitions found
4589 + * - (negative) error code
4592 +solaris_x86_partition(struct evms_logical_node **discover_list,
4593 + struct evms_logical_node *node,
4594 + struct partition *solaris_x86, int probe_only)
4595 +{ /* if TRUE, do not add segments */
4596 + long offset = START_SECT(solaris_x86);
4597 + struct solaris_x86_vtoc *v;
4598 + struct solaris_x86_slice *s;
4600 + char *data = NULL;
4604 + data = kmalloc(node->hardsector_size, GFP_KERNEL);
4606 + rc = INIT_IO(node,
4608 + START_SECT(solaris_x86) +
4609 + SOLARIS_X86_PART_TABLE_SECTOR_OFFSET, 1, data);
4614 + v = (struct solaris_x86_vtoc *) data;
4616 + if (v->v_sanity == SOLARIS_X86_VTOC_SANE) {
4617 + if (v->v_version != 1) {
4619 + ("%s: cannot handle version %d vtoc>\n",
4620 + __FUNCTION__, v->v_version);
4622 + for (i = 0; i < v->v_nparts; i++) {
4623 + s = &v->v_slice[i];
4625 + ("s[%d] s_tag(%u), s_flag(%u), s_start(%u), s_size(%u), last_sector(%u)\n",
4626 + i, s->s_tag, s->s_flag, s->s_start,
4628 + s->s_start + s->s_size - 1);
4630 + if ((s->s_size == 0)
4631 + || (s->s_tag == 0x05))
4633 + if (!probe_only) {
4634 + rc = mbr_ebr_process_segment
4635 + (discover_list, node,
4636 + (u64) (s->s_start +
4639 + SOLARIS_X86_PARTITION,
4640 + cur_comp_part_num++, NULL);
4653 + LOG_DETAILS("%s: %s (%d) partitions\n",
4654 + __FUNCTION__, probe_only ? " " : "exported", rc);
4660 + * os2lvm_partition() looks for DLAT at last sector of the track containing MBR/EBR
4662 + * Returns: 1 - os2 DLAT was found
4667 +os2lvm_partition(u64 MBR_EBR_sect,
4668 + struct evms_logical_node *node, struct dla_table_sector *dlat)
4670 + struct hd_geometry geometry;
4674 + rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO, (unsigned long) &geometry);
4676 + LOG_SERIOUS("%s: ioctl failed(%u) on '%s'\n",
4677 + __FUNCTION__, rc, node->name);
4679 + if (!INIT_IO(node, 0, MBR_EBR_sect + geometry.sectors - 1, 1, dlat))
4681 + if ((dlat->DLA_Signature1 == cpu_to_le32(DLA_TABLE_SIGNATURE1))
4682 + && (dlat->DLA_Signature2 ==
4683 + cpu_to_le32(DLA_TABLE_SIGNATURE2))) {
4684 + crc_hold = le32_to_cpu(dlat->DLA_CRC);
4685 + dlat->DLA_CRC = 0;
4686 + if (evms_cs_calculate_crc
4687 + (EVMS_INITIAL_CRC, (void *) dlat,
4688 + node->hardsector_size) == crc_hold)
4696 +mbr_ebr_process_logical_drive(struct evms_logical_node **discover_list,
4697 + struct evms_logical_node *node,
4698 + struct extended_part *ext_info,
4700 + struct partition *p,
4701 + int os2lvm, struct dla_table_sector *dlat)
4704 + char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name;
4706 + LOG_EXTRA("%s: PartitionTableIndex(%i), Start("PFU64"), Size("PFU64")\n",
4707 + __FUNCTION__, i, START_SECT(p), NR_SECTS(p));
4709 + if (NR_SECTS(p)) {
4710 + if (is_extended_partition(p)) {
4711 + ext_info->next_ebr_start =
4712 + (u64) (START_SECT(p) +
4713 + START_SECT(ext_info->extended));
4714 + ext_info->done = FALSE; /* not done yet */
4716 + partition_name = NULL;
4717 + if (os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR &&
4718 + le32_to_cpu(dlat->DLA_Array[i].Partition_Start) ==
4719 + (ext_info->start_sect + START_SECT(p))
4720 + && le32_to_cpu(dlat->DLA_Array[i].Partition_Size) ==
4722 + && dlat->DLA_Array[i].Drive_Letter != '\0') {
4723 + sprintf(tmp_buf, "os2/%c",
4724 + dlat->DLA_Array[i].Drive_Letter);
4725 + partition_name = tmp_buf;
4727 + evmsLOG2(EVMS_INFO_EXTRA,
4728 + (print_partition_info(__FUNCTION__, p)));
4730 + rc = mbr_ebr_process_segment(discover_list,
4732 + ext_info->start_sect +
4733 + START_SECT(p), NR_SECTS(p),
4735 + cur_comp_part_num++,
4743 +mbr_ebr_process_ebr(struct evms_logical_node **discover_list,
4744 + struct evms_logical_node *node,
4745 + struct extended_part *ext_info, struct mbr_ebr *ebr)
4747 + int rc = 0, i, os2lvm;
4748 + struct partition *p;
4749 + struct dla_table_sector *dlat = NULL;
4751 + /* allocate space for the OS2 DLAT info */
4752 + dlat = kmalloc(node->hardsector_size, GFP_KERNEL);
4754 + /* read the dlat for this mbr */
4755 + os2lvm = os2lvm_partition(ext_info->start_sect, node, dlat);
4757 + /* walk thru the partition table in the mbr
4758 + * processing each partition record.
4760 + for (i = 0; i < 4; i++) {
4761 + p = &ebr->partitions[i];
4762 + rc = mbr_ebr_process_logical_drive(discover_list,
4765 + i, p, os2lvm, dlat);
4771 + /* free the space used for OS2 DLAT info */
4779 +mbr_ebr_probe_for_ebr(struct evms_logical_node **discover_list,
4780 + struct evms_logical_node *node,
4781 + struct extended_part *ext_info)
4784 + u_char *sector_buffer = NULL;
4785 + struct mbr_ebr *ebr = NULL;
4787 + /* allocate a sector size buffer */
4788 + sector_buffer = kmalloc(node->hardsector_size, GFP_KERNEL);
4789 + if (sector_buffer)
4790 + /* read the location of the mbr sector */
4791 + rc = INIT_IO(node, 0, ext_info->start_sect, 1, sector_buffer);
4796 + ebr = (struct mbr_ebr *) sector_buffer;
4797 + if (validate_mbr_ebr(node, ebr,
4798 + START_SECT(ext_info->extended),
4799 + ext_info->start_sect) == TRUE)
4800 + rc = mbr_ebr_process_ebr(discover_list,
4801 + node, ext_info, ebr);
4804 + if (sector_buffer)
4805 + kfree(sector_buffer);
4811 +mbr_ebr_process_extended_partition(struct evms_logical_node **discover_list,
4812 + struct evms_logical_node *node,
4813 + struct partition *p)
4816 + struct extended_part ext_info;
4818 + memset(&ext_info, 0, sizeof (ext_info));
4819 + ext_info.done = FALSE;
4820 + ext_info.extended = p;
4821 + ext_info.next_ebr_start = START_SECT(p);
4822 + while (ext_info.done == FALSE) {
4823 + ext_info.done = TRUE; /* assume done, unless we find another EBR */
4824 + ext_info.start_sect = ext_info.next_ebr_start;
4825 + rc = mbr_ebr_probe_for_ebr(discover_list, node, &ext_info);
4831 + * is_non_dos_extended
4833 + * This function returns TRUE if the partition entry represents a non-DOS
4834 + * extended partition such as UnixWare, Solaris x86 and BSD
4837 +is_non_dos_extended(struct evms_logical_node **discover_list,
4838 + struct evms_logical_node *node, struct partition *p)
4840 + if (NR_SECTS(p)) {
4841 +#ifdef CONFIG_BSD_DISKLABEL
4842 + if (SYS_IND(p) == BSD_PARTITION ||
4843 + SYS_IND(p) == NETBSD_PARTITION ||
4844 + SYS_IND(p) == OPENBSD_PARTITION)
4848 +#ifdef CONFIG_UNIXWARE_DISKLABEL
4849 + if (SYS_IND(p) == UNIXWARE_PARTITION)
4853 +#ifdef CONFIG_SOLARIS_X86_PARTITION
4854 + if ((SYS_IND(p) == SOLARIS_X86_PARTITION) &&
4855 + (solaris_x86_partition(discover_list, node, p, TRUE) > 0))
4863 + * mbr_ebr_process_other_primary_partition
4864 + * This function processes other (non-DOS) primary partitions such as
4865 + * UnixWare, Solaris x86 and BSD
4868 +mbr_ebr_process_other_primary_partition(struct evms_logical_node
4870 + struct evms_logical_node *node,
4871 + struct partition *p)
4873 + if (NR_SECTS(p)) {
4874 +#ifdef CONFIG_BSD_DISKLABEL
4875 + if (SYS_IND(p) == BSD_PARTITION ||
4876 + SYS_IND(p) == NETBSD_PARTITION ||
4877 + SYS_IND(p) == OPENBSD_PARTITION)
4878 + return bsd_disklabel_partition(discover_list, node, p);
4881 +#ifdef CONFIG_UNIXWARE_DISKLABEL
4882 + if (SYS_IND(p) == UNIXWARE_PARTITION)
4883 + return unixware_partition(discover_list, node, p);
4886 +#ifdef CONFIG_SOLARIS_X86_PARTITION
4887 + if (SYS_IND(p) == SOLARIS_X86_PARTITION)
4888 + return solaris_x86_partition(discover_list, node, p,
4896 +mbr_ebr_process_dos_primary_partition(struct evms_logical_node **discover_list,
4897 + struct evms_logical_node *node,
4899 + struct partition *p,
4900 + int os2lvm, struct dla_table_sector *dlat)
4903 + char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name;
4905 + LOG_EVERYTHING("%s: PartitionTableIndex(%i), Start("PFU64"), Size("PFU64")\n",
4906 + __FUNCTION__, i, START_SECT(p), NR_SECTS(p));
4908 + if (NR_SECTS(p)) {
4910 + if (is_extended_partition(p))
4911 + rc = mbr_ebr_process_extended_partition(discover_list,
4915 + partition_name = NULL;
4916 + if (os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR &&
4917 + le32_to_cpu(dlat->DLA_Array[i].Partition_Start) ==
4919 + && le32_to_cpu(dlat->DLA_Array[i].Partition_Size) ==
4921 + && dlat->DLA_Array[i].Drive_Letter != '\0') {
4922 + sprintf(tmp_buf, "os2/%c",
4923 + dlat->DLA_Array[i].Drive_Letter);
4924 + partition_name = tmp_buf;
4926 + evmsLOG2(EVMS_INFO_EXTRA,
4927 + (print_partition_info(__FUNCTION__, p)));
4929 + rc = mbr_ebr_process_segment(discover_list,
4934 + i + 1, partition_name);
4941 +mbr_ebr_process_mbr(struct evms_logical_node **discover_list,
4942 + struct evms_logical_node *node, struct mbr_ebr *mbr)
4944 + int rc = 0, i, os2lvm;
4945 + struct partition *p;
4946 + struct dla_table_sector *dlat = NULL;
4948 + cur_comp_part_num = 5; /* set this value for each disk */
4950 + /* allocate space for the OS2 DLAT info */
4951 + dlat = kmalloc(node->hardsector_size, GFP_KERNEL);
4953 + /* read the dlat for this mbr */
4954 + os2lvm = os2lvm_partition(0, node, dlat);
4956 + /* Pass 1: walk thru the partition table in the mbr
4957 + * processing each partition record.
4959 + for (i = 0; i < 4; i++) {
4960 + p = &mbr->partitions[i];
4961 + if (is_non_dos_extended(discover_list, node, p)) {
4963 + (" Found and skip a non-dos extended partition.\n");
4967 + mbr_ebr_process_dos_primary_partition(discover_list,
4973 + /* Pass 2: walk thru the partition table in the mbr
4974 + * processing each partition record for non-DOS extended partitions
4976 + for (i = 0; i < 4; i++) {
4977 + p = &mbr->partitions[i];
4978 + mbr_ebr_process_other_primary_partition(discover_list,
4986 + /* free the space used for OS2 DLAT info */
4994 +mbr_ebr_probe_for_mbr(struct evms_logical_node **discover_list,
4995 + struct evms_logical_node *node)
4998 + u_char *sector_buffer = NULL;
4999 + struct mbr_ebr *mbr = NULL;
5001 + LOG_DEBUG("%s: probing (%s).\n", __FUNCTION__, node->name);
5003 + /* allocate a sector size buffer */
5004 + sector_buffer = kmalloc(node->hardsector_size, GFP_KERNEL);
5005 + if (sector_buffer)
5006 + /* read the location of the mbr sector */
5007 + rc = INIT_IO(node, 0, 0, 1, sector_buffer);
5011 + LOG_ERROR("%s: read error(%d) on '%s'.\n",
5012 + __FUNCTION__, rc, node->name);
5014 + mbr = (struct mbr_ebr *) sector_buffer;
5015 + if (validate_mbr_ebr(node, mbr, 0, 0) == TRUE) {
5016 + /* since it looks like this disk has a
5017 + * valid MBR, remove the disk node from
5018 + * the discover list. it may already be
5019 + * on the global list, or it will be
5020 + * added to it. in the case of an mbr
5021 + * with no partitions, it is simply
5022 + * removed and forgotten. when one or
5023 + * more partitions are created, the
5024 + * disk will be examined and handled
5025 + * properly during the following
5026 + * rediscover operation.
5028 + evms_cs_remove_logical_node_from_list(discover_list,
5031 + rc = mbr_ebr_process_mbr(discover_list, node, mbr);
5035 + if (sector_buffer)
5036 + kfree(sector_buffer);
5042 + * Function: mbr_ebr_partition_discover
5046 +mbr_ebr_partition_discover(struct evms_logical_node **discover_list)
5049 + struct evms_logical_node *node, *next_node;
5051 + MOD_INC_USE_COUNT;
5052 + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
5054 + /* initialize global variable */
5055 + exported_nodes = 0;
5057 + /* examine each node on the discover list */
5058 + next_node = *discover_list;
5059 + while (next_node) {
5061 + next_node = node->next;
5062 + if (node->plugin->id == plugin_header.id)
5063 + /* don't recurse into our own objects
5066 + mbr_ebr_probe_for_mbr(discover_list, node);
5069 + LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
5070 + __FUNCTION__, exported_nodes, rc);
5071 + if (exported_nodes)
5072 + rc = exported_nodes;
5073 + MOD_DEC_USE_COUNT;
5078 + * Function: mbr_ebr_partition_delete
5082 +mbr_ebr_partition_delete(struct evms_logical_node *segment)
5085 + struct dos_private *dos_prv;
5086 + struct evms_logical_node *empty_disk = NULL;
5088 + LOG_DETAILS("deleting segment '%s'.\n", segment->name);
5093 + dos_prv = segment->private;
5095 + /* remove the segment from the
5096 + * disk's segment list
5098 + rc = remove_segment_from_disk(dos_prv->source_disk,
5099 + segment, &empty_disk);
5100 + /* free the local instance data */
5103 + /* free the segment node */
5104 + evms_cs_deallocate_logical_node(segment);
5105 + MOD_DEC_USE_COUNT;
5106 + /* if the last segment on the disk was
5107 + * deleted, delete the disk node too
5110 + DELETE(empty_disk);
5116 + * function: mbr_ebr_partition_io_error
5118 + * this function was primarily created because the function
5119 + * buffer_IO_error is inline and kgdb doesn't allow breakpoints
5120 + * to be set on inline functions. Since this was an error path
5121 + * and not mainline, I decided to add a trace statement to help
5122 + * report on the failing condition.
5126 +mbr_ebr_partition_io_error(struct evms_logical_node *node,
5127 + int io_flag, struct buffer_head *bh)
5130 + ("attempt to %s beyond partition boundary("PFU64") on (%s), rsector("PFU64").\n",
5131 + (io_flag) ? "WRITE" : "READ", node->total_vsectors - 1, node->name,
5132 + (u64) bh->b_rsector);
5134 + bh->b_end_io(bh, 0);
5138 + * Function: mbr_ebr_partition_read
5142 +mbr_ebr_partition_read(struct evms_logical_node *partition,
5143 + struct buffer_head *bh)
5145 + struct dos_private *dos_prv = partition->private;
5147 + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
5148 + partition->total_vsectors) {
5149 + bh->b_rsector += dos_prv->start_sect;
5150 + R_IO(dos_prv->source_disk, bh);
5152 + mbr_ebr_partition_io_error(partition, READ, bh);
5156 + * Function: mbr_ebr_partition_write
5160 +mbr_ebr_partition_write(struct evms_logical_node *partition,
5161 + struct buffer_head *bh)
5163 + struct dos_private *dos_prv = partition->private;
5165 + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
5166 + partition->total_vsectors) {
5167 + bh->b_rsector += dos_prv->start_sect;
5168 + W_IO(dos_prv->source_disk, bh);
5170 + mbr_ebr_partition_io_error(partition, WRITE, bh);
5174 + * Function: mbr_ebr_partition_init_io
5178 +mbr_ebr_partition_init_io(struct evms_logical_node *partition, int io_flag, /* 0=read, 1=write */
5179 + u64 sect_nr, /* disk LBA */
5180 + u64 num_sects, /* # of sectors */
5182 +{ /* buffer address */
5184 + struct dos_private *dos_prv = partition->private;
5186 + if ((sect_nr + num_sects) <= partition->total_vsectors) {
5187 + rc = INIT_IO(dos_prv->source_disk, io_flag,
5188 + sect_nr + dos_prv->start_sect, num_sects,
5192 + ("init_io: attempt to %s beyond partition(%s) boundary("PFU64") at sector("PFU64") for count("PFU64").\n",
5193 + (io_flag) ? "WRITE" : "READ", partition->name,
5194 + (dos_prv->nr_sects - 1), sect_nr, num_sects);
5202 + * Function: mbr_ebr_partition_ioctl
5206 +mbr_ebr_partition_ioctl(struct evms_logical_node *partition,
5207 + struct inode *inode,
5208 + struct file *file, unsigned int cmd, unsigned long arg)
5210 + struct dos_private *dos_prv;
5211 + struct hd_geometry hd_geo;
5215 + dos_prv = partition->private;
5221 + rc = IOCTL(dos_prv->source_disk, inode, file, cmd, arg);
5224 + if (copy_from_user
5225 + (&hd_geo, (void *) arg,
5226 + sizeof (struct hd_geometry)))
5230 + hd_geo.start = dos_prv->start_sect;
5232 + ((void *) arg, &hd_geo,
5233 + sizeof (struct hd_geometry)))
5237 + case EVMS_GET_BMAP:
5239 + struct evms_get_bmap_pkt *bmap =
5240 + (struct evms_get_bmap_pkt *) arg;
5241 + bmap->rsector += dos_prv->start_sect;
5242 + /* intentionally fall thru to
5243 + * default ioctl down to device
5248 + rc = IOCTL(dos_prv->source_disk, inode, file, cmd, arg);
5254 + * Function: dos_part_init
5258 +dos_part_init(void)
5260 + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
5264 +dos_part_exit(void)
5266 + evms_cs_unregister_plugin(&plugin_header);
5269 +module_init(dos_part_init);
5270 +module_exit(dos_part_exit);
5271 +#ifdef MODULE_LICENSE
5272 +MODULE_LICENSE("GPL");
5274 diff -Naur linux-2002-09-30/drivers/evms/evms.c evms-2002-09-30/drivers/evms/evms.c
5275 --- linux-2002-09-30/drivers/evms/evms.c Wed Dec 31 18:00:00 1969
5276 +++ evms-2002-09-30/drivers/evms/evms.c Thu Sep 26 11:55:45 2002
5278 +/* -*- linux-c -*- */
5282 + * Copyright (c) International Business Machines Corp., 2000
5284 + * This program is free software; you can redistribute it and/or modify
5285 + * it under the terms of the GNU General Public License as published by
5286 + * the Free Software Foundation; either version 2 of the License, or
5287 + * (at your option) any later version.
5289 + * This program is distributed in the hope that it will be useful,
5290 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
5291 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
5292 + * the GNU General Public License for more details.
5294 + * You should have received a copy of the GNU General Public License
5295 + * along with this program; if not, write to the Free Software
5296 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
5302 + * linux/drivers/evms/evms.c
5304 + * EVMS Base and Common Services
5308 +#define DEVICE_NR(device) MINOR(device) /* evms has no partition bits */
5309 +#define DEVICE_NAME "evms" /* name for messaging */
5310 +#define DEVICE_NO_RANDOM /* no entropy to contribute */
5311 +#define DEVICE_OFF(d) /* do nothing */
5313 +//#define LOCAL_DEBUG 1
5315 +#include <linux/config.h>
5316 +#include <linux/module.h>
5317 +#include <linux/errno.h>
5318 +#include <linux/kernel.h>
5319 +#include <linux/init.h>
5320 +#include <linux/fs.h>
5321 +#include <linux/slab.h>
5322 +#include <asm/uaccess.h>
5323 +#include <linux/blk.h> /* must be included by all block drivers */
5324 +#include <linux/blkdev.h>
5325 +#include <linux/blkpg.h>
5326 +#include <linux/iobuf.h>
5327 +#include <linux/genhd.h>
5328 +#include <linux/sched.h>
5329 +#include <linux/completion.h>
5330 +#include <linux/version.h>
5331 +#include <linux/swap.h>
5332 +#include <net/checksum.h>
5333 +#include <linux/sysctl.h>
5334 +#include <linux/smp_lock.h>
5335 +#include <linux/reboot.h>
5336 +#include <linux/compiler.h>
5337 +#include <linux/evms/evms.h>
5339 +//#define VFS_PATCH_PRESENT
5341 +/* prefix used in logging messages */
5344 +struct evms_registered_plugin {
5345 + struct evms_plugin_header *plugin;
5346 + struct evms_registered_plugin *next;
5348 +static struct evms_registered_plugin *registered_plugin_head = NULL;
5350 +static struct evms_list_node *evms_global_device_list = NULL;
5351 +static struct evms_list_node *evms_global_feature_node_list = NULL;
5352 +static struct evms_list_node *evms_global_notify_list = NULL;
5354 +int evms_info_level = EVMS_INFO_LEVEL;
5355 +struct proc_dir_entry *evms_proc_dir = NULL;
5356 +EXPORT_SYMBOL(evms_info_level);
5357 +static struct evms_logical_volume *evms_logical_volumes;
5358 +static int evms_volumes = 0;
5359 +/* a few variables to aid in detecting memory leaks.
5360 + * these variables are always in use, regardless of
5361 + * the state of EVMS_MEM_DEBUG.
5363 +static atomic_t evms_allocs = (atomic_t) ATOMIC_INIT(0);
5364 +static atomic_t evms_logical_nodes = (atomic_t) ATOMIC_INIT(0);
5366 +u8 *evms_primary_string = "primary";
5367 +EXPORT_SYMBOL(evms_primary_string);
5368 +u8 *evms_secondary_string = "secondary";
5369 +EXPORT_SYMBOL(evms_secondary_string);
5371 +static struct evms_version evms_svc_version = {
5372 + .major = EVMS_COMMON_SERVICES_MAJOR,
5373 + .minor = EVMS_COMMON_SERVICES_MINOR,
5374 + .patchlevel = EVMS_COMMON_SERVICES_PATCHLEVEL
5377 +/* Handles for "private" EVMS object pools */
5378 +static struct evms_pool_mgmt *evms_io_notify_pool;
5380 +/* Handles for "public" EVMS object pools */
5381 +struct evms_pool_mgmt *evms_bh_pool;
5382 +EXPORT_SYMBOL(evms_bh_pool);
5384 +/* Handle for the devfs directory entry */
5385 +devfs_handle_t evms_dir_devfs_handle;
5386 +devfs_handle_t evms_blk_devfs_handle;
5388 +/**********************************************************/
5389 +/* SYSCTL - EVMS folder */
5390 +/**********************************************************/
5392 +#ifdef CONFIG_PROC_FS
5393 +static struct ctl_table_header *evms_table_header;
5394 +static int evms_info_level_min = EVMS_INFO_CRITICAL;
5395 +static int evms_info_level_max = EVMS_INFO_EVERYTHING;
5397 +static ctl_table evms_table[] = {
5398 + {DEV_EVMS_INFO_LEVEL, "evms_info_level",
5399 + &evms_info_level, sizeof (int), 0644, NULL,
5400 + &proc_dointvec_minmax, &sysctl_intvec,
5401 + NULL, &evms_info_level_min, &evms_info_level_max},
5405 +static ctl_table evms_dir_table[] = {
5406 + {DEV_EVMS, "evms", NULL, 0, 0555, evms_table},
5410 +static ctl_table dev_dir_table[] = {
5411 + {CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table},
5416 +/**********************************************************/
5417 +/* START -- arch ioctl32 support */
5418 +/**********************************************************/
5419 +#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64)
5420 +#include <linux/evms/evms_bbr_k.h>
5421 +#include <linux/raid/md.h>
5423 +extern asmlinkage long
5424 +sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg);
5427 +register_ioctl32_conversion(unsigned int cmd, void *handler);
5430 +unregister_ioctl32_conversion(unsigned int cmd);
5432 +#define uvirt_to_kernel(__x) ((unsigned long)(__x))
5433 +typedef unsigned int __uvirt_addr;
5435 +struct evms_sector_io32 {
5438 + u64 starting_sector;
5440 + __uvirt_addr buffer_address;
5444 +struct evms_rediscover32 {
5447 + __uvirt_addr drive_array;
5450 +struct evms_compute_csum32 {
5451 + __uvirt_addr buffer_address;
5458 +struct evms_plugin_ioctl32 {
5460 + s32 feature_command;
5462 + __uvirt_addr feature_ioctl_data;
5465 +struct evms_notify_bbr32 {
5466 + char object_name[EVMS_VOLUME_NAME_SIZE+1];
5470 + __uvirt_addr buffer;
5474 +#define EVMS_MD_ID 4
5475 +#define EVMS_MD_PERS_IOCTL_CMD 1
5476 +#define EVMS_MD_ADD 2
5477 +#define EVMS_MD_REMOVE 3
5478 +#define EVMS_MD_ACTIVATE 4
5479 +#define EVMS_MD_DEACTIVATE 5
5480 +#define EVMS_MD_GET_ARRAY_INFO 6
5481 +#define EVMS_MD_RAID5_INIT_IO 1
5483 +struct evms_md_ioctl {
5489 +struct evms_md_ioctl32 {
5495 +struct evms_md_array_info {
5496 + unsigned long state;
5500 +struct evms_md_array_info32 {
5505 +struct raid5_ioctl_init_io {
5512 +struct raid5_ioctl_init_io32 {
5516 + __uvirt_addr data;
5519 +#define EVMS_MD_PLUGIN_ID ((IBM_OEM_ID << 16) | \
5520 + (EVMS_REGION_MANAGER << 12) | EVMS_MD_ID)
5521 +#define EVMS_BBR_PLUGIN_ID ((IBM_OEM_ID << 16) | \
5522 + (EVMS_FEATURE << 12) | EVMS_BBR_FEATURE_ID)
5524 +#define EVMS_SECTOR_IO_32 _IOWR(EVMS_MAJOR, \
5525 + EVMS_SECTOR_IO_NUMBER, \
5526 + struct evms_sector_io32)
5527 +#define EVMS_REDISCOVER_VOLUMES_32 _IOWR(EVMS_MAJOR, \
5528 + EVMS_REDISCOVER_VOLUMES_NUMBER, \
5529 + struct evms_rediscover32)
5530 +#define EVMS_COMPUTE_CSUM_32 _IOWR(EVMS_MAJOR, \
5531 + EVMS_COMPUTE_CSUM_NUMBER, \
5532 + struct evms_compute_csum32)
5533 +#define EVMS_PLUGIN_IOCTL_32 _IOR(EVMS_MAJOR, \
5534 + EVMS_PLUGIN_IOCTL_NUMBER, \
5535 + struct evms_plugin_ioctl32)
5537 +static int evms_sector_io(unsigned int fd,
5539 + unsigned long arg)
5541 + mm_segment_t old_fs = get_fs();
5542 + struct evms_sector_io32 parms32;
5543 + struct evms_sector_io_pkt parms;
5544 + unsigned int kcmd;
5548 + if (copy_from_user(&parms32, (struct evms_sector_io32 *)arg,
5549 + sizeof(struct evms_sector_io32)))
5552 + parms.disk_handle = parms32.disk_handle;
5553 + parms.io_flag = parms32.io_flag;
5554 + parms.starting_sector = parms32.starting_sector;
5555 + parms.sector_count = parms32.sector_count;
5556 + parms.buffer_address = (u8 *)uvirt_to_kernel(parms32.buffer_address);
5559 + kcmd = EVMS_SECTOR_IO;
5562 + set_fs(KERNEL_DS);
5563 + rc = sys_ioctl(fd, kcmd, (unsigned long)karg);
5566 + parms32.status = parms.status;
5568 + if (copy_to_user((struct evms_sector_io32 *)arg, &parms32,
5569 + sizeof(struct evms_sector_io32)))
5575 +static int evms_rediscover(unsigned int fd,
5577 + unsigned long arg)
5579 + mm_segment_t old_fs = get_fs();
5580 + struct evms_rediscover32 parms32;
5581 + struct evms_rediscover_pkt parms;
5582 + unsigned int kcmd;
5586 + if (copy_from_user(&parms32, (struct evms_rediscover32 *)arg,
5587 + sizeof(struct evms_rediscover32)))
5590 + parms.drive_count = parms32.drive_count;
5591 + parms.drive_array = (void *)uvirt_to_kernel(parms32.drive_array);
5594 + kcmd = EVMS_REDISCOVER_VOLUMES;
5597 + set_fs(KERNEL_DS);
5598 + rc = sys_ioctl(fd, kcmd, (unsigned long)karg);
5601 + parms32.status = parms.status;
5603 + if (copy_to_user((struct evms_rediscover32 *)arg, &parms32,
5604 + sizeof(struct evms_rediscover32)))
5610 +static int evms_compute_csum(unsigned int fd,
5612 + unsigned long arg)
5614 + mm_segment_t old_fs = get_fs();
5615 + struct evms_compute_csum32 parms32;
5616 + struct evms_compute_csum_pkt parms;
5617 + unsigned int kcmd;
5621 + if (copy_from_user(&parms32, (struct evms_compute_csum32 *)arg,
5622 + sizeof(struct evms_compute_csum32)))
5625 + parms.insum = parms32.insum;
5626 + parms.outsum = parms32.outsum;
5627 + parms.buffer_size = parms32.buffer_size;
5628 + parms.buffer_address = (void *)uvirt_to_kernel(parms32.buffer_address);
5631 + kcmd = EVMS_COMPUTE_CSUM;
5634 + set_fs(KERNEL_DS);
5635 + rc = sys_ioctl(fd, kcmd, (unsigned long)karg);
5638 + parms32.status = parms.status;
5639 + parms32.outsum = parms.outsum;
5641 + if (copy_to_user((struct evms_compute_csum32 *)arg, &parms32,
5642 + sizeof(struct evms_compute_csum32)))
5648 +static int evms_bbr_plugin_ioctl(unsigned int fd,
5650 + unsigned long arg)
5652 + mm_segment_t old_fs = get_fs();
5653 + struct evms_notify_bbr32 bbr_parms32;
5654 + struct evms_notify_bbr bbr_parms;
5655 + struct evms_plugin_ioctl_pkt *parms =
5656 + (struct evms_plugin_ioctl_pkt *)arg;
5657 + void *old_ptr = NULL;
5660 + if (copy_from_user(&bbr_parms32,
5661 + (struct evms_notify_bbr32 *)parms->feature_ioctl_data,
5662 + sizeof(struct evms_notify_bbr32)))
5665 + memcpy(&bbr_parms, &bbr_parms32, sizeof(struct evms_notify_bbr32));
5666 + bbr_parms.buffer = (void *)uvirt_to_kernel(bbr_parms32.buffer);
5667 + bbr_parms.rw = bbr_parms32.rw;
5668 + old_ptr = parms->feature_ioctl_data;
5669 + parms->feature_ioctl_data = &bbr_parms;
5671 + set_fs(KERNEL_DS);
5672 + rc = sys_ioctl(fd, cmd, arg);
5675 + parms->feature_ioctl_data = old_ptr;
5678 + bbr_parms32.nr_sect = bbr_parms.nr_sect;
5679 + rc = copy_to_user((struct evms_notify_bbr32 *)parms->feature_ioctl_data,
5681 + sizeof(struct evms_notify_bbr32));
5687 +static int evms_md_plugin_ioctl(unsigned int fd,
5689 + unsigned long arg)
5691 + mm_segment_t old_fs = get_fs();
5692 + void *old_ptr = NULL;
5693 + void *old_md_ptr = NULL;
5694 + struct evms_md_ioctl32 md_parms32;
5695 + struct evms_md_ioctl md_parms;
5696 + struct evms_md_array_info32 md_array_parms32;
5697 + struct evms_md_array_info md_array_parms;
5698 + struct raid5_ioctl_init_io32 r5_init_io_parms32;
5699 + struct raid5_ioctl_init_io r5_init_io_parms;
5700 + struct evms_plugin_ioctl_pkt *parms =
5701 + (struct evms_plugin_ioctl_pkt *)arg;
5704 + if (copy_from_user(&md_parms32,
5705 + (struct evms_md_ioctl*)parms->feature_ioctl_data,
5706 + sizeof(struct evms_md_ioctl32)))
5709 + md_parms.mddev_idx = md_parms32.mddev_idx;
5710 + md_parms.cmd = md_parms32.cmd;
5711 + md_parms.arg = (void *)uvirt_to_kernel(md_parms32.arg);
5712 + old_ptr = parms->feature_ioctl_data;
5713 + parms->feature_ioctl_data = &md_parms;
5715 + if (parms->feature_command == EVMS_MD_GET_ARRAY_INFO) {
5716 + if (copy_from_user(&md_array_parms32,
5717 + (struct evms_md_array_info32*)md_parms.arg,
5718 + sizeof(struct evms_md_array_info32)))
5721 + md_array_parms.state = md_array_parms32.state;
5722 + md_array_parms.sb =
5723 + (void *)uvirt_to_kernel(md_array_parms32.sb);
5724 + old_md_ptr = (void *)md_parms.arg;
5725 + md_parms.arg = &md_array_parms;
5726 + } else if (parms->feature_command == EVMS_MD_PERS_IOCTL_CMD) {
5727 + if (md_parms.cmd == EVMS_MD_RAID5_INIT_IO) {
5728 + if (copy_from_user(&r5_init_io_parms32,
5729 + (struct raid5_ioctl_init_io32*)md_parms.arg,
5730 + sizeof(struct raid5_ioctl_init_io32)))
5733 + r5_init_io_parms.rw = r5_init_io_parms32.rw;
5734 + r5_init_io_parms.lsn = r5_init_io_parms32.lsn;
5735 + r5_init_io_parms.nr_sects = r5_init_io_parms32.nr_sects;
5736 + r5_init_io_parms.data =
5737 + (void *)uvirt_to_kernel(r5_init_io_parms32.data);
5738 + old_md_ptr = (void *)md_parms.arg;
5739 + md_parms.arg = &r5_init_io_parms;
5743 + set_fs(KERNEL_DS);
5744 + rc = sys_ioctl(fd, cmd, arg);
5747 + parms->feature_ioctl_data = old_ptr;
5748 + md_parms.arg = old_md_ptr;
5751 + if (parms->feature_command == EVMS_MD_GET_ARRAY_INFO) {
5752 + md_array_parms32.state = md_array_parms.state;
5753 + rc = copy_to_user((struct evms_md_array_info32 *)md_parms.arg,
5754 + &md_array_parms32,
5755 + sizeof(struct evms_md_array_info32));
5758 + md_parms32.mddev_idx = md_parms.mddev_idx;
5759 + rc = copy_to_user((struct evms_md_ioctl*)parms->feature_ioctl_data,
5761 + sizeof(struct evms_md_ioctl32));
5768 +static int evms_plugin_ioctl(unsigned int fd,
5770 + unsigned long arg)
5772 + mm_segment_t old_fs = get_fs();
5773 + struct evms_plugin_ioctl32 parms32;
5774 + struct evms_plugin_ioctl_pkt parms;
5775 + unsigned int kcmd;
5779 + if (copy_from_user(&parms32, (struct evms_plugin_ioctl32 *)arg,
5780 + sizeof(struct evms_plugin_ioctl32)))
5783 + parms.feature_id = parms32.feature_id;
5784 + parms.feature_command = parms32.feature_command;
5785 + parms.status = parms32.status;
5786 + parms.feature_ioctl_data =
5787 + (void *)uvirt_to_kernel(parms32.feature_ioctl_data);
5789 + kcmd = EVMS_PLUGIN_IOCTL;
5792 + switch (parms.feature_id) {
5793 + case EVMS_MD_PLUGIN_ID:
5794 + rc = evms_md_plugin_ioctl(fd, kcmd, (unsigned long)karg);
5796 + case EVMS_BBR_PLUGIN_ID:
5797 + rc = evms_bbr_plugin_ioctl(fd, kcmd, (unsigned long)karg);
5800 + set_fs(KERNEL_DS);
5801 + rc = sys_ioctl(fd, kcmd, (unsigned long)karg);
5806 + parms32.status = parms.status;
5807 + rc = copy_to_user((struct evms_plugin_ioctl32 *)arg, &parms32,
5808 + sizeof(struct evms_plugin_ioctl32));
5815 +/**********************************************************/
5816 +/* START -- exported functions/Common Services */
5817 +/**********************************************************/
5820 + * Function: evms_cs_get_version
5821 + * Description: This function returns the current EVMS version
5824 +evms_cs_get_version(int *major, int *minor)
5826 + *major = EVMS_MAJOR_VERSION;
5827 + *minor = EVMS_MINOR_VERSION;
5830 +EXPORT_SYMBOL(evms_cs_get_version);
5833 +evms_cs_check_version(struct evms_version *required,
5834 + struct evms_version *actual)
5836 + if (required->major != actual->major)
5838 + else if (required->minor > actual->minor)
5840 + else if (required->minor == actual->minor)
5841 + if (required->patchlevel > actual->patchlevel)
5846 +EXPORT_SYMBOL(evms_cs_check_version);
5849 +evms_cs_allocate_logical_node(struct evms_logical_node **pp)
5851 + *pp = kmalloc(sizeof (struct evms_logical_node), GFP_KERNEL);
5853 + memset(*pp, 0, sizeof (struct evms_logical_node));
5854 + atomic_inc(&evms_logical_nodes);
5860 +EXPORT_SYMBOL(evms_cs_allocate_logical_node);
5863 +evms_cs_deallocate_volume_info(struct evms_logical_node *p)
5865 + if (p->iflags & EVMS_FEATURE_BOTTOM) {
5866 + evms_cs_remove_item_from_list(&evms_global_feature_node_list,
5868 + kfree(p->volume_info);
5869 + p->volume_info = NULL;
5870 + p->iflags &= ~EVMS_FEATURE_BOTTOM;
5874 +EXPORT_SYMBOL(evms_cs_deallocate_volume_info);
5877 +evms_cs_deallocate_logical_node(struct evms_logical_node *p)
5881 + ("Deallocating object whose NEXT ptr is not null!!\n");
5883 + evms_cs_deallocate_volume_info(p);
5884 + if (p->feature_header) {
5885 + kfree(p->feature_header);
5886 + p->feature_header = NULL;
5889 + atomic_dec(&evms_logical_nodes);
5892 +EXPORT_SYMBOL(evms_cs_deallocate_logical_node);
5895 + * Function: evms_cs_register_plugin
5896 + * Description: This function is exported so that all plugins can register with EVMS
5899 +evms_cs_register_plugin(struct evms_plugin_header *plugin)
5902 + struct evms_registered_plugin *reg_record, **pp;
5903 + struct evms_version *ver;
5905 + ver = &plugin->required_services_version;
5908 + ("registering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n",
5909 + GetPluginOEM(plugin->id), GetPluginType(plugin->id),
5910 + GetPluginID(plugin->id), plugin->version.major,
5911 + plugin->version.minor, plugin->version.patchlevel, ver->major,
5912 + ver->minor, ver->patchlevel);
5914 + /* check common services requirements */
5915 + rc = evms_cs_check_version(ver, &evms_svc_version);
5918 + ("plugin failed to load: common services (vers:%d,%d,%d) incompatibility!\n",
5919 + EVMS_COMMON_SERVICES_MAJOR, EVMS_COMMON_SERVICES_MINOR,
5920 + EVMS_COMMON_SERVICES_PATCHLEVEL);
5923 + /* ensure a plugin with this feature id is
5924 + * not already loaded.
5926 + for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next) {
5927 + if ((*pp)->plugin->id == plugin->id) {
5930 + ("error(%d) attempting to load another plugin with id(%x).\n",
5936 + /* ensure the plugin has provided functions for
5937 + * the mandatory entry points.
5939 + if (!plugin->fops->discover) {
5941 + } else if (!plugin->fops->init_io) {
5943 + } else if (!plugin->fops->ioctl) {
5945 + } else if (!plugin->fops->read) {
5947 + } else if (!plugin->fops->write) {
5949 + } else if (!plugin->fops->delete) {
5954 + /* allocate a new plugin registration record */
5956 + kmalloc(sizeof (struct evms_registered_plugin), GFP_KERNEL);
5957 + if (!reg_record) {
5962 + memset(reg_record, 0, sizeof (struct evms_registered_plugin));
5963 + /* store ptr to plugin header in new registration record */
5964 + reg_record->plugin = plugin;
5966 + /* terminate the record */
5967 + reg_record->next = NULL;
5969 + /* find end of the plugin registration list */
5970 + for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next) ;
5971 + /* add registration record to list */
5974 + /* increment the usage count */
5975 + MOD_INC_USE_COUNT;
5981 +EXPORT_SYMBOL(evms_cs_register_plugin);
5984 + * Function: evms_cs_unregister_plugin
5985 + * Description: This function is exported so that all plugins can
5986 + * unregister with EVMS
5989 +evms_cs_unregister_plugin(struct evms_plugin_header *plugin)
5991 + int rc = 0, found = FALSE;
5992 + struct evms_registered_plugin **pp;
5993 + struct evms_version *ver;
5995 + ver = &plugin->required_services_version;
5998 + ("unregistering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n",
5999 + GetPluginOEM(plugin->id), GetPluginType(plugin->id),
6000 + GetPluginID(plugin->id), plugin->version.major,
6001 + plugin->version.minor, plugin->version.patchlevel, ver->major,
6002 + ver->minor, ver->patchlevel);
6003 + /* ensure a plugin with this feature id is
6004 + * currently loaded.
6006 + for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next) {
6007 + if ((*pp)->plugin->id == plugin->id) {
6015 + ("error(%d) attempt to unload a non-loaded plugin with id(%x).\n",
6018 + /* actually unload the plugin now */
6020 + struct evms_registered_plugin *tmp = *pp;
6022 + /* remove the plugin record from our
6023 + * internal plugin list
6025 + *pp = (*pp)->next;
6026 + /* deallocate the plugin registration record
6030 + /* decrement the usage count */
6031 + MOD_DEC_USE_COUNT;
6036 +EXPORT_SYMBOL(evms_cs_unregister_plugin);
6038 +/* function: evms_cs_add_logical_node_to_list
6040 + * This functions adds a new logical node to the end of a
6043 + * NOTE: This function is only expected to be called at
6044 + * discovery time, which is singled threaded by nature,
6045 + * and therefore doesn't need to be made SMP safe.
6048 +evms_cs_add_logical_node_to_list(struct evms_logical_node **list_head,
6049 + struct evms_logical_node *node)
6052 + struct evms_logical_node **pp = NULL;
6054 + /* check to make sure node is not already on a list */
6058 + /* check to make sure node being added is not already in the list */
6059 + for (pp = list_head; *pp; pp = &(*pp)->next)
6060 + if (*pp == node) {
6065 + /* add node to the end of the list */
6072 +EXPORT_SYMBOL(evms_cs_add_logical_node_to_list);
6074 +/* function: evms_cs_remove_logical_node_from_list
6076 + * This functions removes a new logical node from a node list.
6078 + * NOTE: This function is only expected to be called at
6079 + * discovery time, which is singled threaded by nature,
6080 + * and therefore doesn't need to be made SMP safe.
6083 +evms_cs_remove_logical_node_from_list(struct evms_logical_node **list_head,
6084 + struct evms_logical_node *node)
6086 + /* remove this node from the head of the list */
6087 + int rc = 1; /* assume failure until target node is found */
6088 + struct evms_logical_node **pp;
6089 + for (pp = list_head; *pp; pp = &(*pp)->next)
6090 + if (*pp == node) {
6091 + *pp = (*pp)->next;
6092 + node->next = NULL;
6099 +EXPORT_SYMBOL(evms_cs_remove_logical_node_from_list);
6102 +evms_cs_kernel_ioctl(struct evms_logical_node *node, unsigned int cmd,
6103 + unsigned long arg)
6106 + struct inode tmp_inode;
6112 + rc = IOCTL(node, &tmp_inode, NULL, cmd, arg);
6120 +EXPORT_SYMBOL(evms_cs_kernel_ioctl);
6123 + * function: evms_cs_size_in_vsectors
6125 + * In EVMS a V(irtual)Sector is 512 bytes in size.
6126 + * This function computes the number of VSECTORs an specified
6127 + * item size would require.
6129 + * NOTE: This function has been coded to work with 64 bit values.
6132 +evms_cs_size_in_vsectors(long long item_size)
6134 + long long sectors;
6136 + sectors = item_size >> EVMS_VSECTOR_SIZE_SHIFT;
6137 + if (item_size & (EVMS_VSECTOR_SIZE - 1))
6143 +EXPORT_SYMBOL(evms_cs_size_in_vsectors);
6146 + * function: evms_cs_log2
6148 + * this function computes the power of the 2 of specified
6149 + * value. If the value is 0, a -1 is returned. If the value
6150 + * is NOT a power of 2, a -2 is return. Otherwise the power
6151 + * of 2 is returned.
6154 +evms_cs_log2(long long value)
6162 + while (!(tmp & 1)) {
6173 +EXPORT_SYMBOL(evms_cs_log2);
6178 + * build_crc_table()
6182 + * Description: The functions in this module provide a means of calculating
6183 + * the 32 bit CRC for a block of data. build_crc_table must
6184 + * be called to initialize this module. calculate_crc must
6185 + * NOT be used until after build_crc_table has been called.
6186 + * Once build_crc_table has been called, calculate_crc can
6187 + * be used to calculate the crc of the data residing in a
6188 + * user specified buffer.
6192 +#define CRC_POLYNOMIAL 0xEDB88320L
6194 +static u32 crc_table[256];
6195 +static u32 crc_table_built = FALSE;
6197 +/*********************************************************************/
6199 +/* Function Name: build_crc_table */
6201 +/* Descriptive Name: This module implements the crc function using */
6202 +/* a table driven method. The required table */
6203 +/* must be setup before the calculate_crc */
6204 +/* function can be used. This table only needs */
6205 +/* to be set up once. This function sets up the */
6206 +/* crc table needed by calculate_crc. */
6212 +/* Error Handling: N/A */
6214 +/* Side Effects: The internal crc table is initialized. */
6218 +/*********************************************************************/
6220 +build_crc_table(void)
6224 + for (i = 0; i <= 255; i++) {
6226 + for (j = 8; j > 0; j--) {
6228 + crc = (crc >> 1) ^ CRC_POLYNOMIAL;
6232 + crc_table[i] = crc;
6234 + crc_table_built = TRUE;
6237 +/*********************************************************************/
6239 +/* Function Name: calculate_crc */
6241 +/* Descriptive Name: This function calculates the crc value for */
6242 +/* the data in the buffer specified by Buffer. */
6244 +/* Input: u32 crc : This is the starting crc. If you are */
6245 +/* starting a new crc calculation, then */
6246 +/* this should be set to 0xFFFFFFFF. If */
6247 +/* you are continuing a crc calculation */
6248 +/* (i.e. all of the data did not fit in */
6249 +/* the buffer so you could not calculate */
6250 +/* the crc in a single operation), then */
6251 +/* this is the crc output by the last */
6252 +/* calculate_crc call. */
6254 +/* Output: The crc for the data in the buffer, based upon the value*/
6255 +/* of the input parameter crc. */
6257 +/* Error Handling: None. */
6259 +/* Side Effects: None. */
6263 +/*********************************************************************/
6265 +evms_cs_calculate_crc(u32 crc, void *buffer, u32 buffersize)
6267 + unsigned char *current_byte;
6268 + u32 temp1, temp2, i;
6270 + current_byte = (unsigned char *) buffer;
6271 + /* Make sure the crc table is available */
6272 + if (crc_table_built == FALSE)
6273 + build_crc_table();
6274 + /* Process each byte in the buffer. */
6275 + for (i = 0; i < buffersize; i++) {
6276 + temp1 = (crc >> 8) & 0x00FFFFFF;
6278 + crc_table[(crc ^ (u32) *
6279 + current_byte) & (u32) 0xff];
6281 + crc = temp1 ^ temp2;
6286 +EXPORT_SYMBOL(evms_cs_calculate_crc);
6288 +#define EVMS_ORIGINAL_CALLBACK_FLAG 1<<0
6289 +typedef struct io_notify_s {
6290 + unsigned int flags;
6292 + struct buffer_head *bh;
6296 + void (*callback_function) (struct evms_logical_node * node,
6297 + struct buffer_head * bh,
6298 + int uptodate, int *redrive);
6299 + struct io_notify_s *next;
6302 +struct evms_pool_mgmt *
6303 +evms_cs_create_pool(int objsize,
6305 + void (*ctor) (void *, kmem_cache_t *, unsigned long),
6306 + void (*dtor) (void *, kmem_cache_t *, unsigned long))
6308 + struct evms_pool_mgmt *pool;
6310 + /* create the pool management structure */
6311 + pool = kmalloc(sizeof (struct evms_pool_mgmt), GFP_KERNEL);
6313 + LOG_CRITICAL("Cannot create %s fpool mgmt structure",
6317 + /* initialize various field in pool mgmt structure */
6318 + memset(pool, 0, sizeof (struct evms_pool_mgmt));
6319 + pool->member_size = objsize;
6320 + pool->name = pool_name;
6321 + pool->waiters = (atomic_t) ATOMIC_INIT(0);
6322 + init_waitqueue_head(&pool->wait_queue);
6323 + /* go create the pool */
6324 + pool->cachep = kmem_cache_create(pool->name,
6325 + pool->member_size,
6326 + 0, SLAB_HWCACHE_ALIGN, ctor, dtor);
6327 + if (!pool->cachep)
6328 + panic("Cannot create %s SLAB cache", pool->name);
6332 +EXPORT_SYMBOL(evms_cs_create_pool);
6335 +evms_cs_allocate_from_pool(struct evms_pool_mgmt *pool, int blockable)
6340 + objp = kmem_cache_alloc(pool->cachep, SLAB_NOIO);
6341 + if (objp || !blockable) {
6344 + /* block and wait for an object to
6345 + * be returned to the pool
6347 + atomic_inc(&pool->waiters);
6348 + wait_event(pool->wait_queue,
6349 + (!atomic_read(&pool->waiters)));
6355 +EXPORT_SYMBOL(evms_cs_allocate_from_pool);
6358 +evms_cs_deallocate_to_pool(struct evms_pool_mgmt *pool, void *objp)
6360 + kmem_cache_free(pool->cachep, objp);
6361 + atomic_set(&pool->waiters, 0);
6362 + if (waitqueue_active(&pool->wait_queue)) {
6363 + wake_up(&pool->wait_queue);
6367 +EXPORT_SYMBOL(evms_cs_deallocate_to_pool);
6370 +evms_cs_destroy_pool(struct evms_pool_mgmt *pool)
6372 + kmem_cache_destroy(pool->cachep);
6376 +EXPORT_SYMBOL(evms_cs_destroy_pool);
6379 + * function: evms_end_io
6381 + * This is a support function for
6382 + * evms_cs_register_for_end_io_notification.
6383 + * This function is called during I/O completion on any buffer
6384 + * head that was registered by a plugin. Control is passed here
6385 + * and this routine will, thru the use of the I/O notify entry
6386 + * stored in the b_private field of the buffer head, restore
6387 + * the b_rsector value the buffer head had at the time of
6388 + * registration and pass control to the registered callback
6389 + * address, with pointers to the buffer head and an optional
6390 + * plugin private data. Upon completion of the callback,
6391 + * control is returned back here. The io notify list entry
6392 + * is deleted. This process repeats until this routine
6393 + * detects that all registered plugins have been called back
6394 + * and the buffer head's original end_io function has been
6395 + * called. At this point the DONE flag is set, and we terminate
6396 + * callback loop and exit.
6398 + * Plugins may desire to break or interrupt the callback
6399 + * sequence or chain. This may be useful to redrive I/O or
6400 + * to wait for other buffer heads to complete before
6401 + * allowing the original buffer head callback to occur.
6402 + * To interrupt the callback "chain", a registered
6403 + * plugin's callback must return with the DONE flag set.
6405 + * NOTE: If a plugin set the DONE flag, and wishes to redrive
6406 + * a buffer head, the plugin MUST reregister the buffer head
6407 + * to receive another callback on this buffer head. Also, the
6408 + * plugin MUST ensure that the original buffer head end_io
6409 + * function get called at some point, either by reregistering
6410 + * this buffer head and receiving another callback, or by
6411 + * means of buffer head aggregation triggered by the callbacks
6412 + * of other buffer heads.
6416 +evms_end_io(struct buffer_head *bh, int uptodate)
6418 + io_notify_t *entry;
6423 + /* retrieve the io_notify_entry ptr from
6424 + * the b_private field in the buffer head.
6426 + entry = (io_notify_t *) bh->b_private;
6428 + /* restore the b_private value to
6429 + * the previous b_private value (which
6430 + * should be a previous io_notify_entry
6431 + * or the original b_private pointer).
6433 + bh->b_private = entry->b_private;
6435 + /* check for original callback for this bh */
6436 + if (entry->flags & EVMS_ORIGINAL_CALLBACK_FLAG) {
6437 + /* this is the original for bh */
6439 + /* turn off flag marking this as the original */
6440 + entry->flags &= ~EVMS_ORIGINAL_CALLBACK_FLAG;
6442 + /* decrement volume's requests_in_progress var */
6443 + atomic_dec(&evms_logical_volumes[MINOR(bh->b_rdev)].
6444 + requests_in_progress);
6446 + /* restore b_end_io to original value */
6447 + bh->b_end_io = (void *) entry->callback_function;
6448 + if (bh->b_end_io) {
6449 + /* invoke original callback function
6452 + bh->b_end_io(bh, uptodate);
6456 + /* this is a plugin callback */
6458 + /* restore the rsector value to the
6459 + * value at the time of callback
6462 + bh->b_rsector = entry->rsector;
6463 + bh->b_rdev = entry->rdev;
6464 + /* invoke plugin callback function */
6465 + entry->callback_function(entry->private, bh, uptodate,
6468 + /* free the io notify entry */
6469 + evms_cs_deallocate_to_pool(evms_io_notify_pool, entry);
6474 + * function: evms_cs_register_for_end_io_notification
6476 + * This function is an evms common service.
6477 + * This routine allows a (plugin) function to register to
6478 + * participate in the io completion notification process.
6479 + * This is useful for plugins which alter data after it
6480 + * has been read from the disk (i.e. encryption or
6483 + * This routine also records the rsector value at the time
6484 + * of registration, so that it can be restored to that value
6485 + * prior to the callback to a plugin, thus allowing that
6486 + * plugin to work with the value it had seen during the
6487 + * initiating I/O request.
6489 + * This routine also records a private data pointer at the
6490 + * time of registration, and is returned to the plugin
6491 + * at callback time. This private data pointer was designed
6492 + * to contain context/callback/buffer_head specific data, and
6493 + * frees the plugin from having to store and find associated
6494 + * data at the time of the callback. This field is not used
6495 + * by this function and is optional (NULL if unused). It is
6496 + * recorded and returned as a convenience for the plugins.
6498 + * DANGER!!! - WILL ROBINSON - DANGER!!!
6499 + * This routine uses the b_private field in the
6500 + * buffer_head structure. If any lower level driver uses this
6501 + * field and do NOT restore it, the I/O callback will fail!!
6503 + * Any plugins writers requiring a field for private storage
6504 + * should instead use the private field parameter in this
6505 + * function to store their private data.
6510 +evms_cs_register_for_end_io_notification(void *private,
6511 + struct buffer_head *bh,
6512 + void *callback_function)
6515 + io_notify_t *new_entry;
6519 + /* allocate a notify entry */
6521 + evms_cs_allocate_from_pool(evms_io_notify_pool,
6528 + /* initialize notify entry */
6529 + new_entry->private = private;
6530 + new_entry->bh = bh;
6531 + new_entry->rsector = bh->b_rsector;
6532 + new_entry->rdev = bh->b_rdev;
6533 + new_entry->b_private = bh->b_private;
6534 + new_entry->flags = 0;
6536 + /* is this the first callback for this bh? */
6537 + if (bh->b_end_io != evms_end_io) {
6538 + /* yes, first callback */
6539 + new_entry->flags |= EVMS_ORIGINAL_CALLBACK_FLAG;
6540 + new_entry->callback_function = (void *) bh->b_end_io;
6542 + /* increment volume's requests_in_progress var */
6543 + atomic_inc(&evms_logical_volumes[MINOR(bh->b_rdev)].
6544 + requests_in_progress);
6546 + /* set b_end_io so we get control */
6547 + bh->b_end_io = evms_end_io;
6549 + /* no, not first callback */
6550 + new_entry->callback_function = callback_function;
6553 + /* set b_private to aid in quick lookup */
6554 + bh->b_private = new_entry;
6559 +EXPORT_SYMBOL(evms_cs_register_for_end_io_notification);
6561 +/* function description: evms_cs_lookup_item_in_list
6563 + * this function searches for the specified item in the
6564 + * specified node list. it returns the address of the
6565 + * evms_list_node containing the specified item.
6567 +struct evms_list_node **
6568 +evms_cs_lookup_item_in_list(struct evms_list_node **node_list, void *item)
6570 + struct evms_list_node **list_node;
6572 + list_node = node_list;
6573 + while (*list_node) {
6574 + if ((*list_node)->item == item)
6576 + list_node = &(*list_node)->next;
6578 + return (list_node);
6581 +EXPORT_SYMBOL(evms_cs_lookup_item_in_list);
6583 +/* function description: evms_add_item_to_list
6585 + * this function adds an item to the list. the
6586 + * node for the new item is added to the end
6587 + * of the list. the list is traversed to find the end.
6588 + * while the traversal occurs, the list is checked
6589 + * for the presence of the specified item. if already
6590 + * present in the list, and error code is returned.
6592 +/* function description: evms_cs_add_item_to_list
6594 + * this function adds an item to an item list.
6596 + * RC == 0 is returned for:
6597 + * a successful add of a new item
6599 + * RC == 1 is returned when:
6600 + * the item is already on the list
6602 + * RC < 0 is returned for an error attempting to add the item.
6605 +evms_cs_add_item_to_list(struct evms_list_node **list, void *item)
6608 + struct evms_list_node **list_node, *new_node;
6610 + list_node = evms_cs_lookup_item_in_list(list, item);
6611 + if (*list_node == NULL) {
6612 + new_node = kmalloc(sizeof (struct evms_list_node), GFP_NOIO);
6614 + memset(new_node, 0, sizeof (struct evms_list_node));
6615 + new_node->item = item;
6616 + *list_node = new_node;
6623 + ("warning: attempt to add duplicate item(%p) to list(%p).\n",
6629 +EXPORT_SYMBOL(evms_cs_add_item_to_list);
6631 +/* function description: evms_remove_item_from_list
6633 + * this function removes a specified item from the
6634 + * specified list. if the specified item is not
6635 + * found in the list, and error is returned.
6638 +evms_cs_remove_item_from_list(struct evms_list_node **list, void *item)
6641 + struct evms_list_node **list_node;
6643 + /* check to see if item is in the list */
6644 + list_node = evms_cs_lookup_item_in_list(list, item);
6646 + /* was the node found in the list? */
6648 + /* yes, it was found */
6649 + struct evms_list_node *tmp_node;
6651 + /* save ptr to node being removed */
6652 + tmp_node = *list_node;
6653 + /* remove it from the global list */
6654 + *list_node = tmp_node->next;
6655 + /* delete removed node */
6658 + /* no, it was not found */
6661 + ("error(%d): attempt to remove nonexistant node(%p) from list(%p).\n",
6667 +EXPORT_SYMBOL(evms_cs_remove_item_from_list);
6669 +/* function description: evms_cs_register_device
6671 + * this function adds a device to the EVMS global device list.
6673 + * RC == 0 is returned for:
6674 + * a successful add of a new device
6676 + * RC == 1 is returned when:
6677 + * the device is already on the list
6679 + * RC < 0 is returned for an error attempting to add the device.
6682 +evms_cs_register_device(struct evms_logical_node *device)
6684 + return (evms_cs_add_item_to_list(&evms_global_device_list, device));
6687 +EXPORT_SYMBOL(evms_cs_register_device);
6689 +/* function description: evms_cs_unregister_device
6691 + * this function removes a device from the EVMS global device list.
6693 + * RC == 0 is returned for:
6694 + * a successful removal of the specified device
6696 + * RC < 0 is returned for an error attempting to add the device.
6697 + * -ENODATA is returned if specified device is not found.
6700 +evms_cs_unregister_device(struct evms_logical_node *device)
6702 + return (evms_cs_remove_item_from_list(&evms_global_device_list,
6706 +EXPORT_SYMBOL(evms_cs_unregister_device);
6708 +static struct evms_list_node *find_first_next_list_node = NULL;
6710 +evms_cs_find_next_device(struct evms_logical_node *in_device,
6711 + struct evms_logical_node **out_device)
6714 + struct evms_list_node **list_node;
6716 + if (in_device == NULL)
6717 + find_first_next_list_node = evms_global_device_list;
6720 + evms_cs_lookup_item_in_list(&evms_global_device_list,
6722 + find_first_next_list_node = *list_node;
6723 + if (find_first_next_list_node == NULL)
6726 + find_first_next_list_node =
6727 + find_first_next_list_node->next;
6730 + if (find_first_next_list_node == NULL)
6731 + *out_device = NULL;
6733 + *out_device = (struct evms_logical_node *)
6734 + find_first_next_list_node->item;
6739 +EXPORT_SYMBOL(evms_cs_find_next_device);
6742 +evms_cs_signal_event(int eventid)
6745 + struct evms_list_node **list_node;
6747 + /* signal PID(s) of specified event */
6748 + list_node = &evms_global_notify_list;
6749 + while (*list_node) {
6750 + struct evms_event *event;
6752 + event = (*list_node)->item;
6753 + if (event->eventid == eventid) {
6754 + struct task_struct *tsk;
6756 + tsk = find_task_by_pid(event->pid);
6758 + struct siginfo siginfo;
6760 + siginfo.si_signo = event->signo;
6761 + siginfo.si_errno = 0;
6762 + siginfo.si_code = 0;
6763 + rc = send_sig_info(event->signo, &siginfo, tsk);
6766 + * unregister this stale
6767 + * notification record
6771 + list_node = &(*list_node)->next;
6775 +EXPORT_SYMBOL(evms_cs_signal_event);
6778 +evms_flush_signals(void)
6780 +	spin_lock(&current->sigmask_lock);
6781 + flush_signals(current);
6782 +	spin_unlock(&current->sigmask_lock);
6786 +evms_init_signals(void)
6788 + current->exit_signal = SIGCHLD;
6789 +	siginitsetinv(&current->blocked, sigmask(SIGKILL));
6793 +evms_thread(void *arg)
6795 + struct evms_thread *thread = arg;
6804 + sprintf(current->comm, thread->name);
6805 + evms_init_signals();
6806 + evms_flush_signals();
6807 + thread->tsk = current;
6809 + current->policy = SCHED_OTHER;
6810 +#ifdef O1_SCHEDULER
6811 + set_user_nice(current, -20);
6813 + current->nice = -20;
6817 + complete(thread->event);
6818 + while (thread->run) {
6819 + void (*run) (void *data);
6820 + DECLARE_WAITQUEUE(wait, current);
6822 + add_wait_queue(&thread->wqueue, &wait);
6823 +#ifdef O1_SCHEDULER
6824 + set_current_state(TASK_INTERRUPTIBLE);
6826 + set_task_state(current, TASK_INTERRUPTIBLE);
6828 + if (!test_bit(EVMS_THREAD_WAKEUP, &thread->flags)) {
6831 +#ifdef O1_SCHEDULER
6832 + set_current_state(TASK_RUNNING);
6834 + current->state = TASK_RUNNING;
6836 + remove_wait_queue(&thread->wqueue, &wait);
6837 + clear_bit(EVMS_THREAD_WAKEUP, &thread->flags);
6839 + run = thread->run;
6841 + run(thread->data);
6842 + run_task_queue(&tq_disk);
6844 + if (signal_pending(current)) {
6845 + evms_flush_signals();
6848 + complete(thread->event);
6852 +struct evms_thread *
6853 +evms_cs_register_thread(void (*run) (void *), void *data, const u8 * name)
6855 + struct evms_thread *thread;
6857 + struct completion event;
6859 + thread = kmalloc(sizeof (struct evms_thread), GFP_KERNEL);
6863 + memset(thread, 0, sizeof (struct evms_thread));
6864 + init_waitqueue_head(&thread->wqueue);
6866 + init_completion(&event);
6867 + thread->event = &event;
6868 + thread->run = run;
6869 + thread->data = data;
6870 + thread->name = name;
6871 + ret = kernel_thread(evms_thread, thread, 0);
6876 + wait_for_completion(&event);
6880 +EXPORT_SYMBOL(evms_cs_register_thread);
6883 +evms_cs_unregister_thread(struct evms_thread *thread)
6885 + struct completion event;
6887 + init_completion(&event);
6889 + thread->event = &event;
6890 + thread->run = NULL;
6891 + thread->name = NULL;
6892 + evms_cs_interrupt_thread(thread);
6893 + wait_for_completion(&event);
6897 +EXPORT_SYMBOL(evms_cs_unregister_thread);
6900 +evms_cs_wakeup_thread(struct evms_thread *thread)
6902 + set_bit(EVMS_THREAD_WAKEUP, &thread->flags);
6903 + wake_up(&thread->wqueue);
6906 +EXPORT_SYMBOL(evms_cs_wakeup_thread);
6909 +evms_cs_interrupt_thread(struct evms_thread *thread)
6911 + if (!thread->tsk) {
6912 + LOG_ERROR("error: attempted to interrupt an invalid thread!\n");
6915 + send_sig(SIGKILL, thread->tsk, 1);
6918 +EXPORT_SYMBOL(evms_cs_interrupt_thread);
6920 +struct proc_dir_entry *
6921 +evms_cs_get_evms_proc_dir(void)
6923 +#ifdef CONFIG_PROC_FS
6924 + if (!evms_proc_dir) {
6925 + evms_proc_dir = create_proc_entry("evms", S_IFDIR, &proc_root);
6928 + return (evms_proc_dir);
6931 +EXPORT_SYMBOL(evms_cs_get_evms_proc_dir);
6934 +evms_cs_volume_request_in_progress(kdev_t dev,
6935 + int operation, int *current_count)
6938 + struct evms_logical_volume *volume;
6940 + volume = &evms_logical_volumes[MINOR(dev)];
6941 + if (volume->node) {
6942 + if (operation > 0) {
6943 + atomic_inc(&volume->requests_in_progress);
6944 + } else if (operation < 0) {
6945 + atomic_dec(&volume->requests_in_progress);
6947 + if (current_count) {
6949 + atomic_read(&volume->requests_in_progress);
6957 +EXPORT_SYMBOL(evms_cs_volume_request_in_progress);
6960 +evms_cs_invalidate_volume(struct evms_logical_node *node)
6963 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
6964 + if (evms_logical_volumes[i].node && node->name) {
6967 + (evms_logical_volumes[i].node->name,
6970 + ("Invalidating EVMS device %s minor %d\n",
6972 + invalidate_device(MKDEV(EVMS_MAJOR, i), 0);
6979 +EXPORT_SYMBOL(evms_cs_invalidate_volume);
6984 + return atomic_read(&evms_logical_volumes[minor].opens);
6987 +/**********************************************************/
6988 +/* END -- exported functions/Common Services */
6989 +/**********************************************************/
6991 +/**********************************************************/
6992 +/* START -- Proc FS Support functions */
6993 +/**********************************************************/
6995 +#ifdef CONFIG_PROC_FS
6997 +evms_info_read_proc(char *page,
6998 + char **start, off_t off, int count, int *eof, void *data)
7001 + char *info_level_text = NULL;
7003 + PROCPRINT("Enterprise Volume Management System: Info\n");
7004 + switch (evms_info_level) {
7005 + case EVMS_INFO_CRITICAL:
7006 + info_level_text = "critical";
7008 + case EVMS_INFO_SERIOUS:
7009 + info_level_text = "serious";
7011 + case EVMS_INFO_ERROR:
7012 + info_level_text = "error";
7014 + case EVMS_INFO_WARNING:
7015 + info_level_text = "warning";
7017 + case EVMS_INFO_DEFAULT:
7018 + info_level_text = "default";
7020 + case EVMS_INFO_DETAILS:
7021 + info_level_text = "details";
7023 + case EVMS_INFO_DEBUG:
7024 + info_level_text = "debug";
7026 + case EVMS_INFO_EXTRA:
7027 + info_level_text = "extra";
7029 + case EVMS_INFO_ENTRY_EXIT:
7030 + info_level_text = "entry exit";
7032 + case EVMS_INFO_EVERYTHING:
7033 + info_level_text = "everything";
7036 + info_level_text = "unknown";
7039 + PROCPRINT("EVMS info level: %d (%s).\n",
7040 + evms_info_level, info_level_text);
7042 + PROCPRINT("EVMS kernel version: %d.%d.%d\n",
7043 + EVMS_MAJOR_VERSION,
7044 + EVMS_MINOR_VERSION, EVMS_PATCHLEVEL_VERSION);
7046 + PROCPRINT("EVMS IOCTL interface version: %d.%d.%d\n",
7047 + EVMS_IOCTL_INTERFACE_MAJOR,
7048 + EVMS_IOCTL_INTERFACE_MINOR, EVMS_IOCTL_INTERFACE_PATCHLEVEL);
7050 + PROCPRINT("EVMS Common Services version: %d.%d.%d\n",
7051 + EVMS_COMMON_SERVICES_MAJOR,
7052 + EVMS_COMMON_SERVICES_MINOR, EVMS_COMMON_SERVICES_PATCHLEVEL);
7057 + *start = page + off;
7061 + return sz > count ? count : sz;
7065 +evms_plugins_read_proc(char *page,
7066 + char **start, off_t off, int count, int *eof, void *data)
7069 + struct evms_registered_plugin *rp = NULL;
7071 + PROCPRINT("Enterprise Volume Management System: Plugins\n");
7072 + /* 0 1 1 2 2 3 3 4 4 5 5 6 6 7 */
7073 + /* 1 5 0 5 0 5 0 5 0 5 0 5 0 5 0 */
7074 + PROCPRINT(" ---------Plugin---------- required services\n");
7075 + PROCPRINT(" ----id---- version version\n\n");
7076 + for (rp = registered_plugin_head; rp; rp = rp->next) {
7077 + PROCPRINT(" %x.%x.%x\t %d.%d.%d\t%d.%d.%d\n",
7078 + GetPluginOEM(rp->plugin->id),
7079 + GetPluginType(rp->plugin->id),
7080 + GetPluginID(rp->plugin->id),
7081 + rp->plugin->version.major,
7082 + rp->plugin->version.minor,
7083 + rp->plugin->version.patchlevel,
7084 + rp->plugin->required_services_version.major,
7085 + rp->plugin->required_services_version.minor,
7086 + rp->plugin->required_services_version.patchlevel);
7090 + *start = page + off;
7094 + return sz > count ? count : sz;
7098 +evms_volumes_read_proc(char *page,
7099 + char **start, off_t off, int count, int *eof, void *data)
7103 + PROCPRINT("Enterprise Volume Management System: Volumes\n");
7104 + PROCPRINT("major minor #blocks type flags name\n\n");
7105 + for (j = 1; j < MAX_EVMS_VOLUMES; j++) {
7106 + struct evms_logical_volume *volume;
7108 + volume = &evms_logical_volumes[j];
7109 + if (volume->node) {
7110 + PROCPRINT("%5d %7d %16Ld %s %s %s %s%s\n",
7112 + (long long)volume->node->total_vsectors >> 1,
7114 + flags & EVMS_VOLUME_FLAG) ? "evms " :
7117 + flags & EVMS_VOLUME_READ_ONLY) ? "ro" : "rw",
7119 + flags & EVMS_VOLUME_PARTIAL) ? "p " : " ",
7120 + EVMS_DEV_NODE_PATH, volume->name);
7124 + *start = page + off;
7128 + return sz > count ? count : sz;
7133 +/**********************************************************/
7134 +/* END -- Proc FS Support functions */
7135 +/**********************************************************/
7137 +/**********************************************************/
7138 +/* START -- FOPS functions definitions */
7139 +/**********************************************************/
7141 +/************************************************/
7142 +/* START -- IOCTL commands -- EVMS specific */
7143 +/************************************************/
7146 +evms_ioctl_cmd_get_ioctl_version(void *arg)
7149 + struct evms_version ver;
7151 + ver.major = EVMS_IOCTL_INTERFACE_MAJOR;
7152 + ver.minor = EVMS_IOCTL_INTERFACE_MINOR;
7153 + ver.patchlevel = EVMS_IOCTL_INTERFACE_PATCHLEVEL;
7155 + /* copy info to userspace */
7156 + if (copy_to_user(arg, &ver, sizeof (ver)))
7163 +evms_ioctl_cmd_get_version(void *arg)
7166 + struct evms_version ver;
7168 + ver.major = EVMS_MAJOR_VERSION;
7169 + ver.minor = EVMS_MINOR_VERSION;
7170 + ver.patchlevel = EVMS_PATCHLEVEL_VERSION;
7172 + /* copy info to userspace */
7173 + if (copy_to_user(arg, &ver, sizeof (ver)))
7180 +evms_ioctl_cmd_get_info_level(void *arg)
7184 + /* copy info to userspace */
7185 + if (copy_to_user(arg, &evms_info_level, sizeof (evms_info_level)))
7192 +evms_ioctl_cmd_set_info_level(void *arg)
7196 + /* copy info from userspace */
7197 + if (copy_from_user(&temp, arg, sizeof (temp)))
7200 + evms_info_level = temp;
7205 +/* function: evms_quiesce_volume
7207 + * this function performs the actual quiesce operation on
7208 + * a volume in kernel memory.
7210 + * when quiescing, all new I/Os to a volume are stopped,
7211 + * causing the calling thread to block. this thread then
7212 + * waits until all I/Os in progress are completed, before
7213 + * return control to the caller.
7215 + * when unquiescing, all new I/Os are allowed to proceed
7216 + * unencumbered, and all threads waiting (blocked) on this
7217 + * volume, are woken up and allowed to proceed.
7221 +evms_quiesce_volume(struct evms_logical_volume *volume,
7222 + struct inode *inode,
7223 + struct file *file, struct evms_quiesce_vol_pkt *qv)
7227 + LOG_DEBUG("%squiescing %s.\n",
7228 + ((qv->command) ? "" : "un"), volume->name);
7230 +#ifdef VFS_PATCH_PRESENT
7232 + /* VFS function call to sync and lock the filesystem */
7233 + fsync_dev_lockfs(MKDEV(EVMS_MAJOR, qv->minor));
7234 + volume->vfs_quiesced = TRUE;
7237 + volume->quiesced = qv->command;
7239 + /* Command specified was "quiesce". */
7240 + if (qv->command) {
7241 + /* After setting the volume to
7242 + * a quiesced state, there could
7243 + * be threads (on SMP systems)
7244 + * that are executing in the
7245 + * function, evms_handle_request,
7246 + * between the "wait_event" and the
7247 + * "atomic_inc" lines. We need to
7248 + * provide a "delay" sufficient
7249 + * to allow those threads to
7250 + * to reach the atomic_inc's
7251 + * before executing the while loop
7252 + * below. The "schedule" call should
7256 + /* wait for outstanding requests
7259 + while (atomic_read(&volume->requests_in_progress) > 0)
7262 + /* send this command down the stack so lower */
7263 + /* layers can know about this */
7264 + rc = IOCTL(volume->node, inode, file,
7265 + EVMS_QUIESCE_VOLUME, (unsigned long) qv);
7267 + /* Command specified was "unquiesce". */
7268 + if (!qv->command) {
7269 + /* "wakeup" any I/O requests waiting on
7272 + if (waitqueue_active(&volume->wait_queue))
7273 + wake_up(&volume->wait_queue);
7274 +#ifdef VFS_PATCH_PRESENT
7275 + if (volume->vfs_quiesced) {
7276 + /* VFS function call to unlock the filesystem */
7277 + unlockfs(MKDEV(EVMS_MAJOR, qv->minor));
7278 + volume->vfs_quiesced = FALSE;
7283 + LOG_ERROR("error(%d) %squiescing %s.\n",
7284 + rc, ((qv->command) ? "" : "un"), volume->name);
7289 +/* function: evms_delete_volume
7291 + * this function performs the actual delete operation on
7292 + * a volume to purge it from kernel memory. all structures
7293 + * and memory consumed by this volume will be free as well
7294 + * as clearing or unregistering any system services or
7295 + * global data arrays.
7297 + * NOTE: this function will return -EBUSY on attempts to
7298 + * delete mounted volumes.
7302 +evms_delete_volume(struct evms_logical_volume *volume,
7303 + struct evms_delete_vol_pkt *dv)
7307 + /* if this is a "permament" delete */
7308 + /* check to make sure volume is not mounted */
7309 + if (dv->command) {
7310 + if (is_open(dv->minor)) {
7313 + // invalidate the device since it is not coming back
7314 + // this is required incase we are re-using the minor number
7315 + invalidate_device(MKDEV(EVMS_MAJOR, dv->minor), 1);
7319 + /* invoke the delete ioctl at the top of the feature stack */
7321 + LOG_DETAILS("deleting '%s'.\n", volume->name);
7322 + rc = DELETE(volume->node);
7325 + /* the volume has been deleted, do any clean up work
7329 + devfs_unregister(volume->devfs_handle);
7330 + if (dv->command) {
7331 + /* if "permanent" delete, free the name
7332 + * and NULL the name field.
7334 + kfree(volume->name);
7335 + volume->name = NULL;
7336 + volume->flags = 0;
7338 + /* if "soft" delete, leave the name so
7339 + * we can use it to reassign the same
7340 + * minor to this volume after a
7343 + volume->flags = EVMS_VOLUME_SOFT_DELETED;
7345 + volume->node = NULL;
7346 + set_device_ro(MKDEV(EVMS_MAJOR, dv->minor), 0);
7347 + blk_size[EVMS_MAJOR][dv->minor] = 0;
7348 + blksize_size[EVMS_MAJOR][dv->minor] = 0;
7349 + hardsect_size[EVMS_MAJOR][dv->minor] = 0;
7352 + LOG_ERROR("error(%d) %s deleting %s.\n",
7353 + rc, ((dv->command) ? "hard" : "soft"), volume->name);
7358 +/* function: evms_user_delete_volume
7360 + * this function, depending on the parameters, performs
7361 + * a "soft" or a "hard" delete. for a "soft" delete, a
7362 + * quiesce & delete request is queued up, to be executed
7363 + * at the beginning of the next rediscovery. for a
7364 + * "hard" delete, the target volume is quiesced and then
7365 + * deleted. if there are any errors attempting to delete
7366 + * the target, then the target is unquiesced. if an
7367 + * associative volume is specified it is quiesced before
7368 + * the target volume is quiesced, and is unquiesced
7369 + * after the attempt to delete the target volume.
7373 +evms_user_delete_volume(struct evms_logical_volume *lvt,
7374 + struct inode *inode,
7375 + struct file *file, struct evms_delete_vol_pkt *dv)
7379 + if (!dv->command) {
7380 + /* "soft delete" requested */
7381 + lvt->flags |= (EVMS_REQUESTED_QUIESCE | EVMS_REQUESTED_DELETE);
7383 + lvt->flags |= EVMS_REQUESTED_VFS_QUIESCE;
7386 + /* "hard delete" requested */
7388 + struct evms_quiesce_vol_pkt qv;
7389 + struct evms_logical_volume *lva = NULL;
7391 + if (dv->associative_minor) {
7392 + /* associative volume specified
7396 + lva = &evms_logical_volumes[dv->associative_minor];
7397 + /* quiesce associative volume */
7398 + qv.command = EVMS_QUIESCE;
7399 + qv.do_vfs = EVMS_VFS_DO_NOTHING;
7400 + qv.minor = dv->associative_minor;
7401 + rc = evms_quiesce_volume(lva, inode, file, &qv);
7402 + qa = (rc) ? FALSE : TRUE;
7405 + /* quiesce target volume */
7406 + qv.command = EVMS_QUIESCE;
7407 + qv.do_vfs = EVMS_VFS_DO_NOTHING;
7408 + qv.minor = dv->minor;
7409 + rc = evms_quiesce_volume(lvt, inode, file, &qv);
7412 + /* delete the target volume */
7413 + rc = evms_delete_volume(lvt, dv);
7415 + /* got an error undeleting...
7417 + * unquiesce the target
7419 + qv.command = EVMS_UNQUIESCE;
7420 + qv.do_vfs = EVMS_VFS_DO_NOTHING;
7421 + qv.minor = dv->minor;
7422 + evms_quiesce_volume(lvt, inode, file, &qv);
7425 + if (dv->associative_minor) {
7426 + /* associative volume specified
7431 + /* only unquiesce associative
7432 + * if we successfully quiesced
7435 + qv.command = EVMS_UNQUIESCE;
7436 + qv.do_vfs = EVMS_VFS_DO_NOTHING;
7437 + qv.minor = dv->associative_minor;
7438 + evms_quiesce_volume(lva, inode, file, &qv);
7445 +/* function: evms_ioctl_cmd_delete_volume
7447 + * this function copy user data to/from the kernel, and
7448 + * validates user parameters. after validation, control
7449 + * is passed to worker routine evms_user_delete_volume.
7453 +evms_ioctl_cmd_delete_volume(struct inode *inode,
7454 + struct file *file, unsigned long arg)
7457 + struct evms_delete_vol_pkt tmp, *user_parms;
7458 + struct evms_logical_volume *volume = NULL;
7460 + user_parms = (struct evms_delete_vol_pkt *) arg;
7461 + /* copy user's parameters to kernel space */
7462 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
7465 + /* check to make sure associative minor is in use */
7467 + if (tmp.associative_minor) {
7468 + volume = &evms_logical_volumes[tmp.associative_minor];
7469 + if (volume->node == NULL)
7473 + /* check to make sure target minor is in use */
7475 + volume = &evms_logical_volumes[tmp.minor];
7476 + if (volume->node == NULL)
7479 + rc = evms_user_delete_volume(volume, inode, file, &tmp);
7481 + /* copy the status value back to the user */
7483 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7489 +/* function: evms_full_rediscover_prep
7491 + * this function helps to prevent problems when evms is
7492 + * configured with the base built in statically and some
7493 + * plugins built as modules.
7495 + * in these cases, when the initial discovery is done,
7496 + * only the statically built modules are available for
7497 + * volume construction. as a result, some volumes that
7498 + * require the plugins built as modules (which haven't
7499 + * been loaded), to be fully reconstructed, may come up
7500 + * as compatibility volumes or partial volumes.
7502 + * when parts of evms are built as modules, the
7503 + * evms_rediscover utility is used, to perform a secondary
7504 + * rediscover, after all the plugins built as modules
7505 + * have been loaded, to construct all the volumes
7506 + * requiring these plugins.
7508 + * however since some of the volumes, requiring the plugins
7509 + * built as modules, may have been already exported as
7510 + * compatibility or partial volumes, we need to purge these
7511 + * volumes from the kernel's memory, so they can be rediscovered
7512 + * and claimed by the appropriate plugins, and reconstructed
7513 + * into the correct volumes.
7515 + * this function purges all compatibility volumes that are
7516 + * not in use(mounted) and all partial volumes, prior to
7517 + * doing the secondary rediscover, thus allowing volumes to
7518 + * be rediscovered correctly.
7520 + * NOTE: again, this is only required in cases when a
7521 + * combination of plugins are built statically and as
7526 +evms_full_rediscover_prep(struct inode *inode, struct file *file)
7530 + LOG_DETAILS("%s: started.\n", __FUNCTION__);
7531 + /* check for acceptable volumes to be deleted */
7532 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7533 + struct evms_logical_volume *volume = NULL;
7534 + struct evms_delete_vol_pkt dv;
7535 + int volume_open, doit;
7537 + volume = &evms_logical_volumes[i];
7538 + if (!volume->node)
7540 + volume_open = is_open(i);
7541 + /* only proceed on volumes that are:
7544 + * unopened compatibility volumes
7547 + if (volume->flags & EVMS_VOLUME_PARTIAL) {
7548 + /* do all partial volumes
7551 + } else if (!(volume->flags & EVMS_VOLUME_FLAG)) {
7552 + /* check all compatibility volumes
7554 + if (!volume_open && !is_swap_partition(MKDEV(EVMS_MAJOR, i))) {
7555 + /* only do unopened volumes
7560 + if (doit == FALSE) {
7563 + /* delete the volume from memory.
7564 + * do a 'soft' delete if volume
7565 + * is mounted, and 'hard' delete
7568 + * NOTE: the delete operation will
7569 + * clear the bits in the flags field.
7571 + dv.command = (volume_open) ?
7572 + EVMS_SOFT_DELETE : EVMS_HARD_DELETE;
7574 + dv.associative_minor = 0;
7576 + rc = evms_user_delete_volume(volume, inode, file, &dv);
7578 + LOG_DETAILS("%s: completed.\n", __FUNCTION__);
7582 +evms_ioctl_cmd_rediscover_volumes(struct inode *inode,
7583 + struct file *file,
7584 + unsigned int cmd, unsigned long arg)
7587 + struct evms_rediscover_pkt tmp, *user_parms;
7588 + u64 *array_ptr = NULL;
7589 + ulong array_size = 0;
7590 + struct evms_logical_volume *volume = NULL;
7592 + rc = tmp.drive_count = 0;
7593 + user_parms = (struct evms_rediscover_pkt *) arg;
7594 + /* copy user's parameters to kernel space */
7595 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
7598 + if (tmp.drive_count == REDISCOVER_ALL_DEVICES) {
7599 + evms_full_rediscover_prep(inode, file);
7601 + /* quiesce all queued volumes */
7602 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7603 + struct evms_quiesce_vol_pkt qv;
7605 + volume = &evms_logical_volumes[i];
7606 + if (!volume->node) {
7609 + if (!(volume->flags & EVMS_REQUESTED_QUIESCE)) {
7612 + qv.command = EVMS_QUIESCE;
7614 + qv.do_vfs = (volume->flags & EVMS_REQUESTED_VFS_QUIESCE) ?
7615 + EVMS_VFS_DO : EVMS_VFS_DO_NOTHING, qv.status = 0;
7616 + rc = evms_quiesce_volume(volume, inode, file, &qv);
7618 + /* "soft" delete all queued volumes */
7619 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7620 + struct evms_delete_vol_pkt dv;
7622 + volume = &evms_logical_volumes[i];
7623 + if (!volume->node) {
7626 + if (!(volume->flags & EVMS_REQUESTED_DELETE)) {
7629 + dv.command = EVMS_SOFT_DELETE;
7631 + dv.associative_minor = 0;
7633 + rc = evms_delete_volume(volume, &dv);
7636 + if (tmp.drive_count && (tmp.drive_count != REDISCOVER_ALL_DEVICES)) {
7638 + /* create space for userspace drive array */
7640 + sizeof (*tmp.drive_array) * tmp.drive_count;
7641 + array_ptr = tmp.drive_array;
7642 + tmp.drive_array = kmalloc(array_size, GFP_KERNEL);
7643 + if (!tmp.drive_array) {
7648 + /* copy rediscover drive array to kernel space */
7649 + if (copy_from_user
7650 + (tmp.drive_array, array_ptr, array_size))
7655 + static int evms_discover_volumes(struct evms_rediscover_pkt *);
7656 + /* perform the rediscovery operation */
7657 + rc = evms_discover_volumes(&tmp);
7660 + /* clean up after operation */
7661 + if (tmp.drive_count && (tmp.drive_count != REDISCOVER_ALL_DEVICES))
7662 + kfree(tmp.drive_array);
7664 + /* set return code and copy info to userspace */
7666 + if (copy_to_user(&user_parms->status, &tmp.status, sizeof (tmp.status)))
7672 +static struct evms_list_node *user_disk_ptr;
7674 +evms_ioctl_cmd_get_logical_disk(void *arg)
7677 + struct evms_user_disk_pkt tmp, *user_parms;
7679 + user_parms = (struct evms_user_disk_pkt *) arg;
7680 + /* copy user's parameters to kernel space */
7681 + if (copy_from_user
7682 + (&tmp.command, &user_parms->command, sizeof (tmp.command)))
7686 + if (tmp.command == EVMS_FIRST_DISK)
7687 + user_disk_ptr = evms_global_device_list;
7688 + else /* tmp.command == EVMS_NEXT_DISK */
7689 + user_disk_ptr = user_disk_ptr->next;
7691 + if (user_disk_ptr == NULL)
7692 + tmp.status = EVMS_DISK_INVALID;
7694 + tmp.status = EVMS_DISK_VALID;
7696 + NODE_TO_DEV_HANDLE(user_disk_ptr->item);
7698 + /* copy info to userspace */
7699 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7706 +evms_ioctl_cmd_get_logical_disk_info(void *arg)
7709 + struct evms_user_disk_info_pkt tmp, *user_parms;
7710 + struct evms_list_node *p;
7711 + struct evms_logical_node *disk_node = NULL;
7713 + user_parms = (struct evms_user_disk_info_pkt *) arg;
7714 + /* copy user's parameters to kernel space */
7715 + if (copy_from_user
7716 + (&tmp.disk_handle, &user_parms->disk_handle,
7717 + sizeof (tmp.disk_handle)))
7720 + /* check handle for validity */
7723 + disk_node = DEV_HANDLE_TO_NODE(tmp.disk_handle);
7724 + for (p = evms_global_device_list; p; p = p->next)
7725 + if (p->item == disk_node) {
7727 + user_disk_ptr = p;
7732 + /* populate kernel copy of user's structure with appropriate info */
7734 + struct hd_geometry geo;
7735 + struct evms_logical_node *node =
7736 + (struct evms_logical_node *) user_disk_ptr->item;
7737 + tmp.flags = node->flags;
7738 + strcpy(tmp.disk_name, EVMS_DEV_NODE_PATH);
7739 + strcat(tmp.disk_name, node->name);
7740 + rc = evms_cs_kernel_ioctl(node, EVMS_UPDATE_DEVICE_INFO,
7743 + tmp.total_sectors = node->total_vsectors;
7744 + tmp.hardsect_size = node->hardsector_size;
7745 + tmp.block_size = node->block_size;
7746 + rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO,
7747 + (unsigned long) &geo);
7750 + tmp.geo_sectors = geo.sectors;
7751 + tmp.geo_heads = geo.heads;
7752 + tmp.geo_cylinders = geo.cylinders;
7756 + /* set return code and copy info to userspace */
7758 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7765 +evms_ioctl_cmd_sector_io(void *arg)
7768 +#define MAX_IO_SIZE 128
7769 + u64 io_size, max_io_size = MAX_IO_SIZE;
7771 + struct evms_sector_io_pkt tmp, *user_parms;
7772 + struct evms_logical_node *disk_node = NULL;
7773 + struct evms_list_node *list_node;
7774 + unsigned char *io_buffer;
7780 + user_parms = (struct evms_sector_io_pkt *) arg;
7781 + /* copy user's parameters to kernel space */
7782 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
7785 + /* check handle for validity */
7788 + disk_node = DEV_HANDLE_TO_NODE(tmp.disk_handle);
7789 + for (list_node = evms_global_device_list; list_node;
7790 + list_node = list_node->next)
7791 + if (list_node->item == disk_node) {
7799 + /* allocate an io buffer up to 64Kbytes in size */
7799 + if (tmp.sector_count < max_io_size)
7800 + max_io_size = tmp.sector_count;
7803 + /* allocate buffer large enough to max_io_size sectors */
7805 + kmalloc(max_io_size << EVMS_VSECTOR_SIZE_SHIFT,
7808 + max_io_size >>= 1;
7809 + if (!max_io_size) {
7817 + /* perform io with specified disk */
7819 + u64 io_sector_offset, io_remaining;
7821 + u_char *user_buffer_ptr;
7823 + io_remaining = tmp.sector_count;
7824 + io_sector_offset = 0;
7825 + user_buffer_ptr = tmp.buffer_address;
7826 + while (io_remaining) {
7827 + /* compute the io_size for this pass */
7828 + io_size = (io_remaining >= max_io_size) ?
7829 + max_io_size : io_remaining;
7831 + io_bytes = io_size << EVMS_VSECTOR_SIZE_SHIFT;
7832 + /* for writes, copy a sector from user to kernel */
7833 + if (tmp.io_flag == EVMS_SECTOR_IO_WRITE) {
7834 + /* copy sector from user data buffer */
7835 + if (copy_from_user(io_buffer,
7836 + user_buffer_ptr, io_bytes))
7842 + /* perform IO one sector at a time */
7843 + rc = INIT_IO(disk_node,
7845 + io_sector_offset + tmp.starting_sector,
7846 + io_size, io_buffer);
7851 + if (tmp.io_flag != EVMS_SECTOR_IO_WRITE) {
7852 + /* copy sector to user data buffer */
7853 + if (copy_to_user(user_buffer_ptr,
7854 + io_buffer, io_bytes))
7860 + user_buffer_ptr += io_bytes;
7861 + tmp.buffer_address += io_bytes;
7862 + io_sector_offset += io_size;
7863 + io_remaining -= io_size;
7867 + /* if the sector_buffer was allocated, free it */
7871 + /* copy the status value back to the user */
7873 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7879 +static int user_minor;
7881 +evms_ioctl_cmd_get_minor(void *arg)
7884 + struct evms_user_minor_pkt tmp, *user_parms;
7886 + user_parms = (struct evms_user_minor_pkt *) arg;
7887 + /* copy user's parameters to kernel space */
7888 + if (copy_from_user
7889 + (&tmp.command, &user_parms->command, sizeof (tmp.command)))
7893 + if (tmp.command == EVMS_FIRST_VOLUME)
7895 + else /* tmp.command == EVMS_NEXT_VOLUME */
7898 + tmp.status = EVMS_VOLUME_INVALID;
7899 + for (; user_minor < MAX_EVMS_VOLUMES; user_minor++) {
7900 + struct evms_logical_volume *lv;
7902 + lv = &evms_logical_volumes[user_minor];
7903 + /* see if any corrupt volumes have been
7904 + * unmounted. If so, clean up the
7905 + * evms_logical_volumes array entry, and
7906 + * don't report the volume to the user.
7908 + if (lv->flags & EVMS_VOLUME_CORRUPT) {
7909 + if (!is_open(user_minor)) {
7910 + /* clear logical volume structure
7911 + * for this volume so it may be
7915 + ("ioctl_get_minor: found unmounted %s volume(%u,%u,%s).\n",
7917 + flags & EVMS_VOLUME_SOFT_DELETED)
7918 + ? "'soft deleted'" : ""),
7919 + EVMS_MAJOR, user_minor, lv->name);
7921 + (" releasing minor(%d) used by volume(%s)!\n",
7922 + user_minor, lv->name);
7928 + if (lv->node || (lv->flags & EVMS_VOLUME_CORRUPT)) {
7929 + tmp.status = EVMS_VOLUME_VALID;
7930 + tmp.minor = user_minor;
7935 + /* copy info to userspace */
7936 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7943 +evms_ioctl_cmd_get_volume_data(void *arg)
7946 + struct evms_volume_data_pkt tmp, *user_parms;
7947 + struct evms_logical_volume *volume = NULL;
7948 + struct evms_logical_node *node = NULL;
7950 + user_parms = (struct evms_volume_data_pkt *) arg;
7951 + /* copy user's parameters to kernel space */
7952 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
7956 + volume = &evms_logical_volumes[tmp.minor];
7957 + node = volume->node;
7962 + tmp.flags = volume->flags;
7963 + strcpy(tmp.volume_name, EVMS_DEV_NODE_PATH);
7964 + strcat(tmp.volume_name, volume->name);
7967 + /* copy return code and info to userspace */
7969 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7974 +static struct evms_registered_plugin *ioctl_reg_record;
7976 +evms_ioctl_cmd_get_plugin(void *arg)
7979 + struct evms_kernel_plugin_pkt tmp, *user_parms;
7981 + user_parms = (struct evms_kernel_plugin_pkt *) arg;
7982 + /* copy user's parameters to kernel space */
7983 + if (copy_from_user
7984 + (&tmp.command, &user_parms->command, sizeof (tmp.command)))
7988 + /* if the command is not 0, then verify
7989 + * that ioctl_reg_record is pointing to
7990 + * current and valid plugin header.
7992 + if (tmp.command) { /* tmp.command == EVMS_NEXT_PLUGIN */
7993 + struct evms_registered_plugin *tmp_reg_record;
7994 + tmp_reg_record = registered_plugin_head;
7995 + /* search the current plugin list */
7996 + while (tmp_reg_record) {
7997 + if (tmp_reg_record == ioctl_reg_record)
7999 + tmp_reg_record = tmp_reg_record->next;
8001 + /* if the ioctl_reg_record is not in the
8002 + * current list, then start at the beginning.
8004 + if (!tmp_reg_record)
8005 + tmp.command = EVMS_FIRST_PLUGIN;
8008 + if (tmp.command == EVMS_FIRST_PLUGIN)
8009 + /* start at beginning of plugin list */
8010 + ioctl_reg_record = registered_plugin_head;
8011 + else /* tmp.command == EVMS_NEXT_PLUGIN */
8012 + /* continue from current position in list */
8013 + ioctl_reg_record = ioctl_reg_record->next;
8015 + tmp.status = EVMS_PLUGIN_INVALID;
8017 + if (ioctl_reg_record) {
8018 + tmp.id = ioctl_reg_record->plugin->id;
8019 + tmp.version = ioctl_reg_record->plugin->version;
8020 + tmp.status = EVMS_PLUGIN_VALID;
8023 + /* copy info to userspace */
8024 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8031 +evms_ioctl_cmd_plugin_ioctl(struct inode *inode,
8032 + struct file *file,
8033 + unsigned int cmd, unsigned long arg)
8035 + int rc = 0, found = FALSE;
8036 + struct evms_plugin_ioctl_pkt tmp, *user_parms;
8037 + struct evms_registered_plugin *p;
8039 + user_parms = (struct evms_plugin_ioctl_pkt *) arg;
8040 + /* copy user's parameters to kernel space */
8041 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8045 + /* search for the specified plugin */
8046 + for (p = registered_plugin_head; p; p = p->next)
8047 + /* check for the specified feature id */
8048 + if (p->plugin->id == tmp.feature_id) {
8050 + /* check that entry point is used */
8051 + if (p->plugin->fops->direct_ioctl)
8052 + rc = DIRECT_IOCTL(p, inode, file, cmd,
8058 + /* was the specified plugin found? */
8059 + if (found == FALSE)
8062 + /* copy the status value back to the user */
8064 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8070 +#define MAX_BUFFER_SIZE 65536
8072 +evms_ioctl_cmd_kernel_partial_csum(void *arg)
8075 + u64 compute_size = MAX_BUFFER_SIZE;
8076 + struct evms_compute_csum_pkt tmp, *user_parms;
8077 + unsigned char *buffer = NULL;
8079 + user_parms = (struct evms_compute_csum_pkt *) arg;
8080 + /* copy user's parameters to kernel space */
8081 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8085 + /* allocate an io buffer up to 64Kbytes in size */
8086 + if (tmp.buffer_size < MAX_BUFFER_SIZE)
8087 + compute_size = tmp.buffer_size;
8089 + /* allocate buffer large enough to hold a single sector */
8090 + buffer = kmalloc(compute_size, GFP_KERNEL);
8095 + /* perform io with specified disk */
8097 + u64 remaining_bytes;
8098 + u_char *user_buffer_ptr;
8099 + unsigned int insum = tmp.insum;
8101 + remaining_bytes = tmp.buffer_size;
8102 + user_buffer_ptr = tmp.buffer_address;
8103 + while (remaining_bytes) {
8104 + /* compute the compute_size for this pass */
8105 + compute_size = (remaining_bytes >= MAX_BUFFER_SIZE) ?
8106 + MAX_BUFFER_SIZE : remaining_bytes;
8108 + /* copy into kernel from user data buffer */
8109 + if (copy_from_user(buffer, user_buffer_ptr,
8114 + /* compute the checksum for this pass */
8115 + tmp.outsum = csum_partial(buffer, tmp.buffer_size,
8117 + /* set up for another possible pass */
8118 + insum = tmp.outsum;
8119 + /* update loop progress variables */
8120 + user_buffer_ptr += compute_size;
8121 + tmp.buffer_address += compute_size;
8122 + remaining_bytes -= compute_size;
8126 + /* if the sector_buffer was allocated, free it */
8130 + /* copy the status value back to the user */
8132 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8138 +#undef MAX_BUFFER_SIZE
8141 +evms_ioctl_cmd_get_bmap(struct inode *inode,
8142 + struct file *file, unsigned int cmd, unsigned long arg)
8145 + struct evms_get_bmap_pkt tmp, *user_parms;
8147 + user_parms = (struct evms_get_bmap_pkt *) arg;
8148 + /* copy user's parameters to kernel space */
8149 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8152 + /* pass the ioctl down the volume stack */
8154 + struct evms_logical_volume *volume;
8156 + volume = &evms_logical_volumes[MINOR(inode->i_rdev)];
8157 + rc = IOCTL(volume->node, inode, file, cmd,
8158 + (unsigned long) &tmp);
8160 + /* copy the status value back to the user */
8162 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8169 +evms_ioctl_cmd_process_notify_event(unsigned long arg)
8171 + int rc = 0, found = FALSE;
8172 + struct evms_notify_pkt tmp, *user_parms;
8173 + struct evms_list_node **list_node = NULL;
8174 + struct evms_event *event = NULL;
8176 + user_parms = (struct evms_notify_pkt *) arg;
8177 + /* copy user's parameters to kernel space */
8178 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8181 + /* check to see if PID has already been registered
8185 + list_node = &evms_global_notify_list;
8186 + while (*list_node) {
8187 + event = (*list_node)->item;
8188 + if ((event->pid == tmp.eventry.pid) &&
8189 + (event->eventid == tmp.eventry.eventid)) {
8193 + list_node = &(*list_node)->next;
8196 + if (tmp.command) { /* tmp.command == EVMS_REGISTER_EVENT */
8197 + /* registration code */
8201 + ("error(%d) pid(%d) already register to receive signal(%d) on event(%d).\n",
8202 + rc, tmp.eventry.pid, tmp.eventry.signo,
8203 + tmp.eventry.eventid);
8205 + /* register this pid/event type */
8206 + event = kmalloc(sizeof (struct evms_event), GFP_KERNEL);
8210 + ("error(%d) allocating event structure.\n",
8213 + memset(event, 0, sizeof (struct evms_event));
8214 + event->pid = tmp.eventry.pid;
8215 + event->eventid = tmp.eventry.eventid;
8216 + event->signo = tmp.eventry.signo;
8217 + rc = evms_cs_add_item_to_list
8218 + (&evms_global_notify_list, event);
8221 + } else { /* tmp.command == EVMS_UNREGISTER_EVENT */
8222 + /* unregistration code */
8226 + ("error(%d) attempting to unregister a non-registered pid(%d) on event(%d).\n",
8227 + rc, tmp.eventry.pid, tmp.eventry.eventid);
8229 + event = (*list_node)->item;
8230 + rc = evms_cs_remove_item_from_list
8231 + (&evms_global_notify_list, event);
8237 + /* copy the status value back to the user */
8239 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8246 +evms_ioctl_cmd_check_mount_status(struct inode *inode, struct file *file,
8250 + struct evms_mount_status_pkt tmp, *user_parms;
8252 + user_parms = (struct evms_mount_status_pkt *) arg;
8253 + /* copy user's parameters to kernel space */
8254 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8259 + (is_mounted(MKDEV(EVMS_MAJOR, tmp.minor))) ? TRUE : FALSE;
8262 + /* copy the status value back to the user */
8264 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8271 +evms_ioctl_cmd_check_open_status(struct inode *inode, struct file *file,
8275 + struct evms_open_status_pkt tmp, *user_parms;
8277 + user_parms = (struct evms_open_status_pkt *) arg;
8278 + /* copy user's parameters to kernel space */
8279 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8283 + tmp.opens = is_open(tmp.minor);
8286 + /* copy the status value back to the user */
8288 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8294 +/************************************************/
8295 +/* END -- IOCTL commands -- EVMS specific */
8296 +/************************************************/
8298 +/************************************************/
8299 +/* START -- IOCTL commands -- Volume specific */
8300 +/************************************************/
8302 +/************************************************/
8303 +/* END -- IOCTL commands -- Volume specific */
8304 +/************************************************/
8306 +/************************************************/
8307 +/* START -- IOCTL main */
8308 +/************************************************/
8311 + * Function: evms_ioctl
8313 + * This function is the main ioctl entry point for all of evms.
8317 +evms_ioctl(struct inode *inode,
8318 + struct file *file, unsigned int cmd, unsigned long arg)
8320 + unsigned long minor = 0;
8322 + struct evms_logical_node *node = NULL;
8324 + /* check user access */
8325 + if (!capable(CAP_SYS_ADMIN))
8332 + /* get the minor */
8333 + minor = MINOR(inode->i_rdev);
8335 + ("ioctl: minor(%lu), dir(%d), size(%d), type(%d), nr(%d)\n",
8336 + minor, (cmd >> _IOC_DIRSHIFT) & _IOC_DIRMASK,
8337 + (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
8338 + (cmd >> _IOC_TYPESHIFT) & _IOC_TYPEMASK,
8339 + (cmd >> _IOC_NRSHIFT) & _IOC_NRMASK);
8341 + /* insure this minor points to a valid volume */
8343 + node = evms_logical_volumes[minor].node;
8349 + /* process the IOCTL commands */
8352 + /* process all EVMS specific commands */
8354 + case EVMS_GET_IOCTL_VERSION:
8355 + rc = evms_ioctl_cmd_get_ioctl_version((void *)
8358 + case EVMS_GET_VERSION:
8359 + rc = evms_ioctl_cmd_get_version((void *) arg);
8361 + case EVMS_GET_INFO_LEVEL:
8362 + rc = evms_ioctl_cmd_get_info_level((void *)
8365 + case EVMS_SET_INFO_LEVEL:
8366 + rc = evms_ioctl_cmd_set_info_level((void *)
8369 + case EVMS_REDISCOVER_VOLUMES:
8370 + rc = evms_ioctl_cmd_rediscover_volumes(inode,
8375 + case EVMS_GET_LOGICAL_DISK:
8376 + rc = evms_ioctl_cmd_get_logical_disk((void *)
8379 + case EVMS_GET_LOGICAL_DISK_INFO:
8380 + rc = evms_ioctl_cmd_get_logical_disk_info((void
8384 + case EVMS_SECTOR_IO:
8385 + rc = evms_ioctl_cmd_sector_io((void *) arg);
8387 + case EVMS_GET_MINOR:
8388 + rc = evms_ioctl_cmd_get_minor((void *) arg);
8390 + case EVMS_GET_VOLUME_DATA:
8391 + rc = evms_ioctl_cmd_get_volume_data((void *)
8394 + case EVMS_DELETE_VOLUME:
8395 + rc = evms_ioctl_cmd_delete_volume(inode, file,
8398 + case EVMS_GET_PLUGIN:
8399 + rc = evms_ioctl_cmd_get_plugin((void *) arg);
8401 + case EVMS_PLUGIN_IOCTL:
8402 + rc = evms_ioctl_cmd_plugin_ioctl(inode, file,
8405 + case EVMS_COMPUTE_CSUM:
8406 + rc = evms_ioctl_cmd_kernel_partial_csum((void *)
8409 + case EVMS_PROCESS_NOTIFY_EVENT:
8410 + rc = evms_ioctl_cmd_process_notify_event(arg);
8412 + case EVMS_CHECK_MOUNT_STATUS:
8413 + rc = evms_ioctl_cmd_check_mount_status(inode,
8417 + case EVMS_CHECK_OPEN_STATUS:
8418 + rc = evms_ioctl_cmd_check_open_status(inode,
8427 + /* process Volume specific commands */
8429 + /* pick up standard blk ioctls */
8435 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
8440 + rc = blk_ioctl(inode->i_rdev, cmd, arg);
8442 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
8445 + /* casting size down to 32-bits until
8446 + * kernel allows return of 64-bit size
8449 + long size = node->total_vsectors;
8451 + ((long *) arg, &size,
8456 + case BLKGETSIZE64:
8458 + u64 size_in_bytes =
8461 + EVMS_VSECTOR_SIZE_SHIFT;
8463 + ((u64 *) arg, &size_in_bytes,
8469 + case EVMS_GET_IOCTL_VERSION:
8470 + rc = evms_ioctl_cmd_get_ioctl_version((void *)
8473 + case EVMS_GET_BMAP:
8474 + rc = evms_ioctl_cmd_get_bmap(inode, file, cmd,
8477 + case EVMS_GET_VOL_STRIPE_INFO:
8479 + struct evms_vol_stripe_info_pkt info;
8483 + EVMS_VSECTOR_SIZE_SHIFT;
8486 + ((struct evms_vol_stripe_info_pkt *)
8487 + arg, &info, sizeof (info)))
8493 + rc = IOCTL(node, inode, file, cmd, arg);
8501 +/************************************************/
8502 +/* END -- IOCTL main */
8503 +/************************************************/
8505 +/************************************************/
8506 +/* START -- CHECK MEDIA CHANGE */
8507 +/************************************************/
8510 +evms_check_media_change(kdev_t dev)
8513 + struct evms_logical_volume *volume = NULL;
8515 + /* check user access */
8516 + if (!capable(CAP_SYS_ADMIN))
8520 + /* get the minor */
8521 + minor = MINOR(dev);
8522 + /* insure this minor points to a valid volume */
8523 + volume = &evms_logical_volumes[minor];
8524 + if (volume->node == NULL) {
8529 + if (volume->flags & EVMS_DEVICE_REMOVABLE) {
8530 + /* check for media change */
8531 + rc = evms_cs_kernel_ioctl(volume->node,
8532 + EVMS_CHECK_MEDIA_CHANGE,
8533 + (unsigned long) NULL);
8536 + ("error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n",
8537 + rc, volume->name);
8544 +/************************************************/
8545 +/* END -- CHECK MEDIA CHANGE */
8546 +/************************************************/
8549 +evms_check_for_device_changes(struct inode *inode, struct file *file)
8551 + int rc = 0, something_changed = 0, i;
8552 + struct evms_rediscover_pkt kernel_rd_pckt = { 0, 0, NULL };
8553 + struct evms_list_node *disk_list = NULL, *lnode, *next_lnode;
8554 + struct evms_logical_node *disk, *new_device_list = NULL;
8555 + struct evms_logical_volume *volume = NULL;
8557 + /* check for new devices
8559 + * put all new devices on the disk list so they
8560 + * will be included in the rediscovery process.
8562 + static void evms_discover_logical_disks(struct evms_logical_node **);
8563 + evms_discover_logical_disks(&new_device_list);
8564 + if (new_device_list) {
8565 + LOG_DETAILS("%s: new devices detected.\n", __FUNCTION__);
8566 + something_changed++;
8567 + /* put these new nodes on the disk list */
8568 + while (new_device_list) {
8569 + disk = new_device_list;
8570 + rc = evms_cs_remove_logical_node_from_list
8571 + (&new_device_list, disk);
8574 + ("%s: error(%d) removing device(%s) from list.\n",
8575 + __FUNCTION__, rc, disk->name);
8577 + rc = evms_cs_add_item_to_list(&disk_list, disk);
8580 + ("%s: error(%d) adding device(%s) from list.\n",
8581 + __FUNCTION__, rc, disk->name);
8586 + /* check all devices for changed removable media
8588 + * scan the global device list and issue check
8589 + * media change on each removable media device.
8590 + * put all removable devices that indicate a
8591 + * media change on the disk list.
8593 + * also scan for devices that have been unplugged
8594 + * or contain corrupt volumes.
8596 + for (lnode = evms_global_device_list; lnode; lnode = lnode->next) {
8597 + int add_to_list = FALSE;
8598 + disk = (struct evms_logical_node *) lnode->item;
8599 + /* only really check removable media devices */
8600 + if (disk->flags & EVMS_DEVICE_REMOVABLE) {
8601 + /* check for media change */
8602 + rc = evms_cs_kernel_ioctl(disk,
8603 + EVMS_CHECK_MEDIA_CHANGE,
8604 + (unsigned long) NULL);
8607 + ("%s: error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n",
8608 + __FUNCTION__, rc, disk->name);
8609 + } else if (rc == 1) {
8610 + add_to_list = TRUE;
8613 + /* check for device that where present
8614 + * before but are gone (unplugged
8615 + * device or unloaded driver).
8617 + rc = IOCTL(disk, inode, file,
8618 + EVMS_CHECK_DEVICE_STATUS, (ulong) NULL);
8621 + ("error(%d) doing EVMS_CHECK_DEVICE_STATUS ioctl on '%s'.\n",
8622 + rc, volume->name);
8624 + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
8625 + add_to_list = TRUE;
8627 + if (add_to_list) {
8628 + something_changed++;
8629 + rc = evms_cs_add_item_to_list(&disk_list, disk);
8632 + /* log a statement that we detected changed media.
8635 + LOG_DETAILS("%s: media change detected.\n", __FUNCTION__);
8638 + /* check for volumes with removed removable media.
8639 + * mark the volumes that reside on changed media.
8641 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8642 + volume = &evms_logical_volumes[i];
8643 + if (!volume->node)
8645 + if (!(volume->flags & EVMS_DEVICE_REMOVABLE))
8647 + if (evms_check_media_change(MKDEV(EVMS_MAJOR, i)) <= 0)
8649 + /* remember which volumes have changed media */
8650 + volume->flags |= EVMS_MEDIA_CHANGED;
8651 + something_changed++;
8654 + /* check for removed devices */
8655 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8657 + volume = &evms_logical_volumes[i];
8658 + if (!volume->node)
8660 + /* check for device status */
8662 + rc = IOCTL(volume->node, inode, file,
8663 + EVMS_CHECK_DEVICE_STATUS, (ulong) & status);
8666 + ("error(%d) doing EVMS_CHECK_DEVICE_STATUS ioctl on '%s'.\n",
8667 + rc, volume->name);
8670 + if (!(status & EVMS_DEVICE_UNAVAILABLE)) {
8673 + /* remember which volumes have changed media */
8674 + volume->flags |= EVMS_DEVICE_UNPLUGGED;
8675 + something_changed++;
8678 + /* do we have some work to do? */
8679 + if (something_changed) {
8680 + /* check for volumes to be deleted */
8681 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8682 + struct evms_quiesce_vol_pkt qv;
8684 + volume = &evms_logical_volumes[i];
8685 + if (!volume->node)
8687 + /* only proceed on volumes with:
8689 + * hot-unplugged devices,
8690 + * & partial volumes
8692 + if (!(volume->flags &
8693 + (EVMS_MEDIA_CHANGED |
8694 + EVMS_VOLUME_PARTIAL | EVMS_DEVICE_UNPLUGGED)))
8696 + /* gather the disk's needing to be
8697 + * rediscovered to rebuild this
8700 + * this will locate other disks that
8701 + * the volume resides on that don't
8702 + * indicate media change.
8704 + rc = evms_cs_kernel_ioctl(volume->node,
8705 + EVMS_GET_DISK_LIST,
8706 + (unsigned long) &disk_list);
8709 + ("%s: error(%d) retrieving underlying disk list for '%s', skipping ...\n",
8710 + __FUNCTION__, rc, volume->name);
8713 + /* quiesce all the changed volumes
8714 + * prior to being deleted.
8716 + qv.command = 1; // quiesce
8718 + qv.status = 0; // reset status
8720 + rc = evms_quiesce_volume(volume, inode, file, &qv);
8723 + ("%s: error(%d) attempting to quiesce '%s%s'.\n",
8724 + __FUNCTION__, rc, EVMS_DEV_NODE_PATH,
8729 + /* we need to revalidate all the changed
8730 + * media. this is accomplished by issuing
8731 + * the revalidate disk ioctl to each device
8732 + * with changed media. the device manager
8733 + * remembers which devices indicated
8734 + * media changed (set by check media
8735 + * changed ioctl issued earlier), and will
8736 + * only issue the revalidate disk ioctl to
8737 + * those disks one time.
8740 + * this needs to be done BEFORE deleting
8741 + * the volumes because deleting the
8742 + * last segment on disk will cause the
8743 + * associated disk node to freed, and we
8744 + * will not be able to issue the
8745 + * revalidate disk ioctl after that.
8747 + for (lnode = disk_list; lnode; lnode = lnode->next) {
8748 + disk = (struct evms_logical_node *) lnode->item;
8749 + /* only really do removable media devices */
8750 + if (disk->flags & EVMS_MEDIA_CHANGED) {
8751 + /* go revalidate the change media */
8752 + rc = evms_cs_kernel_ioctl(disk,
8753 + EVMS_REVALIDATE_DISK,
8754 + (unsigned long) NULL);
8757 + ("%s: error(%d) attempting to revalidate '%s%s'.\n",
8759 + EVMS_DEV_NODE_PATH, volume->name);
8764 + /* delete all the affected volumes */
8765 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8766 + struct evms_delete_vol_pkt dv;
8768 + volume = &evms_logical_volumes[i];
8769 + if (!volume->node)
8771 + /* only proceed on volumes with:
8773 + * hot-unplugged devices,
8774 + * & partial volumes
8776 + if (!(volume->flags &
8777 + (EVMS_MEDIA_CHANGED |
8778 + EVMS_VOLUME_PARTIAL | EVMS_DEVICE_UNPLUGGED)))
8780 + /* only delete quiesced volumes */
8781 + if (!volume->quiesced)
8783 + /* delete the volume from memory.
8784 + * do a 'soft' delete if volume
8785 + * is mounted, and 'hard' delete
8788 + * NOTE: the delete operation will
8789 + * clear the bits in the flags field.
8791 + dv.command = is_open(i);
8794 + rc = evms_delete_volume(volume, &dv);
8797 + /* at this point all devices indicating
8798 + * media change that had volumes on them
8799 + * should be gone. however, we could still
8800 + * have devices indicating media change
8801 + * that had no volumes on them in the disk
8802 + * list. we need to delete these devices
8803 + * from kernel memory and the global device
8806 + for (lnode = evms_global_device_list; lnode; lnode = next_lnode) {
8807 + next_lnode = lnode->next;
8809 + disk = (struct evms_logical_node *) lnode->item;
8810 + if (disk->flags & EVMS_MEDIA_CHANGED) {
8811 + rc = DELETE(disk);
8815 + /* all the devices that indicated media
8816 + * change should be gone, both from kernel
8817 + * memory and global device list. we now
8818 + * need to remove any references to these
8819 + * devices from the disk list.
8821 + * when removable media is installed, it
8822 + * will get detected in the device manager's
8823 + * rediscovery as a new device and added to
8824 + * the discover list.
8826 + for (lnode = disk_list; lnode; lnode = next_lnode) {
8827 + struct evms_list_node *glnode;
8828 + int lnode_still_there;
8830 + next_lnode = lnode->next;
8832 + lnode_still_there = FALSE;
8833 + for (glnode = evms_global_device_list;
8834 + glnode; glnode = glnode->next) {
8835 + if (glnode->item == lnode->item) {
8836 + lnode_still_there = TRUE;
8840 + if (lnode_still_there == FALSE) {
8841 + rc = evms_cs_remove_item_from_list(&disk_list,
8845 + ("%s: error(%d) attempting to remove item(%p) from disk_list(%p).\n",
8846 + __FUNCTION__, rc, lnode->item,
8852 + /* build the in-kernel rediscover packet */
8854 + /* allocate the space for the drive_array in
8855 + * the struct evms_rediscover_pkt packet. to do this
8856 + * we need to count the number of disk nodes,
8857 + * then allocate the necessary space.
8859 + /* count the disk nodes */
8860 + for (lnode = disk_list; lnode; lnode = lnode->next)
8861 + kernel_rd_pckt.drive_count++;
8862 + /* allocate the space */
8863 + if (kernel_rd_pckt.drive_count) {
8864 + kernel_rd_pckt.drive_array =
8865 + kmalloc(kernel_rd_pckt.drive_count *
8866 + sizeof (u64), GFP_KERNEL);
8867 + if (!kernel_rd_pckt.drive_array) {
8870 + ("%s: error(%d) allocating rediscover drive array.\n",
8871 + __FUNCTION__, rc);
8874 + /* populate the drive array
8876 + * this also frees the disk_list which is useful
8877 + * if we had an error allocating the drive array.
8879 + for (i = 0, lnode = disk_list; lnode; lnode = next_lnode, i++) {
8880 + next_lnode = lnode->next;
8882 + /* remove this disk from the disk list */
8883 + disk = (struct evms_logical_node *) lnode->item;
8884 + rc = evms_cs_remove_item_from_list(&disk_list, disk);
8886 + /* add this disk to rediscover
8889 + kernel_rd_pckt.drive_array[i] =
8890 + NODE_TO_DEV_HANDLE(disk);
8893 + /* perform the rediscovery operation */
8895 + static int evms_discover_volumes(struct
8896 + evms_rediscover_pkt *);
8897 + rc = evms_discover_volumes(&kernel_rd_pckt);
8898 + if (kernel_rd_pckt.drive_count) {
8899 + kfree(kernel_rd_pckt.drive_array);
8902 + LOG_DETAILS("%s: rediscover completed.\n", __FUNCTION__);
8908 +/************************************************/
8909 +/* START -- REVALIDATE DISK */
8910 +/************************************************/
8913 +evms_revalidate_disk(kdev_t dev)
8916 + struct evms_logical_volume *volume = NULL;
8918 + /* check user access */
8919 + if (!capable(CAP_SYS_ADMIN))
8923 + /* get the minor */
8924 + minor = MINOR(dev);
8925 + /* insure this minor points to a valid volume */
8926 + volume = &evms_logical_volumes[minor];
8927 + if (volume->node == NULL) {
8932 + /* go revalidate the change media */
8933 + rc = evms_cs_kernel_ioctl(volume->node,
8934 + EVMS_REVALIDATE_DISK,
8935 + (unsigned long) NULL);
8940 +/************************************************/
8941 +/* END -- REVALIDATE DISK */
8942 +/************************************************/
8944 +/************************************************/
8945 +/* START -- OPEN */
8946 +/************************************************/
8949 +evms_open(struct inode *inode, struct file *file)
8951 + int rc = 0, minor = 0;
8952 + struct evms_logical_volume *volume = NULL;
8954 + /* check user access */
8955 + if (!capable(CAP_SYS_ADMIN))
8961 + rc = evms_check_for_device_changes(inode, file);
8963 + /* get the minor */
8964 + minor = MINOR(inode->i_rdev);
8966 + /* insure this minor points to a valid volume */
8967 + volume = &evms_logical_volumes[minor];
8968 + if (volume->node == NULL) {
8973 + /* go "open" the volume */
8974 + if (!rc && minor) {
8975 + atomic_inc(&volume->opens);
8976 + rc = IOCTL(volume->node, inode, file,
8977 + EVMS_OPEN_VOLUME, (unsigned long) NULL);
8980 + ("error(%d) doing EVMS_OPEN_VOLUME ioctl to '%s'.\n",
8981 + rc, volume->name);
8982 + atomic_dec(&volume->opens);
8988 +/************************************************/
8990 +/************************************************/
8992 +/************************************************/
8993 +/* START -- RELEASE */
8994 +/************************************************/
8997 +evms_release(struct inode *inode, struct file *file)
8999 + int rc = 0, minor = 0;
9000 + struct evms_logical_volume *volume = NULL;
9005 + /* get the minor */
9006 + minor = MINOR(inode->i_rdev);
9008 + /* insure this minor points to a valid volume */
9009 + volume = &evms_logical_volumes[minor];
9010 + if (volume->node == NULL) {
9015 + /* go "close" the volume */
9016 + if (!rc && minor) {
9017 + rc = IOCTL(volume->node, inode, file,
9018 + EVMS_CLOSE_VOLUME, (unsigned long) NULL);
9021 + ("error(%d) doing EVMS_CLOSE_VOLUME ioctl to '%s'.\n",
9022 + rc, volume->name);
9024 + atomic_dec(&volume->opens);
9030 +/************************************************/
9031 +/* END -- RELEASE */
9032 +/************************************************/
9034 +static struct block_device_operations evms_fops = {
9035 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,14)
9036 + owner:THIS_MODULE,
9039 + release:evms_release,
9041 + check_media_change:evms_check_media_change,
9042 + revalidate:evms_revalidate_disk
9045 +/**********************************************************/
9046 +/* END -- FOPS functions definitions */
9047 +/**********************************************************/
9049 +/**********************************************************/
9050 +/* START -- RUNTIME support functions */
9051 +/**********************************************************/
9054 +evms_do_request_fn(request_queue_t * q)
9056 + LOG_WARNING("This function should not be called.\n");
9060 +static request_queue_t *
9061 +evms_find_queue(kdev_t dev)
9063 + request_queue_t *rq = NULL;
9064 + struct evms_logical_volume *volume;
9066 + volume = &evms_logical_volumes[MINOR(dev)];
9068 + rq = &volume->request_queue;
9074 + * Function: evms_make_request_fn
9078 +evms_make_request_fn(request_queue_t * q, int rw, struct buffer_head *bh)
9080 + struct evms_logical_volume *volume;
9082 + volume = &evms_logical_volumes[MINOR(bh->b_rdev)];
9083 + wait_event(volume->wait_queue, (!volume->quiesced));
9084 + if (volume->node) {
9088 + atomic_inc(&volume->requests_in_progress);
9089 + R_IO(volume->node, bh);
9090 + atomic_dec(&volume->requests_in_progress);
9093 + atomic_inc(&volume->requests_in_progress);
9094 + W_IO(volume->node, bh);
9095 + atomic_dec(&volume->requests_in_progress);
9098 + buffer_IO_error(bh);
9102 + LOG_ERROR("request for unknown logical volume [minor(%d)].\n",
9103 + MINOR(bh->b_rdev));
9104 + buffer_IO_error(bh);
9109 +/**********************************************************/
9110 +/* END -- RUNTIME support functions */
9111 +/**********************************************************/
9113 +/**********************************************************/
9114 +/* START -- INIT/DISCOVERY support functions */
9115 +/**********************************************************/
9119 +display_discover_list(struct evms_logical_node *discover_list, char *text)
9121 + struct evms_logical_node *node;
9123 + LOG_DETAILS("discover list:(%s)\n", text);
9124 + for (node = discover_list; node; node = node->next) {
9125 + LOG_DETAILS("\nnode info:\n");
9126 + LOG_DETAILS("node.....................(0x%p)\n", node);
9127 + LOG_DETAILS("name.....................(%s)\n", node->name);
9128 + LOG_DETAILS("plugin id................(0x%x)\n",
9129 + node->plugin->id);
9130 + LOG_DETAILS("size.....................("PFU64")\n",
9131 + node->total_vsectors);
9132 + LOG_DETAILS("flags....................(0x%x)\n", node->flags);
9133 + LOG_DETAILS("iflags...................(0x%x)\n", node->iflags);
9134 + LOG_DETAILS("sector size..............(%d)\n",
9135 + node->hardsector_size);
9136 + LOG_DETAILS("block size...............(%d)\n",
9137 + node->block_size);
9138 + LOG_DETAILS("sys id...................(0x%x)\n",
9141 + if (node->feature_header) {
9142 + struct evms_feature_header *fh;
9144 + fh = node->feature_header;
9145 + LOG_DETAILS("\nfeature header:\n");
9146 + LOG_DETAILS("signature................(0x%x)\n",
9148 + LOG_DETAILS("crc......................(0x%x)\n",
9150 + LOG_DETAILS("feature header version...(%d.%d.%d)\n",
9151 + fh->version.major, fh->version.minor,
9152 + fh->version.patchlevel);
9153 + LOG_DETAILS("engine version...........(%d.%d.%d)\n",
9154 + fh->engine_version.major,
9155 + fh->engine_version.minor,
9156 + fh->engine_version.patchlevel);
9157 + LOG_DETAILS("flags....................(0x%x)\n",
9159 + LOG_DETAILS("feature id...............(0x%x)\n",
9161 + LOG_DETAILS("sequence#................("PFU64")\n",
9162 + fh->sequence_number);
9163 + LOG_DETAILS("alignment padding........("PFU64")\n",
9164 + fh->alignment_padding);
9165 + LOG_DETAILS("feature data1 lsn........("PFU64")\n",
9166 + fh->feature_data1_start_lsn);
9167 + LOG_DETAILS("feature data1 size.......("PFU64")\n",
9168 + fh->feature_data1_size);
9169 + LOG_DETAILS("feature data2 lsn........("PFU64")\n",
9170 + fh->feature_data2_start_lsn);
9171 + LOG_DETAILS("feature data2 size.......("PFU64")\n",
9172 + fh->feature_data2_size);
9173 + LOG_DETAILS("volume sn................("PFU64")\n",
9174 + fh->volume_serial_number);
9175 + LOG_DETAILS("volume minor#............(%d)\n",
9176 + fh->volume_system_id);
9177 + LOG_DETAILS("object depth.............(%d)\n",
9178 + fh->object_depth);
9179 + LOG_DETAILS("object name..............(%s)\n",
9181 + LOG_DETAILS("volume name..............(%s)\n",
9185 + if (node->volume_info) {
9186 + struct evms_volume_info *vi;
9188 + vi = node->volume_info;
9189 + LOG_DETAILS("\nvolume info:\n");
9190 + LOG_DETAILS("volume name..............(%s)\n",
9192 + LOG_DETAILS("volume sn................("PFU64")\n",
9194 + LOG_DETAILS("volume minor#............(%d)\n",
9195 + vi->volume_minor);
9198 + if (discover_list) {
9199 + LOG_DETAILS("\n");
9205 + * Function: evms_discover_logical_disks
9206 + * Description: Construct the logical disk list by calling all registered device managers.
9209 +evms_discover_logical_disks(struct evms_logical_node **disk_list)
9211 + struct evms_registered_plugin *p;
9212 + LOG_EXTRA("discovering logical disks...\n");
9213 + for (p = registered_plugin_head; p; p = p->next) {
9214 + if (GetPluginType(p->plugin->id) == EVMS_DEVICE_MANAGER) {
9215 + DISCOVER(p, disk_list);
9221 + * Function: evms_discover_logical_partitions
9222 + * Description: Construct the logical partition list by calling all registered partition managers.
9225 +evms_discover_logical_partitions(struct evms_logical_node **discover_list)
9229 + struct evms_registered_plugin *p;
9230 + LOG_EXTRA("discovering logical partitions...\n");
9233 + for (p = registered_plugin_head; p; p = p->next) {
9234 + if (GetPluginType(p->plugin->id) ==
9235 + EVMS_SEGMENT_MANAGER) {
9236 + rc = DISCOVER(p, discover_list);
9237 + /* RC > 0 means the plugin
9238 + * added something to the
9239 + * discover list. This also
9240 + * means we must loop thru
9241 + * these plugins another time.
9242 + * RC == 0 means nothing was
9243 + * added to the discover list
9245 + * RC < 0 means the plugin
9246 + * encountered some error and
9247 + * nothing was added to the list.
9248 + * NOTE: If a plugin has both
9249 + * added something new to the
9250 + * discover list and encountered
9251 + * an error, RC > 0 must be
9258 + } while (done == FALSE);
9260 + /* send the end of discovery signal to each
9261 + * partition manager plugin.
9263 + for (p = registered_plugin_head; p; p = p->next)
9264 + if (GetPluginType(p->plugin->id) == EVMS_SEGMENT_MANAGER)
9265 + if (p->plugin->fops->end_discover)
9266 + rc = END_DISCOVER(p, discover_list);
9270 + * Function: evms_discover_volume_groups
9271 + * Description: Find volume groups within the logical partitions list
9274 +evms_discover_volume_groups(struct evms_logical_node **discover_list)
9278 + struct evms_registered_plugin *p;
9279 + LOG_EXTRA("discovering logical volume groups...\n");
9282 + for (p = registered_plugin_head; p; p = p->next) {
9283 + if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER) {
9284 + rc = DISCOVER(p, discover_list);
9285 + /* RC > 0 means the plugin
9286 + * added something to the
9287 + * discover list. This also
9288 + * means we must loop thru
9289 + * these plugins another time.
9290 + * RC == 0 means nothing was
9291 + * added to the discover list
9293 + * RC < 0 means the plugin
9294 + * encountered some error and
9295 + * nothing was added to the list.
9296 + * NOTE: If a plugin has both
9297 + * added something new to the
9298 + * discover list and encountered
9299 + * an error, RC > 0 must be
9306 + } while (done == FALSE);
9308 + /* send the end of discovery signal to each volume
9311 + for (p = registered_plugin_head; p; p = p->next)
9312 + if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER)
9313 + if (p->plugin->fops->end_discover)
9314 + rc = END_DISCOVER(p, discover_list);
9319 + * convert all the feature header fields into cpu native format
9320 + * from the on-disk Little Endian format. From this point forward
9321 + * all plugins can deal with feature headers natively.
9324 +le_feature_header_to_cpu(struct evms_feature_header *fh)
9326 + fh->signature = le32_to_cpup(&fh->signature);
9327 + fh->crc = le32_to_cpup(&fh->crc);
9328 + fh->version.major = le32_to_cpup(&fh->version.major);
9329 + fh->version.minor = le32_to_cpup(&fh->version.minor);
9330 + fh->version.patchlevel = le32_to_cpup(&fh->version.patchlevel);
9331 + fh->engine_version.major = le32_to_cpup(&fh->engine_version.major);
9332 + fh->engine_version.minor = le32_to_cpup(&fh->engine_version.minor);
9333 + fh->engine_version.patchlevel =
9334 + le32_to_cpup(&fh->engine_version.patchlevel);
9335 + fh->flags = le32_to_cpup(&fh->flags);
9336 + fh->feature_id = le32_to_cpup(&fh->feature_id);
9337 + fh->sequence_number = le64_to_cpup(&fh->sequence_number);
9338 + fh->alignment_padding = le64_to_cpup(&fh->alignment_padding);
9339 + fh->feature_data1_start_lsn =
9340 + le64_to_cpup(&fh->feature_data1_start_lsn);
9341 + fh->feature_data1_size = le64_to_cpup(&fh->feature_data1_size);
9342 + fh->feature_data2_start_lsn =
9343 + le64_to_cpup(&fh->feature_data2_start_lsn);
9344 + fh->feature_data2_size = le64_to_cpup(&fh->feature_data2_size);
9345 + fh->volume_serial_number = le64_to_cpup(&fh->volume_serial_number);
9346 + fh->volume_system_id = le32_to_cpup(&fh->volume_system_id);
9347 + fh->object_depth = le32_to_cpup(&fh->object_depth);
9351 +edef_load_feature_header(struct evms_logical_node *node)
9353 + int i, rc = 0, rc_array[2] = { 0, 0 };
9354 + unsigned long size_in_bytes;
9355 + u64 size_in_sectors, starting_sector = 0;
9356 + struct evms_feature_header *fh = NULL, *fh1 = NULL, *fh2 = NULL;
9357 + char *location_name = NULL;
9358 + struct evms_version version = {
9359 + EVMS_FEATURE_HEADER_MAJOR,
9360 + EVMS_FEATURE_HEADER_MINOR,
9361 + EVMS_FEATURE_HEADER_PATCHLEVEL
9364 + if (!node->feature_header) {
9365 + size_in_sectors = evms_cs_size_in_vsectors(sizeof (*fh));
9366 + size_in_bytes = size_in_sectors << EVMS_VSECTOR_SIZE_SHIFT;
9367 + fh1 = kmalloc(size_in_bytes, GFP_KERNEL);
9369 + fh2 = kmalloc(size_in_bytes, GFP_KERNEL);
9378 + for (i = 0; i < 2; i++) {
9381 + node->total_vsectors - size_in_sectors;
9383 + location_name = evms_primary_string;
9385 + starting_sector--;
9387 + location_name = evms_secondary_string;
9389 + /* read header into buffer */
9390 + rc = INIT_IO(node,
9391 + 0, starting_sector, size_in_sectors, fh);
9394 + ("error(%d) probing for %s feature header(at "PFU64") on '%s'.\n",
9395 + rc, location_name, starting_sector,
9400 + /* validate header signature */
9401 + if (cpu_to_le32(fh->signature) !=
9402 + EVMS_FEATURE_HEADER_SIGNATURE) {
9407 + /* validate header CRC */
9408 + if (fh->crc != EVMS_MAGIC_CRC) {
9409 + u32 org_crc, final_crc;
9410 + org_crc = cpu_to_le32(fh->crc);
9413 + evms_cs_calculate_crc(EVMS_INITIAL_CRC, fh,
9415 + if (final_crc != org_crc) {
9417 + ("CRC mismatch error [stored(%x), computed(%x)] in %s feature header(at "PFU64") on '%s'.\n",
9418 + org_crc, final_crc, location_name,
9419 + starting_sector, node->name);
9426 + ("CRC disabled in %s feature header(at "PFU64") on '%s'.\n",
9427 + location_name, starting_sector,
9430 + /* convert the feature header from the
9431 + * on-disk format (Little Endian) to
9432 + * native cpu format.
9434 + le_feature_header_to_cpu(fh);
9435 + /* verify the system data version */
9436 + rc = evms_cs_check_version(&version, &fh->version);
9439 + ("error: obsolete version(%d,%d,%d) in %s feature header on '%s'.\n",
9440 + fh->version.major, fh->version.minor,
9441 + fh->version.patchlevel, location_name,
9447 + /* getting same return code for both copies? */
9448 + if (rc_array[0] == rc_array[1]) {
9450 + /* if no errors on both copies,
9451 + * check the sequence numbers.
9452 + * use the highest sequence number.
9455 + /* compare sequence numbers */
9456 + if (fh1->sequence_number ==
9457 + fh2->sequence_number) {
9461 + ("%s feature header sequence number("PFU64") mismatches %s feature header sequence number("PFU64") on '%s'!\n",
9462 + evms_primary_string,
9463 + fh1->sequence_number,
9464 + evms_secondary_string,
9465 + fh2->sequence_number, node->name);
9466 + if (fh1->sequence_number >
9467 + fh2->sequence_number) {
9470 + evms_primary_string;
9471 + /* indicate bad sequence number of secondary */
9476 + evms_secondary_string;
9477 + /* indicate bad sequence number of primary */
9482 + /* getting different return codes for each copy */
9484 + /* either primary or secondary copy is
9485 + * valid, so use the valid copy.
9487 + if ((rc_array[0] == 0) || (rc_array[1] == 0)) {
9488 + char *warn_name = NULL;
9490 + /* indicate success */
9492 + /* set variables based on which copy is valid */
9493 + if (rc_array[0] == 0) {
9494 + /* use primary (rear) copy if its good */
9496 + location_name = evms_primary_string;
9497 + warn_name = evms_secondary_string;
9499 + /* use secondary (front) copy if its good */
9501 + location_name = evms_secondary_string;
9502 + warn_name = evms_primary_string;
9504 + /* warn the user about the invalid copy */
9506 + ("warning: error(%d) probing/verifying the %s feature header on '%s'.\n",
9507 + rc_array[0] + rc_array[1], warn_name, node->name);
9509 + /* both copies had a different error,
9510 + * and one was a fatal error, so
9511 + * indicate fatal error.
9513 + if ((rc_array[0] == -EINVAL) || (rc_array[1] == -EINVAL)) {
9517 + /* on error, set fh to NULL */
9521 + /* deallocate metadata buffers appropriately */
9527 + /* save validated feature header pointer */
9529 + node->feature_header = fh;
9530 + if (rc_array[0] != rc_array[1]) {
9532 + ("using %s feature header on '%s'.\n",
9533 + location_name, node->name);
9537 + /* if no signature found, adjust return code */
9538 + if (rc == -ENODATA) {
9540 + LOG_DEBUG("no feature header found on '%s'.\n",
9548 +edef_find_first_features(struct evms_logical_node **discover_list)
9551 + struct evms_logical_node *node, *tmp_list_head;
9553 + tmp_list_head = *discover_list;
9554 + *discover_list = NULL;
9556 + while (tmp_list_head) {
9557 + struct evms_list_node **evms_node;
9559 + node = tmp_list_head;
9560 + rc = evms_cs_remove_logical_node_from_list(&tmp_list_head,
9565 + /* check for duplicate pointers
9566 + * search for the node in global list
9569 + evms_cs_lookup_item_in_list(&evms_global_feature_node_list,
9571 + /* already present? */
9573 + /* yes, already present */
9574 + rc = -ENODATA; /* dont process this node further */
9575 + LOG_DETAILS("deleting duplicate reference to '%s'.\n",
9577 + /* forget this node */
9580 + /* load the feature header if present */
9581 + rc = edef_load_feature_header(node);
9582 + /* This node have a feature header ?
9583 + * it won't be if there is no header to load
9585 + * there was a fatal error attempting to read it.
9587 + if (node->feature_header) {
9588 + /* check for object flag */
9589 + if (node->feature_header->flags &
9590 + EVMS_VOLUME_DATA_OBJECT) {
9592 + ("object detected, deleting '%s'.\n",
9596 + /* check for stop-data flag */
9597 + if (node->feature_header->flags &
9598 + EVMS_VOLUME_DATA_STOP) {
9600 + ("stop data detected, deleting '%s'.\n",
9604 + /* we have a valid feature header.
9605 + * initialize appropriate node fields
9606 + * to indicate this.
9608 + node->flags |= EVMS_VOLUME_FLAG;
9609 + node->iflags |= EVMS_FEATURE_BOTTOM;
9610 + node->volume_info =
9612 + (struct evms_volume_info),
9614 + if (node->volume_info) {
9618 + memset(node->volume_info, 0,
9621 + evms_volume_info));
9622 + node->volume_info->volume_sn =
9623 + node->feature_header->
9624 + volume_serial_number;
9625 + node->volume_info->
9627 + node->feature_header->
9629 + strcpy(node->volume_info->
9631 + node->feature_header->
9633 + /* register(add) node to
9634 + * the global list.
9636 + rc = evms_cs_add_item_to_list
9637 + (&evms_global_feature_node_list,
9645 + /* if any errors, delete the node */
9651 + /* on successful processing of this node
9652 + * place it back on the discover list.
9654 + evms_cs_add_logical_node_to_list(discover_list, node);
9659 +/* These define describe the node types that can be isolated. */
9660 +#define ISOLATE_ASSOCIATIVE_FEATURES 0
9661 +#define ISOLATE_COMPATIBILITY_VOLUMES 1
9662 +#define ISOLATE_EVMS_VOLUMES 2
9663 +#define ISOLATE_EVMS_VOLUME_SERIAL_NUMBER 3
9664 +#define ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH 4
9666 +edef_isolate_nodes_by_type(unsigned int type,
9667 + struct evms_logical_node **src_list,
9668 + struct evms_logical_node **trg_list,
9669 + u32 compare32, u64 compare64)
9671 + struct evms_logical_node *node, *next_node;
9672 + int rc = 0, found_node;
9673 + struct evms_feature_header *fh = NULL;
9675 + for (node = *src_list; node; node = next_node) {
9676 + next_node = node->next;
9678 + if (node->feature_header)
9679 + fh = node->feature_header;
9680 + found_node = FALSE;
9682 + case ISOLATE_ASSOCIATIVE_FEATURES:
9684 + if (GetPluginType(fh->feature_id) ==
9685 + EVMS_ASSOCIATIVE_FEATURE)
9686 + found_node = TRUE;
9689 + case ISOLATE_COMPATIBILITY_VOLUMES:
9690 + if (!(node->flags & EVMS_VOLUME_FLAG))
9691 + found_node = TRUE;
9693 + case ISOLATE_EVMS_VOLUMES:
9694 + if (node->flags & EVMS_VOLUME_FLAG)
9695 + found_node = TRUE;
9697 + /* EVMS volumes with same serial # */
9698 + case ISOLATE_EVMS_VOLUME_SERIAL_NUMBER:
9699 + if (node->volume_info->volume_sn == compare64)
9700 + found_node = TRUE;
9702 + case ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH:
9704 + if (fh->object_depth == compare64)
9705 + if (fh->feature_id == compare32)
9706 + found_node = TRUE;
9709 + if (found_node == TRUE) {
9710 + rc = evms_cs_remove_logical_node_from_list(src_list,
9714 + rc = evms_cs_add_logical_node_to_list(trg_list, node);
9723 +edef_apply_feature(struct evms_logical_node *node,
9724 + struct evms_logical_node **volume_node_list)
9726 + struct evms_registered_plugin *p;
9729 + for (p = registered_plugin_head; p; p = p->next) {
9730 + if (p->plugin->id == node->feature_header->feature_id) {
9731 + rc = DISCOVER(p, volume_node_list);
9739 +edef_get_feature_plugin_header(u32 id, struct evms_plugin_header **header)
9742 + struct evms_registered_plugin *p;
9744 + for (p = registered_plugin_head; p; p = p->next) {
9745 + if (p->plugin->id == id) {
9746 + *header = p->plugin;
9752 + LOG_SERIOUS("no plugin loaded for feature id(0x%x)\n", id);
9757 +typedef struct evms_volume_build_info_s {
9759 + int feature_header_count;
9760 + int feature_count;
9761 + int associative_feature_count;
9763 + struct evms_plugin_header *plugin;
9764 + struct evms_logical_node *feature_node_list;
9765 +} evms_volume_build_info_t;
9768 + * edef_evaluate_volume_node_list:
9770 + * 1) put all nodes from feature list back on volume list
9771 + * 2) loads the node's feature headers
9772 + * 3) counts the node list's entries
9773 + * 4) builds the feature node list
9774 + * 5) counts the feature headers for associative features
9775 + * 6) sets feature count to >1 if >1 features to be processed
9778 +edef_evaluate_volume_node_list(struct evms_logical_node **volume_node_list,
9779 + evms_volume_build_info_t * vbi,
9780 + int volume_complete)
9783 + struct evms_logical_node *node;
9786 + vbi->feature_count =
9787 + vbi->associative_feature_count = vbi->max_depth = 0;
9788 + vbi->plugin = NULL;
9790 + /* put all feature nodes back on the volume list */
9791 + rc = edef_isolate_nodes_by_type(ISOLATE_EVMS_VOLUMES,
9792 + &vbi->feature_node_list,
9793 + volume_node_list, 0, 0);
9797 + /* load all the feature headers */
9798 + if (!volume_complete) {
9799 + for (node = *volume_node_list; node; node = node->next) {
9800 + rc = edef_load_feature_header(node);
9806 + /* find the 1st max depth object:
9807 + * record the depth
9808 + * record the plugin
9810 + for (node = *volume_node_list; node; node = node->next) {
9811 + struct evms_plugin_header *plugin;
9812 + struct evms_feature_header *fh = node->feature_header;
9814 + /* count the nodes */
9815 + vbi->node_count++;
9817 + /* no feature header found, continue to next node */
9821 + /* check the depth */
9822 + if (fh->object_depth > vbi->max_depth) {
9823 + /* record new max depth */
9824 + vbi->max_depth = fh->object_depth;
9825 + /* find the plugin header for this feature id */
9826 + rc = edef_get_feature_plugin_header(fh->feature_id,
9830 + /* check for >1 plugins */
9831 + if (vbi->plugin != plugin) {
9832 + vbi->feature_count++;
9833 + vbi->plugin = plugin;
9836 + /* check for "associative" feature indicator */
9837 + if (GetPluginType(vbi->plugin->id) == EVMS_ASSOCIATIVE_FEATURE)
9838 + vbi->associative_feature_count++;
9840 + /* build a list of max depth nodes for this feature */
9841 + if (vbi->max_depth) {
9842 + rc = edef_isolate_nodes_by_type
9843 + (ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH, volume_node_list,
9844 + &vbi->feature_node_list, vbi->plugin->id, vbi->max_depth);
9848 + return (-ENODATA);
9849 + if (!vbi->feature_node_list)
9850 + return (-ENODATA);
9856 +/* function: edef_check_feature_conditions
9858 + * This routine verifies the state of volume based on the features
9859 + * headers and nodes in the current discovery list. All detected
9860 + * errors are considered fatal.
9863 +edef_check_feature_conditions(evms_volume_build_info_t * vbi)
9867 + if (vbi->associative_feature_count) {
9868 + if (vbi->node_count > 1) {
9869 + rc = -EVMS_VOLUME_FATAL_ERROR;
9871 + ("associative ERROR: > 1 nodes(%d) remaining to be processed!\n",
9873 + } else if (vbi->max_depth != 1) {
9874 + rc = -EVMS_VOLUME_FATAL_ERROR;
9876 + ("associative ERROR: associative feature found at node depth("PFU64") != 1!\n",
9879 + rc = -EVMS_ASSOCIATIVE_FEATURE;
9882 + if (!vbi->max_depth) {
9883 + if (vbi->node_count > 1) {
9884 + rc = -EVMS_VOLUME_FATAL_ERROR;
9886 + ("max depth ERROR: > 1 nodes(%d) remaining to be processed!\n",
9889 + } else if (vbi->max_depth == 1) {
9890 + if (vbi->feature_count > 1) {
9891 + rc = -EVMS_VOLUME_FATAL_ERROR;
9893 + ("max depth 1 ERROR: > 1 features remaining to be processed!\n");
9900 +/* function: edef_apply_features
9902 + * This routine applies none, one, or more features to an EVMS
9903 + * volume. The system data structure is first verified and then
9904 + * features are applied and verified recursively until the
9905 + * entire volume has been constructed. Fatal errors result in
9906 + * all nodes in the volume discovery list being deleted.
9909 +edef_apply_features(struct evms_logical_node **volume_node_list)
9911 + int rc = 1, done, top_feature_applying;
9912 + evms_volume_build_info_t vbi;
9914 + vbi.feature_node_list = NULL;
9915 + rc = edef_evaluate_volume_node_list(volume_node_list, &vbi, FALSE);
9917 + /* ensure we don't go into the next loop
9918 + * without having a target plugin to
9919 + * pass control to.
9922 + if (!vbi.plugin) {
9927 + /* this loop should ONLY get used when
9928 + * there are features to process.
9930 + done = (rc) ? TRUE : FALSE;
9932 + rc = edef_check_feature_conditions(&vbi);
9935 + top_feature_applying = (vbi.max_depth == 1) ? TRUE : FALSE;
9936 + rc = vbi.plugin->fops->discover(&vbi.feature_node_list);
9938 + rc = edef_evaluate_volume_node_list(volume_node_list,
9940 + top_feature_applying);
9941 + if (top_feature_applying == TRUE) {
9942 + if (vbi.node_count > 1) {
9943 + rc = -EVMS_VOLUME_FATAL_ERROR;
9945 + ("ERROR: detected > 1 node at volume completion!\n");
9949 + if (!vbi.plugin) {
9950 + rc = -EVMS_VOLUME_FATAL_ERROR;
9952 + ("ERROR: depth("PFU64"): expected another feature!\n",
9957 + } else { /* rc != 0 */
9958 + rc = -EVMS_VOLUME_FATAL_ERROR;
9963 + /* put all feature nodes back on the volume list */
9964 + if (edef_isolate_nodes_by_type(ISOLATE_EVMS_VOLUMES,
9965 + &vbi.feature_node_list,
9966 + volume_node_list, 0, 0))
9972 +edef_delete_node(struct evms_logical_node **node_list,
9973 + struct evms_logical_node *node, int return_code,
9978 + rc = evms_cs_remove_logical_node_from_list(node_list, node);
9980 + LOG_ERROR("%s error(%d): deleting volume(%s), node(%s)\n",
9981 + log_text, return_code,
9982 + node->volume_info->volume_name, node->name);
9983 + rc = DELETE(node);
9985 + LOG_ERROR("error(%d) while deleting node(%s)\n",
9990 + ("%s error(%d): node gone, assumed deleted by plugin.\n",
9991 + log_text, return_code);
9992 + /* plugin must have cleaned up the node.
9993 + * So just reset the return code and leave.
10002 +edef_process_evms_volumes(struct evms_logical_node **discover_list,
10003 + struct evms_logical_node **associative_feature_list)
10006 + struct evms_logical_node *node, *evms_volumes_list, *volume_node_list;
10009 + /* put all EVMS volumes on their own list */
10010 + evms_volumes_list = NULL;
10011 + rc = edef_isolate_nodes_by_type(ISOLATE_EVMS_VOLUMES,
10013 + &evms_volumes_list, 0, 0);
10015 + /* apply features to each EVMS volume */
10016 + /* one volume at a time on each pass */
10017 + while (evms_volumes_list) {
10018 + node = evms_volumes_list;
10019 + /* put all nodes for one EVMS volume on separate list */
10020 + volume_node_list = NULL;
10021 + volume_sn = node->volume_info->volume_sn;
10022 + rc = edef_isolate_nodes_by_type
10023 + (ISOLATE_EVMS_VOLUME_SERIAL_NUMBER, &evms_volumes_list,
10024 + &volume_node_list, 0, volume_sn);
10027 + /* go apply all the volume features now */
10028 + rc = edef_apply_features(&volume_node_list);
10030 + case 0: /* SUCCESS */
10031 + /* remove volume just processed */
10032 + node = volume_node_list;
10033 + rc = evms_cs_remove_logical_node_from_list
10034 + (&volume_node_list, node);
10037 + /* put volume on global list */
10038 + rc = evms_cs_add_logical_node_to_list(discover_list,
10041 + case -EVMS_ASSOCIATIVE_FEATURE:
10042 + /* put all "associative" features on their own list */
10043 + rc = edef_isolate_nodes_by_type
10044 + (ISOLATE_ASSOCIATIVE_FEATURES, &volume_node_list,
10045 + associative_feature_list, 0, 0);
10047 + default: /* FATAL ERROR */
10048 + /* delete each node remaining in the list */
10049 + if (volume_node_list) {
10051 + ("encountered fatal error building volume '%s'\n",
10052 + volume_node_list->volume_info->
10055 + while (volume_node_list) {
10056 + node = volume_node_list;
10057 + edef_delete_node(&volume_node_list,
10058 + node, rc, "EVMS feature");
10070 +edef_process_associative_volumes(struct evms_logical_node
10071 + **associative_feature_list,
10072 + struct evms_logical_node **discover_list)
10075 + struct evms_logical_node *node;
10077 + while (*associative_feature_list) {
10078 + node = *associative_feature_list;
10079 + /* remove this node from associative feature list */
10080 + rc = evms_cs_remove_logical_node_from_list
10081 + (associative_feature_list, node);
10084 + /* put volume on global list */
10085 + rc = evms_cs_add_logical_node_to_list(discover_list, node);
10088 + rc = edef_load_feature_header(node);
10091 + rc = edef_apply_feature(node, discover_list);
10093 + edef_delete_node(discover_list, node, rc,
10094 + "Associative feature");
10100 +edef_check_for_incomplete_volumes(struct evms_logical_node **discover_list)
10103 + struct evms_logical_node *next_node, *node;
10105 + /* check to see if any incomplete volumes are left around */
10106 + /* if so, delete them. */
10107 + /* complete volumes should not have feature_headers */
10108 + /* hanging off them, if we find any, we know the volume */
10109 + /* is incomplete. */
10111 + for (node = *discover_list; node; node = next_node) {
10112 + next_node = node->next;
10114 + if (node->feature_header) {
10115 + edef_delete_node(discover_list, node, rc,
10116 + "Unexpected feature header");
10123 + * Function: evms_discover_evms_features
10124 + * Description: Find features for nodes on the logical partitions list
10127 +evms_discover_evms_features(struct evms_logical_node **discover_list)
10129 + struct evms_logical_node *associative_feature_list;
10132 + LOG_EXTRA("discovering evms volume features...\n");
10134 + /* initialize "associative" features list */
10135 + associative_feature_list = NULL;
10137 + /* find the bottom features */
10138 + rc = edef_find_first_features(discover_list);
10139 +#ifdef LOCAL_DEBUG
10140 + display_discover_list(*discover_list, "after 1st features hdr");
10143 + /* process EVMS volumes here */
10144 + rc = edef_process_evms_volumes(discover_list,
10145 + &associative_feature_list);
10146 +#ifdef LOCAL_DEBUG
10147 + display_discover_list(*discover_list, "after evms volumes");
10150 + /* process "associative" features here */
10151 + rc = edef_process_associative_volumes(&associative_feature_list,
10153 +#ifdef LOCAL_DEBUG
10154 + display_discover_list(*discover_list, "after associatives");
10157 + /* check for incomplete volumes */
10158 + rc = edef_check_for_incomplete_volumes(discover_list);
10164 + * function: eelv_assign_volume_minor
10166 + * This is a support function for evms_export_logical_volumes.
10167 + * This routine assigns a specific minor number to a volume. It
10168 + * also performs the remaining steps to make this volume visible
10169 + * and usable to the kernel.
10173 +eelv_assign_volume_minor(struct evms_logical_node *node, int minor)
10175 + struct evms_logical_volume *volume;
10177 + /* initialize the logical_node entry in the volume array */
10178 + volume = &evms_logical_volumes[minor];
10179 + volume->node = node;
10181 + kmalloc(strlen(EVMS_GET_NODE_NAME(node)) + 1, GFP_KERNEL);
10182 + if (!volume->name)
10184 + strcpy(volume->name, EVMS_GET_NODE_NAME(node));
10186 + /* copy flags from top level node into volume structure */
10187 + volume->flags = node->flags;
10189 + /* check for read-only volume */
10190 + if (volume->flags & EVMS_VOLUME_READ_ONLY) {
10191 + set_device_ro(MKDEV(EVMS_MAJOR, minor), 1);
10194 + /* adjust volume size based on hardsector size */
10195 + node->total_vsectors &=
10196 + ~((node->hardsector_size >> EVMS_VSECTOR_SIZE_SHIFT) - 1);
10198 + /* initialize the global device arrays */
10199 + blksize_size[EVMS_MAJOR][minor] = node->block_size;
10200 + hardsect_size[EVMS_MAJOR][minor] = node->hardsector_size;
10201 + blk_size[EVMS_MAJOR][minor] = (int) (node->total_vsectors >> 1);
10203 + /* register this volume with devfs */
10204 + volume->devfs_handle =
10205 + devfs_register(evms_dir_devfs_handle,
10207 + DEVFS_FL_DEFAULT,
10208 + EVMS_MAJOR, minor,
10209 + S_IFBLK | S_IRUGO | S_IWUGO, &evms_fops, NULL);
10213 + LOG_DEFAULT("Exporting EVMS Volume(%u,%u) from \"%s%s\".\n",
10214 + EVMS_MAJOR, minor, EVMS_DEV_NODE_PATH, volume->name);
10218 + * function: eelv_check_for_duplicity
10220 + * This is a support function for evms_export_logical_volumes.
10221 + * This routine compares the serial number in the top most node
10222 + * in the volume to the list of currently exported volumes. If
10223 + * this volume's serial number is found in the list then we know
10224 + * this volume is a duplicate and it is then deleted.
10228 +eelv_check_for_duplicity(struct evms_logical_node **discover_list)
10230 + struct evms_logical_node *next_node, *node;
10231 + struct evms_logical_volume *lv;
10234 + for (node = *discover_list; node; node = next_node) {
10235 + next_node = node->next;
10238 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10239 + lv = &evms_logical_volumes[i];
10240 + /* only check exported volumes */
10242 + char *type_ptr = NULL;
10244 + /* check for duplicate pointer */
10245 + if (node == lv->node) {
10247 + type_ptr = "pointer";
10248 + /* check for duplicate node */
10249 + } else if (!strcmp(node->name, lv->node->name)) {
10251 + type_ptr = "node";
10253 + if (is_dup == TRUE) {
10254 + evms_cs_remove_logical_node_from_list
10255 + (discover_list, node);
10257 + ("deleting duplicate %s to EVMS volume(%u,%u,%s)...\n",
10258 + type_ptr, EVMS_MAJOR, i,
10259 + EVMS_GET_NODE_NAME(node));
10260 + /* forget duplicate */
10269 + * function: eelv_reassign_soft_deleted_volume_minors
10271 + * This is a support function for evms_export_logical_volumes.
10272 + * This routine reassigns minor numbers to rediscovered "soft"
10273 + * deleted volumes.
10277 +eelv_reassign_soft_deleted_volume_minors(struct evms_logical_node
10280 + struct evms_logical_node *next_node, *node;
10281 + struct evms_logical_volume *lv;
10282 + int i, node_removed;
10284 + for (node = *discover_list; node; node = next_node) {
10285 + next_node = node->next;
10287 + node_removed = FALSE;
10288 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10289 + lv = &evms_logical_volumes[i];
10290 + /* only check soft deleted volumes:
10291 + * they have a non-NULL name.
10293 + if (lv->flags & EVMS_VOLUME_SOFT_DELETED) {
10294 + if (!strcmp(EVMS_GET_NODE_NAME(node), lv->name)) {
10295 + /* reassign requested minor */
10296 + evms_cs_remove_logical_node_from_list
10297 + (discover_list, node);
10298 + node_removed = TRUE;
10299 + LOG_DEFAULT("Re");
10300 + /* free the previously used name */
10303 + /* clear the EVMS_VOLUME_SOFT_DELETED flag */
10305 + eelv_assign_volume_minor(node, i);
10314 + * function: eelv_assign_evms_volume_minors
10316 + * This is a support function for evms_export_logical_volumes.
10317 + * This routine assigns minor numbers to new evms volumes. If
10318 + * the specified minor is already in use, the requested minor
10319 + * is set to 0, and will be assigned next available along with
10320 + * any remaining volumes at the end of evms_export_logical_volumes.
10324 +eelv_assign_evms_volume_minors(struct evms_logical_node **discover_list)
10326 + struct evms_logical_node *next_node, *node, *lv_node;
10327 + unsigned int requested_minor, node_removed;
10329 + for (node = *discover_list; node; node = next_node) {
10330 + next_node = node->next;
10332 + node_removed = FALSE;
10333 + /* only process evms volumes */
10334 + if (node->flags & EVMS_VOLUME_FLAG) {
10335 + requested_minor = node->volume_info->volume_minor;
10336 + /* is there a requested minor? */
10337 + if (requested_minor) {
10338 + int lv_flags = 0;
10340 + /* check range of requested minor */
10341 + if (requested_minor >= MAX_EVMS_VOLUMES)
10344 + struct evms_logical_volume *lv;
10345 + lv = &evms_logical_volumes
10346 + [requested_minor];
10347 + lv_node = lv->node;
10348 + lv_flags = lv->flags;
10351 + && (!(lv_flags & EVMS_VOLUME_SOFT_DELETED))) {
10352 + /* assign requested minor */
10353 + evms_cs_remove_logical_node_from_list
10354 + (discover_list, node);
10355 + node_removed = TRUE;
10356 + eelv_assign_volume_minor(node,
10357 + requested_minor);
10360 + ("EVMS volume(%s) requesting invalid/in-use minor(%d), assigning next available!\n",
10361 + node->volume_info->volume_name,
10362 + requested_minor);
10364 + * requested minor is already
10365 + * in use, defer assignment
10368 + node->volume_info->volume_minor = 0;
10376 + * function: eelv_assign_remaining_evms_volume_minors
10378 + * This is a support function for evms_export_logical_volumes.
10379 + * This routine assigns minor numbers to new evms volumes that
10380 + * have no/conflicting minor assignments. This function will
10381 + * search from high(255) minor values down, for the first available
10382 + * minor. Searching high to low minimizes the possibility of
10383 + * conflicting evms volumes causing "compatibility" minor
10384 + * assignments to shift from expected assignments.
10388 +eelv_assign_remaining_evms_volume_minors(struct evms_logical_node
10391 + struct evms_logical_node *next_node, *node;
10392 + int requested_minor, node_removed;
10394 + for (node = *discover_list; node; node = next_node) {
10395 + next_node = node->next;
10397 + node_removed = FALSE;
10398 + /* only process evms volumes */
10399 + /* all remaining evms volumes should now
10400 + * have a minor value of 0, meaning they
10401 + * had no minor assignment, or their minor
10402 + * assignment conflicted with an existing
10403 + * minor assignment.
10405 + if (node->flags & EVMS_VOLUME_FLAG) {
10406 + evms_cs_remove_logical_node_from_list(discover_list,
10408 + node_removed = TRUE;
10409 + /* find next available minor number */
10410 + for (requested_minor = 255;
10411 + (evms_logical_volumes[requested_minor].node ||
10412 + evms_logical_volumes[requested_minor].name) &&
10413 + requested_minor; requested_minor--) ;
10414 + /* check range of assigned minor */
10415 + if (!requested_minor) {
10417 + ("no more minor numbers available for evms volumes!!!!\n");
10420 + /* assign requested minor */
10421 + eelv_assign_volume_minor(node, requested_minor);
10427 + * function: eelv_assign_remaining_volume_minors
10429 + * This is a support function for evms_export_logical_volumes.
10430 + * This routine assigns minor numbers to all remaining unassigned
10431 + * volumes. Minor numbers are assigned on an availability
10432 + * basis. The first free minor number is used in the assignment.
10436 +eelv_assign_remaining_volume_minors(struct evms_logical_node **discover_list)
10438 + struct evms_logical_node *node;
10441 + while (*discover_list) {
10442 + node = *discover_list;
10443 + evms_cs_remove_logical_node_from_list(discover_list, node);
10445 + /* find next available minor number */
10447 + (evms_logical_volumes[minor].node ||
10448 + evms_logical_volumes[minor].name) &&
10449 + minor < MAX_EVMS_VOLUMES; minor++) ;
10451 + if (minor >= MAX_EVMS_VOLUMES) {
10453 + ("no more minor numbers available for compatibility volumes!!!!\n");
10456 + /* assign minor */
10457 + eelv_assign_volume_minor(node, minor);
10462 + * function: eelv_check_for_unreassign_soft_deleted_volume
10464 + * This is a support function for evms_export_logical_volumes.
10465 + * This routine reports any "soft deleted" volumes that were not
10466 + * found after a rediscovery.
10469 +eelv_check_for_unreassign_soft_deleted_volume(void)
10471 + struct evms_logical_volume *lv;
10474 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10475 + lv = &evms_logical_volumes[i];
10476 + /* only check soft deleted volumes:
10477 + * they have a NULL node ptr &
10478 + * they have a non-NULL name.
10480 + if (lv->flags & EVMS_VOLUME_SOFT_DELETED) {
10482 + lv->flags |= EVMS_VOLUME_CORRUPT;
10484 + ("error: rediscovery failed to find %smounted 'soft deleted' volume(%u,%u,%s)...\n",
10485 + ((lv->flags & EVMS_VOLUME_CORRUPT) ? "" : "un"),
10486 + EVMS_MAJOR, i, lv->name);
10487 + if (lv->flags & EVMS_VOLUME_CORRUPT) {
10489 + (" flagging volume(%u,%u,%s) as CORRUPT!\n",
10490 + EVMS_MAJOR, i, lv->name);
10493 + (" releasing minor(%d) used by volume(%s)!\n",
10495 + /* clear logical volume structure
10496 + * for this volume so it may be
10508 +eelv_unquiesce_volumes(void)
10512 + /* check each volume array entry */
10513 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10514 + struct evms_logical_volume *volume;
10516 + volume = &evms_logical_volumes[i];
10517 + /* is this volume "quiesced" ? */
10518 + if (volume->quiesced) {
10520 + if (volume->node) {
10521 + /* "unquiesce" it */
10522 + struct inode inode;
10523 + struct evms_quiesce_vol_pkt qv;
10525 + qv.command = qv.status = 0;
10528 + rc = evms_quiesce_volume(volume, &inode, NULL,
10531 + /* Wake up any waiters */
10533 + /* clear the flag */
10534 + volume->quiesced = 0;
10535 + /* wake up the waiters */
10536 + if (waitqueue_active(&volume->wait_queue))
10537 + wake_up(&volume->wait_queue);
10538 +#ifdef VFS_PATCH_PRESENT
10539 + /* unquiesce VFS if quiesced */
10540 + if (volume->vfs_quiesced) {
10541 + /* VFS function call to unlock the filesystem */
10542 + unlockfs(MKDEV(EVMS_MAJOR, i));
10543 + volume->vfs_quiesced = FALSE;
10552 + * Function: evms_export_logical_volumes
10554 + * This function is called from evms_discover_volumes. It
10555 + * checks for duplicate volumes, assigns minor values to evms
10556 + * volumes, and assigns minor values to the remaining volumes.
10557 + * In addition to assigning minor values to each volume this
10558 + * function also completes the final steps necessary to allow
10559 + * the volumes to be used by the operating system.
10562 +evms_export_logical_volumes(struct evms_logical_node **discover_list)
10564 + LOG_EXTRA("exporting EVMS logical volumes...\n");
10566 + eelv_check_for_duplicity(discover_list);
10568 + eelv_reassign_soft_deleted_volume_minors(discover_list);
10570 + eelv_assign_evms_volume_minors(discover_list);
10572 + eelv_assign_remaining_evms_volume_minors(discover_list);
10574 + eelv_assign_remaining_volume_minors(discover_list);
10576 + eelv_check_for_unreassign_soft_deleted_volume();
10578 + /* "unquiesce" any "quiesced" volumes */
10579 + eelv_unquiesce_volumes();
10583 +edv_populate_discover_list(struct evms_list_node *src_list,
10584 + struct evms_logical_node **trg_list,
10585 + struct evms_rediscover_pkt *discover_parms)
10587 + int rc = 0, i, move_node, use_all_disks = FALSE;
10588 + struct evms_list_node *src_node;
10589 + struct evms_logical_node *disk_node = NULL;
10591 + /* if no discover parameters are specified */
10592 + /* copy ALL the disk nodes into the */
10593 + /* discovery list. */
10594 + if ((discover_parms == NULL) ||
10595 + (discover_parms->drive_count == REDISCOVER_ALL_DEVICES))
10596 + use_all_disks = TRUE;
10598 + /* copy the disk nodes specified in the */
10599 + /* discover_parms over to a discover list */
10600 + src_node = src_list;
10601 + while (src_node) {
10602 + move_node = use_all_disks;
10603 + if (move_node == FALSE)
10604 + /* check the rediscovery array */
10605 + for (i = 0; i < discover_parms->drive_count; i++) {
10607 + DEV_HANDLE_TO_NODE(discover_parms->
10609 + if (disk_node == src_node->item) {
10610 + move_node = TRUE;
10614 + /* check to see if we want this node */
10615 + if (move_node == TRUE)
10616 + evms_cs_add_logical_node_to_list(trg_list,
10618 + evms_logical_node *)
10620 + /* advance to next struct evms_list_node */
10621 + src_node = src_node->next;
10627 +evms_discover_volumes(struct evms_rediscover_pkt *discover_parms)
10630 + struct evms_logical_node *discover_list = NULL;
10632 + evms_discover_logical_disks(&discover_list);
10633 + if (evms_global_device_list) {
10634 + /* move the appropriate disk nodes, based on */
10635 + /* on the discover parameters, onto the */
10636 + /* discover list for the partition managers */
10638 + edv_populate_discover_list(evms_global_device_list,
10639 + &discover_list, discover_parms);
10641 + if (discover_list) {
10642 +#ifdef LOCAL_DEBUG
10643 + display_discover_list(discover_list, "after dev mgrs");
10645 + evms_discover_logical_partitions(&discover_list);
10647 + if (discover_list) {
10648 +#ifdef LOCAL_DEBUG
10649 + display_discover_list(discover_list, "after seg mgrs");
10651 + evms_discover_volume_groups(&discover_list);
10653 + if (discover_list) {
10654 +#ifdef LOCAL_DEBUG
10655 + display_discover_list(discover_list, "after reg mgrs");
10657 + evms_discover_evms_features(&discover_list);
10659 + if (discover_list) {
10660 +#ifdef LOCAL_DEBUG
10661 + display_discover_list(discover_list, "after features");
10663 + evms_export_logical_volumes(&discover_list);
10664 + evms_cs_signal_event(EVMS_EVENT_END_OF_DISCOVERY);
10669 +/* function: evms_notify_reboot
10671 + * this function gets called at shutdown time and is used
10672 + * to remove any evms controlled volumes from memory, thus
10673 + * allowing any plugins needing to flush internal caches
10677 +evms_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
10680 + struct evms_logical_volume *volume;
10685 + case SYS_POWER_OFF:
10686 + LOG_DEFAULT("stopping all evms controlled volumes.\n");
10688 + /* quiesce all volumes */
10689 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10690 + struct evms_quiesce_vol_pkt qv;
10691 + struct inode inode;
10693 + volume = &evms_logical_volumes[i];
10694 + if (!volume->node)
10696 + qv.command = 1; // quiesce
10698 + qv.status = 0; // reset status
10700 + evms_quiesce_volume(volume, &inode, NULL, &qv);
10702 + /* delete all volumes
10704 + * to ensure this works under the
10705 + * most circumstances, a "soft"
10706 + * delete will be done. this will
10707 + * handle the strange case of a
10708 + * volume still being mounted.
10710 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10711 + struct evms_delete_vol_pkt dv;
10713 + volume = &evms_logical_volumes[i];
10714 + if (!volume->node)
10716 + /* only delete quiesced volumes */
10717 + if (!volume->quiesced)
10719 + /* delete the volume from memory.
10720 + * do a 'soft' delete if volume
10721 + * is mounted, and 'hard' delete
10724 + dv.command = is_open(i);
10727 + evms_delete_volume(volume, &dv);
10730 + return NOTIFY_DONE;
10733 +static struct notifier_block evms_notifier = {
10734 + .notifier_call = evms_notify_reboot,
10736 + .priority = INT_MAX, /* before any real devices */
10740 + * Function: find_root_fs_dev
10741 + * If "root=/dev/evms/???" was specified on the kernel command line, and devfs
10742 + * is not enabled, we need to determine the appropriate minor number for the
10743 + * specified volume for the root fs.
10746 +find_root_fs_dev(void)
10749 + char root_name[64] = { 0 };
10753 +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,18)
10754 + strncpy(root_name, root_device_name, 63);
10756 + get_root_device_name(root_name);
10759 + if (!strncmp(root_name, EVMS_DIR_NAME "/", strlen(EVMS_DIR_NAME) + 1)) {
10760 + name = &root_name[strlen(EVMS_DIR_NAME) + 1];
10762 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10763 + if (evms_logical_volumes[i].name &&
10764 + !strncmp(name, evms_logical_volumes[i].name,
10765 + strlen(evms_logical_volumes[i].name))) {
10766 + ROOT_DEV = MKDEV(EVMS_MAJOR, i);
10775 + * Function: io_notify_cache_ctor
10776 + * this function zero-initializes the io_notify_t entries
10777 + * in our private io-notify pool.
10780 +io_notify_cache_ctor(void *foo, kmem_cache_t * cachep, unsigned long flags)
10782 + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
10783 + SLAB_CTOR_CONSTRUCTOR) {
10784 + io_notify_t *io_notify = (io_notify_t *) foo;
10785 + memset(io_notify, 0, sizeof (*io_notify));
10790 + * Function: bh_cache_ctor
10791 + * this function initializes the b_wait field in the buffer heads
10792 + * in our private buffer head pool.
10795 +bh_cache_ctor(void *foo, kmem_cache_t * cachep, unsigned long flags)
10797 + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
10798 + SLAB_CTOR_CONSTRUCTOR) {
10799 + struct buffer_head *bh = (struct buffer_head *) foo;
10800 + memset(bh, 0, sizeof (*bh));
10801 + init_waitqueue_head(&bh->b_wait);
10806 + * Function: evms_init_module
10807 + * This function runs once at system initialization.
10810 +evms_init_module(void)
10813 + int *evms_blocksizes;
10815 + LOG_DEFAULT("EVMS v%d.%d.%d initializing .... info level(%d).\n",
10816 + EVMS_MAJOR_VERSION,
10817 + EVMS_MINOR_VERSION,
10818 + EVMS_PATCHLEVEL_VERSION, evms_info_level);
10820 + /* initialize memory management counters */
10821 + evms_allocs = (atomic_t) ATOMIC_INIT(0);
10822 + evms_logical_nodes = (atomic_t) ATOMIC_INIT(0);
10824 + /* initialize the io_notify_entry pool */
10826 + evms_io_notify_pool = evms_cs_create_pool(sizeof (io_notify_t),
10827 + "EVMS IO Notify",
10828 + io_notify_cache_ctor,
10831 + /* initialize the "public" buffer_head pool */
10833 + evms_bh_pool = evms_cs_create_pool(sizeof (struct buffer_head),
10835 + bh_cache_ctor, NULL);
10837 + /* allocate the logical volume array */
10839 + evms_logical_volumes =
10840 + kmalloc(sizeof (struct evms_logical_volume) *
10841 + MAX_EVMS_VOLUMES, GFP_KERNEL);
10842 + if (!evms_logical_volumes) {
10846 + /* initialize the logical volume array entries */
10848 + memset(evms_logical_volumes, 0,
10849 + sizeof (struct evms_logical_volume) * MAX_EVMS_VOLUMES);
10850 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10851 + struct evms_logical_volume *volume;
10853 + volume = &evms_logical_volumes[i];
10854 + init_waitqueue_head(&volume->wait_queue);
10855 + volume->requests_in_progress =
10856 + (atomic_t) ATOMIC_INIT(0);
10858 + blk_init_queue(&volume->request_queue,
10859 + evms_do_request_fn);
10860 + blk_queue_make_request(&volume->request_queue,
10861 + evms_make_request_fn);
10866 + /* allocate EVMS' blk_size array */
10868 + evms_blocksizes = kmalloc(MAX_EVMS_VOLUMES *
10869 + sizeof (int), GFP_KERNEL);
10870 + if (!evms_blocksizes) {
10873 + ("can't allocate memory for EVMS blk_size\n");
10875 + memset(evms_blocksizes, 0,
10876 + MAX_EVMS_VOLUMES * sizeof (int));
10877 + blk_size[EVMS_MAJOR] = evms_blocksizes;
10881 + /* allocate EVMS' blksize_size array */
10883 + evms_blocksizes = kmalloc(MAX_EVMS_VOLUMES *
10884 + sizeof (int), GFP_KERNEL);
10885 + if (!evms_blocksizes) {
10888 + ("can't allocate memory for EVMS blksize_size\n");
10890 + memset(evms_blocksizes, 0,
10891 + MAX_EVMS_VOLUMES * sizeof (int));
10892 + blksize_size[EVMS_MAJOR] = evms_blocksizes;
10896 + /* allocate EVMS' hardsect_size array */
10898 + evms_blocksizes = kmalloc(MAX_EVMS_VOLUMES *
10899 + sizeof (int), GFP_KERNEL);
10900 + if (!evms_blocksizes) {
10903 + ("can't allocate memory for EVMS hardsect_size\n");
10905 + memset(evms_blocksizes, 0,
10906 + MAX_EVMS_VOLUMES * sizeof (int));
10907 + hardsect_size[EVMS_MAJOR] = evms_blocksizes;
10911 + /* Register the block device */
10913 + rc = devfs_register_blkdev(EVMS_MAJOR, EVMS_DIR_NAME,
10917 + ("error calling devfs_register_blkdev() err=%u\n",
10923 + /* Register with devfs */
10925 + evms_dir_devfs_handle = devfs_mk_dir(NULL, EVMS_DIR_NAME, NULL);
10926 + // A NULL return cannot be fatal.
10927 + // Devfs just might not be running
10928 + if (!evms_dir_devfs_handle) {
10930 + ("NULL return from devfs_mk_dir() for \"%s\"\n",
10932 + LOG_EXTRA("Is devfs enabled?\n");
10934 + evms_blk_devfs_handle =
10935 + devfs_register(evms_dir_devfs_handle, EVMS_DEV_NAME,
10936 + DEVFS_FL_DEFAULT, EVMS_MAJOR, 0,
10937 + S_IFBLK | S_IRUGO | S_IWUGO,
10938 + &evms_fops, NULL);
10939 + if (!evms_blk_devfs_handle) {
10941 + ("NULL return from devfs_register() for \"%s\"\n",
10948 + read_ahead[EVMS_MAJOR] = 4096;
10950 + blk_dev[EVMS_MAJOR].queue = evms_find_queue;
10952 + blk_init_queue(BLK_DEFAULT_QUEUE(EVMS_MAJOR),
10953 + evms_do_request_fn);
10954 + blk_queue_make_request(BLK_DEFAULT_QUEUE(EVMS_MAJOR),
10955 + evms_make_request_fn);
10957 +#ifdef CONFIG_PROC_FS
10958 + evms_cs_get_evms_proc_dir();
10959 + if (evms_proc_dir) {
10960 + create_proc_read_entry("info", 0, evms_proc_dir,
10961 + evms_info_read_proc, NULL);
10962 + create_proc_read_entry("plugins", 0, evms_proc_dir,
10963 + evms_plugins_read_proc, NULL);
10964 + create_proc_read_entry("volumes", 0, evms_proc_dir,
10965 + evms_volumes_read_proc, NULL);
10967 + evms_table_header = register_sysctl_table(dev_dir_table, 1);
10969 + /* Register for reboot notification */
10970 + register_reboot_notifier(&evms_notifier);
10972 +#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64)
10973 + /* Register evms 32bit ioctl handlers */
10975 + register_ioctl32_conversion(EVMS_GET_INFO_LEVEL,NULL);
10976 + register_ioctl32_conversion(EVMS_SET_INFO_LEVEL,NULL);
10977 + register_ioctl32_conversion(EVMS_REDISCOVER_VOLUMES_32,
10978 + evms_rediscover);
10979 + register_ioctl32_conversion(EVMS_DELETE_VOLUME,NULL);
10980 + register_ioctl32_conversion(EVMS_PLUGIN_IOCTL_32,
10981 + evms_plugin_ioctl);
10982 + register_ioctl32_conversion(EVMS_PROCESS_NOTIFY_EVENT,NULL);
10983 + register_ioctl32_conversion(EVMS_GET_LOGICAL_DISK,NULL);
10984 + register_ioctl32_conversion(EVMS_GET_LOGICAL_DISK_INFO,NULL);
10985 + register_ioctl32_conversion(EVMS_SECTOR_IO_32, evms_sector_io);
10986 + register_ioctl32_conversion(EVMS_GET_MINOR,NULL);
10987 + register_ioctl32_conversion(EVMS_GET_VOLUME_DATA,NULL);
10988 + register_ioctl32_conversion(EVMS_GET_PLUGIN,NULL);
10989 + register_ioctl32_conversion(EVMS_COMPUTE_CSUM_32,
10990 + evms_compute_csum);
10991 + register_ioctl32_conversion(EVMS_GET_BMAP,NULL);
10992 + register_ioctl32_conversion(EVMS_GET_IOCTL_VERSION,NULL);
10993 + register_ioctl32_conversion(EVMS_GET_VERSION,NULL);
10994 + register_ioctl32_conversion(EVMS_UPDATE_DEVICE_INFO,NULL);
10995 + register_ioctl32_conversion(EVMS_CHECK_MOUNT_STATUS,NULL);
10996 + register_ioctl32_conversion(EVMS_GET_VOL_STRIPE_INFO,NULL);
11006 + * Function: evms_exit_module
11007 + * This function runs once when the EVMS core module is unloaded.
11009 +static void __exit
11010 +evms_exit_module(void)
11012 + LOG_DEFAULT("EVMS v%d.%d.%d unloading ....\n",
11013 + EVMS_MAJOR_VERSION,
11014 + EVMS_MINOR_VERSION, EVMS_PATCHLEVEL_VERSION);
11016 +#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64)
11017 + /* Un-Register evms 32bit ioctl handlers */
11019 + unregister_ioctl32_conversion(EVMS_GET_INFO_LEVEL);
11020 + unregister_ioctl32_conversion(EVMS_SET_INFO_LEVEL);
11021 + unregister_ioctl32_conversion(EVMS_REDISCOVER_VOLUMES_32);
11022 + unregister_ioctl32_conversion(EVMS_DELETE_VOLUME);
11023 + unregister_ioctl32_conversion(EVMS_PLUGIN_IOCTL_32);
11024 + unregister_ioctl32_conversion(EVMS_PROCESS_NOTIFY_EVENT);
11025 + unregister_ioctl32_conversion(EVMS_GET_LOGICAL_DISK);
11026 + unregister_ioctl32_conversion(EVMS_GET_LOGICAL_DISK_INFO);
11027 + unregister_ioctl32_conversion(EVMS_SECTOR_IO_32);
11028 + unregister_ioctl32_conversion(EVMS_GET_MINOR);
11029 + unregister_ioctl32_conversion(EVMS_GET_VOLUME_DATA);
11030 + unregister_ioctl32_conversion(EVMS_GET_PLUGIN);
11031 + unregister_ioctl32_conversion(EVMS_COMPUTE_CSUM_32);
11032 + unregister_ioctl32_conversion(EVMS_GET_BMAP);
11033 + unregister_ioctl32_conversion(EVMS_GET_IOCTL_VERSION);
11034 + unregister_ioctl32_conversion(EVMS_GET_VERSION);
11035 + unregister_ioctl32_conversion(EVMS_UPDATE_DEVICE_INFO);
11036 + unregister_ioctl32_conversion(EVMS_CHECK_MOUNT_STATUS);
11037 + unregister_ioctl32_conversion(EVMS_GET_VOL_STRIPE_INFO);
11041 + /* unregister with devfs
11043 + devfs_unregister(evms_dir_devfs_handle);
11044 + /* clean up the queue for the block device
11046 + blk_cleanup_queue(blk_get_queue(MKDEV(EVMS_MAJOR, 0)));
11047 + /* unregister block device
11049 + devfs_unregister_blkdev(EVMS_MAJOR, EVMS_DIR_NAME);
11050 + /* deallocate device arrays
11052 + kfree(blk_size[EVMS_MAJOR]);
11053 + blk_size[EVMS_MAJOR] = NULL;
11054 + kfree(blksize_size[EVMS_MAJOR]);
11055 + blksize_size[EVMS_MAJOR] = NULL;
11056 + kfree(hardsect_size[EVMS_MAJOR]);
11057 + hardsect_size[EVMS_MAJOR] = NULL;
11058 + read_ahead[EVMS_MAJOR] = 0;
11059 + /* deallocate logical volumes array
11061 + kfree(evms_logical_volumes);
11062 + /* destroy buffer head pool
11064 + evms_cs_destroy_pool(evms_bh_pool);
11065 + /* destroy io notify pool
11067 + evms_cs_destroy_pool(evms_io_notify_pool);
11068 +#ifdef CONFIG_PROC_FS
11069 + if (evms_proc_dir) {
11070 + remove_proc_entry("volumes", evms_proc_dir);
11071 + remove_proc_entry("plugins", evms_proc_dir);
11072 + remove_proc_entry("info", evms_proc_dir);
11073 + remove_proc_entry("evms", NULL);
11075 + unregister_sysctl_table(evms_table_header);
11080 + * Function: evms_init_discover
11081 + * If EVMS is statically built into the kernel, this function will be called
11082 + * to perform an initial volume discovery.
11085 +evms_init_discover(void)
11087 + /* go find volumes */
11088 + evms_discover_volumes(NULL);
11090 + /* Check if the root fs is on EVMS */
11091 + if (MAJOR(ROOT_DEV) == EVMS_MAJOR) {
11092 + find_root_fs_dev();
11099 + * a placeholder for cluster enablement
11102 +evms_cluster_init(int nodeid, int clusterid)
11108 +EXPORT_SYMBOL(evms_cluster_init);
11111 + * a placeholder for cluster enablement
11114 +evms_cluster_shutdown(void)
11120 +EXPORT_SYMBOL(evms_cluster_shutdown);
11123 +evms_boot_info_level(char *str)
11125 + int evms_boot_info_level = (int) simple_strtoul(str, NULL, 10);
11126 + if (evms_boot_info_level) {
11127 + evms_info_level = evms_boot_info_level;
11132 +__setup("evms_info_level=", evms_boot_info_level);
11133 +module_init(evms_init_module);
11134 +module_exit(evms_exit_module);
11135 +__initcall(evms_init_discover);
11136 +#ifdef MODULE_LICENSE
11137 +MODULE_LICENSE("GPL");
11140 +/**********************************************************/
11141 +/* END -- INIT/DISCOVERY support functions */
11142 +/**********************************************************/
11143 diff -Naur linux-2002-09-30/drivers/evms/evms_bbr.c evms-2002-09-30/drivers/evms/evms_bbr.c
11144 --- linux-2002-09-30/drivers/evms/evms_bbr.c Wed Dec 31 18:00:00 1969
11145 +++ evms-2002-09-30/drivers/evms/evms_bbr.c Wed Sep 25 15:04:22 2002
11147 +/* -*- linux-c -*- */
11149 + * Copyright (c) International Business Machines Corp., 2000
11151 + * This program is free software; you can redistribute it and/or modify
11152 + * it under the terms of the GNU General Public License as published by
11153 + * the Free Software Foundation; either version 2 of the License, or
11154 + * (at your option) any later version.
11156 + * This program is distributed in the hope that it will be useful,
11157 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
11158 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
11159 + * the GNU General Public License for more details.
11161 + * You should have received a copy of the GNU General Public License
11162 + * along with this program; if not, write to the Free Software
11163 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
11165 +/* linux/driver/evms/evms_bbr.c
11167 + * EVMS - Bad Block Relocation (BBR) Feature Plugin
11169 + * BBR feature is designed to remap I/O write failures to another safe location
11170 + * on disk. Note that most disk drives have BBR built into them, this means
11171 + * that our software BBR will be only activated when all hardware BBR
11172 + * replacement sectors have been used.
11175 +#define LOG_PREFIX "bbr: "
11177 +#include <linux/config.h>
11178 +#include <linux/kernel.h>
11179 +#include <linux/module.h>
11180 +#include <linux/mempool.h>
11181 +#include <asm/uaccess.h>
11183 +#include <linux/evms/evms.h>
11184 +#include <linux/evms/evms_bbr_k.h>
11186 +/* API prototypes. */
11187 +static int bbr_discover(struct evms_logical_node ** discover_list);
11188 +static int bbr_delete(struct evms_logical_node * node);
11189 +static void bbr_read(struct evms_logical_node * node, struct buffer_head * bh);
11190 +static void bbr_write(struct evms_logical_node * node, struct buffer_head * bh);
11191 +static int bbr_ioctl(struct evms_logical_node * bbr_node,
11192 + struct inode * inode,
11193 + struct file * file,
11194 + unsigned int cmd,
11195 + unsigned long arg);
11196 +static int bbr_direct_ioctl(struct inode * inode,
11197 + struct file * file,
11198 + unsigned int cmd,
11199 + unsigned long arg);
11200 +static int bbr_init_io(struct evms_logical_node * bbr_node,
11206 +/* Other function prototypes. */
11207 +static int bbr_create_pools(void);
11208 +static void bbr_destroy_pools(void);
11209 +static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id);
11210 +static void bbr_io_handler(void * void_data);
11211 +static void bbr_free_private(struct bbr_private * bbr_id);
11212 +static inline void bbr_list_add(struct bbr_private * bbr_id);
11214 +/* List of all BBR nodes. */
11215 +static struct bbr_private * bbr_instances = NULL;
11217 +/* Data pertaining to the I/O thread. */
11218 +static struct evms_thread * bbr_io_thread = NULL;
11219 +static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
11220 +static struct list_head bbr_io_list = LIST_HEAD_INIT(bbr_io_list);
11222 +/* Global pools for bbr_io_buf's and bbr_remap's. */
11223 +kmem_cache_t * bbr_io_buf_slab;
11224 +mempool_t * bbr_io_buf_pool;
11225 +kmem_cache_t * bbr_remap_slab;
11226 +mempool_t * bbr_remap_pool;
11228 +/* Plugin function table and header. */
11229 +static struct evms_plugin_fops function_table = {
11230 + .discover = bbr_discover,
11231 + .delete = bbr_delete,
11232 + .read = bbr_read,
11233 + .write = bbr_write,
11234 + .init_io = bbr_init_io,
11235 + .ioctl = bbr_ioctl,
11236 + .direct_ioctl = bbr_direct_ioctl
11239 +static struct evms_plugin_header plugin_header = {
11240 + .id = SetPluginID(IBM_OEM_ID,
11242 + EVMS_BBR_FEATURE_ID),
11244 + .major = EVMS_BBR_VERSION_MAJOR,
11245 + .minor = EVMS_BBR_VERSION_MINOR,
11246 + .patchlevel = EVMS_BBR_VERSION_PATCHLEVEL
11248 + .required_services_version = {
11249 + .major = EVMS_BBR_COMMON_SERVICES_MAJOR,
11250 + .minor = EVMS_BBR_COMMON_SERVICES_MINOR,
11251 + .patchlevel = EVMS_BBR_COMMON_SERVICES_PATCHLEVEL
11253 + .fops = &function_table
11257 + * le_meta_data_to_cpu
11259 + * Convert bbr meta data from on-disk (LE) format
11260 + * to the native cpu endian format.
11262 +void le_meta_data_to_cpu(struct evms_bbr_metadata * md)
11264 + md->signature = le32_to_cpup(&md->signature);
11265 + md->crc = le32_to_cpup(&md->crc);
11266 + md->block_size = le32_to_cpup(&md->block_size);
11267 + md->flags = le32_to_cpup(&md->flags);
11268 + md->sequence_number = le64_to_cpup(&md->sequence_number);
11269 + md->start_sect_bbr_table = le64_to_cpup(&md->start_sect_bbr_table);
11270 + md->nr_sects_bbr_table = le64_to_cpup(&md->nr_sects_bbr_table);
11271 + md->start_replacement_sect = le64_to_cpup(&md->start_replacement_sect);
11272 + md->nr_replacement_blks = le64_to_cpup(&md->nr_replacement_blks);
11276 + * le_bbr_table_sector_to_cpu
11278 + * Convert bbr meta data from on-disk (LE) format
11279 + * to the native cpu endian format.
11281 +void le_bbr_table_sector_to_cpu(struct evms_bbr_table * p)
11284 + p->signature = le32_to_cpup(&p->signature);
11285 + p->crc = le32_to_cpup(&p->crc);
11286 + p->sequence_number = le32_to_cpup(&p->sequence_number);
11287 + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
11288 + for ( i = 0; i < EVMS_BBR_ENTRIES_PER_SECT; i++ ) {
11289 + p->entries[i].bad_sect =
11290 + le64_to_cpup(&p->entries[i].bad_sect);
11291 + p->entries[i].replacement_sect =
11292 + le64_to_cpup(&p->entries[i].replacement_sect);
11297 + * cpu_bbr_table_sector_to_le
11299 + * Convert bbr meta data from cpu endian format to on-disk (LE) format
11301 +void cpu_bbr_table_sector_to_le(struct evms_bbr_table * p,
11302 + struct evms_bbr_table * le)
11305 + le->signature = cpu_to_le32p(&p->signature);
11306 + le->crc = cpu_to_le32p(&p->crc);
11307 + le->sequence_number = cpu_to_le32p(&p->sequence_number);
11308 + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
11309 + for ( i = 0; i < EVMS_BBR_ENTRIES_PER_SECT; i++ ) {
11310 + le->entries[i].bad_sect =
11311 + cpu_to_le64p(&p->entries[i].bad_sect);
11312 + le->entries[i].replacement_sect =
11313 + cpu_to_le64p(&p->entries[i].replacement_sect);
11317 +#ifdef EVMS_BBR_DEBUG
11318 +static void print_meta_data(struct evms_bbr_metadata * md)
11320 + LOG_DEBUG("BBR Metadata Sector:\n"
11321 + " signature 0x%08X\n"
11323 + " block_size %u\n"
11324 + " start_sect_bbr_table "PFU64"\n"
11325 + " nr_sects_bbr_table "PFU64"\n"
11326 + " start_replacement_sect "PFU64"\n"
11327 + " nr_replacement_blks "PFU64"\n",
11328 + md->signature, md->crc, md->block_size,
11329 + md->start_sect_bbr_table, md->nr_sects_bbr_table,
11330 + md->start_replacement_sect, md->nr_replacement_blks);
11333 +static void print_bbr_table_sector(struct evms_bbr_table * p)
11336 + LOG_DEBUG("BBR Table Sector:\n"
11340 + " in_use_cnt %u\n"
11341 + " Table Entries:\n",
11342 + p->signature, p->crc, p->sequence_number, p->in_use_cnt);
11343 + for ( i = 0; i < EVMS_BBR_ENTRIES_PER_SECT; i++ ) {
11344 + LOG_DEBUG(" [%d] bad_sect: "PFU64" replacement_sect: "PFU64"\n",
11345 + i, p->entries[i].bad_sect,
11346 + p->entries[i].replacement_sect);
11350 +void print_binary_tree(struct bbr_runtime_remap * node)
11353 + LOG_DEFAULT("["PFU64","PFU64"]\n", node->remap.bad_sect,
11354 + node->remap.replacement_sect);
11355 + print_binary_tree(node->left);
11356 + print_binary_tree(node->right);
11360 +static void print_remap_list(struct bbr_private * bbr_id)
11362 + if (bbr_id->remap_root) {
11363 + LOG_DEFAULT("%s for %s\n", __FUNCTION__, bbr_id->node->name);
11364 + print_binary_tree(bbr_id->remap_root);
11370 + * validate_bbr_table_sector
11372 + * Check the specified BBR table sector for a valid signature and CRC.
11374 +static int validate_bbr_table_sector(struct evms_bbr_table * p)
11377 + int org_crc, final_crc;
11379 + if ( le32_to_cpup(&p->signature) != EVMS_BBR_TABLE_SIGNATURE ) {
11380 + LOG_ERROR("BBR table signature doesn't match!\n");
11381 + LOG_ERROR("Sector has (0x%08X) expected(0x%08X)\n",
11382 + le32_to_cpup(&p->signature),
11383 + EVMS_BBR_TABLE_SIGNATURE);
11387 + org_crc = le32_to_cpup(&p->crc);
11389 + final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, p,
11391 + if ( final_crc != org_crc ) {
11392 + LOG_ERROR("CRC failed!\n");
11393 + LOG_ERROR("Sector has (0x%08X) calculated(0x%08X)\n",
11394 + org_crc, final_crc);
11397 + p->crc = cpu_to_le32p(&org_crc);
11399 + LOG_ERROR("BBR table sector has no CRC!\n");
11404 + BBR_DEBUG_PRINT_TABLE_SECTOR(p);
11405 + le_bbr_table_sector_to_cpu(p);
11410 + * update_invalid_bbr_table_sector
11412 + * If one copy of a BBR table sector is bad, replace it with the valid copy.
11414 +void update_invalid_bbr_table_sector(struct evms_logical_node * node,
11415 + struct evms_bbr_table * valid,
11416 + struct evms_bbr_table * invalid,
11420 + struct evms_bbr_table * tmp_bbr_table;
11422 + /* Correct the invalid bbr table sector */
11423 + memcpy(invalid, valid, sizeof(struct evms_bbr_table));
11425 + /* Allocate memory for I/O */
11426 + tmp_bbr_table = kmalloc(sizeof(struct evms_bbr_table), GFP_KERNEL);
11427 + if (tmp_bbr_table) {
11428 + memset(tmp_bbr_table, 0, sizeof(struct evms_bbr_table));
11429 + cpu_bbr_table_sector_to_le(valid, tmp_bbr_table);
11430 + LOG_WARNING("Correcting BBR table sector "PFU64"\n", lsn);
11431 + rc = INIT_IO(node, 1, lsn, 1, tmp_bbr_table);
11433 + LOG_ERROR("Could not correct BBR table sector "PFU64".\n",
11436 + kfree(tmp_bbr_table);
11441 + * validate_bbr_table
11443 + * Validate the entire range of sectors in the BBR table.
11445 +static u32 validate_bbr_table(struct evms_bbr_metadata * md,
11446 + struct evms_bbr_table * p)
11450 + nr_sects = md->nr_sects_bbr_table;
11452 + for ( i = 0; i < nr_sects; i++, p++ ) {
11453 + if ( validate_bbr_table_sector(p) )
11457 + if ( i != nr_sects ) {
11458 + LOG_SERIOUS("Stopped BBR table validation at sector %u.\n", i);
11461 + LOG_DEBUG("Validated %u BBR table sectors.\n", nr_sects);
11466 + * validate_bbr_tables
11467 + * @node: BBR node to validate.
11468 + * @MD1: Primary metadata sector.
11469 + * @MD2: Secondary metadata sector.
11470 + * @p1: Primary BBR table.
11471 + * @p2: Secondary BBR table.
11473 + * Validate both copies of the BBR table. If one of them is invalid,
11474 + * try to correct the errors using the valid copy.
11476 +static u32 validate_bbr_tables(struct evms_logical_node * node,
11477 + struct evms_bbr_metadata * MD1,
11478 + struct evms_bbr_metadata * MD2,
11479 + struct evms_bbr_table * p1,
11480 + struct evms_bbr_table * p2)
11482 + u32 i, rc1, rc2, nr_sects;
11484 + nr_sects = MD1->nr_sects_bbr_table;
11485 + if ( nr_sects != MD2->nr_sects_bbr_table ) {
11486 + nr_sects = (nr_sects < MD2->nr_sects_bbr_table) ?
11487 + nr_sects : MD2->nr_sects_bbr_table;
11488 + LOG_SERIOUS("Size of BBR tables don't match. Using %u\n",
11492 + for ( i = 0; i < nr_sects; i++, p1++, p2++ ) {
11493 + rc1 = validate_bbr_table_sector(p1);
11495 + LOG_WARNING("Invalid BBR table sector at "PFU64".\n",
11496 + MD1->start_sect_bbr_table + i);
11498 + rc2 = validate_bbr_table_sector(p2);
11500 + LOG_WARNING("Invalid BBR table sector at "PFU64".\n",
11501 + MD2->start_sect_bbr_table + i);
11504 + /* Correct BBR table errors. */
11505 + if (rc1 && rc2) {
11506 + /* Cannot fix. */
11508 + } else if (rc1) {
11509 + update_invalid_bbr_table_sector(node, p2, p1,
11510 + MD1->start_sect_bbr_table + i);
11512 + } else if (rc2) {
11513 + update_invalid_bbr_table_sector(node, p1, p2,
11514 + MD2->start_sect_bbr_table + i);
11518 + if ( p1->sequence_number != p2->sequence_number ) {
11519 + LOG_WARNING("Sequence numbers for BBR table index %u don't match.\n", i);
11520 + LOG_WARNING("MD1 sequence_nr=%u, MD2 sequence_nr_2=%u\n",
11521 + p1->sequence_number, p2->sequence_number);
11522 + if ( p1->sequence_number < p2->sequence_number ) {
11523 + update_invalid_bbr_table_sector(node, p2, p1,
11524 + MD1->start_sect_bbr_table + i);
11526 + update_invalid_bbr_table_sector(node, p1, p2,
11527 + MD2->start_sect_bbr_table + i);
11531 + if ( i != nr_sects ) {
11532 + LOG_SERIOUS("Stopped validation at sector %u\n", i);
11535 + LOG_DEBUG("Validated %u BBR table sectors.\n", nr_sects);
11540 + * validate_meta_data
11542 + * Check the specified BBR metadata sector for a valid signature and CRC.
11544 +static int validate_meta_data(struct evms_bbr_metadata * md)
11546 + int org_crc, final_crc;
11548 + BBR_DEBUG_PRINT_META_DATA(md);
11550 + if ( le32_to_cpup(&md->signature) != EVMS_BBR_SIGNATURE ) {
11551 + LOG_SERIOUS("BBR signature doesn't match!\n");
11552 + LOG_SERIOUS("Found: 0x%08X Expecting: 0x%08X\n",
11553 + le32_to_cpup(&md->signature), EVMS_BBR_SIGNATURE);
11558 + org_crc = le32_to_cpup(&md->crc);
11560 + final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, md,
11562 + if ( final_crc != org_crc ) {
11563 + LOG_ERROR("CRC failed!\n");
11564 + LOG_ERROR("Sector has (0x%08X) calculated(0x%08X)\n",
11565 + org_crc, final_crc);
11568 + md->crc = cpu_to_le32p(&org_crc);
11570 + LOG_WARNING("Metadata sector has no CRC!\n");
11573 + le_meta_data_to_cpu(md);
11578 + * bbr_load_meta_data
11579 + * @node: BBR node to read metadata from.
11580 + * @lsn: Sector to read metadata from.
11581 + * @md: Pointer to return metadata structure.
11582 + * @bbr_table: Pointer to return BBR table.
11584 + * Load one copy of the BBR metadata. If the metadata is valid, load the
11585 + * corresponding copy of the BBR table.
11587 +static int load_meta_data(struct evms_logical_node * node,
11589 + struct evms_bbr_metadata ** md,
11590 + struct evms_bbr_table ** bbr_table)
11595 + *bbr_table = NULL;
11598 + LOG_WARNING("No sector specified for BBR metadata on %s.\n",
11603 + /* Allocate a buffer for the metadata sector. */
11604 + *md = kmalloc(sizeof(struct evms_bbr_metadata), GFP_KERNEL);
11606 + LOG_ERROR("kmalloc error creating metadata buffer for %s.\n",
11611 + /* Read the metadata sector. */
11612 + rc = INIT_IO(node, 0, lsn, 1, *md);
11614 + LOG_ERROR("init_io error on %s.\n", node->name);
11620 + /* Validate the metadata sector. */
11621 + rc = validate_meta_data(*md);
11623 + LOG_ERROR("Error validating metadata for %s.\n", node->name);
11629 + /* Allocate a buffer for the BBR table. */
11630 + *bbr_table = kmalloc((*md)->nr_sects_bbr_table <<
11631 + EVMS_VSECTOR_SIZE_SHIFT, GFP_KERNEL);
11632 + if (!*bbr_table) {
11633 + LOG_ERROR("kmalloc error creating BBR table buffer for %s.\n",
11640 + /* Read the BBR table but don't validate here. */
11641 + rc = INIT_IO(node, 0, (*md)->start_sect_bbr_table,
11642 + (*md)->nr_sects_bbr_table, *bbr_table);
11644 + LOG_ERROR("init_io error on %s.\n", node->name);
11647 + kfree(*bbr_table);
11648 + *bbr_table = NULL;
11655 + * bbr_load_feature_data
11656 + * @node: BBR node
11657 + * @ID: Return pointer to BBR private data.
11659 + * Load both copies of the BBR metadata and table. If one is invalid, try
11660 + * to correct is using the valid copy. When a valid copy is found, create
11661 + * a private data structure for the specified node.
11663 +static int load_feature_data(struct evms_logical_node * node,
11664 + struct bbr_private ** ID)
11666 + struct evms_bbr_metadata * md1 = NULL;
11667 + struct evms_bbr_metadata * md2 = NULL;
11668 + struct evms_bbr_table * table1 = NULL;
11669 + struct evms_bbr_table * table2 = NULL;
11670 + u64 lba_table1 = 0, lba_table2 = 0;
11671 + u32 nr_sects = 0;
11672 + int rc = 0, rc1, rc2;
11676 + /* Load metadata 1 */
11677 + rc1 = load_meta_data(node,
11678 + node->feature_header->feature_data1_start_lsn,
11680 + /* Load metadata 2 */
11681 + rc2 = load_meta_data(node,
11682 + node->feature_header->feature_data2_start_lsn,
11685 + if (rc1 && rc2) {
11686 + /* Both copies are bad? Cannot continue. */
11688 + } else if (rc1 || rc2) {
11689 + /* One copy is bad. Use the good copy. */
11691 + lba_table2 = md2->start_sect_bbr_table;
11699 + lba_table1 = md1->start_sect_bbr_table;
11702 + nr_sects = validate_bbr_table(md1, table1);
11703 + if ( nr_sects == 0 ) {
11707 + lba_table1 = md1->start_sect_bbr_table;
11708 + lba_table2 = md2->start_sect_bbr_table;
11709 + nr_sects = validate_bbr_tables(node, md1, md2, table1, table2);
11710 + if ( nr_sects == 0 ) {
11715 + if (!rc && nr_sects) {
11716 + *ID = kmalloc(sizeof(struct bbr_private), GFP_KERNEL);
11718 + memset(*ID, 0, sizeof(struct bbr_private));
11719 + (*ID)->source = node;
11720 + (*ID)->blksize_in_sects = md1->block_size >>
11721 + EVMS_VSECTOR_SIZE_SHIFT;
11722 + (*ID)->remap_root = NULL;
11723 + (*ID)->lba_table1 = lba_table1;
11724 + (*ID)->lba_table2 = lba_table2;
11725 + (*ID)->bbr_table = table1;
11726 + (*ID)->nr_sects_bbr_table = nr_sects;
11727 + if ( nr_sects < md1->nr_sects_bbr_table ) {
11728 + LOG_WARNING("Making BBR node read-only\n");
11729 + (*ID)->flag |= EVMS_VOLUME_READ_ONLY;
11731 + (*ID)->nr_replacement_blks = nr_sects *
11732 + EVMS_BBR_ENTRIES_PER_SECT;
11733 + (*ID)->start_replacement_sect = md1->start_replacement_sect;
11734 + (*ID)->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
11735 + (*ID)->bbr_id_lock = SPIN_LOCK_UNLOCKED;
11736 + if ( !bbr_remap_pool || !bbr_io_buf_pool ) {
11737 + rc = bbr_create_pools();
11740 + atomic_set(&(*ID)->in_use_replacement_blks,
11741 + bbr_table_to_remap_list(*ID));
11749 + if (!bbr_io_thread) {
11750 + const char * name = "evms_bbr_io";
11751 + bbr_io_thread = evms_cs_register_thread(bbr_io_handler,
11753 + if (!bbr_io_thread) {
11759 + /* If error, free table1. */
11765 + (*ID)->bbr_table = NULL;
11766 + bbr_free_private(*ID);
11771 + /* Will never use md1, md2 and table2 again */
11786 + * bbr_binary_tree_insert
11788 + * Insert a node into the binary tree.
11790 +void bbr_binary_tree_insert(struct bbr_runtime_remap ** root,
11791 + struct bbr_runtime_remap * newnode)
11793 + struct bbr_runtime_remap ** node = root;
11794 + while (node && *node) {
11795 + if ( newnode->remap.bad_sect > (*node)->remap.bad_sect ) {
11796 + node = &((*node)->right);
11798 + node = &((*node)->left);
11802 + newnode->left = newnode->right = NULL;
11807 + * bbr_binary_search
11809 + * Search for a node that contains bad_sect = lsn.
11811 +struct bbr_runtime_remap * bbr_binary_search(struct bbr_runtime_remap * root,
11814 + struct bbr_runtime_remap * node = root;
11816 + if (node->remap.bad_sect == lsn) {
11819 + if ( lsn > node->remap.bad_sect ) {
11820 + node = node->right;
11822 + node = node->left;
11829 + * bbr_binary_tree_destroy
11831 + * Destroy the binary tree.
11833 +void bbr_binary_tree_destroy(struct bbr_runtime_remap * root,
11834 + struct bbr_private * bbr_id)
11836 + struct bbr_runtime_remap ** link = NULL;
11837 + struct bbr_runtime_remap * node = root;
11840 + if (node->left) {
11841 + link = &(node->left);
11842 + node = node->left;
11845 + if (node->right) {
11846 + link = &(node->right);
11847 + node = node->right;
11851 + mempool_free(node, bbr_remap_pool);
11852 + if (node == root) {
11853 + /* If root is deleted, we're done. */
11857 + /* Back to root. */
11863 +static void bbr_free_remap(struct bbr_private * bbr_id)
11865 + unsigned long flags;
11866 + spin_lock_irqsave(&bbr_id->bbr_id_lock, flags);
11867 + bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id);
11868 + bbr_id->remap_root = NULL;
11869 + spin_unlock_irqrestore(&bbr_id->bbr_id_lock, flags);
11873 + * bbr_insert_remap_entry
11875 + * Create a new remap entry and add it to the binary tree for this node.
11877 +static int bbr_insert_remap_entry(struct bbr_private * bbr_id,
11878 + struct evms_bbr_table_entry * new_bbr_entry)
11880 + struct bbr_runtime_remap * newnode = NULL;
11881 + unsigned long flags;
11884 + newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO);
11887 + LOG_SERIOUS("Could not allocate from remap pool! (rc=%d)\n", rc);
11890 + newnode->remap.bad_sect = new_bbr_entry->bad_sect;
11891 + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
11892 + spin_lock_irqsave(&bbr_id->bbr_id_lock, flags);
11893 + bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
11894 + spin_unlock_irqrestore(&bbr_id->bbr_id_lock, flags);
11899 + * bbr_table_to_remap_list
11901 + * The on-disk bbr table is sorted by the replacement sector LBA. In order to
11902 + * improve run time performance, the in memory remap list must be sorted by
11903 + * the bad sector LBA. This function is called at discovery time to initialize
11904 + * the remap list. This function assumes that at least one copy of meta data
11907 +static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id)
11909 + u32 in_use_blks = 0;
11911 + struct evms_bbr_table * p;
11914 + for ( i = 0, p = bbr_id->bbr_table;
11915 + i < bbr_id->nr_sects_bbr_table;
11917 + if (!p->in_use_cnt) {
11920 + in_use_blks += p->in_use_cnt;
11921 + for ( j = 0; j < p->in_use_cnt; j++ ) {
11922 + bbr_insert_remap_entry(bbr_id, &p->entries[j]);
11926 + return in_use_blks;
11930 + * bbr_search_remap_entry
11932 + * Search remap entry for the specified sector. If found, return a pointer to
11933 + * the table entry. Otherwise, return NULL.
11935 +static struct evms_bbr_table_entry * bbr_search_remap_entry(struct bbr_private * bbr_id,
11938 + struct bbr_runtime_remap * p;
11939 + unsigned long flags;
11941 + spin_lock_irqsave(&bbr_id->bbr_id_lock, flags);
11942 + p = bbr_binary_search(bbr_id->remap_root, lsn);
11943 + spin_unlock_irqrestore(&bbr_id->bbr_id_lock, flags);
11945 + return (&p->remap);
11954 + * If *lsn is in the remap table, return TRUE and modify *lsn,
11955 + * else, return FALSE.
11957 +static inline int bbr_remap(struct bbr_private * bbr_id,
11960 + struct evms_bbr_table_entry *e;
11962 + if ( atomic_read(&bbr_id->in_use_replacement_blks) &&
11963 + ! (bbr_id->flag & BBR_STOP_REMAP) ) {
11964 + e = bbr_search_remap_entry(bbr_id, *lsn);
11966 + *lsn = e->replacement_sect;
11967 + LOG_EXTRA("%s replacement sector (LSN="PFU64")\n",
11968 + __FUNCTION__, *lsn);
11976 + * bbr_remap_probe
11978 + * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
11979 + * table return TRUE, Else, return FALSE.
11981 +static inline int bbr_remap_probe(struct bbr_private * bbr_id,
11982 + u64 lsn, u64 nr_sects)
11986 + if ( atomic_read(&bbr_id->in_use_replacement_blks) &&
11987 + ! (bbr_id->flag & BBR_STOP_REMAP) ) {
11988 + for ( cnt = 0, tmp = lsn;
11990 + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
11991 + if ( bbr_remap(bbr_id,&tmp) ) {
11999 +static void *bbr_slab_pool_alloc(int gfp_mask, void * data)
12001 + return kmem_cache_alloc(data, gfp_mask);
12004 +static void bbr_slab_pool_free(void *ptr, void * data)
12006 + kmem_cache_free(data, ptr);
12009 +static int bbr_create_pools(void)
12011 + /* Create a memory pool for the remap list. */
12012 + if (!bbr_remap_slab) {
12013 + bbr_remap_slab = kmem_cache_create("BBR_Remap_Slab",
12014 + sizeof(struct bbr_runtime_remap),
12015 + 0, SLAB_HWCACHE_ALIGN,
12017 + if (!bbr_remap_slab) {
12018 + panic("Unable to create BBR remap cache.");
12021 + if (!bbr_remap_pool) {
12022 + bbr_remap_pool = mempool_create(64, bbr_slab_pool_alloc,
12023 + bbr_slab_pool_free,
12025 + if (!bbr_remap_pool) {
12026 + panic("Unable to create BBR remap pool.");
12030 + /* Create a memory pool for the BBR I/O anchors. */
12031 + if (!bbr_io_buf_slab) {
12032 + bbr_io_buf_slab = kmem_cache_create("BBR_IO_Buf_Slab",
12033 + sizeof(struct bbr_io_buffer),
12034 + 0, SLAB_HWCACHE_ALIGN,
12036 + if (!bbr_io_buf_slab) {
12037 + panic("Unable to create BBR I/O buffer cache.");
12040 + if (!bbr_io_buf_pool) {
12041 + bbr_io_buf_pool = mempool_create(256, bbr_slab_pool_alloc,
12042 + bbr_slab_pool_free,
12043 + bbr_io_buf_slab);
12044 + if (!bbr_io_buf_pool) {
12045 + panic("Unable to create BBR I/O buffer pool.");
12052 +static void bbr_destroy_pools(void)
12054 + if (bbr_io_buf_pool) {
12055 + mempool_destroy(bbr_io_buf_pool);
12056 + bbr_io_buf_pool = NULL;
12058 + if (bbr_io_buf_slab) {
12059 + kmem_cache_destroy(bbr_io_buf_slab);
12060 + bbr_io_buf_slab = NULL;
12062 + if (bbr_remap_pool) {
12063 + mempool_destroy(bbr_remap_pool);
12064 + bbr_remap_pool = NULL;
12066 + if (bbr_remap_slab) {
12067 + kmem_cache_destroy(bbr_remap_slab);
12068 + bbr_remap_slab = NULL;
12075 + * Search through the discover list looking for object with BBR metadata.
12076 + * Remove them from the list and replace with a new BBR node.
12078 +static int bbr_discover(struct evms_logical_node ** discover_list)
12080 + struct evms_logical_node * node, * next_node;
12081 + struct evms_logical_node * bbr_node = NULL;
12082 + struct bbr_private * bbr_id;
12083 + int bad_blocks, rc = 0;
12085 + MOD_INC_USE_COUNT;
12087 + next_node = *discover_list;
12088 + while (next_node) {
12089 + node = next_node;
12090 + next_node = node->next;
12092 + /* The node must have a BBR feature-header. */
12093 + if ( ! node->feature_header ||
12094 + node->feature_header->feature_id != plugin_header.id ) {
12098 + rc = load_feature_data(node, &bbr_id);
12100 + /* Error loading feature data.
12101 + * This node belongs to us, but metadata is invalid,
12102 + * - remove it from the discovery list
12104 + * - clear error code then continue.
12105 + * Will consider creating a read only BBR node in
12108 + LOG_SERIOUS("Error in node (%s) with "PFU64" sectors.\n",
12109 + node->name, node->total_vsectors);
12110 + evms_cs_remove_logical_node_from_list(discover_list,
12117 + rc = evms_cs_allocate_logical_node(&bbr_node);
12119 + LOG_SERIOUS("Could not allocate logical node! rc=%d\n", rc);
12120 + bbr_free_private(bbr_id);
12124 + MOD_INC_USE_COUNT;
12125 + bbr_node->volume_info = node->volume_info;
12126 + bbr_node->flags |= node->flags;
12127 + bbr_node->plugin = &plugin_header;
12128 + strcpy(bbr_node->name,
12129 + node->feature_header->object_name);
12130 + bbr_node->hardsector_size = node->hardsector_size;
12131 + bbr_node->total_vsectors = node->total_vsectors - 2 -
12132 + node->feature_header->feature_data1_size -
12133 + node->feature_header->feature_data2_size;
12134 + bbr_node->block_size = node->block_size;
12135 + bbr_node->private = bbr_id;
12136 + bbr_id->node = bbr_node;
12138 + /* Free the feature header */
12139 + kfree(node->feature_header);
12140 + node->feature_header = NULL;
12141 + evms_cs_remove_logical_node_from_list(discover_list, node);
12143 + /* If bad blocks exist, give warning */
12144 + bad_blocks = atomic_read(&bbr_id->in_use_replacement_blks);
12145 + if (bad_blocks) {
12146 + BBR_DEBUG_PRINT_REMAP_LIST(bbr_id);
12147 + LOG_WARNING("%s has %d bad blocks.\n",
12148 + bbr_id->source->name, bad_blocks);
12149 + LOG_WARNING("There are "PFU64" total replacement blocks.\n",
12150 + bbr_id->nr_replacement_blks);
12151 + LOG_WARNING("There are "PFU64" remaining replacement blocks.\n",
12152 + bbr_id->nr_replacement_blks -
12156 + evms_cs_add_logical_node_to_list(discover_list, bbr_node);
12157 + bbr_list_add(bbr_id);
12160 + MOD_DEC_USE_COUNT;
12164 +static inline void bbr_list_add(struct bbr_private * bbr_id)
12166 + bbr_id->next = bbr_instances;
12167 + bbr_instances = bbr_id;
12170 +static void bbr_list_remove(struct bbr_private * bbr_id)
12172 + struct bbr_private ** p;
12174 + for ( p = &bbr_instances; *p; p = &(*p)->next ) {
12175 + if ( *p == bbr_id ) {
12182 +static struct bbr_private * bbr_find_private(char * object_name)
12184 + struct bbr_private * p;
12186 + for ( p = bbr_instances; p; p = p->next ) {
12187 + if ( ! strncmp(p->node->name, object_name,
12188 + EVMS_VOLUME_NAME_SIZE) ) {
12195 +static void bbr_free_private(struct bbr_private * bbr_id)
12197 + if (bbr_id->remap_root) {
12198 + bbr_free_remap(bbr_id);
12200 + if (bbr_id->bbr_table) {
12201 + kfree(bbr_id->bbr_table);
12203 + bbr_list_remove(bbr_id);
12210 + * Delete the specified BBR node and the node it is built on. If the last BBR
12211 + * node is deleted, shut down the I/O thread.
12213 +static int bbr_delete(struct evms_logical_node * bbr_node)
12215 + struct bbr_private * bbr_id;
12218 + bbr_id = bbr_node->private;
12220 + rc = DELETE(bbr_id->source);
12222 + /* Now cleanup and go away */
12223 + bbr_free_private(bbr_id);
12224 + evms_cs_deallocate_logical_node(bbr_node);
12225 + if (!bbr_instances) {
12226 + bbr_destroy_pools();
12227 + if (bbr_io_thread) {
12228 + evms_cs_unregister_thread(bbr_io_thread);
12229 + bbr_io_thread = NULL;
12232 + MOD_DEC_USE_COUNT;
12237 +static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id,
12238 + struct buffer_head * bh,
12241 + struct bbr_io_buffer * bbr_io_buf;
12243 + bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO);
12244 + if (bbr_io_buf) {
12245 + memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer));
12246 + INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list);
12247 + bbr_io_buf->bbr_id = bbr_id;
12248 + bbr_io_buf->bh = bh;
12249 + bbr_io_buf->rw = rw;
12251 + LOG_WARNING("Could not allocate from BBR I/O buffer pool!\n");
12253 + return bbr_io_buf;
12256 +static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf)
12258 + mempool_free(bbr_io_buf, bbr_io_buf_pool);
12262 + * bbr_io_remap_error
12263 + * @bbr_id: Private data for the BBR node.
12264 + * @rw: READ or WRITE.
12265 + * @starting_lsn: Starting sector of request to remap.
12266 + * @count: Number of sectors in the request.
12267 + * @buffer: Data buffer for the request.
12269 + * For the requested range, try to write each sector individually. For each
12270 + * sector that fails, find the next available remap location and write the
12271 + * data to that new location. Then update the table and write both copies
12272 + * of the table to disk. Finally, update the in-memory mapping and do any
12273 + * other necessary bookkeeping.
12275 +static int bbr_io_remap_error(struct bbr_private * bbr_id,
12277 + u64 starting_lsn,
12281 + struct evms_bbr_table * bbr_table;
12282 + unsigned long table_sector_index;
12283 + unsigned long table_sector_offset;
12284 + unsigned long index;
12285 + u64 lsn, new_lsn;
12288 + if ( rw == READ ) {
12289 + /* Nothing can be done about read errors. */
12293 + /* For each sector in the request. */
12294 + for ( lsn = 0; lsn < count; lsn++, buffer += EVMS_VSECTOR_SIZE ) {
12295 + rc = INIT_IO(bbr_id->source, rw, starting_lsn + lsn, 1, buffer);
12297 + if ( bbr_id->flag & BBR_STOP_REMAP ) {
12298 + /* Can't allow new remaps if the
12299 + * engine told us to stop.
12301 + LOG_ERROR("Object %s: Bad sector ("PFU64"), but remapping is turned off.\n",
12302 + bbr_id->node->name, starting_lsn+lsn);
12306 + /* Find the next available relocation sector. */
12307 + new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
12308 + if ( new_lsn >= bbr_id->nr_replacement_blks ) {
12309 + /* No more replacement sectors available. */
12312 + new_lsn += bbr_id->start_replacement_sect;
12314 + /* Write the data to its new location. */
12315 + LOG_WARNING("Object %s: Trying to remap bad sector ("PFU64") to sector ("PFU64")\n",
12316 + bbr_id->node->name, starting_lsn + lsn,
12318 + rc = INIT_IO(bbr_id->source, rw, new_lsn, 1, buffer);
12320 + /* This replacement sector is bad.
12321 + * Try the next one.
12323 + LOG_ERROR("Object %s: Replacement sector ("PFU64") is bad. Skipping.\n",
12324 + bbr_id->node->name, new_lsn);
12325 + atomic_inc(&bbr_id->in_use_replacement_blks);
12329 + /* Add this new entry to the on-disk table. */
12330 + table_sector_index = new_lsn -
12331 + bbr_id->start_replacement_sect;
12332 + table_sector_offset = table_sector_index /
12333 + EVMS_BBR_ENTRIES_PER_SECT;
12334 + index = table_sector_index % EVMS_BBR_ENTRIES_PER_SECT;
12336 + bbr_table = &bbr_id->bbr_table[table_sector_offset];
12337 + bbr_table->entries[index].bad_sect = starting_lsn + lsn;
12338 + bbr_table->entries[index].replacement_sect = new_lsn;
12339 + bbr_table->in_use_cnt++;
12340 + bbr_table->sequence_number++;
12341 + bbr_table->crc = 0;
12342 + bbr_table->crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC,
12344 + sizeof(struct evms_bbr_table));
12346 + /* Write the table to disk. */
12347 + cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
12348 + if ( bbr_id->lba_table1 ) {
12349 + rc = INIT_IO(bbr_id->source, WRITE,
12350 + bbr_id->lba_table1 +
12351 + table_sector_offset,
12354 + if ( bbr_id->lba_table2 ) {
12355 + rc |= INIT_IO(bbr_id->source, WRITE,
12356 + bbr_id->lba_table2 +
12357 + table_sector_offset,
12360 + le_bbr_table_sector_to_cpu(bbr_table);
12363 + /* Error writing one of the tables to disk. */
12364 + LOG_ERROR("Object %s: Error updating BBR tables on disk.\n",
12365 + bbr_id->node->name);
12369 + /* Insert a new entry in the remapping binary-tree. */
12370 + rc = bbr_insert_remap_entry(bbr_id,
12371 + &bbr_table->entries[index]);
12373 + LOG_ERROR("Object %s: Error adding new entry to remap tree.\n",
12374 + bbr_id->node->name);
12378 + atomic_inc(&bbr_id->in_use_replacement_blks);
12386 + * bbr_io_process_request
12388 + * For each sector in this request, check if the sector has already
12389 + * been remapped. If so, process all previous sectors in the request,
12390 + * followed by the remapped sector. Then reset the starting lsn and
12391 + * count, and keep going with the rest of the request as if it were
12392 + * a whole new request. If any of the INIT_IO's return an error,
12393 + * call the remapper to relocate the bad sector(s).
12395 +static int bbr_io_process_request(struct bbr_io_buffer * bbr_io_buf)
12397 + struct bbr_private * bbr_id = bbr_io_buf->bbr_id;
12398 + u64 starting_lsn = bbr_io_buf->bh->b_rsector;
12399 + u64 count = bbr_io_buf->bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
12400 + u64 lsn, remapped_lsn;
12401 + char * buffer = bbr_io_buf->bh->b_data;
12402 + int rc = 0, rw = bbr_io_buf->rw;
12404 + /* For each sector in this request, check if this sector has already
12405 + * been remapped. If so, process all previous sectors in this request,
12406 + * followed by the remapped sector. Then reset the starting lsn and
12407 + * count and keep going with the rest of the request as if it were
12408 + * a whole new request.
12410 + for ( lsn = 0; lsn < count && !(bbr_id->flag & BBR_STOP_REMAP); lsn++ ) {
12411 + remapped_lsn = starting_lsn + lsn;
12412 + rc = bbr_remap(bbr_id, &remapped_lsn);
12414 + /* This sector is fine. */
12418 + /* Process all sectors in the request up to this one. */
12420 + rc = INIT_IO(bbr_id->source, rw,
12421 + starting_lsn, lsn, buffer);
12423 + /* If this I/O failed, then one of the sectors
12424 + * in this request needs to be relocated.
12426 + rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
12432 + buffer += (lsn << EVMS_VSECTOR_SIZE_SHIFT);
12435 + /* Process the remapped sector. */
12436 + rc = INIT_IO(bbr_id->source, rw, remapped_lsn, 1, buffer);
12438 + /* BUGBUG - Need more processing if this caused an
12439 +			 * error. If this I/O failed, then the existing
12440 + * remap is now bad, and we need to find a new remap.
12441 + * Can't use bbr_io_remap_error(), because the existing
12442 + * map entry needs to be changed, not added again, and
12443 + * the original table entry also needs to be changed.
12448 + buffer += EVMS_VSECTOR_SIZE;
12449 + starting_lsn += (lsn + 1);
12450 + count -= (lsn + 1);
12454 + /* Check for any remaining sectors after the last split. This could
12455 + * potentially be the whole request, but that should be a rare case
12456 + * because requests should only be processed by the thread if we know
12457 + * an error occurred or they contained one or more remapped sectors.
12460 + rc = INIT_IO(bbr_id->source, rw, starting_lsn, count, buffer);
12462 + /* If this I/O failed, then one of the sectors in this
12463 + * request needs to be relocated.
12465 + rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
12479 + * This is the handler for the bbr_io_thread. It continuously loops,
12480 + * taking I/O requests off its list and processing them. If nothing
12481 + * is on the list, the thread goes back to sleep until specifically
12484 + * I/O requests should only be sent to this thread if we know that:
12485 + * a) the request contains at least one remapped sector.
12487 + * b) the request caused an error on the normal I/O path.
12488 + * This function uses synchronous I/O, so sending a request to this
12489 + * thread that doesn't need special processing will cause severe
12490 + * performance degradation.
12492 +static void bbr_io_handler(void * void_data)
12494 + struct bbr_io_buffer * bbr_io_buf;
12495 + struct buffer_head * bh;
12496 + unsigned long flags;
12500 + /* Process bbr_io_list, one entry at a time. */
12501 + spin_lock_irqsave(&bbr_io_list_lock, flags);
12502 + if (list_empty(&bbr_io_list)) {
12503 + /* No more items on the list. */
12504 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
12507 + bbr_io_buf = list_entry(bbr_io_list.next,
12508 + struct bbr_io_buffer, bbr_io_list);
12509 + list_del(&bbr_io_buf->bbr_io_list);
12510 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
12512 + rc = bbr_io_process_request(bbr_io_buf);
12514 + /* Clean up and complete the original I/O. */
12515 + bh = bbr_io_buf->bh;
12516 + if (bh->b_end_io) {
12517 + free_bbr_io_buf(bbr_io_buf);
12518 + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL);
12519 + bh->b_end_io(bh, rc ? 0 : 1);
12521 + /* A request that originated from bbr_init_io. */
12522 + bbr_io_buf->rc = rc;
12523 + complete(bbr_io_buf->complete);
12529 + * bbr_schedule_io
12531 + * Place the specified bbr_io_buf on the thread's processing list.
12533 +static void bbr_schedule_io(struct bbr_io_buffer * bbr_io_buf)
12535 + unsigned long flags;
12537 + spin_lock_irqsave(&bbr_io_list_lock, flags);
12538 + list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list);
12539 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
12540 + evms_cs_wakeup_thread(bbr_io_thread);
12546 + * If there are any remapped sectors on this object, send this request over
12547 + * to the thread for processing. Otherwise send it down the stack normally.
12549 +static void bbr_read(struct evms_logical_node * bbr_node,
12550 + struct buffer_head * bh )
12552 + struct bbr_private * bbr_id = bbr_node->private;
12553 + struct bbr_io_buffer * bbr_io_buf;
12555 + if ( bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
12556 + bbr_node->total_vsectors ) {
12557 + /* Request is off the end of the object. */
12558 + bh->b_end_io(bh, 0);
12562 + if ( atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
12563 + bbr_id->flag & BBR_STOP_REMAP ||
12564 + ! bbr_remap_probe(bbr_id, bh->b_rsector,
12565 + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) ) {
12566 + /* No existing remaps, this request doesn't contain any
12567 + * remapped sectors, or the engine told us not to remap.
12569 + R_IO(bbr_id->source, bh);
12573 + /* This request has at least one remapped sector. */
12574 + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ);
12575 + if (!bbr_io_buf) {
12576 + /* Can't get memory to track the I/O. */
12577 + bh->b_end_io(bh, 0);
12581 + evms_cs_volume_request_in_progress(bbr_io_buf->bh->b_rdev, +1, NULL);
12582 + bbr_schedule_io(bbr_io_buf);
12586 + * bbr_write_callback
12588 + * This is the callback for normal write requests. Check for an error
12589 + * during the I/O, and send to the thread for processing if necessary.
12591 +static void bbr_write_callback(struct buffer_head * bh,
12594 + struct bbr_io_buffer * bbr_io_buf = bh->b_private;
12596 + bh->b_end_io = bbr_io_buf->org_end_io;
12597 + bh->b_private = bbr_io_buf->org_private;
12598 + bh->b_rsector = bbr_io_buf->org_rsector;
12599 + bh->b_rdev = bbr_io_buf->org_dev;
12601 + if (!(bbr_io_buf->bbr_id->flag & BBR_STOP_REMAP) &&
12603 + LOG_ERROR("Object %s: Write failure on sector ("PFU64"). Scheduling for retry.\n",
12604 + bbr_io_buf->bbr_id->node->name, (u64)bbr_io_buf->bh->b_rsector);
12605 + bbr_schedule_io(bbr_io_buf);
12607 + free_bbr_io_buf(bbr_io_buf);
12608 + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL);
12609 + bh->b_end_io(bh, uptodate);
12616 + * If there are any remapped sectors on this object, send the request over
12617 + * to the thread for processing. Otherwise, register for callback
12618 + * notification, and send the request down normally.
12620 +static void bbr_write(struct evms_logical_node * bbr_node,
12621 + struct buffer_head * bh)
12623 + struct bbr_private * bbr_id = bbr_node->private;
12624 + struct bbr_io_buffer * bbr_io_buf;
12626 + if ( bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
12627 + bbr_node->total_vsectors ||
12628 + bbr_id->flag & EVMS_VOLUME_READ_ONLY ) {
12629 + /* Request is off the end of the object, or this
12630 + * is a read-only object.
12632 + bh->b_end_io(bh, 0);
12636 + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE);
12637 + if (!bbr_io_buf) {
12638 + /* Can't get memory to track the I/O. */
12639 + bh->b_end_io(bh, 0);
12643 + evms_cs_volume_request_in_progress(bh->b_rdev, +1, NULL);
12645 + if ( atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
12646 + bbr_id->flag & BBR_STOP_REMAP ||
12647 + ! bbr_remap_probe(bbr_id, bh->b_rsector,
12648 + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) ) {
12649 + /* No existing remaps, this request contains no remapped
12650 + * sectors, or the engine said to stop remapping.
12652 + bbr_io_buf->org_end_io = bh->b_end_io;
12653 + bbr_io_buf->org_private = bh->b_private;
12654 + bbr_io_buf->org_rsector = bh->b_rsector;
12655 + bbr_io_buf->org_dev = bh->b_rdev;
12656 + bh->b_end_io = bbr_write_callback;
12657 + bh->b_private = bbr_io_buf;
12658 + W_IO(bbr_id->source, bh);
12660 + /* This request contains at least one remapped sector. */
12661 + bbr_schedule_io(bbr_io_buf);
12666 + * bbr_init_io_schedule_io
12667 + * @bbr_id: Private data for the BBR node.
12668 + * @rw: READ or WRITE.
12669 + * @lsn: Starting sector for the request.
12670 + * @count: Number of sectors in the request.
12671 + * @buffer: Data buffer for the request.
12673 + * During init_io, failures must still be handled by the I/O thread. Create
12674 + * a bbr_io_buf, and schedule it to be handled by the thread. Then wait until
12675 + * the request is complete.
12677 +static int bbr_init_io_schedule_io(struct bbr_private * bbr_id,
12683 + struct bbr_io_buffer * bbr_io_buf;
12684 + struct buffer_head bh;
12685 + struct completion complete;
12688 + if ( rw != WRITE ) {
12689 + /* Nothing can be done about read failures. */
12693 + LOG_ERROR("Object %s: init_io write failure (sector "PFU64": count "PFU64"). Scheduling for retry.\n",
12694 + bbr_id->node->name, lsn, count);
12695 + bbr_io_buf = allocate_bbr_io_buf(bbr_id, &bh, rw);
12696 + if (!bbr_io_buf) {
12700 + memset(&bh, 0, sizeof(struct buffer_head));
12701 + init_waitqueue_head(&bh.b_wait);
12702 + bh.b_rsector = lsn;
12703 + bh.b_size = count << EVMS_VSECTOR_SIZE_SHIFT;
12704 + bh.b_data = buffer;
12705 + bh.b_end_io = NULL;
12707 + /* Schedule the I/O and wait for it to finish. */
12708 + bbr_io_buf->complete = &complete;
12709 + init_completion(bbr_io_buf->complete);
12710 + bbr_schedule_io(bbr_io_buf);
12711 + wait_for_completion(bbr_io_buf->complete);
12713 + rc = bbr_io_buf->rc;
12714 + free_bbr_io_buf(bbr_io_buf);
12721 + * @bbr_node: BBR node.
12722 + * @rw: READ or WRITE.
12723 + * @lsn: Starting sector for I/O request.
12724 + * @count: Number of sectors in the I/O request.
12725 + * @buffer: Data buffer for the I/O request.
12727 + * Synchronous I/O requests.
12729 +static int bbr_init_io(struct evms_logical_node * bbr_node,
12735 + struct bbr_private * bbr_id = bbr_node->private;
12739 + if ( start_lsn + count > bbr_node->total_vsectors ) {
12740 + /* Request is off the end of the object. */
12744 + if ( rw == WRITE && (bbr_id->flag & EVMS_VOLUME_READ_ONLY) ) {
12745 + /* Can't write to a read-only object. */
12749 + if ( bbr_id->flag & BBR_STOP_REMAP ||
12750 + atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
12751 + ! bbr_remap_probe(bbr_id, start_lsn, count) ) {
12752 + /* Normal case (no existing remaps). */
12753 + rc = INIT_IO(bbr_id->source, rw, start_lsn, count, buffer);
12754 + if (rc && ! (bbr_id->flag & BBR_STOP_REMAP) ) {
12755 + /* Init_io error. Send request over to
12756 + * thread for further processing.
12758 + rc = bbr_init_io_schedule_io(bbr_id, rw, start_lsn,
12762 + /* At least one sector in this request needs to be remapped.
12763 + * Test and send each one down individually.
12765 + for ( lsn = start_lsn;
12766 + lsn < start_lsn + count;
12767 + lsn++, buffer += EVMS_VSECTOR_SIZE ) {
12768 + bbr_remap(bbr_id, &lsn);
12769 + rc = INIT_IO(bbr_id->source, rw, lsn, 1, buffer);
12771 + /* Init_io error. Send request
12772 + * to thread for processing.
12774 + rc = bbr_init_io_schedule_io(bbr_id, rw,
12787 + * bbr_direct_ioctl_sector_io
12789 + * Process an I/O from the engine on an active BBR object.
12791 +static int bbr_direct_ioctl_sector_io(struct bbr_private * bbr_id,
12792 + struct evms_notify_bbr * notify)
12794 + char * buffer, * user_buffer;
12798 + buffer = kmalloc(EVMS_VSECTOR_SIZE, GFP_NOIO);
12803 + user_buffer = (char*)notify->buffer;
12806 + lsn < notify->nr_sect;
12807 + lsn++, user_buffer += EVMS_VSECTOR_SIZE ) {
12808 + if ( notify->rw == WRITE ) {
12809 + if ( copy_from_user(buffer, user_buffer,
12810 + EVMS_VSECTOR_SIZE) ) {
12816 + rc = bbr_init_io(bbr_id->node, notify->rw,
12817 + notify->start_sect + lsn, 1, buffer);
12822 + if ( notify->rw == READ ) {
12823 + if ( copy_to_user(user_buffer, buffer,
12824 + EVMS_VSECTOR_SIZE) ) {
12836 + * bbr_direct_ioctl
12840 + * @arg: Pointer to an evms_plugin_ioctl_pkt.
12842 + * BBR-specific ioctls from the engine. Currently handles:
12843 + * BBR_STOP_REMAP_CMD
12844 + * BBR_GET_INFO_CMD
12845 + * BBR_SECTOR_IO_CMD
12847 +static int bbr_direct_ioctl(struct inode * inode,
12848 + struct file * file,
12849 + unsigned int cmd,
12850 + unsigned long arg)
12853 + struct bbr_private * bbr_id;
12854 + struct evms_plugin_ioctl_pkt pkt, * user_pkt;
12855 + struct evms_notify_bbr notify, * user_notify;
12857 + MOD_INC_USE_COUNT;
12859 + user_pkt = (struct evms_plugin_ioctl_pkt *)arg;
12860 + if ( copy_from_user(&pkt, user_pkt, sizeof(pkt)) ) {
12861 + MOD_DEC_USE_COUNT;
12865 + if ( pkt.feature_id != plugin_header.id ) {
12866 + MOD_DEC_USE_COUNT;
12870 + user_notify = (struct evms_notify_bbr *)pkt.feature_ioctl_data;
12871 + if ( copy_from_user(¬ify, user_notify, sizeof(notify)) ) {
12874 + bbr_id = bbr_find_private(notify.object_name);
12879 + switch(pkt.feature_command) {
12881 + case BBR_STOP_REMAP_CMD:
12882 + bbr_id->flag |= BBR_STOP_REMAP;
12883 + /* Fall through. */
12885 + case BBR_GET_INFO_CMD:
12886 + notify.count = atomic_read(&bbr_id->in_use_replacement_blks);
12887 + if ( copy_to_user(&user_notify->count,
12889 + sizeof(user_notify->count))) {
12894 + case BBR_SECTOR_IO_CMD:
12895 + rc = bbr_direct_ioctl_sector_io(bbr_id,
12906 + copy_to_user(user_pkt, &pkt, sizeof(pkt));
12907 + MOD_DEC_USE_COUNT;
12913 + * @bbr_node: BBR node.
12916 + * @cmd: ioctl command to process.
12917 + * @arg: ioctl-specific data pointer.
12919 + * IOCTL handler. Currently BBR handles plugin-specific ioctls, as well as
12920 + * EVMS_GET_BMAP. All others are passed to the child node.
12922 +static int bbr_ioctl (struct evms_logical_node * bbr_node,
12923 + struct inode * inode,
12924 + struct file * file,
12925 + unsigned int cmd,
12926 + unsigned long arg)
12928 + struct bbr_private * bbr_id = bbr_node->private;
12929 + struct evms_get_bmap_pkt * bmap;
12933 + case EVMS_PLUGIN_IOCTL:
12934 + rc = bbr_direct_ioctl(inode, file, cmd, arg);
12937 + case EVMS_GET_BMAP:
12938 + bmap = (struct evms_get_bmap_pkt *)arg;
12939 + bbr_remap(bbr_id, &bmap->rsector);
12943 + rc = IOCTL(bbr_id->source, inode, file, cmd, arg);
12948 +static int __init bbr_init(void)
12950 + return evms_cs_register_plugin(&plugin_header);
12953 +static void __exit bbr_exit(void)
12955 + evms_cs_unregister_plugin(&plugin_header);
12958 +module_init(bbr_init);
12959 +module_exit(bbr_exit);
12960 +#ifdef MODULE_LICENSE
12961 +MODULE_LICENSE("GPL");
12964 diff -Naur linux-2002-09-30/drivers/evms/evms_drivelink.c evms-2002-09-30/drivers/evms/evms_drivelink.c
12965 --- linux-2002-09-30/drivers/evms/evms_drivelink.c Wed Dec 31 18:00:00 1969
12966 +++ evms-2002-09-30/drivers/evms/evms_drivelink.c Fri Sep 13 16:09:55 2002
12968 +/* -*- linux-c -*-
12971 + * Copyright (c) International Business Machines Corp., 2000
12973 + * This program is free software; you can redistribute it and/or modify
12974 + * it under the terms of the GNU General Public License as published by
12975 + * the Free Software Foundation; either version 2 of the License, or
12976 + * (at your option) any later version.
12978 + * This program is distributed in the hope that it will be useful,
12979 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
12980 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12981 + * the GNU General Public License for more details.
12983 + * You should have received a copy of the GNU General Public License
12984 + * along with this program; if not, write to the Free Software
12985 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
12990 + * linux/drivers/evms/drvlink.c
12993 + * EVMS Drive Linking Feature.
12995 + * This feature provides the ability to link multiple storage objects
12996 + * together as a single virtual storage object.
13000 +#include <linux/module.h>
13001 +#include <linux/kernel.h>
13002 +#include <linux/config.h>
13003 +#include <linux/genhd.h>
13004 +#include <linux/blk.h>
13005 +#include <linux/evms/evms.h>
13006 +#include <linux/evms/evms_drivelink.h>
13007 +#include <asm/uaccess.h>
13009 +#define LOG_PREFIX "drivelink: "
13011 +/* prototypes for mandatory plugin interface functions */
13012 +static int drivelink_discover(struct evms_logical_node **);
13013 +static int drivelink_delete(struct evms_logical_node *);
13014 +static void drivelink_read(struct evms_logical_node *, struct buffer_head *);
13015 +static void drivelink_write(struct evms_logical_node *, struct buffer_head *);
13016 +static int drivelink_ioctl(struct evms_logical_node *,
13018 + struct file *, unsigned int, unsigned long);
13019 +static int drivelink_init_io(struct evms_logical_node *,
13020 + int, u64, u64, void *);
13022 +/* plugin function table definition */
13023 +static struct evms_plugin_fops fops = {
13024 + .discover = drivelink_discover,
13025 + .delete = drivelink_delete,
13026 + .read = drivelink_read,
13027 + .write = drivelink_write,
13028 + .init_io = drivelink_init_io,
13029 + .ioctl = drivelink_ioctl
13032 +/* plugin header definition */
13033 +static struct evms_plugin_header plugin_header = {
13034 + .id = SetPluginID(IBM_OEM_ID,
13036 + EVMS_DRIVELINK_FEATURE_ID),
13042 + .required_services_version = {
13050 +/********************************************************/
13051 +/* Required Plugin Function Table Entry Point: */
13052 +/* Discover function & Support routines */
13053 +/********************************************************/
13056 + * le_feature_data_to_cpu:
13057 + * @md: drivelink metadata
13059 + * convert feature data from on-disk (Little Endian) format
13060 + * to the native cpu endian format.
13063 +le_feature_data_to_cpu(struct evms_drivelink_metadata *md)
13067 + md->signature = le32_to_cpup(&md->signature);
13068 + md->crc = le32_to_cpup(&md->crc);
13069 + md->version.major = le32_to_cpup(&md->version.major);
13070 + md->version.minor = le32_to_cpup(&md->version.minor);
13071 + md->version.patchlevel = le32_to_cpup(&md->version.patchlevel);
13072 + md->flags = le32_to_cpup(&md->flags);
13073 + md->sequence_number = le64_to_cpup(&md->sequence_number);
13074 + md->child_serial_number = le64_to_cpup(&md->child_serial_number);
13075 + md->parent_serial_number = le64_to_cpup(&md->parent_serial_number);
13076 + md->child_count = le64_to_cpup(&md->child_count);
13077 + for (i = 0; i < EVMS_DRIVELINK_MAX_ENTRIES; i++) {
13078 + struct evms_dl_ordering_table_entry *child_entry;
13080 + child_entry = &md->ordering_table[i];
13081 + child_entry->child_serial_number =
13082 + le64_to_cpup(&child_entry->child_serial_number);
13083 + child_entry->child_vsize =
13084 + le64_to_cpup(&child_entry->child_vsize);
13089 + * load_feature_data: load a feature header from disk
13090 + * @node: storage object
13091 + * @md: ptr to drivelink metadata
13093 + * loads and verifies redundant copies of drivelink metadata. @md is modified
13094 + * and returned to the caller.
13096 + * Return value: 0 on success
13097 + * Otherwise error code
13100 +load_feature_data(struct evms_logical_node *node,
13101 + struct evms_drivelink_metadata **md)
13103 + int i, rc = 0, rc_array[2] = { 0, 0 }, size_in_bytes;
13104 + u64 real_metadata_size, feature_data_size;
13105 + u64 starting_sector;
13106 + struct evms_drivelink_metadata *cur_md, *md1, *md2 = NULL;
13107 + char *location_name;
13109 + /* verify the feature metadata size from the */
13110 + /* feature header agrees with the real size */
13111 + /* of the current metadata structure. */
13112 + real_metadata_size = evms_cs_size_in_vsectors(sizeof (**md));
13114 + /* allocate a buffer large enough to hold all */
13115 + /* sectors containing the feature's metadata */
13116 + size_in_bytes = real_metadata_size * EVMS_VSECTOR_SIZE;
13117 + md1 = kmalloc(size_in_bytes, GFP_KERNEL);
13119 + md2 = kmalloc(size_in_bytes, GFP_KERNEL);
13128 + for (i = 0; i < 2; i++) {
13130 + starting_sector =
13131 + node->feature_header->
13132 + feature_data1_start_lsn;
13133 + feature_data_size =
13134 + node->feature_header->feature_data1_size;
13136 + location_name = evms_primary_string;
13138 + starting_sector =
13139 + node->feature_header->
13140 + feature_data2_start_lsn;
13141 + feature_data_size =
13142 + node->feature_header->feature_data2_size;
13144 + location_name = evms_secondary_string;
13146 + /* check that real metadata size matches the */
13147 + /* feature data size */
13148 + if (real_metadata_size != feature_data_size) {
13150 + ("%s feature data size("PFU64" bytes) doesn't match expected size("PFU64" bytes).\n",
13152 + feature_data_size <<
13153 + EVMS_VSECTOR_SIZE_SHIFT,
13154 + real_metadata_size <<
13155 + EVMS_VSECTOR_SIZE_SHIFT);
13157 + rc_array[i] = rc;
13160 + /* load the node's feature data */
13161 + rc = INIT_IO(node,
13164 + feature_data_size, cur_md);
13167 + ("error(%d) probing for %s feature data at sector("PFU64") on '%s'.\n",
13168 + rc, location_name, starting_sector,
13170 + rc_array[i] = rc;
13173 + /* check for valid metadata signature */
13174 + if (le32_to_cpup(&cur_md->signature) !=
13175 + EVMS_DRIVELINK_SIGNATURE) {
13178 + ("error(%d) invalid signature in %s feature data on '%s'\n",
13179 + rc, location_name, node->name);
13180 + rc_array[i] = rc;
13183 + /* validate feature data CRC */
13184 + if (cur_md->crc != EVMS_MAGIC_CRC) {
13185 + int org_crc, final_crc;
13186 + org_crc = le32_to_cpup(&cur_md->crc);
13189 + evms_cs_calculate_crc(EVMS_INITIAL_CRC,
13191 + sizeof (*cur_md));
13192 + if (final_crc != org_crc) {
13194 + ("CRC mismatch error [stored(%x), computed(%x)] in %s feature data on '%s'.\n",
13195 + org_crc, final_crc, location_name,
13198 + rc_array[i] = rc;
13203 + ("CRC disabled in %s feature data on '%s'.\n",
13204 + location_name, node->name);
13206 + /* convert feature data from on-disk
13207 + * format (Little Endian) to native
13208 + * cpu endian format.
13210 + le_feature_data_to_cpu(cur_md);
13211 + /* check for valid structure version */
13212 + rc = evms_cs_check_version(&metadata_ver,
13213 + &cur_md->version);
13216 + ("error(%d) obsolete version detected: actual(%d,%d,%d), requires(%d,%d,%d) in %s feature data on '%s'\n",
13217 + rc, cur_md->version.major,
13218 + cur_md->version.minor,
13219 + cur_md->version.patchlevel,
13220 + DRIVELINK_METADATA_MAJOR,
13221 + DRIVELINK_METADATA_MINOR,
13222 + DRIVELINK_METADATA_PATCHLEVEL,
13223 + location_name, node->name);
13224 + rc_array[i] = rc;
13227 + /* getting same return code for both copies? */
13228 + if (rc_array[0] == rc_array[1]) {
13229 + rc = rc_array[0];
13230 + /* if no errors on both copies,
13231 + * check the sequence numbers.
13232 + * use the highest sequence number.
13235 + /* compare sequence numbers */
13236 + if (md1->sequence_number ==
13237 + md2->sequence_number) {
13241 + ("sequence number mismatches between front("PFU64") and rear("PFU64") feature data copies on node(%s)!\n",
13242 + md2->sequence_number,
13243 + md1->sequence_number, node->name);
13244 + if (md1->sequence_number >
13245 + md2->sequence_number)
13250 + ("using %s feature data copy!\n",
13252 + md1) ? evms_primary_string :
13253 + evms_secondary_string);
13256 + /* getting different return codes for each copy */
13257 + } else if (rc_array[0] == 0) {
13258 +		/* use 1st (rear) copy if it's good */
13261 + } else if (rc_array[1] == 0) {
13262 +		/* use 2nd (front) copy if it's good */
13265 + } else if ((rc_array[0] == -EINVAL) || (rc_array[1] == -EINVAL)) {
13266 + /* fail if either give a fatal error */
13271 + /* deallocate metadata buffers appropriately */
13272 + if (rc || (cur_md == md1))
13274 + if (rc || (cur_md == md2))
13277 + /* save validated feature header pointer */
13285 + * find_parent_node_for_child_node: finds or creates a parent node for this child node
13286 + * @child_node: input, child node
13287 + * @md: input, on-disk metadata
13288 + * @parent_node: output, parent node
13289 + * @dl_private: output, runtime metadata
13290 + * @discover_list: input/output, list of objects being discovered
13292 + * finds or creates a parent node for the specified child node. if the parent node is
13293 + * created, create and initialize the parent's private data area.
13295 + * Return value: 0 on success
13296 + * Otherwise error code.
13299 +find_parent_node_for_child_node(struct evms_logical_node *child_node,
13300 + struct evms_drivelink_metadata *md,
13301 + struct evms_logical_node **parent_node,
13302 + struct runtime_data **dl_private,
13303 + struct evms_logical_node **discover_list)
13305 + int rc = 0, parent_found = FALSE;
13306 + struct evms_logical_node *parent = NULL;
13307 + struct runtime_data *rd = NULL;
13309 + /* find the parent node for this child */
13310 + for (parent = *discover_list; parent; parent = parent->next) {
13311 + /* only parent nodes will have null feature headers */
13312 + if (!parent->feature_header) {
13313 + rd = (struct runtime_data *) parent->private;
13314 + if (rd->parent_sn == md->parent_serial_number) {
13315 + parent_found = TRUE;
13320 + /* if no parent node found, create it */
13321 + if (parent_found == FALSE) {
13322 + rc = evms_cs_allocate_logical_node(&parent);
13324 + /* transpose info from child to parent */
13325 + parent->flags |= child_node->flags;
13326 + strcpy(parent->name,
13327 + child_node->feature_header->object_name);
13328 + /* copy evms system data to parent */
13329 + parent->volume_info = child_node->volume_info;
13330 + /* initialize the plugin id field */
13331 + parent->plugin = &plugin_header;
13332 + /* allocate parent's instance data */
13333 + parent->private = kmalloc(sizeof(*rd), GFP_KERNEL);
13334 + if (!parent->private)
13338 + /* initialize some instance data fields */
13339 + rd = (struct runtime_data *) parent->private;
13340 + rd->block_size = 0;
13341 + rd->parent_sn = md->parent_serial_number;
13342 + rd->child_count = md->child_count;
13343 + /* allocate the child table */
13344 + rd->child_table = kmalloc(sizeof(struct runtime_entry) *
13345 + rd->child_count, GFP_KERNEL);
13346 + if (!rd->child_table)
13350 + memset(rd->child_table, 0,
13351 + sizeof(struct runtime_entry) * rd->child_count);
13352 + /* add the parent node to the discover list */
13353 + rc = evms_cs_add_logical_node_to_list(discover_list,
13355 + MOD_INC_USE_COUNT;
13357 + /* if any errors encountered, try to clean up */
13359 + LOG_SERIOUS("find_parent_node: rc(%d) from '%s'\n",
13360 + rc, child_node->name);
13369 + *dl_private = rd;
13370 + *parent_node = parent;
13376 + * compute_child_index: compute the index for a specific child node
13377 + * @node: the child node
13378 + * @md: the drivelink on-disk metadata
13380 + * compute and return the 0-based index value of this child node's position
13381 + * in the parent node's ordering table.
13383 + * Return value: -1 on error
13384 + * otherwise the index of the specified child.
13387 +compute_child_index(struct evms_logical_node *node,
13388 + struct evms_drivelink_metadata *md)
13390 + int i, position = -1;
13392 + for (i = 0; i < md->child_count; i++) {
13393 + if (md->ordering_table[i].child_serial_number ==
13394 + md->child_serial_number) {
13399 + if (position == -1) {
13400 + LOG_SERIOUS("%s: child not found from '%s'\n",
13401 + __FUNCTION__, node->name);
13403 + return (position);
13407 + * process_child_nodes: perform the discovery operation on each child node
13408 + * @discover_list: the list of potential child objects
13410 + * search the discovery list of drivelink child nodes. for each node found,
13411 + * perform the discovery operation on it.
13413 + * Return value: 0 on success
13414 + * otherwise error code
13417 +process_child_nodes(struct evms_logical_node **discover_list)
13419 + int rc = 0, index = -1;
13420 + struct evms_logical_node *node, *next_node, *parent;
13421 + struct evms_drivelink_metadata *md;
13422 + struct runtime_data *rd;
13423 + struct runtime_entry *child_entry = NULL;
13425 + for (node = *discover_list; node; node = next_node) {
13426 + next_node = node->next;
13427 + if ((!node->feature_header) ||
13428 + (node->feature_header->feature_id != plugin_header.id)) {
13432 + rc = evms_cs_remove_logical_node_from_list(discover_list, node);
13435 + /* we need to load the feature data to */
13436 + /* find the parent's serial number this */
13437 + /* child node belongs to. */
13439 + rc = load_feature_data(node, &md);
13441 + /* find the parent node for this child */
13443 + rc = find_parent_node_for_child_node(node, md,
13448 + /* determine position of child in drive link object */
13449 + index = compute_child_index(node, md);
13454 + /* check for multiple child index requests */
13456 + (struct runtime_entry *) &rd->child_table[index];
13457 + /* check to see if this child index is
13458 + * already in use.
13460 + if (child_entry->child_node) {
13462 + ("attempt to put '%s' in child index(%d). Already occupied by '%s'.\n",
13463 + node->name, index,
13464 + child_entry->child_node->name);
13469 + /* fill in child info in parent */
13471 + /* check the sector size for this node */
13472 + if (node->hardsector_size > parent->hardsector_size)
13473 + parent->hardsector_size = node->hardsector_size;
13474 + /* check the block size for this node */
13475 + if (node->block_size > parent->block_size)
13476 + parent->block_size = node->block_size;
13477 + /* set the child node */
13478 + child_entry->child_node = node;
13479 + /* set the metadata for this node */
13480 + child_entry->child_metadata = md;
13483 + /* on error, clean up accordingly */
13487 + LOG_SERIOUS("%s: rc(%d) from '%s'\n",
13488 + __FUNCTION__, rc, node->name);
13489 + LOG_SERIOUS("deleting child node '%s'.\n", node->name);
13490 + rc = DELETE(node);
13493 + ("error(%d) attempting to delete '%s'.\n",
13499 + /* errors are handled internal to this function */
13500 + /* by deleting the failed node. This will get */
13501 + /* picked up by finalize_parent_nodes as a */
13502 + /* missing child node */
13506 +#define TEST_CHILD_PRESENCE 0
13507 +#define TEST_CHILD_COUNT 1
13508 +#define TEST_CHILD_PARENTS_SERIAL_NUM 2
13509 +#define TEST_CHILD_POSITION 3
13510 +#define TEST_CHILD_METADATA 4
13513 + * test_parent_node: verify that a parent is complete
13514 + * @node: specified parent node
13516 + * verify that the parent node has all of its child nodes accounted for.
13518 + * Return value: 0 on success
13519 + * otherwise error code
13522 +test_parent_node(struct evms_logical_node *node)
13525 + struct runtime_data *rd;
13526 + struct runtime_entry *child_entry;
13528 + rd = (struct runtime_data *) node->private;
13529 + for (i = 0; i < rd->child_count; i++) {
13530 + child_entry = (struct runtime_entry *) &rd->child_table[i];
13532 + /* insure each child entry is filled */
13533 + if (!child_entry->child_node) {
13535 + EVMS_VOLUME_SET_READ_ONLY | EVMS_VOLUME_PARTIAL;
13536 + LOG_ERROR("%s: missing child(%d).\n", __FUNCTION__, i);
13538 + /* insure child count is the same */
13539 + /* in each child's metadata */
13540 + if (child_entry->child_metadata->child_count != rd->child_count) {
13541 + rc = -EVMS_FEATURE_FATAL_ERROR;
13542 + LOG_ERROR("%s: child count wrong for node '%s'\n",
13543 + __FUNCTION__, node->name);
13545 + /* insure parent serial number is */
13546 + /* the same in each child's metadata */
13547 + if (child_entry->child_metadata->parent_serial_number !=
13549 + rc = -EVMS_FEATURE_FATAL_ERROR;
13551 + ("%s: incorrect [is("PFU64"), should be("PFU64")] child serial number for node '%s'\n",
13553 + child_entry->child_metadata->parent_serial_number,
13554 + rd->parent_sn, node->name);
13556 + /* insure each is in the correct entry */
13557 + if (child_entry->child_metadata->ordering_table[i].
13558 + child_serial_number !=
13559 + child_entry->child_metadata->child_serial_number) {
13560 + rc = -EVMS_FEATURE_FATAL_ERROR;
13562 + ("%s: child reports different index for node '%s'\n",
13563 + __FUNCTION__, node->name);
13565 + struct runtime_entry *other_child_entry;
13567 + /* compare the children's metadata */
13569 + /* look for another present child to
13570 + * compare against.
13572 + other_child_entry = NULL;
13573 + for (j = 0; j < rd->child_count; j++) {
13574 + /* skip comparing to ourselves */
13578 + /* is this child present? */
13579 + if (rd->child_table[j].child_node) {
13580 + /* yes, use it */
13581 + other_child_entry = &rd->child_table[j];
13585 + /* if we can't find another valid
13586 + * child node's metadata to compare
13587 + * against, just skip this test.
13589 + if (!other_child_entry) {
13593 + memcmp(other_child_entry->child_metadata->
13595 + child_entry->child_metadata->ordering_table,
13596 + sizeof (child_entry->child_metadata->
13597 + ordering_table));
13599 + rc = -EVMS_FEATURE_FATAL_ERROR;
13601 + ("%s: mismatching child metadata for nodes '%s' and '%s'\n",
13603 + rd->child_table[i - 1].child_node->name,
13604 + child_entry->child_node->name);
13607 + /* stop if fatal error encountered */
13608 + if (rc == -EVMS_FEATURE_FATAL_ERROR) {
13616 + * perform_final_adjustments: do final tweaks to parent node
13617 + * @node: parent node
13619 + * This function does the following:
13620 + * sets the vsize (in vsectors) field in each child node
13621 + * sets the voffset (in vsectors) field in each child node
13622 + * frees each child node's metadata
13623 + * sets the parent's total size field
13626 +perform_final_adjustments(struct evms_logical_node *node)
13629 + struct runtime_data *rd;
13630 + struct runtime_entry *child_entry = NULL;
13631 + struct evms_drivelink_metadata *ref_data = NULL;
13633 + rd = (struct runtime_data *) node->private;
13634 + /* find a valid copy of the ordering table.
13635 + * since all the ordering tables are the same
13636 + * we can just pick one to use for all the
13637 + * child computations.
13639 + for (i = 0; i < rd->child_count; i++) {
13640 + child_entry = (struct runtime_entry *) &rd->child_table[i];
13641 + if (child_entry->child_node) {
13642 + ref_data = child_entry->child_metadata;
13646 + /* if we got this far, there should
13647 + * always be at least one valid child.
13651 + /* compute the parent's usable size,
13652 + * and construct the table used to
13653 + * remap parent I/Os to child I/Os */
13654 + for (i = 0; i < rd->child_count; i++) {
13655 + child_entry = (struct runtime_entry *) &rd->child_table[i];
13656 + /* set the LBA count for this child node */
13657 + child_entry->vsize = ref_data->ordering_table[i].child_vsize;
13658 + /* set the start LBA value for this child node */
13659 + child_entry->voffset = node->total_vsectors;
13660 + /* keep a running total of size in sectors */
13661 + node->total_vsectors += child_entry->vsize;
13662 + /* free the metadata for this child node */
13663 + if (ref_data != child_entry->child_metadata) {
13664 + kfree(child_entry->child_metadata);
13666 + child_entry->child_metadata = NULL;
13667 + /* free the feature header for this child node */
13668 + if (child_entry->child_node) {
13669 + kfree(child_entry->child_node->feature_header);
13670 + child_entry->child_node->feature_header = NULL;
13673 + /* free the reference data */
13678 + * finalize_parent_nodes: verify and prepare parent nodes
13679 + * @discover_list: list of potential drivelink parent objects
13681 + * verify the completeness of each parent node. if not complete, purge the in-memory
13682 + * structs for this object and all its children. If complete, perform final tweaks
13683 + * to allow this node to be usable.
13685 + * Return value: 0 on success
13686 + * otherwise error code
13689 +finalize_parent_nodes(struct evms_logical_node **discover_list)
13692 + struct evms_logical_node *node, *next_node;
13694 + for (node = *discover_list; node; node = next_node) {
13695 + next_node = node->next;
13696 + /* only check parent nodes */
13697 + if (!node->feature_header) {
13698 + /* validate the children of this parent */
13699 + rc = test_parent_node(node);
13701 + /* compute parent size and
13702 + * child remap table.
13704 + perform_final_adjustments(node);
13706 + /* fatal error encountered.
13707 + * cleanup from this node and
13708 + * delete it from memory.
13710 + evms_cs_remove_logical_node_from_list
13711 + (discover_list, node);
13712 + rc2 = DELETE(node);
13715 + ("error(%d) attempting to delete '%s'.\n",
13716 + rc2, node->name);
13725 + * drivelink_discover: discover drivelinked storage objects
13726 + * @discover_list: the list of objects to inspect
13728 + * perform the drivelink discover process on the objects in the discovery list
13730 + * Return value: 0 on success
13731 + * otherwise error code
13734 +drivelink_discover(struct evms_logical_node **discover_list)
13738 + MOD_INC_USE_COUNT;
13739 + rc = process_child_nodes(discover_list);
13741 + rc = finalize_parent_nodes(discover_list);
13743 + MOD_DEC_USE_COUNT;
13747 +/********************************************************/
13748 +/* Required Plugin Function Table Entry Point: */
13749 +/* Delete function */
13750 +/********************************************************/
13753 + * drivelink_delete: purges a drivelink object and its children from memory
13754 + * @node: the drivelink object to delete
13756 + * purge the drivelink object, its private data, and all its children from memory.
13758 + * Return value: 0 on success
13759 + * otherwise error code
13762 +drivelink_delete(struct evms_logical_node *node)
13765 + struct runtime_data *rd;
13766 + struct runtime_entry *child_entry;
13768 + LOG_DETAILS("deleting '%s'.\n", node->name);
13770 + rd = (struct runtime_data *) node->private;
13772 + for (i = 0; i < rd->child_count; i++) {
13773 + child_entry = &rd->child_table[i];
13774 + /* delete the child node */
13775 + if (child_entry->child_node) {
13776 + rc = DELETE(child_entry->child_node);
13779 + child_entry->child_node = NULL;
13781 + /* delete the child's metadata */
13782 + if (child_entry->child_metadata) {
13783 + kfree(child_entry->child_metadata);
13784 + child_entry->child_metadata = NULL;
13788 + /* delete the child table */
13789 + if (rd->child_table) {
13790 + kfree(rd->child_table);
13791 + rd->child_table = NULL;
13793 + /* delete the instance data */
13795 + node->private = NULL;
13799 + evms_cs_deallocate_logical_node(node);
13800 + MOD_DEC_USE_COUNT;
13807 + * which_child: find the child node targeted by an IO to this drivelink object
13808 + * @parent: parent drivelink object
13809 + * @rsector: relative sector on the parent object
13810 + * @max_io_sects: largest IO size on the child, starting from rsector position
13812 + * This function finds the child node a parent rsector maps to.
13813 + * It then adjusts the rsector value to be child relative and
13814 + * optionally computes the max # of sectors that can be accessed
13815 + * from this starting point on the child.
13818 + * The child node, the child relative rsector and max io size are
13819 + * returned to the caller. On error, the returned child node will
13822 +static struct evms_logical_node *
13823 +which_child(struct evms_logical_node *parent,
13824 + u64 * rsector, u64 * max_io_sects)
13827 + struct evms_logical_node *child = NULL;
13828 + struct runtime_data *rd;
13829 + struct runtime_entry *child_entry = NULL;
13831 + rd = (struct runtime_data *) parent->private;
13832 + for (i = 0; i < rd->child_count; i++) {
13833 + child_entry = (struct runtime_entry *) &rd->child_table[i];
13835 + if (*rsector >= child_entry->vsize) {
13836 + *rsector -= child_entry->vsize;
13838 + /* get the child node */
13839 + child = child_entry->child_node;
13840 + /* compute the sector count if requested */
13841 + if (max_io_sects)
13842 + /* this is only used for INIT I/O
13843 + * to return the largest sector
13844 + * count size for this child based
13845 + * on first sector in the I/O.
13847 + *max_io_sects = child_entry->vsize - *rsector;
13855 + * drivelink_io_error: log an IO error for drivelink
13856 + * @node: drivelink object
13857 + * @bh: buffer head targeting this object
13859 + * this function was primarily created because the function
13860 + * buffer_IO_error is inline and kgdb doesn't allow breakpoints
13861 + * to be set on inline functions. Since this was an error path
13862 + * and not mainline, I decided to add a trace statement to help
13863 + * report on the failing condition.
13866 +drivelink_io_error(struct evms_logical_node *node, int io_flag, struct buffer_head *bh)
13868 + LOG_SERIOUS("%s error on '%s' remapping rsector("PFU64").\n",
13869 + (io_flag) ? "WRITE" : "READ",
13870 + node->name, (u64) bh->b_rsector);
13872 + bh->b_end_io(bh, 0);
13875 +/********************************************************/
13876 +/* Required Plugin Function Table Entry Point: */
13877 +/* Read function & Support routines */
13878 +/********************************************************/
13881 + * drivelink_read: handles IO read operations to drivelink objects
13882 + * @node: drivelink object
13883 + * @bh: buffer head targeting this object
13885 + * handles IO read operations to the drivelink objects. internally remaps the
13886 + * drivelink relative requests to the child relative requests and then routes
13887 + * it to the child for further processing.
13890 +drivelink_read(struct evms_logical_node *node, struct buffer_head *bh)
13892 + struct evms_logical_node *child;
13893 + u64 io_size, rsector;
13895 + rsector = bh->b_rsector;
13896 + child = which_child(node, &rsector, &io_size);
13897 + if (child && ((bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <= io_size)) {
13898 + bh->b_rsector = rsector;
13901 + drivelink_io_error(node, READ, bh);
13905 +/********************************************************/
13906 +/* Required Plugin Function Table Entry Point: */
13907 +/* Write function & Support routines */
13908 +/********************************************************/
13911 + * drivelink_write: handles IO write operations to drivelink objects
13912 + * @node: drivelink object
13913 + * @bh: buffer head targeting this object
13915 + * handles IO write operations to the drivelink objects. internally remaps the
13916 + * drivelink relative requests to the child relative requests and then routes
13917 + * it to the child for further processing.
13920 +drivelink_write(struct evms_logical_node *node, struct buffer_head *bh)
13922 + struct evms_logical_node *child;
13923 + u64 io_size, rsector;
13925 + rsector = bh->b_rsector;
13926 + child = which_child(node, &rsector, &io_size);
13927 + if (child && ((bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <= io_size)) {
13928 + bh->b_rsector = rsector;
13931 + drivelink_io_error(node, WRITE, bh);
13935 +/********************************************************/
13936 +/* Required Plugin Function Table Entry Point: */
13937 +/* Init I/O function */
13938 +/********************************************************/
13941 + * drivelink_init_io: performs synchronous IO to drivelink objects
13942 + * @node: drivelink object
13943 + * @io_flag: read/write flag
13944 + * @sect_nr: starting sector, object relative (512 byte units)
13945 + * @num_sects: count of sectors
13946 + * @buf_addr: buffer address to read from/write to
13948 + * This function must determine which child or children a
13949 + * specified I/O request must be passed to. Also if, when,
13950 + * and how a request must be broken up.
13952 + * Return value: 0 on success
13953 + * otherwise error code
13956 +drivelink_init_io(struct evms_logical_node *node, int io_flag,
13966 + u64 starting_sector, remaining_sectors;
13968 + struct runtime_data *rd;
13970 + if ((sect_nr + num_sects) > node->total_vsectors) {
13972 + ("attempted out of bound("PFU64") %s on '%s' at sector("PFU64"), count("PFU64").\n",
13973 + node->total_vsectors, (io_flag) ? "WRITE" : "READ",
13974 + node->name, sect_nr, num_sects);
13977 + rd = (struct runtime_data *) node->private;
13978 + /* make working copies of input parameters */
13979 + starting_sector = sect_nr;
13980 + remaining_sectors = num_sects;
13981 + io_buf = buf_addr;
13982 + /* loop until all I/O is performed */
13983 + while (remaining_sectors) {
13984 + u64 io_start, io_size;
13985 + struct evms_logical_node *child;
13987 + /* compute the child relative io_start
13988 + * and max io_size.
13990 + io_start = starting_sector;
13991 + child = which_child(node, &io_start, &io_size);
13992 + /* adjust io_size based on
13993 + * original remaining sectors
13996 + if (io_size > remaining_sectors)
13997 + io_size = remaining_sectors;
13999 + rc = INIT_IO(child,
14001 + io_start, io_size, io_buf);
14003 + /* if partial volume, return 0's
14004 + * for missing children.
14006 + if (io_flag == READ) {
14007 + memset(io_buf, 0,
14009 + EVMS_VSECTOR_SIZE_SHIFT);
14013 + /* adjust working copies */
14014 + starting_sector += io_size;
14015 + remaining_sectors -= io_size;
14016 + io_buf += io_size <<
14017 + EVMS_VSECTOR_SIZE_SHIFT;
14027 +/********************************************************/
14028 +/* Required Plugin Function Table Entry Point: */
14029 +/* IOCTL function & Support routines */
14030 +/********************************************************/
14033 + * drivelink_ioctl_cmd_plugin_ioctl: drivelink support for the 'plugin ioctl' command
14034 + * @node: drivelink object
14035 + * @inode: VFS supplied parameter
14036 + * @file: VFS supplied parameter
14037 + * @cmd: the specific ioctl command
14038 + * @arg: the specific ioctl arguments
14040 + * this function handles 'plugin ioctl' commands. currently there is no specific
14041 + * commands for this plugin. however, this plugin must broadcast some commands so
14042 + * lower layers can receive them.
14044 + * Return value: 0 on success
14045 + * otherwise error code
14048 +drivelink_ioctl_cmd_plugin_ioctl(struct evms_logical_node *node,
14049 + struct inode *inode, struct file *file,
14050 + unsigned long cmd, unsigned long arg)
14053 + struct runtime_data *rd;
14054 + struct evms_plugin_ioctl_pkt tmp, *user_parms;
14056 + user_parms = (struct evms_plugin_ioctl_pkt *) arg;
14057 + /* copy user's parameters to kernel space */
14058 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
14062 + rd = (struct runtime_data *) node->private;
14063 + /* is this cmd targeted at this feature ? */
14064 + if (tmp.feature_id == node->plugin->id) {
14065 + switch (tmp.feature_command) {
14069 + } else { /* broadcast this cmd to all children */
14070 + for (i = 0; i < rd->child_count; i++) {
14071 + struct evms_logical_node *child_node;
14073 + child_node = rd->child_table[i].child_node;
14074 + if (child_node) {
14075 + rc = IOCTL(child_node, inode, file,
14082 + /* copy info to userspace */
14083 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
14090 + * drivelink_ioctl_cmd_broadcast: broadcast ioctls to your children
14091 + * @node: drivelink object
14092 + * @inode: VFS supplied parameter
14093 + * @file: VFS supplied parameter
14094 + * @cmd: the specific ioctl command
14095 + * @arg: the specific ioctl arguments
14097 + * broadcast the specified ioctl command and arguments to all this objects
14098 + * children. OR (logical operation) the return values from all the children
14099 + * and return the OR'd value to the caller.
14101 + * Return value: 0 on success
14102 + * otherwise error code
14105 +drivelink_ioctl_cmd_broadcast(struct evms_logical_node *node,
14106 + struct inode *inode, struct file *file,
14107 + unsigned long cmd, unsigned long arg)
14110 + struct runtime_data *rd;
14112 + rd = (struct runtime_data *) node->private;
14113 + /* broadcast this cmd to all children */
14114 + for (i = 0; i < rd->child_count; i++) {
14115 + struct evms_logical_node *child_node;
14117 + child_node = rd->child_table[i].child_node;
14118 + if (child_node) {
14119 + rc |= IOCTL(child_node, inode, file, cmd, arg);
14126 + * drivelink_ioctl: main ioctl entry point and handler
14127 + * @node: drivelink object
14128 + * @inode: VFS supplied parameter
14129 + * @file: VFS supplied parameter
14130 + * @cmd: a specific ioctl command
14131 + * @arg: a specific ioctl argument
14133 + * handles specific ioctl command internally and routes other ioctls commands to
14134 + * the appropriate entry points.
14136 + * Returns: 0 on success
14137 + * otherwise error code
14140 +drivelink_ioctl(struct evms_logical_node *node,
14141 + struct inode *inode,
14142 + struct file *file, unsigned int cmd, unsigned long arg)
14145 + struct runtime_data *rd = NULL;
14146 + struct hd_geometry hdgeo;
14148 + if ((!node) || (!inode))
14152 + rd = (struct runtime_data *) node->private;
14154 + case HDIO_GETGEO:
14155 + hdgeo.heads = 255;
14156 + hdgeo.sectors = 63;
14157 + hdgeo.cylinders =
14158 + ((unsigned int) node->total_vsectors) /
14159 + hdgeo.heads / hdgeo.sectors;
14161 + if (copy_to_user((int *) arg, &hdgeo, sizeof (hdgeo)))
14164 + case EVMS_QUIESCE_VOLUME:
14165 + case EVMS_GET_DISK_LIST:
14166 + case EVMS_CHECK_MEDIA_CHANGE:
14167 + case EVMS_REVALIDATE_DISK:
14168 + case EVMS_OPEN_VOLUME:
14169 + case EVMS_CLOSE_VOLUME:
14170 + case EVMS_CHECK_DEVICE_STATUS:
14171 + rc = drivelink_ioctl_cmd_broadcast(node, inode, file,
14174 + case EVMS_PLUGIN_IOCTL:
14175 + rc = drivelink_ioctl_cmd_plugin_ioctl(node, inode, file,
14178 + case EVMS_GET_BMAP:
14180 + struct evms_get_bmap_pkt *bmap;
14181 + u64 io_start, io_size;
14182 + struct evms_logical_node *child;
14184 + bmap = (struct evms_get_bmap_pkt *) arg;
14185 + io_start = bmap->rsector;
14186 + child = which_child(node, &io_start, &io_size);
14188 + if (node->block_size !=
14189 + child->block_size) {
14190 + bmap->status = -EPERM;
14192 + bmap->rsector = io_start;
14193 + rc = IOCTL(child,
14208 +/********************************************************/
14209 +/* Required Module Entry Point: */
14210 +/* drivelink_init */
14211 +/********************************************************/
14214 + * drivelink_init: register this module for use within the EVMS framework
14216 + * Return value: 0 on success
14217 + * otherwise error code.
14220 +drivelink_init(void)
14222 + return evms_cs_register_plugin(&plugin_header);
14226 + * drivelink_exit: unregister this module from use within the EVMS framework
14228 + * Return value: 0 on success
14229 + * otherwise error code.
14232 +drivelink_exit(void)
14234 + evms_cs_unregister_plugin(&plugin_header);
14237 +module_init(drivelink_init);
14238 +module_exit(drivelink_exit);
14239 +#ifdef MODULE_LICENSE
14240 +MODULE_LICENSE("GPL");
14242 diff -Naur linux-2002-09-30/drivers/evms/evms_ecr.c evms-2002-09-30/drivers/evms/evms_ecr.c
14243 --- linux-2002-09-30/drivers/evms/evms_ecr.c Wed Dec 31 18:00:00 1969
14244 +++ evms-2002-09-30/drivers/evms/evms_ecr.c Fri Aug 16 16:19:56 2002
14246 +/* -*- linux-c -*- */
14249 + * Copyright (c) International Business Machines Corp., 2000
14251 + * This program is free software; you can redistribute it and/or modify
14252 + * it under the terms of the GNU General Public License as published by
14253 + * the Free Software Foundation; either version 2 of the License, or
14254 + * (at your option) any later version.
14256 + * This program is distributed in the hope that it will be useful,
14257 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
14258 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14259 + * the GNU General Public License for more details.
14261 + * You should have received a copy of the GNU General Public License
14262 + * along with this program; if not, write to the Free Software
14263 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
14266 +/* linux/driver/evms/evms_ecr.c
14268 + * EVMS - Cluster enablement (ECR) module
14273 +#include <linux/kernel.h>
14274 +#include <linux/module.h>
14275 +#include <linux/init.h>
14276 +#include <linux/types.h>
14277 +#include <linux/evms/evms.h>
14278 +#include <linux/evms/evms_ecr.h>
14280 +#define LOG_PREFIX "ecr: "
14286 +ecr_group_t ecr_group_join(char *group_name, ecr_table_t *f_table,
14287 + ecr_cred_t * cred, size_t size, ecr_instance_t *instance)
14297 + * ecr_group_leave
14299 +void ecr_group_leave(ecr_group_t group)
14310 +int ecr_group_send(ecr_group_t group, ecr_nodeid_t node, void *message,
14311 + size_t size, ecr_instance_t *instance,
14312 + void callback(int ret, ecr_instance_t *instance))
14321 + * ecr_group_send_wait
14323 +int ecr_group_send_wait(ecr_group_t group, ecr_nodeid_t node, void *message,
14324 + size_t size, int *ret)
14334 + * ecr_group_broadcast
14336 +int ecr_group_broadcast(ecr_group_t group, void *message, size_t size,
14337 + ecr_instance_t *instance,
14338 + void callback(u_char ret, ecr_instance_t *instance))
14347 + * ecr_group_broadcast_wait
14349 +int ecr_group_broadcast_wait(ecr_group_t group, void *message, size_t size,
14360 + * ecr_group_atomic_execute
14362 +int ecr_group_atomic_execute(ecr_group_t group, void *message, size_t size,
14363 + ecr_instance_t *instance,
14364 + void callback(ecr_instance_t *instance))
14373 + * ecr_group_atomic_execute_wait
14375 +int ecr_group_atomic_execute_wait(ecr_group_t group, void *message, size_t size)
14384 + * ecr_group_success_response
14386 +void ecr_group_success_response(ecr_message_t *handle)
14396 + * ecr_group_failure_response
14398 +void ecr_group_failure_response(ecr_message_t *handle, int ret)
14407 + * ecr_lock_create
14409 +ecr_lock_t ecr_lock_create(char *lockname)
14418 +int ecr_lock(ecr_lock_t lock, u64 start, u64 length,
14419 + ecr_lock_mode_t mode, u_char flag)
14430 +int ecr_unlock(ecr_lock_t lock, u64 start, u64 length)
14437 +/********************************************************/
14438 +/* Required Module Entry Point: */
14440 +/********************************************************/
14442 +static int __init ecr_init(void)
14448 +static void __exit ecr_exit(void)
14453 +module_init(ecr_init);
14454 +module_exit(ecr_exit);
14455 +#ifdef MODULE_LICENSE
14456 +MODULE_LICENSE("GPL");
14459 diff -Naur linux-2002-09-30/drivers/evms/evms_passthru.c evms-2002-09-30/drivers/evms/evms_passthru.c
14460 --- linux-2002-09-30/drivers/evms/evms_passthru.c Wed Dec 31 18:00:00 1969
14461 +++ evms-2002-09-30/drivers/evms/evms_passthru.c Fri Sep 13 16:09:55 2002
14463 +/* -*- linux-c -*- */
14468 + * Copyright (c) International Business Machines Corp., 2000
14470 + * This program is free software; you can redistribute it and/or modify
14471 + * it under the terms of the GNU General Public License as published by
14472 + * the Free Software Foundation; either version 2 of the License, or
14473 + * (at your option) any later version.
14475 + * This program is distributed in the hope that it will be useful,
14476 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
14477 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14478 + * the GNU General Public License for more details.
14480 + * You should have received a copy of the GNU General Public License
14481 + * along with this program; if not, write to the Free Software
14482 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
14487 + * linux/drivers/evms/evms_passthru.c
14489 + * EVMS System Data Manager
14494 +#include <linux/module.h>
14495 +#include <linux/kernel.h>
14496 +#include <linux/config.h>
14497 +#include <linux/genhd.h>
14498 +#include <linux/string.h>
14499 +#include <linux/blk.h>
14500 +#include <linux/init.h>
14501 +#include <linux/slab.h>
14502 +#include <linux/evms/evms.h>
14503 +#include <asm/system.h>
14505 +#define EVMS_PASSTHRU_ID 0
14506 +#define LOG_PREFIX "passthru: "
14508 +static int passthru_mgr_discover(struct evms_logical_node **);
14509 +static int passthru_mgr_delete(struct evms_logical_node *);
14510 +static void passthru_mgr_read(struct evms_logical_node *, struct buffer_head *);
14511 +static void passthru_mgr_write(struct evms_logical_node *, struct buffer_head *);
14512 +static int passthru_mgr_ioctl(struct evms_logical_node *,
14514 + struct file *, unsigned int, unsigned long);
14515 +static int passthru_mgr_init_io(struct evms_logical_node *,
14516 + int, u64, u64, void *);
14518 +static struct evms_plugin_fops fops = {
14519 + .discover = passthru_mgr_discover,
14520 + .delete = passthru_mgr_delete,
14521 + .read = passthru_mgr_read,
14522 + .write = passthru_mgr_write,
14523 + .init_io = passthru_mgr_init_io,
14524 + .ioctl = passthru_mgr_ioctl
14527 +static struct evms_plugin_header plugin_header = {
14528 + .id = SetPluginID(IBM_OEM_ID,
14530 + EVMS_PASSTHRU_ID),
14536 + .required_services_version = {
14544 +/*******************************/
14545 +/* discovery support functions */
14546 +/*******************************/
14549 +process_passthru_data(struct evms_logical_node **pp)
14551 + int rc, size_in_sectors;
14552 + struct evms_logical_node *node, *new_node;
14556 + size_in_sectors =
14557 + evms_cs_size_in_vsectors(sizeof (struct evms_feature_header));
14559 + /* allocate "parent" node */
14560 + rc = evms_cs_allocate_logical_node(&new_node);
14562 + /* initialize "parent" node */
14563 + new_node->private = node;
14564 + new_node->flags = node->flags;
14565 + new_node->plugin = &plugin_header;
14566 + new_node->system_id = node->system_id;
14567 + new_node->block_size = node->block_size;
14568 + new_node->hardsector_size = node->hardsector_size;
14569 + new_node->total_vsectors = node->total_vsectors;
14570 + new_node->total_vsectors -=
14571 + (size_in_sectors << 1) +
14572 + node->feature_header->alignment_padding;
14573 + new_node->volume_info = node->volume_info;
14574 + strcpy(new_node->name, node->name);
14575 + if (strlen(node->feature_header->object_name))
14576 + strcat(new_node->name,
14577 + node->feature_header->object_name);
14579 + strcat(new_node->name, "_Passthru");
14581 + /* return "parent" node to caller */
14584 + MOD_INC_USE_COUNT;
14586 + LOG_DETAILS("feature header found on '%s', created '%s'.\n",
14587 + node->name, new_node->name);
14588 + /* we're done with the passthru feature headers
14589 + * so lets delete them now.
14591 + kfree(node->feature_header);
14592 + node->feature_header = NULL;
14594 + /* on any fatal error, delete the node */
14595 + int rc2 = DELETE(node);
14598 + ("error(%d) attempting to delete node(%p,%s).\n",
14599 + rc2, node, node->name);
14605 +/********** Required Plugin Functions **********/
14608 + * Function: passthru_mgr_discover
14612 +passthru_mgr_discover(struct evms_logical_node **discover_list)
14615 + struct evms_logical_node *node, *tmp_list_head;
14617 + MOD_INC_USE_COUNT;
14618 + tmp_list_head = *discover_list;
14619 + *discover_list = NULL;
14621 + while (tmp_list_head) {
14622 + node = tmp_list_head;
14623 + rc = evms_cs_remove_logical_node_from_list(&tmp_list_head,
14626 + rc = process_passthru_data(&node);
14629 + rc = evms_cs_add_logical_node_to_list
14630 + (discover_list, node);
14632 + MOD_DEC_USE_COUNT;
14637 + * Function: passthru_mgr_delete
14641 +passthru_mgr_delete(struct evms_logical_node *node)
14644 + struct evms_logical_node *p;
14646 + LOG_DETAILS("deleting '%s'.\n", node->name);
14648 + p = node->private;
14651 + evms_cs_deallocate_logical_node(node);
14652 + MOD_DEC_USE_COUNT;
14658 + * function: passthru_io_error
14660 + * this function was primarily created because the function
14661 + * buffer_IO_error is inline and kgdb doesn't allow breakpoints
14662 + * to be set on inline functions. Since this was an error path
14663 + * and not mainline, I decided to add a trace statement to help
14664 + * report on the failing condition.
14668 +passthru_io_error(struct evms_logical_node *node, int io_flag, struct buffer_head *bh)
14671 + ("attempt to %s beyond boundary("PFU64") on (%s), rsector("PFU64").\n",
14672 + (io_flag) ? "WRITE" : "READ", node->total_vsectors - 1,
14673 + node->name, (u64) bh->b_rsector);
14675 + bh->b_end_io(bh, 0);
14679 + * Function: passthru_mgr_read
14682 +passthru_mgr_read(struct evms_logical_node *node, struct buffer_head *bh)
14684 + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
14685 + node->total_vsectors) {
14686 + R_IO(((struct evms_logical_node *) (node->private)), bh);
14688 + passthru_io_error(node, READ, bh);
14692 + * Function: passthru_mgr_write
14696 +passthru_mgr_write(struct evms_logical_node *node, struct buffer_head *bh)
14698 + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
14699 + node->total_vsectors) {
14700 + W_IO(((struct evms_logical_node *) (node->private)), bh);
14702 + passthru_io_error(node, WRITE, bh);
14706 + * Function: passthru_mgr_ioctl
14710 +passthru_mgr_ioctl(struct evms_logical_node *node,
14711 + struct inode *inode,
14712 + struct file *file, unsigned int cmd, unsigned long arg)
14716 + if ((!node) || (!inode))
14719 + rc = IOCTL(((struct evms_logical_node *) (node->private)),
14720 + inode, file, cmd, arg);
14725 +passthru_mgr_init_io(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */
14726 + u64 sect_nr, /* disk LBA */
14727 + u64 num_sects, /* # of sectors */
14729 +{ /* buffer address */
14731 + if ((sect_nr + num_sects) <= node->total_vsectors) {
14732 + rc = INIT_IO(((struct evms_logical_node *) (node->
14734 + io_flag, sect_nr, num_sects, buf_addr);
14741 + * Function: passthru_init
14745 +evms_passthru_manager_init(void)
14747 + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
14751 +evms_passthru_manager_exit(void)
14753 + evms_cs_unregister_plugin(&plugin_header);
14756 +module_init(evms_passthru_manager_init);
14757 +module_exit(evms_passthru_manager_exit);
14758 +#ifdef MODULE_LICENSE
14759 +MODULE_LICENSE("GPL");
14761 diff -Naur linux-2002-09-30/drivers/evms/gpt_part.c evms-2002-09-30/drivers/evms/gpt_part.c
14762 --- linux-2002-09-30/drivers/evms/gpt_part.c Wed Dec 31 18:00:00 1969
14763 +++ evms-2002-09-30/drivers/evms/gpt_part.c Fri Sep 13 16:09:55 2002
14765 +/* -*- linux-c -*- */
14769 + * Copyright (c) International Business Machines Corp., 2000
14771 + * This program is free software; you can redistribute it and/or modify
14772 + * it under the terms of the GNU General Public License as published by
14773 + * the Free Software Foundation; either version 2 of the License, or
14774 + * (at your option) any later version.
14776 + * This program is distributed in the hope that it will be useful,
14777 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
14778 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14779 + * the GNU General Public License for more details.
14781 + * You should have received a copy of the GNU General Public License
14782 + * along with this program; if not, write to the Free Software
14783 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
14788 +/* linux/driver/evms/gpt_part.c
14790 + * EVMS - EFI GPT segment manager plugin
14792 + * This plugin provides support for the GUID Partition Table format specified
14793 + * by the Extensible Firmware Interface documentation ... version 1.02
14796 +#include <linux/config.h>
14797 +#include <linux/module.h>
14798 +#include <linux/kernel.h>
14799 +#include <linux/config.h>
14800 +#include <linux/string.h>
14801 +#include <linux/blk.h>
14802 +#include <asm/uaccess.h>
14803 +#include <linux/evms/evms.h>
14805 +/* prefix used in logging messages */
14806 +#define LOG_PREFIX "gpt_part: "
14809 + * struct gpt_private - Private data structure for this plugin
14810 + * @source_object: object this IO will get remapped to
14811 + * @start_sect: source object relative starting address in 512 byte units
14812 + * @nr_sect: partition size in 512 bytes units
14813 + * @type: partition type or filesystem format indicator
14815 + * private copy of the just the fields we require to remap IO requests
14816 + * to the underlying object.
14818 +struct gpt_private {
14819 + struct evms_logical_node *source_disk;
14822 + unsigned char type;
14825 +#define GPT_DISKMAGIC 0x5452415020494645 // "EFI PART"
14826 +#define GPT_PNAME_SIZE 36 // max unicode partition name size
14829 + * struct guid - GUID structure
14830 + * @time_low: timestamp - low order 32 bits
14831 + * @time_mid: timestamp - mid 16 bits
14832 + * @time_high: timestamp - high 16 bits
14833 + * @clock_seq_high: clock - high order 8 bits
14834 + * @clock_seq_low: clock - low order 8 bits
14835 + * @node: spatial reference - unique id (ie. mac address of nic)
14843 + u8 clock_seq_high;
14844 + u8 clock_seq_low;
14849 + * struct gpt_partition - GPT partition record definition
14850 + * @type: partition type
14851 + * @part_id: partition record id
14852 + * @start: address of 1st block of partition
14853 + * @end: address of last block of partition
14854 + * @attributes: bit field reserved by EFI spec
14855 + * @name: unicode name of partition
14857 + * GPT partition record definition
14859 +struct gpt_partition {
14860 + struct guid type;
14861 + struct guid part_id;
14865 + u16 name[GPT_PNAME_SIZE];
14869 + * struct gpt_header - GPT header
14870 + * @signature: EFI compatible header signature
14871 + * @version: spec revision number
14872 + * @size: size (bytes) of gpt header
14873 + * @crc: crc of gpt header
14874 + * @reserve: reserved by spec ... must be zero
14875 + * @my_lba: lba of gpt header
14876 + * @alternate_lba: lba of 2nd copy of gpt header
14877 + * @start_useable: lba of 1st block of useable area on disk
14878 + * @end_useable: lba of last block of useable area on disk
14879 + * @disk_id: GUID - identifies this disk
14880 + * @ptable_lba: lba of partition table
14881 + * @ptable_count: number of entries in the partition table
14882 + * @ptable_entry_size: size of partition table entry
14883 + * @ptable_crc: crc of partition table
14887 +struct gpt_header {
14894 + u64 alternate_lba;
14895 + u64 start_useable;
14897 + struct guid disk_id;
14899 + u32 ptable_count;
14900 + u32 ptable_entry_size;
14904 +struct guid EFI_SYSTEM_PARTITION = {
14910 + {0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B}
14913 +struct guid BASIC_DATA_PARTITION = {
14919 + {0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7}
14922 +struct guid LEGACY_MBR_PARTITION = {
14928 + {0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F}
14931 +struct guid GPT_SWAP_PARTITION = {
14937 + {0x09, 0x33, 0xC8, 0x4B, 0x4F, 0x4F}
14940 +struct guid UNUSED_GPT_PARTITION = {
14942 + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
14945 +static int exported_nodes; /* total # of exported segments
14946 + * produced during this discovery.
14950 +static int partition_discover(struct evms_logical_node **);
14951 +static int partition_delete(struct evms_logical_node *);
14952 +static void partition_read(struct evms_logical_node *, struct buffer_head *);
14953 +static void partition_write(struct evms_logical_node *, struct buffer_head *);
14954 +static int partition_ioctl(struct evms_logical_node *,
14956 + struct file *, unsigned int, unsigned long);
14957 +static int partition_init_io(struct evms_logical_node *,
14958 + int, u64, u64, void *);
14960 +static struct evms_plugin_fops fops = {
14961 + .discover = partition_discover,
14962 + .delete = partition_delete,
14963 + .read = partition_read,
14964 + .write = partition_write,
14965 + .init_io = partition_init_io,
14966 + .ioctl = partition_ioctl
14969 +#define EVMS_GPT_PARTITION_MANAGER_ID 3
14971 +static struct evms_plugin_header plugin_header = {
14972 + .id = SetPluginID(IBM_OEM_ID,
14973 + EVMS_SEGMENT_MANAGER,
14974 + EVMS_GPT_PARTITION_MANAGER_ID),
14980 + .required_services_version = {
14988 +/***************************************************/
14989 +/* List Support - Typedefs, Variables, & Functions */
14990 +/***************************************************/
14994 +struct segment_list_node {
14995 + struct evms_logical_node *segment;
14996 + struct segment_list_node *next;
14999 +struct disk_list_node {
15000 + struct evms_logical_node *disk;
15001 + struct segment_list_node *segment_list;
15002 + struct disk_list_node *next;
15007 +static struct disk_list_node *my_disk_list;
15012 + * Function: Convert a GPT header from disk format to the arch specific
15016 +disk_gpt_header_to_cpu(struct gpt_header *gh)
15018 + gh->signature = le64_to_cpu(gh->signature);
15019 + gh->version = le32_to_cpu(gh->version);
15020 + gh->size = le32_to_cpu(gh->size);
15021 + gh->crc = le32_to_cpu(gh->crc);
15022 + gh->reserve = le32_to_cpu(gh->reserve);
15023 + gh->my_lba = le64_to_cpu(gh->my_lba);
15024 + gh->alternate_lba = le64_to_cpu(gh->alternate_lba);
15025 + gh->start_useable = le64_to_cpu(gh->start_useable);
15026 + gh->end_useable = le64_to_cpu(gh->end_useable);
15027 + gh->disk_id.time_low = le32_to_cpu(gh->disk_id.time_low);
15028 + gh->disk_id.time_mid = le16_to_cpu(gh->disk_id.time_mid);
15029 + gh->disk_id.time_high = le16_to_cpu(gh->disk_id.time_high);
15030 + gh->ptable_lba = le64_to_cpu(gh->ptable_lba);
15031 + gh->ptable_count = le32_to_cpu(gh->ptable_count);
15032 + gh->ptable_entry_size = le32_to_cpu(gh->ptable_entry_size);
15033 + gh->ptable_crc = le32_to_cpu(gh->ptable_crc);
15037 +matching_guids(struct guid *g1, struct guid *g2)
15039 + if ((le32_to_cpu(g1->time_low) == g2->time_low) &&
15040 + (le16_to_cpu(g1->time_mid) == g2->time_mid) &&
15041 + (le16_to_cpu(g1->time_high) == g2->time_high) &&
15042 + (g1->clock_seq_high == g2->clock_seq_high) &&
15043 + (g1->clock_seq_low == g2->clock_seq_low)) {
15049 +isa_basic_data_gpt_partition_record(struct gpt_partition *p)
15051 + return (matching_guids(&p->type, &BASIC_DATA_PARTITION));
15054 +isa_legacy_mbr_gpt_partition_record(struct gpt_partition *p)
15056 + return (matching_guids(&p->type, &LEGACY_MBR_PARTITION));
15059 +isa_esp_gpt_partition_record(struct gpt_partition *p)
15061 + return (matching_guids(&p->type, &EFI_SYSTEM_PARTITION));
15064 +isa_gpt_swap_partition_record(struct gpt_partition *p)
15066 + return (matching_guids(&p->type, &GPT_SWAP_PARTITION));
15069 +isa_unused_gpt_partition_record(struct gpt_partition *p)
15071 + return (matching_guids(&p->type, &UNUSED_GPT_PARTITION));
15074 +static struct disk_list_node **
15075 +lookup_disk(struct evms_logical_node *disk)
15077 + struct disk_list_node **ldln;
15079 + ldln = &my_disk_list;
15081 + if ((*ldln)->disk == disk)
15083 + ldln = &(*ldln)->next;
15088 +static struct segment_list_node **
15089 +lookup_segment(struct disk_list_node *disk, struct evms_logical_node *segment)
15091 + struct segment_list_node **lsln;
15093 + lsln = &disk->segment_list;
15095 + if ((*lsln)->segment == segment)
15097 + lsln = &(*lsln)->next;
15102 +static struct evms_logical_node *
15103 +find_segment_on_disk(struct evms_logical_node *disk,
15104 + u64 start_sect, u64 nr_sects)
15106 + struct evms_logical_node *rc = NULL;
15107 + struct disk_list_node **ldln;
15108 + struct segment_list_node **lsln;
15109 + struct gpt_private *gpt_prv;
15111 + ldln = lookup_disk(disk);
15113 + /* disk found in list */
15114 + /* attempt to find segment */
15116 + lsln = &(*ldln)->segment_list;
15118 + gpt_prv = (*lsln)->segment->private;
15119 + if (gpt_prv->start_sect == start_sect)
15120 + if (gpt_prv->nr_sects == nr_sects)
15122 + lsln = &(*lsln)->next;
15125 + rc = (*lsln)->segment;
15130 +/* function description: add_segment_to_disk
15132 + * this function attempts to add a segment to the segment
15133 + * list of a disk. if the specified disk is not found, it
15134 + * will be added to the global disk list. this function will
15135 + * return a pointer to the matching segment in the disk's
15136 + * segment list. the caller must compare the returned pointer
15137 + * to the specified segment to see if the
15138 + * specified segment was already present in the disk's segment
15139 + * list. if the return pointer matches the specified segment,
15140 + * then the specified segment was added to the list. if the
15141 + * return segment pointer to does not match the specified
15142 + * segment pointer, then the specified segment pointer was
15143 + * a duplicate and can be thrown away.
15146 +add_segment_to_disk(struct evms_logical_node *disk,
15147 + struct evms_logical_node *segment)
15150 + struct disk_list_node **ldln, *new_disk;
15151 + struct segment_list_node **lsln, *new_segment;
15153 + ldln = lookup_disk(disk);
15154 + if (*ldln == NULL) {
15155 + /* disk not in list, add disk */
15156 + new_disk = kmalloc(sizeof (*new_disk), GFP_KERNEL);
15158 + memset(new_disk, 0, sizeof (*new_disk));
15159 + new_disk->disk = disk;
15160 + *ldln = new_disk;
15166 + /* attempt to add segment */
15167 + lsln = lookup_segment(*ldln, segment);
15168 + if (*lsln == NULL) {
15169 + /* segment not in list, add segment */
15171 + kmalloc(sizeof (*new_segment), GFP_KERNEL);
15172 + if (new_segment) {
15173 + memset(new_segment, 0, sizeof (*new_segment));
15174 + new_segment->segment = segment;
15175 + *lsln = new_segment;
15186 +remove_segment_from_disk(struct evms_logical_node *disk,
15187 + struct evms_logical_node *segment,
15188 + struct evms_logical_node **empty_disk)
15191 + struct disk_list_node **ldln, *tmp_disk_node;
15192 + struct segment_list_node **lsln, *tmp_segment_node;
15194 + *empty_disk = NULL;
15195 + ldln = lookup_disk(disk);
15196 + if (*ldln == NULL) {
15199 + /* disk found in list */
15200 + /* attempt to add segment */
15201 + lsln = lookup_segment(*ldln, segment);
15202 + if (*lsln == NULL) {
15205 + tmp_segment_node = *lsln;
15206 + /* remove segment from list */
15207 + *lsln = (*lsln)->next;
15208 + /* free the segment list node */
15209 + kfree(tmp_segment_node);
15211 + if ((*ldln)->segment_list == NULL) {
15212 + tmp_disk_node = *ldln;
15213 + *empty_disk = tmp_disk_node->disk;
15214 + /* remove disk from list */
15215 + *ldln = (*ldln)->next;
15216 + /* free the disk list node */
15217 + kfree(tmp_disk_node);
15225 + * Function: add_segment
15228 +process_segment(struct evms_logical_node **discover_list,
15229 + struct evms_logical_node *node,
15232 + int type, int part_num, int evms_top_segment)
15234 + struct gpt_private *gpt_prv = NULL;
15235 + struct evms_logical_node *segment;
15238 + segment = find_segment_on_disk(node, start_sect, nr_sects);
15240 + LOG_DETAILS("exporting segment '%s'.\n", segment->name);
15242 + gpt_prv = kmalloc(sizeof (*gpt_prv), GFP_KERNEL);
15244 + gpt_prv->source_disk = node;
15245 + gpt_prv->start_sect = start_sect;
15246 + gpt_prv->nr_sects = nr_sects;
15247 + gpt_prv->type = type;
15248 + rc = evms_cs_allocate_logical_node(&segment);
15253 + segment->plugin = &plugin_header;
15254 + segment->system_id = (unsigned int) type;
15255 + segment->total_vsectors = nr_sects;
15256 + segment->block_size = node->block_size;
15257 + segment->hardsector_size = node->hardsector_size;
15258 + segment->private = gpt_prv;
15259 + segment->flags = node->flags;
15260 + if (evms_top_segment)
15261 + segment->iflags |= EVMS_TOP_SEGMENT;
15262 + strcpy(segment->name, node->name);
15263 + if (GetPluginType(node->plugin->id) ==
15264 + EVMS_SEGMENT_MANAGER) {
15265 + strcat(segment->name, ".");
15267 + sprintf(segment->name + strlen(segment->name), "%d",
15269 + LOG_DETAILS("creating segment '%s'.\n", segment->name);
15270 + rc = add_segment_to_disk(node, segment);
15273 + ("%s: error(%d) adding segment '%s'!\n",
15274 + __FUNCTION__, rc, segment->name);
15277 + MOD_INC_USE_COUNT;
15284 + evms_cs_deallocate_logical_node(segment);
15288 + evms_cs_add_logical_node_to_list(discover_list, segment);
15289 + exported_nodes++;
15295 +print_mem(void *buffer, int length)
15298 + unsigned char *bufptr;
15300 + bufptr = (unsigned char *) buffer;
15303 + if ((i % 16) == 0)
15304 + printk(KERN_INFO "\n0x%p->", buffer + i);
15305 + printk(KERN_INFO "%02x ", bufptr[i]);
15306 + if (++i >= length)
15309 + printk(KERN_INFO "\n");
15313 + * Function: get GPT Partition Table - reads partition table
15314 + * into memory and performs crc check.
15317 +static struct gpt_partition *
15318 +get_gpt_partition_table(struct evms_logical_node *node, struct gpt_header *gh)
15321 + struct gpt_partition *pt;
15322 + u32 sector_count, calculated_crc;
15325 + evms_cs_size_in_vsectors(gh->ptable_count * gh->ptable_entry_size);
15327 + pt = kmalloc(sector_count * EVMS_VSECTOR_SIZE, GFP_KERNEL);
15330 + rc = INIT_IO(node, 0, gh->ptable_lba, sector_count, pt);
15333 + calculated_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC,
15338 + ptable_entry_size);
15340 + if (~calculated_crc != gh->ptable_crc) {
15359 + * Function: Validate GPT Header - runs basic checks to
15360 + * sanity check a gpt header.
15364 +isa_valid_gpt_header(struct evms_logical_node *node, u64 lsn,
15365 + struct gpt_header *gh)
15368 + u32 calculated_crc;
15369 + u64 sector_count;
15372 + if (le64_to_cpu(gh->signature) != GPT_DISKMAGIC)
15376 + crc = le32_to_cpu(gh->crc);
15379 + ~(evms_cs_calculate_crc(EVMS_INITIAL_CRC, gh, le32_to_cpu(gh->size)));
15380 + gh->crc = cpu_to_le32(crc);
15382 + if (calculated_crc != crc)
15385 + /* spec says lba reported by header must match actual location on disk */
15386 + if (lsn != le64_to_cpu(gh->my_lba))
15389 + /* sanity check partition table info found in header */
15390 + if (gh->ptable_count == 0 || gh->ptable_entry_size == 0)
15394 + evms_cs_size_in_vsectors(le64_to_cpu(gh->ptable_count) *
15395 + le64_to_cpu(gh->ptable_entry_size));
15397 + if ((le64_to_cpu(gh->ptable_lba) + sector_count - 1) >=
15398 + node->total_vsectors - 1)
15405 + * Function: get GPT Partition Table Header
15408 +static struct gpt_header *
15409 +get_gpt_header(struct evms_logical_node *node, u64 lsn)
15412 + struct gpt_header *gh = NULL;
15414 + gh = kmalloc(EVMS_VSECTOR_SIZE, GFP_KERNEL);
15416 + rc = INIT_IO(node, 0, lsn, 1, gh);
15418 + if (isa_valid_gpt_header(node, lsn, gh)) {
15419 + disk_gpt_header_to_cpu(gh);
15435 + * Function: Get GPT Information
15439 +get_gpt_info(struct evms_logical_node *node,
15440 + struct gpt_header **gh, struct gpt_partition **ptable)
15442 + struct gpt_header *gh1 = NULL, *gh2 = NULL;
15447 + gh1 = get_gpt_header(node, 1); // offset past protective mbr
15451 + gh2 = get_gpt_header(node, gh1->alternate_lba);
15456 + ("alternate guid partition table header is invalid, using primary copy.\n");
15458 + gh2 = get_gpt_header(node, node->total_vsectors - 1);
15462 + ("primary guid partition table header is invalid, using alternate copy\n");
15464 + LOG_DETAILS("no gpt header discovered on node %s\n",
15470 + *ptable = get_gpt_partition_table(node, *gh);
15481 + * Function: Probe for GPT segments on logical node
15485 +probe_for_segments(struct evms_logical_node **discover_list,
15486 + struct evms_logical_node *node)
15489 + int nextminor = 1;
15490 + int evms_top_segment;
15493 + struct gpt_header *gh = NULL;
15494 + struct gpt_partition *ptable = NULL;
15495 + struct gpt_partition *part = NULL;
15497 + /* no need to inspect our own nodes */
15498 + if (node->plugin->id == plugin_header.id)
15501 + /* nor nodes marked as EVMS_TOP_SEGMENT */
15502 + if (node->iflags & EVMS_TOP_SEGMENT)
15505 + /* look for guid partition table & header */
15506 + if (!get_gpt_info(node, &gh, &ptable)) {
15514 + /* walk the guid partition table, producing segment storage objects */
15515 + for (i = 0, part = ptable; i < gh->ptable_count; i++, part++) {
15517 + if (!isa_unused_gpt_partition_record(part)) {
15519 + pstart = le64_to_cpu(part->start);
15520 + pend = le64_to_cpu(part->end);
15523 + ("gpt partition start="PFU64" end="PFU64"\n",
15524 + pstart, (pend - pstart + 1));
15526 + /* stop other seg mgrs from recursive discovery on a gpt system partition */
15527 + if (isa_esp_gpt_partition_record(part))
15528 + evms_top_segment = 1;
15530 + evms_top_segment = 0;
15532 + rc = process_segment(discover_list,
15535 + (pend - pstart + 1),
15536 + 0, nextminor, evms_top_segment);
15545 + /* remove node we just consumed */
15546 + evms_cs_remove_logical_node_from_list(discover_list, node);
15554 + * Function: partition_discover
15558 +partition_discover(struct evms_logical_node **discover_list)
15561 + struct evms_logical_node *node, *next_node;
15563 + MOD_INC_USE_COUNT;
15564 + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
15566 + /* initialize global variable */
15567 + exported_nodes = 0;
15569 + /* examine each node on the discover list */
15570 + next_node = *discover_list;
15571 + while (next_node) {
15572 + node = next_node;
15573 + next_node = node->next;
15574 + probe_for_segments(discover_list, node);
15577 + LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
15578 + __FUNCTION__, exported_nodes, rc);
15579 + if (exported_nodes)
15580 + rc = exported_nodes;
15581 + MOD_DEC_USE_COUNT;
15586 + * Function: partition_delete
15590 +partition_delete(struct evms_logical_node *segment)
15593 + struct gpt_private *gpt_prv;
15594 + struct evms_logical_node *empty_disk = NULL;
15596 + LOG_DETAILS("deleting segment '%s'.\n", segment->name);
15601 + gpt_prv = segment->private;
15603 + /* remove the segment from the
15604 + * disk's segment list
15606 + rc = remove_segment_from_disk(gpt_prv->source_disk,
15607 + segment, &empty_disk);
15608 + /* free the local instance data */
15611 + /* free the segment node */
15612 + evms_cs_deallocate_logical_node(segment);
15613 + MOD_DEC_USE_COUNT;
15614 + /* if the last segment on the disk was
15615 + * deleted, delete the disk node too
15618 + DELETE(empty_disk);
15624 + * function: partition_io_error
15626 + * this function was primarily created because the function
15627 + * buffer_IO_error is inline and kgdb doesn't allow breakpoints
15628 + * to be set on inline functions. Since this was an error path
15629 + * and not mainline, I decided to add a trace statement to help
15630 + * report on the failing condition.
15634 +partition_io_error(struct evms_logical_node *node, int io_flag,
15635 + struct buffer_head *bh)
15638 + ("attempt to %s beyond partition boundary("PFU64") on (%s), rsector(%ld).\n",
15639 + (io_flag) ? "WRITE" : "READ", node->total_vsectors - 1, node->name,
15642 + bh->b_end_io(bh, 0);
15646 + * Function: partition_read
15650 +partition_read(struct evms_logical_node *partition, struct buffer_head *bh)
15652 + struct gpt_private *gpt_prv = partition->private;
15654 + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
15655 + partition->total_vsectors) {
15656 + bh->b_rsector += gpt_prv->start_sect;
15657 + R_IO(gpt_prv->source_disk, bh);
15659 + partition_io_error(partition, READ, bh);
15663 + * Function: partition_write
15667 +partition_write(struct evms_logical_node *partition, struct buffer_head *bh)
15669 + struct gpt_private *gpt_prv = partition->private;
15671 + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
15672 + partition->total_vsectors) {
15673 + bh->b_rsector += gpt_prv->start_sect;
15674 + W_IO(gpt_prv->source_disk, bh);
15676 + partition_io_error(partition, WRITE, bh);
15680 + * Function: partition_init_io
15684 +partition_init_io(struct evms_logical_node *partition, int io_flag, /* 0=read, 1=write */
15685 + u64 sect_nr, /* disk LBA */
15686 + u64 num_sects, /* # of sectors */
15688 +{ /* buffer address */
15690 + struct gpt_private *gpt_prv = partition->private;
15692 + if ((sect_nr + num_sects) <= partition->total_vsectors) {
15693 + rc = INIT_IO(gpt_prv->source_disk, io_flag,
15694 + sect_nr + gpt_prv->start_sect, num_sects,
15698 + ("init_io: attempt to %s beyond partition(%s) boundary("PFU64") at sector("PFU64") for count("PFU64").\n",
15699 + (io_flag) ? "WRITE" : "READ", partition->name,
15700 + (gpt_prv->nr_sects - 1), sect_nr, num_sects);
15708 + * Function: partition_ioctl
15712 +partition_ioctl(struct evms_logical_node *partition,
15713 + struct inode *inode,
15714 + struct file *file, unsigned int cmd, unsigned long arg)
15716 + struct gpt_private *gpt_prv;
15717 + struct hd_geometry hd_geo;
15721 + gpt_prv = partition->private;
15725 + case HDIO_GETGEO:
15727 + rc = IOCTL(gpt_prv->source_disk, inode, file, cmd, arg);
15730 + if (copy_from_user
15731 + (&hd_geo, (void *) arg,
15732 + sizeof (struct hd_geometry)))
15736 + hd_geo.start = gpt_prv->start_sect;
15738 + ((void *) arg, &hd_geo,
15739 + sizeof (struct hd_geometry)))
15743 + case EVMS_GET_BMAP:
15745 + struct evms_get_bmap_pkt *bmap =
15746 + (struct evms_get_bmap_pkt *) arg;
15747 + bmap->rsector += gpt_prv->start_sect;
15748 + /* intentionally fall thru to
15749 + * default ioctl down to device
15754 + rc = IOCTL(gpt_prv->source_disk, inode, file, cmd, arg);
15760 + * Function: gpt_module_init
15764 +gpt_module_init(void)
15766 + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
15770 + * Function: gpt module exit
15772 +static void __exit
15773 +gpt_module_exit(void)
15775 + evms_cs_unregister_plugin(&plugin_header);
15778 +module_init(gpt_module_init);
15779 +module_exit(gpt_module_exit);
15780 +#ifdef MODULE_LICENSE
15781 +MODULE_LICENSE("GPL");
15783 diff -Naur linux-2002-09-30/drivers/evms/ldev_mgr.c evms-2002-09-30/drivers/evms/ldev_mgr.c
15784 --- linux-2002-09-30/drivers/evms/ldev_mgr.c Wed Dec 31 18:00:00 1969
15785 +++ evms-2002-09-30/drivers/evms/ldev_mgr.c Fri Sep 13 16:45:06 2002
15787 +/* -*- linux-c -*- */
15790 + * Copyright (c) International Business Machines Corp., 2000
15792 + * This program is free software; you can redistribute it and/or modify
15793 + * it under the terms of the GNU General Public License as published by
15794 + * the Free Software Foundation; either version 2 of the License, or
15795 + * (at your option) any later version.
15797 + * This program is distributed in the hope that it will be useful,
15798 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
15799 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15800 + * the GNU General Public License for more details.
15802 + * You should have received a copy of the GNU General Public License
15803 + * along with this program; if not, write to the Free Software
15804 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
15807 +/* linux/driver/evms/ldev_mgr.c
15809 + * EVMS - Local Device (Hard Drive) Manager
15811 + * This plugin walks the gendisk list and creates logical disk structures for each
15812 + * local ide or scsi device.
15816 +#include <linux/config.h>
15817 +#include <linux/module.h>
15818 +#include <linux/errno.h>
15819 +#include <linux/kernel.h>
15820 +#include <linux/fs.h>
15821 +#include <linux/slab.h>
15822 +#include <asm/uaccess.h>
15823 +#include <linux/blk.h> /* must be included by all block drivers */
15824 +#include <linux/genhd.h>
15825 +#include <linux/ide.h>
15826 +#include <linux/version.h>
15827 +#include "../scsi/scsi.h"
15828 +#include "../scsi/sd.h"
15829 +#include <linux/init.h>
15830 +#include <linux/evms/evms.h>
15831 +#include <linux/evms/ldev_mgr.h>
15833 +#define LOG_PREFIX "ldev_mgr: "
15835 +#define EVMS_LOCAL_DEVICE_MANAGER_ID 1
15838 + * struct ldev_private - private data used by this plugin
15839 + * @major: major device number
15840 + * @minor: minor device number
15841 + * @bdev: block_device record for this device
15842 + * @gd: gendisk entry for this device
15843 + * @media_changed: media changed status field
15845 + * private data maintained for each device by this plugin
15847 +struct ldev_private {
15848 + int major, minor;
15849 + struct block_device *bdev;
15850 + struct gendisk *gd;
15851 + int media_changed;
15854 +/* prototypes for mandatory plugin interface functions */
15855 +static int discover_disks(struct evms_logical_node **);
15856 +static int ldev_mgr_delete(struct evms_logical_node *);
15857 +static void ldev_mgr_read(struct evms_logical_node *, struct buffer_head *);
15858 +static void ldev_mgr_write(struct evms_logical_node *, struct buffer_head *);
15859 +static int ldev_mgr_ioctl(struct evms_logical_node *,
15861 + struct file *, unsigned int, unsigned long);
15862 +static int ldev_init_io(struct evms_logical_node *,
15863 + int, u64, u64, void *);
15864 +static int ldev_mgr_direct_ioctl(struct inode *,
15865 + struct file *, unsigned int, unsigned long);
15867 +/* plugin function table definition */
15868 +static struct evms_plugin_fops fops = {
15869 + .discover = discover_disks,
15870 + .delete = ldev_mgr_delete,
15871 + .read = ldev_mgr_read,
15872 + .write = ldev_mgr_write,
15873 + .init_io = ldev_init_io,
15874 + .ioctl = ldev_mgr_ioctl,
15875 + .direct_ioctl = ldev_mgr_direct_ioctl
15878 +/* plugin header definition */
15879 +static struct evms_plugin_header plugin_header = {
15880 + .id = SetPluginID(IBM_OEM_ID,
15881 + EVMS_DEVICE_MANAGER,
15882 + EVMS_LOCAL_DEVICE_MANAGER_ID),
15888 + .required_services_version = {
15896 +#define TYPE_NONE 0
15897 +#define TYPE_GENERIC 1
15898 +#define TYPE_IDE 2
15899 +#define TYPE_SCSI 3
15901 +#define INDEX_ALPHA 0
15902 +#define INDEX_NUMERIC 1
15904 +/********************************************************/
15905 +/* Required Plugin Function Table Entry Point: */
15906 +/* Discover function & Support routines */
15907 +/********************************************************/
15909 +#define MAX_NAME_BASE_SIZE 10
15910 +#define MAX_NAME_MODIFIER_SIZE 4
15912 + * struct blk_device_info - block device info
15913 + * @devnode_name_base: base name (ie. hd or sd) for device
15914 + * @null1: guaranteed end-of-string NULL
15915 + * @devnode_name_modifier: name suffix (ie. ag for sdag) for device
15916 + * @null2: guaranteed end-of-string NULL
15917 + * @devnode_name_index: numeric device index (ie. 1 for hda1)
15918 + * @devnode_name_type: indicates numeric or alpha modifier
15919 + * @devnode_type: device type, IDE, SCSI, or GENERIC
15921 + * generic block device naming descriptor structure
15923 +struct blk_device_info {
15924 + char devnode_name_base[MAX_NAME_BASE_SIZE];
15926 + char devnode_name_modifier[MAX_NAME_MODIFIER_SIZE];
15928 + int devnode_name_index;
15929 + int devnode_name_type;
15933 +static struct blk_device_info *blk_dev_info = NULL;
15935 +#define BLK_DEV_INFO(a,b,c,d,e) \
15936 + strncpy(blk_dev_info[a].devnode_name_base, b, MAX_NAME_BASE_SIZE); \
15937 + blk_dev_info[a].null1 = 0; \
15938 + strncpy(blk_dev_info[a].devnode_name_modifier, c, MAX_NAME_MODIFIER_SIZE); \
15939 + blk_dev_info[a].null2 = 0; \
15940 + blk_dev_info[a].devnode_name_index = 0; \
15941 + blk_dev_info[a].device_type = d; \
15942 + blk_dev_info[a].devnode_name_type = e;
15945 +init_blk_dev_info(struct blk_device_info *blk_dev_info)
15947 + BLK_DEV_INFO(IDE0_MAJOR, "hd", "a", TYPE_IDE, INDEX_ALPHA);
15948 + BLK_DEV_INFO(IDE1_MAJOR, "hd", "c", TYPE_IDE, INDEX_ALPHA);
15949 + BLK_DEV_INFO(IDE2_MAJOR, "hd", "e", TYPE_IDE, INDEX_ALPHA);
15950 + BLK_DEV_INFO(IDE3_MAJOR, "hd", "g", TYPE_IDE, INDEX_ALPHA);
15951 + BLK_DEV_INFO(IDE4_MAJOR, "hd", "i", TYPE_IDE, INDEX_ALPHA);
15952 + BLK_DEV_INFO(IDE5_MAJOR, "hd", "k", TYPE_IDE, INDEX_ALPHA);
15953 + BLK_DEV_INFO(IDE6_MAJOR, "hd", "m", TYPE_IDE, INDEX_ALPHA);
15954 + BLK_DEV_INFO(IDE7_MAJOR, "hd", "o", TYPE_IDE, INDEX_ALPHA);
15955 + BLK_DEV_INFO(IDE8_MAJOR, "hd", "q", TYPE_IDE, INDEX_ALPHA);
15956 + BLK_DEV_INFO(IDE9_MAJOR, "hd", "s", TYPE_IDE, INDEX_ALPHA);
15958 + BLK_DEV_INFO(SCSI_DISK0_MAJOR, "sd", "a", TYPE_SCSI, INDEX_ALPHA);
15959 + BLK_DEV_INFO(SCSI_DISK1_MAJOR, "sd", "q", TYPE_SCSI, INDEX_ALPHA);
15960 + BLK_DEV_INFO(SCSI_DISK2_MAJOR, "sd", "ag", TYPE_SCSI, INDEX_ALPHA);
15961 + BLK_DEV_INFO(SCSI_DISK3_MAJOR, "sd", "aw", TYPE_SCSI, INDEX_ALPHA);
15962 + BLK_DEV_INFO(SCSI_DISK4_MAJOR, "sd", "bm", TYPE_SCSI, INDEX_ALPHA);
15963 + BLK_DEV_INFO(SCSI_DISK5_MAJOR, "sd", "cc", TYPE_SCSI, INDEX_ALPHA);
15964 + BLK_DEV_INFO(SCSI_DISK6_MAJOR, "sd", "cs", TYPE_SCSI, INDEX_ALPHA);
15965 + BLK_DEV_INFO(SCSI_DISK7_MAJOR, "sd", "di", TYPE_SCSI, INDEX_ALPHA);
15967 + BLK_DEV_INFO(XT_DISK_MAJOR, "xd", "a", TYPE_GENERIC, INDEX_ALPHA);
15969 + BLK_DEV_INFO(CYCLADES_MAJOR, "double", "0", TYPE_GENERIC,
15972 + BLK_DEV_INFO(MFM_ACORN_MAJOR, "mfm", "a", TYPE_GENERIC, INDEX_ALPHA);
15974 + BLK_DEV_INFO(ACSI_MAJOR, "ad", "a", TYPE_GENERIC, INDEX_ALPHA);
15976 + BLK_DEV_INFO(PS2ESDI_MAJOR, "ed", "a", TYPE_GENERIC, INDEX_ALPHA);
15978 + BLK_DEV_INFO(40, "ez", "a", TYPE_GENERIC, INDEX_ALPHA);
15979 + BLK_DEV_INFO(43, "nb", "0", TYPE_GENERIC, INDEX_NUMERIC);
15980 + BLK_DEV_INFO(44, "ftl", "a", TYPE_GENERIC, INDEX_ALPHA);
15981 + BLK_DEV_INFO(45, "pd", "a", TYPE_GENERIC, INDEX_ALPHA);
15982 + BLK_DEV_INFO(47, "pf", "0", TYPE_GENERIC, INDEX_NUMERIC);
15984 + BLK_DEV_INFO(DAC960_MAJOR + 0, "rd/c0d", "0", TYPE_GENERIC,
15986 + BLK_DEV_INFO(DAC960_MAJOR + 1, "rd/c1d", "0", TYPE_GENERIC,
15988 + BLK_DEV_INFO(DAC960_MAJOR + 2, "rd/c2d", "0", TYPE_GENERIC,
15990 + BLK_DEV_INFO(DAC960_MAJOR + 3, "rd/c3d", "0", TYPE_GENERIC,
15992 + BLK_DEV_INFO(DAC960_MAJOR + 4, "rd/c4d", "0", TYPE_GENERIC,
15994 + BLK_DEV_INFO(DAC960_MAJOR + 5, "rd/c5d", "0", TYPE_GENERIC,
15996 + BLK_DEV_INFO(DAC960_MAJOR + 6, "rd/c6d", "0", TYPE_GENERIC,
15998 + BLK_DEV_INFO(DAC960_MAJOR + 7, "rd/c7d", "0", TYPE_GENERIC,
16001 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR, "ida/c0d", "0", TYPE_GENERIC,
16003 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR1, "ida/c1d", "0", TYPE_GENERIC,
16005 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR2, "ida/c2d", "0", TYPE_GENERIC,
16007 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR3, "ida/c3d", "0", TYPE_GENERIC,
16009 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR4, "ida/c4d", "0", TYPE_GENERIC,
16011 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR5, "ida/c5d", "0", TYPE_GENERIC,
16013 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR6, "ida/c6d", "0", TYPE_GENERIC,
16015 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR7, "ida/c7d", "0", TYPE_GENERIC,
16018 + BLK_DEV_INFO(I2O_MAJOR + 0, "i2o/hd", "a", TYPE_GENERIC, INDEX_ALPHA);
16019 + BLK_DEV_INFO(I2O_MAJOR + 1, "i2o/hd", "q", TYPE_GENERIC, INDEX_ALPHA);
16020 + BLK_DEV_INFO(I2O_MAJOR + 2, "i2o/hd", "ag", TYPE_GENERIC, INDEX_ALPHA);
16021 + BLK_DEV_INFO(I2O_MAJOR + 3, "i2o/hd", "aw", TYPE_GENERIC, INDEX_ALPHA);
16022 + BLK_DEV_INFO(I2O_MAJOR + 4, "i2o/hd", "bm", TYPE_GENERIC, INDEX_ALPHA);
16023 + BLK_DEV_INFO(I2O_MAJOR + 5, "i2o/hd", "cc", TYPE_GENERIC, INDEX_ALPHA);
16024 + BLK_DEV_INFO(I2O_MAJOR + 6, "i2o/hd", "cs", TYPE_GENERIC, INDEX_ALPHA);
16025 + BLK_DEV_INFO(I2O_MAJOR + 7, "i2o/hd", "di", TYPE_GENERIC, INDEX_ALPHA);
16027 + BLK_DEV_INFO(92, "ppdd", "0", TYPE_GENERIC, INDEX_NUMERIC);
16028 + BLK_DEV_INFO(93, "nftl", "a", TYPE_GENERIC, INDEX_ALPHA);
16030 + BLK_DEV_INFO(DASD_MAJOR, "dasd", "a", TYPE_GENERIC, INDEX_ALPHA);
16031 + BLK_DEV_INFO(MDISK_MAJOR, "mdisk", "a", TYPE_GENERIC, INDEX_ALPHA);
16033 + BLK_DEV_INFO(96, "msd", "0", TYPE_GENERIC, INDEX_NUMERIC);
16034 + BLK_DEV_INFO(97, "pktcdvd", "0", TYPE_GENERIC, INDEX_NUMERIC);
16036 + BLK_DEV_INFO(UBD_MAJOR, "ubd", "0", TYPE_GENERIC, INDEX_NUMERIC);
16038 + BLK_DEV_INFO(JSFD_MAJOR, "jsfd", "", TYPE_GENERIC, INDEX_NUMERIC);
16040 + BLK_DEV_INFO(101, "amiraid/ar", "0", TYPE_GENERIC, INDEX_NUMERIC);
16042 + BLK_DEV_INFO(104, "cciss/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16043 + BLK_DEV_INFO(105, "cciss/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16044 + BLK_DEV_INFO(106, "cciss/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16045 + BLK_DEV_INFO(107, "cciss/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16046 + BLK_DEV_INFO(108, "cciss/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16047 + BLK_DEV_INFO(109, "cciss/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16048 + BLK_DEV_INFO(110, "cciss/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16049 + BLK_DEV_INFO(111, "cciss/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16051 + BLK_DEV_INFO(RAW_MAJOR, "raw", "0", TYPE_GENERIC, INDEX_NUMERIC);
16053 + BLK_DEV_INFO(VXVM_MAJOR, "vx/dsk", "0", TYPE_GENERIC, INDEX_NUMERIC);
16054 + BLK_DEV_INFO(VXDMP_MAJOR, "vx/dmp", "0", TYPE_GENERIC, INDEX_NUMERIC);
16055 + BLK_DEV_INFO(LOOP_MAJOR, "loop", "0", TYPE_GENERIC, INDEX_NUMERIC);
16059 +is_in_device_list(struct gendisk *gd, int major, int minor)
16061 + int found, done, rc;
16062 + struct evms_logical_node *device = NULL;
16063 + struct ldev_private *ldev_prv;
16065 + done = found = FALSE;
16066 + while (done == FALSE) {
16067 + rc = evms_cs_find_next_device(device, &device);
16068 + if (rc || !device)
16071 + ldev_prv = device->private;
16072 + if (ldev_prv->gd == gd)
16073 + if (ldev_prv->major == major)
16074 + if (ldev_prv->minor == minor)
16075 + done = found = TRUE;
16082 +build_devnode_name(char *name_buf, int major)
16084 + char buf[11], *modifier, *buf_ptr;
16085 + int int_mod, done;
16086 + struct blk_device_info *bdi;
16088 + bdi = &blk_dev_info[major];
16090 + /* convert the base name modifier to an integer */
16091 + modifier = bdi->devnode_name_modifier;
16093 + while (*modifier) {
16094 + if (bdi->devnode_name_type == INDEX_ALPHA) {
16096 + int_mod += *modifier - 'a';
16099 + int_mod += *modifier - '0';
16106 + /* add in device_index_value */
16107 + int_mod += bdi->devnode_name_index;
16108 + bdi->devnode_name_index++;
16110 + /* convert integer modifier back to ALPHA/NUMERIC chars */
16111 + memset(buf, 0, sizeof (buf));
16112 + /* fill the buffer from the rear to front with the
16113 + * ascii version of the modifier, leaving space for
16114 + * NULL terminator at the end.
16116 + buf_ptr = &buf[sizeof (buf) - 2];
16119 + if (bdi->devnode_name_type == INDEX_ALPHA) {
16120 + *buf_ptr = (int_mod % 26) + 'a';
16123 + *buf_ptr = (int_mod % 10) + '0';
16134 + /* find beginning of modifier in buffer */
16136 + while (!*modifier)
16139 + /* build the final device devnode name */
16140 + sprintf(name_buf, "%s%s", bdi->devnode_name_base, modifier);
16144 +ldev_mgr_lock_device(struct ldev_private *ldev_prv)
16147 + struct block_device *bdev;
16149 + bdev = bdget(MKDEV(ldev_prv->major, ldev_prv->minor));
16152 + rc = blkdev_get(bdev, FMODE_READ | FMODE_WRITE, 0, BDEV_RAW);
16155 + ldev_prv->bdev = bdev;
16160 +ldev_mgr_unlock_device(struct ldev_private *ldev_prv)
16162 + struct block_device *bdev = ldev_prv->bdev;
16163 + ldev_prv->bdev = NULL;
16165 + LOG_ERROR("error: NULL bdev field detected!\n");
16168 + blkdev_put(bdev, BDEV_RAW);
16171 +#define DEVICE_KNOWN 1234
16172 +#define DEVICE_UNINITIALIZED 1235
16173 +#define DEVICE_MEDIA_NOT_PRESENT 1236
16175 +create_logical_disk(struct evms_logical_node **disk_list,
16176 + struct gendisk *gd, int device_index)
16178 + int rc = 0, major, minor;
16179 + struct evms_logical_node *new_disk = NULL;
16180 + struct ldev_private *ldev_prv = NULL;
16181 + char device_name[EVMS_VOLUME_NAME_SIZE + 1];
16183 + major = gd->major;
16184 + minor = device_index << gd->minor_shift;
16186 + /* skip uninitialized devices */
16187 + if (!blk_size[major])
16188 + rc = DEVICE_UNINITIALIZED;
16189 + else if (!blk_size[major][minor])
16190 + rc = DEVICE_UNINITIALIZED;
16192 + /* construct the devnode name for this device */
16193 + build_devnode_name(device_name, major);
16195 + /* skip devices we already know about */
16196 + if (is_in_device_list(gd, major, minor) == TRUE)
16197 + rc = DEVICE_KNOWN;
16199 + /* allocate the new node */
16201 + rc = evms_cs_allocate_logical_node(&new_disk);
16203 + /* allocate new nodes's instance data */
16205 + ldev_prv = kmalloc(sizeof(struct ldev_private), GFP_KERNEL);
16209 + /* initialize the new node */
16211 + memset(ldev_prv, 0, sizeof(struct ldev_private));
16212 + new_disk->plugin = &plugin_header;
16214 + /* initialize the instance data */
16215 + new_disk->private = ldev_prv;
16216 + ldev_prv->gd = gd;
16217 + ldev_prv->major = major;
16218 + ldev_prv->minor = minor;
16219 + rc = ldev_mgr_lock_device(ldev_prv);
16221 + LOG_ERROR("error(%d): unable to lock device(%d,%d)!\n",
16222 + rc, major, minor);
16226 + /* determine hardsector size */
16227 + new_disk->hardsector_size = 512;
16228 + if (hardsect_size[major]) {
16229 + new_disk->hardsector_size = hardsect_size[major][minor];
16231 + /* save the block size */
16232 + new_disk->block_size = 1024;
16233 + if (blksize_size[major]) {
16234 + new_disk->block_size = blksize_size[major][minor];
16236 + /* obtain the device size in sectors
16238 + * try 64bit size first, if that fails
16239 + * fall back on the 32bit size.
16241 + /* try 64bit size */
16242 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,18)
16243 + rc = evms_cs_kernel_ioctl(new_disk, BLKGETSIZE64,
16244 + (ulong) & new_disk->total_vsectors);
16246 + /* convert bytes to 512 byte sectors */
16247 + new_disk->total_vsectors >>= EVMS_VSECTOR_SIZE_SHIFT;
16251 + /* try 32bit size */
16252 + ulong dev_size = 0;
16253 + rc = evms_cs_kernel_ioctl(new_disk, BLKGETSIZE,
16254 + (ulong) & dev_size);
16255 + new_disk->total_vsectors = dev_size;
16257 + if (!rc && !new_disk->total_vsectors) {
16262 + /* remember removable devices */
16264 + if (gd->flags[device_index] & GENHD_FL_REMOVABLE)
16265 + new_disk->flags |= EVMS_DEVICE_REMOVABLE;
16267 + /* save the devnode name for this device */
16268 + strcpy(new_disk->name, device_name);
16270 + /* register this device with evms */
16271 + evms_cs_register_device(new_disk);
16272 + MOD_INC_USE_COUNT;
16274 + /* append this record the linked list */
16275 + evms_cs_add_logical_node_to_list(disk_list, new_disk);
16277 + ("added logical disk(%s) for physical disk(%u,%u,%s), size("PFU64") in 512 byte units\n",
16278 + new_disk->name, major, minor, new_disk->name,
16279 + new_disk->total_vsectors);
16282 + /* reset the "benign" error codes for the caller */
16284 + case DEVICE_UNINITIALIZED:
16285 + case DEVICE_KNOWN:
16286 + case DEVICE_MEDIA_NOT_PRESENT:
16292 + ("error(%d): creating logical disk for device(%d,%d).\n",
16293 + rc, major, minor);
16295 + evms_cs_deallocate_logical_node(new_disk);
16306 +create_logical_generic_disks(struct evms_logical_node **disk_list,
16307 + struct gendisk *gd)
16311 + /* This is a generic device */
16314 + LOG_DEBUG("major name = %s\n", gd->major_name);
16315 + LOG_DEBUG("number of real devices = %i\n", gd->nr_real);
16316 + for (i = 0; i < gd->nr_real; i++) {
16317 + LOG_DEBUG("device %d:\n", i);
16318 + rc = create_logical_disk(disk_list, gd, i);
16326 +create_logical_ide_disks(struct evms_logical_node **disk_list,
16327 + struct gendisk *gd)
16330 + ide_hwif_t *ide_hwif;
16331 + ide_drive_t *drive;
16333 + /* This is an IDE device */
16334 + LOG_DEBUG("found IDE major : %i - searching for disks\n", gd->major);
16336 + ide_hwif = gd->real_devices; /* IDE internal data */
16337 + for (i = 0; i < MAX_DRIVES; i++) {
16338 + drive = &(ide_hwif->drives[i]);
16339 + if (drive->present && (drive->media == ide_disk)) {
16340 + /* force the name index value on ide drives */
16341 + blk_dev_info[gd->major].devnode_name_index = i;
16342 + rc = create_logical_disk(disk_list, gd, i);
16351 +create_logical_scsi_disks(struct evms_logical_node **disk_list,
16352 + struct gendisk *gd)
16355 + Scsi_Disk *SDisks;
16356 + Scsi_Device *SDev;
16358 + /* This is an SCSI device */
16359 + LOG_DEBUG("found SCSI major : %i - searching for disks\n", gd->major);
16360 + LOG_DEBUG("scsi: major name = %s\n", gd->major_name);
16361 + LOG_DEBUG("scsi: number of real devices = %i\n", gd->nr_real);
16362 + SDisks = gd->real_devices; /* SCSI internal data */
16363 + for (i = 0; i < gd->nr_real; i++) {
16364 + SDev = SDisks[i].device;
16366 + ("scsi: Channel = %i, Id = %i, Lun = %i, Capacity = %i\n",
16367 + SDev->channel, SDev->id, SDev->lun, SDisks[i].capacity);
16368 + rc = create_logical_disk(disk_list, gd, i);
16376 +create_logical_disks(struct gendisk *gd, void *p_disk_list)
16379 + struct evms_logical_node **disk_list = p_disk_list;
16381 + /* create logical disks from all IDE & SCSI devices */
16382 + switch (blk_dev_info[gd->major].device_type) {
16384 + rc = create_logical_ide_disks(disk_list, gd);
16387 + rc = create_logical_scsi_disks(disk_list, gd);
16389 + case TYPE_GENERIC:
16390 + rc = create_logical_generic_disks(disk_list, gd);
16393 + LOG_DEBUG("unrecognized device major : %i\n", gd->major);
16401 +discover_disks(struct evms_logical_node **disk_list)
16405 + MOD_INC_USE_COUNT;
16406 + LOG_ENTRY_EXIT("%s Entry\n", __FUNCTION__);
16408 + if (blk_dev_info == NULL) {
16409 + /* allocate space for device info array */
16410 + blk_dev_info = kmalloc(sizeof (struct blk_device_info)
16411 + * (MAX_BLKDEV + 1), GFP_KERNEL);
16412 + if (blk_dev_info) {
16413 + /* initialize device info array */
16414 + memset(blk_dev_info, 0,
16415 + sizeof (struct blk_device_info) * (MAX_BLKDEV + 1));
16416 + init_blk_dev_info(blk_dev_info);
16422 + /* create logical disks from the raw devices */
16423 + rc = walk_gendisk(create_logical_disks, disk_list);
16425 + /* free blk_dev_info table and null the ptr to it */
16426 + kfree(blk_dev_info);
16427 + blk_dev_info = NULL;
16429 + LOG_ENTRY_EXIT("%s Exit\n", __FUNCTION__);
16430 + MOD_DEC_USE_COUNT;
16434 +/********************************************************/
16435 +/* Required Plugin Function Table Entry Point: */
16436 +/* Delete function */
16437 +/********************************************************/
16440 +ldev_mgr_delete(struct evms_logical_node *disk)
16442 + struct ldev_private *ldev_prv;
16444 + /* reset any evms volume related info from
16445 + * the device node, because we can't predict
16446 + * how this node will be used in the future.
16449 + /* removed the feature header if its been used
16451 + if (disk->feature_header) {
16452 + kfree(disk->feature_header);
16453 + disk->feature_header = NULL;
16455 + /* remove the volume_info structure and flag
16456 + * if this has been used directly by an evms
16459 + evms_cs_deallocate_volume_info(disk);
16460 + /* reset the flags field to the appropriate state
16462 + disk->flags &= ~EVMS_VOLUME_FLAG;
16464 + /* disk nodes only get deleted when:
16465 + * 1) there are no references to the disk node
16467 + * 2) the device is removable
16468 + * 3) the device reported a media change
16470 + * All three of these conditions must be true
16471 + * before the disk node can be deleted.
16472 + * evms_check_for_device_changes should set
16473 + * and ensure these conditions before issuing
16476 + * Newly installed removable media will be
16477 + * picked up in this modules discover code.
16479 + * OR disk nodes can will be deleted if the
16480 + * devices they represent go away, for example
16481 + * in the case of a hotunplugged device or a
16482 + * required driver having been unloaded.
16484 + if (disk->flags & (EVMS_MEDIA_CHANGED | EVMS_DEVICE_UNAVAILABLE)) {
16485 + LOG_DETAILS("deleting '%s'.\n", disk->name);
16487 + evms_cs_unregister_device(disk);
16488 + MOD_DEC_USE_COUNT;
16489 + ldev_prv = disk->private;
16490 + ldev_mgr_unlock_device(ldev_prv);
16494 + evms_cs_deallocate_logical_node(disk);
16499 +/********************************************************/
16500 +/* Required Plugin Function Table Entry Point: */
16501 +/* Read function */
16502 +/********************************************************/
16505 + * function: ldev_mgr_io_error
16507 + * this function was primarily created because the function
16508 + * buffer_IO_error is inline and kgdb doesn't allow breakpoints
16509 + * to be set on inline functions. Since this was an error path
16510 + * and not mainline, I decided to add a trace statement to help
16511 + * report on the failing condition.
16515 +ldev_mgr_io_error(struct evms_logical_node *disk, int io_flag, struct buffer_head *bh, int rc)
16517 + if (rc == -EOVERFLOW) {
16519 + ("attempt to %s beyond boundary("PFU64") on (%s), rsector(%ld).\n",
16520 + (io_flag) ? "WRITE" : "READ", disk->total_vsectors - 1,
16521 + disk->name, bh->b_rsector);
16522 + } else if (rc == -ENXIO) {
16523 + LOG_SERIOUS("attempt to access a non-existent device(%s).\n",
16526 + bh->b_end_io(bh, 0);
16529 +/********************************************************/
16530 +/* Required Plugin Function Table Entry Point: */
16531 +/* Read function */
16532 +/********************************************************/
16535 +ldev_mgr_read(struct evms_logical_node *disk, struct buffer_head *bh)
16538 + request_queue_t *q;
16539 + struct ldev_private *ldev_prv;
16541 + ldev_prv = disk->private;
16542 + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <=
16543 + disk->total_vsectors) {
16544 + bh->b_rdev = MKDEV(ldev_prv->major, ldev_prv->minor);
16545 + q = blk_get_queue(bh->b_rdev);
16547 + disk->flags &= ~EVMS_DEVICE_UNAVAILABLE;
16548 + q->make_request_fn(q, READ, bh);
16552 + disk->flags |= EVMS_DEVICE_UNAVAILABLE;
16558 + ldev_mgr_io_error(disk, READ, bh, rc);
16562 +/********************************************************/
16563 +/* Required Plugin Function Table Entry Point: */
16564 +/* Write function */
16565 +/********************************************************/
16568 +ldev_mgr_write(struct evms_logical_node *disk, struct buffer_head *bh)
16571 + request_queue_t *q;
16572 + struct ldev_private *ldev_prv;
16574 + ldev_prv = disk->private;
16575 + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <=
16576 + disk->total_vsectors) {
16577 + bh->b_rdev = MKDEV(ldev_prv->major, ldev_prv->minor);
16578 + q = blk_get_queue(bh->b_rdev);
16580 + disk->flags &= ~EVMS_DEVICE_UNAVAILABLE;
16581 + q->make_request_fn(q, WRITE, bh);
16585 + disk->flags |= EVMS_DEVICE_UNAVAILABLE;
16591 + ldev_mgr_io_error(disk, WRITE, bh, rc);
16595 +/********************************************************/
16596 +/* Required Plugin Function Table Entry Point: */
16597 +/* Init_io function & Support routines */
16598 +/********************************************************/
16601 + * function: allocate_bh
16603 + * This function obtains a buffer head from the private
16604 + * buffer head pool (pre-allocated at EVMS initial
16605 + * discovery time).
16607 + * NOTE: All access to the buffer head pool are protected
16608 + * by a private spinlock.
16611 +static inline struct buffer_head *
16614 + struct buffer_head *bh =
16615 + evms_cs_allocate_from_pool(evms_bh_pool, FALSE);
16617 + init_waitqueue_head(&bh->b_wait);
16623 + * function: deallocate_bh
16625 + * This function returns a buffer head to the private
16626 + * buffer head pool (pre-allocated at EVMS initial
16627 + * discovery time).
16629 + * NOTE: All access to the buffer head pool are protected
16630 + * by a private spinlock.
16633 +static inline void
16634 +deallocate_bh(struct buffer_head *bh)
16636 + evms_cs_deallocate_to_pool(evms_bh_pool, bh);
16639 +/* this is the buffer head control block structure definition */
16640 +typedef struct bh_cb_s {
16642 + atomic_t blks_allocated;
16643 + wait_queue_head_t cb_wait;
16647 + * function: __wait_on_bh_cb
16649 + * This is a worker function to wait_on_bh_cb.
16650 + * This function waits for a set of private buffer heads
16651 + * associated to the specified buffer head control block
16652 + * to return from I/O completion. On completion of the
16653 + * last buffer head, the calling function is awakened
16654 + * and continues running.
16656 + * This is the worker function to the function wait_on_bh_cb.
16660 +__wait_on_bh_cb(bh_cb_t * bh_cb)
16662 + struct task_struct *tsk = current;
16663 + DECLARE_WAITQUEUE(wait, tsk);
16665 + add_wait_queue(&bh_cb->cb_wait, &wait);
16667 + run_task_queue(&tq_disk);
16668 + set_task_state(tsk, TASK_UNINTERRUPTIBLE);
16669 + if (!atomic_read(&bh_cb->blks_allocated))
16672 + } while (atomic_read(&bh_cb->blks_allocated));
16673 +#ifdef O1_SCHEDULER
16674 + set_task_state(tsk, TASK_RUNNING);
16676 + tsk->state = TASK_RUNNING;
16678 + remove_wait_queue(&bh_cb->cb_wait, &wait);
16682 + * function: wait_on_bh_cb
16684 + * This function waits for a set of private buffer heads
16685 + * associated to the specified buffer head control block
16686 + * to return from I/O completion. On completion of the
16687 + * last buffer head, the calling function is awakened
16688 + * and continues running.
16692 +wait_on_bh_cb(bh_cb_t * bh_cb)
16694 + if (atomic_read(&bh_cb->blks_allocated))
16695 + __wait_on_bh_cb(bh_cb);
16697 + /* if we ended up with no buffer heads on
16698 + * this pass, lets wait a until a few buffer
16699 + * heads have been freed and try again. This
16700 + * should provide a reasonable delay.
16706 + * function: end_bh_cb_io
16708 + * This is the I/O completion function that is called for
16709 + * each private buffer head obtained from the buffer head
16710 + * pool. Control is return thru this routine so we can track
16711 + * all outstanding requests to know when to awaken the caller,
16712 + * and to regain control after all I/Os have been performed.
16716 +end_bh_cb_io_sync(struct buffer_head *bh, int uptodate)
16718 + bh_cb_t *bh_cb = (bh_cb_t *) bh->b_private;
16720 + /* record that errors occurred */
16722 + bh_cb->rc = -EIO;
16724 + mark_buffer_uptodate(bh, uptodate);
16725 + unlock_buffer(bh);
16727 + deallocate_bh(bh);
16728 + atomic_dec(&bh_cb->blks_allocated);
16729 + if (!atomic_read(&bh_cb->blks_allocated))
16730 + if (waitqueue_active(&bh_cb->cb_wait))
16731 + wake_up(&bh_cb->cb_wait);
16735 + * function: ldev_partial_sector_init_io
16737 + * This function is a support function for ldev_init_io,
16738 + * which handles the cases of performing I/O to only a part
16739 + * of non-standard sized hardsector. This function is not
16740 + * designed to be called directly, but via ldev_init_io.
16744 +ldev_partial_sector_init_io(struct evms_logical_node *node,
16750 + void *bufptr, unsigned char **sector_buf)
16753 + struct ldev_private *ldev_prv = node->private;
16754 + kdev_t dev = MKDEV(ldev_prv->major, ldev_prv->minor);
16755 + struct buffer_head *bh;
16757 + if (*sector_buf == NULL) {
16758 + /* allocate buffer for incoming sector */
16759 + *sector_buf = kmalloc(node->hardsector_size, GFP_KERNEL);
16760 + if (!*sector_buf)
16763 + /* allocate a buffer head from the pool */
16764 + while ((bh = allocate_bh()) == NULL)
16765 + /* yielding the cpu is playing it
16766 + * safe. it might be wiser to just
16767 + * spin. requires more thought.
16771 + /* set up the buffer head for this sector */
16772 + bh->b_end_io = end_bh_cb_io_sync;
16773 + bh->b_size = node->hardsector_size;
16774 + bh->b_rdev = dev;
16775 + bh->b_rsector = next_lsn - sector_lsn;
16776 + bh->b_data = *sector_buf;
16777 + bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
16779 + set_bit(BH_Dirty, &bh->b_state);
16780 + set_bit(BH_Lock, &bh->b_state);
16781 + set_bit(BH_Req, &bh->b_state);
16782 + set_bit(BH_Mapped, &bh->b_state);
16783 + bh->b_private = (void *) bh_cb;
16784 + atomic_inc(&bh_cb->blks_allocated);
16786 + /* drive the buffer head down */
16787 + /* to the device */
16788 + generic_make_request(READ, bh);
16790 + /* wait for all bh's I/O's to end */
16791 + wait_on_bh_cb(bh_cb);
16793 + /* copy data to/from user */
16794 + if (io_flag != WRITE)
16797 + *sector_buf + (sector_lsn << EVMS_VSECTOR_SIZE_SHIFT),
16798 + io_size << EVMS_VSECTOR_SIZE_SHIFT);
16801 + memcpy(*sector_buf + (sector_lsn << EVMS_VSECTOR_SIZE_SHIFT),
16802 + bufptr, io_size << EVMS_VSECTOR_SIZE_SHIFT);
16804 + /* allocate a buffer head from the pool */
16805 + while ((bh = allocate_bh()) == NULL)
16806 + /* yielding the cpu is playing it
16807 + * safe. it might be wiser to just
16808 + * spin. requires more thought.
16812 + /* set up the buffer head for this sector */
16813 + bh->b_end_io = end_bh_cb_io_sync;
16814 + bh->b_size = node->hardsector_size;
16815 + bh->b_rdev = dev;
16816 + bh->b_rsector = next_lsn - sector_lsn;
16817 + bh->b_data = *sector_buf;
16818 + bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
16820 + set_bit(BH_Dirty, &bh->b_state);
16821 + set_bit(BH_Lock, &bh->b_state);
16822 + set_bit(BH_Req, &bh->b_state);
16823 + set_bit(BH_Mapped, &bh->b_state);
16824 + bh->b_private = (void *) bh_cb;
16825 + atomic_inc(&bh_cb->blks_allocated);
16827 + /* drive the buffer head down */
16828 + /* to the device */
16829 + generic_make_request(WRITE, bh);
16831 + /* wait for all bh's I/O's to end */
16832 + wait_on_bh_cb(bh_cb);
16838 + * function: ldev_init_io
16840 + * This function provides support for synchronous I/O
16841 + * operations to the underlying devices. These I/O
16842 + * operations are NOT buffered in any way including the
16843 + * operating system's buffer cache.
16845 + * This function can work with any hardsector size that
16846 + * is a power of 2.
16848 + * node : logical node of the target logical disk
16849 + * io_flag : 0 = read, 1 = write, 2 = read-a-head
16850 + * starting_lsn : the 0-based (disk relative) logical
16851 + * : (512 byte) sector number (lsn)
16852 + * num_lsns : the total number of lsns in this I/O
16853 + * bufptr : address of the memory to read/write the data
16857 +ldev_init_io(struct evms_logical_node *node,
16859 + u64 starting_lsn, u64 num_lsns, void *bufptr)
16861 + int rc = 0, lsns_per_hardsector, lsns_per_blocksize;
16862 + unchar *sector_buf = NULL, *cur_bufptr;
16863 + u64 next_lsn, remaining_lsns, sector_lsn;
16864 + struct ldev_private *ldev_prv = node->private;
16865 + kdev_t dev = MKDEV(ldev_prv->major, ldev_prv->minor);
16869 + ("%s Entry: Disk(%u,%u), ioflag(%u), start_lsn("PFU64"), num_lsns("PFU64"), bufptr(0x%p)\n",
16870 + __FUNCTION__, ldev_prv->major, ldev_prv->minor, io_flag,
16871 + starting_lsn, num_lsns, bufptr);
16873 + /* check for valid device */
16874 + if (!blk_size[ldev_prv->major][ldev_prv->minor]) {
16875 + node->flags |= EVMS_DEVICE_UNAVAILABLE;
16878 + /* check for 0 length request */
16879 + if (num_lsns == 0) {
16880 + LOG_ERROR("%s: error requesting 0 sectors.\n", __FUNCTION__);
16881 + return (-EINVAL);
16883 + /* check for out of bound request */
16884 + if ((starting_lsn + num_lsns) > node->total_vsectors) {
16886 + ("%s: attempted %s beyond logical disk boundary("PFU64" LSNs), requesting LSN("PFU64"), total LSNs("PFU64").\n",
16887 + __FUNCTION__, (io_flag == WRITE) ? "WRITE" : "READ",
16888 + node->total_vsectors, starting_lsn, num_lsns);
16889 + return (-EINVAL);
16891 + /* check for invalid io_flag value */
16892 + switch (io_flag) {
16893 + case READ: /* read... */
16894 + case WRITE: /* write... */
16895 + case READA: /* reada... */
16898 + return (-EINVAL);
16901 + /* compute some per device info once up-front */
16902 + lsns_per_hardsector = node->hardsector_size / EVMS_VSECTOR_SIZE;
16903 + lsns_per_blocksize = node->block_size / EVMS_VSECTOR_SIZE;
16905 + /* initialize the buffer head control block */
16906 + memset(&bh_cb, 0, sizeof (bh_cb_t));
16907 + init_waitqueue_head(&bh_cb.cb_wait);
16908 + bh_cb.blks_allocated = (atomic_t)ATOMIC_INIT(0);
16910 + /* only update the local copy of variables */
16911 + cur_bufptr = bufptr;
16912 + next_lsn = starting_lsn;
16913 + remaining_lsns = num_lsns;
16915 + /* check for a mid-sector starting offset
16917 + * if found, perform I/O on part of that
16920 + sector_lsn = next_lsn & (lsns_per_hardsector - 1);
16921 + if (sector_lsn) {
16924 + /* determine bytes in IO to this sector */
16925 + io_size = lsns_per_hardsector - sector_lsn;
16926 + if (io_size > remaining_lsns)
16927 + io_size = remaining_lsns;
16929 + /* perform the partial sector io */
16930 + rc = ldev_partial_sector_init_io(node, io_flag, &bh_cb,
16932 + sector_lsn, io_size,
16933 + cur_bufptr, §or_buf);
16936 + /* update progress in local variables */
16937 + cur_bufptr += io_size << EVMS_VSECTOR_SIZE_SHIFT;
16938 + next_lsn += io_size;
16939 + remaining_lsns -= io_size;
16943 + /* continue if no errors found */
16945 + /* perform I/O on all the complete sectors
16946 + * in this request.
16948 + * loop until there are no more complete sectors
16951 + while (remaining_lsns >= lsns_per_hardsector) {
16952 + /* this inner loop attempts to drive as many
16953 + * bytes (in sector size multiples) down to
16954 + * the device as possible using the available
16955 + * buffer heads in the pool.
16957 + while (remaining_lsns >= lsns_per_hardsector) {
16958 + struct buffer_head *bh;
16960 + /* allocate a buffer head from the pool */
16961 + bh = allocate_bh();
16965 + /* set up the buffer head for this I/O */
16966 + bh->b_end_io = end_bh_cb_io_sync;
16968 + (remaining_lsns >= lsns_per_blocksize) ?
16969 + node->block_size : node->hardsector_size;
16970 + bh->b_data = cur_bufptr;
16971 + bh->b_rdev = dev;
16972 + bh->b_rsector = next_lsn;
16973 + bh->b_page = virt_to_page(cur_bufptr); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
16975 + set_bit(BH_Dirty, &bh->b_state);
16976 + set_bit(BH_Lock, &bh->b_state);
16977 + set_bit(BH_Req, &bh->b_state);
16978 + set_bit(BH_Mapped, &bh->b_state);
16979 + bh->b_private = (void *) &bh_cb;
16980 + atomic_inc(&bh_cb.blks_allocated);
16982 + /* drive the buffer head down */
16983 + /* to the device */
16984 + generic_make_request(io_flag, bh);
16986 + /* update progress in local variables */
16987 + cur_bufptr += bh->b_size;
16989 + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
16990 + remaining_lsns -=
16991 + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
16993 + /* wait for all bh's I/O's to end */
16994 + wait_on_bh_cb(&bh_cb);
16998 + /* continue if no errors found */
17000 + /* check for a mid-sector ending offset
17002 + * if found, perform I/O on part of that
17005 + if (remaining_lsns)
17006 + /* perform the partial sector io */
17007 + rc = ldev_partial_sector_init_io(node, io_flag, &bh_cb,
17009 + 0, remaining_lsns,
17013 + /* free the sector buffer if it was allocated */
17015 + kfree(sector_buf);
17017 + /* coalesce return codes */
17020 + LOG_EVERYTHING("%s Exit: rc(%u)\n", __FUNCTION__, rc);
17026 +ldev_mgr_direct_ioctl(struct inode *inode,
17027 + struct file *file, unsigned int cmd, unsigned long arg)
17030 + struct ldev_private *ldev_prv;
17031 + struct evms_plugin_ioctl_pkt tmp, *user_parms;
17032 + struct ldev_plugin_ioctl pi_data;
17033 + struct evms_logical_node *disk;
17035 + MOD_INC_USE_COUNT;
17037 + user_parms = (struct evms_plugin_ioctl_pkt *) arg;
17038 + /* copy user's parameters to kernel space */
17039 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
17043 + /* validate its meant for us */
17044 + if (tmp.feature_id != plugin_header.id) {
17050 + /* copy feature ioctl data to kernel space */
17051 + if (copy_from_user(&pi_data, tmp.feature_ioctl_data,
17052 + sizeof (pi_data))) {
17058 + /* find the disk node specified by the disk_handle */
17059 + int done = FALSE;
17062 + rc = evms_cs_find_next_device(disk,
17072 + DEV_HANDLE_TO_NODE(pi_data.disk_handle)) {
17079 + /* perform feature command */
17080 + ldev_prv = (struct ldev_private *) disk->private;
17081 + switch (tmp.feature_command) {
17083 + case LDEV_MGR_BROADCAST_IOCTL_CMD:
17084 + save_dev = inode->i_rdev;
17086 + MKDEV(ldev_prv->major, ldev_prv->minor);
17087 + rc = ldev_prv->bdev->bd_op->ioctl(inode, file,
17090 + inode->i_rdev = save_dev;
17098 + /* return status value */
17100 + copy_to_user((struct evms_plugin_ioctl_pkt *) arg, &tmp, sizeof (tmp));
17101 + MOD_DEC_USE_COUNT;
17105 +/********************************************************/
17106 +/* Required Plugin Function Table Entry Point: */
17107 +/* IOCTL function & Support routines */
17108 +/********************************************************/
17111 +ldev_mgr_ioctl(struct evms_logical_node *disk,
17112 + struct inode *inode,
17113 + struct file *file, unsigned int cmd, unsigned long arg)
17116 + struct ldev_private *ldev_prv = disk->private;
17118 + struct block_device *save_bdev;
17120 + if (!inode || !disk)
17123 + save_dev = inode->i_rdev;
17124 + inode->i_rdev = MKDEV(ldev_prv->major, ldev_prv->minor);
17125 + save_bdev = inode->i_bdev;
17126 + inode->i_bdev = ldev_prv->bdev;
17127 + /* check device availability */
17128 + if (!blk_get_queue(MKDEV(ldev_prv->major, ldev_prv->minor))) {
17129 + disk->flags |= EVMS_DEVICE_UNAVAILABLE;
17132 + case EVMS_QUIESCE_VOLUME:
17133 + case EVMS_PLUGIN_IOCTL:
17135 + case EVMS_GET_BMAP:
17137 + struct evms_get_bmap_pkt *bmap =
17138 + (struct evms_get_bmap_pkt *) arg;
17139 + bmap->dev = MKDEV(ldev_prv->major, ldev_prv->minor);
17140 + bmap->status = 0;
17143 + case EVMS_OPEN_VOLUME:
17144 + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17147 + rc = ldev_prv->bdev->bd_op->open(inode, file);
17150 + case EVMS_CLOSE_VOLUME:
17151 + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17154 + rc = ldev_prv->bdev->bd_op->release(inode, file);
17157 + case EVMS_CHECK_MEDIA_CHANGE:
17158 + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17161 + /* once we detect that media changed
17162 + * is 'set', don't send any more ioctls
17163 + * down to the device, until the
17164 + * media change has been 'reset' by a
17165 + * revalidate disk ioctl. when already
17166 + * 'set', just return a 1 w/o actually
17167 + * performing another ioctl call to the
17170 + if (ldev_prv->media_changed == TRUE) {
17174 + rc = ldev_prv->bdev->bd_op->
17175 + check_media_change(MKDEV
17176 + (ldev_prv->major,
17177 + ldev_prv->minor));
17179 + ldev_prv->media_changed = TRUE;
17180 + disk->flags |= EVMS_MEDIA_CHANGED;
17184 + case EVMS_REVALIDATE_DISK:
17185 + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17188 + /* don't actually send this ioctl down
17189 + * to the device, until we know that
17190 + * previous check media change ioctl
17193 + * when we do actually send the ioctl
17194 + * down, reset the local media_changed
17197 + if (ldev_prv->media_changed == FALSE)
17199 + rc = ldev_prv->bdev->bd_op->
17201 + (ldev_prv->major, ldev_prv->minor));
17202 + ldev_prv->media_changed = FALSE;
17205 + case EVMS_GET_DISK_LIST:
17206 + rc = evms_cs_add_item_to_list((struct evms_list_node **) arg,
17211 + case EVMS_CHECK_DEVICE_STATUS:
17213 + int *status = (int *) arg;
17214 + *status |= disk->flags;
17217 + case EVMS_UPDATE_DEVICE_INFO:
17218 + /* determine hardsector size */
17219 + disk->hardsector_size = 512;
17220 + if (hardsect_size[ldev_prv->major]) {
17221 + disk->hardsector_size = hardsect_size[ldev_prv->major][ldev_prv->minor];
17223 + /* save the block size */
17224 + disk->block_size = 1024;
17225 + if (blksize_size[ldev_prv->major]) {
17226 + disk->block_size = blksize_size[ldev_prv->major][ldev_prv->minor];
17228 + /* device size in sectors
17230 + * try 64bit size first, if that fails
17231 + * fall back on the 32bit size.
17233 + /* try 64bit size */
17234 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,18)
17235 + rc = evms_cs_kernel_ioctl(disk, BLKGETSIZE64,
17236 + (ulong) & disk->total_vsectors);
17238 + /* convert bytes to 512 byte sectors */
17239 + disk->total_vsectors >>= EVMS_VSECTOR_SIZE_SHIFT;
17243 + /* try 32bit size */
17244 + ulong dev_size = 0;
17245 + rc = evms_cs_kernel_ioctl(disk, BLKGETSIZE,
17246 + (ulong) & dev_size);
17247 + disk->total_vsectors = dev_size;
17251 + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17254 + rc = ldev_prv->bdev->bd_op->ioctl(inode, file, cmd,
17259 + inode->i_bdev = save_bdev;
17260 + inode->i_rdev = save_dev;
17265 +/********************************************************/
17266 +/* Required Module Entry Point: */
17267 +/* ldev_mgr_init */
17268 +/********************************************************/
17271 +ldev_mgr_init(void)
17273 + return evms_cs_register_plugin(&plugin_header);
17276 +static void __exit
17277 +ldev_mgr_exit(void)
17279 + evms_cs_unregister_plugin(&plugin_header);
17282 +module_init(ldev_mgr_init);
17283 +module_exit(ldev_mgr_exit);
17284 +#ifdef MODULE_LICENSE
17285 +MODULE_LICENSE("GPL");
17287 diff -Naur linux-2002-09-30/drivers/evms/lvm_vge.c evms-2002-09-30/drivers/evms/lvm_vge.c
17288 --- linux-2002-09-30/drivers/evms/lvm_vge.c Wed Dec 31 18:00:00 1969
17289 +++ evms-2002-09-30/drivers/evms/lvm_vge.c Fri Sep 13 16:45:06 2002
17291 +/* -*- linux-c -*- */
17293 + * Copyright (c) International Business Machines Corp., 2000
17295 + * This program is free software; you can redistribute it and/or modify
17296 + * it under the terms of the GNU General Public License as published by
17297 + * the Free Software Foundation; either version 2 of the License, or
17298 + * (at your option) any later version.
17300 + * This program is distributed in the hope that it will be useful,
17301 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
17302 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
17303 + * the GNU General Public License for more details.
17305 + * You should have received a copy of the GNU General Public License
17306 + * along with this program; if not, write to the Free Software
17307 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17310 + * linux/drivers/evms/lvm_vge.c
17312 + * EVMS Linux LVM Region Manager
17315 +#define LOG_PREFIX "lvm: "
17317 +#include <linux/kernel.h>
17318 +#include <linux/module.h>
17319 +#include <linux/vmalloc.h>
17320 +#include <linux/version.h>
17321 +#include <asm/uaccess.h>
17323 +#include <linux/evms/evms.h>
17324 +#include <linux/evms/evms_lvm.h>
17326 +/* Plugin API prototypes. */
17327 +static int lvm_discover(struct evms_logical_node ** evms_node_list);
17328 +static int lvm_discover_end(struct evms_logical_node ** evms_node_list);
17329 +static int lvm_delete_node(struct evms_logical_node * logical_node);
17330 +static void lvm_read(struct evms_logical_node * node, struct buffer_head * bh);
17331 +static void lvm_write(struct evms_logical_node * node, struct buffer_head * bh);
17332 +static int lvm_init_io(struct evms_logical_node * node,
17336 + void * buf_addr);
17337 +static int lvm_ioctl(struct evms_logical_node * logical_node,
17338 + struct inode * inode,
17339 + struct file * file,
17340 + unsigned int cmd,
17341 + unsigned long arg);
17342 +static int lvm_direct_ioctl(struct inode * inode,
17343 + struct file * file,
17344 + unsigned int cmd,
17345 + unsigned long args);
17347 +static struct snapshot_map_entry * allocate_snapshot_map_entry(u64 org_sector,
17348 + u64 snap_sector);
17350 +/* LVM Plugin function table and header. */
17351 +static struct evms_plugin_fops lvm_fops = {
17352 + .discover = lvm_discover,
17353 + .end_discover = lvm_discover_end,
17354 + .delete = lvm_delete_node,
17355 + .read = lvm_read,
17356 + .write = lvm_write,
17357 + .init_io = lvm_init_io,
17358 + .ioctl = lvm_ioctl,
17359 + .direct_ioctl = lvm_direct_ioctl
17362 +static struct evms_plugin_header lvm_plugin_header = {
17363 + .id = SetPluginID(IBM_OEM_ID,
17364 + EVMS_REGION_MANAGER,
17367 + .major = EVMS_LVM_VERSION_MAJOR,
17368 + .minor = EVMS_LVM_VERSION_MINOR,
17369 + .patchlevel = EVMS_LVM_VERSION_PATCH
17371 + .required_services_version = {
17376 + .fops = &lvm_fops
17379 +static struct lvm_volume_group * lvm_group_list = NULL;
17380 +static struct proc_dir_entry * lvm_proc = NULL;
17383 +/********** Miscellaneous Functions **********/
17389 + * @org_sector: Logical sector to remap.
17390 + * @size: Size (in sectors) of request to remap.
17391 + * @new_sector: Remapped sector.
17392 + * @new_size: New size (in sectors).
17393 + * @pe_start_sector: Starting sector of PE - needed for snapshotting.
17394 + * @pv_entry: New node for which new_sector is relative.
17396 + * Common function to remap LV lba to PV lba in appropriate PE. This
17397 + * function needs to deal with requests that span PEs and/or stripes. If
17398 + * this occurs, the request will simply be chopped off at the boundary of
17399 + * the first PE/stripe. It is up to the calling function to loop
17400 + * accordingly to finish the full remapping. This function is now partially
17401 + * 64-bit enabled. The striping section contains code that currently requires
17402 + * at least one mod operation on 64-bit values.
17404 +static int remap_sector(struct evms_logical_node * node,
17407 + u64 * new_sector,
17409 + u64 * pe_start_sector,
17410 + struct lvm_physical_volume ** pv_entry)
17412 + struct lvm_logical_volume * volume = node->private;
17413 + struct le_table_entry * le_entry;
17414 + u32 le, offset_in_le;
17416 + *new_size = size;
17418 + if ( volume->stripes > 1 ) {
17419 + /* Volume is striped. Reset the size if the request crosses
17420 + * a stripe boundary. Striping in LVM is not 64-bit enabled.
17422 + u32 column, columns, sectors_per_column;
17423 + u32 sector_in_column, stripe_in_column, le_in_column;
17424 + u32 offset_in_stripe, stripe_in_le;
17425 + u32 org_sector32 = org_sector;
17427 + sectors_per_column = volume->stripes * volume->pe_size;
17428 + column = org_sector32 / sectors_per_column;
17429 + sector_in_column = org_sector32 % sectors_per_column;
17430 + stripe_in_column = sector_in_column / volume->stripe_size;
17431 + le_in_column = stripe_in_column % volume->stripes;
17432 + columns = volume->num_le / volume->stripes;
17433 + le = column + (columns * le_in_column);
17435 + offset_in_stripe = org_sector32 % volume->stripe_size;
17436 + stripe_in_le = stripe_in_column / volume->stripes;
17437 + offset_in_le = offset_in_stripe +
17438 + stripe_in_le * volume->stripe_size;
17440 + if ( offset_in_stripe + size > volume->stripe_size ) {
17441 + *new_size = volume->stripe_size - offset_in_stripe;
17444 + /* Linear volume. Just find LE and offset. Reset the size if
17445 + * the request crosses an LE boundary. This path is 64-bit safe.
17447 + le = org_sector >> volume->pe_size_shift;
17448 + offset_in_le = org_sector & (volume->pe_size - 1);
17450 + if ( offset_in_le + size > volume->pe_size ) {
17451 + *new_size = volume->pe_size - offset_in_le;
17455 + le_entry = &volume->le_map[le];
17456 + *pe_start_sector = le_entry->pe_sector_offset;
17457 + *new_sector = le_entry->pe_sector_offset + offset_in_le;
17458 + *pv_entry = le_entry->owning_pv;
17464 + * add_group_to_list
17466 + * Add a volume group to the end of the LVM global group list.
17468 +static int add_group_to_list(struct lvm_volume_group * group)
17470 + struct lvm_volume_group ** p_group;
17472 + for ( p_group = &lvm_group_list;
17473 + *p_group; p_group = &(*p_group)->next_group ) {
17477 + *p_group = group;
17478 + group->next_group = NULL;
17483 + * remove_group_from_list
17485 + * Remove an LVM volume group from the global LVM list.
17487 +static int remove_group_from_list(struct lvm_volume_group * group)
17489 + struct lvm_volume_group ** p_group;
17491 + for ( p_group = &lvm_group_list;
17492 + *p_group; p_group = &(*p_group)->next_group ) {
17493 + if ( *p_group == group ) {
17494 + *p_group = (*p_group)->next_group;
17495 + group->next_group = NULL;
17504 + * find_group_by_uuid
17506 + * Use the vg_uuid to find the desired volume group.
17508 +static int find_group_by_uuid(u8 * vg_uuid,
17509 + struct lvm_volume_group ** group)
17511 + struct lvm_volume_group * gp;
17513 + for ( gp = lvm_group_list; gp; gp = gp->next_group ) {
17514 + if ( ! memcmp(vg_uuid, gp->vg_uuid, UUID_LEN) ) {
17524 + * find_pv_by_number
17526 + * Search the PV list of the specified volume group, looking for the
17527 + * specified PV number. If found, return a pointer to that PV.
17529 +static struct lvm_physical_volume *
17530 +find_pv_by_number(u32 pv_number,
17531 + struct lvm_volume_group * group)
17533 + struct lvm_physical_volume * pv_entry;
17535 + for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
17536 + if ( pv_entry->pv_number == pv_number ) {
17544 + * translate_lv_name
17545 + * @lvm_lv_name: Input LVM-style name.
17546 + * @evms_node_name: Output EVMS-style name.
17548 + * In LVM, volumes have names based on their dev-node, which follow the
17549 + * pattern /dev/group_name/volume_name. In EVMS, the same volume needs
17550 + * to appear as /dev/evms/lvm/group_name/volume_name. Thus, the name from
17551 + * the lv_disk_t needs to be translated before copying to the associated
17552 + * node. evms_node_name must point to a NAME_LEN sized buffer.
17554 +static int translate_lv_name(char * lvm_lv_name, char * evms_node_name)
17558 + memset(evms_node_name, 0, NAME_LEN);
17560 + /* Make sure the string starts with /dev/, and skip over it. */
17561 + ptr = strstr(lvm_lv_name, DEV_DIRECTORY);
17562 + if ( ptr != lvm_lv_name ) {
17563 + LOG_SERIOUS("Invalid LV name: %s\n", lvm_lv_name);
17566 + ptr = &ptr[strlen(DEV_DIRECTORY)];
17568 + /* ptr now points to "group_name/volume_name".
17569 + * Use this to create the name for the EVMS node.
17571 + strcpy(evms_node_name, LVM_DEV_DIRECTORY);
17572 + strncat(evms_node_name, ptr, NAME_LEN - strlen(evms_node_name) - 1);
17578 + * check_pv_for_lv
17580 + * Run through all LE maps of all LVs in this group, and make sure the
17581 + * specified PV is not being pointed to by any LEs.
17583 +static int check_pv_for_lv(struct lvm_physical_volume * pv_entry,
17584 + struct lvm_volume_group * group)
17586 + struct lvm_logical_volume * volume;
17589 + for ( i = 1; i <= MAX_LV; i++ ) {
17590 + if ( (volume = group->volume_list[i]) ) {
17591 + for ( j = 0; j < volume->num_le; j++ ) {
17592 + if ( volume->le_map[j].owning_pv == pv_entry ) {
17602 +/********** Metadata I/O Functions **********/
17606 + * endian_convert_pv
17608 + * Endian-neutral conversion for PV structures.
17610 +static inline void endian_convert_pv(struct pv_disk * pv)
17612 + pv->version = le16_to_cpup(&pv->version);
17613 + pv->pv_on_disk.base = le32_to_cpup(&pv->pv_on_disk.base);
17614 + pv->pv_on_disk.size = le32_to_cpup(&pv->pv_on_disk.size);
17615 + pv->vg_on_disk.base = le32_to_cpup(&pv->vg_on_disk.base);
17616 + pv->vg_on_disk.size = le32_to_cpup(&pv->vg_on_disk.size);
17617 + pv->pv_uuidlist_on_disk.base =
17618 + le32_to_cpup(&pv->pv_uuidlist_on_disk.base);
17619 + pv->pv_uuidlist_on_disk.size =
17620 + le32_to_cpup(&pv->pv_uuidlist_on_disk.size);
17621 + pv->lv_on_disk.base = le32_to_cpup(&pv->lv_on_disk.base);
17622 + pv->lv_on_disk.size = le32_to_cpup(&pv->lv_on_disk.size);
17623 + pv->pe_on_disk.base = le32_to_cpup(&pv->pe_on_disk.base);
17624 + pv->pe_on_disk.size = le32_to_cpup(&pv->pe_on_disk.size);
17625 + pv->pv_major = le32_to_cpup(&pv->pv_major);
17626 + pv->pv_number = le32_to_cpup(&pv->pv_number);
17627 + pv->pv_status = le32_to_cpup(&pv->pv_status);
17628 + pv->pv_allocatable = le32_to_cpup(&pv->pv_allocatable);
17629 + pv->pv_size = le32_to_cpup(&pv->pv_size);
17630 + pv->lv_cur = le32_to_cpup(&pv->lv_cur);
17631 + pv->pe_size = le32_to_cpup(&pv->pe_size);
17632 + pv->pe_total = le32_to_cpup(&pv->pe_total);
17633 + pv->pe_allocated = le32_to_cpup(&pv->pe_allocated);
17634 + pv->pe_start = le32_to_cpup(&pv->pe_start);
17640 + * Read in the PV structure from the specified node. If it contains a
17641 + * valid PV signature, allocate a new struct pv_disk and copy the data.
17643 +static int read_pv(struct evms_logical_node * node, struct pv_disk ** pv)
17645 + struct pv_disk * pv_buffer;
17646 + int rc = -ENOMEM;
17650 + /* Buffer for reading the PV metadata. */
17651 + pv_buffer = kmalloc(LVM_PV_DISK_SIZE, GFP_NOIO);
17652 + if (!pv_buffer) {
17653 + LOG_CRITICAL("Error allocating PV metadata buffer for %s\n",
17658 + /* Read the first two sectors. */
17659 + rc = INIT_IO(node, 0, evms_cs_size_in_vsectors(LVM_PV_DISK_BASE),
17660 + evms_cs_size_in_vsectors(LVM_PV_DISK_SIZE), pv_buffer);
17662 + LOG_SERIOUS("Error reading PV metadata from %s\n", node->name);
17666 + /* Endian-neutral conversion of PV metadata. */
17667 + endian_convert_pv(pv_buffer);
17669 + /* Check for an LVM signature and make sure the sizes match.
17670 + * Versions 1 and 2 are both valid now. Thanks LVM! :)
17672 + if ( !(pv_buffer->id[0] == 'H' &&
17673 + pv_buffer->id[1] == 'M' &&
17674 + (pv_buffer->version == 1 || pv_buffer->version == 2) &&
17675 + pv_buffer->pv_size == node->total_vsectors) ) {
17676 + LOG_EXTRA("%s is not an LVM PV\n", node->name);
17681 + /* This is a valid PV. Allocate a new pv_disk. */
17682 + *pv = kmalloc(sizeof(struct pv_disk), GFP_NOIO);
17684 + LOG_CRITICAL("Error allocating new PV for %s\n", node->name);
17689 + /* Copy the metadata. */
17690 + memcpy(*pv, pv_buffer, sizeof(struct pv_disk));
17693 + kfree(pv_buffer);
17699 + * endian_convert_vg
17701 + * Endian-neutral conversion for VG structures
17703 +static inline void endian_convert_vg(struct vg_disk * vg)
17705 + vg->vg_number = le32_to_cpup(&vg->vg_number);
17706 + vg->vg_access = le32_to_cpup(&vg->vg_access);
17707 + vg->vg_status = le32_to_cpup(&vg->vg_status);
17708 + vg->lv_max = le32_to_cpup(&vg->lv_max);
17709 + vg->lv_cur = le32_to_cpup(&vg->lv_cur);
17710 + vg->lv_open = le32_to_cpup(&vg->lv_open);
17711 + vg->pv_max = le32_to_cpup(&vg->pv_max);
17712 + vg->pv_cur = le32_to_cpup(&vg->pv_cur);
17713 + vg->pv_act = le32_to_cpup(&vg->pv_act);
17714 + vg->dummy = le32_to_cpup(&vg->dummy);
17715 + vg->vgda = le32_to_cpup(&vg->vgda);
17716 + vg->pe_size = le32_to_cpup(&vg->pe_size);
17717 + vg->pe_total = le32_to_cpup(&vg->pe_total);
17718 + vg->pe_allocated = le32_to_cpup(&vg->pe_allocated);
17719 + vg->pvg_total = le32_to_cpup(&vg->pvg_total);
17725 + * Read in the VG structure from the specified node. Allocate a new
17726 + * struct vg_disk and copy the data.
17728 +static int read_vg(struct evms_logical_node * node,
17729 + struct pv_disk * pv,
17730 + struct vg_disk ** vg)
17732 + struct vg_disk * vg_buffer;
17733 + unsigned long vg_sectors;
17734 + int rc = -ENOMEM;
17736 + /* Allocate a buffer to read the VG metadata. */
17737 + vg_sectors = evms_cs_size_in_vsectors(pv->vg_on_disk.size);
17738 + vg_buffer = kmalloc(vg_sectors << EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO);
17739 + if (!vg_buffer) {
17740 + LOG_CRITICAL("Error allocating VG metadata buffer for %s\n",
17745 + /* Read the VG metadata. */
17746 + rc = INIT_IO(node, 0, evms_cs_size_in_vsectors(pv->vg_on_disk.base),
17747 + vg_sectors, vg_buffer);
17749 + LOG_SERIOUS("Error reading VG metadata from %s\n", node->name);
17753 + /* Endian-neutral conversion of VG metadata. */
17754 + endian_convert_vg(vg_buffer);
17756 + /* Allocate a new struct vg_disk. */
17757 + *vg = kmalloc(sizeof(struct vg_disk), GFP_NOIO);
17759 + LOG_CRITICAL("Error allocating new VG for %s\n", node->name);
17764 + /* Copy the metadata. */
17765 + memcpy(*vg, vg_buffer, sizeof(struct vg_disk));
17768 + kfree(vg_buffer);
17776 +static int read_uuid_list(struct evms_logical_node * node,
17777 + struct pv_disk * pv,
17778 + struct lvm_volume_group * group)
17780 + u64 start_sector;
17781 + unsigned long total_sectors;
17782 + unsigned char * uuid_buffer;
17783 + unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
17784 + unsigned long uuid_list_size;
17787 + if (group->uuid_list) {
17788 + LOG_EXTRA("Already read PV UUIDs for group %s\n",
17793 + start_sector = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.base);
17794 + total_sectors = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.size);
17795 + uuid_list_size = round_up(total_sectors * EVMS_VSECTOR_SIZE,
17798 + /* Allocate a buffer to perform the I/Os. */
17799 + uuid_buffer = kmalloc(buffer_size, GFP_NOIO);
17800 + if (!uuid_buffer) {
17801 + LOG_CRITICAL("Error allocating buffer for UUID list in group %s\n",
17807 + /* Allocate memory for the UUID array for this group. */
17808 + group->uuid_list = vmalloc(uuid_list_size);
17809 + if (!group->uuid_list) {
17810 + LOG_CRITICAL("Error allocating UUID list for group %s\n",
17815 + memset(group->uuid_list, 0, uuid_list_size);
17817 + for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
17818 + rc = INIT_IO(node, 0, start_sector + i,
17819 + IO_BUFFER_SECTORS, uuid_buffer);
17821 + LOG_SERIOUS("Error reading PV UUID list from %s\n",
17825 + /* Copy the I/O buffer into the UUID array. */
17826 + memcpy(&(group->uuid_list[i * EVMS_VSECTOR_SIZE]),
17827 + uuid_buffer, buffer_size);
17830 + /* Clear out the unused portion at the end of the uuid_list. */
17831 + memset(&(group->uuid_list[pv->pv_uuidlist_on_disk.size]), 0,
17832 + uuid_list_size - pv->pv_uuidlist_on_disk.size);
17835 + kfree(uuid_buffer);
17840 + vfree(group->uuid_list);
17841 + group->uuid_list = NULL;
17846 + * endian_convert_lv
17848 + * Endian-neutral conversion for LV structures
17850 +static inline void endian_convert_lv(struct lv_disk * lv)
17852 + lv->lv_access = le32_to_cpup(&lv->lv_access);
17853 + lv->lv_status = le32_to_cpup(&lv->lv_status);
17854 + lv->lv_open = le32_to_cpup(&lv->lv_open);
17855 + lv->lv_dev = le32_to_cpup(&lv->lv_dev);
17856 + lv->lv_number = le32_to_cpup(&lv->lv_number);
17857 + lv->lv_mirror_copies = le32_to_cpup(&lv->lv_mirror_copies);
17858 + lv->lv_recovery = le32_to_cpup(&lv->lv_recovery);
17859 + lv->lv_schedule = le32_to_cpup(&lv->lv_schedule);
17860 + lv->lv_size = le32_to_cpup(&lv->lv_size);
17861 + lv->lv_snapshot_minor = le32_to_cpup(&lv->lv_snapshot_minor);
17862 + lv->lv_chunk_size = le16_to_cpup(&lv->lv_chunk_size);
17863 + lv->dummy = le16_to_cpup(&lv->dummy);
17864 + lv->lv_allocated_le = le32_to_cpup(&lv->lv_allocated_le);
17865 + lv->lv_stripes = le32_to_cpup(&lv->lv_stripes);
17866 + lv->lv_stripesize = le32_to_cpup(&lv->lv_stripesize);
17867 + lv->lv_badblock = le32_to_cpup(&lv->lv_badblock);
17868 + lv->lv_allocation = le32_to_cpup(&lv->lv_allocation);
17869 + lv->lv_io_timeout = le32_to_cpup(&lv->lv_io_timeout);
17870 + lv->lv_read_ahead = le32_to_cpup(&lv->lv_read_ahead);
17873 +static inline void endian_convert_lvs(struct lvm_volume_group * group)
17876 + for ( i = 0; i < group->vg->lv_max; i++ ) {
17877 + endian_convert_lv(&(group->lv_array[i]));
17884 + * Read in the LV structures for the specified group. Do the read from
17885 + * the first PV in the group. If that one fails, keep trying on the
17886 + * remaining PVs until one works. This function will allocate a buffer
17887 + * for the group to read in the structures.
17889 +static int read_lv(struct lvm_volume_group * group)
17891 + struct lvm_physical_volume * pv_entry = group->pv_list;
17892 + unsigned char * lv_buffer = NULL;
17893 + u64 start_sector;
17894 + unsigned long total_sectors, lv_array_size = 0;
17895 + unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
17898 + if (group->lv_array) {
17903 + LOG_ERROR("Group %s has no PVs. Cannot read LV structures.\n",
17908 + /* Allocate a buffer to do the actual I/Os. */
17909 + lv_buffer = kmalloc(buffer_size, GFP_NOIO);
17910 + if (!lv_buffer) {
17911 + LOG_CRITICAL("Error allocating buffer for LV structs for Group %s\n",
17916 + /* Read in the LV structures 4k at a time. If one PV returns errors,
17917 + * start over with the next PV in the group.
17919 + while (rc && pv_entry) {
17920 + start_sector = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.base);
17921 + total_sectors = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.size);
17922 + lv_array_size = round_up(total_sectors * EVMS_VSECTOR_SIZE,
17925 + /* Allocate the buffer for this group to
17926 + * hold the entire LV array.
17928 + if (group->lv_array) {
17929 + vfree(group->lv_array);
17930 + group->lv_array = NULL;
17932 + group->lv_array = vmalloc(lv_array_size);
17933 + if (!group->lv_array) {
17934 + LOG_CRITICAL("Error allocating lv_array buffer for Group %s\n",
17939 + memset(group->lv_array, 0, lv_array_size);
17941 + for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
17942 + rc = INIT_IO(pv_entry->logical_node, 0,
17943 + start_sector + i, IO_BUFFER_SECTORS,
17946 + LOG_SERIOUS("Error reading LV metadata from %s in Group %s\n",
17947 + pv_entry->logical_node->name,
17950 + /* Try the next PV if the current one
17951 + * caused any errors.
17953 + pv_entry = pv_entry->next;
17956 + /* Copy the I/O buffer into the lv_array. */
17957 + memcpy(&(((char *)(group->lv_array))[i * EVMS_VSECTOR_SIZE]),
17958 + lv_buffer, buffer_size);
17963 + LOG_SERIOUS("Unable to read LV metadata from any PV in Group %s\n",
17968 + /* Clear out the unused portion at the end of the lv_array. */
17969 + memset(&(((char *)(group->lv_array))[pv_entry->pv->lv_on_disk.size]),
17970 + 0, lv_array_size - pv_entry->pv->lv_on_disk.size);
17972 + /* Endian-neutral conversion of the LV metadata. */
17973 + endian_convert_lvs(group);
17976 + kfree(lv_buffer);
17980 + vfree(group->lv_array);
17981 + group->lv_array = NULL;
17986 + * endian_convert_pe_map
17988 + * Endian-neutral conversion for PE structures
17990 +static inline void endian_convert_pe_map(struct lvm_physical_volume * pv_entry)
17993 + for ( i = 0; i < pv_entry->pv->pe_total; i++ ) {
17994 + pv_entry->pe_map[i].lv_num =
17995 + le16_to_cpup(&pv_entry->pe_map[i].lv_num);
17996 + pv_entry->pe_map[i].le_num =
17997 + le16_to_cpup(&pv_entry->pe_map[i].le_num);
18004 + * Read in the PE map for the specified PV. This function will allocate a
18005 + * buffer to read in the data.
18007 +static int read_pe_map(struct lvm_physical_volume * pv_entry)
18009 + struct evms_logical_node * node = pv_entry->logical_node;
18010 + struct pv_disk * pv = pv_entry->pv;
18011 + unsigned char * pe_buffer;
18012 + u64 start_sector;
18013 + unsigned long total_sectors, pe_map_size;
18014 + unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
18015 + int i, rc = -ENOMEM;
18017 + if (pv_entry->pe_map) {
18021 + start_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base);
18022 + total_sectors = evms_cs_size_in_vsectors(pv->pe_total *
18023 + sizeof(struct pe_disk));
18024 + pe_map_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
18026 + /* Allocate a buffer for performing the I/O. */
18027 + pe_buffer = kmalloc(buffer_size, GFP_NOIO);
18028 + if (!pe_buffer) {
18029 + LOG_CRITICAL("Error allocating buffer for PE maps for %s\n",
18034 + /* Allocate a buffer to hold the PE map for this PV. */
18035 + pv_entry->pe_map = vmalloc(pe_map_size);
18036 + if (!pv_entry->pe_map) {
18037 + LOG_CRITICAL("Error allocating PE map for %s\n", node->name);
18040 + memset(pv_entry->pe_map, 0, pe_map_size);
18042 + for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
18043 + rc = INIT_IO(node, 0, start_sector + i,
18044 + IO_BUFFER_SECTORS, pe_buffer);
18046 + LOG_SERIOUS("Error reading PE maps from %s.\n",
18050 + /* Copy the data to the actual PE map. */
18051 + memcpy(&(((char *)(pv_entry->pe_map))[i * EVMS_VSECTOR_SIZE]),
18052 + pe_buffer, buffer_size);
18055 + /* Clear out the unused portion at the end of the PE map. */
18056 + memset(&(((char *)(pv_entry->pe_map))[total_sectors * EVMS_VSECTOR_SIZE]),
18057 + 0, pe_map_size - total_sectors * EVMS_VSECTOR_SIZE);
18059 + /* Endian-neutral conversion of the PE metadata. */
18060 + endian_convert_pe_map(pv_entry);
18063 + kfree(pe_buffer);
18068 + vfree(pv_entry->pe_map);
18069 + pv_entry->pe_map = NULL;
18074 +/********** Snapshot Manipulation Functions **********/
18078 + * snapshot_check_quiesce_original
18080 + * For this snapshot LV, check that both it and its original are quiesced.
18083 +snapshot_check_quiesce_original(struct lvm_logical_volume * snap_volume)
18085 + struct lvm_logical_volume * org_volume = snap_volume->snapshot_org;
18087 + if ( ! (snap_volume->lv_access & EVMS_LV_QUIESCED) ) {
18091 + if ( org_volume && !(org_volume->lv_access & EVMS_LV_QUIESCED) ) {
18099 + * snapshot_check_quiesce_all
18101 + * Go through the list of all snapshots for an original volume, and make
18102 + * sure everyone is in a quiesced state.
18104 +static int snapshot_check_quiesce_all(struct lvm_logical_volume * org_volume)
18106 + struct lvm_logical_volume * snap;
18108 + if ( ! (org_volume->lv_access & EVMS_LV_QUIESCED) ) {
18112 + for ( snap = org_volume->snapshot_next;
18113 + snap; snap = snap->snapshot_next ) {
18114 + if ( ! (snap->lv_access & EVMS_LV_QUIESCED) ) {
18123 + * invalidate_snapshot_volume
18125 + * In the event a snapshot volume becomes full or corrupted, its metadata
18126 + * must be altered in order to prevent it from being used again. Write some
18127 + * invalid data into the first entry of the COW table. If this volume is
18128 + * not fully deleted by the user/engine, this invalid COW entry will be
18129 + * detected by build_snapshot_maps(), and will cause the volume to be
18130 + * deleted before being exported to EVMS during discover. This is obviously
18131 + * a hack, but it is the same hack currently used by LVM. We're just trying
18132 + * to be compatible. :)
18134 +static int invalidate_snapshot_volume(struct lvm_logical_volume * snap_volume)
18136 + struct evms_logical_node tmp_node;
18138 + tmp_node.private = snap_volume;
18139 + tmp_node.total_vsectors = snap_volume->lv_size;
18141 + if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
18142 + LOG_WARNING("Volume %s is not a snapshot. Cannot invalidate\n",
18143 + snap_volume->name);
18147 + LOG_WARNING("Invalidating full/corrupt snapshot %s\n",
18148 + snap_volume->name);
18149 + LOG_WARNING("Run the EVMS administration tools to remove this snapshot.\n");
18151 + if (snap_volume->cow_table) {
18152 + snap_volume->cow_table[0].pv_org_rsector =
18153 + cpu_to_le64(((u64)1));
18154 + if ( lvm_init_io(&tmp_node, 4, 0, 1, snap_volume->cow_table) ) {
18155 + LOG_SERIOUS("Unable to invalidate snapshot %s\n",
18156 + snap_volume->name);
18159 + LOG_SERIOUS("Unable to invalidate snapshot %s\n",
18160 + snap_volume->name);
18163 + snap_volume->lv_status &= ~LV_ACTIVE;
18168 + * remove_snapshot_from_chain
18170 + * Remove a snapshot volume from its original's chain of snapshots. This
18171 + * does not delete the snapshot volume. At runtime, we cannot delete
18172 + * volumes at the region-manager level, because EVMS may have this volume
18173 + * exported, and there is no way to notify EVMS of the deletion. It will
18174 + * eventually need to be deleted in the engine, which will then tell the
18175 + * EVMS kernel services to delete the volume in the kernel.
18177 +static int remove_snapshot_from_chain(struct lvm_logical_volume * snap_volume)
18179 + struct lvm_logical_volume * org_volume = snap_volume->snapshot_org;
18180 + struct lvm_logical_volume ** p_volume;
18182 + if (org_volume) {
18183 + for ( p_volume = &org_volume->snapshot_next;
18185 + p_volume = &(*p_volume)->snapshot_next ) {
18186 + if ( *p_volume == snap_volume ) {
18187 + *p_volume = snap_volume->snapshot_next;
18193 + snap_volume->snapshot_org = NULL;
18194 + snap_volume->snapshot_next = NULL;
18201 + * The snapshot hash tables are NEVER going to have 4 billion entries, so
18202 + * we can safely cast the org_sector to 32 bits and just mod it by the
18203 + * hash table size.
18205 +static u32 snapshot_hash(u64 org_sector,
18206 + struct lvm_logical_volume * snap_volume)
18208 + return (((u32)org_sector) % snap_volume->hash_table_size);
18212 + * snapshot_search_hash_chain
18214 + * Search the hash chain that is anchored at the specified head pointer.
18215 + * If the sector number is found, the result pointer is set to that entry
18216 + * in the chain, and a 1 is returned. If the sector is not found, the
18217 + * result pointer is set to the previous entry and 0 is returned. If the
18218 + * result pointer is NULL, this means either the list is empty, or the
18219 + * specified sector should become the first list item.
18221 +static int snapshot_search_hash_chain(u64 org_sector,
18222 + struct snapshot_map_entry * head,
18223 + struct snapshot_map_entry ** result)
18225 + struct snapshot_map_entry * curr = head;
18226 + struct snapshot_map_entry * prev = head;
18227 + while ( curr && curr->org_sector < org_sector ) {
18229 + curr = curr->next;
18232 + /* Either an empty chain or went off the end of the chain. */
18235 + } else if ( curr->org_sector != org_sector ) {
18236 + *result = curr->prev;
18239 + /* Found the desired sector. */
18246 + * insert_snapshot_map_entry
18248 + * Insert a new entry into a snapshot hash chain, immediately following the
18249 + * specified entry. This function should not be used to add an entry into
18250 + * an empty list, or as the first entry in an existing list. For that case,
18251 + * use insert_snapshot_map_entry_at_head().
18253 +static int insert_snapshot_map_entry(struct snapshot_map_entry * entry,
18254 + struct snapshot_map_entry * base)
18256 + entry->next = base->next;
18257 + entry->prev = base;
18258 + base->next = entry;
18259 + if (entry->next) {
18260 + entry->next->prev = entry;
18266 + * insert_snapshot_map_entry_at_head
18268 + * Insert a new entry into a snapshot chain as the first entry.
18270 +static int insert_snapshot_map_entry_at_head(struct snapshot_map_entry * entry,
18271 + struct snapshot_map_entry ** head)
18273 + entry->next = *head;
18274 + entry->prev = NULL;
18276 + if (entry->next) {
18277 + entry->next->prev = entry;
18283 + * add_cow_entry_to_snapshot_map
18285 + * Convert a cow table entry (from the on-disk data) into an appropriate
18286 + * entry for the snapshot map. Insert this new entry into the appropriate
18287 + * map for the specified volume.
18289 + * The cow_entry passed into this function must have already been
18290 + * endian-converted from disk-order to cpu-order.
18292 +static int add_cow_entry_to_snapshot_map(struct lv_COW_table_disk * cow_entry,
18293 + struct lvm_logical_volume * volume)
18295 + struct snapshot_map_entry * new_entry, * target_entry;
18296 + struct snapshot_map_entry ** hash_table, * chain_head;
18299 + if ( cow_entry->pv_org_number == 0 ) {
18303 + new_entry = allocate_snapshot_map_entry(cow_entry->pv_org_rsector,
18304 + cow_entry->pv_snap_rsector);
18305 + if (!new_entry) {
18309 + new_entry->snap_pv = find_pv_by_number(cow_entry->pv_snap_number,
18311 + if (!new_entry->snap_pv) {
18312 + kfree(new_entry);
18316 + hash_value = snapshot_hash(new_entry->org_sector, volume);
18317 + hash_table = volume->snapshot_map[cow_entry->pv_org_number];
18318 + chain_head = hash_table[hash_value];
18319 + if ( snapshot_search_hash_chain(new_entry->org_sector,
18320 + chain_head, &target_entry) ) {
18321 + /* In general, we should not find this entry in the snapshot
18322 + * map already. However, it could happen on a re-discover, but
18323 + * the build_snapshot_maps function should weed out those cases.
18324 + * In either event, we can simply ignore duplicates.
18326 + LOG_WARNING("Detected a duplicate snapshot map entry\n");
18327 + LOG_WARNING("Snap PV "PFU64":"PFU64", Org PV "PFU64":"PFU64"\n",
18328 + cow_entry->pv_snap_number,
18329 + cow_entry->pv_snap_rsector,
18330 + cow_entry->pv_org_number,
18331 + cow_entry->pv_org_rsector);
18332 + kfree(new_entry);
18334 + if (target_entry) {
18335 + insert_snapshot_map_entry(new_entry, target_entry);
18337 + insert_snapshot_map_entry_at_head(new_entry,
18338 + &hash_table[hash_value]);
18346 + * snapshot_remap_sector
18348 + * Perform a sector remap on a snapshot volume. This should be called from
18349 + * the I/O read path, after the LE-to-PE translation has already been
18350 + * performed. First, determine the base sector of the chunk containing the
18351 + * specified sector, and save the remainder. Then, perform a search through
18352 + * the snapshot map for the specified volume. If a match is found, change
18353 + * the PV and sector numbers to the new values. If no match is found, leave
18354 + * the values alone, meaning the read should proceed down the original
18358 +snapshot_remap_sector(struct lvm_logical_volume * snap_volume,
18359 + u64 pe_start_sector,
18361 + struct lvm_physical_volume ** pv_entry)
18363 + struct snapshot_map_entry ** hash_table;
18364 + struct snapshot_map_entry * chain_head, * result;
18366 + u64 chunk_sector, remainder;
18368 + if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
18372 + chunk_sector = ((*sector - pe_start_sector) &
18373 + ((u64)(~(snap_volume->chunk_size - 1)))) +
18375 + remainder = *sector - chunk_sector;
18376 + hash_value = snapshot_hash(chunk_sector, snap_volume);
18377 + hash_table = snap_volume->snapshot_map[(*pv_entry)->pv_number];
18378 + chain_head = hash_table[hash_value];
18380 + if ( snapshot_search_hash_chain(chunk_sector, chain_head, &result) ) {
18381 + *pv_entry = result->snap_pv;
18382 + *sector = result->snap_sector + remainder;
18387 + * snapshot_read_write_chunk
18389 + * This function takes care of reading one chunk of data from the
18390 + * original, and writing it to the snapshot. Since the original now has
18391 + * a fixed sized buffer for this data, we may have to loop to get the
18392 + * whole chunk copied.
18394 +static int snapshot_read_write_chunk(struct lvm_logical_volume * org_volume,
18395 + struct lvm_physical_volume * org_pv,
18396 + u64 chunk_sector,
18397 + struct lvm_logical_volume * snap_volume,
18398 + struct lvm_physical_volume ** snap_pv,
18399 + u64 * snap_sector)
18401 + u32 io_size = snap_volume->chunk_size;
18402 + u64 snap_pe_start_sector, size;
18403 + int i, iterations = 1;
18405 + if ( org_volume->chunk_size < snap_volume->chunk_size ) {
18406 + iterations = snap_volume->chunk_size / org_volume->chunk_size;
18407 + io_size = org_volume->chunk_size;
18410 + remap_sector(snap_volume->volume_node, snap_volume->next_free_chunk, 1,
18411 + snap_sector, &size, &snap_pe_start_sector, snap_pv);
18413 + /* Check for an incomplete volume. */
18414 + if (!*snap_sector || !*snap_pv) {
18415 + invalidate_snapshot_volume(snap_volume);
18419 + for ( i = 0; i < iterations; i++ ) {
18421 + /* Read the chunk from the original volume. This is a physical
18422 + * read, not logical. Thus, stripe boundary considerations are
18423 + * unnecessary. Also, chunks are always aligned with PEs, so PE
18424 + * boundary considerations are unnecessary.
18426 + if ( INIT_IO(org_pv->logical_node, 0,
18427 + chunk_sector + i * io_size, io_size,
18428 + org_volume->chunk_data_buffer) ) {
18432 + /* Write this chunk to the snapshot volume. This does duplicate
18433 + * the local init_io code, but we need to have the remapped
18434 + * sector later on, so this is slightly more efficient. Snapshot
18435 + * volumes cannot be striped, so there is no need to consider
18436 + * stripe-boundary conditions. And just like the read in the
18437 + * previous line, chunks are always aligned with PEs, so we
18438 + * don't have to consider PE-boundary conditions.
18440 + if ( INIT_IO((*snap_pv)->logical_node, 1,
18441 + *snap_sector + i * io_size, io_size,
18442 + org_volume->chunk_data_buffer) ) {
18443 + /* An error writing the chunk to the snapshot is the
18444 + * same situation as the snapshot being full.
18446 + invalidate_snapshot_volume(snap_volume);
18455 + * snapshot_copy_data
18457 + * On a write to a snapshotted volume, check all snapshots to see if the
18458 + * specified chunk has already been remapped. If it has not, read the
18459 + * original data from the volume, write the data to the next available
18460 + * chunk on the snapshot, update the COW table, write the COW table to
18461 + * the snapshot, and insert a new entry into the snapshot map.
18463 + * Now converted to copy data to a single snapshot. The looping is left
18464 + * up to lvm_write.
18466 +static int snapshot_copy_data(struct lvm_logical_volume * org_volume,
18467 + struct lvm_logical_volume * snap_volume,
18468 + u64 pe_start_sector,
18470 + struct lvm_physical_volume * org_pv)
18472 + struct lvm_physical_volume * snap_pv;
18473 + struct snapshot_map_entry ** hash_table, * chain_head;
18474 + struct snapshot_map_entry * target_entry, * new_map_entry;
18475 + u64 chunk_sector, snap_sector;
18479 + /* Lock out this snapshot while we are remapping. */
18480 + down(&snap_volume->snap_semaphore);
18482 + /* Make sure the snapshot has not been deactivated. */
18483 + if ( ! (snap_volume->lv_status & LV_ACTIVE) ) {
18487 + /* Search the hash table to see if this sector has already been
18488 + * remapped on this snapshot.
18490 + chunk_sector = ((org_sector - pe_start_sector) &
18491 + ((u64)(~(snap_volume->chunk_size - 1)))) +
18493 + hash_value = snapshot_hash(chunk_sector, snap_volume);
18494 + hash_table = snap_volume->snapshot_map[org_pv->pv_number];
18495 + chain_head = hash_table[hash_value];
18497 + if ( snapshot_search_hash_chain(chunk_sector,
18498 + chain_head, &target_entry) ) {
18499 + /* Chunk is already remapped. */
18503 + /* Is there room on the snapshot to remap this chunk? */
18504 + if ( snap_volume->next_free_chunk >= snap_volume->lv_size ) {
18505 + /* At this point, the snapshot is full. Any further
18506 + * writes to the original will cause the snapshot to
18507 + * become "corrupt" because they can't be remapped.
18508 + * Take this snapshot permanently offline.
18510 + goto out_invalidate;
18513 + rc = snapshot_read_write_chunk(org_volume, org_pv, chunk_sector,
18514 + snap_volume, &snap_pv, &snap_sector);
18516 + rc = (rc > 0) ? -EIO : 0;
18520 + /* Fill in the appropriate COW table entry and write that
18521 + * metadata sector back to the snapshot volume. Since we are
18522 + * only writing one sector, there are no boundary conditions.
18523 + * Must endian-convert each entry as it is added.
18525 + snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_number =
18526 + cpu_to_le64((u64)(org_pv->pv_number));
18527 + snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_rsector =
18528 + cpu_to_le64p(&chunk_sector);
18529 + snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_number =
18530 + cpu_to_le64((u64)(snap_pv->pv_number));
18531 + snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_rsector =
18532 + cpu_to_le64p(&snap_sector);
18534 + if ( lvm_init_io(snap_volume->volume_node, 4,
18535 + snap_volume->current_cow_sector,
18536 + 1, snap_volume->cow_table) ) {
18537 + /* The data was written to the snapshot, but
18538 + * writing the metadata failed.
18540 + goto out_invalidate;
18543 + snap_volume->next_cow_entry++;
18544 + if ( snap_volume->next_cow_entry >=
18545 + (EVMS_VSECTOR_SIZE / sizeof (struct lv_COW_table_disk)) ) {
18546 + snap_volume->next_cow_entry = 0;
18547 + snap_volume->current_cow_sector++;
18548 + memset(snap_volume->cow_table, 0, EVMS_VSECTOR_SIZE);
18549 + if ( lvm_init_io(snap_volume->volume_node, 4,
18550 + snap_volume->current_cow_sector,
18551 + 1, snap_volume->cow_table) ) {
18552 + /* Can't clear out the next sector of metadata. */
18553 + goto out_invalidate;
18556 + snap_volume->next_free_chunk += snap_volume->chunk_size;
18558 + /* Create a new snapshot map entry and add it in the appropriate
18559 + * place in the map.
18561 + new_map_entry = allocate_snapshot_map_entry(chunk_sector, snap_sector);
18562 + if (!new_map_entry) {
18564 + goto out_invalidate;
18566 + new_map_entry->snap_pv = snap_pv;
18567 + if (target_entry) {
18568 + insert_snapshot_map_entry(new_map_entry, target_entry);
18570 + insert_snapshot_map_entry_at_head(new_map_entry,
18571 + &(hash_table[hash_value]));
18575 + up(&snap_volume->snap_semaphore);
18579 + invalidate_snapshot_volume(snap_volume);
18584 + * get_snapshot_stats
18586 +static int get_snapshot_stats(struct lvm_snapshot_stat_ioctl * snap_stats)
18588 + struct lvm_logical_volume * volume;
18589 + struct lvm_volume_group * group;
18591 + /* Make sure the parameters are in range. */
18592 + if ( snap_stats->lv_number < 1 || snap_stats->lv_number > MAX_LV ) {
18596 + /* Make sure the specified group and volume exist, and that
18597 + * this is a snapshot volume.
18599 + find_group_by_uuid(snap_stats->vg_uuid, &group);
18601 + ! (volume = group->volume_list[snap_stats->lv_number]) ||
18602 + ! (volume->lv_access & LV_SNAPSHOT) ) {
18606 + /* Return the starting LBA of the next available chunk. */
18607 + snap_stats->next_free_chunk = volume->next_free_chunk;
18608 + snap_stats->lv_status = volume->lv_status;
18614 +/********** Memory Allocation/Deallocation Functions **********/
18618 + * deallocate_physical_volume
18620 + * Free the memory used by this physical volume. Do not delete the EVMS
18621 + * node in this function, since this could be called during an error
18622 + * path when we want to save the logical node.
18624 +static int deallocate_physical_volume(struct lvm_physical_volume * pv_entry)
18626 + if (pv_entry->pv) {
18627 + kfree(pv_entry->pv);
18628 + pv_entry->pv = NULL;
18631 + if (pv_entry->pe_map) {
18632 + vfree(pv_entry->pe_map);
18633 + pv_entry->pe_map = NULL;
18641 + * allocate_physical_volume
18643 + * Create a new struct lvm_physical_volume for the specified volume group.
18644 + * Initialize the new PV with the evms node and lvm pv information.
18646 +static struct lvm_physical_volume *
18647 +allocate_physical_volume(struct evms_logical_node * node, struct pv_disk * pv)
18649 + struct lvm_physical_volume * new_pv;
18651 + new_pv = kmalloc(sizeof(struct lvm_physical_volume), GFP_NOIO);
18653 + LOG_CRITICAL("Error allocating physical volume for %s.\n",
18659 + /* Initialize the PV. */
18660 + memset(new_pv, 0, sizeof(struct lvm_physical_volume));
18661 + new_pv->logical_node = node;
18663 + new_pv->pv_number = pv->pv_number;
18670 + * allocate_snapshot_map_entry
18672 + * Allocate memory for a new entry in the snapshot map and fill in the
18673 + * sector values. The PV pointer is not filled in here, but can easily
18674 + * be found by using the find_pv_by_number function.
18676 +static struct snapshot_map_entry * allocate_snapshot_map_entry(u64 org_sector,
18679 + struct snapshot_map_entry * new_entry;
18681 + new_entry = kmalloc(sizeof(struct snapshot_map_entry), GFP_NOIO);
18682 + if (!new_entry) {
18685 + memset(new_entry, 0, sizeof(struct snapshot_map_entry));
18686 + new_entry->org_sector = org_sector;
18687 + new_entry->snap_sector = snap_sector;
18689 + return new_entry;
18693 + * deallocate_snapshot_map
18695 + * This function will delete one hash table, which is part of the whole
18696 + * snapshot remapping structure. Each hash table is an array of pointers
18697 + * to linked lists of struct snapshot_map_entry's.
18699 +static int deallocate_snapshot_map(struct snapshot_map_entry ** table,
18702 + struct snapshot_map_entry * entry, * next;
18706 + for ( i = 0; i < table_size; i++ ) {
18707 + for ( entry = table[i]; entry; entry = next ) {
18708 + next = entry->next;
18718 + * deallocate_logical_volume
18720 + * Delete the in-memory representation of a single LVM logical volume,
18721 + * including its PE map and any snapshot data. Do not alter the parent
18722 + * volume group, except to remove this volume from its volume list.
18724 +static int deallocate_logical_volume(struct lvm_logical_volume * volume)
18726 + struct lvm_volume_group * group = volume->group;
18727 + struct lvm_logical_volume * org_volume, * snap_volume;
18730 + if ( volume->lv_access & LV_SNAPSHOT ) {
18731 + /* This volume is a snapshot. Remove it from the linked
18732 + * list of volumes that are snapshotting the original.
18733 + * First, the original volume must be quiesced.
18735 + org_volume = volume->snapshot_org;
18737 + if ( snapshot_check_quiesce_original(volume) ) {
18741 + remove_snapshot_from_chain(volume);
18743 + /* If the snapshot that was just removed was the last/only
18744 + * volume snapshotting the original, then mark the original
18745 + * as no longer being snapshotted.
18747 + if ( org_volume && !org_volume->snapshot_next ) {
18748 + org_volume->lv_access &= ~LV_SNAPSHOT_ORG;
18750 + } else if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
18751 + /* If this volume is a snapshot original, all of its snapshots
18752 + * must also be deleted. However, those deletions need to be
18753 + * taken care of by the engine. So just check that they have
18754 + * all been quiesced before removing the original.
18756 + if ( snapshot_check_quiesce_all(volume) ) {
18760 + /* In case there are any snapshots remaining, we must clear out
18761 + * their pointers to this original to prevent errors when those
18762 + * snapshots are accessed or deleted.
18764 + for ( snap_volume = volume->snapshot_next;
18765 + snap_volume; snap_volume = snap_volume->snapshot_next ) {
18766 + snap_volume->snapshot_org = NULL;
18770 + if (volume->name) {
18771 + LOG_DEBUG("Deleting volume %s\n", volume->name);
18774 + /* Free all the memory. This includes the LE-to-PE map, any snapshot
18775 + * hash tables, the COW table, and chunk data buffer.
18777 + if (volume->le_map) {
18778 + vfree(volume->le_map);
18779 + volume->le_map = NULL;
18781 + if (volume->snapshot_map) {
18782 + for ( i = 1; i <= group->pv_count; i++ ) {
18783 + deallocate_snapshot_map(volume->snapshot_map[i],
18784 + volume->hash_table_size);
18786 + kfree(volume->snapshot_map);
18787 + volume->snapshot_map = NULL;
18789 + if (volume->cow_table) {
18790 + kfree(volume->cow_table);
18791 + volume->cow_table = NULL;
18793 + if (volume->chunk_data_buffer) {
18794 + kfree(volume->chunk_data_buffer);
18795 + volume->chunk_data_buffer = NULL;
18798 + /* Remove this volume from the group's list. */
18799 + if ( group && group->volume_list[volume->lv_number] == volume ) {
18800 + group->volume_list[volume->lv_number] = NULL;
18801 + group->volume_count--;
18809 + * allocate_logical_volume
18811 + * Allocate space for a new LVM logical volume, including space for the
18812 + * LE-to-PE map and any necessary snapshot data.
18814 +static struct lvm_logical_volume *
18815 +allocate_logical_volume(struct lv_disk * lv, struct lvm_volume_group * group)
18817 + struct lvm_logical_volume * new_volume;
18818 + u32 table_entries_per_chunk, table_chunks;
18821 + /* Allocate space for the new logical volume. */
18822 + new_volume = kmalloc(sizeof(struct lvm_logical_volume), GFP_NOIO);
18823 + if (!new_volume) {
18824 + LOG_CRITICAL("Error allocating new logical volume %s\n",
18828 + memset(new_volume, 0, sizeof(struct lvm_logical_volume));
18830 + /* Allocate space for the LE to PE mapping table. */
18831 + new_volume->le_map = vmalloc(lv->lv_allocated_le *
18832 + sizeof(struct le_table_entry));
18833 + if (!new_volume->le_map) {
18834 + LOG_CRITICAL("Error creating LE map for logical volume %s\n",
18838 + memset(new_volume->le_map, 0,
18839 + lv->lv_allocated_le * sizeof(struct le_table_entry));
18841 + /* Initialize the rest of the new volume.
18842 + * Need the +1 on lv_number to match the PE Map entries on the PV.
18844 + new_volume->lv_number = lv->lv_number + 1;
18845 + new_volume->lv_size = lv->lv_size;
18846 + new_volume->lv_access = lv->lv_access | EVMS_LV_NEW | EVMS_LV_QUIESCED;
18847 + new_volume->lv_status = lv->lv_status | LV_ACTIVE;
18848 + new_volume->lv_minor = MINOR(lv->lv_dev);
18849 + new_volume->stripes = lv->lv_stripes;
18850 + new_volume->stripe_size = lv->lv_stripesize;
18851 + new_volume->stripe_size_shift = evms_cs_log2(lv->lv_stripesize);
18852 + new_volume->pe_size = group->vg->pe_size;
18853 + new_volume->pe_size_shift = evms_cs_log2(group->vg->pe_size);
18854 + new_volume->num_le = lv->lv_allocated_le;
18855 + new_volume->group = group;
18856 + /* Different naming scheme for EVMS nodes. */
18857 + if ( translate_lv_name(lv->lv_name, new_volume->name) ) {
18861 + if ( new_volume->lv_access & LV_SNAPSHOT ) {
18862 + /* This volume is a snapshot, initialize the remaining data,
18863 + * and allocate space for the remapping structures, and one
18864 + * sector's worth of COW tables.
18866 + new_volume->chunk_size = lv->lv_chunk_size;
18867 + new_volume->num_chunks = lv->lv_size / lv->lv_chunk_size;
18868 + new_volume->snap_org_minor = lv->lv_snapshot_minor;
18869 + new_volume->next_cow_entry = 0;
18870 + new_volume->current_cow_sector = 0;
18871 + table_entries_per_chunk = (new_volume->chunk_size <<
18872 + EVMS_VSECTOR_SIZE_SHIFT) /
18873 + sizeof(struct lv_COW_table_disk);
18874 + table_chunks = (new_volume->num_chunks +
18875 + table_entries_per_chunk - 1) /
18876 + table_entries_per_chunk;
18877 + new_volume->next_free_chunk = table_chunks *
18878 + new_volume->chunk_size;
18879 + new_volume->hash_table_size = (lv->lv_size / lv->lv_chunk_size /
18880 + MAX_HASH_CHAIN_ENTRIES) + 1;
18882 + new_volume->cow_table = kmalloc(EVMS_VSECTOR_SIZE, GFP_NOIO);
18883 + if (!new_volume->cow_table) {
18884 + LOG_CRITICAL("Error allocating COW table for logical volume %s\n",
18888 + memset(new_volume->cow_table, 0, EVMS_VSECTOR_SIZE);
18890 + new_volume->snapshot_map = kmalloc((group->pv_count + 1) *
18891 + sizeof(struct snapshot_map_entry **),
18893 + if (!new_volume->snapshot_map) {
18894 + LOG_CRITICAL("Error allocating snapshot map for logical volume %s\n",
18899 + new_volume->snapshot_map[0] = NULL;
18900 + for ( i = 1; i <= group->pv_count; i++ ) {
18901 + new_volume->snapshot_map[i] =
18902 + vmalloc(new_volume->hash_table_size *
18903 + sizeof(struct snapshot_map_entry *));
18904 + if (!new_volume->snapshot_map[i]) {
18905 + LOG_CRITICAL("Error allocating snapshot sub-map for logical volume %s\n",
18909 + memset(new_volume->snapshot_map[i], 0,
18910 + new_volume->hash_table_size *
18911 + sizeof(struct snapshot_map_entry *));
18913 + init_MUTEX(&new_volume->snap_semaphore);
18914 + } else if ( new_volume->lv_access & LV_SNAPSHOT_ORG ) {
18915 + /* This volume is a snapshot original, allocate space to use for
18916 + * copying snapshot chunks. This will now be a fixed size
18917 + * instead of being based on the chunk size of the snapshots.
18919 + new_volume->chunk_size = CHUNK_DATA_BUFFER_SIZE;
18920 + new_volume->chunk_data_buffer =
18921 + kmalloc(new_volume->chunk_size <<
18922 + EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO);
18923 + if (!new_volume->chunk_data_buffer) {
18924 + LOG_SERIOUS("Error allocating snapshot chunk buffer for logical volume %s\n",
18928 + memset(new_volume->chunk_data_buffer, 0,
18929 + new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT);
18933 + return new_volume;
18935 + deallocate_logical_volume(new_volume);
18936 + new_volume = NULL;
18941 + * deallocate_volume_group
18943 + * Delete the entire in-memory representation of an LVM volume group,
18944 + * including all PVs and logical volumes. If this group is on LVM's
18945 + * volume group list, remove it.
18947 +static int deallocate_volume_group(struct lvm_volume_group * group)
18949 + struct lvm_physical_volume * pv_entry, * next_pv;
18952 + LOG_DEBUG("Deleting volume group %s\n", group->vg_name);
18954 + /* Remove the group from the global list. */
18955 + remove_group_from_list(group);
18957 + /* Delete the LV metadata array. */
18958 + if (group->lv_array) {
18959 + vfree(group->lv_array);
18960 + group->lv_array = NULL;
18963 + /* Delete the PV UUID list. */
18964 + if (group->uuid_list) {
18965 + vfree(group->uuid_list);
18966 + group->uuid_list = NULL;
18969 + /* Delete all logical volumes. */
18970 + for ( i = 1; i <= MAX_LV; i++ ) {
18971 + if (group->volume_list[i]) {
18972 + deallocate_logical_volume(group->volume_list[i]);
18973 + group->volume_list[i] = NULL;
18977 + /* Delete all PVs from the group's list. */
18978 + for ( pv_entry = group->pv_list; pv_entry; pv_entry = next_pv ) {
18979 + next_pv = pv_entry->next;
18980 + if (pv_entry->logical_node) {
18981 + /* Send a delete command down to the segment manager. */
18982 + LOG_DEBUG("Deleting PV %s from group %s\n",
18983 + pv_entry->logical_node->name, group->vg_name);
18984 + DELETE(pv_entry->logical_node);
18985 + pv_entry->logical_node = NULL;
18987 + deallocate_physical_volume(pv_entry);
18990 + /* Delete the VG metadata. */
18992 + kfree(group->vg);
18993 + group->vg = NULL;
19001 + * allocate_volume_group
19003 + * Allocate space for a new LVM volume group and all of its sub-fields.
19004 + * Initialize the appropriate fields.
19005 + * vg parameter should already have an allocated/initialized struct vg_disk.
19007 +static struct lvm_volume_group * allocate_volume_group(struct vg_disk * vg,
19010 + struct lvm_volume_group * new_group;
19012 + /* The volume group itself. */
19013 + new_group = kmalloc(sizeof(struct lvm_volume_group), GFP_NOIO);
19014 + if (!new_group) {
19019 + /* Initialize the new group. */
19020 + memset(new_group, 0, sizeof(struct lvm_volume_group));
19021 + memcpy(new_group->vg_uuid, vg->vg_uuid, UUID_LEN);
19022 + strncpy(new_group->vg_name, vg_name, NAME_LEN - 1);
19023 + new_group->vg = vg;
19024 + /* Default sector and block sizes. */
19025 + new_group->hard_sect_size = 512;
19026 + new_group->block_size = 1024;
19027 + new_group->flags = EVMS_VG_DIRTY;
19029 + LOG_DETAILS("Discovered volume group %s\n", new_group->vg_name);
19032 + return new_group;
19036 + * remove_pv_from_group
19038 + * In the engine, when a PV is removed from a group (on a vgreduce), that
19039 + * same PV must be removed from that group in the kernel. Otherwise, when
19040 + * the rediscover occurs, that PV will still appear in the group, and
19041 + * will cause segfaults when we try to read metadata from it.
19043 +static int remove_pv_from_group(int pv_number, unsigned char * vg_uuid)
19045 + struct lvm_volume_group * group;
19046 + struct lvm_physical_volume * pv_entry;
19047 + struct lvm_physical_volume ** p_pv_entry;
19049 + /* Make sure the numbers are in range. */
19050 + if ( pv_number < 0 || pv_number > MAX_PV ) {
19054 + /* Make sure the group exists. */
19055 + find_group_by_uuid(vg_uuid, &group);
19060 + /* Make sure the PV is in this group. */
19061 + pv_entry = find_pv_by_number(pv_number, group);
19063 + LOG_WARNING("Did not find PV %d in group %s\n",
19064 + pv_number, group->vg_name);
19068 + /* Make sure the PV is not in use by any volumes. */
19069 + if ( check_pv_for_lv(pv_entry, group) ) {
19070 + LOG_SERIOUS("PV %d in group %s still contains LVs\n",
19071 + pv_number, group->vg_name);
19075 + /* Take this PV out of the group's list. */
19076 + for ( p_pv_entry = &group->pv_list;
19077 + *p_pv_entry; p_pv_entry = &(*p_pv_entry)->next ) {
19078 + if ( *p_pv_entry == pv_entry ) {
19079 + *p_pv_entry = (*p_pv_entry)->next;
19080 + pv_entry->next = NULL;
19085 + group->pv_count--;
19087 + /* There is no way that this PV was the last in this group, so the
19088 + * group never needs to be deleted at this point. The only way this
19089 + * group will exist in the kernel is if there are volumes exported from
19090 + * it. If this was the last PV, then those volumes must be on that PV,
19091 + * and it wouldn't be allowed to be removed from the group (above).
19094 + /* Free up the memory for this PV. Just drop the node. */
19095 + deallocate_physical_volume(pv_entry);
19097 + LOG_DEBUG("PV %d removed from group %s\n", pv_number, group->vg_name);
19102 +/********** Consistency Checking Functions **********/
19106 + * clear_le_entries_for_missing_pv
19108 + * In the event that a PV turns up missing during a rediscover, we
19109 + * need to erase any LE map entries that might point to it.
19112 +clear_le_entries_for_missing_pv(struct lvm_volume_group * group,
19113 + struct lvm_physical_volume * pv_entry)
19115 + struct lvm_logical_volume * volume;
19118 + for ( i = 1; i <= MAX_LV; i++ ) {
19119 + if (group->volume_list[i]) {
19120 + volume = group->volume_list[i];
19121 + for ( j = 0; j < volume->num_le; j++ ) {
19122 + if ( volume->le_map[j].owning_pv == pv_entry ) {
19123 + volume->le_map[j].owning_pv = NULL;
19124 + volume->le_map[j].pe_sector_offset = 0;
19132 + * check_volume_groups
19134 + * This function performs some simple consistency checks on all dirty
19135 + * volume groups. Any groups that have no PVs are deleted. If any metadata
19136 + * structures (PV or VG) are missing, they are read in from disk.
19138 +static int check_volume_groups(void)
19140 + struct lvm_volume_group * group, * next_group;
19141 + struct lvm_physical_volume * pv_entry, * next_pv;
19144 + for ( group = lvm_group_list; group; group = next_group ) {
19145 + next_group = group->next_group;
19147 + LOG_DEBUG("Checking Group %s\n", group->vg_name);
19149 + /* If a group has no PVs, it can be safely deleted,
19150 + * because we can't find any volumes on it.
19152 + if (!group->pv_count) {
19153 + LOG_WARNING("No PVs found for Group %s.\n",
19155 + if (!group->volume_count) {
19156 + deallocate_volume_group(group);
19161 + /* Make sure all metadata for the PVs is present. On a
19162 + * rediscover, it may be missing, because we delete it at the
19163 + * end of discovery. If any is missing, read it in from disk.
19164 + * This is only necessary in the kernel. It can't happen in
19167 + for ( pv_entry = group->pv_list;
19168 + pv_entry; pv_entry = next_pv ) {
19169 + next_pv = pv_entry->next;
19170 + if (!pv_entry->pv) {
19171 + LOG_DEBUG("Re-reading PV metadata for %s\n",
19172 + pv_entry->logical_node->name);
19173 + rc = read_pv(pv_entry->logical_node,
19176 + /* What happens if we can't re-read the
19177 + * PV metadata? This PV must be removed
19178 + * from the group. Need to also clear
19179 + * all LE entries in all LVs that are
19180 + * pointing to this PV before it can be
19181 + * removed from the list.
19183 + LOG_SERIOUS("PV metadata is missing or cannot be read from %s\n",
19184 + pv_entry->logical_node->name);
19185 + clear_le_entries_for_missing_pv(group,
19187 + remove_pv_from_group(pv_entry->pv_number,
19191 + pv_entry->pv_number = pv_entry->pv->pv_number;
19193 + /* Check for a "stale" PV. This case should be
19194 + * already covered, as long as the Engine is
19195 + * calling the PV_REMOVE ioctl when it does a
19196 + * vgreduce or a pvremove. If this is the last
19197 + * PV in the group, the group will be deleted.
19199 + if (!pv_entry->pv_number) {
19200 + remove_pv_from_group(0, group->vg_uuid);
19205 + if (!pv_entry->pe_map) {
19206 + LOG_DEBUG("Re-reading PE maps for %s\n",
19207 + pv_entry->logical_node->name);
19208 + rc = read_pe_map(pv_entry);
19210 + LOG_WARNING("Error reading PE maps for %s\n",
19211 + pv_entry->logical_node->name);
19212 + LOG_WARNING("Any volumes residing on %s will be incomplete!\n",
19213 + pv_entry->logical_node->name);
19218 + /* Make sure the metadata for the VG is present. If it's
19219 + * missing, read it in from the first PV in the VG.
19221 + if (!group->vg && group->pv_count) {
19222 + LOG_DEBUG("Re-reading VG metadata for Group %s\n",
19224 + pv_entry = group->pv_list;
19225 + rc = read_vg(pv_entry->logical_node,
19226 + pv_entry->pv, &group->vg);
19228 + /* What happens if we can't re-read the
19229 + * VG metadata? It's definitely bad
19230 + * news. Should we delete the VG?
19236 + /* Display a warning if the number of PVs found for the group
19237 + * doesn't match the number of PVs recorded for the VG.
19239 + if ( group->vg && group->pv_count != group->vg->pv_cur ) {
19240 + LOG_WARNING("Group %s is incomplete.\n",
19242 + LOG_WARNING(" Only %d of %d PVs found.\n",
19243 + group->pv_count, group->vg->pv_cur);
19244 + LOG_WARNING(" Volumes in this group may be incomplete.\n");
19254 + * Make sure all volumes in this group have valid LE-to-PE maps. Any
19255 + * volume that doesn't is marked as incomplete. This is safe for
19256 + * re-discovery because only new volumes could have corrupted LE maps.
19258 +static int check_le_maps(struct lvm_volume_group * group)
19260 + struct lvm_logical_volume * volume;
19263 + for ( i = 1; i <= MAX_LV; i++ ) {
19264 + volume = group->volume_list[i];
19269 + if (!volume->le_map) {
19270 + /* No point in keeping the volume around if it has
19271 + * no LE map at all.
19273 + LOG_SERIOUS("Volume %s has no LE map.\n", volume->name);
19274 + deallocate_logical_volume(volume);
19278 + /* If any entries in the LE map are missing, mark this volume
19281 + for ( j = 0, count = 0; j < volume->num_le; j++ ) {
19282 + if ( !volume->le_map[j].owning_pv ||
19283 + !volume->le_map[j].pe_sector_offset) {
19288 + LOG_SERIOUS("Volume %s has incomplete LE map.\n",
19290 + LOG_SERIOUS(" Missing %d out of %d LEs.\n",
19291 + count, volume->num_le);
19292 + volume->lv_access |= EVMS_LV_INCOMPLETE;
19299 + * check_snapshot_map
19301 + * For snapshot volumes, make sure the snapshot map is intact, and that
19302 + * any existing entries in the map are in the correct order and there
19303 + * are no duplicate entries.
19305 +static int check_snapshot_map(struct lvm_logical_volume * snap_volume)
19307 + struct snapshot_map_entry ** table, * curr;
19310 + if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
19313 + if (!snap_volume->snapshot_map) {
19314 + snap_volume->lv_access |= EVMS_LV_INVALID;
19318 + for ( i = 1; i <= snap_volume->group->pv_count; i++ ) {
19319 + if (!snap_volume->snapshot_map[i]) {
19320 + snap_volume->lv_access |= EVMS_LV_INVALID;
19323 + table = snap_volume->snapshot_map[i];
19324 + for ( j = 0; j < snap_volume->hash_table_size; j++ ) {
19325 + for ( curr = table[j]; curr; curr = curr->next ) {
19326 + if ( curr->next &&
19327 + curr->org_sector >=
19328 + curr->next->org_sector) {
19329 + snap_volume->lv_access |=
19340 + * check_logical_volumes
19342 + * Perform a consistency check on all of the logical volumes that have been
19343 + * discovered. Any volume that has any inconsistencies will be marked as
19344 + * incomplete or invalid, depending on the severity of the problem. At the
19345 + * end, all invalid volumes are deleted. If the deleted_incompletes
19346 + * parameter is set, those will also be deleted.
19348 +static int check_logical_volumes(int final_discovery)
19350 + struct lvm_volume_group * group;
19351 + struct lvm_logical_volume * volume, * snap, * next;
19354 + /* Check every valid, dirty volume group. */
19355 + for ( group = lvm_group_list; group; group = group->next_group ) {
19356 + if ( ! (group->flags & EVMS_VG_DIRTY) ) {
19359 + /* Check every valid volume in this group. */
19360 + for ( i = 1; i <= MAX_LV; i++ ) {
19361 + volume = group->volume_list[i];
19366 + LOG_DEBUG("Checking logical volume %s\n", volume->name);
19368 + if (!volume->group) {
19369 + volume->group = group;
19372 + /* All LE-map entries must have valid values. The I/O
19373 + * paths now detect missing LE entries.
19375 + if (volume->le_map) {
19376 + for ( j = 0, count = 0;
19377 + j < volume->num_le; j++ ) {
19378 + if ( !volume->le_map[j].owning_pv ||
19379 + !volume->le_map[j].pe_sector_offset ) {
19384 + LOG_SERIOUS("Volume %s has incomplete LE map.\n",
19386 + LOG_SERIOUS(" Missing %d out of %d LEs.\n",
19387 + count, volume->num_le);
19388 + volume->lv_access |= EVMS_LV_INCOMPLETE;
19390 + /* In case this volume was previously
19391 + * marked incomplete.
19393 + volume->lv_access &=
19394 + ~EVMS_LV_INCOMPLETE;
19397 + /* This should only ever happen due to
19398 + * memory corruption.
19400 + LOG_SERIOUS("Volume %s has no LE map.\n",
19402 + volume->lv_access |= EVMS_LV_INVALID;
19405 + if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
19406 + /* For a snapshot original, check all snapshots
19407 + * in the chain, to make sure they point back to
19408 + * the original. Also, make sure there is memory
19409 + * for the chunk buffer.
19411 + for ( snap = volume->snapshot_next, count = 0;
19413 + snap = snap->snapshot_next, count++ ) {
19414 + if ( snap->snapshot_org != volume ) {
19415 + LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n",
19417 + snap->snapshot_org = NULL;
19418 + snap->lv_access |=
19423 + LOG_WARNING("No snapshots found for volume %s\n",
19425 + if (final_discovery) {
19426 + volume->lv_access &=
19427 + ~LV_SNAPSHOT_ORG;
19429 + } else if (!volume->chunk_data_buffer) {
19430 + volume->lv_access |= EVMS_LV_INVALID;
19432 + } else if ( volume->lv_access & LV_SNAPSHOT ) {
19433 + /* For a snapshot volume, make sure it points
19434 + * back to its original. Also make sure there is
19435 + * memory for the cow table, and that any
19436 + * existing snapshot entries in the snapshot map
19437 + * are correctly ordered.
19439 + /* Is there a COW table? */
19440 + if (!volume->cow_table) {
19441 + LOG_SERIOUS("Snapshot volume %s has no COW table\n",
19443 + volume->lv_access |= EVMS_LV_INVALID;
19445 + /* Is the snapshot map in order? */
19446 + if ( check_snapshot_map(volume) ) {
19447 + LOG_SERIOUS("Snapshot volume %s has snapshot map inconsistency\n",
19449 + volume->lv_access |= EVMS_LV_INVALID;
19451 + /* Is there an original volume? This is only
19452 + * a real problem during final discovery.
19454 + if (!volume->snapshot_org) {
19455 + LOG_SERIOUS("Snapshot volume %s not pointing at an original\n",
19457 + if (final_discovery) {
19458 + volume->lv_access |=
19462 + /* Is the original the correct one? */
19463 + else if ( volume->snap_org_minor !=
19464 + volume->snapshot_org->lv_minor ) {
19465 + LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n",
19467 + volume->lv_access |= EVMS_LV_INVALID;
19470 + /* Delete any invalid volumes from use. Delete
19471 + * incomplete volumes as well if this is not final
19472 + * discovery. If a snapshot original is bad, delete all
19473 + * of its snapshots.
19475 + if ( volume->lv_access & EVMS_LV_INVALID ||
19476 + (!final_discovery &&
19477 + (volume->lv_access & EVMS_LV_INCOMPLETE) &&
19478 + (volume->lv_access & EVMS_LV_NEW)) ) {
19479 + if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
19480 + for ( snap = volume->snapshot_next;
19481 + snap; snap = next ) {
19482 + next = snap->snapshot_next;
19483 + snap->snapshot_next = NULL;
19484 + snap->snapshot_org = NULL;
19485 + invalidate_snapshot_volume(snap);
19486 + deallocate_logical_volume(snap);
19488 + volume->snapshot_next = NULL;
19489 + } else if ( volume->lv_access & LV_SNAPSHOT ) {
19490 + invalidate_snapshot_volume(volume);
19492 + deallocate_logical_volume(volume);
19501 +/********** Volume Group Discovery Functions **********/
19505 + * find_group_for_pv
19507 + * This is a discover-time function. It reads the VG metadata info for the
19508 + * specified node, and locates the appropriate group that owns that
19509 + * node. If that group does not already exist, it is created and
19512 +static int find_group_for_pv(struct evms_logical_node * node,
19513 + struct pv_disk * pv,
19514 + struct lvm_volume_group ** group)
19516 + struct vg_disk * vg;
19521 + /* Check for an unassigned PV. */
19522 + if ( pv->vg_name[0] == 0 ) {
19526 + /* Read the VG on-disk info for this PV. If this succeeds, it
19527 + * allocates a new VG metadata structure.
19529 + rc = read_vg(node, pv, &vg);
19534 + /* Use the UUID from the VG metadata to determine if this group
19535 + * has already been discovered and constructed.
19537 + find_group_by_uuid(vg->vg_uuid, group);
19540 + /* Create a new group entry and add to the global list. */
19541 + *group = allocate_volume_group(vg, pv->vg_name);
19545 + add_group_to_list(*group);
19546 + } else if (!(*group)->vg) {
19547 + /* On a rediscover, the VG metadata for an existing group might
19548 + * be missing. Fill it in if necessary. This check is also not
19549 + * necessary in the engine, since the metadata is never deleted.
19551 +/* Should we re-copy vg_name? (vg_uuid can not be allowed to change).
19552 + * Or should vg_name changes be done through direct ioctl only?
19554 + (*group)->vg = vg;
19559 + /* Read in the UUID list for this group, if it isn't present. */
19560 + rc = read_uuid_list(node, pv, *group);
19562 + LOG_WARNING("Error reading UUID list for group %s.\n",
19563 + (*group)->vg_name);
19564 + LOG_WARNING("May not be able to verify PV UUIDs for group %s\n",
19565 + (*group)->vg_name);
19568 + /* In the kernel, any time we even see a PV for a group, that group
19569 + * must be marked dirty so its volumes will be re-exported.
19571 + (*group)->flags |= EVMS_VG_DIRTY;
19577 + * check_for_duplicate_pv
19579 + * Search the list of PVs in the specified volume group. If the
19580 + * specified node already exists in the list, we can discard it.
19582 +static int check_for_duplicate_pv(struct evms_logical_node * node,
19583 + struct pv_disk * pv,
19584 + struct lvm_volume_group * group)
19586 + struct lvm_physical_volume * pv_entry;
19588 + /* For re-discovery, we need to search all existing PVs in this VG to
19589 + * make sure we didn't get a duplicate from the plugin below us. The
19590 + * plugins below us should be re-exporting the same node on
19591 + * re-discovery, instead of creating a new node to represent the same
19592 + * objects, so just check the memory location.
19594 + for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
19595 + if ( pv_entry->logical_node == node ) {
19597 + /* We found a duplicate. Just ignore the duplicate. */
19598 + LOG_DEBUG("PV %s is already in Group %s.\n",
19599 + node->name, group->vg_name);
19601 + /* Even if the node was a duplicate, we may need to
19602 + * fill in the pv entry for this partition, since we
19603 + * always delete those at the end of discovery.
19605 + if (!pv_entry->pv) {
19606 + pv_entry->pv = pv;
19607 + pv_entry->pv_number = pv->pv_number;
19616 + /* No duplicate was found. */
19623 + * Verify that the specified PV belongs in the specified group by
19624 + * searching for the PV's UUID in the group's list.
19626 +static int verify_pv_uuid(struct lvm_physical_volume * pv_entry,
19627 + struct lvm_volume_group * group)
19631 + /* Obviously the UUID list must be present in order to search. */
19632 + if (!group->uuid_list) {
19633 + LOG_WARNING("UUID list is missing from group %s.\n",
19635 + LOG_WARNING("Cannot verify UUID for PV %s\n",
19636 + pv_entry->logical_node->name);
19640 + /* Start with the UUID entry for this PV's number. */
19641 + if ( ! memcmp(pv_entry->pv->pv_uuid,
19642 + &(group->uuid_list[(pv_entry->pv_number - 1) * NAME_LEN]),
19647 + /* If it wasn't found there, then search the entire group's list. */
19648 + for ( i = 0; i < group->vg->pv_cur; i++ ) {
19649 + if ( ! memcmp(pv_entry->pv->pv_uuid,
19650 + &(group->uuid_list[i * NAME_LEN]), UUID_LEN) ) {
19651 + /* Found the UUID. */
19652 + LOG_WARNING("Detected UUID mismatch for PV %s!\n",
19653 + pv_entry->logical_node->name);
19654 + LOG_WARNING("PV %s is recorded as being at index %d,\n",
19655 + pv_entry->logical_node->name,
19656 + pv_entry->pv_number);
19657 + LOG_WARNING(" but Group %s has it recorded at index %d.\n",
19658 + group->vg_name, i + 1);
19659 + LOG_WARNING("Run the EVMS Engine to correct the problem.\n");
19660 + LOG_WARNING("If you have any snapshot regions in group %s\n",
19662 + LOG_WARNING(" it is recommended that you delete them immediately!\n");
19667 + LOG_SERIOUS("Could not find UUID for PV %s in group %s\n",
19668 + pv_entry->logical_node->name, group->vg_name);
19673 + * add_pv_to_group
19675 + * Adds the physical volume to the appropriate volume group. The PV
19676 + * passed into this function MUST be part of a valid VG.
19678 +static int add_pv_to_group(struct lvm_physical_volume * pv_entry,
19679 + struct lvm_volume_group * group)
19683 + /* Make sure this PV's UUID is listed in the group. */
19684 + rc = verify_pv_uuid(pv_entry, group);
19686 + LOG_SERIOUS("PV %s does not belong in group %s!\n",
19687 + pv_entry->logical_node->name, group->vg_name);
19691 + /* Add this PV to the beginning of its group's list. */
19692 + pv_entry->next = group->pv_list;
19693 + group->pv_list = pv_entry;
19694 + group->pv_count++;
19696 + /* Update the group's block and hardsector sizes as appropriate. */
19697 + group->block_size = max(pv_entry->logical_node->block_size,
19698 + group->block_size);
19699 + group->hard_sect_size = max(pv_entry->logical_node->hardsector_size,
19700 + group->hard_sect_size);
19702 + /* Check for the Partial or Removable flag on the PV. */
19703 + if ( pv_entry->logical_node->flags & EVMS_VOLUME_PARTIAL ) {
19704 + group->flags |= EVMS_VG_PARTIAL_PVS;
19706 + if ( pv_entry->logical_node->flags & EVMS_DEVICE_REMOVABLE ) {
19707 + group->flags |= EVMS_VG_REMOVABLE_PVS;
19710 + LOG_DETAILS("PV %s added to Group %s\n",
19711 + pv_entry->logical_node->name, group->vg_name);
19717 + * discover_volume_groups
19719 + * Examine the list of logical nodes. Any node that contains a valid PV
19720 + * structure is consumed and added to the appropriate volume group. PVs
19721 + * which do not belong to any group are deleted. Everything else is left
19722 + * on the discovery list.
19724 +static int discover_volume_groups(struct evms_logical_node ** evms_node_list)
19726 + struct evms_logical_node * node, * next_node;
19727 + struct pv_disk * pv;
19728 + struct lvm_volume_group * group;
19729 + struct lvm_physical_volume * pv_entry;
19732 + LOG_EXTRA("Searching for PVs in the node list.\n");
19734 + /* Run through the discovery list. */
19735 + for ( node = *evms_node_list; node; node = next_node ) {
19736 + /* Save the next node. We may remove this one from the list. */
19737 + next_node = node->next;
19739 + /* Read the PV metadata. This will also create a new struct pv_disk
19740 + * if it finds the correct LVM signatures.
19742 + rc = read_pv(node, &pv);
19744 + /* This node is not an LVM PV, or an error occurred.
19745 + * Just leave the node on the discovery list.
19750 + rc = find_group_for_pv(node, pv, &group);
19752 + /* Error getting the group for this PV. */
19758 + /* This node is an unassigned PV. */
19759 + LOG_DETAILS("PV %s is unassigned.\n", node->name);
19764 + rc = check_for_duplicate_pv(node, pv, group);
19766 + /* This node is already in the group. This check is also
19767 + * only in the kernel because the engine has no notion
19768 + * of rediscover, and thus can never get a duplicate.
19770 + evms_cs_remove_logical_node_from_list(evms_node_list,
19775 + /* Allocate a PV entry for this node. */
19776 + pv_entry = allocate_physical_volume(node, pv);
19781 + /* Add this PV to the appropriate volume group. */
19782 + rc = add_pv_to_group(pv_entry, group);
19784 + deallocate_physical_volume(pv_entry);
19788 + rc = read_pe_map(pv_entry);
19790 + LOG_WARNING("Error reading PE maps for node %s\n",
19792 + LOG_WARNING("Any volumes residing on this node will be incomplete!\n");
19795 + evms_cs_remove_logical_node_from_list(evms_node_list, node);
19798 + LOG_EXTRA("Group discovery complete.\n");
19803 +/********** Logical Volume Discovery Functions **********/
19809 + * After all logical volumes have been discovered, the mappings from
19810 + * logical extents to physical extents must be constructed. Each PV
19811 + * contains a map on-disk of its PEs. Each PE map entry contains the
19812 + * logical volume number and the logical extent number on that volume.
19813 + * Our internal map is the reverse of this map for each volume, listing
19814 + * the PV node and sector offset for every logical extent on the volume.
19816 +static int build_le_maps(struct lvm_volume_group * group)
19818 + struct lvm_logical_volume ** volume_list = group->volume_list;
19819 + struct lvm_physical_volume * pv_entry;
19820 + struct evms_logical_node * node;
19821 + struct pv_disk * pv;
19822 + struct pe_disk * pe_map;
19824 + u32 lv_number, le_number, first_pe_sector;
19827 + LOG_DEBUG("Building LE maps for new volumes in group %s.\n",
19830 + /* For every PV in this VG. */
19831 + for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
19832 + node = pv_entry->logical_node;
19833 + pv = pv_entry->pv;
19834 + pe_map = pv_entry->pe_map;
19836 + /* Version 1 metadata uses pe_on_disk.base + .size to find start
19837 + * of first PE. Version 2 uses pe_start.
19839 + if (pv->version == 1) {
19840 + first_pe_sector =
19841 + evms_cs_size_in_vsectors(pv->pe_on_disk.base +
19842 + pv->pe_on_disk.size);
19844 + first_pe_sector = pv->pe_start;
19845 + if (!first_pe_sector) {
19846 + first_pe_sector =
19847 + evms_cs_size_in_vsectors(pv->pe_on_disk.base +
19848 + pv->pe_on_disk.size);
19852 + /* For every entry in the PE map, calculate the PE's sector offset
19853 + * and update the correct LV's PE map. LV number of 0 marks an unused PE.
19854 + * For re-discovery, only compute entries for new volumes. If a PV
19855 + * is read-only, all LVs on that PV will also be read-only.
19857 + for ( i = 0; i < pv->pe_total; i++ ) {
19858 + lv_number = pe_map[i].lv_num;
19859 + if ( lv_number &&
19860 + volume_list[lv_number] &&
19861 + volume_list[lv_number]->lv_access &
19862 + (EVMS_LV_NEW | EVMS_LV_INCOMPLETE) ) {
19863 + le_number = pe_map[i].le_num;
19864 + offset = i * pv->pe_size + first_pe_sector;
19865 + volume_list[lv_number]->le_map[le_number].owning_pv =
19867 + volume_list[lv_number]->le_map[le_number].pe_sector_offset =
19869 + if ( node->flags & EVMS_VOLUME_SET_READ_ONLY ) {
19870 + volume_list[lv_number]->lv_access &=
19881 + * build_snapshot_maps
19883 + * For every volume in this group that is a snapshot, read all of the
19884 + * existing entries in the COW table, and build up the snapshot mapping
19885 + * structures accordingly.
19887 + * For reference, the COW tables attached to the snapshot volumes will
19888 + * always be in disk-order (little-endian), so that it can always be
19889 + * immediately written to disk. Therefore, endian conversions are necessary
19890 + * any time the COW table is accessed. This function will make a local
19891 + * copy of each COW table sector, and convert the local copy before
19892 + * building the snapshot maps.
19894 +static int build_snapshot_maps(struct lvm_volume_group * group)
19896 + struct lvm_logical_volume * volume;
19897 + struct evms_logical_node tmp_node;
19898 + struct lv_COW_table_disk cow_table[EVMS_VSECTOR_SIZE /
19899 + sizeof(struct lv_COW_table_disk)];
19900 + unsigned long max_entries = EVMS_VSECTOR_SIZE /
19901 + sizeof(struct lv_COW_table_disk);
19904 + /* Check every volume in the group to see if it is a snapshot. Also
19905 + * check to make sure it is a new volume in the case of re-discovery.
19907 + for ( i = 1; i <= MAX_LV; i++ ) {
19909 + /* The volume must exist, must be new, and must be a snapshot.
19911 + volume = group->volume_list[i];
19913 + !(volume->lv_access & EVMS_LV_NEW) ||
19914 + !(volume->lv_access & LV_SNAPSHOT)) {
19918 + /* Set up a temporary EVMS node. */
19919 + tmp_node.private = volume;
19921 + LOG_DEBUG("Building snapshot map for volume %s\n",
19925 + /* Read in one sector's worth of COW tables. */
19926 + if ( lvm_init_io(&tmp_node, 0,
19927 + volume->current_cow_sector,
19928 + 1, volume->cow_table) ) {
19932 + /* Endian-conversion of this COW table
19933 + * to a local table.
19935 + for ( j = 0; j < max_entries; j++ ) {
19936 + cow_table[j].pv_org_number =
19937 + le64_to_cpu(volume->cow_table[j].pv_org_number);
19938 + cow_table[j].pv_org_rsector =
19939 + le64_to_cpu(volume->cow_table[j].pv_org_rsector);
19940 + cow_table[j].pv_snap_number =
19941 + le64_to_cpu(volume->cow_table[j].pv_snap_number);
19942 + cow_table[j].pv_snap_rsector =
19943 + le64_to_cpu(volume->cow_table[j].pv_snap_rsector);
19946 + /* Translate every valid COW table entry into
19947 + * a snapshot map entry.
19949 + for ( volume->next_cow_entry = 0;
19950 + volume->next_cow_entry < max_entries &&
19951 + cow_table[volume->next_cow_entry].pv_org_number;
19952 + volume->next_cow_entry++ ) {
19953 + /* org_rsector must be a valid sector number,
19954 + * i.e. it can't be within a PVs metadata. This
19955 + * is how we detect invalidated snapshots.
19957 + if ( cow_table[volume->next_cow_entry].pv_org_rsector < 10 ||
19958 + cow_table[volume->next_cow_entry].pv_org_number > group->pv_count ||
19959 + add_cow_entry_to_snapshot_map(&(cow_table[volume->next_cow_entry]), volume) ) {
19960 + /* This volume either has an invalid COW entry,
19961 + * or had an error adding that COW entry to the
19962 + * snapshot map. This snapshot is done.
19966 + volume->next_free_chunk += volume->chunk_size;
19969 + /* Move on to the next sector if necessary. */
19970 + if ( volume->next_cow_entry == max_entries ) {
19971 + volume->current_cow_sector++;
19981 + invalidate_snapshot_volume(volume);
19982 + deallocate_logical_volume(volume);
19987 + * link_snapshot_volumes
19989 + * This function examines the list of logical volumes in this group and
19990 + * sets up the necessary pointers to link snapshots and their originals.
19991 + * A singly-linked list is created starting with the original volume. Also,
19992 + * all snapshot volumes point directly back to their original. This
19993 + * function should not be run until all volumes have been discovered.
19994 + * In the case of re-discovery, all of these links/lists get rebuilt as if
19995 + * they were not already there. Currently this should not pose a problem.
19997 +static int link_snapshot_volumes(struct lvm_volume_group * group)
19999 + struct lvm_logical_volume * org_volume, * snap_volume;
20000 + u32 org_minor, buffer_size = 0;
20003 + for ( i = 1; i <= MAX_LV; i++ ) {
20005 + /* Only process snapshot-originals. */
20006 + org_volume = group->volume_list[i];
20007 + if ( !org_volume || !(org_volume->lv_access & LV_SNAPSHOT_ORG) ) {
20011 + /* For snapshot-originals, look for all other volumes that
20012 + * claim to be snapshotting it. For each one that is found,
20013 + * insert it at the start of the original's list of snapshots.
20014 + * Need to start with a NULL snapshot_next, otherwise could
20015 + * wind up with circular lists.
20017 + org_minor = org_volume->lv_minor;
20018 + org_volume->snapshot_next = NULL;
20020 + for ( j = 1; j <= MAX_LV; j++ ) {
20021 + snap_volume = group->volume_list[j];
20022 + if ( snap_volume &&
20023 + snap_volume->lv_access & LV_SNAPSHOT &&
20024 + (snap_volume->snap_org_minor == org_minor) ) {
20025 + snap_volume->snapshot_org = org_volume;
20026 + snap_volume->snapshot_next =
20027 + org_volume->snapshot_next;
20028 + org_volume->snapshot_next = snap_volume;
20029 + if ( snap_volume->chunk_size > buffer_size ) {
20030 + buffer_size = snap_volume->chunk_size;
20032 + LOG_DEBUG("Linking snapshot (%s) to original (%s)\n",
20033 + snap_volume->name, org_volume->name);
20037 + /* If no snapshots were found for a volume that claims to be
20038 + * under snapshot, mark the group dirty. If this is final
20039 + * discovery, the original will have the snapshot flag turned
20040 + * off in check_logical_volumes().
20042 + if (!org_volume->snapshot_next) {
20043 + LOG_WARNING("No snapshots found for original (%s)\n",
20044 + org_volume->name);
20045 + group->flags |= EVMS_VG_DIRTY;
20052 + * discover_volumes_in_group
20054 +static int discover_volumes_in_group(struct lvm_volume_group * group)
20056 + struct lv_disk * lv_array = group->lv_array;
20057 + struct lvm_logical_volume * new_volume;
20060 + /* Search through the LV structs for valid LV entries. */
20061 + for ( i = 0; i < group->vg->lv_max; i++ ) {
20063 + /* Only discover valid, active volumes. */
20064 + if ( !lv_array[i].lv_name[0] ||
20065 + lv_array[i].lv_number >= MAX_LV ) {
20069 + /* Make sure this volume isn't already in the list. */
20070 + if (group->volume_list[lv_array[i].lv_number + 1]) {
20074 + /* Create a new logical volume and place it in the appropriate
20075 + * spot in this VG's volume list.
20077 + new_volume = allocate_logical_volume(&(lv_array[i]), group);
20078 + if (!new_volume) {
20079 + /* This volume will be missing, but other
20080 + * volumes in this group can still be built.
20082 + LOG_CRITICAL("Error allocating LV %s in Group %s\n",
20083 + lv_array[i].lv_name, group->vg_name);
20087 + group->volume_list[new_volume->lv_number] = new_volume;
20088 + group->volume_count++;
20089 + group->flags |= EVMS_VG_DIRTY;
20091 + LOG_DEBUG("Discovered volume %s in group %s.\n",
20092 + new_volume->name, group->vg_name);
20099 + * discover_logical_volumes
20101 + * After all PVs have been claimed and added to the appropriate VG list,
20102 + * the volumes for each VG must be constructed. For each group, read all
20103 + * the LV structs off the first PV in the list. Search this list of
20104 + * structs for valid LVs. For each valid LV, create a new volume and add
20105 + * it to the group.
20107 +static int discover_logical_volumes(int final_discovery)
20109 + struct lvm_volume_group *group;
20112 + /* Look for volumes in each valid VG entry. We even need to check ones
20113 + * that aren't dirty - We could have deleted an incomplete volume on
20114 + * the previous pass, and need to rediscover it in case this is final
20115 + * discovery and we now want to export it.
20117 + for ( group = lvm_group_list; group; group = group->next_group ) {
20119 + if ( ! group->vg ||
20120 + (! final_discovery &&
20121 + ! (group->flags & EVMS_VG_DIRTY)) ) {
20125 + LOG_DEBUG("Searching for volumes in group %s\n",
20128 + /* Read in the LV array from disk if necessary. */
20129 + rc = read_lv(group);
20131 + LOG_WARNING("Unable to read LV metadata for group %s\n",
20133 + LOG_WARNING("No regions can be discovered for group %s\n",
20138 + /* Assemble each volume in the group. */
20139 + discover_volumes_in_group(group);
20141 + /* Build the LE map for each LV discovered in this group. This
20142 + * must be done after all LVS in the group are discovered.
20144 + build_le_maps(group);
20145 + check_le_maps(group);
20147 + /* Set up all of the initial snapshot maps. Only the kernel
20148 + * keeps track of the snapshot maps.
20150 + build_snapshot_maps(group);
20152 + /* Set up the pointers to link snapshot volumes
20153 + * with their originals.
20155 + link_snapshot_volumes(group);
20164 + * The last thing the plugin must do is take each newly constructed volume
20165 + * and place it on the evms logical node list. A zero return-code from
20166 + * this function means nothing new was added to the list, and a positive
20167 + * return code means that many new items were added to the list.
20169 +static int export_volumes(struct evms_logical_node ** evms_node_list,
20170 + int final_discover)
20172 + struct lvm_volume_group * group;
20173 + struct evms_logical_node * new_node;
20174 + struct lvm_logical_volume * volume;
20175 + int i, count = 0;
20177 + LOG_EXTRA("Exporting volumes\n");
20179 + /* For every valid, dirty volume group. */
20180 + for ( group = lvm_group_list; group; group = group->next_group ) {
20181 + if ( ! (group->flags & EVMS_VG_DIRTY) ) {
20185 + /* Export every valid volume in the group. For re-discovery,
20186 + * we re-export the same logical node.
20188 + for ( i = 1; i <= MAX_LV; i++ ) {
20189 + volume = group->volume_list[i];
20194 + /* For new volumes, create a new EVMS node and
20195 + * initialize the appropriate fields.
20197 + if ( volume->lv_access & EVMS_LV_NEW ) {
20198 + if ( evms_cs_allocate_logical_node(&new_node) ) {
20201 + MOD_INC_USE_COUNT;
20203 + volume->volume_node = new_node;
20204 + volume->lv_access &= (~EVMS_LV_QUIESCED &
20206 + new_node->hardsector_size =
20207 + group->hard_sect_size;
20208 + new_node->block_size = group->block_size;
20209 + new_node->plugin = &lvm_plugin_header;
20210 + new_node->private = volume;
20211 + memcpy(new_node->name, volume->name, NAME_LEN);
20213 + /* Snapshot volumes should report the
20214 + * size of their original.
20216 + new_node->total_vsectors =
20217 + (volume->lv_access & LV_SNAPSHOT) ?
20218 + volume->snapshot_org->lv_size :
20221 + /* Is the volume read-only? */
20222 + if ( ! (volume->lv_access & LV_WRITE) ) {
20223 + new_node->flags |=
20224 + EVMS_VOLUME_READ_ONLY;
20225 + LOG_DEBUG("LVM volume %s is read-only\n",
20229 + /* Is the volume incomplete? */
20230 + if ( volume->lv_access & EVMS_LV_INCOMPLETE ) {
20231 + new_node->flags |=
20232 + (EVMS_VOLUME_READ_ONLY |
20233 + EVMS_VOLUME_PARTIAL);
20234 + LOG_DEBUG("LVM volume %s is incomplete\n",
20238 + /* Does the volume group contain any partial or
20241 + if ( group->flags & EVMS_VG_PARTIAL_PVS ) {
20242 + new_node->flags |= EVMS_VOLUME_PARTIAL;
20244 + if ( group->flags & EVMS_VG_REMOVABLE_PVS ) {
20245 + new_node->flags |=
20246 + EVMS_DEVICE_REMOVABLE;
20250 + /* Export the node, only if it hasn't been exported
20251 + * during this full EVMS discover.
20253 + if ( ! (volume->lv_access & EVMS_LV_EXPORTED) ) {
20254 + if ( ! evms_cs_add_logical_node_to_list(evms_node_list,
20255 + volume->volume_node) ) {
20256 + LOG_DETAILS("Exporting LVM volume %s\n",
20258 + volume->lv_access |= EVMS_LV_EXPORTED;
20263 + if (final_discover) {
20264 + volume->lv_access &= ~EVMS_LV_EXPORTED;
20268 + /* The group is clean now. */
20269 + group->flags &= ~EVMS_VG_DIRTY;
20278 + * This function runs through the entire lvm data structure, removing
20279 + * all items that are not needed at runtime. Currently, this is just the
20280 + * struct vg_disk structure and the struct pv_disk structure for each PV.
20281 + * Also, any groups that don't contain any volumes are deleted. All of the
20282 + * other volume_group, logical_volume and evms_logical_node structures will
20283 + * be kept around at run-time.
20285 +static int lvm_cleanup(void)
20287 + struct lvm_volume_group * group, * next_group;
20288 + struct lvm_physical_volume * pv_entry;
20290 + for ( group = lvm_group_list; group; group = next_group ) {
20291 + next_group = group->next_group;
20293 + /* Delete groups with no volumes. */
20294 + if (!group->volume_count) {
20295 + LOG_WARNING("Group %s contains no logical volumes. Deleting.\n",
20297 + remove_group_from_list(group);
20298 + deallocate_volume_group(group);
20299 + /* Need to go back to the start of the list,
20300 + * just to be safe. :)
20302 + next_group = lvm_group_list;
20306 + /* Delete data structures that aren't used at runtime. */
20308 + kfree(group->vg);
20309 + group->vg = NULL;
20312 + for ( pv_entry = group->pv_list;
20313 + pv_entry; pv_entry = pv_entry->next) {
20314 + if (pv_entry->pv) {
20315 + kfree(pv_entry->pv);
20316 + pv_entry->pv = NULL;
20318 + if (pv_entry->pe_map) {
20319 + vfree(pv_entry->pe_map);
20320 + pv_entry->pe_map = NULL;
20323 + if (group->lv_array) {
20324 + vfree(group->lv_array);
20325 + group->lv_array = NULL;
20327 + if (group->uuid_list) {
20328 + vfree(group->uuid_list);
20329 + group->uuid_list = NULL;
20338 + * Support for the BMAP ioctl used by LILO to translate filesystem blocks
20339 + * to disk blocks to map kernel images for boot time.
20341 +static int lvm_get_bmap(struct evms_logical_node * node,
20342 + struct evms_get_bmap_pkt * bmap,
20343 + struct evms_logical_node ** pv_node)
20345 + struct lvm_logical_volume * volume = node->private;
20346 + struct lvm_physical_volume * pv_entry;
20347 + u64 pe_start_sector, new_sector = 0, new_size = 0;
20350 + /* No kernel images allowed on snapshot LVs. */
20351 + if ( volume->lv_access & LV_SNAPSHOT ) {
20355 + /* Range check. */
20356 + if ( bmap->rsector >= volume->lv_size ) {
20360 + rc = remap_sector(node, bmap->rsector, 1, &new_sector,
20361 + &new_size, &pe_start_sector, &pv_entry);
20363 + if (rc || !pv_entry || !new_sector) {
20367 + bmap->rsector = new_sector;
20368 + *pv_node = pv_entry->logical_node;
20374 + * lvm_global_proc_read
20376 + * A callback function for the lvm-global proc-fs entry. This will print
20377 + * general info about all LVM VGs, PVs, and LVs.
20379 +static int lvm_global_proc_read(char * page, char ** start, off_t off,
20380 + int count, int * eof, void * data)
20382 + struct lvm_volume_group * group;
20383 + struct lvm_physical_volume * pv_entry;
20384 + struct lvm_logical_volume * volume, * snap;
20385 + int vgs = 0, lvs = 0, pvs = 0;
20388 + PROCPRINT("Enterprise Volume Management System: LVM Plugin\n");
20389 + PROCPRINT("Plugin ID: %x.%x.%x\n",
20390 + GetPluginOEM(lvm_plugin_header.id),
20391 + GetPluginType(lvm_plugin_header.id),
20392 + GetPluginID(lvm_plugin_header.id));
20393 + PROCPRINT("Plugin Version: %d.%d.%d\n",
20394 + lvm_plugin_header.version.major,
20395 + lvm_plugin_header.version.minor,
20396 + lvm_plugin_header.version.patchlevel);
20397 + PROCPRINT("Required EVMS Services Version: %d.%d.%d\n",
20398 + lvm_plugin_header.required_services_version.major,
20399 + lvm_plugin_header.required_services_version.minor,
20400 + lvm_plugin_header.required_services_version.patchlevel);
20402 + /* Count all existing items. */
20403 + for ( group = lvm_group_list; group; group = group->next_group ) {
20404 + lvs += group->volume_count;
20405 + pvs += group->pv_count;
20410 + PROCPRINT("Total: %d VGs %d PVs %d LVs\n", vgs, pvs, lvs);
20412 + /* Print out specifics about each VG. */
20413 + for ( group = lvm_group_list; group; group = group->next_group ) {
20415 + PROCPRINT("VG: %s [%d PV, %d LV]\n",
20416 + group->vg_name, group->pv_count, group->volume_count);
20417 + PROCPRINT("PVs:\n");
20418 + for ( pv_entry = group->pv_list;
20419 + pv_entry; pv_entry = pv_entry->next ) {
20420 + if (pv_entry->logical_node) {
20421 + PROCPRINT("\t%s\t%10Ld KB\n",
20422 + pv_entry->logical_node->name,
20423 + (long long)pv_entry->logical_node->total_vsectors / 2);
20426 + PROCPRINT("LVs:\n");
20427 + for ( i = 1; i <= MAX_LV; i++ ) {
20428 + if (group->volume_list[i]) {
20429 + volume = group->volume_list[i];
20430 + PROCPRINT("\t%s\t%10Ld KB / %5d LEs",
20432 + (long long)volume->lv_size / 2,
20434 + if ( volume->lv_access & LV_SNAPSHOT ) {
20435 + PROCPRINT("\tSnapshot of : ");
20436 + if (volume->snapshot_org) {
20437 + PROCPRINT("%s : ",
20438 + volume->snapshot_org->name);
20440 + PROCPRINT("(unknown) : ");
20442 + PROCPRINT("%ld%% full : ",
20443 + (long)(volume->next_free_chunk) *
20444 + 100 / (long)(volume->lv_size));
20445 + if ( volume->lv_status & LV_ACTIVE ) {
20446 + PROCPRINT("active");
20448 + PROCPRINT("disabled");
20450 + } else if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
20451 + PROCPRINT("\tSnapshotted by : ");
20452 + for ( snap = volume->snapshot_next;
20454 + snap = snap->snapshot_next ) {
20455 + PROCPRINT("%s ", snap->name);
20464 + *start = page + off;
20468 + return sz > count ? count : sz;
20472 +/********** Required EVMS Plugin Functions **********/
20478 + * This is the entry point into the LVM discovery process. It is a three
20479 + * phase process. First, the list of nodes are examined for PVs, and the
20480 + * appropriate volume groups are created. Then each volume group is
20481 + * examined to find all available logical volumes. Finally, each LVM
20482 + * logical volume has a new EVMS node created for it, and added to the
20485 +static int lvm_discover(struct evms_logical_node ** evms_node_list)
20489 + MOD_INC_USE_COUNT;
20490 + LOG_EXTRA("Beginning discovery.\n");
20492 + discover_volume_groups(evms_node_list);
20494 + check_volume_groups();
20496 + discover_logical_volumes(FALSE);
20498 + check_logical_volumes(FALSE);
20500 + rc = export_volumes(evms_node_list, FALSE);
20502 + LOG_EXTRA("Discovery complete.\n");
20503 + MOD_DEC_USE_COUNT;
20508 + * lvm_discover_end
20510 + * The discovery process at the region-manager level is now iterative,
20511 + * much like the EVMS feature level. This allows the ability to stack
20512 + * LVM on top of MD, or vice-versa. To accomplish this correctly, and
20513 + * also to accomplish partial volume discovery, a second discover
20514 + * entry point is needed, so EVMS can tell the region managers that
20515 + * discovery is over, and to finish up any discovery that is not yet
20516 + * complete. When this function is called, it should be assumed that
20517 + * the node list has had nothing new added to it since the last call
20518 + * of the regular discover function. Therefore, when this function is
20519 + * called, we do not need to try to discover any additional volume
20520 + * groups. We will, however, look for logical volumes once more. This
20521 + * gives us the ability to export (read-only) volumes that have
20522 + * partially corrupted LE maps due to missing PVs in their VG.
20524 +static int lvm_discover_end(struct evms_logical_node ** evms_node_list)
20528 + MOD_INC_USE_COUNT;
20529 + LOG_EXTRA("Beginning final discovery\n");
20531 + discover_volume_groups(evms_node_list);
20533 + check_volume_groups();
20535 + discover_logical_volumes(TRUE);
20537 + check_logical_volumes(TRUE);
20539 + rc = export_volumes(evms_node_list, TRUE);
20543 + LOG_EXTRA("Final discovery complete.\n");
20544 + MOD_DEC_USE_COUNT;
20549 + * lvm_delete_node
20551 + * This function deletes the in-memory representation of an LVM logical volume.
20553 +static int lvm_delete_node(struct evms_logical_node * logical_node)
20555 + struct lvm_logical_volume * volume = logical_node->private;
20556 + struct lvm_volume_group * group = volume->group;
20558 + LOG_DEBUG("Deleting LVM node %s\n", logical_node->name);
20560 + if ( deallocate_logical_volume(volume) ) {
20564 + /* If we just removed the last volume from this group, the entire group
20565 + * must also be deleted.
20567 + if ( group && group->volume_count == 0 ) {
20568 + remove_group_from_list(group);
20569 + deallocate_volume_group(group);
20572 + /* Free the logical node. */
20573 + evms_cs_deallocate_logical_node(logical_node);
20574 + MOD_DEC_USE_COUNT;
20581 +static void lvm_read(struct evms_logical_node * node,
20582 + struct buffer_head * bh)
20584 + struct lvm_logical_volume * volume = node->private;
20585 + struct lvm_physical_volume * pv_entry;
20586 + u64 size = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
20587 + u64 new_sector, new_size, pe_start_sector;
20589 + /* If this volume is a snapshot, lock the volume, and do
20590 + * the LE-PE translation on its original volume.
20592 + if ( volume->lv_access & LV_SNAPSHOT ) {
20593 + down(&volume->snap_semaphore);
20594 + if (!volume->snapshot_org) {
20597 + node = volume->snapshot_org->volume_node;
20600 + /* Make sure the volume is active and readable. */
20601 + if ( !(volume->lv_access & LV_READ &&
20602 + volume->lv_status & LV_ACTIVE) ) {
20606 + /* Check if I/O goes past end of logical volume. Must use the
20607 + * node, not the volume, so snapshots will work correctly.
20609 + if ( bh->b_rsector + size > node->total_vsectors ) {
20613 + /* Logical-to-Physical remapping. Check for incomplete volumes.
20614 + * Check intermediate boundary conditions as well.
20616 + if ( remap_sector(node, bh->b_rsector, size, &new_sector,
20617 + &new_size, &pe_start_sector, &pv_entry) ||
20618 + !pe_start_sector || !pv_entry ||
20619 + size != new_size ) {
20623 + /* For snapshot volumes, check if this sector's chunk has been
20624 + * remapped. If it has, new_sector and pv_entry will be changed
20625 + * accordingly. If not, they remain the same.
20627 + if ( volume->lv_access & LV_SNAPSHOT ) {
20628 + snapshot_remap_sector(volume, pe_start_sector,
20629 + &new_sector, &pv_entry);
20632 + bh->b_rsector = new_sector;
20633 + R_IO(pv_entry->logical_node, bh);
20636 + /* Unlock the snapshot. */
20637 + if ( volume->lv_access & LV_SNAPSHOT ) {
20638 + up(&volume->snap_semaphore);
20643 + bh->b_end_io(bh, 0);
20650 +static void lvm_write(struct evms_logical_node * node,
20651 + struct buffer_head * bh)
20653 + struct lvm_logical_volume * volume = node->private;
20654 + struct lvm_logical_volume * snap_volume;
20655 + struct lvm_physical_volume * pv_entry;
20656 + u64 size = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
20657 + u64 new_sector, new_size, pe_start_sector;
20659 + /* Make sure the volume is active and writable. */
20660 + if ( !(volume->lv_access & LV_WRITE &&
20661 + volume->lv_status & LV_ACTIVE) ) {
20665 + /* Check if I/O goes past end of logical volume. */
20666 + if ( bh->b_rsector + size > node->total_vsectors ) {
20670 + /* Logical-to-Physical remapping. Check for incomplete volumes.
20671 + * Check intermediate boundary conditions as well.
20673 + if ( remap_sector(node, bh->b_rsector, size, &new_sector,
20674 + &new_size, &pe_start_sector, &pv_entry) ||
20675 + !pe_start_sector || !pv_entry ||
20676 + size != new_size ) {
20680 + /* Copy-on-write for snapshotting. */
20681 + if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
20682 + /* Originals can be snapshotted multiple times. */
20683 + for ( snap_volume = volume->snapshot_next;
20684 + snap_volume; snap_volume = snap_volume->snapshot_next ) {
20685 + if ( snapshot_copy_data(volume, snap_volume,
20686 + pe_start_sector, new_sector,
20693 + bh->b_rsector = new_sector;
20694 + W_IO(pv_entry->logical_node, bh);
20698 + bh->b_end_io(bh, 0);
20705 + * Init_io on a snapshot volume treats it like a regular volume.
20707 +static int lvm_init_io(struct evms_logical_node * node,
20713 + struct lvm_logical_volume * volume = node->private;
20714 + struct lvm_physical_volume * pv_entry;
20715 + u64 pe_start_sector, new_sector, new_size;
20718 + /* Only allow internal writes to snapshots (io_flag==4). Disallow
20719 + * writes to snapshot originals.
20721 + if ( io_flag == WRITE &&
20722 + volume->lv_access & (LV_SNAPSHOT | LV_SNAPSHOT_ORG) ) {
20726 + /* The node for a snapshot reports the size of the original. If a
20727 + * request comes in in that range, just return.
20729 + else if ( volume->lv_access & LV_SNAPSHOT &&
20730 + sect_nr >= volume->lv_size &&
20731 + sect_nr < node->total_vsectors ) {
20732 + if ( io_flag == READ ) {
20733 + memset(buf_addr, 0,
20734 + num_sects << EVMS_VSECTOR_SIZE_SHIFT);
20739 + /* Regular range check. */
20740 + else if ( sect_nr + num_sects > volume->lv_size ) {
20744 + if ( io_flag == 4 ) {
20748 + /* Init IO needs to deal with the possibility of a request that spans
20749 + * PEs or stripes. This is possible because there is no limit on
20750 + * num_sects. To handle this, we loop through remap_sector and
20751 + * INIT_IO until num_sects reaches zero.
20753 + while (num_sects) {
20754 + if ( remap_sector(node, sect_nr, num_sects, &new_sector,
20755 + &new_size, &pe_start_sector, &pv_entry) ) {
20759 + /* If the volume is incomplete, clear the buffer (on a read). */
20760 + if (!pe_start_sector || !pv_entry) {
20761 + if ( io_flag == READ ) {
20762 + memset(buf_addr, 0,
20763 + new_size << EVMS_VSECTOR_SIZE_SHIFT);
20766 + rc = INIT_IO(pv_entry->logical_node, io_flag,
20767 + new_sector, new_size, buf_addr);
20769 + num_sects -= new_size;
20770 + sect_nr += new_size;
20771 + buf_addr = (void *)(((unsigned long) buf_addr) +
20772 + (unsigned long)(new_size << EVMS_VSECTOR_SIZE_SHIFT));
20781 +static int lvm_ioctl(struct evms_logical_node * logical_node,
20782 + struct inode * inode,
20783 + struct file * file,
20784 + unsigned int cmd,
20785 + unsigned long arg)
20787 + struct lvm_logical_volume * volume = logical_node->private;
20790 + LOG_ENTRY_EXIT("Ioctl %d\n", cmd);
20794 + case HDIO_GETGEO:
20796 + /* Fixed geometry for all LVM volumes. */
20797 + unsigned char heads = 64;
20798 + unsigned char sectors = 32;
20801 + struct hd_geometry * hd = (struct hd_geometry *)arg;
20802 + cylinders = logical_node->total_vsectors;
20803 + cylinders = (cylinders / heads) / sectors;
20809 + if ( copy_to_user((char *)(&hd->heads),
20810 + &heads, sizeof(heads)) ||
20811 + copy_to_user((char *)(&hd->sectors),
20812 + &sectors, sizeof(sectors)) ||
20813 + copy_to_user((short *)(&hd->cylinders),
20814 + &cylinders, sizeof(cylinders)) ||
20815 + copy_to_user((long *)(&hd->start),
20816 + &start, sizeof(start)) ) {
20822 + case EVMS_QUIESCE_VOLUME:
20824 + struct evms_quiesce_vol_pkt * tmp =
20825 + (struct evms_quiesce_vol_pkt *)arg;
20826 + if (tmp->command) {
20827 + volume->lv_access |= EVMS_LV_QUIESCED;
20829 + volume->lv_access &= ~EVMS_LV_QUIESCED;
20834 + case EVMS_GET_BMAP:
20836 + struct evms_get_bmap_pkt * bmap =
20837 + (struct evms_get_bmap_pkt *)arg;
20838 + struct evms_logical_node * pv_node;
20840 + rc = lvm_get_bmap(logical_node, bmap, &pv_node);
20842 + rc = IOCTL(pv_node, inode, file, cmd,
20843 + (unsigned long) bmap);
20848 + case EVMS_GET_DISK_LIST:
20849 + case EVMS_CHECK_MEDIA_CHANGE:
20850 + case EVMS_REVALIDATE_DISK:
20851 + case EVMS_OPEN_VOLUME:
20852 + case EVMS_CLOSE_VOLUME:
20853 + case EVMS_CHECK_DEVICE_STATUS:
20855 + /* These six ioctls all need to
20856 + * be broadcast to all PVs.
20858 + struct lvm_volume_group * group = volume->group;
20859 + struct lvm_physical_volume * pv_entry;
20860 + for ( pv_entry = group->pv_list;
20861 + pv_entry; pv_entry = pv_entry->next ) {
20862 + rc |= IOCTL(pv_entry->logical_node, inode,
20869 + /* Currently LVM does not send any ioctl's down to the
20870 + * PVs. Which PV would they go to? What would we do with
20871 + * the return codes?
20880 + * lvm_direct_ioctl
20882 + * This function provides a method for user-space to communicate directly
20883 + * with a plugin in the kernel.
20885 +static int lvm_direct_ioctl(struct inode * inode,
20886 + struct file * file,
20887 + unsigned int cmd,
20888 + unsigned long args)
20890 + struct evms_plugin_ioctl_pkt pkt, * user_pkt;
20891 + struct lvm_pv_remove_ioctl pv_remove, * user_pv_remove;
20892 + struct lvm_snapshot_stat_ioctl snap_stats, * user_snap_stats;
20895 + MOD_INC_USE_COUNT;
20897 + user_pkt = (struct evms_plugin_ioctl_pkt *)args;
20899 + /* Copy user's parameters to kernel space. */
20900 + if ( copy_from_user(&pkt, user_pkt, sizeof(pkt)) ) {
20901 + MOD_DEC_USE_COUNT;
20905 + /* Make sure this is supposed to be our ioctl. */
20906 + if ( pkt.feature_id != lvm_plugin_header.id ) {
20907 + MOD_DEC_USE_COUNT;
20911 + switch (pkt.feature_command) {
20913 + case EVMS_LVM_PV_REMOVE_IOCTL:
20915 + (struct lvm_pv_remove_ioctl *)pkt.feature_ioctl_data;
20916 + if ( copy_from_user(&pv_remove, user_pv_remove,
20917 + sizeof(pv_remove)) ) {
20921 + rc = remove_pv_from_group(pv_remove.pv_number,
20922 + pv_remove.vg_uuid);
20925 + case EVMS_LVM_SNAPSHOT_STAT_IOCTL:
20926 + user_snap_stats =
20927 + (struct lvm_snapshot_stat_ioctl *)pkt.feature_ioctl_data;
20928 + if ( copy_from_user(&snap_stats, user_snap_stats,
20929 + sizeof(snap_stats)) ) {
20933 + rc = get_snapshot_stats(&snap_stats);
20934 + if ( copy_to_user(user_snap_stats, &snap_stats,
20935 + sizeof(snap_stats)) ) {
20947 + copy_to_user(user_pkt, &pkt, sizeof(pkt));
20948 + MOD_DEC_USE_COUNT;
20955 +int __init lvm_vge_init(void)
20957 + struct proc_dir_entry *pde;
20959 + lvm_group_list = NULL;
20962 + /* Register the global proc-fs entries. */
20963 + pde = evms_cs_get_evms_proc_dir();
20965 + lvm_proc = create_proc_entry(LVM_PROC_NAME, S_IFDIR, pde);
20967 + create_proc_read_entry(LVM_PROC_GLOBAL_NAME, S_IFREG,
20968 + lvm_proc, lvm_global_proc_read,
20973 + /* Register this plugin with EVMS. */
20974 + return evms_cs_register_plugin(&lvm_plugin_header);
20980 +void __exit lvm_vge_exit(void)
20982 + struct lvm_volume_group * group, * next_group;
20983 + struct proc_dir_entry * pde;
20986 + /* If LVM is called for module_exit, that means the reference
20987 + * count must be zero, which means there should be no volumes,
20988 + * and thus no volume groups. But, check anyway and delete
20989 + * any volumes and groups that are still hanging around.
20991 + if (lvm_group_list) {
20992 + LOG_SERIOUS("Called for module_exit, but group list is not empty!\n");
20995 + for ( group = lvm_group_list; group; group = next_group ) {
20996 + next_group = group->next_group;
20998 + LOG_SERIOUS("In module_exit: deleting all volumes from group %s.\n",
21001 + for ( i = 1; i <= MAX_LV; i++ ) {
21002 + if (group->volume_list[i]) {
21003 + lvm_delete_node(group->volume_list[i]->volume_node);
21008 + /* Unregister the proc-fs entries. */
21009 + pde = evms_cs_get_evms_proc_dir();
21011 + remove_proc_entry(LVM_PROC_GLOBAL_NAME, lvm_proc);
21012 + remove_proc_entry(LVM_PROC_NAME, pde);
21015 + /* Unregister this plugin from EVMS. */
21016 + evms_cs_unregister_plugin(&lvm_plugin_header);
21019 +module_init(lvm_vge_init);
21020 +module_exit(lvm_vge_exit);
21021 +#ifdef MODULE_LICENSE
21022 +MODULE_LICENSE("GPL");
21025 diff -Naur linux-2002-09-30/drivers/evms/md_core.c evms-2002-09-30/drivers/evms/md_core.c
21026 --- linux-2002-09-30/drivers/evms/md_core.c Wed Dec 31 18:00:00 1969
21027 +++ evms-2002-09-30/drivers/evms/md_core.c Sun Sep 29 23:25:48 2002
21030 + * Copyright (c) International Business Machines Corp., 2000
21032 + * This program is free software; you can redistribute it and/or modify
21033 + * it under the terms of the GNU General Public License as published by
21034 + * the Free Software Foundation; either version 2 of the License, or
21035 + * (at your option) any later version.
21037 + * This program is distributed in the hope that it will be useful,
21038 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
21039 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
21040 + * the GNU General Public License for more details.
21042 + * You should have received a copy of the GNU General Public License
21043 + * along with this program; if not, write to the Free Software
21044 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21047 + * linux/drivers/evms/md_core.c
21049 + * EVMS Linux MD Region Manager
21054 +#include <linux/module.h>
21055 +#include <linux/kmod.h>
21056 +#include <linux/kernel.h>
21057 +#include <linux/config.h>
21058 +#include <linux/genhd.h>
21059 +#include <linux/string.h>
21060 +#include <linux/blk.h>
21061 +#include <linux/init.h>
21062 +#include <linux/slab.h>
21063 +#include <linux/vmalloc.h>
21064 +#include <linux/evms/evms.h>
21065 +#include <linux/evms/evms_md.h>
21066 +#include <linux/sysctl.h>
21067 +#include <asm/system.h>
21068 +#include <asm/uaccess.h>
21070 +#define LOG_PREFIX "md core: "
21073 + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
21074 + * is 100 KB/sec, so the extra system load does not show up that much.
21075 + * Increase it if you want to have more _guaranteed_ speed. Note that
21076 + * the RAID driver will use the maximum available bandwidth if the IO
21077 + * subsystem is idle. There is also an 'absolute maximum' reconstruction
21078 + * speed limit - in case reconstruction slows down your system despite
21079 + * idle IO detection.
21081 + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
21084 +static MD_LIST_HEAD(all_raid_disks);
21085 +static MD_LIST_HEAD(pending_raid_disks);
21087 +static int sysctl_speed_limit_min = 100;
21088 +static int sysctl_speed_limit_max = 100000;
21091 +static mdk_personality_t *pers[MAX_PERSONALITY];
21093 +static int md_blocksizes[MAX_MD_DEVS];
21094 +static int md_hardsect_sizes[MAX_MD_DEVS];
21095 +int evms_md_size[MAX_MD_DEVS];
21096 +static struct evms_thread *evms_md_recovery_thread = NULL;
21099 + * Enables iteration over all existing md arrays
21101 +static LIST_HEAD(all_mddevs);
21102 +static LIST_HEAD(incomplete_mddevs);
21103 +static LIST_HEAD(running_mddevs);
21106 + * The mapping between kdev and mddev is not necessarily a simple
21107 + * one! Eg. HSM uses several sub-devices to implement Logical
21108 + * Volumes. All these sub-devices map to the same mddev.
21110 +struct dev_mapping evms_mddev_map[MAX_MD_DEVS];
21113 +/* Support functions for discovery */
21114 +static mdk_rdev_t * evms_md_find_rdev_all (struct evms_logical_node *node);
21115 +static mddev_t * evms_md_find_mddev_all (struct evms_logical_node *node);
21116 +static int evms_md_import_device (struct evms_logical_node **discover_list,
21117 + struct evms_logical_node *node);
21118 +static void evms_md_autostart_arrays(struct evms_logical_node **discover_list);
21119 +static void evms_md_run_devices (struct evms_logical_node **discover_list);
21120 +static int evms_md_run_array (struct evms_logical_node ** discover_list,
21122 +static void evms_md_run_incomplete_array (struct evms_logical_node ** discover_list,
21124 +static int evms_md_create_logical_node(struct evms_logical_node **discover_list,
21125 + mddev_t *mddev, uint flags);
21126 +static int evms_md_read_disk_sb (mdk_rdev_t * rdev);
21127 +static int evms_md_analyze_sbs (mddev_t * mddev);
21128 +static mddev_t * alloc_mddev (kdev_t dev);
21129 +static void free_mddev(mddev_t * mddev);
21130 +static void evms_md_create_recovery_thread(void);
21131 +static void evms_md_destroy_recovery_thread(void);
21132 +static int do_md_run (mddev_t * mddev);
21133 +static int do_md_stop (mddev_t * mddev, int ro);
21135 +static void evms_md_export_rdev (mdk_rdev_t * rdev, int delete_node);
21136 +static void kick_rdev_from_array (mdk_rdev_t * rdev);
21137 +static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev);
21138 +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb);
21140 +/* Plugin API prototypes */
21141 +static int md_discover( struct evms_logical_node ** discover_list );
21142 +static int md_end_discover( struct evms_logical_node ** discover_list );
21143 +static int md_delete( struct evms_logical_node * node);
21144 +static void md_read( struct evms_logical_node * node,
21145 + struct buffer_head * bh);
21146 +static void md_write( struct evms_logical_node * node,
21147 + struct buffer_head * bh);
21148 +static int md_sync_io( struct evms_logical_node *node,
21153 +static int md_ioctl( struct evms_logical_node *node,
21154 + struct inode *inode,
21155 + struct file *file,
21156 + unsigned int cmd,
21157 + unsigned long arg);
21158 +static int md_ioctl_cmd_broadcast(
21159 + struct evms_logical_node *node,
21160 + struct inode *inode,
21161 + struct file *file,
21162 + unsigned long cmd,
21163 + unsigned long arg);
21165 +static int md_direct_ioctl(
21166 + struct inode *inode,
21167 + struct file *file,
21168 + unsigned int cmd,
21169 + unsigned long arg);
21171 +/* global MD data structures */
21172 +static struct evms_plugin_fops md_fops = {
21173 + .discover = md_discover,
21174 + .end_discover = md_end_discover,
21175 + .delete = md_delete,
21177 + .write = md_write,
21178 + .init_io = md_sync_io,
21179 + .ioctl = md_ioctl,
21180 + .direct_ioctl = md_direct_ioctl
21183 +static struct evms_plugin_header md_plugin_header = {
21184 + .id = SetPluginID(IBM_OEM_ID,
21185 + EVMS_REGION_MANAGER,
21188 + .major = EVMS_MD_MAJOR_VERSION,
21189 + .minor = EVMS_MD_MINOR_VERSION,
21190 + .patchlevel = EVMS_MD_PATCHLEVEL_VERSION
21192 + .required_services_version = {
21193 + .major = EVMS_MD_COMMON_SERVICES_MAJOR,
21194 + .minor = EVMS_MD_COMMON_SERVICES_MINOR,
21195 + .patchlevel = EVMS_MD_COMMON_SERVICES_PATCHLEVEL
21200 +/* global variables */
21201 +static int exported_nodes; /* total # of exported devices
21202 + * produced during this discovery.
21204 +static struct evms_logical_node **cur_discover_list = NULL;
21206 +/**********************************************************/
21207 +/* SYSCTL - EVMS/RAID folder */
21208 +/**********************************************************/
21210 +#ifdef CONFIG_PROC_FS
21211 +static struct ctl_table_header *md_table_header;
21213 +static ctl_table md_table[] = {
21214 + {DEV_EVMS_MD_SPEED_LIMIT_MIN, "speed_limit_min",
21215 + &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
21216 + {DEV_EVMS_MD_SPEED_LIMIT_MAX, "speed_limit_max",
21217 + &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
21221 +static ctl_table md_dir_table[] = {
21222 + {DEV_EVMS_MD, "md", NULL, 0, 0555, md_table},
21226 +static ctl_table evms_dir_table[] = {
21227 + {DEV_EVMS, "evms", NULL, 0, 0555, md_dir_table},
21231 +static ctl_table dev_dir_table[] = {
21232 + {CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table},
21236 +/********** Required EVMS Plugin Functions **********/
21239 + * Function: md_discover
21240 + * We should only export complete MD device nodes
21242 +static int md_discover( struct evms_logical_node ** discover_list )
21244 + MOD_INC_USE_COUNT;
21245 + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
21247 + /* initialize global variable */
21248 + exported_nodes = 0;
21249 + cur_discover_list = discover_list;
21250 + evms_md_autostart_arrays(discover_list);
21252 + LOG_ENTRY_EXIT("%s: EXIT (exported nodes: %d)\n", __FUNCTION__,exported_nodes);
21253 + cur_discover_list = NULL;
21254 + MOD_DEC_USE_COUNT;
21255 + return(exported_nodes);
21258 +static mddev_t * evms_md_find_incomplete_array(int level)
21261 + struct list_head *tmp,*tmp2;
21262 + mdk_rdev_t *rdev;
21264 + ITERATE_INCOMPLETE_MDDEV(mddev,tmp) {
21265 + ITERATE_RDEV(mddev, rdev, tmp2) {
21266 + if (rdev->sb && rdev->sb->level == level)
21274 + * Function: md_end_discover
21276 +static int md_end_discover( struct evms_logical_node ** discover_list )
21279 + struct list_head *tmp;
21280 + mdk_rdev_t *rdev;
21282 + struct evms_logical_node *node;
21283 + int done = FALSE;
21285 + MOD_INC_USE_COUNT;
21286 + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
21287 + rc = md_discover(discover_list);
21291 + if ( (mddev = evms_md_find_incomplete_array(5)) != NULL) {
21292 + evms_md_run_incomplete_array(discover_list, mddev);
21296 + if ( (mddev = evms_md_find_incomplete_array(1)) != NULL) {
21297 + evms_md_run_incomplete_array(discover_list, mddev);
21301 + if ( (mddev = evms_md_find_incomplete_array(0)) != NULL) {
21302 + evms_md_run_incomplete_array(discover_list, mddev);
21306 + if ( (mddev = evms_md_find_incomplete_array(-1)) != NULL) {
21307 + evms_md_run_incomplete_array(discover_list, mddev);
21316 + * At this point, delete all mddevs which did not start.
21318 + ITERATE_MDDEV(mddev,tmp) {
21319 + if (mddev->pers == NULL) {
21320 + LOG_WARNING("%s: deleting md%d\n", __FUNCTION__, mdidx(mddev));
21321 + free_mddev(mddev);
21327 + * At this point, delete all rdevs which do not belong to any of discovered MD arrays.
21329 + ITERATE_RDEV_ALL(rdev, tmp) {
21330 + if (!rdev->mddev) {
21331 + node = rdev->node;
21333 + if (node->plugin->id == md_plugin_header.id)
21334 + evms_md_export_rdev(rdev, FALSE);
21336 + evms_md_export_rdev(rdev, TRUE);
21341 + LOG_ENTRY_EXIT("%s: EXIT\n", __FUNCTION__);
21342 + MOD_DEC_USE_COUNT;
21348 + * Function: md_delete_node
21350 +static int md_delete( struct evms_logical_node * node)
21352 + struct evms_md *evms_md;
21355 + evms_md = node->private;
21356 + mddev = evms_md->mddev;
21357 + LOG_DEFAULT("md_delete() [%s]\n", evms_md_partition_name(node));
21360 + do_md_stop(mddev,0);
21362 + if (evms_md->instance_plugin_hdr.fops)
21363 + kfree(evms_md->instance_plugin_hdr.fops);
21367 + evms_cs_deallocate_logical_node(node);
21373 + * Function: md_read
21375 +static void md_read( struct evms_logical_node * node,
21376 + struct buffer_head * bh)
21378 + struct evms_md *evms_md;
21381 + evms_md = node->private;
21382 + mddev = evms_md->mddev;
21383 + if (evms_md_check_boundary(node, bh)) return;
21384 + if (mddev && mddev->pers)
21385 + mddev->pers->read(node, bh);
21390 + * Function: md_write
21392 +static void md_write( struct evms_logical_node * node,
21393 + struct buffer_head * bh)
21395 + struct evms_md *evms_md;
21398 + evms_md = node->private;
21399 + mddev = evms_md->mddev;
21400 + if (evms_md_check_boundary(node, bh)) return;
21402 + LOG_ERROR("%s: read-only is set for [%s]\n", __FUNCTION__, node->name);
21403 + bh->b_end_io(bh, 0);
21406 + if (mddev && mddev->pers)
21407 + mddev->pers->write(node, bh);
21411 + * Function: md_sync_io
21413 +static int md_sync_io(
21414 + struct evms_logical_node *node,
21420 + struct evms_md *evms_md;
21424 + evms_md = node->private;
21425 + mddev = evms_md->mddev;
21427 + if (sect_nr + num_sects > node->total_vsectors) {
21428 + LOG_ERROR("%s: attempt to %s beyond MD device(%s) boundary("PFU64") with sect_nr("PFU64") and num_sects("PFU64")\n",
21430 + rw ? "WRITE" : "READ",
21432 + node->total_vsectors,
21433 + sect_nr,num_sects);
21437 + if ((mddev->ro) && (rw != READ)) {
21438 + LOG_ERROR("%s: read-only is set for [%s]\n", __FUNCTION__, node->name);
21442 + if (!rc && mddev && mddev->pers) {
21444 + * Check if the personality can handle synchronous I/O,
21445 + * otherwise use the generic function.
21447 + if (mddev->pers->sync_io)
21448 + rc = mddev->pers->sync_io(mddev, rw, sect_nr, num_sects, buf_addr);
21450 + rc = evms_md_sync_io(node, rw, sect_nr, num_sects, buf_addr);
21457 + * md_end_sync_request - End IO handler for synchronous I/O functions
21459 +static void md_end_sync_request(struct buffer_head *bh, int uptodate)
21461 + struct evms_md_sync_cb * cb = (struct evms_md_sync_cb *) bh->b_private;
21465 + /* we are done with the bh */
21466 + evms_cs_deallocate_to_pool(evms_bh_pool, bh);
21468 + if (atomic_dec_and_test(&cb->io_count)) {
21469 + if (waitqueue_active(&cb->wait))
21470 + wake_up(&cb->wait);
21475 + * md_sync_request_submit_bh - submit a page-size bh
21476 + * @node - target MD node
21477 + * @bh - pointer to the buffer head
21478 + * @sector - the sector number
21479 + * @data - pointer to buffer
21480 + * @rw - READ/WRITE
21481 + * @cb - MD synchronous I/O control block
21483 +static inline void md_sync_request_submit_bh(
21484 + struct evms_logical_node *node,
21485 + struct buffer_head *bh,
21486 + unsigned long sector,
21489 + struct evms_md_sync_cb *cb)
21492 + bh->b_this_page = (struct buffer_head *)1;
21493 + bh->b_rsector = sector;
21494 + bh->b_size = PAGE_SIZE;
21496 + set_bit(BH_Dirty, &bh->b_state);
21497 + set_bit(BH_Lock, &bh->b_state);
21498 + set_bit(BH_Req, &bh->b_state);
21499 + set_bit(BH_Mapped, &bh->b_state);
21500 + atomic_set(&bh->b_count, 1);
21501 + bh->b_data = data;
21502 + bh->b_page = virt_to_page(data);
21503 + bh->b_list = BUF_LOCKED;
21504 + bh->b_end_io = md_end_sync_request;
21505 + bh->b_private = cb;
21506 + atomic_inc(&cb->io_count);
21514 + * evms_md_allocate_bh
21516 + * Note that this function will not return unless we got a free bh
21518 +static inline struct buffer_head *evms_md_allocate_bh(void)
21520 + struct buffer_head *bh;
21522 + while ((bh = evms_cs_allocate_from_pool(evms_bh_pool, FALSE)) == NULL)
21523 + schedule(); /* just yield for a someone to deallocate a bh */
21524 + init_waitqueue_head(&bh->b_wait);
21525 + bh->b_count = (atomic_t)ATOMIC_INIT(0);
21530 + * md_partial_sync_io -
21531 + * This function handles synchronous I/O when sector is not page aligned
21532 + * @node - evms node for the MD array
21533 + * @rw - READ/WRITE
21534 + * @sector - the sector
21535 + * @nsects - on input, the total sectors for the request
21536 + * @nsects - on output, number of sectors completed
21537 + * @data - data buffer
21539 +int evms_md_partial_sync_io(
21540 + struct evms_logical_node *node,
21547 + u32 offset, size;
21548 + struct buffer_head *bh;
21549 + struct evms_md_sync_cb cb;
21552 + size = (u32)(*nsects << EVMS_VSECTOR_SIZE_SHIFT);
21554 + /* calculate byte offset */
21555 + offset = (u32)((sector & (EVMS_MD_SECTS_PER_PAGE-1)) << EVMS_VSECTOR_SIZE_SHIFT);
21556 + if (!offset && (*nsects >= EVMS_MD_SECTS_PER_PAGE)) {
21558 + return 0; /* Nothing to do */
21564 + page = kmalloc(PAGE_SIZE, GFP_KERNEL);
21566 + LOG_ERROR("%s: no memory!\n", __FUNCTION__);
21570 + bh = evms_md_allocate_bh();
21573 + memset(&cb, 0, sizeof(cb));
21574 + init_waitqueue_head(&cb.wait);
21575 + cb.io_count = (atomic_t)ATOMIC_INIT(0);
21576 + md_sync_request_submit_bh(
21578 + (unsigned long)(sector & EVMS_MD_SECTS_PER_PAGE_MASK),
21579 + page, READ, &cb);
21580 + wait_disk_event(cb.wait, !atomic_read(&cb.io_count));
21585 + size = (size <= (PAGE_SIZE - offset)) ? size : (PAGE_SIZE - offset);
21589 + /* copy data and return */
21590 + memcpy(data, page+offset, size);
21593 + /* copy data and then write */
21594 + memcpy(page+offset, data, size);
21596 + bh = evms_md_allocate_bh();
21598 + md_sync_request_submit_bh(
21600 + (unsigned long)(sector & EVMS_MD_SECTS_PER_PAGE_MASK),
21601 + page, WRITE, &cb);
21602 + wait_disk_event(cb.wait, !atomic_read(&cb.io_count));
21614 + *nsects = (u64)(size >> EVMS_VSECTOR_SIZE_SHIFT);
21621 + * evms_md_sync_io - This function handles synchronous I/O
21623 +int evms_md_sync_io(
21624 + struct evms_logical_node *node,
21627 + u64 total_nr_sects,
21631 + u64 total_nr_pages, size;
21633 + struct buffer_head *bh;
21634 + struct evms_md_sync_cb cb;
21636 + if (sector % EVMS_MD_SECTS_PER_PAGE) {
21637 + nsects = total_nr_sects;
21638 + rc = evms_md_partial_sync_io(node, rw, sector, &nsects, data);
21640 + total_nr_sects -= nsects;
21641 + sector += nsects;
21642 + data += (nsects << EVMS_VSECTOR_SIZE_SHIFT);
21643 + if (total_nr_sects == 0)
21650 + total_nr_pages = total_nr_sects / EVMS_MD_SECTS_PER_PAGE;
21651 + size = total_nr_sects << EVMS_VSECTOR_SIZE_SHIFT;
21653 + memset(&cb, 0, sizeof(cb));
21654 + init_waitqueue_head(&cb.wait);
21655 + cb.io_count = (atomic_t)ATOMIC_INIT(0);
21657 + while (!rc && total_nr_pages) {
21659 + bh = evms_md_allocate_bh();
21661 + md_sync_request_submit_bh(node, bh,(unsigned long)sector, data, rw, &cb);
21663 + sector += EVMS_MD_SECTS_PER_PAGE;
21664 + size -= PAGE_SIZE;
21665 + total_nr_pages--;
21666 + data += PAGE_SIZE;
21669 + wait_disk_event(cb.wait, !atomic_read(&cb.io_count));
21673 + if (!rc && size) {
21674 + nsects = size >> EVMS_VSECTOR_SIZE_SHIFT;
21675 + rc = evms_md_partial_sync_io(node, rw, sector, &nsects, data);
21682 + * Function: md_ioctl
21684 +static int md_ioctl(
21685 + struct evms_logical_node * node,
21686 + struct inode * inode,
21687 + struct file * file,
21688 + unsigned int cmd,
21689 + unsigned long arg)
21691 + struct evms_md * evms_md = node->private;
21695 + if ((!inode) || (!evms_md) )
21701 + * We have a problem here : there is no easy way to give a CHS
21702 + * virtual geometry. We currently pretend that we have a 2 heads
21703 + * 4 sectors (with a BIG number of cylinders...). This drives
21704 + * dosfs just mad... ;-)
21707 + case HDIO_GETGEO:
21709 + struct hd_geometry hdgeo;
21711 + hdgeo.sectors = 4;
21712 + hdgeo.cylinders = ((unsigned int)node->total_vsectors) /
21713 + hdgeo.heads / hdgeo.sectors;
21715 + if (copy_to_user((int *)arg,
21721 + case EVMS_QUIESCE_VOLUME:
21722 + case EVMS_GET_DISK_LIST:
21723 + case EVMS_CHECK_MEDIA_CHANGE:
21724 + case EVMS_REVALIDATE_DISK:
21725 + case EVMS_OPEN_VOLUME:
21726 + case EVMS_CLOSE_VOLUME:
21727 + case EVMS_CHECK_DEVICE_STATUS:
21728 + rc = md_ioctl_cmd_broadcast(
21729 + node, inode, file, cmd, arg);
21731 + case EVMS_PLUGIN_IOCTL:
21732 + rc = md_direct_ioctl(
21733 + inode, file, cmd, arg);
21736 + mddev = evms_md->mddev;
21737 + if (mddev == NULL) {
21739 + } else if (mddev->pers->evms_ioctl == NULL) {
21742 + rc = mddev->pers->evms_ioctl(mddev, inode, file, cmd, arg);
21749 +static int md_ioctl_cmd_broadcast(
21750 + struct evms_logical_node *node,
21751 + struct inode *inode,
21752 + struct file *file,
21753 + unsigned long cmd,
21754 + unsigned long arg)
21757 + struct evms_md *evms_md;
21759 + struct list_head *tmp;
21760 + mdk_rdev_t *rdev;
21762 + evms_md = node->private;
21763 + mddev = evms_md->mddev;
21765 + /* broadcast this cmd to all children */
21766 + ITERATE_RDEV(mddev,rdev,tmp) {
21767 + if (!rdev->mddev) {
21771 + if (!rdev->virtual_spare) {
21772 + rc |= IOCTL(rdev->node, inode, file, cmd, arg);
21779 +static int evms_md_add_virtual_spare (mddev_t *mddev, kdev_t dev)
21781 + mdk_rdev_t *rdev;
21782 + mdp_disk_t *disk = NULL;
21785 + if (evms_md_find_rdev(mddev,dev))
21788 + LOG_ENTRY_EXIT("%s ENTRY\n", __FUNCTION__);
21789 + if ((rdev = kmalloc(sizeof(*rdev),GFP_KERNEL)) == NULL)
21792 + memset(rdev, 0, sizeof(*rdev));
21794 + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
21795 + disk = mddev->sb->disks + i;
21796 + if (!disk->major && !disk->minor)
21798 + if (disk_removed(disk))
21801 + if (i == MD_SB_DISKS) {
21802 + LOG_WARNING("%s : [md%d]can not hot-add to full array!\n", __FUNCTION__, mdidx(mddev));
21807 + if (disk_removed(disk)) {
21811 + if (disk->number != i) {
21817 + disk->number = i;
21820 + disk->raid_disk = disk->number;
21821 + disk->major = MAJOR(dev);
21822 + disk->minor = MINOR(dev);
21824 + mark_disk_spare(disk);
21826 + rdev->mddev = mddev;
21828 + rdev->desc_nr = disk->number;
21829 + rdev->virtual_spare = 1;
21831 + /* bind rdev to mddev array */
21832 + list_add(&rdev->all, &all_raid_disks);
21833 + list_add(&rdev->same_set, &mddev->disks);
21834 + MD_INIT_LIST_HEAD(&rdev->pending);
21836 + mddev->sb->nr_disks++;
21837 + mddev->sb->spare_disks++;
21838 + mddev->sb->working_disks++;
21841 + mddev->sb_dirty = 1;
21843 + evms_md_update_sb(mddev);
21848 +static int evms_md_remove_disk(mddev_t *mddev, kdev_t dev)
21850 + mdk_rdev_t *rdev = NULL;
21851 + mdp_disk_t *disk;
21854 + disk = evms_md_find_disk(mddev,dev);
21858 + rdev = evms_md_find_rdev(mddev,dev);
21860 + if (rdev && !rdev->faulty) {
21862 + * The disk is active in the array,
21863 + * must ask the personality to do it
21865 + if (mddev->pers && mddev->pers->diskop) {
21866 + /* Assume spare, try to remove it first. */
21867 + rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_SPARE);
21869 + rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
21875 + remove_descriptor(disk,mddev->sb);
21877 + kick_rdev_from_array(rdev);
21878 + mddev->sb_dirty = 1;
21879 + evms_md_update_sb(mddev);
21887 + * Function: md_direct_ioctl
21889 + * This function provides a method for user-space to communicate directly
21890 + * with a plugin in the kernel.
21892 +static int md_direct_ioctl(
21893 + struct inode * inode,
21894 + struct file * file,
21895 + unsigned int cmd,
21896 + unsigned long args )
21898 + struct evms_plugin_ioctl_pkt argument;
21900 + mddev_t *mddev = NULL;
21901 + struct evms_md_ioctl ioctl_arg;
21902 + struct evms_md_kdev device;
21903 + struct evms_md_array_info array_info, *usr_array_info;
21906 + MOD_INC_USE_COUNT;
21908 + // Copy user's parameters to kernel space
21909 + if ( copy_from_user(&argument, (struct evms_plugin_ioctl_pkt*)args, sizeof(argument)) ) {
21910 + MOD_DEC_USE_COUNT;
21914 + // Make sure this is supposed to be our ioctl.
21915 + if ( argument.feature_id != md_plugin_header.id ) {
21916 + MOD_DEC_USE_COUNT;
21920 + // Copy user's md ioctl parameters to kernel space
21921 + if ( copy_from_user(&ioctl_arg,
21922 + (struct evms_md_ioctl*)argument.feature_ioctl_data,
21923 + sizeof(ioctl_arg)) )
21926 + if (ioctl_arg.mddev_idx < MAX_MD_DEVS) {
21927 + md_kdev = MKDEV(MD_MAJOR, ioctl_arg.mddev_idx);
21928 + mddev = kdev_to_mddev(md_kdev);
21929 + if (mddev == NULL)
21936 + switch(argument.feature_command) {
21937 + case EVMS_MD_PERS_IOCTL_CMD:
21938 + if (mddev->pers->md_pers_ioctl == NULL) {
21939 + MOD_DEC_USE_COUNT;
21942 + rc = mddev->pers->md_pers_ioctl(mddev,
21945 + copy_to_user((struct evms_md_ioctl*)argument.feature_ioctl_data,
21947 + sizeof(ioctl_arg));
21950 + case EVMS_MD_ADD:
21951 + if ( copy_from_user(&device,
21952 + (struct evms_md_kdev *)ioctl_arg.arg,
21953 + sizeof(device)) )
21956 + rc = evms_md_add_virtual_spare(mddev,MKDEV(device.major, device.minor));
21959 + case EVMS_MD_REMOVE:
21960 + if ( copy_from_user(&device,
21961 + (struct evms_md_kdev *)ioctl_arg.arg,
21962 + sizeof(device)) )
21965 + rc = evms_md_remove_disk(mddev,MKDEV(device.major, device.minor));
21968 + case EVMS_MD_ACTIVATE:
21972 + case EVMS_MD_DEACTIVATE:
21976 + case EVMS_MD_GET_ARRAY_INFO:
21978 + usr_array_info = (struct evms_md_array_info *)ioctl_arg.arg;
21979 + if ( copy_from_user(&array_info, usr_array_info,
21980 + sizeof(array_info)) )
21983 + array_info.state = 0;
21984 + if (mddev->curr_resync)
21985 + array_info.state |= EVMS_MD_ARRAY_SYNCING;
21986 + copy_to_user(&usr_array_info->state, &array_info.state,
21987 + sizeof(usr_array_info->state));
21988 + if (copy_to_user(array_info.sb, mddev->sb,
21989 + sizeof(mdp_super_t)))
21999 + argument.status = rc;
22000 + copy_to_user((struct evms_plugin_ioctl_pkt*)args, &argument, sizeof(argument));
22001 + MOD_DEC_USE_COUNT;
22008 +void evms_md_add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
22010 + unsigned int minor = MINOR(dev);
22012 + if (MAJOR(dev) != MD_MAJOR) {
22016 + if (evms_mddev_map[minor].mddev != NULL) {
22020 + evms_mddev_map[minor].mddev = mddev;
22021 + evms_mddev_map[minor].data = data;
22024 +void evms_md_del_mddev_mapping (mddev_t * mddev, kdev_t dev)
22026 + unsigned int minor = MINOR(dev);
22028 + if (MAJOR(dev) != MD_MAJOR) {
22032 + if (evms_mddev_map[minor].mddev != mddev) {
22036 + evms_mddev_map[minor].mddev = NULL;
22037 + evms_mddev_map[minor].data = NULL;
22040 +static mddev_t * alloc_mddev (kdev_t dev)
22044 + if (MAJOR(dev) != MD_MAJOR) {
22048 + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
22052 + memset(mddev, 0, sizeof(*mddev));
22054 + mddev->__minor = MINOR(dev);
22055 + init_MUTEX(&mddev->reconfig_sem);
22056 + init_MUTEX(&mddev->recovery_sem);
22057 + init_MUTEX(&mddev->resync_sem);
22058 + INIT_LIST_HEAD(&mddev->disks);
22059 + INIT_LIST_HEAD(&mddev->all_mddevs);
22060 + INIT_LIST_HEAD(&mddev->incomplete_mddevs);
22061 + INIT_LIST_HEAD(&mddev->running_mddevs);
22062 + mddev->active = (atomic_t)ATOMIC_INIT(0);
22063 + mddev->recovery_active = (atomic_t)ATOMIC_INIT(0);
22066 + * The 'base' mddev is the one with data NULL.
22067 + * personalities can create additional mddevs
22070 + evms_md_add_mddev_mapping(mddev, dev, 0);
22071 + list_add(&mddev->all_mddevs, &all_mddevs);
22073 + MOD_INC_USE_COUNT;
22074 + evms_md_create_recovery_thread();
22079 +mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr)
22081 + mdk_rdev_t * rdev;
22082 + struct list_head *tmp;
22084 + ITERATE_RDEV(mddev,rdev,tmp) {
22085 + if (rdev->desc_nr == nr)
22092 +mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev)
22094 + struct list_head *tmp;
22095 + mdk_rdev_t *rdev;
22097 + ITERATE_RDEV(mddev,rdev,tmp) {
22098 + if (rdev->dev == dev)
22104 +mdk_rdev_t * evms_md_find_rdev_from_node(mddev_t * mddev, struct evms_logical_node * node)
22106 + struct list_head *tmp;
22107 + mdk_rdev_t *rdev;
22109 + ITERATE_RDEV(mddev,rdev,tmp) {
22110 + if (rdev->node == node)
22116 +static MD_LIST_HEAD(device_names);
22118 +static char * org_partition_name (kdev_t dev)
22120 + struct gendisk *hd;
22121 + static char nomem [] = "<nomem>";
22122 + dev_name_t *dname;
22123 + struct list_head *tmp = device_names.next;
22125 + while (tmp != &device_names) {
22126 + dname = list_entry(tmp, dev_name_t, list);
22127 + if (dname->dev == dev)
22128 + return dname->name;
22132 + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
22137 + * ok, add this new device name to the list
22139 + hd = get_gendisk (dev);
22140 + dname->name = NULL;
22142 + dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
22143 + if (!dname->name) {
22144 + sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
22145 + dname->name = dname->namebuf;
22148 + dname->dev = dev;
22149 + MD_INIT_LIST_HEAD(&dname->list);
22150 + list_add(&dname->list, &device_names);
22152 + return dname->name;
22156 +#define EVMS_MD_NULL_PARTITION_NAME "<EVMS_NODE_NO_NAME>"
22157 +char * evms_md_partition_name (struct evms_logical_node *node)
22159 + if (node && node->name)
22160 + return node->name;
22162 + return EVMS_MD_NULL_PARTITION_NAME;
22165 +static char * get_partition_name (mdk_rdev_t *rdev)
22168 + return evms_md_partition_name(rdev->node);
22170 + return org_partition_name(rdev->dev);
22174 + * Function: evms_md_calc_dev_sboffset
22175 + * return the LSN for md super block.
22177 +static u64 evms_md_calc_dev_sboffset (struct evms_logical_node *node,mddev_t *mddev, int persistent)
22181 + size = node->total_vsectors;
22182 + if (persistent) {
22183 + size = MD_NEW_SIZE_SECTORS(size);
22185 + return size; /* size in sectors */
22189 + * Function: evms_md_calc_dev_size
22190 + * return data size (in blocks) for an "extended" device.
22192 +static unsigned long evms_md_calc_dev_size (struct evms_logical_node *node,
22196 + unsigned long size;
22197 + u64 size_in_sectors;
22199 + size_in_sectors = evms_md_calc_dev_sboffset(node, mddev, persistent);
22200 + size = size_in_sectors >> 1;
22201 + if (!mddev->sb) {
22205 + if (mddev->sb->chunk_size)
22206 + size &= ~(mddev->sb->chunk_size/1024 - 1);
22210 +static unsigned int zoned_raid_size (mddev_t *mddev)
22212 + unsigned int mask;
22213 + mdk_rdev_t * rdev;
22214 + struct list_head *tmp;
22216 + if (!mddev->sb) {
22221 + * do size and offset calculations.
22223 + mask = ~(mddev->sb->chunk_size/1024 - 1);
22225 + ITERATE_RDEV(mddev,rdev,tmp) {
22226 + rdev->size &= mask;
22227 + evms_md_size[mdidx(mddev)] += rdev->size;
22233 + * We check whether all devices are numbered from 0 to nb_dev-1. The
22234 + * order is guaranteed even after device name changes.
22236 + * Some personalities (raid0, linear) use this. Personalities that
22237 + * provide data have to be able to deal with loss of individual
22238 + * disks, so they do their checking themselves.
22240 +int evms_md_check_ordering (mddev_t *mddev)
22243 + mdk_rdev_t *rdev;
22244 + struct list_head *tmp;
22247 + * First, all devices must be fully functional
22249 + ITERATE_RDEV(mddev,rdev,tmp) {
22250 + if (rdev->faulty) {
22251 + LOG_ERROR("evms_md_check_ordering() md%d's device %s faulty, aborting.\n",
22252 + mdidx(mddev), get_partition_name(rdev));
22258 + ITERATE_RDEV(mddev,rdev,tmp) {
22261 + if (c != mddev->nb_dev) {
22265 + if (mddev->nb_dev != mddev->sb->raid_disks) {
22266 + LOG_ERROR("%s: [md%d] array needs %d disks, has %d, aborting.\n",
22267 + __FUNCTION__, mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
22271 + * Now the numbering check
22273 + for (i = 0; i < mddev->nb_dev; i++) {
22275 + ITERATE_RDEV(mddev,rdev,tmp) {
22276 + if (rdev->desc_nr == i)
22280 + LOG_ERROR("md%d, missing disk #%d, aborting.\n",mdidx(mddev), i);
22284 + LOG_ERROR("md%d, too many disks #%d, aborting.\n",mdidx(mddev), i);
22293 +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
22295 + if (disk_active(disk)) {
22296 + sb->working_disks--;
22298 + if (disk_spare(disk)) {
22299 + sb->spare_disks--;
22300 + sb->working_disks--;
22302 + sb->failed_disks--;
22306 + disk->major = disk->minor = 0;
22307 + mark_disk_removed(disk);
22310 +#define BAD_MINOR \
22311 +"%s: invalid raid minor (%x)\n"
22314 +"disabled device %s, could not read superblock.\n"
22316 +#define BAD_CSUM \
22317 +"invalid superblock checksum on %s\n"
22320 +static int alloc_array_sb (mddev_t * mddev)
22327 + mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
22328 + if (!mddev->sb) {
22329 + LOG_ERROR("%s: Out of memory!\n", __FUNCTION__);
22332 + md_clear_page(mddev->sb);
22336 +static int alloc_disk_sb (mdk_rdev_t * rdev)
22341 + rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
22343 + LOG_ERROR("%s: Out of memory!\n", __FUNCTION__);
22346 + md_clear_page(rdev->sb);
22352 + * Function: free_disk_sb
22355 +static void free_disk_sb (mdk_rdev_t * rdev)
22358 + free_page((unsigned long) rdev->sb);
22360 + rdev->sb_offset = 0;
22363 + if (!rdev->virtual_spare && !rdev->faulty)
22369 + * Function: evms_md_read_disk_sb
22370 + * Read the MD superblock.
22372 +static int evms_md_read_disk_sb (mdk_rdev_t * rdev)
22375 + struct evms_logical_node *node = rdev->node;
22376 + u64 sb_offset_in_sectors;
22382 + if (node->total_vsectors <= MD_RESERVED_SECTORS) {
22383 + LOG_DETAILS("%s is too small, total_vsectors("PFU64")\n",
22384 + evms_md_partition_name(node), node->total_vsectors);
22389 + * Calculate the position of the superblock,
22390 + * it's at the end of the disk
22392 + sb_offset_in_sectors = evms_md_calc_dev_sboffset(node, rdev->mddev, 1);
22393 + rdev->sb_offset = (unsigned long)(sb_offset_in_sectors >> 1);
22394 + LOG_DEBUG("(read) %s's sb offset("PFU64") total_vsectors("PFU64")\n",
22395 + evms_md_partition_name(node), sb_offset_in_sectors, node->total_vsectors);
22398 + * Read superblock
22400 + rc = INIT_IO(node, 0, sb_offset_in_sectors, MD_SB_SECTORS, rdev->sb);
22405 +static unsigned int calc_sb_csum (mdp_super_t * sb)
22407 + unsigned int disk_csum, csum;
22409 + disk_csum = sb->sb_csum;
22411 + csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
22412 + sb->sb_csum = disk_csum;
22419 + * Check one RAID superblock for generic plausibility
22422 +static int check_disk_sb (mdk_rdev_t * rdev)
22425 + int ret = -EINVAL;
22433 + if (sb->md_magic != MD_SB_MAGIC) {
22437 + if (sb->md_minor >= MAX_MD_DEVS) {
22438 + LOG_ERROR(BAD_MINOR, get_partition_name(rdev), sb->md_minor);
22441 + if (calc_sb_csum(sb) != sb->sb_csum) {
22442 + LOG_ERROR(BAD_CSUM, get_partition_name(rdev));
22446 + switch (sb->level) {
22453 + LOG_ERROR("%s: EVMS MD does not support MD level %d\n", __FUNCTION__, sb->level);
22461 +static kdev_t dev_unit(kdev_t dev)
22463 + unsigned int mask;
22464 + struct gendisk *hd = get_gendisk(dev);
22468 + mask = ~((1 << hd->minor_shift) - 1);
22470 + return MKDEV(MAJOR(dev), MINOR(dev) & mask);
22473 +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
22475 + struct list_head *tmp;
22476 + mdk_rdev_t *rdev;
22478 + ITERATE_RDEV(mddev,rdev,tmp)
22479 + if (dev_unit(rdev->dev) == dev_unit(dev))
22485 +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
22487 + struct list_head *tmp;
22488 + mdk_rdev_t *rdev;
22490 + ITERATE_RDEV(mddev1,rdev,tmp)
22491 + if (match_dev_unit(mddev2, rdev->dev))
22498 +static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
22500 + mdk_rdev_t *same_pdev;
22502 + if (rdev->mddev) {
22507 + same_pdev = match_dev_unit(mddev, rdev->dev);
22509 + LOG_WARNING("[md%d] WARNING: %s appears to be on the same physical disk as %s. True\n"
22510 + " protection against single-disk failure might be compromised.\n",
22511 + mdidx(mddev), get_partition_name(rdev),get_partition_name(same_pdev));
22513 + list_add(&rdev->same_set, &mddev->disks);
22514 + rdev->mddev = mddev;
22516 + if (rdev->sb && disk_active(&rdev->sb->this_disk))
22517 + mddev->nr_raid_disks++;
22518 + LOG_DETAILS("bind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev);
22521 +static void unbind_rdev_from_array (mdk_rdev_t * rdev)
22523 + if (!rdev->mddev) {
22527 + list_del(&rdev->same_set);
22528 + MD_INIT_LIST_HEAD(&rdev->same_set);
22529 + rdev->mddev->nb_dev--;
22530 + if (rdev->sb && disk_active(&rdev->sb->this_disk))
22531 + rdev->mddev->nr_raid_disks--;
22532 + LOG_DETAILS("unbind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev);
22533 + rdev->mddev = NULL;
22538 + * Function: evms_md_export_rdev
22539 + * EVMS MD version of export_rdev()
22540 + * Discard this MD "extended" device
22542 +static void evms_md_export_rdev (mdk_rdev_t * rdev, int delete_node)
22544 + LOG_DETAILS("%s: (%s)\n", __FUNCTION__ , get_partition_name(rdev));
22547 + free_disk_sb(rdev);
22548 + list_del(&rdev->all);
22549 + MD_INIT_LIST_HEAD(&rdev->all);
22550 + if (rdev->pending.next != &rdev->pending) {
22551 + LOG_WARNING("%s: (%s was pending)\n",__FUNCTION__ ,get_partition_name(rdev));
22552 + list_del(&rdev->pending);
22553 + MD_INIT_LIST_HEAD(&rdev->pending);
22555 + if (rdev->node && delete_node) {
22556 + if (cur_discover_list) {
22557 + LOG_DETAILS("%s: remove (%s) from discover list.\n", __FUNCTION__,
22558 + get_partition_name(rdev));
22559 + evms_cs_remove_logical_node_from_list(cur_discover_list, rdev->node);
22561 + LOG_DETAILS("%s: deleting node %s\n", __FUNCTION__, get_partition_name(rdev));
22562 + DELETE(rdev->node);
22563 + rdev->node = NULL;
22566 + rdev->faulty = 0;
22571 +static void kick_rdev_from_array (mdk_rdev_t * rdev)
22573 + LOG_DEFAULT("%s: (%s)\n", __FUNCTION__,get_partition_name(rdev));
22574 + unbind_rdev_from_array(rdev);
22575 + evms_md_export_rdev(rdev, TRUE);
22578 +static void export_array (mddev_t *mddev)
22580 + struct list_head *tmp;
22581 + mdk_rdev_t *rdev;
22582 + mdp_super_t *sb = mddev->sb;
22584 + LOG_DEFAULT("%s: [md%d]\n",__FUNCTION__ ,mdidx(mddev));
22586 + mddev->sb = NULL;
22587 + free_page((unsigned long) sb);
22590 + LOG_DEBUG("%s: removing all extended devices belong to md%d\n",__FUNCTION__,mdidx(mddev));
22591 + ITERATE_RDEV(mddev,rdev,tmp) {
22592 + if (!rdev->mddev) {
22596 + kick_rdev_from_array(rdev);
22598 + if (mddev->nb_dev)
22602 +static void free_mddev (mddev_t *mddev)
22604 + struct evms_logical_node *node;
22605 + struct evms_md *evms_md;
22612 + node = mddev->node;
22614 + export_array(mddev);
22615 + evms_md_size[mdidx(mddev)] = 0;
22619 + * Make sure nobody else is using this mddev
22620 + * (careful, we rely on the global kernel lock here)
22622 + while (atomic_read(&mddev->resync_sem.count) != 1)
22624 + while (atomic_read(&mddev->recovery_sem.count) != 1)
22627 + evms_md_del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
22628 + list_del(&mddev->all_mddevs);
22629 + INIT_LIST_HEAD(&mddev->all_mddevs);
22630 + if (!list_empty(&mddev->running_mddevs)) {
22631 + list_del(&mddev->running_mddevs);
22632 + INIT_LIST_HEAD(&mddev->running_mddevs);
22634 + if (!list_empty(&mddev->incomplete_mddevs)) {
22635 + list_del(&mddev->incomplete_mddevs);
22636 + INIT_LIST_HEAD(&mddev->incomplete_mddevs);
22641 + evms_md = node->private;
22642 + evms_md->mddev = NULL;
22644 + MOD_DEC_USE_COUNT;
22645 + evms_md_destroy_recovery_thread();
22649 +static void print_desc(mdp_disk_t *desc)
22651 + printk(" DISK<N:%d,R:%d,S:%d>\n", desc->number,
22652 + desc->raid_disk,desc->state);
22655 +static void print_sb(mdp_super_t *sb)
22659 + printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
22660 + sb->major_version, sb->minor_version, sb->patch_version,
22661 + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
22663 + printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
22664 + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
22665 + sb->layout, sb->chunk_size);
22666 + printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%x\n",
22667 + sb->utime, sb->state, sb->active_disks, sb->working_disks,
22668 + sb->failed_disks, sb->spare_disks,
22669 + sb->sb_csum, sb->events_lo);
22671 + for (i = 0; i < MD_SB_DISKS; i++) {
22672 + mdp_disk_t *desc;
22674 + desc = sb->disks + i;
22675 + if (desc->number || desc->major || desc->minor || desc->raid_disk || (desc->state && (desc->state != 4))) {
22676 + printk(" D %2d: ", i);
22677 + print_desc(desc);
22680 + printk(" THIS: ");
22681 + print_desc(&sb->this_disk);
22685 +static void print_rdev(mdk_rdev_t *rdev)
22687 + printk("rdev %s: SZ:%08ld F:%d DN:%d ",
22688 + get_partition_name(rdev),
22689 + rdev->size, rdev->faulty, rdev->desc_nr);
22691 + printk("rdev superblock:\n");
22692 + print_sb(rdev->sb);
22694 + printk("no rdev superblock!\n");
22697 +void evms_md_print_devices (void)
22699 + struct list_head *tmp, *tmp2;
22700 + mdk_rdev_t *rdev;
22704 + printk(": **********************************\n");
22705 + printk(": * <COMPLETE RAID STATE PRINTOUT> *\n");
22706 + printk(": **********************************\n");
22707 + ITERATE_MDDEV(mddev,tmp) {
22708 + printk("md%d: ", mdidx(mddev));
22710 + ITERATE_RDEV(mddev,rdev,tmp2)
22711 + printk("<%s>", get_partition_name(rdev));
22714 + printk(" array superblock:\n");
22715 + print_sb(mddev->sb);
22717 + printk(" no array superblock.\n");
22719 + ITERATE_RDEV(mddev,rdev,tmp2)
22720 + print_rdev(rdev);
22722 + printk(": **********************************\n");
22726 +static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
22729 + mdp_super_t *tmp1, *tmp2;
22731 + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
22732 + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
22734 + if (!tmp1 || !tmp2) {
22736 + printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
22744 + * nr_disks is not constant
22746 + tmp1->nr_disks = 0;
22747 + tmp2->nr_disks = 0;
22749 + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
22763 +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
22765 + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
22766 + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
22767 + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
22768 + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
22776 + * Function: evms_md_find_rdev_all
22777 + * EVMS MD version of find_rdev_all()
22778 + * Search entire all_raid_disks for "node"
22779 + * Return the MD "extended" device if found.
22781 +static mdk_rdev_t * evms_md_find_rdev_all (struct evms_logical_node *node)
22783 + struct list_head *tmp;
22784 + mdk_rdev_t *rdev;
22786 + tmp = all_raid_disks.next;
22787 + while (tmp != &all_raid_disks) {
22788 + rdev = list_entry(tmp, mdk_rdev_t, all);
22789 + if (rdev->node == node)
22797 + * Function: evms_md_find_mddev_all
22799 +static mddev_t * evms_md_find_mddev_all (struct evms_logical_node *node)
22801 + struct list_head *tmp;
22804 + ITERATE_MDDEV(mddev,tmp) {
22805 + if (mddev->node == node)
22813 + * Function: evms_md_write_disk_sb
22814 + * EVMS MD version of write_disk_sb
22816 +static int evms_md_write_disk_sb(mdk_rdev_t * rdev)
22818 + unsigned long size;
22819 + u64 sb_offset_in_sectors;
22825 + if (rdev->faulty) {
22829 + if (rdev->sb->md_magic != MD_SB_MAGIC) {
22834 + sb_offset_in_sectors = evms_md_calc_dev_sboffset(rdev->node, rdev->mddev, 1);
22835 + if (rdev->sb_offset != (sb_offset_in_sectors >> 1)) {
22836 + LOG_WARNING("%s's sb offset has changed from blocks(%ld) to blocks(%ld), skipping\n",
22837 + get_partition_name(rdev),
22839 + (unsigned long)(sb_offset_in_sectors >> 1));
22843 + * If the disk went offline meanwhile and it's just a spare, then
22844 + * its size has changed to zero silently, and the MD code does
22845 + * not yet know that it's faulty.
22847 + size = evms_md_calc_dev_size(rdev->node, rdev->mddev, 1);
22848 + if (size != rdev->size) {
22849 + LOG_WARNING("%s's size has changed from %ld to %ld since import, skipping\n",
22850 + get_partition_name(rdev), rdev->size, size);
22854 + LOG_DETAILS("(write) %s's sb offset: "PFU64"\n",get_partition_name(rdev), sb_offset_in_sectors);
22856 + INIT_IO(rdev->node,WRITE,sb_offset_in_sectors,MD_SB_SECTORS,rdev->sb);
22862 +static int evms_md_sync_sbs(mddev_t * mddev)
22864 + mdk_rdev_t *rdev;
22865 + struct list_head *tmp;
22866 + mdp_disk_t * disk;
22868 + ITERATE_RDEV(mddev,rdev,tmp) {
22869 + if (rdev->virtual_spare || rdev->faulty)
22872 + /* copy everything from the master */
22873 + memcpy(rdev->sb, mddev->sb, sizeof(mdp_super_t));
22875 + /* this_disk is unique, copy it from the master */
22876 +// rdev->sb->this_disk = mddev->sb->disks[rdev->desc_nr];
22877 + // use the SB disk array since if update occurred on normal shutdown
22878 + // the rdevs may be out of date.
22879 + disk = evms_md_find_disk(mddev, rdev->dev);
22881 + rdev->sb->this_disk = *disk;
22884 + rdev->sb->sb_csum = calc_sb_csum(rdev->sb);
22889 +static int evms_md_update_sb_sync(mddev_t * mddev, int clean)
22891 + mdk_rdev_t *rdev;
22892 + struct list_head *tmp;
22894 + int found = FALSE;
22896 + ITERATE_RDEV(mddev,rdev,tmp) {
22898 + if (rdev->virtual_spare || rdev->faulty)
22901 + if ((rc = evms_md_read_disk_sb(rdev))) {
22902 + LOG_ERROR("%s: error reading superblock on %s!\n",
22903 + __FUNCTION__, evms_md_partition_name(rdev->node));
22907 + if ((rc = check_disk_sb(rdev))) {
22908 + LOG_ERROR("%s: %s has invalid sb!\n",
22909 + __FUNCTION__, evms_md_partition_name(rdev->node));
22913 + rdev->desc_nr = rdev->sb->this_disk.number;
22914 + rdev->dev = MKDEV(rdev->sb->this_disk.major, rdev->sb->this_disk.minor);
22916 + /* copy master superblock from the first good rdev */
22919 + memcpy(mddev->sb, rdev->sb, sizeof(mdp_super_t));
22921 + mddev->sb->state |= 1 << MD_SB_CLEAN;
22923 + mddev->sb->state &= ~(1 << MD_SB_CLEAN);
22926 + if (!rc && found) {
22927 + evms_md_update_sb(mddev);
22929 + LOG_SERIOUS("%s: BUG! BUG! superblocks will not be updated!\n", __FUNCTION__);
22935 +int evms_md_update_sb(mddev_t * mddev)
22937 + int err, count = 100;
22938 + struct list_head *tmp;
22939 + mdk_rdev_t *rdev;
22943 + mddev->sb->utime = CURRENT_TIME;
22944 + if ((++mddev->sb->events_lo)==0)
22945 + ++mddev->sb->events_hi;
22947 + if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
22949 + * oops, this 64-bit counter should never wrap.
22950 + * Either we are in around ~1 trillion A.C., assuming
22951 + * 1 reboot per second, or we have a bug:
22954 + mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
22956 + evms_md_sync_sbs(mddev);
22959 + * do not write anything to disk if using
22960 + * nonpersistent superblocks
22962 + if (mddev->sb->not_persistent)
22965 + LOG_DETAILS("%s: updating [md%d] superblock\n",__FUNCTION__ ,mdidx(mddev));
22968 + ITERATE_RDEV(mddev,rdev,tmp) {
22969 + if (!rdev->virtual_spare && !rdev->faulty) {
22970 + LOG_DETAILS(" %s [events: %x]",
22971 + get_partition_name(rdev),
22972 + rdev->sb->events_lo);
22973 + err += evms_md_write_disk_sb(rdev);
22975 + if (rdev->faulty)
22976 + LOG_DETAILS(" skipping faulty %s\n", get_partition_name(rdev));
22977 + if (rdev->virtual_spare)
22978 + LOG_DETAILS(" skipping virtual spare.\n");
22983 + LOG_WARNING("errors occurred during superblock update, repeating\n");
22986 + LOG_ERROR("excessive errors occurred during superblock update, exiting\n");
22992 + * Function: evms_md_import_device
22993 + * Ensure that node is not yet imported.
22994 + * Read and validate the MD super block on this device
22995 + * Add to the global MD "extended" devices list (all_raid_disks)
22998 +static int evms_md_import_device (struct evms_logical_node **discover_list,
22999 + struct evms_logical_node *node)
23002 + mdk_rdev_t *rdev;
23004 + LOG_ENTRY_EXIT("%s: discovering %s\n",__FUNCTION__,evms_md_partition_name(node));
23006 + if (evms_md_find_rdev_all(node)) {
23007 + LOG_DEBUG("%s exists\n", evms_md_partition_name(node));
23011 + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
23013 + LOG_ERROR("could not alloc mem for %s!\n", evms_md_partition_name(node));
23016 + memset(rdev, 0, sizeof(*rdev));
23018 + if ((err = alloc_disk_sb(rdev)))
23021 + rdev->node = node; /* set this for evms_md_read_disk_sb() */
23023 + rdev->desc_nr = -1;
23024 + rdev->faulty = 0;
23026 + if (!node->total_vsectors) {
23027 + LOG_ERROR("%s has zero size!\n", evms_md_partition_name(node));
23032 + if ((err = evms_md_read_disk_sb(rdev))) {
23033 + LOG_EXTRA("could not read %s's sb, not importing!\n",evms_md_partition_name(node));
23036 + if ((err = check_disk_sb(rdev))) {
23037 + LOG_EXTRA("%s has invalid sb, not importing!\n",evms_md_partition_name(node));
23040 + rdev->desc_nr = rdev->sb->this_disk.number;
23041 + rdev->dev = MKDEV(rdev->sb->this_disk.major, rdev->sb->this_disk.minor);
23042 + LOG_DETAILS("FOUND %s desc_nr(%d)\n", get_partition_name(rdev), rdev->desc_nr);
23043 + list_add(&rdev->all, &all_raid_disks);
23044 + MD_INIT_LIST_HEAD(&rdev->pending);
23046 + if (rdev->faulty && rdev->sb)
23047 + free_disk_sb(rdev);
23053 + free_disk_sb(rdev);
23062 + * Function: evms_md_analyze_sbs
23063 + * EVMS MD version of analyze_sbs()
23065 +static int evms_md_analyze_sbs (mddev_t * mddev)
23067 + int out_of_date = 0, i;
23068 + struct list_head *tmp, *tmp2;
23069 + mdk_rdev_t *rdev, *rdev2, *freshest;
23072 + LOG_ENTRY_EXIT("Analyzing all superblocks...\n");
23074 + * Verify the RAID superblock on each real device
23076 + ITERATE_RDEV(mddev,rdev,tmp) {
23077 + if (rdev->faulty) {
23085 + if (check_disk_sb(rdev))
23090 + * The superblock constant part has to be the same
23091 + * for all disks in the array.
23095 + ITERATE_RDEV(mddev,rdev,tmp) {
23100 + if (!sb_equal(sb, rdev->sb)) {
23101 + LOG_WARNING("kick out %s\n",get_partition_name(rdev));
23102 + kick_rdev_from_array(rdev);
23108 + * OK, we have all disks and the array is ready to run. Let's
23109 + * find the freshest superblock, that one will be the superblock
23110 + * that represents the whole array.
23113 + if (alloc_array_sb(mddev))
23118 + ITERATE_RDEV(mddev,rdev,tmp) {
23121 + * if the checksum is invalid, use the superblock
23122 + * only as a last resort. (decrease its age by
23125 + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
23126 + if (rdev->sb->events_lo || rdev->sb->events_hi)
23127 + if ((rdev->sb->events_lo--)==0)
23128 + rdev->sb->events_hi--;
23130 + LOG_DETAILS("%s's event counter: %x\n",get_partition_name(rdev), rdev->sb->events_lo);
23137 + * Find the newest superblock version
23139 + ev1 = md_event(rdev->sb);
23140 + ev2 = md_event(freshest->sb);
23141 + if (ev1 != ev2) {
23147 + if (out_of_date) {
23148 + LOG_WARNING("OUT OF DATE, freshest: %s\n",get_partition_name(freshest));
23150 + memcpy (sb, freshest->sb, sizeof(*sb));
23153 + * at this point we have picked the 'best' superblock
23154 + * from all available superblocks.
23155 + * now we validate this superblock and kick out possibly
23158 + ITERATE_RDEV(mddev,rdev,tmp) {
23160 + * Kick all non-fresh devices
23163 + ev1 = md_event(rdev->sb);
23164 + ev2 = md_event(sb);
23167 + LOG_WARNING("kicking non-fresh %s from array!\n",get_partition_name(rdev));
23168 + kick_rdev_from_array(rdev);
23171 + LOG_DETAILS("%s is a new spare.\n",get_partition_name(rdev));
23177 + * Remove unavailable and faulty devices ...
23179 + * note that if an array becomes completely unrunnable due to
23180 + * missing devices, we do not write the superblock back, so the
23181 + * administrator has a chance to fix things up. The removal thus
23182 + * only happens if it's nonfatal to the contents of the array.
23184 + for (i = 0; i < MD_SB_DISKS; i++) {
23186 + mdp_disk_t *desc;
23188 + desc = sb->disks + i;
23191 + * We kick faulty devices/descriptors immediately.
23193 + * Note: multipath devices are a special case. Since we
23194 + * were able to read the superblock on the path, we don't
23195 + * care if it was previously marked as faulty, it's up now
23198 + if (disk_faulty(desc) && mddev->sb->level != -4) {
23200 + ITERATE_RDEV(mddev,rdev,tmp) {
23201 + if (rdev->desc_nr != desc->number)
23203 + LOG_WARNING("[md%d] kicking faulty %s!\n",mdidx(mddev),get_partition_name(rdev));
23204 + kick_rdev_from_array(rdev);
23209 + LOG_WARNING("%s: [md%d] found former faulty device [number=%d]\n",
23210 + __FUNCTION__ ,mdidx(mddev), desc->number);
23213 + * Don't call remove_descriptor(),
23214 + * let the administrator remove it from the user-land */
23215 + /* remove_descriptor(desc, sb); */
23217 + } else if (disk_faulty(desc)) {
23219 + * multipath entry marked as faulty, unfaulty it
23223 + dev = MKDEV(desc->major, desc->minor);
23225 + rdev = evms_md_find_rdev(mddev, dev);
23227 + mark_disk_spare(desc);
23229 + LOG_WARNING("%s: [md%d] (MULTIPATH) found former faulty device [number=%d]\n",
23230 + __FUNCTION__ ,mdidx(mddev), desc->number);
23232 + * Don't call remove_descriptor(),
23233 + * let the administrator remove it from the user-land */
23234 + /* remove_descriptor(desc, sb); */
23239 + * Is this device present in the rdev ring?
23242 + ITERATE_RDEV(mddev,rdev,tmp) {
23244 + * Multi-path IO special-case: since we have no
23245 + * this_disk descriptor at auto-detect time,
23246 + * we cannot check rdev->number.
23247 + * We can check the device though.
23249 + if ((sb->level == -4) && (rdev->dev ==
23250 + MKDEV(desc->major,desc->minor))) {
23254 + if (rdev->desc_nr == desc->number) {
23262 + LOG_WARNING(" [md%d]: former device [number=%d] is unavailable!\n",
23263 + mdidx(mddev), desc->number);
23264 + remove_descriptor(desc, sb);
23268 + * Kick all rdevs that are not in the
23269 + * descriptor array:
23271 + ITERATE_RDEV(mddev,rdev,tmp) {
23272 + if (rdev->desc_nr == -1)
23273 + kick_rdev_from_array(rdev);
23277 + * Do a final reality check.
23279 + if (mddev->sb->level != -4) {
23280 + ITERATE_RDEV(mddev,rdev,tmp) {
23281 + if (rdev->desc_nr == -1) {
23286 + * is the desc_nr unique?
23288 + ITERATE_RDEV(mddev,rdev2,tmp2) {
23289 + if ((rdev2 != rdev) &&
23290 + (rdev2->desc_nr == rdev->desc_nr)) {
23298 +#define OLD_VERSION KERN_ALERT \
23299 +"md%d: unsupported raid array version %d.%d.%d\n"
23301 +#define NOT_CLEAN_IGNORE KERN_ERR \
23302 +"md%d: raid array is not clean -- starting background reconstruction\n"
23305 + * Check if we can support this RAID array
23307 + if (sb->major_version != MD_MAJOR_VERSION ||
23308 + sb->minor_version > MD_MINOR_VERSION) {
23310 + LOG_ERROR("[md%d] unsupported raid array version %d.%d.%d\n",
23312 + sb->major_version,
23313 + sb->minor_version,
23314 + sb->patch_version);
23318 + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
23319 + (sb->level == 4) || (sb->level == 5)))
23320 + LOG_WARNING("[md%d, level=%d] raid array is not clean -- starting background reconstruction\n",
23321 + mdidx(mddev), sb->level);
23323 + LOG_ENTRY_EXIT("analysis of all superblocks is OK!\n");
23326 + LOG_WARNING("ABORT analyze_sbs()!!!\n");
23331 +static int device_size_calculation (mddev_t * mddev)
23333 + int data_disks = 0, persistent;
23334 + //unsigned int readahead;
23335 + mdp_super_t *sb = mddev->sb;
23336 + struct list_head *tmp;
23337 + mdk_rdev_t *rdev;
23340 + * Do device size calculation. Bail out if too small.
23341 + * (we have to do this after having validated chunk_size,
23342 + * because device size has to be modulo chunk_size)
23344 + persistent = !mddev->sb->not_persistent;
23345 + ITERATE_RDEV(mddev,rdev,tmp) {
23346 + if (rdev->faulty)
23348 + if (rdev->size) {
23349 + LOG_DEFAULT("%s: already calculated %s\n", __FUNCTION__, get_partition_name(rdev));
23352 + rdev->size = evms_md_calc_dev_size(rdev->node, mddev, persistent);
23353 + if (rdev->size < sb->chunk_size / 1024) {
23354 + LOG_WARNING("Dev %s smaller than chunk_size: %ldk < %dk\n",
23355 + get_partition_name(rdev), rdev->size, sb->chunk_size / 1024);
23360 + switch (sb->level) {
23371 + zoned_raid_size(mddev);
23375 + zoned_raid_size(mddev);
23376 + data_disks = sb->raid_disks;
23383 + data_disks = sb->raid_disks-1;
23386 + LOG_ERROR("[md%d] unkown level %d\n", mdidx(mddev), sb->level);
23389 + if (!evms_md_size[mdidx(mddev)])
23390 + evms_md_size[mdidx(mddev)] = sb->size * data_disks;
23398 +#define TOO_BIG_CHUNKSIZE KERN_ERR \
23399 +"too big chunk_size: %d > %d\n"
23401 +#define TOO_SMALL_CHUNKSIZE KERN_ERR \
23402 +"too small chunk_size: %d < %ld\n"
23404 +#define BAD_CHUNKSIZE KERN_ERR \
23405 +"no chunksize specified, see 'man raidtab'\n"
23407 +static int do_md_run (mddev_t * mddev)
23411 + struct list_head *tmp;
23412 + mdk_rdev_t *rdev;
23415 + if (!mddev->nb_dev) {
23424 + * Resize disks to align partitions size on a given
23427 + evms_md_size[mdidx(mddev)] = 0;
23430 + * Analyze all RAID superblock(s)
23432 + if (evms_md_analyze_sbs(mddev)) {
23437 + mddev->chunk_size = chunk_size = mddev->sb->chunk_size;
23438 + pnum = level_to_pers(mddev->sb->level);
23440 + if ((pnum != MULTIPATH) && (pnum != RAID1)) {
23441 + if (!chunk_size) {
23443 + * 'default chunksize' in the old md code used to
23444 + * be PAGE_SIZE, baaad.
23445 + * we abort here to be on the safe side. We dont
23446 + * want to continue the bad practice.
23448 + printk(BAD_CHUNKSIZE);
23451 + if (chunk_size > MAX_CHUNK_SIZE) {
23452 + printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
23456 + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
23458 + if ( (1 << ffz(~chunk_size)) != chunk_size) {
23462 + if (chunk_size < PAGE_SIZE) {
23463 + printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
23468 + printk(KERN_INFO "RAID level %d does not need chunksize! Continuing anyway.\n", mddev->sb->level);
23470 + if (pnum >= MAX_PERSONALITY) {
23476 +#ifdef CONFIG_KMOD
23477 + char module_name[80];
23478 + sprintf (module_name, "md-personality-%d", pnum);
23479 + request_module (module_name);
23483 + printk(KERN_ERR "personality %d is not loaded!\n",
23488 + if (device_size_calculation(mddev))
23492 + * Drop all container device buffers, from now on
23493 + * the only valid external interface is through the md
23495 + * Also find largest hardsector size
23497 + md_hardsect_sizes[mdidx(mddev)] = 512;
23498 + ITERATE_RDEV(mddev,rdev,tmp) {
23499 + if (rdev->faulty)
23501 + invalidate_device(rdev->dev, 1);
23502 +/* if (get_hardsect_size(rdev->dev)
23503 + > md_hardsect_sizes[mdidx(mddev)])
23504 + md_hardsect_sizes[mdidx(mddev)] =
23505 + get_hardsect_size(rdev->dev); */
23506 + if (rdev->node->hardsector_size > md_hardsect_sizes[mdidx(mddev)]) {
23507 + md_hardsect_sizes[mdidx(mddev)] = rdev->node->hardsector_size;
23511 + md_blocksizes[mdidx(mddev)] = 1024;
23512 + if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
23513 + md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
23515 + mddev->pers = pers[pnum];
23517 + err = mddev->pers->run(mddev);
23519 + LOG_WARNING("%s: pers->run() failed.\n", __FUNCTION__);
23520 + mddev->pers = NULL;
23523 + mddev->sb->state &= ~(1 << MD_SB_CLEAN);
23525 + evms_md_update_sb(mddev);
23527 + if (incomplete_mddev(mddev)) {
23528 + LOG_DEFAULT("%s: [md%d] was incomplete!\n", __FUNCTION__, mdidx(mddev));
23529 + list_del(&mddev->incomplete_mddevs);
23530 + INIT_LIST_HEAD(&mddev->incomplete_mddevs);
23533 + list_add(&mddev->running_mddevs, &running_mddevs);
23538 +#undef TOO_BIG_CHUNKSIZE
23539 +#undef BAD_CHUNKSIZE
23542 +#define OUT(x) do { err = (x); goto out; } while (0)
23545 +#define STILL_MOUNTED KERN_WARNING \
23546 +"md%d still mounted.\n"
23547 +#define STILL_IN_USE \
23548 +"md%d still in use.\n"
23550 +static int do_md_stop (mddev_t * mddev, int ro)
23552 + int err = 0, resync_interrupted = 0, clean = 0;
23553 + kdev_t dev = mddev_to_kdev(mddev);
23555 + if (atomic_read(&mddev->active)>1) {
23556 + printk(STILL_IN_USE, mdidx(mddev));
23560 + if (mddev->pers) {
23562 + * It is safe to call stop here, it only frees private
23563 + * data. Also, it tells us if a device is unstoppable
23564 + * (eg. resyncing is in progress)
23566 + if (mddev->pers->stop_resync)
23567 + if (mddev->pers->stop_resync(mddev))
23568 + resync_interrupted = 1;
23570 + if (mddev->recovery_running)
23571 + evms_cs_interrupt_thread(evms_md_recovery_thread);
23574 + * This synchronizes with signal delivery to the
23575 + * resync or reconstruction thread. It also nicely
23576 + * hangs the process if some reconstruction has not
23579 + down(&mddev->recovery_sem);
23580 + up(&mddev->recovery_sem);
23582 + invalidate_device(dev, 1);
23588 + mddev->node->plugin = &md_plugin_header;
23591 + set_device_ro(dev, 0);
23592 + if (mddev->pers->stop(mddev)) {
23594 + set_device_ro(dev, 1);
23602 + * mark it clean only if there was no resync
23605 + if (!mddev->recovery_running && !resync_interrupted) {
23606 + LOG_DEBUG("%s: marking sb clean...\n", __FUNCTION__);
23609 + evms_md_update_sb_sync(mddev, clean);
23612 + set_device_ro(dev, 1);
23616 + * Free resources if final stop
23619 + printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
23620 + free_mddev(mddev);
23623 + printk (KERN_INFO
23624 + "md%d switched to read-only mode.\n", mdidx(mddev));
23630 +static int evms_md_run_array (struct evms_logical_node ** discover_list, mddev_t *mddev)
23632 + mdk_rdev_t *rdev;
23633 + struct list_head *tmp;
23637 + if (mddev->disks.prev == &mddev->disks) {
23642 + LOG_DETAILS("%s: trying to run array md%d\n", __FUNCTION__,mdidx(mddev) );
23644 + ITERATE_RDEV(mddev,rdev,tmp) {
23645 + LOG_DETAILS(" <%s>\n", get_partition_name(rdev));
23648 + err = do_md_run (mddev);
23651 + * remove all nodes consumed by this md device from the discover list
23653 + ITERATE_RDEV(mddev,rdev,tmp) {
23654 + LOG_DETAILS(" removing %s from discover list.\n", get_partition_name(rdev));
23655 + evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
23656 + flags |= rdev->node->flags;
23658 + err = evms_md_create_logical_node(discover_list,mddev,flags);
23660 + exported_nodes++;
23663 + LOG_WARNING("%s: could not start [md%d] containing: \n",__FUNCTION__,mdidx(mddev));
23664 + ITERATE_RDEV(mddev,rdev,tmp) {
23665 + LOG_WARNING(" (%s, desc_nr=%d)\n", get_partition_name(rdev), rdev->desc_nr);
23667 + LOG_WARNING("%s: will try restart [md%d] again later.\n",__FUNCTION__,mdidx(mddev));
23669 + mddev->sb_dirty = 0;
23674 +static void evms_md_run_incomplete_array (struct evms_logical_node ** discover_list, mddev_t *mddev)
23676 + mdk_rdev_t *rdev;
23678 + LOG_DEFAULT("%s [md%d]\n",
23679 + __FUNCTION__, mdidx(mddev));
23680 + if (evms_md_run_array(discover_list,mddev) == 0) {
23682 + * We succeeded running this MD device.
23683 + * Now read MD superblock on this newly created MD node.
23685 + if (mddev->node &&
23686 + (evms_md_import_device(discover_list,mddev->node) == 0)) {
23688 + * Yes, there is a superblock on this MD node.
23689 + * We probably have a MD stacking case here.
23691 + rdev = evms_md_find_rdev_all(mddev->node);
23693 + list_add(&rdev->pending, &pending_raid_disks);
23694 + evms_md_run_devices(discover_list);
23696 + LOG_WARNING("%s: imported %s but no rdev was found!\n",
23698 + evms_md_partition_name(mddev->node));
23702 + if (incomplete_mddev(mddev)) {
23703 + list_del(&mddev->incomplete_mddevs);
23704 + INIT_LIST_HEAD(&mddev->incomplete_mddevs);
23709 + * let's try to run arrays based on all disks that have arrived
23710 + * until now. (those are in the ->pending list)
23712 + * the method: pick the first pending disk, collect all disks with
23713 + * the same UUID, remove all from the pending list and put them into
23714 + * the 'same_array' list. Then order this list based on superblock
23715 + * update time (freshest comes first), kick out 'old' disks and
23716 + * compare superblocks. If everything's fine then run it.
23718 + * If "unit" is allocated, then bump its reference count
23720 +static void evms_md_run_devices (struct evms_logical_node **discover_list)
23722 + struct list_head candidates;
23723 + struct list_head *tmp;
23724 + mdk_rdev_t *rdev0, *rdev;
23729 + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
23730 + while (pending_raid_disks.next != &pending_raid_disks) {
23731 + rdev0 = list_entry(pending_raid_disks.next,
23732 + mdk_rdev_t, pending);
23733 + MD_INIT_LIST_HEAD(&candidates);
23734 + ITERATE_RDEV_PENDING(rdev,tmp) {
23735 + if (uuid_equal(rdev0, rdev)) {
23736 + if (!sb_equal(rdev0->sb, rdev->sb)) {
23737 + LOG_DETAILS("%s has same UUID as %s, but superblocks differ ...\n",\
23738 + get_partition_name(rdev),get_partition_name(rdev0));
23741 + list_del(&rdev->pending);
23742 + list_add(&rdev->pending, &candidates);
23747 + * now we have a set of devices, with all of them having
23748 + * mostly sane superblocks. It's time to allocate the
23751 + md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
23752 + mddev = kdev_to_mddev(md_kdev);
23753 + if (mddev && (!incomplete_mddev(mddev))) {
23754 + LOG_DETAILS("md%d already running, cannot run %s\n",
23755 + mdidx(mddev), get_partition_name(rdev0));
23757 + ITERATE_RDEV(mddev,rdev,tmp) {
23759 + * This is EVMS re-discovery!
23760 + * Remove all nodes consumed by this md device from the discover list
23762 + evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
23765 + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
23766 + if (evms_md_find_mddev_all(rdev->node))
23768 + * We have found an MD superblock on top of a running MD array.
23769 + * Delete rdev but keep the MD array.
23771 + evms_md_export_rdev(rdev, FALSE);
23773 + evms_md_export_rdev(rdev, TRUE);
23779 + mddev = alloc_mddev(md_kdev);
23780 + if (mddev == NULL) {
23781 + LOG_ERROR("cannot allocate memory for md drive.\n");
23784 + LOG_DETAILS("created md%d\n", mdidx(mddev));
23786 + LOG_DETAILS("%s: found INCOMPLETE md%d\n", __FUNCTION__, mdidx(mddev));
23789 + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
23790 + bind_rdev_to_array(rdev, mddev);
23791 + list_del(&rdev->pending);
23792 + MD_INIT_LIST_HEAD(&rdev->pending);
23795 + if ((mddev->nr_raid_disks >= rdev0->sb->raid_disks) ||
23796 + (mddev->nb_dev == rdev0->sb->nr_disks)) {
23797 + evms_md_run_array(discover_list,mddev);
23799 + LOG_DETAILS("THIS md%d IS INCOMPLETE, found %d devices, need %d\n",
23800 + mdidx(mddev), mddev->nr_raid_disks, rdev0->sb->raid_disks);
23801 + list_add(&mddev->incomplete_mddevs, &incomplete_mddevs);
23802 + ITERATE_RDEV(mddev,rdev,tmp) {
23803 + evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
23807 + LOG_ENTRY_EXIT("%s: EXIT\n", __FUNCTION__);
23810 +void evms_md_recover_arrays(void)
23812 + if (!evms_md_recovery_thread) {
23816 + evms_cs_wakeup_thread(evms_md_recovery_thread);
23819 +int evms_md_error_dev(
23823 + mdk_rdev_t * rdev;
23825 + rdev = evms_md_find_rdev(mddev, dev);
23827 + return evms_md_error(mddev,rdev->node);
23829 + LOG_ERROR("%s: could not find %s in md%d\n",
23830 + __FUNCTION__, org_partition_name(dev), mdidx(mddev));
23835 +int evms_md_error(
23837 + struct evms_logical_node *node)
23839 + mdk_rdev_t * rrdev;
23841 + /* check for NULL first */
23846 + LOG_ERROR("evms_md_error dev:(md%d), node:(%s), (caller: %p,%p,%p,%p).\n",
23847 + mdidx(mddev), node->name,
23848 + __builtin_return_address(0),__builtin_return_address(1),
23849 + __builtin_return_address(2),__builtin_return_address(3));
23851 + rrdev = evms_md_find_rdev_from_node(mddev, node);
23852 + if (!rrdev || rrdev->faulty)
23854 + if (!mddev->pers->error_handler
23855 + || mddev->pers->error_handler(mddev,node) <= 0) {
23856 + free_disk_sb(rrdev);
23857 + rrdev->faulty = 1;
23861 + * if recovery was running, stop it now.
23863 + if (mddev->pers->stop_resync)
23864 + mddev->pers->stop_resync(mddev);
23865 + if (mddev->recovery_running)
23866 + evms_cs_interrupt_thread(evms_md_recovery_thread);
23867 + evms_md_recover_arrays();
23872 +int evms_register_md_personality (int pnum, mdk_personality_t *p)
23874 + if (pnum >= MAX_PERSONALITY) {
23879 + if (pers[pnum]) {
23885 + LOG_DETAILS("%s personality registered as nr %d\n",p->name, pnum);
23889 +int evms_unregister_md_personality (int pnum)
23891 + if (pnum >= MAX_PERSONALITY) {
23896 + printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
23897 + pers[pnum] = NULL;
23901 +mdp_disk_t *evms_md_get_spare(mddev_t *mddev)
23903 + mdp_super_t *sb = mddev->sb;
23904 + mdp_disk_t *disk;
23905 + mdk_rdev_t *rdev;
23908 + for (i = 0, j = 0; j < mddev->nb_dev; i++) {
23909 + rdev = evms_md_find_rdev_nr(mddev, i);
23910 + if (rdev == NULL)
23913 + if (rdev->faulty)
23916 + if (!rdev->virtual_spare)
23920 + disk = &sb->disks[rdev->desc_nr];
23921 + if (disk_faulty(disk)) {
23925 + if (disk_active(disk))
23932 +static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev)
23934 + mdp_super_t *sb = mddev->sb;
23935 + mdp_disk_t *disk;
23938 + for (i=0; i < MD_SB_DISKS; i++) {
23939 + disk = &sb->disks[i];
23940 + if ((disk->major == MAJOR(dev)) && (disk->minor == MINOR(dev)))
23946 +static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
23947 +void evms_md_sync_acct(
23949 + unsigned long nr_sectors)
23951 + unsigned int major = MAJOR(dev);
23952 + unsigned int index;
23954 + index = disk_index(dev);
23955 + if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
23958 + sync_io[major][index] += nr_sectors;
23961 +static int is_mddev_idle(mddev_t *mddev)
23963 + mdk_rdev_t * rdev;
23964 + struct list_head *tmp;
23966 + unsigned long curr_events;
23969 + ITERATE_RDEV(mddev,rdev,tmp) {
23970 + int major = MAJOR(rdev->dev);
23971 + int idx = disk_index(rdev->dev);
23973 + if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
23976 + curr_events = kstat.dk_drive_rblk[major][idx] +
23977 + kstat.dk_drive_wblk[major][idx] ;
23978 + curr_events -= sync_io[major][idx];
23979 + if ((curr_events - rdev->last_events) > 32) {
23980 + rdev->last_events = curr_events;
23987 +MD_DECLARE_WAIT_QUEUE_HEAD(evms_resync_wait);
23989 +void evms_md_done_sync(mddev_t *mddev, int blocks, int ok)
23991 + /* another "blocks" (512byte) blocks have been synced */
23992 + atomic_sub(blocks, &mddev->recovery_active);
23993 + wake_up(&mddev->recovery_wait);
23995 + // stop recovery, signal do_sync ....
23999 +#define SYNC_MARKS 10
24000 +#define SYNC_MARK_STEP (3*HZ)
24001 +int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
24004 + unsigned int max_sectors, currspeed,
24005 + j, window, err, serialize;
24006 + unsigned long mark[SYNC_MARKS];
24007 + unsigned long mark_cnt[SYNC_MARKS];
24009 + struct list_head *tmp;
24010 + unsigned long last_check;
24013 + err = down_interruptible(&mddev->resync_sem);
24019 + ITERATE_MDDEV(mddev2,tmp) {
24020 + if (mddev2 == mddev)
24022 + if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
24023 + LOG_DEFAULT("delaying resync of md%d until md%d "
24024 + "has finished resync (they share one or more physical units)\n",
24025 + mdidx(mddev), mdidx(mddev2));
24031 + interruptible_sleep_on(&evms_resync_wait);
24032 + if (md_signal_pending(current)) {
24033 + md_flush_signals();
24040 + mddev->curr_resync = 1;
24042 + max_sectors = mddev->sb->size<<1;
24044 + LOG_DEFAULT("syncing RAID array md%d\n", mdidx(mddev));
24045 + LOG_DEFAULT("minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
24046 + sysctl_speed_limit_min);
24047 + LOG_DEFAULT("using maximum available idle IO bandwith "
24048 + "(but not more than %d KB/sec) for reconstruction.\n",
24049 + sysctl_speed_limit_max);
24052 + * Resync has low priority.
24054 +#ifdef O1_SCHEDULER
24055 + set_user_nice(current,19);
24057 + current->nice = 19;
24060 + is_mddev_idle(mddev); /* this also initializes IO event counters */
24061 + for (m = 0; m < SYNC_MARKS; m++) {
24062 + mark[m] = jiffies;
24066 + mddev->resync_mark = mark[last_mark];
24067 + mddev->resync_mark_cnt = mark_cnt[last_mark];
24070 + * Tune reconstruction:
24072 + window = MD_READAHEAD*(PAGE_SIZE/512);
24073 + LOG_DEFAULT("using %dk window, over a total of %d blocks.\n",
24074 + window/2,max_sectors/2);
24076 + atomic_set(&mddev->recovery_active, 0);
24077 + init_waitqueue_head(&mddev->recovery_wait);
24079 + for (j = 0; j < max_sectors;) {
24082 + sectors = mddev->pers->sync_request(mddev, j);
24084 + if (sectors < 0) {
24088 + atomic_add(sectors, &mddev->recovery_active);
24090 + mddev->curr_resync = j;
24092 + if (last_check + window > j)
24097 + run_task_queue(&tq_disk);
24100 + if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
24102 + int next = (last_mark+1) % SYNC_MARKS;
24104 + mddev->resync_mark = mark[next];
24105 + mddev->resync_mark_cnt = mark_cnt[next];
24106 + mark[next] = jiffies;
24107 + mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
24108 + last_mark = next;
24112 + if (md_signal_pending(current)) {
24114 + * got a signal, exit.
24116 + mddev->curr_resync = 0;
24117 + LOG_DEFAULT("evms_md_do_sync() got signal ... exiting\n");
24118 + md_flush_signals();
24124 + * this loop exits only if either when we are slower than
24125 + * the 'hard' speed limit, or the system was IO-idle for
24127 + * the system might be non-idle CPU-wise, but we only care
24128 + * about not overloading the IO subsystem. (things like an
24129 + * e2fsck being done on the RAID array should execute fast)
24131 + if (md_need_resched(current))
24134 + currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
24136 + if (currspeed > sysctl_speed_limit_min) {
24137 +#ifdef O1_SCHEDULER
24138 + set_user_nice(current,19);
24140 + current->nice = 19;
24143 + if ((currspeed > sysctl_speed_limit_max) ||
24144 + !is_mddev_idle(mddev)) {
24145 +#ifdef O1_SCHEDULER
24146 + set_current_state(TASK_INTERRUPTIBLE);
24148 + current->state = TASK_INTERRUPTIBLE;
24150 + md_schedule_timeout(HZ/4);
24154 +#ifdef O1_SCHEDULER
24155 + set_user_nice(current,-20);
24157 + current->nice = -20;
24160 + LOG_DEFAULT("md%d: sync done.\n",mdidx(mddev));
24163 + * this also signals 'finished resyncing' to md_stop
24166 + wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
24167 + up(&mddev->resync_sem);
24169 + mddev->curr_resync = 0;
24170 + wake_up(&evms_resync_wait);
24177 + * This is a kernel thread which syncs a spare disk with the active array
24179 + * the amount of foolproofing might seem to be a tad excessive, but an
24180 + * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
24181 + * of my root partition with the first 0.5 gigs of my /home partition ... so
24182 + * i'm a bit nervous ;)
24184 +void evms_md_do_recovery(void *data)
24189 + mdp_disk_t *spare;
24190 + struct list_head *tmp;
24192 + LOG_DEFAULT("recovery thread got woken up ...\n");
24194 + ITERATE_MDDEV(mddev,tmp) {
24199 + if (mddev->recovery_running)
24201 + if (sb->active_disks == sb->raid_disks)
24203 + if (!sb->spare_disks) {
24204 + LOG_ERROR(" [md%d] no spare disk to reconstruct array! "
24205 + "-- continuing in degraded mode\n", mdidx(mddev));
24213 + * now here we get the spare and resync it.
24215 + spare = evms_md_get_spare(mddev);
24220 + LOG_DEFAULT(" [md%d] resyncing spare disk %s to replace failed disk\n",
24221 + mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor)));
24222 + if (!mddev->pers->diskop)
24225 + if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
24228 + down(&mddev->recovery_sem);
24229 + mddev->recovery_running = 1;
24230 + err = evms_md_do_sync(mddev, spare);
24231 + if (err == -EIO) {
24232 + LOG_DEFAULT("[md%d] spare disk %s failed, skipping to next spare.\n",
24233 + mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor)));
24234 + if (!disk_faulty(spare)) {
24235 + mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
24236 + mark_disk_faulty(spare);
24237 + mark_disk_nonsync(spare);
24238 + mark_disk_inactive(spare);
24239 + sb->spare_disks--;
24240 + sb->working_disks--;
24241 + sb->failed_disks++;
24244 + if (disk_faulty(spare))
24245 + mddev->pers->diskop(mddev, &spare,
24246 + DISKOP_SPARE_INACTIVE);
24247 + if (err == -EINTR || err == -ENOMEM) {
24249 + * Recovery got interrupted, or ran out of mem ...
24250 + * signal back that we have finished using the array.
24252 + mddev->pers->diskop(mddev, &spare,
24253 + DISKOP_SPARE_INACTIVE);
24254 + up(&mddev->recovery_sem);
24255 + mddev->recovery_running = 0;
24258 + mddev->recovery_running = 0;
24259 + up(&mddev->recovery_sem);
24261 + if (!disk_faulty(spare)) {
24263 + * the SPARE_ACTIVE diskop possibly changes the
24266 + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
24267 + mark_disk_sync(spare);
24268 + mark_disk_active(spare);
24269 + sb->active_disks++;
24270 + sb->spare_disks--;
24272 + mddev->sb_dirty = 1;
24273 + evms_md_update_sb(mddev);
24276 + LOG_DEFAULT("recovery thread finished ...\n");
24280 +static void evms_md_create_recovery_thread(void)
24282 + static char * name = "evms_mdrecoveryd";
24284 + if (!evms_md_recovery_thread) {
24285 + /* Create MD recovery thread */
24286 + evms_md_recovery_thread = evms_cs_register_thread(evms_md_do_recovery, NULL, name);
24287 + if (!evms_md_recovery_thread)
24288 + LOG_SERIOUS("%s: evms_cs_recovery_thread failed\n", __FUNCTION__);
24292 +static void evms_md_destroy_recovery_thread(void)
24294 + if (evms_md_recovery_thread && !MOD_IN_USE) {
24295 + /* Destroy MD recovery thread */
24296 + evms_cs_unregister_thread(evms_md_recovery_thread);
24297 + evms_md_recovery_thread = NULL;
24302 + * evms_md_create_logical_node
24304 +static int evms_md_create_logical_node(
24305 + struct evms_logical_node **discover_list,
24310 + struct evms_md *evms_md = NULL;
24311 + struct evms_logical_node *newnode = NULL;
24312 + struct evms_plugin_header *hdr = NULL;
24313 + struct evms_plugin_fops *fops = NULL;
24315 + rc = evms_cs_allocate_logical_node(&newnode);
24317 + evms_md = kmalloc(sizeof(*evms_md), GFP_KERNEL);
24322 + memset(evms_md,0,sizeof(*evms_md));
24323 + evms_md->mddev = mddev;
24325 + fops = kmalloc(sizeof(*fops), GFP_KERNEL);
24327 + /* copy MD plugin header
24328 + * copy function table
24329 + * replace read and write function pointers.
24331 + evms_md->instance_plugin_hdr = md_plugin_header;
24332 + memcpy(fops, &md_fops, sizeof(*fops));
24333 + fops->read = mddev->pers->read;
24334 + fops->write = mddev->pers->write;
24335 + evms_md->instance_plugin_hdr.fops = fops;
24336 + hdr = &evms_md->instance_plugin_hdr;
24338 + LOG_WARNING("%s: No memory to copy function table\n",__FUNCTION__);
24339 + rc = 0; /* clear rc and continue */
24340 + hdr = &md_plugin_header;
24345 + if (!rc && hdr) {
24346 + memset(newnode,0,sizeof(*newnode));
24347 + newnode->plugin = hdr;
24348 + newnode->total_vsectors = (u64)evms_md_size[mdidx(mddev)] * 2;
24349 + newnode->block_size = md_blocksizes[mdidx(mddev)];
24350 + newnode->hardsector_size = md_hardsect_sizes[mdidx(mddev)];
24351 + sprintf(newnode->name,"md/md%d",mdidx(mddev));
24352 + newnode->private = evms_md;
24353 + newnode->flags = flags;
24355 + rc = evms_cs_add_logical_node_to_list(discover_list, newnode);
24357 + LOG_ERROR("%s: could not add md node %s\n", __FUNCTION__, newnode->name);
24359 + LOG_DEBUG("%s: added [%s] to discover list (total_vsectors="PFU64")\n",
24360 + __FUNCTION__, newnode->name, newnode->total_vsectors);
24365 + mddev->node = newnode;
24373 + evms_cs_deallocate_logical_node(newnode);
24380 + * Function: evms_md_autostart_arrays
24381 + * Discover MD "extended" devices
24382 + * Add MD "extended" devices to pending list for further processing
24384 +static void evms_md_autostart_arrays (struct evms_logical_node **discover_list)
24386 + struct evms_logical_node *node, *next_node;
24387 + mdk_rdev_t *rdev;
24390 + LOG_ENTRY_EXIT(":autostart_arrays() ENTRY\n");
24392 + /* examine each node on the discover list */
24393 + next_node = *discover_list;
24394 + while(next_node) {
24395 + node = next_node;
24396 + next_node = node->next;
24398 + rc = evms_md_import_device(discover_list, node);
24399 + if (rc && (rc != -EEXIST)) {
24400 + LOG_EXTRA("autostart_arrrays() Not %s!\n",evms_md_partition_name(node));
24407 + rdev = evms_md_find_rdev_all(node);
24409 + LOG_ERROR("find_rdev_all() failed\n");
24412 + if (rdev->faulty) {
24418 + list_add(&rdev->pending, &pending_raid_disks);
24419 + } else if (rc == -EEXIST) {
24420 + struct evms_logical_node *md_node;
24422 + * Must be in a re-discovery process here.
24423 + * Find the EVMS MD node that this rdev is a member of
24425 + if (rdev->mddev) {
24426 + md_node = rdev->mddev->node;
24428 + rc = evms_cs_add_logical_node_to_list(discover_list,md_node);
24431 + exported_nodes++;
24432 + LOG_DETAILS("Added MD node (%s) to discover list\n",
24435 + case 1: /* already on the list */
24436 + case 2: /* already on the list */
24439 + LOG_WARNING("could not add md node (%s), rc=%d\n",
24440 + md_node->name, rc);
24443 + LOG_ERROR("This MD device [md%d] does not have an EVMS logical node.\n",
24444 + rdev->mddev->__minor);
24447 + LOG_ERROR("This device [%s] does not belong to any array!\n",
24448 + get_partition_name(rdev));
24449 + evms_md_export_rdev(rdev, TRUE);
24451 + evms_cs_remove_logical_node_from_list(discover_list,node);
24455 + evms_md_run_devices(discover_list);
24456 + LOG_DETAILS("EVMD MD:autostart_arrays() EXIT (exported_nodes=%d)\n",exported_nodes);
24459 +#ifdef CONFIG_PROC_FS
24460 +static int status_resync(char * page, off_t * offset, int count, mddev_t * mddev)
24463 + off_t off = *offset;
24464 + unsigned long max_blocks, resync, res, dt, db, rt;
24466 + resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
24467 + max_blocks = mddev->sb->size;
24470 + * Should not happen.
24472 + if (!max_blocks) {
24476 + res = (resync/1024)*1000/(max_blocks/1024 + 1);
24478 + int i, x = res/50, y = 20-x;
24480 + for (i = 0; i < x; i++)
24482 + sz += sprintf(page + sz, ">");
24483 + for (i = 0; i < y; i++)
24487 + if (!mddev->recovery_running)
24491 + PROCPRINT(" resync =%3lu.%lu%% (%lu/%lu)",
24492 + res/10, res % 10, resync, max_blocks);
24497 + PROCPRINT(" recovery =%3lu.%lu%% (%lu/%lu)",
24498 + res/10, res % 10, resync, max_blocks);
24501 + * We do not want to overflow, so the order of operands and
24502 + * the * 100 / 100 trick are important. We do a +1 to be
24503 + * safe against division by zero. We only estimate anyway.
24505 + * dt: time from mark until now
24506 + * db: blocks written from mark until now
24507 + * rt: remaining time
24509 + dt = ((jiffies - mddev->resync_mark) / HZ);
24511 + db = resync - (mddev->resync_mark_cnt/2);
24512 + rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
24514 + PROCPRINT(" finish=%lu.%lumin", rt / 60, (rt % 60)/6);
24516 + PROCPRINT(" speed=%ldK/sec", db/dt);
24523 +static int evms_md_status_read_proc(char *page, char **start, off_t off,
24524 + int count, int *eof, void *data)
24526 + int sz = 0, j, size;
24527 + struct list_head *tmp, *tmp2;
24528 + mdk_rdev_t *rdev;
24531 + PROCPRINT("Enterprise Volume Management System: MD Status\n");
24532 + PROCPRINT("Personalities : ");
24533 + for (j = 0; j < MAX_PERSONALITY; j++)
24535 + PROCPRINT("[%s] ", pers[j]->name);
24540 + ITERATE_MDDEV(mddev,tmp) {
24541 + PROCPRINT("md%d : %sactive", mdidx(mddev),
24542 + mddev->pers ? "" : "in");
24543 + if (mddev->pers) {
24545 + PROCPRINT(" (read-only)");
24546 + PROCPRINT(" %s", mddev->pers->name);
24550 + ITERATE_RDEV(mddev,rdev,tmp2) {
24551 + PROCPRINT(" %s[%d]",
24552 + rdev->node->name, rdev->desc_nr);
24553 + if (rdev->faulty) {
24554 + PROCPRINT("(F)");
24557 + size += rdev->size;
24560 + if (mddev->nb_dev) {
24562 + PROCPRINT("\n "PFU64" blocks",
24563 + mddev->node->total_vsectors >> 1);
24565 + PROCPRINT("\n %d blocks", size);
24568 + if (!mddev->pers) {
24573 + sz += mddev->pers->status (page+sz, mddev);
24575 + PROCPRINT("\n ");
24576 + if (mddev->curr_resync) {
24577 + sz += status_resync (page+sz, &off, count, mddev);
24579 + if (atomic_read(&mddev->resync_sem.count) != 1)
24580 + PROCPRINT(" resync=DELAYED");
24587 + *start = page + off;
24591 + return sz > count ? count : sz;
24595 +/* Function: md_core_init
24597 +int __init md_core_init(void)
24599 +#ifdef CONFIG_PROC_FS
24600 + struct proc_dir_entry *evms_proc_dir;
24603 +#ifdef CONFIG_PROC_FS
24604 + evms_proc_dir = evms_cs_get_evms_proc_dir();
24605 + if (evms_proc_dir) {
24606 + create_proc_read_entry("mdstat", 0, evms_proc_dir, evms_md_status_read_proc, NULL);
24608 + md_table_header = register_sysctl_table(dev_dir_table, 1);
24611 + return evms_cs_register_plugin(&md_plugin_header);
24614 +static void __exit md_core_exit(void)
24616 +#ifdef CONFIG_PROC_FS
24617 + struct proc_dir_entry *evms_proc_dir;
24619 + evms_proc_dir = evms_cs_get_evms_proc_dir();
24620 + if (evms_proc_dir) {
24621 + remove_proc_entry("mdstat", evms_proc_dir);
24623 + unregister_sysctl_table(md_table_header);
24625 + evms_cs_unregister_plugin(&md_plugin_header);
24628 +module_init(md_core_init);
24629 +module_exit(md_core_exit);
24630 +#ifdef MODULE_LICENSE
24631 +MODULE_LICENSE("GPL");
24635 + * In order to have the coexistence of this EVMS plugin and the original MD
24636 + * module, the symbols exported by this plugin are prefixed with "evms_"
24639 +MD_EXPORT_SYMBOL(evms_md_size);
24640 +MD_EXPORT_SYMBOL(evms_register_md_personality);
24641 +MD_EXPORT_SYMBOL(evms_unregister_md_personality);
24642 + /* Export the following function for use with rdev->node in evms_md_k.h */
24643 +MD_EXPORT_SYMBOL(evms_md_partition_name);
24644 + /* Export the following function for use with disks[] in md_p.h */
24645 +MD_EXPORT_SYMBOL(evms_md_error);
24646 +MD_EXPORT_SYMBOL(evms_md_error_dev);
24647 +MD_EXPORT_SYMBOL(evms_md_update_sb);
24648 +MD_EXPORT_SYMBOL(evms_md_find_rdev_nr);
24649 +MD_EXPORT_SYMBOL(evms_md_find_rdev);
24650 +MD_EXPORT_SYMBOL(evms_md_find_rdev_from_node);
24651 +MD_EXPORT_SYMBOL(evms_md_print_devices);
24652 +MD_EXPORT_SYMBOL(evms_mddev_map);
24653 +MD_EXPORT_SYMBOL(evms_md_check_ordering);
24654 +MD_EXPORT_SYMBOL(evms_md_partial_sync_io);
24655 +MD_EXPORT_SYMBOL(evms_md_sync_io);
24656 +MD_EXPORT_SYMBOL(evms_md_do_sync);
24657 +MD_EXPORT_SYMBOL(evms_md_sync_acct);
24658 +MD_EXPORT_SYMBOL(evms_md_done_sync);
24659 +MD_EXPORT_SYMBOL(evms_md_recover_arrays);
24660 +MD_EXPORT_SYMBOL(evms_md_get_spare);
24662 diff -Naur linux-2002-09-30/drivers/evms/md_linear.c evms-2002-09-30/drivers/evms/md_linear.c
24663 --- linux-2002-09-30/drivers/evms/md_linear.c Wed Dec 31 18:00:00 1969
24664 +++ evms-2002-09-30/drivers/evms/md_linear.c Thu Aug 15 13:50:12 2002
24667 + linear.c : Multiple Devices driver for Linux
24668 + Copyright (C) 1994-96 Marc ZYNGIER
24669 + <zyngier@ufr-info-p7.ibp.fr> or
24670 + <maz@gloups.fdn.fr>
24672 + Linear mode management functions.
24674 + This program is free software; you can redistribute it and/or modify
24675 + it under the terms of the GNU General Public License as published by
24676 + the Free Software Foundation; either version 2, or (at your option)
24677 + any later version.
24679 + You should have received a copy of the GNU General Public License
24680 + (for example /usr/src/linux/COPYING); if not, write to the Free
24681 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24684 +#include <linux/module.h>
24685 +#include <linux/evms/evms_md.h>
24686 +#include <linux/evms/evms_linear.h>
24687 +#include <linux/slab.h>
24690 +#define MAJOR_NR MD_MAJOR
24692 +#define MD_PERSONALITY
24694 +#define LOG_PREFIX "md linear: "
24695 +static int linear_run (mddev_t *mddev)
24697 + linear_conf_t *conf;
24698 + struct linear_hash *table;
24699 + mdk_rdev_t *rdev;
24700 + int size, i, j, nb_zone;
24701 + unsigned int curr_offset;
24703 + MOD_INC_USE_COUNT;
24705 + conf = kmalloc (sizeof (*conf), GFP_KERNEL);
24708 + mddev->private = conf;
24710 + if (evms_md_check_ordering(mddev)) {
24711 + printk("linear: disks are not ordered, aborting!\n");
24716 + * Find the smallest device.
24719 + conf->smallest = NULL;
24721 + ITERATE_RDEV_ORDERED(mddev,rdev,j) {
24722 + dev_info_t *disk = conf->disks + j;
24723 + disk->node = rdev->node;
24724 + disk->dev = rdev->dev;
24725 + disk->size = rdev->size;
24726 + disk->offset = curr_offset;
24728 + curr_offset += disk->size;
24730 + if (!conf->smallest || (disk->size < conf->smallest->size))
24731 + conf->smallest = disk;
24734 + nb_zone = conf->nr_zones = evms_md_size[mdidx(mddev)] / conf->smallest->size +
24735 + ((evms_md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
24737 + conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
24739 + if (!conf->hash_table)
24743 + * Here we generate the linear hash table
24745 + table = conf->hash_table;
24748 + for (j = 0; j < mddev->nb_dev; j++) {
24749 + dev_info_t *disk = conf->disks + j;
24752 + table[-1].dev1 = disk;
24754 + size += disk->size;
24757 + table->dev0 = disk;
24758 + table->dev1 = NULL;
24759 + size -= conf->smallest->size;
24763 + if (table-conf->hash_table != nb_zone)
24765 + LOG_DETAILS("%s: nr_zones=%d, smallest=%lu\n",
24766 + __FUNCTION__, conf->nr_zones, conf->smallest->size);
24772 + MOD_DEC_USE_COUNT;
24776 +static int linear_stop (mddev_t *mddev)
24778 + linear_conf_t *conf = mddev_to_conf(mddev);
24780 + kfree(conf->hash_table);
24783 + MOD_DEC_USE_COUNT;
24789 + * Function: linear_map
24791 +static int linear_map(
24793 + struct evms_logical_node **node,
24794 + struct buffer_head *bh)
24796 + linear_conf_t *conf = mddev_to_conf(mddev);
24797 + struct linear_hash *hash;
24798 + dev_info_t *tmp_dev;
24799 + unsigned long block;
24801 + block = (bh->b_rsector >> 1);
24802 + hash = conf->hash_table + (block / conf->smallest->size);
24803 + if (block >= (hash->dev0->size + hash->dev0->offset)) {
24804 + if (!hash->dev1) {
24805 + LOG_ERROR("%s: hash->dev1==NULL for block %ld\n", __FUNCTION__, block);
24808 + tmp_dev = hash->dev1;
24810 + tmp_dev = hash->dev0;
24812 + if ( (block + (bh->b_size >> 10)) > (tmp_dev->size + tmp_dev->offset)
24813 + || block < tmp_dev->offset) {
24814 + LOG_ERROR("%s: Block %ld out of bounds on node %s size %ld offset %ld\n",
24817 + tmp_dev->node->name,
24819 + tmp_dev->offset);
24822 + bh->b_rsector -= (tmp_dev->offset << 1);
24823 + *node = tmp_dev->node;
24827 +static void linear_read(
24828 + struct evms_logical_node *md_node,
24829 + struct buffer_head *bh)
24831 + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
24832 + struct evms_logical_node *node;
24834 + if (evms_md_check_boundary(md_node, bh)) return;
24836 + if (!linear_map(mddev, &node, bh)) {
24839 + bh->b_end_io(bh, 0);
24843 +static void linear_write(
24844 + struct evms_logical_node *md_node,
24845 + struct buffer_head *bh)
24847 + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
24848 + struct evms_logical_node *node;
24850 + if (evms_md_check_boundary(md_node, bh)) return;
24852 + if (!linear_map(mddev, &node, bh)) {
24855 + bh->b_end_io(bh, 0);
24859 +static int linear_status (char *page, mddev_t *mddev)
24866 + linear_conf_t *conf = mddev_to_conf(mddev);
24868 + sz += sprintf(page+sz, " ");
24869 + for (j = 0; j < conf->nr_zones; j++)
24871 + sz += sprintf(page+sz, "[%s",
24872 + partition_name(conf->hash_table[j].dev0->dev));
24874 + if (conf->hash_table[j].dev1)
24875 + sz += sprintf(page+sz, "/%s] ",
24876 + partition_name(conf->hash_table[j].dev1->dev));
24878 + sz += sprintf(page+sz, "] ");
24880 + sz += sprintf(page+sz, "\n");
24882 + sz += sprintf(page+sz, " %dk rounding", mddev->chunk_size/1024);
24886 +static int linear_evms_ioctl (
24888 + struct inode * inode,
24889 + struct file * file,
24890 + unsigned int cmd,
24891 + unsigned long arg)
24894 + struct evms_logical_node *node;
24897 + case EVMS_GET_BMAP:
24899 + struct evms_get_bmap_pkt *bmap = (struct evms_get_bmap_pkt *)arg;
24900 + struct buffer_head *bh =
24901 + evms_cs_allocate_from_pool(evms_bh_pool, FALSE);
24903 + bh->b_rsector = (unsigned long)bmap->rsector;
24904 + bh->b_size = node->block_size;
24905 + rc = linear_map(mddev, &node, bh);
24907 + bmap->rsector = (u64)bh->b_rsector;
24909 + rc = IOCTL(node, inode, file, cmd, arg);
24913 + evms_cs_deallocate_to_pool(evms_bh_pool, bh);
24925 +static mdk_personality_t linear_personality = {
24926 + .name = "evms_linear",
24927 + .read = linear_read,
24928 + .write = linear_write,
24929 + .run = linear_run,
24930 + .stop = linear_stop,
24931 + .status = linear_status,
24932 + .evms_ioctl = linear_evms_ioctl
24935 +static int md__init linear_init (void)
24937 + return evms_register_md_personality (LINEAR, &linear_personality);
24940 +static void linear_exit (void)
24942 + evms_unregister_md_personality (LINEAR);
24946 +module_init(linear_init);
24947 +module_exit(linear_exit);
24948 +#ifdef MODULE_LICENSE
24949 +MODULE_LICENSE("GPL");
24951 diff -Naur linux-2002-09-30/drivers/evms/md_raid0.c evms-2002-09-30/drivers/evms/md_raid0.c
24952 --- linux-2002-09-30/drivers/evms/md_raid0.c Wed Dec 31 18:00:00 1969
24953 +++ evms-2002-09-30/drivers/evms/md_raid0.c Thu Aug 15 13:50:12 2002
24956 + raid0.c : Multiple Devices driver for Linux
24957 + Copyright (C) 1994-96 Marc ZYNGIER
24958 + <zyngier@ufr-info-p7.ibp.fr> or
24959 + <maz@gloups.fdn.fr>
24960 + Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
24963 + RAID-0 management functions.
24965 + This program is free software; you can redistribute it and/or modify
24966 + it under the terms of the GNU General Public License as published by
24967 + the Free Software Foundation; either version 2, or (at your option)
24968 + any later version.
24970 + You should have received a copy of the GNU General Public License
24971 + (for example /usr/src/linux/COPYING); if not, write to the Free
24972 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24975 +#include <linux/module.h>
24976 +#include <linux/evms/evms_raid0.h>
24978 +#define MAJOR_NR MD_MAJOR
24980 +#define MD_PERSONALITY
24982 +#define LOG_PREFIX "md raid0: "
24984 +static int create_strip_zones (mddev_t *mddev)
24986 + int i, c, j, j1, j2;
24987 + unsigned long current_offset, curr_zone_offset, rdev_size_in_sects;
24988 + raid0_conf_t *conf = mddev_to_conf(mddev);
24989 + mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
24992 + * The number of 'same size groups'
24994 + conf->nr_strip_zones = 0;
24996 + ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
24997 + LOG_DEBUG(" looking at %s\n", evms_md_partition_name(rdev1->node));
24999 + ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
25000 + LOG_DEBUG(" comparing %s(%ld sectors) with %s(%ld sectors)\n",
25001 + evms_md_partition_name(rdev1->node), rdev1->size << 1,
25002 + evms_md_partition_name(rdev2->node), rdev2->size << 1);
25003 + if (rdev2 == rdev1) {
25004 + LOG_DEBUG(" END\n");
25007 + if (rdev2->size == rdev1->size)
25010 + * Not unique, don't count it as a new
25013 + LOG_DEBUG(" EQUAL\n");
25017 + LOG_DEBUG(" NOT EQUAL\n");
25020 + LOG_DEBUG(" ==> UNIQUE\n");
25021 + conf->nr_strip_zones++;
25022 + LOG_DEBUG(" %d zones\n",conf->nr_strip_zones);
25025 + LOG_DEBUG(" FINAL %d zones\n",conf->nr_strip_zones);
25027 + conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
25028 + conf->nr_strip_zones);
25029 + if (!conf->strip_zone)
25033 + conf->smallest = NULL;
25034 + current_offset = 0;
25035 + curr_zone_offset = 0;
25037 + for (i = 0; i < conf->nr_strip_zones; i++)
25039 + struct strip_zone *zone = conf->strip_zone + i;
25041 + LOG_DEBUG(" zone %d\n", i);
25042 + zone->dev_offset = current_offset;
25046 + ITERATE_RDEV_ORDERED(mddev,rdev,j) {
25048 + LOG_DEBUG(" checking %s ...",evms_md_partition_name(rdev->node));
25049 + rdev_size_in_sects = rdev->size << 1;
25050 + if (rdev_size_in_sects > current_offset)
25052 + LOG_DEBUG(" contained as device %d\n", c);
25053 + zone->node[c] = rdev->node;
25055 + if (!smallest || (rdev_size_in_sects < (smallest->size <<1) )) {
25057 + LOG_DEBUG(" (%ld) is smallest!.\n", rdev_size_in_sects);
25060 + LOG_DEBUG(" nope.\n");
25063 + zone->nb_dev = c;
25064 + zone->size_in_sects = ((smallest->size <<1) - current_offset) * c;
25065 + LOG_DEBUG(" zone->nb_dev: %d, size: %ld\n",
25066 + zone->nb_dev,zone->size_in_sects);
25068 + if (!conf->smallest || (zone->size_in_sects < conf->smallest->size_in_sects))
25069 + conf->smallest = zone;
25071 + zone->zone_offset = curr_zone_offset;
25072 + curr_zone_offset += zone->size_in_sects;
25074 + current_offset = smallest->size << 1;
25075 + LOG_DEBUG(" current zone offset: %ld\n",current_offset);
25077 + LOG_DEBUG(" done.\n");
25081 +static int raid0_run (mddev_t *mddev)
25083 + unsigned long cur=0, i=0, size, zone0_size, nb_zone;
25084 + unsigned long mddev_size_in_sects = evms_md_size[mdidx(mddev)] << 1;
25085 + raid0_conf_t *conf;
25087 + MOD_INC_USE_COUNT;
25089 + conf = vmalloc(sizeof (raid0_conf_t));
25092 + mddev->private = (void *)conf;
25094 + if (evms_md_check_ordering(mddev)) {
25095 + LOG_ERROR("disks are not ordered, aborting!\n");
25096 + goto out_free_conf;
25099 + if (create_strip_zones (mddev))
25100 + goto out_free_conf;
25102 + LOG_DETAILS("evms_md_size is %ld sectors.\n", mddev_size_in_sects);
25103 + LOG_DETAILS("conf->smallest->size_in_sects is %ld sectors.\n", conf->smallest->size_in_sects);
25104 + nb_zone = mddev_size_in_sects / conf->smallest->size_in_sects +
25105 + (mddev_size_in_sects % conf->smallest->size_in_sects ? 1 : 0);
25106 + LOG_DETAILS("nb_zone is %ld.\n", nb_zone);
25107 + conf->nr_zones = nb_zone;
25109 + LOG_DEBUG("Allocating %ld bytes for hash.\n", nb_zone*sizeof(struct raid0_hash));
25111 + conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
25112 + if (!conf->hash_table)
25113 + goto out_free_zone_conf;
25114 + size = conf->strip_zone[cur].size_in_sects;
25117 + while (cur < conf->nr_strip_zones) {
25118 + conf->hash_table[i].zone0 = conf->strip_zone + cur;
25121 + * If we completely fill the slot
25123 + if (size >= conf->smallest->size_in_sects) {
25124 + conf->hash_table[i++].zone1 = NULL;
25125 + size -= conf->smallest->size_in_sects;
25128 + if (++cur == conf->nr_strip_zones)
25130 + size = conf->strip_zone[cur].size_in_sects;
25134 + if (++cur == conf->nr_strip_zones) {
25136 + * Last dev, set unit1 as NULL
25138 + conf->hash_table[i].zone1=NULL;
25143 + * Here we use a 2nd dev to fill the slot
25145 + zone0_size = size;
25146 + size = conf->strip_zone[cur].size_in_sects;
25147 + conf->hash_table[i++].zone1 = conf->strip_zone + cur;
25148 + size -= (conf->smallest->size_in_sects - zone0_size);
25152 +out_free_zone_conf:
25153 + vfree(conf->strip_zone);
25154 + conf->strip_zone = NULL;
25158 + mddev->private = NULL;
25160 + MOD_DEC_USE_COUNT;
25164 +static int raid0_stop (mddev_t *mddev)
25166 + raid0_conf_t *conf = mddev_to_conf(mddev);
25168 + vfree (conf->hash_table);
25169 + conf->hash_table = NULL;
25170 + vfree (conf->strip_zone);
25171 + conf->strip_zone = NULL;
25173 + mddev->private = NULL;
25175 + MOD_DEC_USE_COUNT;
25181 + * Function: raid0_map
25183 + * Return 0 for success, else error
25187 +static inline int raid0_map(
25189 + unsigned long lsn,
25190 + unsigned long size,
25191 + struct evms_logical_node **node,
25192 + unsigned long *new_lsn,
25193 + unsigned long *new_size)
25195 + unsigned int sect_in_chunk, chunksize_bits, chunk_size_in_sects;
25196 + raid0_conf_t *conf = mddev_to_conf(mddev);
25197 + struct raid0_hash *hash;
25198 + struct strip_zone *zone;
25199 + unsigned long chunk;
25201 + chunk_size_in_sects = mddev->chunk_size >> EVMS_VSECTOR_SIZE_SHIFT;
25202 + chunksize_bits = ffz(~chunk_size_in_sects);
25203 + hash = conf->hash_table + (lsn / conf->smallest->size_in_sects);
25205 + /* Sanity check */
25209 + if (!hash->zone0)
25212 + if (lsn >= (hash->zone0->size_in_sects + hash->zone0->zone_offset)) {
25213 + if (!hash->zone1)
25215 + zone = hash->zone1;
25217 + zone = hash->zone0;
25219 + sect_in_chunk = lsn & (chunk_size_in_sects - 1);
25220 + chunk = (lsn - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
25221 + *node = zone->node[(lsn >> chunksize_bits) % zone->nb_dev];
25223 + *new_lsn = ((chunk << chunksize_bits) + zone->dev_offset) + sect_in_chunk;
25225 + *new_size = (size <= chunk_size_in_sects - sect_in_chunk) ?
25226 + size : chunk_size_in_sects - sect_in_chunk;
25231 + LOG_ERROR("%s: bug: hash==NULL for lsn %lu\n", __FUNCTION__, lsn);
25234 + LOG_ERROR("%s: bug: hash->zone0==NULL for lsn %lu\n", __FUNCTION__, lsn);
25237 + LOG_ERROR("%s: bug: hash->zone1==NULL for lsn %lu\n", __FUNCTION__, lsn);
25242 +void raid0_error(int rw, struct evms_logical_node *node, struct buffer_head *bh)
25244 + LOG_ERROR(" %s FAILED on node(%s) rsector(%lu) size(%d)\n",
25245 + (rw == READ) ? "READ" : "WRITE",
25250 + bh->b_end_io(bh, 0);
25253 +static inline void raid0_rw (
25254 + struct evms_logical_node *md_node,
25255 + struct buffer_head *bh,
25258 + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
25259 + struct evms_logical_node *node;
25260 + unsigned long new_lsn, size_in_sects, new_size;
25262 + if (evms_md_check_boundary(md_node, bh)) return;
25263 + size_in_sects = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
25264 + if (!raid0_map(mddev, bh->b_rsector, size_in_sects, &node, &new_lsn, &new_size)) {
25265 + if (new_size == size_in_sects) {
25267 + * This is the normal case:
25268 + * the request is entirely within the stripe boundary
25270 + bh->b_rsector = new_lsn;
25271 + if (rw == READ) {
25280 + * Need more processing here (ie. break up the request)
25282 + LOG_ERROR("This version of EVMS RAID0 does not support I/O requests that are:\n");
25283 + LOG_ERROR(" - larger than the stripe size\n");
25284 + LOG_ERROR(" - cross the stripe boundary\n");
25287 + raid0_error(rw, node, bh);
25290 +static void raid0_read(
25291 + struct evms_logical_node *md_node,
25292 + struct buffer_head *bh)
25294 + raid0_rw(md_node, bh, READ);
25297 +static void raid0_write(
25298 + struct evms_logical_node *md_node,
25299 + struct buffer_head *bh)
25301 + raid0_rw(md_node, bh, WRITE);
25304 +static int raid0_status (char *page, mddev_t *mddev)
25310 + raid0_conf_t *conf = mddev_to_conf(mddev);
25312 + sz += sprintf(page + sz, " ");
25313 + for (j = 0; j < conf->nr_zones; j++) {
25314 + sz += sprintf(page + sz, "[z%d",
25315 + conf->hash_table[j].zone0 - conf->strip_zone);
25316 + if (conf->hash_table[j].zone1)
25317 + sz += sprintf(page+sz, "/z%d] ",
25318 + conf->hash_table[j].zone1 - conf->strip_zone);
25320 + sz += sprintf(page+sz, "] ");
25323 + sz += sprintf(page + sz, "\n");
25325 + for (j = 0; j < conf->nr_strip_zones; j++) {
25326 + sz += sprintf(page + sz, " z%d=[", j);
25327 + for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
25328 + sz += sprintf (page+sz, "%s/", conf->strip_zone[j].node[k]->name);
25330 + sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
25331 + conf->strip_zone[j].zone_offset,
25332 + conf->strip_zone[j].dev_offset,
25333 + conf->strip_zone[j].size_in_sects);
25336 + sz += sprintf(page + sz, " %dk chunks", mddev->chunk_size/1024);
25340 +static int raid0_evms_ioctl (
25342 + struct inode * inode,
25343 + struct file * file,
25344 + unsigned int cmd,
25345 + unsigned long arg)
25348 + struct evms_logical_node *node;
25351 + case EVMS_GET_BMAP:
25353 + struct evms_get_bmap_pkt *bmap = (struct evms_get_bmap_pkt *)arg;
25354 + unsigned long new_lsn, new_size;
25355 + unsigned long size = mddev->node->block_size >> EVMS_VSECTOR_SIZE_SHIFT;
25356 + rc = raid0_map(mddev,
25357 + (unsigned long)bmap->rsector,
25364 + bmap->rsector = (u64)new_lsn;
25365 + rc = IOCTL(node, inode, file, cmd, arg);
25378 +static mdk_personality_t raid0_personality = {
25379 + .name = "evms_raid0",
25380 + .read = raid0_read,
25381 + .write = raid0_write,
25382 + .run = raid0_run,
25383 + .stop = raid0_stop,
25384 + .status = raid0_status,
25385 + .evms_ioctl = raid0_evms_ioctl
25388 +static int md__init raid0_init (void)
25390 + return evms_register_md_personality (RAID0, &raid0_personality);
25393 +static void raid0_exit (void)
25395 + evms_unregister_md_personality (RAID0);
25398 +module_init(raid0_init);
25399 +module_exit(raid0_exit);
25400 +#ifdef MODULE_LICENSE
25401 +MODULE_LICENSE("GPL");
25403 diff -Naur linux-2002-09-30/drivers/evms/md_raid1.c evms-2002-09-30/drivers/evms/md_raid1.c
25404 --- linux-2002-09-30/drivers/evms/md_raid1.c Wed Dec 31 18:00:00 1969
25405 +++ evms-2002-09-30/drivers/evms/md_raid1.c Mon Sep 30 00:02:48 2002
25408 + * md_raid1.c : Multiple Devices driver for Linux
25410 + * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
25412 + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
25414 + * RAID-1 management functions.
25416 + * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
25418 + * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
25419 + * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
25421 + * 'md_raid1.c' is an EVMS version of linux/drivers/md/raid1.c modified
25422 + * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
25424 + * This program is free software; you can redistribute it and/or modify
25425 + * it under the terms of the GNU General Public License as published by
25426 + * the Free Software Foundation; either version 2, or (at your option)
25427 + * any later version.
25429 + * You should have received a copy of the GNU General Public License
25430 + * (for example /usr/src/linux/COPYING); if not, write to the Free
25431 + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25434 +#include <linux/module.h>
25435 +#include <linux/slab.h>
25436 +#include <linux/evms/evms_raid1.h>
25437 +#include <asm/atomic.h>
25439 +#define MAJOR_NR MD_MAJOR
25441 +#define MD_PERSONALITY
25443 +#define MAX_WORK_PER_DISK 128
25445 +#define NR_RESERVED_BUFS 32
25447 +#define LOG_PREFIX "md raid1: "
25449 + * The following can be used to debug the driver
25451 +#define RAID1_DEBUG 0
25454 +#define PRINTK(x...) LOG_DEFAULT(x)
25456 +#define __inline__
25458 +#define PRINTK(x...) do { } while (0)
25462 +static mdk_personality_t raid1_personality;
25463 +static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
25464 +struct raid1_bh *evms_raid1_retry_list = NULL, **evms_raid1_retry_tail;
25466 +static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
25468 + /* return a linked list of "cnt" struct buffer_heads.
25469 + * don't take any off the free list unless we know we can
25470 + * get all we need, otherwise we could deadlock
25472 + struct buffer_head *bh=NULL;
25475 + struct buffer_head *t;
25476 + md_spin_lock_irq(&conf->device_lock);
25477 + if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
25479 + t = conf->freebh;
25480 + conf->freebh = t->b_next;
25484 + conf->freebh_cnt--;
25487 + md_spin_unlock_irq(&conf->device_lock);
25490 + t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
25496 + PRINTK("raid1: waiting for %d bh\n", cnt);
25497 + conf->freebh_blocked = 1;
25498 + wait_disk_event(conf->wait_buffer,
25499 + !conf->freebh_blocked ||
25500 + conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
25501 + conf->freebh_blocked = 0;
25507 +static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
25509 + unsigned long flags;
25510 + spin_lock_irqsave(&conf->device_lock, flags);
25512 + struct buffer_head *t = bh;
25514 + if (t->b_pprev == NULL)
25515 + kmem_cache_free(bh_cachep, t);
25517 + t->b_next= conf->freebh;
25518 + conf->freebh = t;
25519 + conf->freebh_cnt++;
25522 + spin_unlock_irqrestore(&conf->device_lock, flags);
25523 + wake_up(&conf->wait_buffer);
25526 +static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
25528 + /* allocate cnt buffer_heads, possibly less if kmalloc fails */
25531 + while (i < cnt) {
25532 + struct buffer_head *bh;
25533 + bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
25536 + md_spin_lock_irq(&conf->device_lock);
25537 + bh->b_pprev = &conf->freebh;
25538 + bh->b_next = conf->freebh;
25539 + conf->freebh = bh;
25540 + conf->freebh_cnt++;
25541 + md_spin_unlock_irq(&conf->device_lock);
25548 +static void raid1_shrink_bh(raid1_conf_t *conf)
25550 + /* discard all buffer_heads */
25552 + md_spin_lock_irq(&conf->device_lock);
25553 + while (conf->freebh) {
25554 + struct buffer_head *bh = conf->freebh;
25555 + conf->freebh = bh->b_next;
25556 + kmem_cache_free(bh_cachep, bh);
25557 + conf->freebh_cnt--;
25559 + md_spin_unlock_irq(&conf->device_lock);
25563 +static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
25565 + struct raid1_bh *r1_bh = NULL;
25568 + md_spin_lock_irq(&conf->device_lock);
25569 + if (!conf->freer1_blocked && conf->freer1) {
25570 + r1_bh = conf->freer1;
25571 + conf->freer1 = r1_bh->next_r1;
25572 + conf->freer1_cnt--;
25573 + r1_bh->next_r1 = NULL;
25574 + r1_bh->state = (1 << R1BH_PreAlloc);
25575 + r1_bh->bh_req.b_state = 0;
25577 + md_spin_unlock_irq(&conf->device_lock);
25580 + r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
25582 + memset(r1_bh, 0, sizeof(*r1_bh));
25585 + conf->freer1_blocked = 1;
25586 + wait_disk_event(conf->wait_buffer,
25587 + !conf->freer1_blocked ||
25588 + conf->freer1_cnt > NR_RESERVED_BUFS/2
25590 + conf->freer1_blocked = 0;
25594 +static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
25596 + struct buffer_head *bh = r1_bh->mirror_bh_list;
25597 + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
25599 + r1_bh->mirror_bh_list = NULL;
25601 + if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
25602 + unsigned long flags;
25603 + spin_lock_irqsave(&conf->device_lock, flags);
25604 + r1_bh->next_r1 = conf->freer1;
25605 + conf->freer1 = r1_bh;
25606 + conf->freer1_cnt++;
25607 + spin_unlock_irqrestore(&conf->device_lock, flags);
25608 + /* don't need to wakeup wait_buffer because
25609 + * raid1_free_bh below will do that
25614 + raid1_free_bh(conf, bh);
25617 +static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
25621 + while (i < cnt) {
25622 + struct raid1_bh *r1_bh;
25623 + r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
25626 + memset(r1_bh, 0, sizeof(*r1_bh));
25627 + set_bit(R1BH_PreAlloc, &r1_bh->state);
25628 + r1_bh->mddev = conf->mddev;
25630 + raid1_free_r1bh(r1_bh);
25636 +static void raid1_shrink_r1bh(raid1_conf_t *conf)
25638 + md_spin_lock_irq(&conf->device_lock);
25639 + while (conf->freer1) {
25640 + struct raid1_bh *r1_bh = conf->freer1;
25641 + conf->freer1 = r1_bh->next_r1;
25642 + conf->freer1_cnt--;
25645 + md_spin_unlock_irq(&conf->device_lock);
25650 +static inline void raid1_free_buf(struct raid1_bh *r1_bh)
25652 + unsigned long flags;
25653 + struct buffer_head *bh = r1_bh->mirror_bh_list;
25654 + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
25655 + r1_bh->mirror_bh_list = NULL;
25657 + spin_lock_irqsave(&conf->device_lock, flags);
25658 + r1_bh->next_r1 = conf->freebuf;
25659 + conf->freebuf = r1_bh;
25660 + spin_unlock_irqrestore(&conf->device_lock, flags);
25661 + raid1_free_bh(conf, bh);
25664 +static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
25666 + struct raid1_bh *r1_bh;
25668 + md_spin_lock_irq(&conf->device_lock);
25669 + wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
25670 + r1_bh = conf->freebuf;
25671 + conf->freebuf = r1_bh->next_r1;
25672 + r1_bh->next_r1= NULL;
25673 + md_spin_unlock_irq(&conf->device_lock);
25677 +static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
25681 + md_spin_lock_irq(&conf->device_lock);
25682 + while (i < cnt) {
25683 + struct raid1_bh *r1_bh;
25684 + struct page *page;
25686 + page = alloc_page(GFP_KERNEL);
25690 + r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
25692 + __free_page(page);
25695 + memset(r1_bh, 0, sizeof(*r1_bh));
25696 + r1_bh->bh_req.b_page = page;
25697 + r1_bh->bh_req.b_data = page_address(page);
25698 + r1_bh->next_r1 = conf->freebuf;
25699 + conf->freebuf = r1_bh;
25702 + md_spin_unlock_irq(&conf->device_lock);
25706 +static void raid1_shrink_buffers (raid1_conf_t *conf)
25708 + md_spin_lock_irq(&conf->device_lock);
25709 + while (conf->freebuf) {
25710 + struct raid1_bh *r1_bh = conf->freebuf;
25711 + conf->freebuf = r1_bh->next_r1;
25712 + __free_page(r1_bh->bh_req.b_page);
25715 + md_spin_unlock_irq(&conf->device_lock);
25720 + * EVMS raid1 version of raid1_map()
25722 +static int evms_raid1_map (mddev_t *mddev, struct evms_logical_node **node, kdev_t *rdev)
25724 + raid1_conf_t *conf = mddev_to_conf(mddev);
25728 + * Later we do read balancing on the read side
25729 + * now we use the first available disk.
25732 + for (i = 0; i < MD_SB_DISKS; i++) {
25733 + if (conf->mirrors[i].operational) {
25734 + *node = conf->mirrors[i].node;
25735 + *rdev = conf->mirrors[i].dev;
25740 + LOG_ERROR("huh, no more operational devices?\n");
25744 +static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
25746 + unsigned long flags;
25747 + mddev_t *mddev = r1_bh->mddev;
25748 + raid1_conf_t *conf = mddev_to_conf(mddev);
25750 + md_spin_lock_irqsave(&retry_list_lock, flags);
25751 + if (evms_raid1_retry_list == NULL)
25752 + evms_raid1_retry_tail = &evms_raid1_retry_list;
25753 + *evms_raid1_retry_tail = r1_bh;
25754 + evms_raid1_retry_tail = &r1_bh->next_r1;
25755 + r1_bh->next_r1 = NULL;
25756 + md_spin_unlock_irqrestore(&retry_list_lock, flags);
25757 + evms_cs_wakeup_thread(conf->thread);
25761 +static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
25763 + unsigned long flags;
25764 + spin_lock_irqsave(&conf->segment_lock, flags);
25765 + if (sector < conf->start_active)
25766 + conf->cnt_done--;
25767 + else if (sector >= conf->start_future && conf->phase == phase)
25768 + conf->cnt_future--;
25769 + else if (!--conf->cnt_pending)
25770 + wake_up(&conf->wait_ready);
25772 + spin_unlock_irqrestore(&conf->segment_lock, flags);
25775 +static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
25777 + unsigned long flags;
25778 + spin_lock_irqsave(&conf->segment_lock, flags);
25779 + if (sector >= conf->start_ready)
25780 + --conf->cnt_ready;
25781 + else if (sector >= conf->start_active) {
25782 + if (!--conf->cnt_active) {
25783 + conf->start_active = conf->start_ready;
25784 + wake_up(&conf->wait_done);
25787 + spin_unlock_irqrestore(&conf->segment_lock, flags);
25791 + * raid1_end_bh_io() is called when we have finished servicing a mirrored
25792 + * operation and are ready to return a success/failure code to the buffer
25795 +static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
25797 + struct buffer_head *bh = r1_bh->master_bh;
25799 + io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
25800 + test_bit(R1BH_SyncPhase, &r1_bh->state));
25802 + bh->b_end_io(bh, uptodate);
25803 + raid1_free_r1bh(r1_bh);
25806 +void raid1_end_read_request (struct buffer_head *bh, int uptodate)
25808 + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
25809 + evms_cs_volume_request_in_progress(r1_bh->master_bh->b_rdev, -1, NULL);
25811 + set_bit (R1BH_Uptodate, &r1_bh->state);
25812 + raid1_end_bh_io(r1_bh, uptodate);
25814 + evms_md_error_dev(r1_bh->mddev, bh->b_dev);
25815 + LOG_ERROR("rescheduling block %lu\n", bh->b_blocknr);
25816 + raid1_reschedule_retry(r1_bh);
25820 +void raid1_end_write_request (struct buffer_head *bh, int uptodate)
25822 + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
25824 + evms_cs_volume_request_in_progress(r1_bh->master_bh->b_rdev, -1, NULL);
25826 + evms_md_error_dev(r1_bh->mddev, bh->b_dev);
25828 + set_bit (R1BH_Uptodate, &r1_bh->state);
25831 + * Let's see if all mirrored write operations have finished
25834 + if (atomic_dec_and_test(&r1_bh->remaining))
25835 + raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
25839 + * This routine returns the disk from which the requested read should
25840 + * be done. It bookkeeps the last read position for every disk
25841 + * in array and when new read requests come, the disk which last
25842 + * position is nearest to the request, is chosen.
25844 + * TODO: now if there are 2 mirrors in the same 2 devices, performance
25845 + * degrades dramatically because position is mirror, not device based.
25846 + * This should be changed to be device based. Also atomic sequential
25847 + * reads should be somehow balanced.
25850 +static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
25852 + int new_disk = conf->last_used;
25853 + const int sectors = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
25854 + const unsigned long this_sector = bh->b_rsector;
25855 + int disk = new_disk;
25856 + unsigned long new_distance;
25857 + unsigned long current_distance;
25860 + * Check if it is sane at all to balance
25863 + if (conf->resync_mirrors || conf->mddev->recovery_running)
25867 + /* make sure that disk is operational */
25868 + while( !conf->mirrors[new_disk].operational) {
25869 + if (new_disk <= 0) new_disk = conf->raid_disks;
25871 + if (new_disk == disk) {
25873 + * This means no working disk was found
25874 + * Nothing much to do, lets not change anything
25875 + * and hope for the best...
25878 + new_disk = conf->last_used;
25884 + /* now disk == new_disk == starting point for search */
25887 + * Don't touch anything for sequential reads.
25890 + if (this_sector == conf->mirrors[new_disk].head_position)
25894 + * If reads have been done only on a single disk
25895 + * for a time, lets give another disk a chance.
25896 + * This is for kicking those idling disks so that
25897 + * they would find work near some hotspot.
25900 + if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
25901 + conf->sect_count = 0;
25905 + new_disk = conf->raid_disks;
25907 + if (new_disk == disk)
25909 + } while ((conf->mirrors[new_disk].write_only) ||
25910 + (!conf->mirrors[new_disk].operational));
25915 + current_distance = abs(this_sector -
25916 + conf->mirrors[disk].head_position);
25918 + /* Find the disk which is closest */
25922 + disk = conf->raid_disks;
25925 + if ((conf->mirrors[disk].write_only) ||
25926 + (!conf->mirrors[disk].operational))
25929 + new_distance = abs(this_sector -
25930 + conf->mirrors[disk].head_position);
25932 + if (new_distance < current_distance) {
25933 + conf->sect_count = 0;
25934 + current_distance = new_distance;
25937 + } while (disk != conf->last_used);
25940 + conf->mirrors[new_disk].head_position = this_sector + sectors;
25942 + conf->last_used = new_disk;
25943 + conf->sect_count += sectors;
25948 +static void raid1_read(struct evms_logical_node *md_node, struct buffer_head *bh)
25950 + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
25951 + raid1_conf_t *conf = mddev_to_conf(mddev);
25952 + struct mirror_info *mirror;
25953 + struct buffer_head *bh_req;
25954 + struct raid1_bh * r1_bh;
25956 + if (evms_md_check_boundary(md_node, bh)) return;
25958 + if (!buffer_locked(bh))
25961 + r1_bh = raid1_alloc_r1bh (conf);
25963 + spin_lock_irq(&conf->segment_lock);
25964 + wait_event_lock_irq(conf->wait_done,
25965 + bh->b_rsector < conf->start_active ||
25966 + bh->b_rsector >= conf->start_future,
25967 + conf->segment_lock);
25968 + if (bh->b_rsector < conf->start_active)
25969 + conf->cnt_done++;
25971 + conf->cnt_future++;
25973 + set_bit(R1BH_SyncPhase, &r1_bh->state);
25975 + spin_unlock_irq(&conf->segment_lock);
25977 + r1_bh->mddev = mddev;
25978 + r1_bh->cmd = READ;
25979 + r1_bh->master_bh = bh;
25981 + mirror = conf->mirrors + raid1_read_balance(conf, bh);
25983 + bh_req = &r1_bh->bh_req;
25984 + memcpy(bh_req, bh, sizeof(*bh));
25985 + bh_req->b_blocknr = bh->b_rsector;
25986 + bh_req->b_dev = mirror->dev;
25987 + bh_req->b_end_io = raid1_end_read_request;
25988 + bh_req->b_private = r1_bh;
25989 + evms_cs_volume_request_in_progress(bh->b_rdev, 1, NULL);
25990 + R_IO(mirror->node, bh_req);
25993 +static void raid1_write(
25994 + struct evms_logical_node *md_node,
25995 + struct buffer_head *bh)
25997 + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
25998 + raid1_conf_t *conf = mddev_to_conf(mddev);
25999 + struct raid1_bh * r1_bh;
26000 + struct buffer_head *bhl;
26001 + struct buffer_head *mbh;
26004 + if (evms_md_check_boundary(md_node, bh)) return;
26006 + if (!buffer_locked(bh))
26009 + r1_bh = raid1_alloc_r1bh (conf);
26011 + spin_lock_irq(&conf->segment_lock);
26012 + wait_event_lock_irq(conf->wait_done,
26013 + bh->b_rsector < conf->start_active ||
26014 + bh->b_rsector >= conf->start_future,
26015 + conf->segment_lock);
26016 + if (bh->b_rsector < conf->start_active)
26017 + conf->cnt_done++;
26019 + conf->cnt_future++;
26021 + set_bit(R1BH_SyncPhase, &r1_bh->state);
26023 + spin_unlock_irq(&conf->segment_lock);
26026 + * i think the read and write branch should be separated completely,
26027 + * since we want to do read balancing on the read side for example.
26028 + * Alternative implementations? :) --mingo
26031 + r1_bh->mddev = mddev;
26032 + r1_bh->cmd = WRITE;
26033 + r1_bh->master_bh = bh;
26035 + bhl = raid1_alloc_bh(conf, conf->raid_disks);
26037 + for (i=0, sum_bhs=0;
26038 + (sum_bhs < conf->raid_disks) && (i < MD_SB_DISKS);
26040 + if (!conf->mirrors[i].operational)
26044 + * We should use a private pool (size depending on NR_REQUEST),
26045 + * to avoid writes filling up the memory with bhs
26047 + * Such pools are much faster than kmalloc anyways (so we waste
26048 + * almost nothing by not using the master bh when writing and
26049 + * win alot of cleanness) but for now we are cool enough. --mingo
26051 + * It's safe to sleep here, buffer heads cannot be used in a shared
26052 + * manner in the write branch. Look how we lock the buffer at the
26053 + * beginning of this function to grok the difference ;)
26056 + if (mbh == NULL) {
26060 + bhl = mbh->b_next;
26061 + mbh->b_next = NULL;
26062 + mbh->b_this_page = (struct buffer_head *)1;
26065 + * prepare mirrored mbh (fields ordered for max mem throughput):
26067 + mbh->b_blocknr = bh->b_rsector;
26068 + mbh->b_rdev = bh->b_rdev;
26069 + mbh->b_dev = conf->mirrors[i].dev;
26070 + mbh->b_rsector = bh->b_rsector;
26071 + mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
26072 + (1<<BH_Mapped) | (1<<BH_Lock);
26074 + atomic_set(&mbh->b_count, 1);
26075 + mbh->b_size = bh->b_size;
26076 + mbh->b_page = bh->b_page;
26077 + mbh->b_data = bh->b_data;
26078 + mbh->b_list = BUF_LOCKED;
26079 + mbh->b_end_io = raid1_end_write_request;
26080 + mbh->b_private = conf->mirrors[i].node;
26082 + mbh->b_next = r1_bh->mirror_bh_list;
26083 + r1_bh->mirror_bh_list = mbh;
26087 + if (bhl) raid1_free_bh(conf,bhl);
26089 + /* Gag - all mirrors non-operational.. */
26090 + raid1_end_bh_io(r1_bh, 0);
26093 + atomic_set(&r1_bh->remaining, sum_bhs);
26096 + * We have to be a bit careful about the semaphore above, thats
26097 + * why we start the requests separately. Since kmalloc() could
26098 + * fail, sleep and make_request() can sleep too, this is the
26099 + * safer solution. Imagine, end_request decreasing the semaphore
26100 + * before we could have set it up ... We could play tricks with
26101 + * the semaphore (presetting it and correcting at the end if
26102 + * sum_bhs is not 'n' but we have to do end_request by hand if
26103 + * all requests finish until we had a chance to set up the
26104 + * semaphore correctly ... lots of races).
26106 + bhl = r1_bh->mirror_bh_list;
26108 + struct evms_logical_node *node;
26111 + bhl = mbh->b_next;
26112 + node = (struct evms_logical_node *)mbh->b_private;
26113 + mbh->b_private = r1_bh;
26115 + evms_cs_volume_request_in_progress(mbh->b_rdev, 1, NULL);
26121 +static int raid1_status (char *page, mddev_t *mddev)
26123 + raid1_conf_t *conf = mddev_to_conf(mddev);
26126 + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
26127 + conf->working_disks);
26128 + for (i = 0; i < conf->raid_disks; i++)
26129 + sz += sprintf (page+sz, "%s",
26130 + conf->mirrors[i].operational ? "U" : "_");
26131 + sz += sprintf (page+sz, "]");
26135 +#define LAST_DISK KERN_ALERT \
26136 +"EVMS raid1: only one disk left and IO error.\n"
26138 +#define NO_SPARE_DISK KERN_ALERT \
26139 +"EVMS raid1: no spare disk left, degrading mirror level by one.\n"
26141 +#define DISK_FAILED KERN_ALERT \
26142 +"EVMS raid1: Disk failure on %s, disabling device. \n" \
26143 +" Operation continuing on %d devices\n"
26145 +#define START_SYNCING KERN_ALERT \
26146 +"EVMS raid1: start syncing spare disk.\n"
26148 +#define ALREADY_SYNCING KERN_INFO \
26149 +"EVMS raid1: syncing already in progress.\n"
26151 +static void mark_disk_bad (mddev_t *mddev, int failed)
26153 + raid1_conf_t *conf = mddev_to_conf(mddev);
26154 + struct mirror_info *mirror = conf->mirrors+failed;
26155 + mdp_super_t *sb = mddev->sb;
26157 + mirror->operational = 0;
26158 + mark_disk_faulty(sb->disks+mirror->number);
26159 + mark_disk_nonsync(sb->disks+mirror->number);
26160 + mark_disk_inactive(sb->disks+mirror->number);
26161 + if (!mirror->write_only)
26162 + sb->active_disks--;
26163 + sb->working_disks--;
26164 + sb->failed_disks++;
26165 + mddev->sb_dirty = 1;
26166 + evms_cs_wakeup_thread(conf->thread);
26167 + if (!mirror->write_only)
26168 + conf->working_disks--;
26169 + LOG_SERIOUS(DISK_FAILED, evms_md_partition_name(mirror->node),conf->working_disks);
26172 +static int raid1_error (
26174 + struct evms_logical_node *node)
26176 + raid1_conf_t *conf = mddev_to_conf(mddev);
26177 + struct mirror_info * mirrors = conf->mirrors;
26178 + int disks = MD_SB_DISKS;
26181 + /* Find the drive.
26182 + * If it is not operational, then we have already marked it as dead
26183 + * else if it is the last working disks, ignore the error, let the
26184 + * next level up know.
26185 + * else mark the drive as failed
26188 + for (i = 0; i < disks; i++)
26189 + if (mirrors[i].node==node && mirrors[i].operational)
26194 + if (i < conf->raid_disks && conf->working_disks == 1) {
26195 + /* Don't fail the drive, act as though we were just a
26196 + * normal single drive
26201 + mark_disk_bad(mddev, i);
26206 +#undef NO_SPARE_DISK
26207 +#undef DISK_FAILED
26208 +#undef START_SYNCING
26211 +static void print_raid1_conf (raid1_conf_t *conf)
26214 + struct mirror_info *tmp;
26216 + LOG_DEFAULT("RAID1 conf printout:\n");
26218 + LOG_DEFAULT("(conf==NULL)\n");
26221 + LOG_DEFAULT(" --- wd:%d rd:%d nd:%d\n",
26222 + conf->working_disks,conf->raid_disks, conf->nr_disks);
26224 + for (i = 0; i < conf->nr_disks; i++) {
26225 + tmp = conf->mirrors + i;
26226 + LOG_DEFAULT(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
26227 + i, tmp->spare,tmp->operational,
26228 + tmp->number,tmp->raid_disk,tmp->used_slot,
26229 + evms_md_partition_name(tmp->node));
26233 +static void close_sync(raid1_conf_t *conf)
26235 + mddev_t *mddev = conf->mddev;
26236 + /* If reconstruction was interrupted, we need to close the "active" and "pending"
26238 + * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0
26240 + /* this is really needed when recovery stops too... */
26241 + spin_lock_irq(&conf->segment_lock);
26242 + conf->start_active = conf->start_pending;
26243 + conf->start_ready = conf->start_pending;
26244 + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
26245 + conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
26246 + conf->start_future = mddev->sb->size+1;
26247 + conf->cnt_pending = conf->cnt_future;
26248 + conf->cnt_future = 0;
26249 + conf->phase = conf->phase ^1;
26250 + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
26251 + conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
26253 + conf->cnt_future = conf->cnt_done;;
26254 + conf->cnt_done = 0;
26255 + spin_unlock_irq(&conf->segment_lock);
26256 + wake_up(&conf->wait_done);
26259 +static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
26262 + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1;
26263 + raid1_conf_t *conf = mddev->private;
26264 + struct mirror_info *tmp, *sdisk, *fdisk, *rdisk;
26265 + mdp_super_t *sb = mddev->sb;
26266 + mdp_disk_t *failed_desc, *spare_desc;
26267 + mdk_rdev_t *spare_rdev, *failed_rdev;
26269 + print_raid1_conf(conf);
26270 + md_spin_lock_irq(&conf->device_lock);
26272 + * find the disk ...
26276 + case DISKOP_SPARE_ACTIVE:
26279 + * Find the failed disk within the RAID1 configuration ...
26280 + * (this can only be in the first conf->working_disks part)
26282 + for (i = 0; i < conf->raid_disks; i++) {
26283 + tmp = conf->mirrors + i;
26284 + if ((!tmp->operational && !tmp->spare) ||
26285 + !tmp->used_slot) {
26291 + * When we activate a spare disk we _must_ have a disk in
26292 + * the lower (active) part of the array to replace.
26294 +/* if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
26299 + */ /* fall through */
26301 + case DISKOP_HOT_SPARE_ACTIVE:
26302 + case DISKOP_SPARE_WRITE:
26303 + case DISKOP_SPARE_INACTIVE:
26306 + * Find the spare disk ... (can only be in the 'high'
26307 + * area of the array)
26308 + ##### Actually it can be sooner now that we have improved MD #####
26309 + This support required for expanding number of active mirrors.
26311 + for (i = 0; i < MD_SB_DISKS; i++) {
26312 + tmp = conf->mirrors + i;
26313 + if (tmp->spare && tmp->number == (*d)->number) {
26318 + if (spare_disk == -1) {
26325 + case DISKOP_HOT_REMOVE_SPARE:
26327 + for (i = 0; i < MD_SB_DISKS; i++) {
26328 + tmp = conf->mirrors + i;
26329 + if (tmp->used_slot && (tmp->number == (*d)->number)) {
26330 + if (tmp->operational) {
26333 + } else if (!tmp->spare){
26338 + removed_disk = i;
26342 + if (removed_disk == -1) {
26349 + case DISKOP_HOT_REMOVE_DISK:
26350 + if (conf->working_disks <= 1) {
26354 + for (i = 0; i < MD_SB_DISKS; i++) {
26355 + tmp = conf->mirrors + i;
26356 + if (tmp->used_slot && (tmp->number == (*d)->number)) {
26357 + removed_disk = i;
26361 + if (removed_disk == -1) {
26368 + case DISKOP_HOT_ADD_DISK:
26376 + * Switch the spare disk to write-only mode:
26378 + case DISKOP_SPARE_WRITE:
26379 + sdisk = conf->mirrors + spare_disk;
26380 + sdisk->operational = 1;
26381 + sdisk->write_only = 1;
26384 + * Deactivate a spare disk:
26386 + case DISKOP_SPARE_INACTIVE:
26387 + close_sync(conf);
26388 + sdisk = conf->mirrors + spare_disk;
26389 + sdisk->operational = 0;
26390 + sdisk->write_only = 0;
26393 + * Activate (mark read-write) the (now sync) spare disk,
26394 + * which means we switch it's 'raid position' (->raid_disk)
26395 + * with the failed disk. (only the first 'conf->nr_disks'
26396 + * slots are used for 'real' disks and we must preserve this
26399 + case DISKOP_SPARE_ACTIVE:
26400 + close_sync(conf);
26401 + sdisk = conf->mirrors + spare_disk;
26402 + if (failed_disk < 0) {
26403 + // preset failed disk to itself if no failed disk.
26404 + failed_disk = spare_disk;
26405 + // try to find spare earlier in array
26406 + for (i = conf->raid_disks; i < spare_disk; i++) {
26407 + tmp = conf->mirrors + i;
26408 + if ((tmp->spare) || !tmp->used_slot) {
26414 + fdisk = conf->mirrors + failed_disk;
26416 + spare_desc = &sb->disks[sdisk->number];
26417 + failed_desc = &sb->disks[fdisk->number];
26419 + if (spare_desc != *d) {
26425 + if (spare_desc->raid_disk != sdisk->raid_disk) {
26431 + if (sdisk->raid_disk != spare_disk) {
26437 + if (failed_desc->raid_disk != fdisk->raid_disk) {
26443 + if (fdisk->raid_disk != failed_disk) {
26450 + * do the switch finally
26452 + spare_rdev = evms_md_find_rdev_nr(mddev, spare_desc->number);
26453 + failed_rdev = evms_md_find_rdev_nr(mddev, failed_desc->number);
26455 + /* There must be a spare_rdev, but there may not be a
26456 + * failed_rdev. That slot might be empty...
26458 + spare_rdev->desc_nr = failed_desc->number;
26460 + failed_rdev->desc_nr = spare_desc->number;
26462 + xchg_values(*spare_desc, *failed_desc);
26463 + xchg_values(*fdisk, *sdisk);
26466 + * (careful, 'failed' and 'spare' are switched from now on)
26468 + * we want to preserve linear numbering and we want to
26469 + * give the proper raid_disk number to the now activated
26470 + * disk. (this means we switch back these values)
26473 + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
26474 + xchg_values(sdisk->raid_disk, fdisk->raid_disk);
26475 + xchg_values(spare_desc->number, failed_desc->number);
26476 + xchg_values(sdisk->number, fdisk->number);
26478 + *d = failed_desc;
26480 + if (sdisk->dev == MKDEV(0,0))
26481 + sdisk->used_slot = 0;
26483 + * this really activates the spare.
26485 + fdisk->spare = 0;
26486 + fdisk->write_only = 0;
26489 + * if we activate a spare, we definitely replace a
26490 + * non-operational disk slot in the 'low' area of
26491 + * the disk array.
26494 + conf->working_disks++;
26498 + /* Activate a spare disk without a failed disk */
26499 + case DISKOP_HOT_SPARE_ACTIVE:
26500 + sdisk = conf->mirrors + spare_disk;
26501 + sdisk->spare = 0;
26502 + sdisk->write_only = 0;
26503 + conf->working_disks++;
26504 + conf->raid_disks++;
26505 + if (raid1_grow_bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS)
26506 + LOG_WARNING("%s: Cannot grow BH pool\n", __FUNCTION__);
26509 + case DISKOP_HOT_REMOVE_SPARE:
26510 + rdisk = conf->mirrors + removed_disk;
26512 + if (removed_disk < conf->raid_disks) {
26518 + LOG_WARNING("%s: removing spare %s, [md%d] nr_disks=%d\n",
26519 + __FUNCTION__, evms_md_partition_name(rdisk->node),
26520 + conf->mddev->__minor, conf->nr_disks-1);
26522 + rdisk->dev = MKDEV(0,0);
26523 + rdisk->node = NULL;
26524 + rdisk->used_slot = 0;
26525 + conf->nr_disks--;
26528 + case DISKOP_HOT_REMOVE_DISK:
26529 + rdisk = conf->mirrors + removed_disk;
26531 + LOG_WARNING("%s: removing active disk %s, [md%d] nr_disks=%d\n",
26532 + __FUNCTION__, evms_md_partition_name(rdisk->node),
26533 + conf->mddev->__minor, conf->nr_disks-1);
26535 + rdisk->dev = MKDEV(0,0);
26536 + rdisk->node = NULL;
26537 + rdisk->used_slot = 0;
26538 + rdisk->operational = 0;
26539 + conf->working_disks--;
26540 + conf->nr_disks--;
26541 + sb->raid_disks--; //decrement raid disks. md_core now increments
26542 + //when activating new spare, don't assume add spare here
26550 + md_spin_unlock_irq(&conf->device_lock);
26551 + if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
26552 + /* should move to "END_REBUILD" when such exists */
26553 + raid1_shrink_buffers(conf);
26555 + print_raid1_conf(conf);
26560 +#define IO_ERROR KERN_ALERT \
26561 +"EVMS raid1: %s: unrecoverable I/O read error for block %lu\n"
26563 +#define REDIRECT_SECTOR KERN_ERR \
26564 +"EVMS raid1: %s: redirecting sector %lu to another mirror\n"
26567 + * This is a kernel thread which:
26569 + * 1. Retries failed read operations on working mirrors.
26570 + * 2. Updates the raid superblock when problems encounter.
26571 + * 3. Performs writes following reads for array syncronising.
26573 +static void end_sync_write(struct buffer_head *bh, int uptodate);
26574 +static void end_sync_read(struct buffer_head *bh, int uptodate);
26576 +static void raid1d (void *data)
26578 + struct raid1_bh *r1_bh;
26579 + struct buffer_head *bh;
26580 + unsigned long flags;
26582 + mdk_rdev_t *rdev;
26584 + struct evms_logical_node *node;
26585 + raid1_conf_t *conf = (raid1_conf_t *) data;
26588 + mddev = conf->mddev;
26589 + if (mddev->sb_dirty) {
26590 + LOG_DEFAULT("EVMS raid1: dirty sb detected, updating.\n");
26591 + mddev->sb_dirty = 0;
26592 + evms_md_update_sb(mddev);
26594 + md_spin_lock_irqsave(&retry_list_lock, flags);
26595 + r1_bh = evms_raid1_retry_list;
26598 + evms_raid1_retry_list = r1_bh->next_r1;
26599 + md_spin_unlock_irqrestore(&retry_list_lock, flags);
26601 + mddev = r1_bh->mddev;
26602 + bh = &r1_bh->bh_req;
26603 + switch(r1_bh->cmd) {
26605 + /* have to allocate lots of bh structures and
26606 + * schedule writes
26608 + if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
26609 + int i, sum_bhs = 0;
26610 + int disks = MD_SB_DISKS;
26611 + struct buffer_head *bhl, *mbh;
26613 + conf = mddev_to_conf(mddev);
26614 + bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
26615 + for (i = 0; i < disks ; i++) {
26616 + if (!conf->mirrors[i].operational)
26618 + if (i==conf->last_used)
26619 + /* we read from here, no need to write */
26621 + if (i < conf->raid_disks
26622 + && !conf->resync_mirrors
26623 + && !conf->mirrors[i].write_only)
26624 + /* don't need to write this,
26625 + * we are just rebuilding */
26632 + bhl = mbh->b_next;
26633 + mbh->b_this_page = (struct buffer_head *)1;
26637 + * prepare mirrored bh (fields ordered for max mem throughput):
26639 + mbh->b_blocknr = bh->b_blocknr;
26640 + mbh->b_dev = conf->mirrors[i].dev;
26641 + mbh->b_rsector = bh->b_blocknr;
26642 + mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
26643 + (1<<BH_Mapped) | (1<<BH_Lock);
26644 + atomic_set(&mbh->b_count, 1);
26645 + mbh->b_size = bh->b_size;
26646 + mbh->b_page = bh->b_page;
26647 + mbh->b_data = bh->b_data;
26648 + mbh->b_list = BUF_LOCKED;
26649 + mbh->b_end_io = end_sync_write;
26650 + mbh->b_private = conf->mirrors[i].node;
26652 + mbh->b_next = r1_bh->mirror_bh_list;
26653 + r1_bh->mirror_bh_list = mbh;
26657 + atomic_set(&r1_bh->remaining, sum_bhs);
26658 + if (bhl) raid1_free_bh(conf, bhl);
26659 + mbh = r1_bh->mirror_bh_list;
26662 + /* nowhere to write this too... I guess we
26665 + sync_request_done(bh->b_blocknr, conf);
26666 + evms_md_done_sync(mddev,
26667 + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT,
26669 + raid1_free_buf(r1_bh);
26673 + node = (struct evms_logical_node *)mbh->b_private;
26674 + mbh->b_private = r1_bh;
26677 + evms_md_sync_acct(mbh->b_dev,
26678 + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT);
26679 + mbh = mbh->b_next;
26683 + /* There is no point trying a read-for-reconstruct
26684 + * as reconstruct is about to be aborted
26686 + rdev = evms_md_find_rdev(mddev,bh->b_dev);
26688 + LOG_ERROR(IO_ERROR,
26689 + evms_md_partition_name(rdev->node),
26691 + evms_md_done_sync(mddev, bh->b_size>>EVMS_VSECTOR_SIZE_SHIFT, 0);
26699 + evms_raid1_map(mddev, &node, &bh->b_dev);
26700 + if (bh->b_dev == dev) {
26701 + rdev = evms_md_find_rdev(mddev,dev);
26703 + LOG_ERROR(" unrecoverable read error on %s at LBA(%lu)\n",
26704 + evms_md_partition_name(rdev->node),
26705 + r1_bh->master_bh->b_rsector);
26706 + raid1_end_bh_io(r1_bh, 0);
26708 + /* retry I/O on new device */
26709 + bh->b_rdev = r1_bh->master_bh->b_rdev;
26710 + bh->b_rsector = bh->b_blocknr;
26711 + evms_cs_volume_request_in_progress(r1_bh->master_bh->b_rdev, 1, NULL);
26717 + md_spin_unlock_irqrestore(&retry_list_lock, flags);
26720 +#undef REDIRECT_SECTOR
26723 + * Private kernel thread to reconstruct mirrors after an unclean
26726 +static void raid1syncd (void *data)
26728 + raid1_conf_t *conf = data;
26729 + mddev_t *mddev = conf->mddev;
26731 + if (!conf->resync_mirrors)
26733 + if (conf->resync_mirrors == 2)
26735 + down(&mddev->recovery_sem);
26736 + if (!evms_md_do_sync(mddev, NULL)) {
26738 + * Only if everything went Ok.
26740 + conf->resync_mirrors = 0;
26743 + close_sync(conf);
26745 + up(&mddev->recovery_sem);
26746 + raid1_shrink_buffers(conf);
26750 + * perform a "sync" on one "block"
26752 + * We need to make sure that no normal I/O request - particularly write
26753 + * requests - conflict with active sync requests.
26754 + * This is achieved by conceptually dividing the device space into a
26755 + * number of sections:
26756 + * DONE: 0 .. a-1 These blocks are in-sync
26757 + * ACTIVE: a.. b-1 These blocks may have active sync requests, but
26758 + * no normal IO requests
26759 + * READY: b .. c-1 These blocks have no normal IO requests - sync
26760 + * request may be happening
26761 + * PENDING: c .. d-1 These blocks may have IO requests, but no new
26762 + * ones will be added
26763 + * FUTURE: d .. end These blocks are not to be considered yet. IO may
26764 + * be happening, but not sync
26767 + * phase which flips (0 or 1) each time d moves and
26769 + * z = active io requests in FUTURE since d moved - marked with
26771 + * y = active io requests in FUTURE before d moved, or PENDING -
26772 + * marked with previous phase
26773 + * x = active sync requests in READY
26774 + * w = active sync requests in ACTIVE
26775 + * v = active io requests in DONE
26777 + * Normally, a=b=c=d=0 and z= active io requests
26778 + * or a=b=c=d=END and v= active io requests
26779 + * Allowed changes to a,b,c,d:
26780 + * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase
26782 + * C: b=c, w+=x, x=0
26784 + * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
26786 + * At start of sync we apply A.
26787 + * When y reaches 0, we apply B then A then being sync requests
26788 + * When sync point reaches c-1, we wait for y==0, and W==0, and
26789 + * then apply apply B then A then D then C.
26790 + * Finally, we apply E
26792 + * The sync request simply issues a "read" against a working drive
26793 + * This is marked so that on completion the raid1d thread is woken to
26794 + * issue suitable write requests
26797 +static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
26799 + raid1_conf_t *conf = mddev_to_conf(mddev);
26800 + struct mirror_info *mirror;
26801 + struct raid1_bh *r1_bh;
26802 + struct buffer_head *bh;
26807 + spin_lock_irq(&conf->segment_lock);
26808 + if (!sector_nr) {
26809 + /* initialize ...*/
26811 + conf->start_active = 0;
26812 + conf->start_ready = 0;
26813 + conf->start_pending = 0;
26814 + conf->start_future = 0;
26816 + /* we want enough buffers to hold twice the window of 128*/
26817 + buffs = 128 *2 / (PAGE_SIZE>>9);
26818 + buffs = raid1_grow_buffers(conf, buffs);
26822 + conf->window = buffs*(PAGE_SIZE>>9)/2;
26823 + conf->cnt_future += conf->cnt_done+conf->cnt_pending;
26824 + conf->cnt_done = conf->cnt_pending = 0;
26825 + if (conf->cnt_ready || conf->cnt_active)
26828 + while (sector_nr >= conf->start_pending) {
26829 + PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
26830 + sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
26831 + conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
26832 + wait_event_lock_irq(conf->wait_done,
26833 + !conf->cnt_active,
26834 + conf->segment_lock);
26835 + wait_event_lock_irq(conf->wait_ready,
26836 + !conf->cnt_pending,
26837 + conf->segment_lock);
26838 + conf->start_active = conf->start_ready;
26839 + conf->start_ready = conf->start_pending;
26840 + conf->start_pending = conf->start_future;
26841 + conf->start_future = conf->start_future+conf->window;
26842 + // Note: falling off the end is not a problem
26843 + conf->phase = conf->phase ^1;
26844 + conf->cnt_active = conf->cnt_ready;
26845 + conf->cnt_ready = 0;
26846 + conf->cnt_pending = conf->cnt_future;
26847 + conf->cnt_future = 0;
26848 + wake_up(&conf->wait_done);
26850 + conf->cnt_ready++;
26851 + spin_unlock_irq(&conf->segment_lock);
26854 + /* If reconstructing, and >1 working disc,
26855 + * could dedicate one to rebuild and others to
26856 + * service read requests ..
26858 + disk = conf->last_used;
26859 + /* make sure disk is operational */
26860 + while (!conf->mirrors[disk].operational) {
26861 + if (disk <= 0) disk = conf->raid_disks;
26863 + if (disk == conf->last_used)
26866 + conf->last_used = disk;
26868 + mirror = conf->mirrors+conf->last_used;
26870 + r1_bh = raid1_alloc_buf (conf);
26871 + r1_bh->mddev = mddev;
26872 + r1_bh->cmd = SPECIAL;
26873 + bh = &r1_bh->bh_req;
26875 + block_nr = sector_nr;
26877 + while (!(block_nr & 1) && bsize < PAGE_SIZE
26878 + && (block_nr+2)*(bsize>>9) <= (mddev->sb->size *2)) {
26882 + bh->b_size = bsize;
26883 + bh->b_list = BUF_LOCKED;
26884 + bh->b_dev = mirror->dev;
26885 + bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
26890 + if (bh->b_data != page_address(bh->b_page))
26892 + bh->b_end_io = end_sync_read;
26893 + bh->b_private = r1_bh;
26894 + bh->b_blocknr = sector_nr;
26895 + bh->b_rsector = sector_nr;
26896 + init_waitqueue_head(&bh->b_wait);
26898 + R_IO(mirror->node, bh);
26899 + evms_md_sync_acct(bh->b_dev, bsize/512);
26901 + return (bsize >> 9);
26904 + raid1_shrink_buffers(conf);
26905 + spin_unlock_irq(&conf->segment_lock);
26909 +static void end_sync_read(struct buffer_head *bh, int uptodate)
26911 + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
26913 + /* we have read a block, now it needs to be re-written,
26914 + * or re-read if the read failed.
26915 + * We don't do much here, just schedule handling by raid1d
26918 + evms_md_error_dev(r1_bh->mddev, bh->b_dev);
26920 + set_bit(R1BH_Uptodate, &r1_bh->state);
26921 + raid1_reschedule_retry(r1_bh);
26924 +static void end_sync_write(struct buffer_head *bh, int uptodate)
26926 + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
26929 + evms_md_error_dev(r1_bh->mddev, bh->b_dev);
26930 + if (atomic_dec_and_test(&r1_bh->remaining)) {
26931 + mddev_t *mddev = r1_bh->mddev;
26932 + unsigned long sect = bh->b_blocknr;
26933 + int size = bh->b_size;
26935 + raid1_free_buf(r1_bh);
26936 + sync_request_done(sect, mddev_to_conf(mddev));
26937 + evms_md_done_sync(mddev, size>>EVMS_VSECTOR_SIZE_SHIFT, uptodate);
26941 +#define INVALID_LEVEL KERN_WARNING \
26942 +"EVMS raid1: md%d: raid level not set to mirroring (%d)\n"
26944 +#define NO_SB KERN_ERR \
26945 +"EVMS raid1: disabled mirror %s (couldn't access raid superblock)\n"
26947 +#define ERRORS KERN_ERR \
26948 +"EVMS raid1: disabled mirror %s (errors detected)\n"
26950 +#define NOT_IN_SYNC KERN_ERR \
26951 +"EVMS raid1: disabled mirror %s (not in sync)\n"
26953 +#define INCONSISTENT KERN_ERR \
26954 +"EVMS raid1: disabled mirror %s (inconsistent descriptor)\n"
26956 +#define ALREADY_RUNNING KERN_ERR \
26957 +"EVMS raid1: disabled mirror %s (mirror %d already operational)\n"
26959 +#define OPERATIONAL KERN_INFO \
26960 +"EVMS raid1: device %s operational as mirror %d\n"
26962 +#define MEM_ERROR KERN_ERR \
26963 +"EVMS raid1: couldn't allocate memory for md%d\n"
26965 +#define SPARE KERN_INFO \
26966 +"EVMS raid1: spare disk %s\n"
26968 +#define NONE_OPERATIONAL KERN_ERR \
26969 +"EVMS raid1: no operational mirrors for md%d\n"
26971 +#define ARRAY_IS_ACTIVE KERN_INFO \
26972 +"EVMS raid1: raid set md%d active with %d out of %d mirrors\n"
26974 +#define THREAD_ERROR KERN_ERR \
26975 +"EVMS raid1: couldn't allocate thread for md%d\n"
26977 +#define START_RESYNC KERN_WARNING \
26978 +"EVMS raid1: raid set md%d not clean; reconstructing mirrors\n"
26980 +static int raid1_run (mddev_t *mddev)
26982 + raid1_conf_t *conf;
26983 + int i, j, disk_idx;
26984 + struct mirror_info *disk;
26985 + mdp_super_t *sb = mddev->sb;
26986 + mdp_disk_t *descriptor;
26987 + mdk_rdev_t *rdev;
26988 + struct md_list_head *tmp;
26989 + int start_recovery = 0;
26991 + MOD_INC_USE_COUNT;
26993 + LOG_EXTRA("%s ENTRY\n", __FUNCTION__);
26994 + if (sb->level != 1) {
26995 + LOG_ERROR(INVALID_LEVEL, mdidx(mddev), sb->level);
26999 + * copy the already verified devices into our private RAID1
27000 + * bookkeeping area. [whatever we allocate in raid1_run(),
27001 + * should be freed in raid1_stop()]
27004 + conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
27005 + mddev->private = conf;
27007 + LOG_ERROR(MEM_ERROR, mdidx(mddev));
27010 + memset(conf, 0, sizeof(*conf));
27012 + ITERATE_RDEV(mddev,rdev,tmp) {
27013 + if (rdev->faulty) {
27014 + LOG_ERROR(ERRORS, evms_md_partition_name(rdev->node));
27021 + if (rdev->desc_nr == -1) {
27025 + descriptor = &sb->disks[rdev->desc_nr];
27026 + disk_idx = descriptor->raid_disk;
27027 + disk = conf->mirrors + disk_idx;
27029 + if (disk_faulty(descriptor)) {
27030 + disk->number = descriptor->number;
27031 + disk->raid_disk = disk_idx;
27032 + disk->node = rdev->node;
27033 + disk->dev = rdev->dev;
27034 + disk->sect_limit = MAX_WORK_PER_DISK;
27035 + disk->operational = 0;
27036 + disk->write_only = 0;
27038 + disk->used_slot = 1;
27039 + disk->head_position = 0;
27042 + if (disk_active(descriptor)) {
27043 + if (!disk_sync(descriptor)) {
27044 + LOG_ERROR(NOT_IN_SYNC, evms_md_partition_name(rdev->node));
27047 + if ((descriptor->number > MD_SB_DISKS) ||
27048 + (disk_idx > sb->raid_disks)) {
27050 + LOG_ERROR(INCONSISTENT,evms_md_partition_name(rdev->node));
27053 + if (disk->operational) {
27054 + LOG_ERROR(ALREADY_RUNNING, evms_md_partition_name(rdev->node), disk_idx);
27057 + LOG_DEFAULT(OPERATIONAL, evms_md_partition_name(rdev->node), disk_idx);
27058 + disk->number = descriptor->number;
27059 + disk->raid_disk = disk_idx;
27060 + disk->node = rdev->node;
27061 + disk->dev = rdev->dev;
27062 + disk->sect_limit = MAX_WORK_PER_DISK;
27063 + disk->operational = 1;
27064 + disk->write_only = 0;
27066 + disk->used_slot = 1;
27067 + disk->head_position = 0;
27068 + conf->working_disks++;
27071 + * Must be a spare disk ..
27073 + LOG_DEFAULT(SPARE, evms_md_partition_name(rdev->node));
27074 + disk->number = descriptor->number;
27075 + disk->raid_disk = disk_idx;
27076 + disk->node = rdev->node;
27077 + disk->dev = rdev->dev;
27078 + disk->sect_limit = MAX_WORK_PER_DISK;
27079 + disk->operational = 0;
27080 + disk->write_only = 0;
27082 + disk->used_slot = 1;
27083 + disk->head_position = 0;
27086 + conf->raid_disks = sb->raid_disks;
27087 + conf->nr_disks = sb->nr_disks;
27088 + conf->mddev = mddev;
27089 + conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
27091 + conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
27092 + init_waitqueue_head(&conf->wait_buffer);
27093 + init_waitqueue_head(&conf->wait_done);
27094 + init_waitqueue_head(&conf->wait_ready);
27096 + if (!conf->working_disks) {
27097 + LOG_ERROR(NONE_OPERATIONAL, mdidx(mddev));
27098 + goto out_free_conf;
27102 + /* pre-allocate some buffer_head structures.
27103 + * As a minimum, 1 r1bh and raid_disks buffer_heads
27104 + * would probably get us by in tight memory situations,
27105 + * but a few more is probably a good idea.
27106 + * For now, try NR_RESERVED_BUFS r1bh and
27107 + * NR_RESERVED_BUFS*raid_disks bufferheads
27108 + * This will allow at least NR_RESERVED_BUFS concurrent
27109 + * reads or writes even if kmalloc starts failing
27111 + if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
27112 + raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
27113 + < NR_RESERVED_BUFS*conf->raid_disks) {
27114 + LOG_ERROR(MEM_ERROR, mdidx(mddev));
27115 + goto out_free_conf;
27118 + for (i = 0; i < MD_SB_DISKS; i++) {
27120 + descriptor = sb->disks+i;
27121 + disk_idx = descriptor->raid_disk;
27122 + disk = conf->mirrors + disk_idx;
27124 + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
27125 + !disk->used_slot) {
27127 + disk->number = descriptor->number;
27128 + disk->raid_disk = disk_idx;
27129 + disk->dev = MKDEV(0,0);
27131 + disk->operational = 0;
27132 + disk->write_only = 0;
27134 + disk->used_slot = 1;
27135 + disk->head_position = 0;
27140 + * find the first working one and use it as a starting point
27141 + * to read balancing.
27143 + for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++)
27145 + conf->last_used = j;
27148 + if (conf->working_disks != sb->raid_disks) {
27149 + LOG_SERIOUS(" md%d, not all disks are operational -- trying to recover array\n",
27151 + start_recovery = 1;
27155 + const char * name = "evms_raid1d";
27157 + conf->thread = evms_cs_register_thread(raid1d, conf, name);
27158 + if (!conf->thread) {
27159 + LOG_ERROR(THREAD_ERROR, mdidx(mddev));
27160 + goto out_free_conf;
27164 + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
27165 + (conf->working_disks > 1)) {
27166 + const char * name = "evms_raid1syncd";
27168 + conf->resync_thread = evms_cs_register_thread(raid1syncd, conf,name);
27169 + if (!conf->resync_thread) {
27170 + LOG_ERROR(THREAD_ERROR, mdidx(mddev));
27171 + goto out_free_conf;
27174 + LOG_WARNING(START_RESYNC, mdidx(mddev));
27175 + conf->resync_mirrors = 1;
27176 + evms_cs_wakeup_thread(conf->resync_thread);
27180 + * Regenerate the "device is in sync with the raid set" bit for
27183 + for (i = 0; i < MD_SB_DISKS; i++) {
27184 + mark_disk_nonsync(sb->disks+i);
27185 + for (j = 0; j < sb->raid_disks; j++) {
27186 + if (!conf->mirrors[j].operational)
27188 + if (sb->disks[i].number == conf->mirrors[j].number)
27189 + mark_disk_sync(sb->disks+i);
27192 + sb->active_disks = conf->working_disks;
27194 + if (start_recovery)
27195 + evms_md_recover_arrays();
27198 + LOG_DEFAULT(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
27200 + * Ok, everything is just fine now
27205 + raid1_shrink_r1bh(conf);
27206 + raid1_shrink_bh(conf);
27207 + raid1_shrink_buffers(conf);
27209 + mddev->private = NULL;
27211 + MOD_DEC_USE_COUNT;
27215 +#undef INVALID_LEVEL
27218 +#undef NOT_IN_SYNC
27219 +#undef INCONSISTENT
27220 +#undef ALREADY_RUNNING
27221 +#undef OPERATIONAL
27223 +#undef NONE_OPERATIONAL
27224 +#undef ARRAY_IS_ACTIVE
+/* raid1_stop_resync: request that an in-progress mirror resync stop.
+ * When a resync thread exists and a resync is active (resync_mirrors set),
+ * resync_mirrors is moved to state 2 ("interrupted, restart next time")
+ * and the resync thread is interrupted.
+ * NOTE(review): the return statement and closing braces of this function
+ * are elided from this patch hunk; return semantics not confirmable here. */
27226 +static int raid1_stop_resync (mddev_t *mddev)
27228 + raid1_conf_t *conf = mddev_to_conf(mddev);
27230 + LOG_EXTRA("%s ENTRY\n", __FUNCTION__);
27231 + if (conf->resync_thread) {
27232 + if (conf->resync_mirrors) {
+ /* 2 == resync was interrupted; raid1_restart_resync picks this up */
27233 + conf->resync_mirrors = 2;
27234 + evms_cs_interrupt_thread(conf->resync_thread);
27235 + LOG_WARNING(" mirror resync was not fully finished, restarting next time.\n");
+/* raid1_restart_resync: resume a previously interrupted mirror resync.
+ * Acts only when resync_mirrors is set: the flag is reset to 1 (active)
+ * and the resync thread is woken.
+ * NOTE(review): the body of the !resync_thread branch and the return
+ * statements are elided from this patch hunk. */
27243 +static int raid1_restart_resync (mddev_t *mddev)
27245 + raid1_conf_t *conf = mddev_to_conf(mddev);
27247 + LOG_EXTRA("%s ENTRY\n", __FUNCTION__);
27248 + if (conf->resync_mirrors) {
27249 + if (!conf->resync_thread) {
27253 + conf->resync_mirrors = 1;
27254 + evms_cs_wakeup_thread(conf->resync_thread);
+/* raid1_stop: tear down the raid1 personality instance for this mddev.
+ * Unregisters the worker thread and (if present) the resync thread,
+ * releases the pre-allocated r1bh / buffer_head / buffer pools, detaches
+ * the private configuration and drops the module reference.
+ * (Freeing of conf itself and the return are on elided lines.) */
27260 +static int raid1_stop (mddev_t *mddev)
27262 + raid1_conf_t *conf = mddev_to_conf(mddev);
27264 + LOG_EXTRA("%s ENTRY\n", __FUNCTION__);
27265 + evms_cs_unregister_thread(conf->thread);
27266 + if (conf->resync_thread)
27267 + evms_cs_unregister_thread(conf->resync_thread);
27268 + raid1_shrink_r1bh(conf);
27269 + raid1_shrink_bh(conf);
27270 + raid1_shrink_buffers(conf);
27272 + mddev->private = NULL;
27273 + MOD_DEC_USE_COUNT;
+/* raid1_evms_ioctl: EVMS ioctl entry point for the raid1 personality.
+ * For EVMS_GET_BMAP the request is forwarded to the underlying node of
+ * the first operational mirror via the IOCTL() helper.
+ * NOTE(review): the first parameter (presumably the mddev, used by
+ * mddev_to_conf below), the switch statement header, default case and
+ * return are all on lines elided from this hunk. */
27277 +static int raid1_evms_ioctl (
27279 + struct inode * inode,
27280 + struct file * file,
27281 + unsigned int cmd,
27282 + unsigned long arg)
27285 + struct evms_logical_node *node = NULL;
27286 + raid1_conf_t *conf = mddev_to_conf(mddev);
27289 + case EVMS_GET_BMAP:
+ /* pick the first operational mirror to answer the bmap query */
27291 + for (i = 0; i < MD_SB_DISKS; i++) {
27292 + if (conf->mirrors[i].operational) {
27293 + node = conf->mirrors[i].node;
27299 + rc = IOCTL(node, inode, file, cmd, arg);
+/* Personality operations vector registered with the EVMS MD core:
+ * maps the generic md entry points onto the raid1 implementations in
+ * this file.  (Closing brace of the initializer is elided here.) */
27312 +static mdk_personality_t raid1_personality = {
27313 + .name = "evms_raid1",
27314 + .read = raid1_read,
27315 + .write = raid1_write,
27316 + .run = raid1_run,
27317 + .stop = raid1_stop,
27318 + .status = raid1_status,
27319 + .error_handler = raid1_error,
27320 + .diskop = raid1_diskop,
27321 + .stop_resync = raid1_stop_resync,
27322 + .restart_resync = raid1_restart_resync,
27323 + .sync_request = raid1_sync_request,
27324 + .evms_ioctl = raid1_evms_ioctl
+/* Module init: register the raid1 personality with the EVMS MD core. */
27327 +static int md__init raid1_init (void)
27329 + return evms_register_md_personality (RAID1, &raid1_personality);
+/* Module exit: unregister the raid1 personality. */
27332 +static void raid1_exit (void)
27334 + evms_unregister_md_personality (RAID1);
+/* Module entry/exit hooks.  MODULE_LICENSE is guarded so the file also
+ * builds on kernels that predate the macro (#endif elided from hunk). */
27337 +module_init(raid1_init);
27338 +module_exit(raid1_exit);
27339 +#ifdef MODULE_LICENSE
27340 +MODULE_LICENSE("GPL");
27342 diff -Naur linux-2002-09-30/drivers/evms/md_raid5.c evms-2002-09-30/drivers/evms/md_raid5.c
27343 --- linux-2002-09-30/drivers/evms/md_raid5.c Wed Dec 31 18:00:00 1969
27344 +++ evms-2002-09-30/drivers/evms/md_raid5.c Thu Sep 26 14:40:58 2002
27347 + * md_raid5.c : Multiple Devices driver for Linux
27348 + * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
27349 + * Copyright (C) 1999, 2000 Ingo Molnar
27351 + * RAID-5 management functions.
27353 + * 'md_raid5.c' is an EVMS version of linux/drivers/md/raid5.c modified
27354 + * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
27356 + * This program is free software; you can redistribute it and/or modify
27357 + * it under the terms of the GNU General Public License as published by
27358 + * the Free Software Foundation; either version 2, or (at your option)
27359 + * any later version.
27361 + * You should have received a copy of the GNU General Public License
27362 + * (for example /usr/src/linux/COPYING); if not, write to the Free
27363 + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27366 +#include <linux/config.h>
27367 +#include <linux/module.h>
27368 +#include <linux/locks.h>
27369 +#include <linux/slab.h>
27370 +#include <linux/evms/evms_raid5.h>
27371 +#include <asm/bitops.h>
27372 +#include <asm/atomic.h>
27374 +#define LOG_PREFIX "md raid5: "
27376 +static mdk_personality_t raid5_personality;
+/* Stripe-cache sizing and hash-table parameters.
+ * NR_STRIPES: number of pre-allocated stripe_heads in the cache.
+ * stripe_hash() indexes the hash table by stripe number, i.e. the
+ * sector divided by the stripe buffer size in 512-byte sectors.
+ * CHECK_DEVLOCK (SMP + paranoia builds) asserts device_lock is held. */
27382 +#define NR_STRIPES 256
27383 +#define IO_THRESHOLD 1
27384 +#define HASH_PAGES 1
27385 +#define HASH_PAGES_ORDER 0
27386 +#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
27387 +#define HASH_MASK (NR_HASH - 1)
27388 +#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
27391 + * The following can be used to debug the driver
27393 +#define RAID5_DEBUG 0
27394 +#define RAID5_PARANOIA 1
27395 +#if RAID5_PARANOIA && CONFIG_SMP
27396 +#define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
27398 +#define CHECK_DEVLOCK()
27401 +static void print_raid5_conf(raid5_conf_t * conf);
+/* __release_stripe: drop one reference on a stripe; caller must hold
+ * conf->device_lock.  On the final put the stripe is routed by its state
+ * bits: STRIPE_HANDLE stripes go to the delayed or handle list (waking
+ * the raid5 daemon); otherwise the stripe becomes inactive, the preread
+ * accounting is unwound, and wait_for_stripe sleepers are woken when the
+ * pool is sufficiently refilled.  NOTE(review): the BUG()/else lines of
+ * the original are elided from this hunk. */
27403 +static inline void
27404 +__release_stripe(raid5_conf_t * conf, struct stripe_head *sh)
27406 + if (atomic_dec_and_test(&sh->count)) {
27407 + if (!list_empty(&sh->lru))
27409 + if (atomic_read(&conf->active_stripes) == 0)
27411 + if (test_bit(STRIPE_HANDLE, &sh->state)) {
27412 + if (test_bit(STRIPE_DELAYED, &sh->state))
27413 + list_add_tail(&sh->lru, &conf->delayed_list);
27415 + list_add_tail(&sh->lru, &conf->handle_list);
27416 + evms_cs_wakeup_thread(conf->thread);
27418 + if (test_and_clear_bit
27419 + (STRIPE_PREREAD_ACTIVE, &sh->state)) {
27420 + atomic_dec(&conf->preread_active_stripes);
27421 + if (atomic_read(&conf->preread_active_stripes) <
27423 + evms_cs_wakeup_thread(conf->thread);
27425 + list_add_tail(&sh->lru, &conf->inactive_list);
27426 + atomic_dec(&conf->active_stripes);
27427 + if (!conf->inactive_blocked ||
27428 + atomic_read(&conf->active_stripes) <
+ /* 3/4 threshold matches the wait in get_active_stripe() */
27429 + (NR_STRIPES * 3 / 4))
27430 + wake_up(&conf->wait_for_stripe);
+/* release_stripe: locked wrapper around __release_stripe(); takes
+ * conf->device_lock with interrupts disabled for the duration.
+ * (Return type line and closing brace elided from this hunk.) */
27435 +release_stripe(struct stripe_head *sh)
27437 + raid5_conf_t *conf = sh->raid_conf;
27438 + unsigned long flags;
27440 + spin_lock_irqsave(&conf->device_lock, flags);
27441 + __release_stripe(conf, sh);
27442 + spin_unlock_irqrestore(&conf->device_lock, flags);
+/* remove_hash: unlink a stripe from its hash chain if it is hashed.
+ * Classic pprev-style doubly linked hash list: hash_pprev == NULL marks
+ * an unhashed stripe.  (Signature/brace lines elided from this hunk.) */
27446 +remove_hash(struct stripe_head *sh)
27449 + if (sh->hash_pprev) {
27450 + if (sh->hash_next)
27451 + sh->hash_next->hash_pprev = sh->hash_pprev;
27452 + *sh->hash_pprev = sh->hash_next;
+ /* mark as unhashed */
27453 + sh->hash_pprev = NULL;
+/* insert_hash: push a stripe at the head of the hash chain selected by
+ * stripe_hash(conf, sh->sector). */
27457 +static __inline__ void
27458 +insert_hash(raid5_conf_t * conf, struct stripe_head *sh)
27460 + struct stripe_head **shp = &stripe_hash(conf, sh->sector);
27463 + if ((sh->hash_next = *shp) != NULL)
27464 + (*shp)->hash_pprev = &sh->hash_next;
27466 + sh->hash_pprev = shp;
27469 +/* find an idle stripe, make sure it is unhashed, and return it. */
+/* Pops the head of conf->inactive_list and bumps active_stripes.
+ * NOTE(review): callers visible in this file take conf->device_lock
+ * around this call; the unhash step and NULL return path are on elided
+ * lines of this hunk. */
27470 +static struct stripe_head *
27471 +get_free_stripe(raid5_conf_t * conf)
27473 + struct stripe_head *sh = NULL;
27474 + struct list_head *first;
27477 + if (list_empty(&conf->inactive_list))
27479 + first = conf->inactive_list.next;
27480 + sh = list_entry(first, struct stripe_head, lru);
27481 + list_del_init(first);
27483 + atomic_inc(&conf->active_stripes);
+/* shrink_buffers: release the first `num` cached buffer_heads of a
+ * stripe along with their data pages.  (The kfree of the buffer_head
+ * itself and the loop's closing brace are on elided lines.) */
27489 +shrink_buffers(struct stripe_head *sh, int num)
27491 + struct buffer_head *bh;
27494 + for (i = 0; i < num; i++) {
27495 + bh = sh->bh_cache[i];
27498 + sh->bh_cache[i] = NULL;
27499 + free_page((unsigned long) bh->b_data);
+/* grow_buffers: allocate `num` buffer_heads, each with one data page,
+ * into the stripe's bh_cache.  `priority` is the allocation flags
+ * (GFP mask) passed through to kmalloc/alloc_page.
+ * NOTE(review): the allocation-failure paths and the return statements
+ * are on lines elided from this hunk. */
27505 +grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
27507 + struct buffer_head *bh;
27510 + for (i = 0; i < num; i++) {
27511 + struct page *page;
27512 + bh = kmalloc(sizeof (struct buffer_head), priority);
27515 + memset(bh, 0, sizeof (struct buffer_head));
27516 + init_waitqueue_head(&bh->b_wait);
27517 + if ((page = alloc_page(priority)))
27518 + bh->b_data = page_address(page);
27523 + bh->b_count = (atomic_t)ATOMIC_INIT(0);
27524 + bh->b_page = page;
27525 + sh->bh_cache[i] = bh;
27531 +static struct buffer_head *raid5_build_block(struct stripe_head *sh, int i);
+/* init_stripe: (re)initialise a free stripe to cover `sector`.
+ * The stripe is expected to be unused: count 0, not on the lru, and no
+ * pending reads/writes or locked cache buffers (the error branches are
+ * partly elided here; a diagnostic is logged when requests remain).
+ * Clears per-disk uptodate state, rebuilds each cached buffer_head via
+ * raid5_build_block() and inserts the stripe into the hash table. */
27533 +static inline void
27534 +init_stripe(struct stripe_head *sh, unsigned long sector)
27536 + raid5_conf_t *conf = sh->raid_conf;
27537 + int disks = conf->raid_disks, i;
27539 + if (atomic_read(&sh->count) != 0)
27541 + if (test_bit(STRIPE_HANDLE, &sh->state))
27548 + sh->sector = sector;
+ /* stripe unit adopts the current cache buffer size */
27549 + sh->size = conf->buffer_size;
27552 + for (i = disks; i--;) {
27553 + if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
27554 + buffer_locked(sh->bh_cache[i])) {
27555 + LOG_ERROR("sector=%lx i=%d %p %p %p %d\n",
27556 + sh->sector, i, sh->bh_read[i],
27557 + sh->bh_write[i], sh->bh_written[i],
27558 + buffer_locked(sh->bh_cache[i]));
27561 + clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
27562 + raid5_build_block(sh, i);
27564 + insert_hash(conf, sh);
27567 +/* the buffer size has changed, so unhash all stripes
27568 + * as active stripes complete, they will go onto inactive list
+/* NOTE(review): a check on conf->active_stripes precedes the loop; its
+ * consequence (presumably BUG) is on an elided line.  The per-bucket
+ * unhash call inside the while loop is also elided. */
27571 +shrink_stripe_cache(raid5_conf_t * conf)
27575 + if (atomic_read(&conf->active_stripes))
27577 + for (i = 0; i < NR_HASH; i++) {
27578 + struct stripe_head *sh;
27579 + while ((sh = conf->stripe_hashtbl[i]))
+/* __find_stripe: hash-table lookup of an active stripe by sector.
+ * Returns the matching stripe; the not-found return (NULL) is on an
+ * elided line.  Caller is expected to hold conf->device_lock. */
27584 +static struct stripe_head *
27585 +__find_stripe(raid5_conf_t * conf, unsigned long sector)
27587 + struct stripe_head *sh;
27590 + for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
27591 + if (sh->sector == sector)
+/* get_active_stripe: find or allocate the stripe covering `sector` and
+ * return it with an extra reference, under device_lock.
+ *
+ * Stripe-size negotiation: size==0 means "use the current buffer_size"
+ * and waits until it is set; size>0 with a differing buffer_size waits
+ * for the cache to drain (buffer_size forced to 0 meanwhile), then
+ * unhashes the cache and installs the new size.
+ *
+ * Allocation: look up the hash first; otherwise take a free stripe, or
+ * block (inactive_blocked) until the pool refills to 3/4 — matching the
+ * wake threshold in __release_stripe().  Several wait-condition and
+ * brace lines are elided from this hunk. */
27596 +static struct stripe_head *
27597 +get_active_stripe(raid5_conf_t * conf, unsigned long sector, int size)
27599 + struct stripe_head *sh;
27601 + md_spin_lock_irq(&conf->device_lock);
27604 + if (conf->buffer_size == 0 ||
27605 + (size && size != conf->buffer_size)) {
27606 + /* either the size is being changed (buffer_size==0) or
27607 + * we need to change it.
27608 + * If size==0, we can proceed as soon as buffer_size gets set.
27609 + * If size>0, we can proceed when active_stripes reaches 0, or
27610 + * when someone else sets the buffer_size to size.
27611 + * If someone sets the buffer size to something else, we will need to
27612 + * assert that we want to change it again
27615 + wait_event_lock_irq(conf->wait_for_stripe,
27616 + conf->buffer_size,
27617 + conf->device_lock);
27619 + while (conf->buffer_size != size
27620 + && atomic_read(&conf->active_stripes)) {
27621 + conf->buffer_size = 0;
27622 + wait_event_lock_irq(conf->
27624 + atomic_read(&conf->
27629 + conf->device_lock);
27632 + if (conf->buffer_size != size) {
27633 + shrink_stripe_cache(conf);
27636 + conf->buffer_size = size;
+ /* round sector down to the start of its stripe unit */
27641 + sector -= sector & ((conf->buffer_size >> 9) - 1);
27643 + sh = __find_stripe(conf, sector);
27645 + if (!conf->inactive_blocked)
27646 + sh = get_free_stripe(conf);
27648 + conf->inactive_blocked = 1;
27649 + wait_event_lock_irq(conf->wait_for_stripe,
27650 + !list_empty(&conf->
27654 + (&conf->active_stripes) <
27655 + (NR_STRIPES * 3 / 4)
27657 + inactive_blocked),
27658 + conf->device_lock);
27659 + conf->inactive_blocked = 0;
27661 + init_stripe(sh, sector);
27663 + if (atomic_read(&sh->count)) {
27664 + if (!list_empty(&sh->lru))
27667 + if (!test_bit(STRIPE_HANDLE, &sh->state))
27668 + atomic_inc(&conf->active_stripes);
27669 + if (list_empty(&sh->lru))
27671 + list_del_init(&sh->lru);
27674 + } while (sh == NULL);
27677 + atomic_inc(&sh->count);
27679 + md_spin_unlock_irq(&conf->device_lock);
+/* grow_stripes: pre-allocate `num` stripe_heads, each with a full set of
+ * per-disk buffers, and park them on the inactive list by creating them
+ * with count 1 and immediately calling release_stripe().
+ * NOTE(review): the surrounding loop, kmalloc-failure handling and
+ * return statements are on elided lines of this hunk. */
27684 +grow_stripes(raid5_conf_t * conf, int num, int priority)
27686 + struct stripe_head *sh;
27689 + sh = kmalloc(sizeof (struct stripe_head), priority);
27692 + memset(sh, 0, sizeof (*sh));
27693 + sh->raid_conf = conf;
27694 + sh->lock = SPIN_LOCK_UNLOCKED;
27696 + if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
27697 + shrink_buffers(sh, conf->raid_disks);
27701 + /* we just created an active stripe so... */
27702 + sh->count = (atomic_t)ATOMIC_INIT(1);
27703 + atomic_inc(&conf->active_stripes);
27704 + INIT_LIST_HEAD(&sh->lru);
+ /* final put moves the stripe onto the inactive list */
27705 + release_stripe(sh);
+/* shrink_stripes: pull stripes off the inactive list (under device_lock)
+ * and free their buffers, decrementing active_stripes for each.
+ * A nonzero reference count on a supposedly-free stripe is checked;
+ * the consequence (and the kfree of the stripe_head plus the enclosing
+ * loop) are on elided lines of this hunk. */
27711 +shrink_stripes(raid5_conf_t * conf, int num)
27713 + struct stripe_head *sh;
27716 + spin_lock_irq(&conf->device_lock);
27717 + sh = get_free_stripe(conf);
27718 + spin_unlock_irq(&conf->device_lock);
27721 + if (atomic_read(&sh->count))
27723 + shrink_buffers(sh, conf->raid_disks);
27725 + atomic_dec(&conf->active_stripes);
+/* raid5_end_read_request: completion handler for reads into the stripe
+ * cache.  First maps this bh back to its disk index in the stripe.
+ * On success: if possible, one waiting top-level read (sh->bh_read[i])
+ * is completed directly — copying from the cache page unless the read
+ * bypassed the cache (bh_page substitution) — and the EVMS per-volume
+ * in-progress count is dropped; otherwise the cache block is marked
+ * uptodate for later handling.
+ * On failure: the member disk is failed via evms_md_error() (guarded by
+ * sh->node[i] being set) and uptodate state is cleared.
+ * If the cache page was swapped out for a bypass read, b_page/b_data are
+ * restored before the buffer is unlocked.  Finally the stripe is flagged
+ * STRIPE_HANDLE and released.  NOTE(review): several else/brace lines
+ * and the error-path LOG call prefix are elided in this hunk. */
27730 +raid5_end_read_request(struct buffer_head *bh, int uptodate)
27732 + struct stripe_head *sh = bh->b_private;
27733 + raid5_conf_t *conf = sh->raid_conf;
27734 + int disks = conf->raid_disks, i;
27735 + unsigned long flags;
27737 + for (i = 0; i < disks; i++)
27738 + if (bh == sh->bh_cache[i])
+ /* i == disks means this bh is not one of ours */
27741 + if (i == disks) {
27747 + struct buffer_head *buffer;
27748 + spin_lock_irqsave(&conf->device_lock, flags);
27749 + /* we can return a buffer if we bypassed the cache or
27750 + * if the top buffer is not in highmem. If there are
27751 + * multiple buffers, leave the extra work to
27754 + buffer = sh->bh_read[i];
27755 + if (buffer && (!PageHighMem(buffer->b_page)
27756 + || buffer->b_page == bh->b_page)
27758 + sh->bh_read[i] = buffer->b_reqnext;
27759 + buffer->b_reqnext = NULL;
27762 + spin_unlock_irqrestore(&conf->device_lock, flags);
27763 + if (sh->bh_page[i] == NULL)
27764 + set_bit(BH_Uptodate, &bh->b_state);
27766 + if (buffer->b_page != bh->b_page)
27767 + memcpy(buffer->b_data, bh->b_data, bh->b_size);
27768 + evms_cs_volume_request_in_progress(buffer->b_rdev, -1, NULL);
27769 + buffer->b_end_io(buffer, 1);
27774 + evms_md_error(conf->mddev, sh->node[i]);
27777 + ("NODE was not set, skipping evms_md_error()\n");
27778 + clear_bit(BH_Uptodate, &bh->b_state);
27780 + /* must restore b_page before unlocking buffer... */
27781 + if (sh->bh_page[i]) {
27782 + bh->b_page = sh->bh_page[i];
27783 + bh->b_data = page_address(bh->b_page);
27784 + sh->bh_page[i] = NULL;
27785 + clear_bit(BH_Uptodate, &bh->b_state);
27787 + clear_bit(BH_Lock, &bh->b_state);
27788 + set_bit(STRIPE_HANDLE, &sh->state);
27789 + release_stripe(sh);
27790 + if (sh->node[i]) {
27791 + sh->node[i] = NULL;
27793 + LOG_WARNING(" evms node was not set.\n");
+/* raid5_end_write_request: completion handler for cache writes.
+ * Maps the bh to its disk index; on failure the member is failed via
+ * evms_md_error() (guarded by sh->node[i]).  The buffer is unlocked,
+ * the stripe flagged STRIPE_HANDLE and dropped with __release_stripe()
+ * while device_lock is still held.  NOTE(review): the return-type line,
+ * uptodate-success branch and several braces are elided in this hunk. */
27799 +raid5_end_write_request(struct buffer_head *bh, int uptodate)
27801 + struct stripe_head *sh = bh->b_private;
27802 + raid5_conf_t *conf = sh->raid_conf;
27803 + int disks = conf->raid_disks, i;
27804 + unsigned long flags;
27806 + for (i = 0; i < disks; i++)
27807 + if (bh == sh->bh_cache[i])
27810 + if (i == disks) {
27815 + md_spin_lock_irqsave(&conf->device_lock, flags);
27819 + evms_md_error(conf->mddev, sh->node[i]);
27822 + (" NODE was not set, skipping evms_md_error()\n");
27824 + clear_bit(BH_Lock, &bh->b_state);
27825 + set_bit(STRIPE_HANDLE, &sh->state);
27826 + __release_stripe(conf, sh);
27827 + md_spin_unlock_irqrestore(&conf->device_lock, flags);
27828 + if (sh->node[i]) {
27829 + sh->node[i] = NULL;
27831 + LOG_WARNING(" evms node was not set.\n");
+/* raid5_build_block: (re)initialise cache buffer_head `i` of the stripe
+ * to target the corresponding member disk, sized to the stripe unit and
+ * with the read-completion handler installed as the default b_end_io.
+ * (b_rsector/b_rdev setup and return are on elided lines.) */
27835 +static struct buffer_head *
27836 +raid5_build_block(struct stripe_head *sh, int i)
27838 + raid5_conf_t *conf = sh->raid_conf;
27839 + struct buffer_head *bh = sh->bh_cache[i];
+ /* logical block number in stripe-size units */
27840 + unsigned long block = sh->sector / (sh->size >> 9);
27842 + init_buffer(bh, raid5_end_read_request, sh);
27843 + bh->b_dev = conf->disks[i].dev;
27844 + bh->b_blocknr = block;
27846 + bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
27847 + bh->b_size = sh->size;
27848 + bh->b_list = BUF_LOCKED;
+/* raid5_error: mark the member (or spare) backing `node` as faulty.
+ * For an operational array member: clears operational, marks the disk
+ * faulty/nonsync/inactive in the superblock, adjusts the active/working/
+ * failed counters on both sb and conf, dirties the superblock and kicks
+ * the raid5 daemon.  A spare failing during reconstruction is handled in
+ * the second branch, detaching conf->spare and updating spare counters
+ * (unless the spare was already non-operational, e.g. a SET_DISK_FAULTY
+ * ioctl race).  Return statements and some braces are elided here. */
27853 +raid5_error(mddev_t * mddev, struct evms_logical_node * node)
27855 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
27856 + mdp_super_t *sb = mddev->sb;
27857 + struct disk_info *disk;
27860 + LOG_WARNING("%s: called\n", __FUNCTION__);
27862 + for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
27863 + if (disk->node == node) {
27864 + if (disk->operational) {
27865 + disk->operational = 0;
27866 + mark_disk_faulty(sb->disks + disk->number);
27867 + mark_disk_nonsync(sb->disks + disk->number);
27868 + mark_disk_inactive(sb->disks + disk->number);
27869 + sb->active_disks--;
27870 + sb->working_disks--;
27871 + sb->failed_disks++;
27872 + mddev->sb_dirty = 1;
27873 + conf->working_disks--;
27874 + conf->failed_disks++;
27875 + evms_cs_wakeup_thread(conf->thread);
27877 + ("Disk failure on %s, disabling device."
27878 + " Operation continuing on %d devices\n",
27879 + evms_md_partition_name(disk->node),
27880 + conf->working_disks);
27886 + * handle errors in spares (during reconstruction)
27888 + if (conf->spare) {
27889 + disk = conf->spare;
27890 + if (disk->node == node) {
27891 + LOG_WARNING("EVMS RAID5: Disk failure on spare %s\n",
27892 + evms_md_partition_name(disk->node));
27893 + if (!conf->spare->operational) {
27894 + /* probably a SET_DISK_FAULTY ioctl */
27897 + disk->operational = 0;
27898 + disk->write_only = 0;
27899 + conf->spare = NULL;
27900 + mark_disk_faulty(sb->disks + disk->number);
27901 + mark_disk_nonsync(sb->disks + disk->number);
27902 + mark_disk_inactive(sb->disks + disk->number);
27903 + sb->spare_disks--;
27904 + sb->working_disks--;
27905 + sb->failed_disks++;
27907 + mddev->sb_dirty = 1;
27908 + evms_cs_wakeup_thread(conf->thread);
27918 + * Input: a 'big' sector number,
27919 + * Output: index of the data and parity disk, and the sector # in them.
+/* Maps a virtual array sector to (*dd_idx data disk, *pd_idx parity
+ * disk, returned member-device sector).  RAID-4 (conf->level == 4) uses
+ * a fixed parity disk; otherwise the parity rotates per stripe according
+ * to conf->algorithm (the four standard left/right (a)symmetric
+ * layouts).  The (*dd_idx)++ adjustments after the >= *pd_idx tests and
+ * the break statements are on elided lines of this hunk. */
27921 +static unsigned long
27922 +raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
27923 + unsigned int data_disks, unsigned int *dd_idx,
27924 + unsigned int *pd_idx, raid5_conf_t * conf)
27926 + unsigned long stripe;
27927 + unsigned long chunk_number;
27928 + unsigned int chunk_offset;
27929 + unsigned long new_sector;
27930 + int sectors_per_chunk = conf->chunk_size >> 9;
27932 + /* First compute the information on this sector */
27935 + * Compute the chunk number and the sector offset inside the chunk
27937 + chunk_number = r_sector / sectors_per_chunk;
27938 + chunk_offset = r_sector % sectors_per_chunk;
27941 + * Compute the stripe number
27943 + stripe = chunk_number / data_disks;
27946 + * Compute the data disk and parity disk indexes inside the stripe
27948 + *dd_idx = chunk_number % data_disks;
27951 + * Select the parity disk based on the user selected algorithm.
27953 + if (conf->level == 4)
27954 + *pd_idx = data_disks;
27956 + switch (conf->algorithm) {
27957 + case ALGORITHM_LEFT_ASYMMETRIC:
27958 + *pd_idx = data_disks - stripe % raid_disks;
27959 + if (*dd_idx >= *pd_idx)
27962 + case ALGORITHM_RIGHT_ASYMMETRIC:
27963 + *pd_idx = stripe % raid_disks;
27964 + if (*dd_idx >= *pd_idx)
27967 + case ALGORITHM_LEFT_SYMMETRIC:
27968 + *pd_idx = data_disks - stripe % raid_disks;
27969 + *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
27971 + case ALGORITHM_RIGHT_SYMMETRIC:
27972 + *pd_idx = stripe % raid_disks;
27973 + *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
27976 + LOG_ERROR(" unsupported algorithm %d\n",
27977 + conf->algorithm);
27981 + * Finally, compute the new sector number
27983 + new_sector = stripe * sectors_per_chunk + chunk_offset;
27984 + return new_sector;
+/* check_xor: flush the accumulated xor-source buffers when bh_ptr is
+ * full (MAX_XOR_BLOCKS).  The count reset and closing of the do/while
+ * are on elided lines of this hunk. */
27987 +#define check_xor() do { \
27988 + if (count == MAX_XOR_BLOCKS) { \
27989 + evms_md_xor_block(count, bh_ptr); \
+/* compute_block: reconstruct cache block dd_idx by xoring together all
+ * other up-to-date blocks of the stripe (RAID-5 parity reconstruction).
+ * The target is zeroed first and used as the xor destination; a missing
+ * (not uptodate) source block is logged as an error.  The result is
+ * marked uptodate.  count initialisation, check_xor() calls and braces
+ * are on elided lines of this hunk. */
27995 +compute_block(struct stripe_head *sh, int dd_idx)
27997 + raid5_conf_t *conf = sh->raid_conf;
27998 + int i, count, disks = conf->raid_disks;
27999 + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
28001 + memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
28002 + bh_ptr[0] = sh->bh_cache[dd_idx];
28004 + for (i = disks; i--;) {
28007 + bh = sh->bh_cache[i];
28008 + if (buffer_uptodate(bh))
28009 + bh_ptr[count++] = bh;
28011 + LOG_ERROR("%s: %d, stripe %lu, %d not present\n",
28012 + __FUNCTION__, dd_idx, sh->sector, i);
28017 + evms_md_xor_block(count, bh_ptr);
28018 + set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
+/* compute_parity: compute the parity block of a stripe.
+ * Methods:
+ *   READ_MODIFY_WRITE  - xor the old parity with the old + new data of
+ *                        the blocks being written;
+ *   RECONSTRUCT_WRITE  - zero the parity and rebuild it from all data
+ *                        blocks;
+ *   CHECK_PARITY       - xor the data blocks into the current parity so
+ *                        the result can be verified (should be zero).
+ * In the write cases, pending requests are moved from bh_write[] to
+ * bh_written[] via chosen[], their data copied into the cache blocks,
+ * and those blocks locked.  The parity block is marked uptodate except
+ * for CHECK_PARITY, where it is explicitly marked not uptodate.
+ * NOTE(review): count resets, check_xor() invocations, bh_kmap of the
+ * chosen data and numerous braces are on elided lines of this hunk. */
28022 +compute_parity(struct stripe_head *sh, int method)
28024 + raid5_conf_t *conf = sh->raid_conf;
28025 + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
28026 + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
28027 + struct buffer_head *chosen[MD_SB_DISKS];
28029 + memset(chosen, 0, sizeof (chosen));
28032 + bh_ptr[0] = sh->bh_cache[pd_idx];
28033 + switch (method) {
28034 + case READ_MODIFY_WRITE:
28035 + if (!buffer_uptodate(sh->bh_cache[pd_idx]))
28037 + for (i = disks; i--;) {
28040 + if (sh->bh_write[i] && buffer_uptodate(sh->bh_cache[i])) {
28041 + bh_ptr[count++] = sh->bh_cache[i];
+ /* move request from pending-write to written list */
28042 + chosen[i] = sh->bh_write[i];
28043 + sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
28044 + chosen[i]->b_reqnext = sh->bh_written[i];
28045 + sh->bh_written[i] = chosen[i];
28050 + case RECONSTRUCT_WRITE:
28051 + memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
28052 + for (i = disks; i--;)
28053 + if (i != pd_idx && sh->bh_write[i]) {
28054 + chosen[i] = sh->bh_write[i];
28055 + sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
28056 + chosen[i]->b_reqnext = sh->bh_written[i];
28057 + sh->bh_written[i] = chosen[i];
28060 + case CHECK_PARITY:
28064 + evms_md_xor_block(count, bh_ptr);
+ /* copy chosen request data into the cache blocks and lock them */
28068 + for (i = disks; i--;)
28070 + struct buffer_head *bh = sh->bh_cache[i];
28072 + bdata = bh_kmap(chosen[i]);
28073 + memcpy(bh->b_data, bdata, sh->size);
28074 + bh_kunmap(chosen[i]);
28075 + set_bit(BH_Lock, &bh->b_state);
28076 + mark_buffer_uptodate(bh, 1);
28079 + switch (method) {
28080 + case RECONSTRUCT_WRITE:
28081 + case CHECK_PARITY:
28082 + for (i = disks; i--;)
28083 + if (i != pd_idx) {
28084 + bh_ptr[count++] = sh->bh_cache[i];
28088 + case READ_MODIFY_WRITE:
28089 + for (i = disks; i--;)
28091 + bh_ptr[count++] = sh->bh_cache[i];
28096 + evms_md_xor_block(count, bh_ptr);
28098 + if (method != CHECK_PARITY) {
28099 + mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
28100 + set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
28102 + mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
+/* add_stripe_bh: queue a caller's buffer_head on the stripe's per-disk
+ * pending list, appending at the tail.  Selection between bh_read[] and
+ * bh_write[] depends on the final (elided) parameter — presumably the
+ * rw direction; the test itself is on an elided line.  A warning is
+ * logged when multiple requests stack up on one sector.  Both the
+ * stripe lock and device_lock are held while splicing. */
28106 +add_stripe_bh(struct stripe_head *sh, struct buffer_head *bh, int dd_idx,
28109 + struct buffer_head **bhp;
28110 + raid5_conf_t *conf = sh->raid_conf;
28112 + spin_lock(&sh->lock);
28113 + spin_lock_irq(&conf->device_lock);
28114 + bh->b_reqnext = NULL;
28116 + bhp = &sh->bh_read[dd_idx];
28118 + bhp = &sh->bh_write[dd_idx];
28120 + LOG_DEFAULT("EVMS RAID5: multiple %d requests for sector %ld\n",
+ /* walk to the tail of the list */
28122 + bhp = &(*bhp)->b_reqnext;
28125 + spin_unlock_irq(&conf->device_lock);
28126 + spin_unlock(&sh->lock);
28131 + * handle_stripe - do things to a stripe.
28133 + * We lock the stripe and then examine the state of various bits
28134 + * to see what needs to be done.
28135 + * Possible results:
28136 + * return some read request which now have data
28137 + * return some write requests which are safely on disc
28138 + * schedule a read on some buffers
28139 + * schedule a write of some buffers
28140 + * return confirmation of parity correctness
28142 + * Parity calculations are done inside the stripe lock
28143 + * buffers are taken off read_list or write_list, and bh_cache buffers
28144 + * get BH_Lock set before the stripe lock is released.
28149 +handle_stripe(struct stripe_head *sh)
28151 + raid5_conf_t *conf = sh->raid_conf;
28152 + int disks = conf->raid_disks;
28153 + struct buffer_head *return_ok = NULL, *return_fail = NULL;
28154 + int action[MD_SB_DISKS];
28157 + int locked = 0, uptodate = 0, to_read = 0, to_write = 0, failed =
28159 + int failed_num = 0;
28160 + struct buffer_head *bh;
28162 + memset(action, 0, sizeof (action));
28164 + spin_lock(&sh->lock);
28165 + clear_bit(STRIPE_HANDLE, &sh->state);
28166 + clear_bit(STRIPE_DELAYED, &sh->state);
28168 + syncing = test_bit(STRIPE_SYNCING, &sh->state);
28169 + /* Now to look around and see what can be done */
28171 + for (i = disks; i--;) {
28172 + bh = sh->bh_cache[i];
28173 + /* maybe we can reply to a read */
28174 + if (buffer_uptodate(bh) && sh->bh_read[i]) {
28175 + struct buffer_head *rbh, *rbh2;
28176 + spin_lock_irq(&conf->device_lock);
28177 + rbh = sh->bh_read[i];
28178 + sh->bh_read[i] = NULL;
28179 + spin_unlock_irq(&conf->device_lock);
28182 + bdata = bh_kmap(rbh);
28183 + memcpy(bdata, bh->b_data, bh->b_size);
28185 + rbh2 = rbh->b_reqnext;
28186 + rbh->b_reqnext = return_ok;
28192 + /* now count some things */
28193 + if (buffer_locked(bh))
28195 + if (buffer_uptodate(bh))
28198 + if (sh->bh_read[i])
28200 + if (sh->bh_write[i])
28202 + if (sh->bh_written[i])
28204 + if (!conf->disks[i].operational) {
28209 + /* check if the array has lost two devices and, if so, some requests might
28210 + * need to be failed
28212 + if (failed > 1 && to_read + to_write) {
28213 + for (i = disks; i--;) {
28214 + /* fail all writes first */
28215 + if (sh->bh_write[i])
28217 + while ((bh = sh->bh_write[i])) {
28218 + sh->bh_write[i] = bh->b_reqnext;
28219 + bh->b_reqnext = return_fail;
28220 + return_fail = bh;
28222 + /* fail any reads if this device is non-operational */
28223 + if (!conf->disks[i].operational) {
28224 + spin_lock_irq(&conf->device_lock);
28225 + if (sh->bh_read[i])
28227 + while ((bh = sh->bh_read[i])) {
28228 + sh->bh_read[i] = bh->b_reqnext;
28229 + bh->b_reqnext = return_fail;
28230 + return_fail = bh;
28232 + spin_unlock_irq(&conf->device_lock);
28236 + if (failed > 1 && syncing) {
28237 + evms_md_done_sync(conf->mddev,
28238 + (sh->size >> 9) - sh->sync_redone, 0);
28239 + clear_bit(STRIPE_SYNCING, &sh->state);
28243 + /* might be able to return some write requests if the parity block
28244 + * is safe, or on a failed drive
28246 + bh = sh->bh_cache[sh->pd_idx];
28248 + ((conf->disks[sh->pd_idx].operational && !buffer_locked(bh)
28249 + && buffer_uptodate(bh))
28250 + || (failed == 1 && failed_num == sh->pd_idx))
28252 + /* any written block on a uptodate or failed drive can be returned */
28253 + for (i = disks; i--;)
28254 + if (sh->bh_written[i]) {
28255 + bh = sh->bh_cache[i];
28256 + if (!conf->disks[sh->pd_idx].operational ||
28257 + (!buffer_locked(bh)
28258 + && buffer_uptodate(bh))) {
28259 + /* maybe we can return some write requests */
28260 + struct buffer_head *wbh, *wbh2;
28261 + wbh = sh->bh_written[i];
28262 + sh->bh_written[i] = NULL;
28264 + wbh2 = wbh->b_reqnext;
28265 + wbh->b_reqnext = return_ok;
28273 + /* Now we might consider reading some blocks, either to check/generate
28274 + * parity, or to satisfy requests
28276 + if (to_read || (syncing && (uptodate + failed < disks))) {
28277 + for (i = disks; i--;) {
28278 + bh = sh->bh_cache[i];
28279 + if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
28280 + (sh->bh_read[i] || syncing
28281 + || (failed && sh->bh_read[failed_num]))) {
28282 + /* we would like to get this block, possibly
28283 + * by computing it, but we might not be able to
28285 + if (uptodate == disks - 1) {
28286 + compute_block(sh, i);
28288 + } else if (conf->disks[i].operational) {
28289 + set_bit(BH_Lock, &bh->b_state);
28290 + action[i] = READ + 1;
28291 + /* if I am just reading this block and we don't have
28292 + a failed drive, or any pending writes then sidestep the cache */
28293 + if (sh->bh_page[i])
28295 + if (sh->bh_read[i]
28296 + && !sh->bh_read[i]->b_reqnext
28297 + && !syncing && !failed
28300 + sh->bh_cache[i]->b_page;
28301 + sh->bh_cache[i]->b_page =
28302 + sh->bh_read[i]->b_page;
28303 + sh->bh_cache[i]->b_data =
28304 + sh->bh_read[i]->b_data;
28308 + evms_md_sync_acct(conf->
28315 + set_bit(STRIPE_HANDLE, &sh->state);
28318 + /* now to consider writing and what else, if anything should be read */
28320 + int rmw = 0, rcw = 0;
28321 + for (i = disks; i--;) {
28322 + /* would I have to read this buffer for read_modify_write */
28323 + bh = sh->bh_cache[i];
28324 + if ((sh->bh_write[i] || i == sh->pd_idx) &&
28325 + (!buffer_locked(bh) || sh->bh_page[i]) &&
28326 + !buffer_uptodate(bh)) {
28327 + if (conf->disks[i].operational
28328 +/* && !(conf->resync_parity && i == sh->pd_idx) */
28332 + rmw += 2 * disks; /* cannot read it */
28334 + /* Would I have to read this buffer for reconstruct_write */
28335 + if (!sh->bh_write[i] && i != sh->pd_idx &&
28336 + (!buffer_locked(bh) || sh->bh_page[i]) &&
28337 + !buffer_uptodate(bh)) {
28338 + if (conf->disks[i].operational)
28341 + rcw += 2 * disks;
28344 + set_bit(STRIPE_HANDLE, &sh->state);
28345 + if (rmw < rcw && rmw > 0)
28346 + /* prefer read-modify-write, but need to get some data */
28347 + for (i = disks; i--;) {
28348 + bh = sh->bh_cache[i];
28349 + if ((sh->bh_write[i] || i == sh->pd_idx) &&
28350 + !buffer_locked(bh) && !buffer_uptodate(bh)
28351 + && conf->disks[i].operational) {
28353 + (STRIPE_PREREAD_ACTIVE,
28355 + set_bit(BH_Lock, &bh->b_state);
28356 + action[i] = READ + 1;
28359 + set_bit(STRIPE_DELAYED,
28361 + set_bit(STRIPE_HANDLE,
28366 + if (rcw <= rmw && rcw > 0)
28367 + /* want reconstruct write, but need to get some data */
28368 + for (i = disks; i--;) {
28369 + bh = sh->bh_cache[i];
28370 + if (!sh->bh_write[i] && i != sh->pd_idx &&
28371 + !buffer_locked(bh) && !buffer_uptodate(bh)
28372 + && conf->disks[i].operational) {
28374 + (STRIPE_PREREAD_ACTIVE,
28376 + set_bit(BH_Lock, &bh->b_state);
28377 + action[i] = READ + 1;
28380 + set_bit(STRIPE_DELAYED,
28382 + set_bit(STRIPE_HANDLE,
28387 + /* now if nothing is locked, and if we have enough data, we can start a write request */
28388 + if (locked == 0 && (rcw == 0 || rmw == 0)) {
28389 + compute_parity(sh,
28391 + 0 ? RECONSTRUCT_WRITE :
28392 + READ_MODIFY_WRITE);
28393 + /* now every locked buffer is ready to be written */
28394 + for (i = disks; i--;)
28395 + if (buffer_locked(sh->bh_cache[i])) {
28397 + action[i] = WRITE + 1;
28398 + if (!conf->disks[i].operational
28399 + || (i == sh->pd_idx && failed == 0))
28400 + set_bit(STRIPE_INSYNC,
28403 + if (test_and_clear_bit
28404 + (STRIPE_PREREAD_ACTIVE, &sh->state)) {
28405 + atomic_dec(&conf->preread_active_stripes);
28406 + if (atomic_read(&conf->preread_active_stripes) <
28408 + evms_cs_wakeup_thread(conf->thread);
28413 + /* maybe we need to check and possibly fix the parity for this stripe
28414 + * Any reads will already have been scheduled, so we just see if enough data
28417 + if (syncing && locked == 0 &&
28418 + !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
28419 + set_bit(STRIPE_HANDLE, &sh->state);
28420 + if (failed == 0) {
28421 + if (uptodate != disks)
28423 + compute_parity(sh, CHECK_PARITY);
28425 + bh = sh->bh_cache[sh->pd_idx];
28426 + if ((*(u32 *) bh->b_data) == 0 &&
28427 + !memcmp(bh->b_data, bh->b_data + 4,
28428 + bh->b_size - 4)) {
28429 + /* parity is correct (on disc, not in buffer any more) */
28430 + set_bit(STRIPE_INSYNC, &sh->state);
28433 + if (!test_bit(STRIPE_INSYNC, &sh->state)) {
28434 + struct disk_info *spare;
28436 + failed_num = sh->pd_idx;
28437 + /* should be able to compute the missing block and write it to spare */
28438 + if (!buffer_uptodate(sh->bh_cache[failed_num])) {
28439 + if (uptodate + 1 != disks)
28441 + compute_block(sh, failed_num);
28444 + if (uptodate != disks)
28446 + bh = sh->bh_cache[failed_num];
28447 + set_bit(BH_Lock, &bh->b_state);
28448 + action[failed_num] = WRITE + 1;
28450 + set_bit(STRIPE_INSYNC, &sh->state);
28451 + if (conf->disks[failed_num].operational)
28452 + evms_md_sync_acct(conf->disks[failed_num].dev,
28453 + bh->b_size >> 9);
28454 + else if ((spare = conf->spare))
28455 + evms_md_sync_acct(spare->dev, bh->b_size >> 9);
28459 + if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
28460 + evms_md_done_sync(conf->mddev,
28461 + (sh->size >> 9) - sh->sync_redone, 1);
28462 + clear_bit(STRIPE_SYNCING, &sh->state);
28465 + spin_unlock(&sh->lock);
28467 + while ((bh = return_ok)) {
28468 + return_ok = bh->b_reqnext;
28469 + bh->b_reqnext = NULL;
28470 + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL);
28471 + bh->b_end_io(bh, 1);
28473 + while ((bh = return_fail)) {
28474 + return_fail = bh->b_reqnext;
28475 + bh->b_reqnext = NULL;
28476 + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL);
28477 + bh->b_end_io(bh, 0);
28479 + for (i = disks; i--;)
28481 + struct buffer_head *bh = sh->bh_cache[i];
28482 + struct disk_info *spare = conf->spare;
28483 + struct evms_logical_node *node = NULL;
28485 + if (action[i] == READ + 1)
28486 + bh->b_end_io = raid5_end_read_request;
28488 + bh->b_end_io = raid5_end_write_request;
28489 + if (conf->disks[i].operational) {
28490 + bh->b_dev = conf->disks[i].dev;
28491 + node = conf->disks[i].node;
28492 + } else if (spare && action[i] == WRITE + 1) {
28493 + bh->b_dev = spare->dev;
28494 + node = spare->node;
28498 + atomic_inc(&sh->count);
28499 + //bh->b_rdev = bh->b_dev;
28501 + bh->b_blocknr * (bh->b_size >> 9);
28502 + sh->node[i] = node;
28503 + if (action[i] == READ + 1)
28508 + clear_bit(BH_Lock, &bh->b_state);
28509 + set_bit(STRIPE_HANDLE, &sh->state);
28514 +static inline void
28515 +raid5_activate_delayed(raid5_conf_t * conf)
28517 + if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
28518 + while (!list_empty(&conf->delayed_list)) {
28519 + struct list_head *l = conf->delayed_list.next;
28520 + struct stripe_head *sh;
28521 + sh = list_entry(l, struct stripe_head, lru);
28522 + list_del_init(l);
28523 + clear_bit(STRIPE_DELAYED, &sh->state);
28524 + if (!test_and_set_bit
28525 + (STRIPE_PREREAD_ACTIVE, &sh->state))
28526 + atomic_inc(&conf->preread_active_stripes);
28527 + list_add_tail(&sh->lru, &conf->handle_list);
28532 +raid5_unplug_device(void *data)
28534 + raid5_conf_t *conf = (raid5_conf_t *) data;
28535 + unsigned long flags;
28537 + spin_lock_irqsave(&conf->device_lock, flags);
28539 + raid5_activate_delayed(conf);
28541 + conf->plugged = 0;
28542 + evms_cs_wakeup_thread(conf->thread);
28544 + spin_unlock_irqrestore(&conf->device_lock, flags);
28547 +static inline void
28548 +raid5_plug_device(raid5_conf_t * conf)
28550 + spin_lock_irq(&conf->device_lock);
28551 + if (list_empty(&conf->delayed_list))
28552 + if (!conf->plugged) {
28553 + conf->plugged = 1;
28554 + queue_task(&conf->plug_tq, &tq_disk);
28556 + spin_unlock_irq(&conf->device_lock);
28559 +static inline void
28560 +raid5_rw(struct evms_logical_node * md_node, struct buffer_head *bh, int rw)
28562 + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
28563 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
28564 + const unsigned int raid_disks = conf->raid_disks;
28565 + const unsigned int data_disks = raid_disks - 1;
28566 + unsigned int dd_idx, pd_idx;
28567 + unsigned long new_sector;
28568 + struct stripe_head *sh;
28569 + unsigned long sectors_per_chunk = conf->chunk_size >> 9;
28570 + unsigned long sect_in_chunk = bh->b_rsector & (sectors_per_chunk - 1);
28572 + if (evms_md_check_boundary(md_node, bh))
28574 + if ((sect_in_chunk + (bh->b_size >> 9)) > sectors_per_chunk) {
28575 + bh->b_end_io(bh, 0);
28579 + new_sector = raid5_compute_sector(bh->b_rsector,
28580 + raid_disks, data_disks, &dd_idx,
28583 + sh = get_active_stripe(conf, new_sector, bh->b_size);
28585 + sh->pd_idx = pd_idx;
28587 + add_stripe_bh(sh, bh, dd_idx, rw);
28589 + raid5_plug_device(conf);
28591 + evms_cs_volume_request_in_progress(bh->b_rdev, 1, NULL);
28592 + handle_stripe(sh);
28593 + release_stripe(sh);
28595 + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL);
28596 + bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
28601 +raid5_read(struct evms_logical_node * md_node, struct buffer_head *bh)
28603 + raid5_rw(md_node, bh, READ);
28607 +raid5_write(struct evms_logical_node * md_node, struct buffer_head *bh)
28609 + raid5_rw(md_node, bh, WRITE);
28613 +raid5_sync_request(mddev_t * mddev, unsigned long sector_nr)
28615 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
28616 + struct stripe_head *sh;
28617 + int sectors_per_chunk = conf->chunk_size >> 9;
28618 + unsigned long stripe = sector_nr / sectors_per_chunk;
28619 + int chunk_offset = sector_nr % sectors_per_chunk;
28620 + int dd_idx, pd_idx;
28621 + unsigned long first_sector;
28622 + int raid_disks = conf->raid_disks;
28623 + int data_disks = raid_disks - 1;
28627 + sh = get_active_stripe(conf, sector_nr, 0);
28628 + bufsize = sh->size;
28629 + redone = sector_nr - sh->sector;
28631 + raid5_compute_sector(stripe * data_disks * sectors_per_chunk +
28632 + chunk_offset, raid_disks, data_disks, &dd_idx,
28634 + sh->pd_idx = pd_idx;
28635 + spin_lock(&sh->lock);
28636 + set_bit(STRIPE_SYNCING, &sh->state);
28637 + clear_bit(STRIPE_INSYNC, &sh->state);
28638 + sh->sync_redone = redone;
28639 + spin_unlock(&sh->lock);
28641 + handle_stripe(sh);
28642 + release_stripe(sh);
28644 + return (bufsize >> 9) - redone;
28648 + * This is our raid5 kernel thread.
28650 + * We scan the hash table for stripes which can be handled now.
28651 + * During the scan, completed stripes are saved for us by the interrupt
28652 + * handler, so that they will not have to wait for our next wakeup.
28655 +raid5d(void *data)
28657 + struct stripe_head *sh;
28658 + raid5_conf_t *conf = data;
28659 + mddev_t *mddev = conf->mddev;
28664 + if (mddev->sb_dirty) {
28665 + mddev->sb_dirty = 0;
28666 + evms_md_update_sb(mddev);
28668 + md_spin_lock_irq(&conf->device_lock);
28670 + struct list_head *first;
28672 + if (list_empty(&conf->handle_list) &&
28673 + atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
28674 + !conf->plugged && !list_empty(&conf->delayed_list))
28675 + raid5_activate_delayed(conf);
28677 + if (list_empty(&conf->handle_list))
28680 + first = conf->handle_list.next;
28681 + sh = list_entry(first, struct stripe_head, lru);
28683 + list_del_init(first);
28684 + atomic_inc(&sh->count);
28685 + if (atomic_read(&sh->count) != 1)
28687 + md_spin_unlock_irq(&conf->device_lock);
28690 + handle_stripe(sh);
28691 + release_stripe(sh);
28693 + md_spin_lock_irq(&conf->device_lock);
28696 + md_spin_unlock_irq(&conf->device_lock);
28701 + * Private kernel thread for parity reconstruction after an unclean
28702 + * shutdown. Reconstruction on spare drives in case of a failed drive
28703 + * is done by the generic mdsyncd.
28706 +raid5syncd(void *data)
28708 + raid5_conf_t *conf = data;
28709 + mddev_t *mddev = conf->mddev;
28711 + if (!conf->resync_parity)
28713 + if (conf->resync_parity == 2)
28715 + down(&mddev->recovery_sem);
28716 + if (evms_md_do_sync(mddev, NULL)) {
28717 + up(&mddev->recovery_sem);
28718 + LOG_WARNING("resync aborted!\n");
28721 + conf->resync_parity = 0;
28722 + up(&mddev->recovery_sem);
28723 + LOG_DEFAULT("resync finished.\n");
28727 +raid5_run(mddev_t * mddev)
28729 + raid5_conf_t *conf;
28730 + int i, j, raid_disk, memory;
28731 + mdp_super_t *sb = mddev->sb;
28732 + mdp_disk_t *desc;
28733 + mdk_rdev_t *rdev;
28734 + struct disk_info *disk;
28735 + struct md_list_head *tmp;
28736 + int start_recovery = 0;
28738 + MOD_INC_USE_COUNT;
28740 + if (sb->level != 5 && sb->level != 4) {
28741 + LOG_ERROR("%s: [md%d] raid level not set to 4/5 (%d)\n",
28742 + __FUNCTION__, mdidx(mddev), sb->level);
28743 + MOD_DEC_USE_COUNT;
28747 + mddev->private = kmalloc(sizeof (raid5_conf_t), GFP_KERNEL);
28748 + if ((conf = mddev->private) == NULL)
28750 + memset(conf, 0, sizeof (*conf));
28751 + conf->mddev = mddev;
28753 + if ((conf->stripe_hashtbl =
28754 + (struct stripe_head **) md__get_free_pages(GFP_ATOMIC,
28755 + HASH_PAGES_ORDER)) ==
28758 + memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
28760 + conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
28761 + md_init_waitqueue_head(&conf->wait_for_stripe);
28762 + INIT_LIST_HEAD(&conf->handle_list);
28763 + INIT_LIST_HEAD(&conf->delayed_list);
28764 + INIT_LIST_HEAD(&conf->inactive_list);
28765 + conf->active_stripes = (atomic_t)ATOMIC_INIT(0);
28766 + conf->preread_active_stripes = (atomic_t)ATOMIC_INIT(0);
28767 + conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
28769 + conf->plugged = 0;
28770 + conf->plug_tq.sync = 0;
28771 + conf->plug_tq.routine = &raid5_unplug_device;
28772 + conf->plug_tq.data = conf;
28774 + ITERATE_RDEV(mddev, rdev, tmp) {
28776 + * This is important -- we are using the descriptor on
28777 + * the disk only to get a pointer to the descriptor on
28778 + * the main superblock, which might be more recent.
28780 + desc = sb->disks + rdev->desc_nr;
28781 + raid_disk = desc->raid_disk;
28782 + disk = conf->disks + raid_disk;
28784 + if (disk_faulty(desc)) {
28785 + LOG_ERROR("%s: disabled device %s (errors detected)\n",
28787 + evms_md_partition_name(rdev->node));
28788 + if (!rdev->faulty) {
28792 + disk->number = desc->number;
28793 + disk->raid_disk = raid_disk;
28794 + disk->dev = rdev->dev;
28795 + disk->node = rdev->node;
28797 + disk->operational = 0;
28798 + disk->write_only = 0;
28800 + disk->used_slot = 1;
28803 + if (disk_active(desc)) {
28804 + if (!disk_sync(desc)) {
28806 + ("%s: disabled device %s (not in sync)\n",
28808 + evms_md_partition_name(rdev->node));
28812 + if (raid_disk > sb->raid_disks) {
28814 + ("%s: disabled device %s (inconsistent descriptor)\n",
28816 + evms_md_partition_name(rdev->node));
28819 + if (disk->operational) {
28821 + ("%s: disabled device %s (device %d already operational)\n",
28823 + evms_md_partition_name(rdev->node),
28828 + ("%s: device %s operational as raid disk %d\n",
28829 + __FUNCTION__, evms_md_partition_name(rdev->node),
28832 + disk->number = desc->number;
28833 + disk->raid_disk = raid_disk;
28834 + disk->dev = rdev->dev;
28835 + disk->node = rdev->node;
28836 + disk->operational = 1;
28837 + disk->used_slot = 1;
28839 + conf->working_disks++;
28842 + * Must be a spare disk ..
28844 + LOG_DEFAULT(" spare disk %s\n",
28845 + evms_md_partition_name(rdev->node));
28846 + disk->number = desc->number;
28847 + disk->raid_disk = raid_disk;
28848 + disk->dev = rdev->dev;
28849 + disk->node = rdev->node;
28851 + disk->operational = 0;
28852 + disk->write_only = 0;
28854 + disk->used_slot = 1;
28858 + for (i = 0; i < MD_SB_DISKS; i++) {
28859 + desc = sb->disks + i;
28860 + raid_disk = desc->raid_disk;
28861 + disk = conf->disks + raid_disk;
28863 + if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
28864 + !conf->disks[raid_disk].used_slot) {
28866 + disk->number = desc->number;
28867 + disk->raid_disk = raid_disk;
28868 + disk->dev = MKDEV(0, 0);
28869 + disk->node = NULL;
28871 + disk->operational = 0;
28872 + disk->write_only = 0;
28874 + disk->used_slot = 1;
28878 + conf->raid_disks = sb->raid_disks;
 28880 + * failed_disks: 0 for a fully functional array, 1 for a degraded array.
28882 + conf->failed_disks = conf->raid_disks - conf->working_disks;
28883 + conf->mddev = mddev;
28884 + conf->chunk_size = sb->chunk_size;
28885 + conf->level = sb->level;
28886 + conf->algorithm = sb->layout;
28887 + conf->max_nr_stripes = NR_STRIPES;
28890 + * If chunk_size is validated in md_core.c, why do it again?
28891 + * And the check in md_core is:
28892 + * chunk_size has to be a power of 2 and multiples of PAGE_SIZE
28895 + if (!conf->chunk_size ||
28896 + ((1 << ffz(~conf->chunk_size)) != conf->chunk_size) ||
28897 + (conf->chunk_size < PAGE_SIZE)) {
28898 + LOG_ERROR("%s: invalid chunk size %d for md%d\n", __FUNCTION__,
28899 + conf->chunk_size, mdidx(mddev));
28902 + if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
28903 + LOG_ERROR(" unsupported parity algorithm %d for md%d\n",
28904 + conf->algorithm, mdidx(mddev));
28907 + if (conf->failed_disks > 1) {
28909 + (" not enough operational devices for md%d (%d/%d failed)\n",
28910 + mdidx(mddev), conf->failed_disks, conf->raid_disks);
28914 + if (conf->working_disks != sb->raid_disks) {
28916 + (" md%d, not all disks are operational -- trying to recover array\n",
28918 + start_recovery = 1;
28922 + const char *name = "evms_raid5d";
28924 + conf->thread = evms_cs_register_thread(raid5d, conf, name);
28925 + if (!conf->thread) {
28926 + LOG_ERROR("%s: couldn't allocate thread for md%d\n",
28927 + __FUNCTION__, mdidx(mddev));
28932 + memory = conf->max_nr_stripes * (sizeof (struct stripe_head) +
28933 + conf->raid_disks *
28934 + ((sizeof (struct buffer_head) +
28935 + PAGE_SIZE))) / 1024;
28936 + if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
28937 + LOG_ERROR("%s: couldn't allocate %dkB for buffers\n",
28938 + __FUNCTION__, memory);
28939 + shrink_stripes(conf, conf->max_nr_stripes);
28942 + LOG_DETAILS("%s: allocated %dkB for md%d\n", __FUNCTION__,
28943 + memory, mdidx(mddev));
28946 + * Regenerate the "device is in sync with the raid set" bit for
28949 + for (i = 0; i < MD_SB_DISKS; i++) {
28950 + mark_disk_nonsync(sb->disks + i);
28951 + for (j = 0; j < sb->raid_disks; j++) {
28952 + if (!conf->disks[j].operational)
28954 + if (sb->disks[i].number == conf->disks[j].number)
28955 + mark_disk_sync(sb->disks + i);
28958 + sb->active_disks = conf->working_disks;
28960 + if (sb->active_disks == sb->raid_disks) {
28962 + ("%s: raid level %d set md%d active with %d out of %d devices, algorithm %d\n",
28963 + __FUNCTION__, conf->level, mdidx(mddev), sb->active_disks,
28964 + sb->raid_disks, conf->algorithm);
28967 + ("%s: raid level %d set md%d active with %d out of %d devices, algorithm %d\n",
28968 + __FUNCTION__, conf->level, mdidx(mddev), sb->active_disks,
28969 + sb->raid_disks, conf->algorithm);
28972 + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
28973 + const char *name = "evms_raid5syncd";
28975 + conf->resync_thread =
28976 + evms_cs_register_thread(raid5syncd, conf, name);
28977 + if (!conf->resync_thread) {
28978 + LOG_ERROR("%s: couldn't allocate thread for md%d\n",
28979 + __FUNCTION__, mdidx(mddev));
28984 + ("%s: raid set md%d not clean; reconstructing parity\n",
28985 + __FUNCTION__, mdidx(mddev));
28986 + conf->resync_parity = 1;
28987 + evms_cs_wakeup_thread(conf->resync_thread);
28990 + print_raid5_conf(conf);
28991 + if (start_recovery)
28992 + evms_md_recover_arrays();
28993 + print_raid5_conf(conf);
28995 + /* Ok, everything is just fine now */
28999 + print_raid5_conf(conf);
29000 + if (conf->stripe_hashtbl)
29001 + free_pages((unsigned long) conf->stripe_hashtbl,
29002 + HASH_PAGES_ORDER);
29005 + mddev->private = NULL;
29006 + LOG_WARNING("%s: failed to run raid set md%d\n", __FUNCTION__,
29008 + MOD_DEC_USE_COUNT;
29013 +raid5_stop_resync(mddev_t * mddev)
29015 + raid5_conf_t *conf = mddev_to_conf(mddev);
29016 + struct evms_thread *thread;
29018 + if (conf == NULL) {
29022 + thread = conf->resync_thread;
29025 + if (conf->resync_parity) {
29026 + conf->resync_parity = 2;
29027 + evms_cs_interrupt_thread(thread);
29029 + ("%s: parity resync was not fully finished, restarting next time.\n",
29039 +raid5_restart_resync(mddev_t * mddev)
29041 + raid5_conf_t *conf = mddev_to_conf(mddev);
29043 + if (conf->resync_parity) {
29044 + if (!conf->resync_thread) {
29048 + LOG_DEFAULT("%s: waking up raid5resync.\n", __FUNCTION__);
29049 + conf->resync_parity = 1;
29050 + evms_cs_wakeup_thread(conf->resync_thread);
29053 + LOG_DEFAULT("%s: no restart-resync needed.\n", __FUNCTION__);
29058 +raid5_stop(mddev_t * mddev)
29060 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
29062 + if (conf != NULL) {
29063 + if (conf->resync_thread)
29064 + evms_cs_unregister_thread(conf->resync_thread);
29065 + evms_cs_unregister_thread(conf->thread);
29066 + shrink_stripes(conf, conf->max_nr_stripes);
29067 + free_pages((unsigned long) conf->stripe_hashtbl,
29068 + HASH_PAGES_ORDER);
29070 + mddev->private = NULL;
29072 + MOD_DEC_USE_COUNT;
29078 +print_sh(struct stripe_head *sh)
29082 + LOG_DEFAULT("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector,
29083 + sh->size, sh->pd_idx, sh->state);
29084 + LOG_DEFAULT("sh %lu, count %d.\n", sh->sector,
29085 + atomic_read(&sh->count));
29086 + LOG_DEFAULT("sh %lu, ", sh->sector);
29087 + for (i = 0; i < MD_SB_DISKS; i++) {
29088 + if (sh->bh_cache[i])
29089 + LOG_DEFAULT("(cache%d: %p %ld) ", i, sh->bh_cache[i],
29090 + sh->bh_cache[i]->b_state);
29092 + LOG_DEFAULT("\n");
29096 +printall(raid5_conf_t * conf)
29098 + struct stripe_head *sh;
29101 + md_spin_lock_irq(&conf->device_lock);
29102 + for (i = 0; i < NR_HASH; i++) {
29103 + sh = conf->stripe_hashtbl[i];
29104 + for (; sh; sh = sh->hash_next) {
29105 + if (sh->raid_conf != conf)
29110 + md_spin_unlock_irq(&conf->device_lock);
29115 +raid5_status(char *page, mddev_t * mddev)
29117 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
29118 + mdp_super_t *sb = mddev->sb;
29122 + sprintf(page + sz, " level %d, %dk chunk, algorithm %d", sb->level,
29123 + sb->chunk_size >> 10, sb->layout);
29125 + sprintf(page + sz, " [%d/%d] [", conf->raid_disks,
29126 + conf->working_disks);
29127 + for (i = 0; i < conf->raid_disks; i++)
29129 + sprintf(page + sz, "%s",
29130 + conf->disks[i].operational ? "U" : "_");
29131 + sz += sprintf(page + sz, "]");
29134 + sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x))
29141 +print_raid5_conf(raid5_conf_t * conf)
29144 + struct disk_info *tmp;
29146 + LOG_DEFAULT("RAID5 conf printout:\n");
29148 + LOG_DEFAULT("(conf==NULL)\n");
29151 + LOG_DEFAULT(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
29152 + conf->working_disks, conf->failed_disks);
29155 + for (i = 0; i < MD_SB_DISKS; i++) {
29157 + for (i = 0; i < conf->working_disks + conf->failed_disks; i++) {
29159 + tmp = conf->disks + i;
29160 + LOG_DEFAULT(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
29161 + i, tmp->spare, tmp->operational,
29162 + tmp->number, tmp->raid_disk, tmp->used_slot,
29163 + evms_md_partition_name(tmp->node));
29168 +raid5_diskop(mddev_t * mddev, mdp_disk_t ** d, int state)
29171 + int i, failed_disk = -1, spare_disk = -1, removed_disk = -1;
29172 + raid5_conf_t *conf = mddev->private;
29173 + struct disk_info *tmp, *sdisk, *fdisk, *rdisk;
29174 + mdp_super_t *sb = mddev->sb;
29175 + mdp_disk_t *failed_desc, *spare_desc;
29176 + mdk_rdev_t *spare_rdev, *failed_rdev;
29178 + print_raid5_conf(conf);
29179 + md_spin_lock_irq(&conf->device_lock);
29181 + * find the disk ...
29185 + case DISKOP_SPARE_ACTIVE:
29188 + * Find the failed disk within the RAID5 configuration ...
29189 + * (this can only be in the first conf->raid_disks part)
29191 + for (i = 0; i < conf->raid_disks; i++) {
29192 + tmp = conf->disks + i;
29193 + if ((!tmp->operational && !tmp->spare) ||
29194 + !tmp->used_slot) {
29200 + * When we activate a spare disk we _must_ have a disk in
29201 + * the lower (active) part of the array to replace.
29203 + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
29208 + /* fall through */
29210 + case DISKOP_SPARE_WRITE:
29211 + case DISKOP_SPARE_INACTIVE:
29214 + * Find the spare disk ... (can only be in the 'high'
29215 + * area of the array)
29217 + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
29218 + tmp = conf->disks + i;
29219 + if (tmp->spare && tmp->number == (*d)->number) {
29224 + if (spare_disk == -1) {
29231 + case DISKOP_HOT_REMOVE_SPARE:
29233 + for (i = 0; i < MD_SB_DISKS; i++) {
29234 + tmp = conf->disks + i;
29235 + if (tmp->used_slot && (tmp->number == (*d)->number)) {
29236 + if (tmp->operational) {
29239 + } else if (!tmp->spare) {
29244 + removed_disk = i;
29248 + if (removed_disk == -1) {
29255 + case DISKOP_HOT_REMOVE_DISK:
29256 + for (i = 0; i < MD_SB_DISKS; i++) {
29257 + tmp = conf->disks + i;
29258 + if (tmp->used_slot && (tmp->number == (*d)->number)) {
29259 + if (i < conf->raid_disks) {
29260 + if (conf->working_disks !=
29261 + conf->raid_disks) {
29263 + * Can't remove a disk from an
29264 + * array that is running in
29270 + if (sb->spare_disks == 0) {
29272 + * Must have a spare ready
29273 + * before removing an active
29280 + removed_disk = i;
29284 + if (removed_disk == -1) {
29291 + case DISKOP_HOT_ADD_DISK:
29299 + * Switch the spare disk to write-only mode:
29301 + case DISKOP_SPARE_WRITE:
29302 + if (conf->spare) {
29307 + sdisk = conf->disks + spare_disk;
29308 + sdisk->operational = 1;
29309 + sdisk->write_only = 1;
29310 + conf->spare = sdisk;
29313 + * Deactivate a spare disk:
29315 + case DISKOP_SPARE_INACTIVE:
29316 + sdisk = conf->disks + spare_disk;
29317 + sdisk->operational = 0;
29318 + sdisk->write_only = 0;
29320 + * Was the spare being resynced?
29322 + if (conf->spare == sdisk)
29323 + conf->spare = NULL;
29326 + * Activate (mark read-write) the (now sync) spare disk,
 29327 + * which means we switch its 'raid position' (->raid_disk)
29328 + * with the failed disk. (only the first 'conf->raid_disks'
29329 + * slots are used for 'real' disks and we must preserve this
29332 + case DISKOP_SPARE_ACTIVE:
29333 + if (!conf->spare) {
29338 + sdisk = conf->disks + spare_disk;
29339 + fdisk = conf->disks + failed_disk;
29341 + spare_desc = &sb->disks[sdisk->number];
29342 + failed_desc = &sb->disks[fdisk->number];
29344 + if (spare_desc != *d) {
29350 + if (spare_desc->raid_disk != sdisk->raid_disk) {
29356 + if (sdisk->raid_disk != spare_disk) {
29362 + if (failed_desc->raid_disk != fdisk->raid_disk) {
29368 + if (fdisk->raid_disk != failed_disk) {
29375 + * do the switch finally
29377 + spare_rdev = evms_md_find_rdev_nr(mddev, spare_desc->number);
29378 + failed_rdev = evms_md_find_rdev_nr(mddev, failed_desc->number);
29380 + /* There must be a spare_rdev, but there may not be a
29381 + * failed_rdev. That slot might be empty...
29383 + spare_rdev->desc_nr = failed_desc->number;
29385 + failed_rdev->desc_nr = spare_desc->number;
29387 + xchg_values(*spare_desc, *failed_desc);
29388 + xchg_values(*fdisk, *sdisk);
29391 + * (careful, 'failed' and 'spare' are switched from now on)
29393 + * we want to preserve linear numbering and we want to
29394 + * give the proper raid_disk number to the now activated
29395 + * disk. (this means we switch back these values)
29398 + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
29399 + xchg_values(sdisk->raid_disk, fdisk->raid_disk);
29400 + xchg_values(spare_desc->number, failed_desc->number);
29401 + xchg_values(sdisk->number, fdisk->number);
29403 + *d = failed_desc;
29405 + //if (sdisk->dev == MKDEV(0,0))
29406 + if (sdisk->node == NULL)
29407 + sdisk->used_slot = 0;
29410 + * this really activates the spare.
29412 + fdisk->spare = 0;
29413 + fdisk->write_only = 0;
29416 + * if we activate a spare, we definitely replace a
29417 + * non-operational disk slot in the 'low' area of
29418 + * the disk array.
29420 + conf->failed_disks--;
29421 + conf->working_disks++;
29422 + conf->spare = NULL;
29426 + case DISKOP_HOT_REMOVE_SPARE:
29427 + rdisk = conf->disks + removed_disk;
29429 + if (rdisk->spare && (removed_disk < conf->raid_disks)) {
29434 + if (conf->spare != NULL) {
29435 + if (conf->spare->number == removed_disk) {
29436 + conf->spare = NULL;
29440 + rdisk->dev = MKDEV(0, 0);
29441 + rdisk->node = NULL;
29442 + rdisk->used_slot = 0;
29446 + case DISKOP_HOT_REMOVE_DISK:
29447 + rdisk = conf->disks + removed_disk;
29448 + if (rdisk->operational) {
29449 + /* We're removing a running disk in the array. */
29450 + conf->working_disks--;
29451 + conf->failed_disks++;
29453 + rdisk->dev = MKDEV(0, 0);
29454 + rdisk->node = NULL;
29455 + rdisk->used_slot = 0;
29456 + rdisk->operational = 0;
29465 + md_spin_unlock_irq(&conf->device_lock);
29466 + print_raid5_conf(conf);
29471 +raid5_bmap(mddev_t * mddev,
29473 + struct evms_logical_node ** node)
29475 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
29476 + const unsigned int raid_disks = conf->raid_disks;
29477 + const unsigned int data_disks = raid_disks - 1;
29478 + unsigned int dd_idx, pd_idx;
29480 + *rsector = (u64) raid5_compute_sector( (unsigned long) *rsector,
29481 + raid_disks, data_disks,
29482 + &dd_idx, &pd_idx, conf);
29483 + *node = conf->disks[dd_idx].node;
29484 + return 0; /* always successful */
29488 +raid5_evms_ioctl(mddev_t * mddev,
29489 + struct inode *inode,
29490 + struct file *file, unsigned int cmd, unsigned long arg)
29493 + struct evms_logical_node *node;
29496 + case EVMS_GET_BMAP:
29498 + struct evms_get_bmap_pkt *bmap = (struct evms_get_bmap_pkt *)arg;
29499 + rc = raid5_bmap(mddev, &bmap->rsector, &node);
29502 + rc = IOCTL(node, inode, file, cmd, arg);
29515 +#define MAX_IO_SIZE 128
29517 +raid5_pers_ioctl(mddev_t * mddev, int cmd, void *args)
29521 + struct r5_sync_io init_io_args;
29523 + int io_size = MAX_IO_SIZE;
29525 + LOG_DETAILS("%s: cmd == %d.\n", __FUNCTION__, cmd);
29527 + case EVMS_MD_RAID5_INIT_IO:
29529 + if (copy_from_user
29530 + (&init_io_args, (struct r5_sync_io *) args,
29531 + sizeof (init_io_args))) {
 29534 + /* allocate an I/O buffer of up to 64 Kbytes in size */
29535 + if (init_io_args.nr_sects < MAX_IO_SIZE)
29536 + io_size = init_io_args.nr_sects;
29538 + /* allocate buffer large enough to hold a single sector */
29539 + data = kmalloc(io_size << EVMS_VSECTOR_SIZE_SHIFT, GFP_KERNEL);
29543 + u64 io_sector_offset, io_remaining;
29545 + u_char *user_buffer_ptr;
29547 + io_remaining = init_io_args.nr_sects;
29548 + io_sector_offset = 0;
29549 + user_buffer_ptr = init_io_args.data;
29550 + while (io_remaining) {
29551 + /* compute the io_size for this pass */
29552 + io_size = (io_remaining >= MAX_IO_SIZE) ?
29553 + MAX_IO_SIZE : io_remaining;
29555 + io_bytes = io_size << EVMS_VSECTOR_SIZE_SHIFT;
29556 + if (init_io_args.rw == WRITE) {
29557 + if (copy_from_user(data,
29565 + rc = evms_md_sync_io(mddev->node,
29567 + init_io_args.lsn +
29568 + io_sector_offset, io_size,
29574 + if (init_io_args.rw != WRITE) {
29575 + if (copy_to_user(user_buffer_ptr,
29582 + user_buffer_ptr += io_bytes;
29583 + io_sector_offset += io_size;
29584 + io_remaining -= io_size;
29596 +static mdk_personality_t raid5_personality = {
29597 + .name = "evms_raid5",
29598 + .read = raid5_read,
29599 + .write = raid5_write,
29600 + .run = raid5_run,
29601 + .stop = raid5_stop,
29602 + .status = raid5_status,
29603 + .error_handler = raid5_error,
29604 + .diskop = raid5_diskop,
29605 + .stop_resync = raid5_stop_resync,
29606 + .restart_resync = raid5_restart_resync,
29607 + .sync_request = raid5_sync_request,
29608 + .evms_ioctl = raid5_evms_ioctl,
29609 + .md_pers_ioctl = raid5_pers_ioctl
29612 +static int md__init
29615 + return evms_register_md_personality(RAID5, &raid5_personality);
29621 + evms_unregister_md_personality(RAID5);
29624 +module_init(raid5_init);
29625 +module_exit(raid5_exit);
29626 +#ifdef MODULE_LICENSE
29627 +MODULE_LICENSE("GPL");
29629 diff -Naur linux-2002-09-30/drivers/evms/md_xor.c evms-2002-09-30/drivers/evms/md_xor.c
29630 --- linux-2002-09-30/drivers/evms/md_xor.c Wed Dec 31 18:00:00 1969
29631 +++ evms-2002-09-30/drivers/evms/md_xor.c Fri Mar 1 11:50:58 2002
29634 + * md_xor.c : Multiple Devices driver for Linux
29636 + * Copyright (C) 1996, 1997, 1998, 1999, 2000,
29637 + * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
29639 + * Dispatch optimized RAID-5 checksumming functions.
29641 + * 'md_xor.c' is an EVMS version of linux/drivers/md/xor.c modified
29642 + * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
29644 + * This program is free software; you can redistribute it and/or modify
29645 + * it under the terms of the GNU General Public License as published by
29646 + * the Free Software Foundation; either version 2, or (at your option)
29647 + * any later version.
29649 + * You should have received a copy of the GNU General Public License
29650 + * (for example /usr/src/linux/COPYING); if not, write to the Free
29651 + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29654 +#define BH_TRACE 0
29655 +#include <linux/module.h>
29656 +#include <linux/evms/evms_md.h>
29657 +#include <linux/evms/evms_xor.h>
29658 +#include <asm/xor.h>
29660 +#define LOG_PREFIX "md raid5: "
29661 +/* The xor routines to use. */
29662 +static struct xor_block_template *active_template;
29665 +evms_md_xor_block(unsigned int count, struct buffer_head **bh_ptr)
29667 + unsigned long *p0, *p1, *p2, *p3, *p4;
29668 + unsigned long bytes = bh_ptr[0]->b_size;
29670 + p0 = (unsigned long *) bh_ptr[0]->b_data;
29671 + p1 = (unsigned long *) bh_ptr[1]->b_data;
29672 + if (count == 2) {
29673 + active_template->do_2(bytes, p0, p1);
29677 + p2 = (unsigned long *) bh_ptr[2]->b_data;
29678 + if (count == 3) {
29679 + active_template->do_3(bytes, p0, p1, p2);
29683 + p3 = (unsigned long *) bh_ptr[3]->b_data;
29684 + if (count == 4) {
29685 + active_template->do_4(bytes, p0, p1, p2, p3);
29689 + p4 = (unsigned long *) bh_ptr[4]->b_data;
29690 + active_template->do_5(bytes, p0, p1, p2, p3, p4);
29693 +/* Set of all registered templates. */
29694 +static struct xor_block_template *template_list;
29696 +#define BENCH_SIZE (PAGE_SIZE)
29699 +do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
29702 + unsigned long now;
29703 + int i, count, max;
29705 + tmpl->next = template_list;
29706 + template_list = tmpl;
29709 + * Count the number of XORs done during a whole jiffy, and use
29710 + * this to calculate the speed of checksumming. We use a 2-page
29711 + * allocation to have guaranteed color L1-cache layout.
29714 + for (i = 0; i < 5; i++) {
29717 + while (jiffies == now) {
29719 + tmpl->do_2(BENCH_SIZE, b1, b2);
29728 + speed = max * (HZ * BENCH_SIZE / 1024);
29729 + tmpl->speed = speed;
29731 + LOG_DEFAULT(" %-10s: %5d.%03d MB/sec\n", tmpl->name,
29732 + speed / 1000, speed % 1000);
29736 +calibrate_xor_block(void)
29739 + struct xor_block_template *f, *fastest;
29741 + b1 = (void *) md__get_free_pages(GFP_KERNEL, 2);
29743 + LOG_ERROR("Yikes! No memory available.\n");
29746 + b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
29748 + LOG_DEFAULT("measuring checksumming speed\n");
29751 +#define xor_speed(templ) do_xor_speed((templ), b1, b2)
29753 + XOR_TRY_TEMPLATES;
29757 + free_pages((unsigned long)b1, 2);
29759 + fastest = template_list;
29760 + for (f = fastest; f; f = f->next)
29761 + if (f->speed > fastest->speed)
29764 +#ifdef XOR_SELECT_TEMPLATE
29765 + fastest = XOR_SELECT_TEMPLATE(fastest);
29768 + active_template = fastest;
29769 + LOG_DEFAULT("using function: %s (%d.%03d MB/sec)\n",
29770 + fastest->name, fastest->speed / 1000, fastest->speed % 1000);
29775 +MD_EXPORT_SYMBOL(evms_md_xor_block);
29777 +#ifdef MODULE_LICENSE
29778 +MODULE_LICENSE("GPL");
29781 +module_init(calibrate_xor_block);
29782 diff -Naur linux-2002-09-30/drivers/evms/os2lvm_vge.c evms-2002-09-30/drivers/evms/os2lvm_vge.c
29783 --- linux-2002-09-30/drivers/evms/os2lvm_vge.c Wed Dec 31 18:00:00 1969
29784 +++ evms-2002-09-30/drivers/evms/os2lvm_vge.c Fri Sep 13 16:09:55 2002
29788 + * Copyright (c) International Business Machines Corp., 2001
29790 + * This program is free software; you can redistribute it and/or modify
29791 + * it under the terms of the GNU General Public License as published by
29792 + * the Free Software Foundation; either version 2 of the License, or
29793 + * (at your option) any later version.
29795 + * This program is distributed in the hope that it will be useful,
29796 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
29797 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
29798 + * the GNU General Public License for more details.
29800 + * You should have received a copy of the GNU General Public License
29801 + * along with this program; if not, write to the Free Software
29802 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29807 + * linux/drivers/evms/os2lvm_vge.c
29809 + * EVMS OS/2 LVM Emulator
29811 + * This Volume Group Emulator will take the type 0x35 partitions created by
29812 + * OS/2 versions 4.5 and later and build them into volumes. It emulates
29813 + * the Drive Linking and Bad Block Relocation features and therefore
29814 + * provides binary compatibility with the OS/2 version. Of course, if
29815 + * you select to mkfs a file system OS/2 doesn't support, you're on your
29818 + * Since OS/2 LVM volumes can only exist on DOS-style partitioned disks,
29819 + * this VGE has a dependency on dospart.c to report a list of the
29820 + * candidate partitions. This module will then take the appropriate partitions
29821 + * from the list and use them to build the OS/2-style volumes.
29823 + * Change Activity:
29825 + * 7/01/2001 John Stiles getting started.
29826 + * 9/14/2001 John Stiles original version.
29827 + * 11/01/2001 John Stiles new naming scheme.
29828 + * 11/21/2001 John Stiles i/o path changes.
29831 +#define EVMS_DEBUG 1
29832 +#define EVMS_OS2_DEBUG 1
29834 +#include <linux/module.h>
29835 +#include <linux/kernel.h>
29836 +#include <linux/config.h>
29837 +#include <linux/genhd.h>
29838 +#include <linux/string.h>
29839 +#include <linux/blk.h>
29840 +#include <linux/init.h>
29841 +#include <linux/evms/evms.h>
29842 +#include <linux/evms/evms_os2.h>
29843 +#include <asm/uaccess.h>
29844 +#include <asm/atomic.h>
29846 +#define LOG_PREFIX "os2lvm: "
29848 +// Global Structure and Type definitions
29849 +struct transfer_record {
29850 + int Write_Flag; /* 0 = read, 1 = write */
29851 + struct os2_dl_entry *Partition_Data;
29852 + struct buffer_head *bh;
29853 + struct transfer_record *next;
29856 +struct tracking_record { /* structure used to track IO requests that must be broken into two pieces due to drive linking */
29857 + unsigned int io_in_progress;
29859 + struct buffer_head *org_bh; /* Original IO */
29860 + struct buffer_head *link1_bh; /* First child. */
29861 + struct os2_dl_entry *link1_data;
29862 + struct transfer_record *link1_transfer_rec;
29863 + int link1_bbr_attempted;
29864 + struct buffer_head *link2_bh; /* Second child */
29865 + struct os2_dl_entry *link2_data;
29866 + struct transfer_record *link2_transfer_rec;
29867 + int link2_bbr_attempted;
29870 +// Prototypes for local VGE functions
29871 +static int discover_os2lvm_partitions(struct evms_logical_node **);
29872 +static struct evms_logical_node *find_os2_volume(u32);
29873 +static int add_os2link(struct os2_dl_entry *,
29874 + struct evms_logical_node *);
29875 +static struct os2_dl_entry
29876 + *find_link_data(struct os2_dl_entry **, u32);
29877 +static int find_drive_link(struct evms_logical_node *,
29878 + struct os2_dl_entry **, u64 *, u64 *);
29879 +static int validate_signaturesector(struct evms_logical_node *,
29880 + LVM_Signature_Sector *, u32);
29881 +static int validate_drivelinksector(void *, int, u32);
29882 +static int validate_bbrtablesector(void *, int, u32);
29883 +static u32 check_for_os2_bbr_relocations(char *);
29884 +static int check_os2_volumes(struct evms_logical_node **);
29885 +static int OS2_ioctl_cmd_broadcast(struct evms_logical_node *node,
29886 + struct inode *inode, struct file *file,
29887 + unsigned long cmd, unsigned long arg);
29888 +static int os2_ioctl_cmd_plugin_ioctl(struct evms_logical_node *node,
29889 + struct inode *inode, struct file *file,
29890 + unsigned long cmd, unsigned long arg);
29891 +static void BBR_Worker(void *);
29892 +static void OS2_BBR_Write_Callback(struct transfer_record * Transfer_Record,
29893 + struct buffer_head *bh,
29894 + int uptodate, int *redrive);
29895 +static void BBR_Transfer_IO(struct transfer_record * Transfer_Record);
29896 +static void OS2_DL_Callback(struct buffer_head *bh, int uptodate);
29897 +static int Sector_Is_Remapped(struct os2_dl_entry * io_dlentry,
29898 + u64 Source_Sector, u64 * Replacement_Sector);
29899 +static void Invalidate_Mapping(struct os2_dl_entry * io_dlentry,
29900 + u64 Source_Sector,
29901 + int Replacement_Sector_Is_Bad);
29902 +static int Create_New_BBR_Table_Entry(struct os2_dl_entry *
29903 + io_dlentry, u64 starting_lsn,
29904 + unsigned int count, void *buffer);
29905 +static void Clone_Bufferhead(struct buffer_head *Source,
29906 + struct buffer_head *Child);
29908 +// Prototypes for local memory allocation/deallocation functions
29909 +static struct os2_dl_entry *new_os2_drive_link(LVM_Signature_Sector *,
29911 + evms_logical_node *);
29912 +static char *new_os2_link_data(u32, u32, u32, struct evms_logical_node *);
29913 +static char *new_os2_bbr_data(u32, u32, u32, struct evms_logical_node *);
29914 +static struct evms_logical_node *new_os2volume(u32, char *);
29915 +static int delete_os2lvm_volume(struct evms_logical_node *);
29916 +static int delete_os2_drive_link(struct os2_dl_entry *, int);
29918 +// Prototypes for Function Table interface
29919 +static int discover_os2lvm(struct evms_logical_node **);
29920 +static int delete_os2lvm(struct evms_logical_node *);
29921 +static void read_os2lvm(struct evms_logical_node *, struct buffer_head *);
29922 +static void write_os2lvm(struct evms_logical_node *, struct buffer_head *);
29923 +static int init_io_os2lvm(struct evms_logical_node *, int, u64, u64, void *);
29924 +static int ioctl_os2lvm(struct evms_logical_node *, struct inode *,
29925 + struct file *, unsigned int, unsigned long);
29926 +static int do_os2_bbr_io(struct os2_dl_entry *, int, u64, u64,
29929 +// Global data structures
29930 +static struct evms_logical_node *os2lvm_nodes = NULL;
29931 +static struct evms_thread *BBR_Worker_Thread = NULL;
29932 +static spinlock_t BBR_Queue_Lock = SPIN_LOCK_UNLOCKED;
29933 +static const char *BBR_Worker_Name = "evms_os2_bbr_io";
29934 +static struct transfer_record *BBR_IO_List_Head = NULL;
29935 +static struct transfer_record *BBR_IO_List_Tail = NULL;
29936 +static struct evms_pool_mgmt *BBR_Transfer_Pool = NULL;
29937 +static char *BBR_Transfer_Pool_Name = "OS-2 Transfer Pool";
29938 +static char *DL_Tracking_Pool_Name = "OS-2 Tracking Pool";
29939 +static struct evms_pool_mgmt *DL_Tracking_Pool = NULL;
29941 +// Required plug-in Function Table definition
29942 +static struct evms_plugin_fops function_table = {
29943 + .discover = discover_os2lvm,
29944 + .delete = delete_os2lvm,
29945 + .read = read_os2lvm,
29946 + .write = write_os2lvm,
29947 + .init_io = init_io_os2lvm,
29948 + .ioctl = ioctl_os2lvm
29951 +// Required plug-in Header definition
29952 +static struct evms_plugin_header plugin_header = {
29953 + .id = SetPluginID(IBM_OEM_ID,
29954 + EVMS_REGION_MANAGER,
29961 + .required_services_version = {
29962 + .major = EVMS_COMMON_SERVICES_MAJOR,
29963 + .minor = EVMS_COMMON_SERVICES_MINOR,
29964 + .patchlevel = EVMS_COMMON_SERVICES_PATCHLEVEL
29966 + .fops = &function_table
29969 +// Required Plugin Functions
29972 + * Function: discover_os2lvm
29974 + * This is the entry point into the discovery process.
29977 +discover_os2lvm(struct evms_logical_node **evms_partition_list)
29981 + MOD_INC_USE_COUNT;
29983 + if (!BBR_Transfer_Pool) {
29984 + BBR_Transfer_Pool =
29985 + evms_cs_create_pool(sizeof (struct transfer_record),
29986 + BBR_Transfer_Pool_Name, NULL, NULL);
29987 + if (!BBR_Transfer_Pool) {
29988 + MOD_DEC_USE_COUNT;
29993 + if (!DL_Tracking_Pool) {
29994 + DL_Tracking_Pool =
29995 + evms_cs_create_pool(sizeof (struct tracking_record),
29996 + DL_Tracking_Pool_Name, NULL, NULL);
29997 + if (!DL_Tracking_Pool) {
29998 + MOD_DEC_USE_COUNT;
30003 + rc = discover_os2lvm_partitions(evms_partition_list);
30006 + rc = check_os2_volumes(evms_partition_list);
30009 + MOD_DEC_USE_COUNT;
30014 + * Function: delete_os2lvm
30016 + * This is the entry point for deleting a node.
30019 +delete_os2lvm(struct evms_logical_node *logical_node)
30021 + LOG_EXTRA("Deleting volume: %s\n", logical_node->name);
30023 + return delete_os2lvm_volume(logical_node);
30027 + * Function: read_os2lvm
30030 +read_os2lvm(struct evms_logical_node *node, struct buffer_head *bh)
30033 + u64 sector_count;
30035 + struct buffer_head *Link1 = NULL;
30036 + struct buffer_head *Link2 = NULL;
30037 + struct tracking_record *Tracking_Record = NULL;
30038 + struct os2_dl_entry *cur_dlentry = NULL;
30039 + struct transfer_record *Transfer_Record;
30041 + rsector = bh->b_rsector;
30042 + sector_count = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
30043 + rc = find_drive_link(node, &cur_dlentry, &rsector, §or_count);
30044 + bh->b_rsector = rsector;
30047 + if (cur_dlentry->bbr_is_active) {
30048 + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */
30049 + /* Transfer the IO to the BBR Worker Thread. */
30050 + Transfer_Record->Write_Flag = 0;
30051 + Transfer_Record->Partition_Data = cur_dlentry;
30052 + Transfer_Record->bh = bh;
30053 + Transfer_Record->next = NULL;
30054 + BBR_Transfer_IO(Transfer_Record);
30056 + R_IO(cur_dlentry->link_partition, bh);
30059 + /* We must split the IO. Duplicate the buffer head twice and allocate the tracking record. */
30060 + Tracking_Record = evms_cs_allocate_from_pool(DL_Tracking_Pool, 1); /* Block until we get a tracking record. */
30061 + Link1 = evms_cs_allocate_from_pool(evms_bh_pool, 1);
30062 + Link2 = evms_cs_allocate_from_pool(evms_bh_pool, 1);
30064 + /* Initialize the tracking record so we can associate the two new I/Os with the original. */
30065 + Tracking_Record->io_in_progress = 2;
30066 + Tracking_Record->up_to_date = 0;
30067 + Tracking_Record->org_bh = bh;
30069 + /* Create the I/O to the first link. */
30070 + Clone_Bufferhead(bh, Link1);
30071 + Link1->b_private = Tracking_Record;
30072 + Link1->b_end_io = OS2_DL_Callback;
30073 + Link1->b_size = sector_count << EVMS_VSECTOR_SIZE_SHIFT;
30074 + Tracking_Record->link1_bh = Link1;
30075 + Tracking_Record->link1_data = cur_dlentry;
30076 + Tracking_Record->link1_bbr_attempted = 0;
30077 + Tracking_Record->link1_transfer_rec = NULL;
30079 + /* Create the I/O to the second link */
30080 + Clone_Bufferhead(bh, Link2);
30081 + Link2->b_private = Tracking_Record;
30082 + Link2->b_end_io = OS2_DL_Callback;
30083 + Link2->b_data += sector_count << EVMS_VSECTOR_SIZE_SHIFT;
30084 + Link2->b_rsector = 0;
30086 + bh->b_size - (sector_count << EVMS_VSECTOR_SIZE_SHIFT);
30087 + Tracking_Record->link2_bh = Link2;
30088 + Tracking_Record->link2_data = cur_dlentry->next;
30089 + Tracking_Record->link2_bbr_attempted = 0;
30090 + Tracking_Record->link2_transfer_rec = NULL;
30092 + /* Process the I/O to the first link. */
30093 + if (cur_dlentry->bbr_is_active) {
30094 + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */
30095 + /* Transfer the IO to the BBR Worker Thread. */
30096 + Transfer_Record->Write_Flag = 0;
30097 + Transfer_Record->Partition_Data = cur_dlentry;
30098 + Transfer_Record->bh = Tracking_Record->link1_bh;
30099 + Transfer_Record->next = NULL;
30100 + BBR_Transfer_IO(Transfer_Record);
30102 + R_IO(cur_dlentry->link_partition,
30103 + Tracking_Record->link1_bh);
30105 + /* Process the I/O to the second link. */
30106 + cur_dlentry = cur_dlentry->next;
30107 + if (cur_dlentry->bbr_is_active) {
30108 + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */
30109 + /* Transfer the IO to the BBR Worker Thread. */
30110 + Transfer_Record->Write_Flag = 0;
30111 + Transfer_Record->Partition_Data = cur_dlentry;
30112 + Transfer_Record->bh = Tracking_Record->link2_bh;
30113 + Transfer_Record->next = NULL;
30114 + BBR_Transfer_IO(Transfer_Record);
30116 + R_IO(cur_dlentry->link_partition,
30117 + Tracking_Record->link2_bh);
30121 + LOG_SERIOUS("READ error, request exceeds volume size.\n");
30122 + bh->b_end_io(bh, 0);
30128 + * Function: write_os2lvm
30131 +write_os2lvm(struct evms_logical_node *node, struct buffer_head *bh)
30135 + u64 sector_count;
30136 + struct buffer_head *Link1 = NULL;
30137 + struct buffer_head *Link2 = NULL;
30138 + struct tracking_record *Tracking_Record = NULL;
30139 + struct os2_dl_entry *cur_dlentry = NULL;
30140 + struct transfer_record *Transfer_Record;
30142 + rsector = bh->b_rsector;
30143 + sector_count = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
30144 + rc = find_drive_link(node, &cur_dlentry, &rsector, §or_count);
30145 + bh->b_rsector = rsector;
30148 + /* Set up a Transfer Record. If there are Bad Blocks on the partition that this I/O is
30149 + directed to, then we will need the Transfer Record to put the I/O in the queue for the
30150 + BBR Worker Thread. If there are no bad blocks, then we will need the Transfer Record
30151 + for the OS2_BBR_Write_Callback function. This function expects the Transfer Record to
30152 + be pre-allocated and available because it is running on an interrupt thread and should
30153 + not do memory allocation. If there is an error during the write, then the
30154 + OS2_BBR_Write_Callback function will use the Transfer Record to transfer the I/O
30155 + to the BBR worker thread for further processing. If there are no errors during the I/O,
30156 + then the OS2_BBR_Write_Callback will deallocate the Transfer Record. */
30157 + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */
30158 + Transfer_Record->Write_Flag = 1;
30159 + Transfer_Record->Partition_Data = cur_dlentry;
30160 + Transfer_Record->bh = bh;
30161 + Transfer_Record->next = NULL;
30162 + if (cur_dlentry->bbr_is_active) {
30163 + /* Transfer the IO to the BBR Worker Thread. */
30164 + BBR_Transfer_IO(Transfer_Record);
30166 + evms_cs_register_for_end_io_notification
30167 + (Transfer_Record, bh, OS2_BBR_Write_Callback);
30168 + W_IO(cur_dlentry->link_partition, bh);
30172 + /* We must split the IO. Duplicate the buffer head twice and allocate the tracking record. */
30173 + Tracking_Record = evms_cs_allocate_from_pool(DL_Tracking_Pool, 1); /* Block until we get a tracking record. */
30174 + Link1 = evms_cs_allocate_from_pool(evms_bh_pool, 1);
30175 + Link2 = evms_cs_allocate_from_pool(evms_bh_pool, 1);
30177 + /* Initialize the tracking record so we can associate the two new I/Os with the original. */
30178 + Tracking_Record->io_in_progress = 2;
30179 + Tracking_Record->up_to_date = 0;
30180 + Tracking_Record->org_bh = bh;
30182 + /* Create the I/O to the first link. */
30183 + Clone_Bufferhead(bh, Link1);
30184 + Link1->b_private = Tracking_Record;
30185 + Link1->b_end_io = OS2_DL_Callback;
30186 + Link1->b_size = sector_count << EVMS_VSECTOR_SIZE_SHIFT;
30187 + Tracking_Record->link1_bh = Link1;
30188 + Tracking_Record->link1_data = cur_dlentry;
30190 + /* Create the I/O to the second link */
30191 + Clone_Bufferhead(bh, Link2);
30192 + Link2->b_private = Tracking_Record;
30193 + Link2->b_end_io = OS2_DL_Callback;
30194 + Link2->b_data += sector_count << EVMS_VSECTOR_SIZE_SHIFT;
30195 + Link2->b_rsector = 0;
30197 + bh->b_size - (sector_count << EVMS_VSECTOR_SIZE_SHIFT);
30198 + Tracking_Record->link2_bh = Link2;
30199 + Tracking_Record->link2_data = cur_dlentry->next;
30201 + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */
30202 + Transfer_Record->Write_Flag = 1;
30203 + Transfer_Record->Partition_Data = cur_dlentry;
30204 + Transfer_Record->bh = Tracking_Record->link1_bh;
30205 + Transfer_Record->next = NULL;
30206 + Tracking_Record->link1_transfer_rec = Transfer_Record;
30207 + /* Process the I/O to the first link. */
30208 + if (cur_dlentry->bbr_is_active) {
30209 + /* Transfer the IO to the BBR Worker Thread. */
30210 + Tracking_Record->link1_bbr_attempted = 1;
30211 + BBR_Transfer_IO(Transfer_Record);
30213 + Tracking_Record->link1_bbr_attempted = 0;
30214 + W_IO(cur_dlentry->link_partition,
30215 + Tracking_Record->link1_bh);
30218 + /* Process the I/O to the second link. */
30219 + cur_dlentry = cur_dlentry->next;
30220 + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */
30221 + Transfer_Record->Write_Flag = 1;
30222 + Transfer_Record->Partition_Data = cur_dlentry;
30223 + Transfer_Record->bh = Tracking_Record->link2_bh;
30224 + Transfer_Record->next = NULL;
30225 + Tracking_Record->link2_transfer_rec = Transfer_Record;
30226 + if (cur_dlentry->bbr_is_active) {
30227 + /* Transfer the IO to the BBR Worker Thread. */
30228 + Tracking_Record->link2_bbr_attempted = 1;
30229 + BBR_Transfer_IO(Transfer_Record);
30231 + Tracking_Record->link2_bbr_attempted = 0;
30232 + W_IO(cur_dlentry->link_partition,
30233 + Tracking_Record->link2_bh);
30238 + LOG_SERIOUS("WRITE error, request exceeds volume size.\n");
30239 + bh->b_end_io(bh, 0);
30245 +os2_ioctl_cmd_plugin_ioctl(struct evms_logical_node *node,
30246 + struct inode *inode,
30247 + struct file *file,
30248 + unsigned long cmd, unsigned long arg)
30251 + os2_volume_runtime_entry_t *Node_Data;
30252 + struct os2_dl_entry *curlink, *nextlink;
30253 + struct evms_plugin_ioctl_pkt tmp, *user_parms;
30255 + user_parms = (struct evms_plugin_ioctl_pkt *) arg;
30256 + /* copy user's parameters to kernel space */
30257 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
30261 + Node_Data = (os2_volume_runtime_entry_t *) node->private;
30262 + /* is this cmd targetted at this feature ? */
30263 + if (tmp.feature_id == node->plugin->id) {
30264 + switch (tmp.feature_command) {
30268 + } else { /* broadcast this cmd to all children */
30269 + curlink = Node_Data->drive_link;
30271 + /* broadcast this cmd to all children */
30272 + while (curlink) {
30273 + nextlink = curlink->next;
30275 + rc = IOCTL(curlink->link_partition, inode, file,
30281 + curlink = nextlink;
30285 + /* copy info to userspace */
30286 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
30293 +OS2_ioctl_cmd_broadcast(struct evms_logical_node *node,
30294 + struct inode *inode,
30295 + struct file *file, unsigned long cmd, unsigned long arg)
30298 + os2_volume_runtime_entry_t *Node_Data;
30299 + struct os2_dl_entry *curlink, *nextlink;
30301 + Node_Data = (os2_volume_runtime_entry_t *) node->private;
30302 + curlink = Node_Data->drive_link;
30304 + /* broadcast this cmd to all children */
30305 + while (curlink) {
30306 + nextlink = curlink->next;
30308 + rc |= IOCTL(curlink->link_partition, inode, file, cmd, arg);
30310 + curlink = nextlink;
30317 + * Function: ioctl_os2lvm
30320 +ioctl_os2lvm(struct evms_logical_node *logical_node,
30321 + struct inode *inode,
30322 + struct file *file, unsigned int cmd, unsigned long arg)
30325 + u64 Sectors_Per_Cylinder;
30326 + u64 Total_Sectors;
30327 + struct evms_logical_node *partition_node;
30330 + ((os2_volume_runtime_entry_t *) logical_node->private)->drive_link->
30336 + LOG_EVERYTHING("Ioctl %d\n", cmd);
30339 + case HDIO_GETGEO:
30341 + // Return fake geometry
30342 + struct hd_geometry *hd = (struct hd_geometry *) arg;
30344 + unsigned char heads = 255;
30345 + unsigned char sectors =
30346 + OS2LVM_SYNTHETIC_SECTORS_PER_TRACK;
30349 + /* OS/2 always created a fake geometry using the maximum cylinder size. */
30350 + Sectors_Per_Cylinder = heads * sectors;
30351 + for (cylinders = 0, Total_Sectors = 0;
30353 + ((os2_volume_runtime_entry_t *) logical_node->
30354 + private)->size_in_sectors; cylinders++)
30355 + Total_Sectors += Sectors_Per_Cylinder;
30360 + ((short *) (&hd->cylinders), &cylinders,
30361 + sizeof (cylinders))
30362 + || copy_to_user((char *) (&hd->heads), &heads,
30364 + || copy_to_user((char *) (&hd->sectors), §ors,
30365 + sizeof (sectors))
30366 + || copy_to_user((long *) (&hd->start), &start,
30367 + sizeof (start))) {
30373 + case EVMS_GET_BMAP:
30374 + // No kernel images allowed on OS/2 volumes right now.
30378 + case EVMS_QUIESCE_VOLUME:
30379 + case EVMS_GET_DISK_LIST:
30380 + case EVMS_CHECK_MEDIA_CHANGE:
30381 + case EVMS_REVALIDATE_DISK:
30382 + case EVMS_OPEN_VOLUME:
30383 + case EVMS_CLOSE_VOLUME:
30384 + case EVMS_CHECK_DEVICE_STATUS:
30385 + rc = OS2_ioctl_cmd_broadcast(logical_node, inode, file, cmd,
30388 + case EVMS_PLUGIN_IOCTL:
30389 + rc = os2_ioctl_cmd_plugin_ioctl(logical_node, inode, file, cmd,
30401 + * Function: init_io_os2lvm
30404 +init_io_os2lvm(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */
30405 + u64 sect_nr, /* disk LBA */
30406 + u64 num_sects, /* # of sectors */
30408 +{ /* buffer address */
30410 + u64 sector_count;
30411 + struct evms_logical_node *partition_node;
30412 + struct os2_dl_entry *cur_dlentry = NULL;
30414 + sector_count = num_sects;
30415 + rc = find_drive_link(node, &cur_dlentry, §_nr, §or_count);
30418 + partition_node = cur_dlentry->link_partition;
30419 + if (cur_dlentry->bbr_is_active)
30420 + rc = do_os2_bbr_io(cur_dlentry, io_flag, sect_nr,
30421 + num_sects, buf_addr);
30423 + rc = INIT_IO(partition_node, io_flag, sect_nr,
30424 + num_sects, buf_addr);
30425 + if (rc && io_flag) {
30426 + cur_dlentry->bbr_is_active = 1;
30427 + rc = do_os2_bbr_io(cur_dlentry, io_flag,
30428 + sect_nr, num_sects,
30434 + partition_node = cur_dlentry->link_partition;
30435 + if (cur_dlentry->bbr_is_active)
30436 + rc = do_os2_bbr_io(cur_dlentry, io_flag, sect_nr,
30437 + sector_count, buf_addr);
30439 + rc = INIT_IO(partition_node, io_flag, sect_nr,
30440 + sector_count, buf_addr);
30441 + if (rc && io_flag) {
30442 + cur_dlentry->bbr_is_active = 1;
30443 + rc = do_os2_bbr_io(cur_dlentry, io_flag,
30444 + sect_nr, sector_count,
30450 + cur_dlentry = cur_dlentry->next;
30451 + partition_node = cur_dlentry->link_partition;
30452 + num_sects -= sector_count;
30453 + buf_addr += sector_count << OS2_SECTOR_SHIFT;
30455 + if (cur_dlentry->bbr_is_active)
30456 + rc = do_os2_bbr_io(cur_dlentry, io_flag, 0,
30457 + num_sects, buf_addr);
30459 + rc = INIT_IO(partition_node, io_flag, 0,
30460 + num_sects, buf_addr);
30461 + if (rc && io_flag) {
30462 + cur_dlentry->bbr_is_active = 1;
30463 + rc = do_os2_bbr_io(cur_dlentry, io_flag,
30472 + LOG_SERIOUS("INITIO error, request exceeds volume size.\n");
30480 + * Function: do_os2_bbr_io
30482 + * Check the Bad Block Relocation list for relocated sectors. If any are found,
30483 + * this function will do the i/o directly.
30484 + * Return values: 0 == i/o done, 1 == unable to complete i/o
30487 +do_os2_bbr_io(struct os2_dl_entry * io_dlentry, int rw, /* 0=read, 1=write */
30488 + u64 starting_lsn, /* disk LBA */
30489 + u64 count, /* # of sectors */
30491 +{ /* buffer address */
30492 + u64 lsn, remapped_lsn;
30495 + // For each sector in this request, check if this sector has already
30496 + // been remapped. If so, process all previous sectors in this request,
30497 + // followed by the remapped sector. Then reset the starting lsn and
30498 + // count and keep going with the rest of the request as if it were
30499 + // a whole new request.
30500 + for (lsn = 0; lsn < count; lsn++) {
30501 + remapped_lsn = starting_lsn + lsn;
30502 + rc = Sector_Is_Remapped(io_dlentry, remapped_lsn,
30505 + // Process all sectors in the request up to this one.
30507 + rc = INIT_IO(io_dlentry->link_partition, rw,
30508 + starting_lsn, lsn, buffer);
30510 + /* If this is a read, then we are done. */
30515 + /* Since this was a write, we must see if we can remap the bad sector to a replacement sector. */
30516 + if (!Create_New_BBR_Table_Entry
30517 + (io_dlentry, starting_lsn, lsn,
30519 + /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */
30523 + buffer += (lsn * OS2_BYTES_PER_SECTOR);
30525 + // Process the remapped sector.
30526 + rc = INIT_IO(io_dlentry->link_partition, rw,
30527 + remapped_lsn, 1, buffer);
30529 + /* If this is a read, then we are done. */
30534 + /* Get the original sector that was remapped. */
30535 + remapped_lsn = starting_lsn + lsn;
30537 + /* Invalidate the current remapping. */
30538 + Invalidate_Mapping(io_dlentry, remapped_lsn, 1);
30540 + /* Try to remap the bad sector to another replacement sector. */
30541 + if (!Create_New_BBR_Table_Entry
30542 + (io_dlentry, remapped_lsn, 1, buffer)) {
30543 + /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */
30549 + buffer += OS2_BYTES_PER_SECTOR;
30551 + starting_lsn += (lsn + 1);
30552 + count -= (lsn + 1);
30558 + /* Are there any sectors left to process? */
30560 + rc = INIT_IO(io_dlentry->link_partition, rw, starting_lsn,
30563 + /* If this is a read, then we are done. */
30568 + /* Since this was a write, we must see if we can remap the bad sector to a replacement sector. */
30569 + if (!Create_New_BBR_Table_Entry
30570 + (io_dlentry, starting_lsn, count, buffer)) {
30571 + /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */
30583 + * Function: os2lvm_vge_init
30586 +os2lvm_vge_init(void)
30588 + /* Should I be allocating the pools and BBR Worker Thread here? */
30589 + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
30593 +os2lvm_vge_exit(void)
30595 + /* BUGBUG - Is this where I need to kill the BBR Worker Thread and free any memory I am still holding? */
30597 + evms_cs_unregister_plugin(&plugin_header);
30600 +module_init(os2lvm_vge_init);
30601 +module_exit(os2lvm_vge_exit);
30602 +#ifdef MODULE_LICENSE
30603 +MODULE_LICENSE("GPL");
30606 +// Local VGE Functions
30609 + * Function: discover_os2lvm_partitions
30611 + * Examine the list of logical partitions. Any type 0x35 partition that contains
30612 + * a valid OS/2 signature sector is consumed and added to the appropriate logical
30616 +discover_os2lvm_partitions(struct evms_logical_node **evms_partition_list)
30618 + struct evms_logical_node *evms_partition;
30619 + struct evms_logical_node *next_partition;
30620 + struct evms_logical_node *new_volume;
30621 + u64 sectornum = 0;
30622 + u32 volumeserial;
30624 + char *volumename;
30625 + char driveletter[8];
30626 + LVM_Signature_Sector *sigsector;
30627 + struct os2_dl_entry *new_dlentry;
30629 + LOG_ENTRY_EXIT("Discovering OS/2 Logical Volumes\n");
30630 + sigsect = kmalloc(OS2_BYTES_PER_SECTOR, GFP_KERNEL);
30632 + LOG_SERIOUS("Could not allocate Signature sector data\n");
30636 + for (evms_partition = *evms_partition_list; evms_partition;
30637 + evms_partition = next_partition) {
30638 + // Save the next node. We may remove this one from the list.
30639 + next_partition = evms_partition->next;
30641 + // The node must not have the OS/2 vge id.
30642 + if (evms_partition->plugin->id == plugin_header.id) {
30646 + LOG_EXTRA("Examining partition serial %s\n",
30647 + evms_partition->name);
30649 + // Have to go to the last accessible sector of the partition and
30650 + // read it in. It should be the LVM Signature Sector.
30651 + sectornum = evms_partition->total_vsectors - 1;
30652 + if (INIT_IO(evms_partition, 0, sectornum, 1, sigsect)) {
30653 + // On an I/O error, continue on to the next partition.
30654 + // This means that the volume it belongs to will be incomplete
30655 + // and later deleted in the completeness check.
30656 + LOG_SERIOUS("I/O error on Signature sector read\n");
30659 + sigsector = (LVM_Signature_Sector *) sigsect;
30661 + // Validate the Signature Sector
30662 + if (validate_signaturesector
30663 + (evms_partition, sigsector, OS2_BYTES_PER_SECTOR)) {
30664 + LOG_EXTRA("Signature sector is not valid\n");
30667 +// Bugbug - At this point, we have validated an OS/2 LVM Signature Sector. However, if the partition
30668 +// is not marked as a type 0x35, then this Signature Sector may be erroneous. The problem here is that
30669 +// there is currently no way to find out if this partition was marked as a type 0x35. Also, if we
30670 +// should reject this partition due to some problem with the drive linking or BBR metadata, should we
30671 +// leave the partition in the evms partition list or not? If the partition was marked as a type 0x35
30672 +// and the Signature Sector was valid, then I would say that we should remove it from the evms partition
30673 +// list. If the partition is not marked as a type 0x35 but the Signature Sector is valid, then
30674 +// we could have a stray Signature Sector, in which case the partition should remain in the evms partition
30675 +// list. The OS/2 LVM Signature Sector does have additional information that could be used to resolve
30676 +// this issue, such as the starting LBA of the partition that the Signature Sector belongs to, but
30677 +// we can not get the starting LBA of the partition to compare against. If we leave the partition in
30678 +// the evms partition list when we should not, then an extraneous compatibility volume could result.
30679 + // Build the Metadata for this partition
30682 + new_os2_drive_link(sigsector, evms_partition))) {
30685 + // Search for the parent Volume for this partition
30686 + volumeserial = sigsector->Volume_Serial_Number;
30687 + if (!(new_volume = find_os2_volume(volumeserial))) {
30689 + // If not found, allocate a new Volume
30690 + LOG_EVERYTHING("Parent not found, allocate new.\n");
30691 + if (sigsector->Drive_Letter != '\0') {
30692 + driveletter[0] = sigsector->Drive_Letter;
30693 + driveletter[1] = '\0';
30694 + volumename = driveletter;
30696 + volumename = sigsector->Volume_Name;
30700 + new_os2volume(volumeserial, volumename))) {
30701 + delete_os2_drive_link(new_dlentry, 0);
30702 + new_dlentry = NULL;
30706 + // Now remove the partition from the List
30707 + evms_cs_remove_logical_node_from_list(evms_partition_list,
30710 + if (((os2_volume_runtime_entry_t *) new_volume->private)->
30712 + // Volume is complete, delete this duplicate
30713 + delete_os2_drive_link(new_dlentry, 0);
30714 + LOG_EVERYTHING("Deleting duplicate node.\n");
30715 + ((os2_volume_runtime_entry_t *) new_volume->private)->Export_Needed = 1; //We must export this volume again!
30716 + } else /* Add this partition to its parent Volume */
30717 + add_os2link(new_dlentry, new_volume);
30722 + LOG_ENTRY_EXIT("Finished Discovering OS/2 Logical Volumes\n");
30728 + * Function: find_os2_volume
30730 + * Search for the OS/2 volume that matches the volume serial.
30732 +static struct evms_logical_node *
30733 +find_os2_volume(u32 volumeserial)
30735 + os2_volume_runtime_entry_t *cur_volume;
30736 + struct evms_logical_node *cur_node;
30738 + cur_node = os2lvm_nodes;
30740 + while (cur_node) {
30741 + cur_volume = (os2_volume_runtime_entry_t *) cur_node->private;
30742 + if (cur_volume->Volume_Serial_Number == volumeserial) {
30743 + LOG_EVERYTHING("%s: found volser match.\n",
30747 + LOG_EVERYTHING("%s: volser does not match.\n", __FUNCTION__);
30748 + cur_node = cur_volume->next_os2lvm_node;
30755 + * Function: add_os2link
30757 + * Add the Drive Link metadata to the parent OS/2 volume.
30760 +add_os2link(struct os2_dl_entry * newlink,
30761 + struct evms_logical_node *parent_volume)
30763 + os2_volume_runtime_entry_t *parent_metadata =
30764 + (os2_volume_runtime_entry_t *) parent_volume->private;
30765 + struct os2_dl_entry *curlink =
30766 + parent_metadata->drive_link, *nextlink;
30769 + nextlink = curlink->next;
30770 + while (nextlink) {
30771 + curlink = nextlink;
30772 + nextlink = curlink->next;
30774 + curlink->next = newlink;
30776 + parent_metadata->drive_link = newlink;
30778 + parent_metadata->drive_link_count++;
30779 + parent_metadata->size_in_sectors += newlink->sector_count;
30780 + parent_volume->total_vsectors += newlink->sector_count;
30785 + * Function: find_link_data
30787 + * Find the Drive Link metadata that matches the partition serial number.
30788 + * Remove it from the link_list passed in.
30790 +static struct os2_dl_entry *
30791 +find_link_data(struct os2_dl_entry ** link_list, u32 partitionser)
30793 + struct os2_dl_entry *curlink = *link_list, *prevlink = NULL;
30795 + while (curlink) {
30796 + if (curlink->partition_serial == partitionser) {
30798 + prevlink->next = curlink->next;
30800 + *link_list = curlink->next;
30802 + curlink->next = NULL;
30805 + prevlink = curlink;
30806 + curlink = prevlink->next;
30813 + * Function: find_drive_link
30815 + * Walk the linked list of drive links to find the proper
30816 + * target partition. Returns the metadata associated with
30817 + * the drive link.
30818 + * Return values: 1 == data contained in 1 partition, 2 == data crosses 2 partitions,
30819 + * 0 == target partition not found
30822 +find_drive_link(struct evms_logical_node *node,
30823 + struct os2_dl_entry ** dlentry,
30824 + u64 * sector, u64 * num_sectors)
30826 + u64 last_link_sector, cur_last_sector;
30827 + struct os2_dl_entry *curlink =
30828 + ((os2_volume_runtime_entry_t *) node->private)->drive_link,
30831 + while (curlink) {
30832 + nextlink = curlink->next;
30833 + last_link_sector =
30834 + curlink->start_sector + curlink->sector_count;
30835 + if (*sector < last_link_sector) {
30836 + *dlentry = curlink;
30837 + cur_last_sector = *sector + *num_sectors;
30838 + *sector -= curlink->start_sector;
30840 + ("I/O start_RBA == "PFU64" , sector_count == "PFU64"\n",
30841 + *sector, *num_sectors);
30842 + if (cur_last_sector <= last_link_sector)
30845 + if ((*dlentry)->next)
30847 + cur_last_sector - last_link_sector;
30854 + curlink = nextlink;
30860 +// Allocation/Deallocation Functions
30863 + * Function: new_os2_drive_link
30865 + * Allocate space for a new OS/2 drive link structure.
30866 + * Initialize the appropriate fields.
30867 + * Note: since the BBR info applies to each link, the BBR structures
30868 + * are also initialized here.
30870 +static struct os2_dl_entry *
30871 +new_os2_drive_link(LVM_Signature_Sector * signature_sector,
30872 + struct evms_logical_node *evms_partition)
30875 + u32 feature, feature_size, sectoroffset;
30876 + struct os2_dl_entry *new_dlentry;
30879 + kmalloc(sizeof (struct os2_dl_entry), GFP_KERNEL);
30880 + if (!new_dlentry) {
30881 + LOG_SERIOUS("Could not allocate drivelink metadata\n");
30884 + memset(new_dlentry, 0, sizeof (struct os2_dl_entry));
30885 + new_dlentry->sector_count =
30886 + signature_sector->Partition_Size_To_Report_To_User;
30887 + new_dlentry->partition_serial =
30888 + signature_sector->partition_serial;
30889 + new_dlentry->bbr_is_active = 0; // initialize to not active
30890 + new_dlentry->link_partition = evms_partition;
30891 + init_MUTEX(&(new_dlentry->bbr_table_lock));
30893 + sectoroffset = signature_sector->Partition_Start;
30894 + LOG_EVERYTHING("Partition Start is at LBA %i\n", sectoroffset);
30895 + for (i = 0; i < OS2LVM_MAX_FEATURES_PER_VOLUME; i++) {
30896 + feature = signature_sector->LVM_Feature_Array[i].Feature_ID;
30899 + signature_sector->LVM_Feature_Array[i].
30900 + Feature_Data_Size;
30901 + LOG_EVERYTHING("Entry %d in Feature Table is valid,\n",
30903 + LOG_EVERYTHING("Feature Data size is %i sectors.\n",
30905 + if (feature == DRIVE_LINKING_FEATURE_ID) {
30906 + if (!new_dlentry->link_data) {
30907 + new_dlentry->dl_lsn1 =
30908 + signature_sector->
30909 + LVM_Feature_Array[i].
30910 + Location_Of_Primary_Feature_Data -
30912 + new_dlentry->dl_lsn2 =
30913 + signature_sector->
30914 + LVM_Feature_Array[i].
30915 + Location_Of_Secondary_Feature_Data -
30917 + new_dlentry->link_data =
30918 + new_os2_link_data(new_dlentry->
30924 + if (new_dlentry->link_data == NULL) {
30925 + delete_os2_drive_link
30926 + (new_dlentry, 0);
30927 + new_dlentry = NULL;
30931 + ("os2lvm_vge: Drive Linking Feature encountered twice in the same Feature Array!\n");
30932 + delete_os2_drive_link(new_dlentry, 0);
30933 + new_dlentry = NULL;
30935 + } else if (feature == BBR_FEATURE_ID) {
30936 + if (!new_dlentry->bbr_data) {
30937 + new_dlentry->bbr_lsn1 =
30938 + signature_sector->
30939 + LVM_Feature_Array[i].
30940 + Location_Of_Primary_Feature_Data;
30941 + new_dlentry->bbr_lsn2 =
30942 + signature_sector->
30943 + LVM_Feature_Array[i].
30944 + Location_Of_Secondary_Feature_Data;
30945 + new_dlentry->bbr_feature_size =
30947 + new_dlentry->bbr_data =
30948 + new_os2_bbr_data(new_dlentry->
30954 + if (new_dlentry->bbr_data == NULL) {
30955 + delete_os2_drive_link
30956 + (new_dlentry, 0);
30957 + new_dlentry = NULL;
30958 + } else if (signature_sector->
30959 + LVM_Feature_Array[i].
30960 + Feature_Active) {
30961 + new_dlentry->bbr_is_active =
30962 + check_for_os2_bbr_relocations
30963 + (new_dlentry->bbr_data);
30967 + ("os2lvm_vge: BBR Feature encountered twice in the same Feature Array!\n");
30968 + delete_os2_drive_link(new_dlentry, 0);
30969 + new_dlentry = NULL;
30973 + ("os2lvm_vge: Unknown Feature entry %d found.\n",
30975 + delete_os2_drive_link(new_dlentry, 0);
30976 + new_dlentry = NULL;
30979 + if (signature_sector->LVM_Feature_Array[i].
30980 + Feature_Active) {
30981 + LOG_EVERYTHING("Feature is active.\n");
30986 + if (new_dlentry &&
30987 + ((!new_dlentry->bbr_data) || (!new_dlentry->link_data))
30989 + LOG_WARNING("os2lvm_vge: Incomplete Feature Data found.\n");
30990 + delete_os2_drive_link(new_dlentry, 0);
30991 + new_dlentry = NULL;
30993 + return new_dlentry;
30997 + * Function: new_os2_link_data
30999 + * Allocate space for OS/2 drive link information.
31000 + * Read in and validate the information from disk.
31001 + * Note: assumes 512 byte sectors.
31004 +new_os2_link_data(u32 linksector1,
31006 + u32 linknumsectors, struct evms_logical_node *link_partition)
31008 + char *new_data1; /* Buffer used to hold the primary copy of the drive linking data. */
31009 + char *new_data2; /* Buffer used to hold the secondary copy of the drive linking data. */
31010 + char *p1; /* Used to access individual sectors of data within new_data1. */
31011 + char *p2; /* Used to access individual sectors of data within new_data2. */
31012 + int memsize = linknumsectors * OS2_BYTES_PER_SECTOR;
31013 + u32 i, seq1, seq2;
31015 + /* Allocate Memory for the buffers to hold the drive linking data. */
31016 + LOG_EVERYTHING("Drive Linking Feature entry found.\n");
31017 + new_data1 = kmalloc(memsize, GFP_KERNEL);
31018 + if (!new_data1) {
31019 + LOG_SERIOUS("Could not allocate Primary Link data\n");
31022 + new_data2 = kmalloc(memsize, GFP_KERNEL);
31023 + if (!new_data2) {
31024 + LOG_SERIOUS("Could not allocate Secondary Link data\n");
31025 + kfree(new_data1);
31029 + LOG_EVERYTHING("Primary Feature Data starts at RBA %i\n", linksector1);
31030 + LOG_EVERYTHING("Secondary Feature Data starts at RBA %i\n",
31033 + /* Read the drive linking data into memory. */
31034 + if (INIT_IO(link_partition, 0, linksector1, linknumsectors, new_data1)) {
31035 + LOG_SERIOUS("I/O error reading Primary Feature Data.\n");
31039 + /* Set up access to the buffer. Extract the Master Sequence Number from the buffer. */
31041 + seq1 = ((struct link_table_first_sector *) p1)->Sequence_Number;
31044 + if (INIT_IO(link_partition, 0, linksector2, linknumsectors, new_data2)) {
31045 + LOG_SERIOUS("I/O error reading Secondary Feature Data.\n");
31049 + /* Set up access to the second buffer. Extract its copy of the Master Sequence Number. */
31051 + seq2 = ((struct link_table_sector *) p2)->Sequence_Number;
31054 + /* Validate both copies of the drive linking data one sector at a time. */
31055 + for (i = 0; i < linknumsectors;
31056 + i++, p1 += OS2_BYTES_PER_SECTOR, p2 += OS2_BYTES_PER_SECTOR) {
31058 + && validate_drivelinksector((struct link_table_sector *) p1, i,
31061 + ("The primary copy of the drive link data is invalid! Sector %i is not valid\n",
31067 + && validate_drivelinksector((struct link_table_sector *) p2, i,
31070 + ("The secondary copy of the drive link data is invalid! Sector %i is not valid\n",
31077 + LOG_EVERYTHING("Primary Feature Data sequence # %i\n", seq1);
31078 + LOG_EVERYTHING("Secondary Feature Data sequence # %i\n", seq2);
31080 + /* Choose which copy of the drive linking data to use. If both sequence numbers are 0, then both copies
31081 + of the drive linking data are bad. If both are equal and non-zero, then both copies are good and it
31082 + really doesn't matter which one you choose. Otherwise, choose the copy with the highest sequence number. */
31083 + if (seq2 > seq1) {
31084 + kfree(new_data1);
31085 + return new_data2;
31087 + kfree(new_data2);
31089 + kfree(new_data1);
31090 + new_data1 = NULL;
31093 + return new_data1;
31097 + * Function: new_os2_bbr_data
31099 + * Allocate space for OS/2 bad block relocation information.
31100 + * Read in and validate the information from disk.
31101 + * Note: assumes 512 byte sectors.
31104 +new_os2_bbr_data(u32 bbrsector1,
31106 + u32 bbrnumsectors, struct evms_logical_node *bbr_partition)
31108 + char *new_data1; /* Buffer to hold the primary copy of the BBR data. */
31109 + char *new_data2; /* Buffer to hold the secondary copy of the BBR data. */
31110 + char *p1; /* Used to examine the individual sectors of BBR data within new_data1. */
31111 + char *p2; /* Used to examine the individual sectors of BBR data within new_data2. */
31112 + int memsize = bbrnumsectors * OS2_BYTES_PER_SECTOR;
31113 + u32 i, seq1, seq2;
31115 + LOG_EVERYTHING("BBR Feature entry found.\n");
31117 + /* Allocate memory for the buffers. */
31118 + new_data1 = kmalloc(memsize, GFP_KERNEL);
31119 + if (!new_data1) {
31120 + LOG_SERIOUS("Could not allocate Primary BBR data\n");
31123 + new_data2 = kmalloc(memsize, GFP_KERNEL);
31124 + if (!new_data2) {
31125 + LOG_SERIOUS("Could not allocate Secondary BBR data\n");
31126 + kfree(new_data1);
31130 + LOG_EVERYTHING("Primary Feature Data starts at RBA %i\n", bbrsector1);
31131 + LOG_EVERYTHING("Secondary Feature Data starts at RBA %i\n", bbrsector2);
31133 + /* Read in both copies of the BBR data. */
31134 + if (INIT_IO(bbr_partition, 0, bbrsector1, bbrnumsectors, new_data1)) {
31135 + LOG_SERIOUS("I/O error reading Primary Feature Data.\n");
31139 + /* Establish access to the first sector of the BBR data. Extract the Master Sequence Number
31140 + for this copy of the BBR data. */
31142 + seq1 = ((LVM_BBR_Table_First_Sector *) p1)->Sequence_Number;
31145 + if (INIT_IO(bbr_partition, 0, bbrsector2, bbrnumsectors, new_data2)) {
31146 + LOG_SERIOUS("I/O error reading Secondary Feature Data.\n");
31150 + /* Establish access to the first sector of the second copy of the BBR data. Extract the
31151 + Master Sequence Number for this copy of the BBR data. */
31153 + seq2 = ((LVM_BBR_Table_Sector *) p2)->Sequence_Number;
31156 + /* Validate both copies of the BBR Data, one sector at a time. */
31157 + for (i = 0; i < bbrnumsectors;
31158 + i++, p1 += OS2_BYTES_PER_SECTOR, p2 += OS2_BYTES_PER_SECTOR) {
31159 + if ((seq1 > 0) && validate_bbrtablesector(p1, i, seq1)) {
31161 + ("The primary BBR data is invalid! Sector %i is not valid\n",
31166 + if ((seq2 > 0) && validate_bbrtablesector(p2, i, seq2)) {
31168 + ("The secondary BBR data is invalid! Sector %i is not valid\n",
31175 + LOG_EVERYTHING("Primary Feature Data sequence # %i\n", seq1);
31176 + LOG_EVERYTHING("Secondary Feature Data sequence # %i\n", seq2);
31178 + /* Choose which copy of the BBR Data to use based upon the sequence number. If both sequence numbers
31179 + are 0, then there is no valid BBR data. If both are non-zero and equal, then it really doesn't
31180 + matter which copy is used. Otherwise, choose the copy with the highest sequence number. */
31181 + if (seq2 > seq1) {
31182 + kfree(new_data1);
31183 + return new_data2;
31185 + kfree(new_data2);
31187 + kfree(new_data1);
31188 + new_data1 = NULL;
31191 + return new_data1;
31195 + * Function: new_os2volume
31197 + * Allocate space for a new OS/2 logical volume.
31198 + * Initialize the appropriate fields.
31200 +static struct evms_logical_node *
31201 +new_os2volume(u32 volumeserial, char *volume_name)
31203 + struct evms_logical_node *new_node;
31204 + os2_volume_runtime_entry_t *cur_volume;
31206 + if (evms_cs_allocate_logical_node(&new_node)) {
31207 + LOG_SERIOUS("Could not allocate new volume\n");
31210 + new_node->private =
31211 + kmalloc(sizeof (os2_volume_runtime_entry_t), GFP_KERNEL);
31212 + if (!new_node->private) {
31213 + LOG_SERIOUS("Could not allocate volume metadata\n");
31214 + evms_cs_deallocate_logical_node(new_node);
31217 + memset(new_node->private, 0, sizeof (os2_volume_runtime_entry_t));
31218 + new_node->plugin = &plugin_header;
31219 + new_node->system_id = LVM_PARTITION_INDICATOR;
31220 + sprintf(new_node->name, "os2/%s", volume_name);
31221 + cur_volume = (os2_volume_runtime_entry_t *) new_node->private;
31222 + cur_volume->Volume_Serial_Number = volumeserial;
31223 + cur_volume->Export_Needed = 1;
31225 + if (os2lvm_nodes == NULL)
31226 + os2lvm_nodes = new_node;
31228 + // This is the first node discovered. Start the BBR thread.
31229 + if (!BBR_Worker_Thread) {
31230 + BBR_Worker_Thread =
31231 + evms_cs_register_thread(BBR_Worker, NULL, BBR_Worker_Name);
31232 + if (!BBR_Worker_Thread) {
31233 + kfree(new_node->private);
31234 + evms_cs_deallocate_logical_node(new_node);
31235 + os2lvm_nodes = NULL;
31240 + (os2_volume_runtime_entry_t *) os2lvm_nodes->private;
31241 + while (cur_volume->next_os2lvm_node)
31243 + (os2_volume_runtime_entry_t *) cur_volume->
31244 + next_os2lvm_node->private;
31245 + cur_volume->next_os2lvm_node = new_node;
31248 + MOD_INC_USE_COUNT;
31254 + * Function: delete_os2lvm_volume
31256 + * This function deletes the in-memory representation of an OS/2
31257 + * logical volume.
31260 +delete_os2lvm_volume(struct evms_logical_node *logical_node)
31262 + struct os2_dl_entry *curdrvlink =
31263 + ((os2_volume_runtime_entry_t *) logical_node->private)->drive_link,
31265 + os2_volume_runtime_entry_t *cur_volume, *next_volume;
31267 + while (curdrvlink) {
31268 + nextdrvlink = curdrvlink->next;
31269 + delete_os2_drive_link(curdrvlink, 1);
31270 + curdrvlink = nextdrvlink;
31273 + cur_volume = (os2_volume_runtime_entry_t *) os2lvm_nodes->private;
31274 + if (os2lvm_nodes == logical_node)
31275 + os2lvm_nodes = cur_volume->next_os2lvm_node;
31277 + while (cur_volume->next_os2lvm_node) {
31279 + (os2_volume_runtime_entry_t *) cur_volume->
31280 + next_os2lvm_node->private;
31281 + if (cur_volume->next_os2lvm_node == logical_node) {
31282 + cur_volume->next_os2lvm_node =
31283 + next_volume->next_os2lvm_node;
31289 + if (os2lvm_nodes == NULL) {
31290 + // Just deleted the last os2 node. Stop the BBR thread.
31291 + if (BBR_Worker_Thread) {
31292 + evms_cs_unregister_thread(BBR_Worker_Thread);
31293 + BBR_Worker_Thread = NULL;
31297 + kfree(logical_node->private);
31298 + evms_cs_deallocate_logical_node(logical_node);
31300 + MOD_DEC_USE_COUNT;
31306 + * Function: delete_os2_drive_link
31308 + * This function deletes the drive link runtime structure and any
31309 + * other structures it points to.
31312 +delete_os2_drive_link(struct os2_dl_entry * drive_link,
31313 + int delete_link_partition)
31315 + if (drive_link->link_data)
31316 + kfree(drive_link->link_data);
31317 + if (drive_link->bbr_data)
31318 + kfree(drive_link->bbr_data);
31319 + if (delete_link_partition)
31320 + DELETE(drive_link->link_partition);
31321 + kfree(drive_link);
31326 +// Consistency Checking Functions
31329 + * Function: validate_signaturesector
31331 + * This function checks the OS/2 LVM Signature Sector
31334 +validate_signaturesector(struct evms_logical_node *evms_partition,
31335 + LVM_Signature_Sector * signature_sector,
31338 + u32 crc_hold, crc_new;
31340 + /* In order for a signature sector to be considered valid, its signature and CRC must
31341 + be correct. Also, OS/2 stores the starting LBA of the partition and the size of
31342 + the partition that this signature sector corresponds to. These should be checked
31343 + as well. However, since the starting LBA of the partition that this belongs to is
31344 + not available to us as part of an struct evms_logical_node, we can only check the size
31345 + of the partition against what is stored in the signature sector. */
31347 + /* The signature used is in two parts. Test the first part. */
31348 + if (signature_sector->LVM_Signature1 != OS2LVM_PRIMARY_SIGNATURE) {
31349 + LOG_EVERYTHING("Primary LVM Signature failed.\n");
31353 + /* Test the second part of the signature. */
31354 + if (signature_sector->LVM_Signature2 != OS2LVM_SECONDARY_SIGNATURE) {
31355 + LOG_EVERYTHING("Secondary LVM Signature failed.\n");
31359 + /* Calculate the CRC and compare it against the stored CRC. */
31360 + crc_hold = signature_sector->Signature_Sector_CRC;
31361 + signature_sector->Signature_Sector_CRC = 0;
31363 + evms_cs_calculate_crc(EVMS_INITIAL_CRC, (void *) signature_sector,
31365 + if (crc_hold != crc_new) {
31366 + LOG_EVERYTHING("Signature sector crc failed.\n");
31367 + LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold,
31371 + // The partition size must == that found in the Signature Sector
31372 + if (evms_partition->total_vsectors !=
31373 + signature_sector->Partition_Sector_Count) {
31374 + LOG_EXTRA("Partition size is not valid\n");
31382 + * Function: validate_drivelinksector
31384 + * This function checks the OS/2 LVM Drivelink Feature Sector
31387 +validate_drivelinksector(void *Sector_To_Validate,
31388 + int Sector_Index, u32 Master_Sequence_Number)
31390 + u32 crc_hold, crc_new;
31391 + struct link_table_first_sector *First_Sector =
31392 + (struct link_table_first_sector *) Sector_To_Validate;
31393 + struct link_table_sector *Link_Sector =
31394 + (struct link_table_sector *) Sector_To_Validate;
31396 + /* The OS/2 drive linking data covers several sectors. The format of the first sector is slightly
31397 + different from the following sectors because it contains additional information about how many
31398 + drive links are actually in use. The following sectors just contain portions of the drive link
31399 + table. Each sector of OS/2 drive linking data contains a signature, crc, and sequence number
31400 + which must be validated. */
31402 + if (Sector_Index == 0) {
31404 + /* Link Table Master Signature Check */
31405 + if (LINK_TABLE_MASTER_SIGNATURE !=
31406 + First_Sector->Link_Table_Signature) {
31408 + ("Link Table Master Signature Test failed.\n");
31412 + /* We will NOT check the sequence number here as the first sector of drive link data is the
31413 + source of the Master_Sequence_Number which was passed in to us. */
31415 + /* Set up for the CRC Check */
31416 + crc_hold = First_Sector->Link_Table_CRC;
31417 + First_Sector->Link_Table_CRC = 0;
31419 + /* Link Table Internal Signature Check */
31420 + if (LINK_TABLE_SIGNATURE != Link_Sector->Link_Table_Signature) {
31422 + ("Link Table Internal Signature Test failed.\n");
31426 + /* Check the sequence number. */
31427 + if (Master_Sequence_Number != Link_Sector->Sequence_Number) {
31429 + ("Link Table Internal Sequence Number Test failed.\n");
31433 + /* Set up for the CRC Check */
31434 + crc_hold = Link_Sector->Link_Table_CRC;
31435 + Link_Sector->Link_Table_CRC = 0;
31439 + evms_cs_calculate_crc(EVMS_INITIAL_CRC, Sector_To_Validate,
31440 + OS2_BYTES_PER_SECTOR);
31441 + if (crc_hold != crc_new) {
31442 + LOG_EVERYTHING("Link Table crc failed.\n");
31443 + LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold,
31452 + * Function: validate_bbrtablesector
31454 + * This function checks the OS/2 LVM Bad Block Relocation Feature Sector
31457 +validate_bbrtablesector(void *Sector_To_Validate,
31458 + int Sector_Index, u32 Master_Sequence_Number)
31460 + u32 crc_hold, crc_new;
31461 + LVM_BBR_Table_First_Sector *First_Sector =
31462 + (LVM_BBR_Table_First_Sector *) Sector_To_Validate;
31463 + LVM_BBR_Table_Sector *BBR_Sector =
31464 + (LVM_BBR_Table_Sector *) Sector_To_Validate;
31466 + /* The OS/2 bad block relocation (BBR) data covers several sectors. The format of the first sector
31467 + is different from the following sectors because it contains additional information about how many
31468 + relocations are actually in use and the size and location of the block of replacement sectors.
31469 + The following sectors just contain portions of the BBR remap table. Each sector of OS/2 BBR data
31470 + contains a signature, crc, and sequence number which must be validated. */
31472 + if (Sector_Index == 0) {
31474 + /* BBR Table Master Signature Check */
31475 + if (BBR_TABLE_MASTER_SIGNATURE != First_Sector->Signature) {
31477 + ("BBR Table Master Signature Test failed.\n");
31481 + /* We will NOT check the sequence number here as the first sector of BBR data is the
31482 + source of the Master_Sequence_Number which was passed in to us. */
31484 + /* Set up for the CRC Check */
31485 + crc_hold = First_Sector->CRC;
31486 + First_Sector->CRC = 0;
31489 + /* BBR Table Internal Signature Check */
31490 + if (BBR_TABLE_SIGNATURE != BBR_Sector->Signature) {
31492 + ("BBR Table Internal Signature Test failed.\n");
31496 + /* Check the sequence number. */
31497 + if (Master_Sequence_Number != BBR_Sector->Sequence_Number) {
31499 + ("BBR Table Internal Sequence Number Test failed.\n");
31503 + /* Set up for the CRC Check */
31504 + crc_hold = BBR_Sector->CRC;
31505 + BBR_Sector->CRC = 0;
31509 + evms_cs_calculate_crc(EVMS_INITIAL_CRC, Sector_To_Validate,
31510 + OS2_BYTES_PER_SECTOR);
31511 + if (crc_hold != crc_new) {
31512 + LOG_EVERYTHING("BBRTable crc failed.\n");
31513 + LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold,
31522 + * Function: check_for_os2_bbr_relocations
31524 + * This function checks the OS/2 LVM Bad Block Relocation Tables
31525 + * for any active relocation sectors. The bbr table is reformatted in memory
31526 + * to make searches faster.
31527 + * Return values: 0 == no active relocations, 1 == contains active relocations
31530 +check_for_os2_bbr_relocations(char *bbr_data_ptr)
31532 + LVM_BBR_Feature *feature_data = (LVM_BBR_Feature *) bbr_data_ptr;
31534 + if (feature_data->control.Table_Entries_In_Use) {
31535 + LOG_EVERYTHING("There are %d active relocations.\n",
31536 + feature_data->control.Table_Entries_In_Use);
31544 + * Function: check_os2_volumes
31546 + * This function performs a consistency check on all existing OS/2
31547 + * Logical Volumes. The list of constituent partitions ( links )
31548 + * is checked and ordered according to the Link Table. If any link
31549 + * is missing or inconsistent, the entire volume will be deleted.
31552 +check_os2_volumes(struct evms_logical_node **node_list)
31554 + os2_volume_runtime_entry_t *cur_volume;
31555 + os2_volume_runtime_entry_t *previous_volume;
31556 + struct evms_logical_node *cur_node;
31557 + struct evms_logical_node *previous_node = NULL;
31558 + struct os2_dl_entry *link_list, *link_hold;
31559 + struct link_table_first_sector *psector1;
31561 + u32 numlinks, countlinks, linkser;
31562 + u32 Master_Sequence_Number; /* Used to check whether or not all of the copies of Drive Linking data match. */
31563 + u64 partition_offset;
31566 + LOG_ENTRY_EXIT("Checking OS/2 Logical Volumes\n");
31568 + cur_node = os2lvm_nodes;
31570 + while (cur_node) {
31571 + cur_volume = (os2_volume_runtime_entry_t *) cur_node->private;
31572 + link_list = NULL;
31573 + if (!cur_volume->complete) { /* need to verify this one */
31574 + cur_volume->complete = 1;
31575 + LOG_EVERYTHING("Checking volume %s\n", cur_node->name);
31577 + // Reset fields for sort operation
31578 + cur_volume->size_in_sectors = 0;
31579 + numlinks = cur_volume->drive_link_count;
31580 + cur_volume->drive_link_count = 0;
31581 + cur_node->total_vsectors = 0;
31582 + link_list = cur_volume->drive_link;
31583 + cur_volume->drive_link = NULL;
31585 + // Access the link data to order the drive links
31587 + (struct link_table_first_sector *) link_list->
31589 + Master_Sequence_Number = psector1->Sequence_Number;
31591 + if (numlinks != psector1->Links_In_Use) {
31593 + ("Link Count mismatch vol=%i, table=%i\n",
31594 + numlinks, psector1->Links_In_Use);
31595 + cur_volume->complete = 0;
31598 + if (numlinks > LINKS_IN_FIRST_SECTOR) {
31599 + countlinks = LINKS_IN_FIRST_SECTOR;
31600 + numlinks -= LINKS_IN_FIRST_SECTOR;
31602 + countlinks = numlinks;
31608 + partition_offset = 0;
31610 + (i < countlinks) && (cur_volume->complete == 1);
31613 + psector1->Link_Table[i].
31614 + partition_serial;
31616 + find_link_data(&link_list, linkser))) {
31617 + // Add this partition to its parent Volume
31618 + add_os2link(link_hold, cur_node);
31620 + ("Link start_RBA == "PFU64" , sector_count == "PFU64"\n",
31621 + partition_offset,
31622 + link_hold->sector_count);
31623 + link_hold->start_sector =
31624 + partition_offset;
31625 + partition_offset +=
31626 + link_hold->sector_count;
31629 + ("Link Table entry %i metadata missing\n",
31631 + cur_volume->complete = 0;
31636 + sect_ptr = (char *) psector1;
31638 + while (numlinks && (cur_volume->complete == 1)) {
31639 + if (numlinks > LINKS_IN_NEXT_SECTOR) {
31640 + countlinks = LINKS_IN_NEXT_SECTOR;
31641 + numlinks -= LINKS_IN_NEXT_SECTOR;
31643 + countlinks = numlinks;
31646 + sect_ptr += OS2_BYTES_PER_SECTOR;
31647 + if (Master_Sequence_Number !=
31648 + ((struct link_table_sector *) sect_ptr)->
31649 + Sequence_Number) {
31650 + cur_volume->complete = 0;
31652 + ("Bad Sequence Number for Drive Linking Metadata!\n");
31654 + for (i = 0; i < countlinks; i++) {
31656 + ((struct link_table_sector *)
31657 + sect_ptr)->Link_Table[i].
31658 + partition_serial;
31660 + find_link_data(&link_list,
31662 + // Add this partition to its parent Volume
31663 + add_os2link(link_hold,
31666 + ("Link start_RBA == "PFU64" , sector_count == "PFU64"\n",
31667 + partition_offset,
31672 + partition_offset;
31673 + partition_offset +=
31678 + ("Link Table entry %i metadata missing\n",
31680 + cur_volume->complete =
31689 + /* If the volume is complete we can export it for use. */
31690 + if (cur_volume->complete && (link_list == NULL)) {
31692 + // Link new volume into the node list
31693 + if (cur_volume->Export_Needed &&
31694 + (!evms_cs_add_logical_node_to_list
31695 + (node_list, cur_node))
31698 + cur_volume->Export_Needed = 0;
31701 + previous_node = cur_node;
31702 + cur_node = cur_volume->next_os2lvm_node;
31704 + /* Remove the volume from os2lvm_nodes list and delete it. */
31705 + if (previous_node != NULL) {
31707 + previous_volume =
31708 + (os2_volume_runtime_entry_t *)
31709 + previous_node->private;
31710 + previous_volume->next_os2lvm_node =
31711 + cur_volume->next_os2lvm_node;
31712 + cur_volume->next_os2lvm_node = NULL;
31714 + delete_os2lvm_volume(cur_node);
31716 + cur_node = previous_volume->next_os2lvm_node;
31718 + previous_node = cur_volume->next_os2lvm_node;
31719 + delete_os2lvm_volume(cur_node);
31720 + cur_node = previous_node;
31721 + previous_node = NULL;
31722 + os2lvm_nodes = cur_node;
31725 + /* If any items remain in link_list, delete those as well. */
31726 + while (link_list) {
31727 + link_hold = link_list->next;
31728 + delete_os2_drive_link(link_list, 1);
31729 + link_list = link_hold;
31736 + LOG_ENTRY_EXIT("Finished Checking OS/2 Logical Volumes\n");
31741 +/* BBR_Transfer_IO
31743 + * Transfer the responsibility for completing the specified IO from
31744 + * the thread that requested it to the BBR Worker Thread
31747 +BBR_Transfer_IO(struct transfer_record * Transfer_Record)
31749 + unsigned long flags;
31750 + int Wake_Worker_Thread = 0; /* Assume that the worker is already awake. */
31752 + spin_lock_irqsave(&BBR_Queue_Lock, flags);
31754 + /* The BBR IO List is a singly linked list. BBR_IO_List_Head points
31755 + to the first item in the list, and BBR_IO_List_Tail points to the
31756 + last item in the list. */
31757 + Transfer_Record->next = NULL;
31758 + if (!BBR_IO_List_Tail) { /* Empty list */
31759 + BBR_IO_List_Head = Transfer_Record;
31760 + Wake_Worker_Thread = 1; /* Wake up the worker thread. */
31761 + } else /* Items already in the list. */
31762 + BBR_IO_List_Tail->next = Transfer_Record;
31764 + BBR_IO_List_Tail = Transfer_Record;
31766 + spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
31767 + if (Wake_Worker_Thread)
31768 + evms_cs_wakeup_thread(BBR_Worker_Thread);
31773 +/* OS2_DL_Callback
31775 + * This is the callback function used when an I/O request has to be broken
31776 + * into two parts because it crosses a drive link boundary.
31780 +OS2_DL_Callback(struct buffer_head *bh, int uptodate)
31783 + struct tracking_record *Tracking_Record;
31784 + struct buffer_head *Original;
31786 + Tracking_Record = bh->b_private;
31788 + /* Is this a read or a write? */
31789 + if (Tracking_Record->link1_transfer_rec ||
31790 + Tracking_Record->link2_transfer_rec) {
31791 + /* We have a write here. Was it successful? */
31793 + /* Have we tried BBR yet? */
31794 + if ((bh == Tracking_Record->link1_bh) &&
31795 + (!Tracking_Record->link1_bbr_attempted)) {
31796 + /* Attempt BBR. */
31797 + BBR_Transfer_IO(Tracking_Record->
31798 + link1_transfer_rec);
31799 + Tracking_Record->link1_bbr_attempted = 1;
31801 + } else if ((bh == Tracking_Record->link2_bh) &&
31802 + (!Tracking_Record->link2_bbr_attempted)) {
31803 + /* Attempt BBR. */
31804 + BBR_Transfer_IO(Tracking_Record->
31805 + link2_transfer_rec);
31806 + Tracking_Record->link2_bbr_attempted = 1;
31814 + Tracking_Record->io_in_progress -= 1;
31815 + if (Tracking_Record->io_in_progress) {
31816 + Tracking_Record->up_to_date = uptodate;
31818 + Original = Tracking_Record->org_bh;
31820 + if (!Tracking_Record->io_in_progress) {
31821 + uptodate &= Tracking_Record->up_to_date;
31822 + /* If this is a write, then Transfer Records will have been set up for both Link1 and Link2.
31823 + If the transfer records were used because of BBR, then the BBR worker thread will have
31824 + disposed of the transfer records. If the transfer records were not used, then we must
31825 + dispose of them here to prevent memory leaks. */
31826 + if (Tracking_Record->link1_transfer_rec &&
31827 + (!Tracking_Record->link1_bbr_attempted)) {
31828 + evms_cs_deallocate_to_pool(BBR_Transfer_Pool,
31829 + Tracking_Record->
31830 + link1_transfer_rec);
31832 + if (Tracking_Record->link2_transfer_rec &&
31833 + (!Tracking_Record->link2_bbr_attempted)) {
31834 + evms_cs_deallocate_to_pool(BBR_Transfer_Pool,
31835 + Tracking_Record->
31836 + link2_transfer_rec);
31838 + evms_cs_deallocate_to_pool(evms_bh_pool,
31839 + Tracking_Record->link1_bh);
31840 + evms_cs_deallocate_to_pool(evms_bh_pool,
31841 + Tracking_Record->link2_bh);
31842 + evms_cs_deallocate_to_pool(DL_Tracking_Pool, Tracking_Record);
31843 + Original->b_end_io(Original, uptodate);
31849 +/* OS2_BBR_Write_Callback
31851 + * This is the callback for normal write requests. Check for an error
31852 + * during the I/O, and send to the worker thread for processing if necessary.
31855 +OS2_BBR_Write_Callback(struct transfer_record * Transfer_Record,
31856 + struct buffer_head *bh, int uptodate, int *redrive)
31859 + BBR_Transfer_IO(Transfer_Record);
31862 + evms_cs_deallocate_to_pool(BBR_Transfer_Pool, Transfer_Record);
31868 +/* Worker thread to handle:
31870 + I/O to drive/partitions/objects where bad blocks are known to exist
31871 + I/O to drive/partition/object where a new bad block has been discovered and the I/O must be redriven.
31875 +BBR_Worker(void *Not_Used)
31877 + unsigned long flags;
31878 + struct transfer_record *Current_IO;
31882 + // Process bbr_io_list, one entry at a time.
31883 + spin_lock_irqsave(&BBR_Queue_Lock, flags);
31885 + /* Is there any work for us? */
31886 + if (!BBR_IO_List_Head) {
31887 + spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
31888 + break; /* List empty - nothing to do. */
31891 + /* Get the IO to perform. */
31892 + Current_IO = BBR_IO_List_Head;
31893 + BBR_IO_List_Head = Current_IO->next;
31894 + if (!BBR_IO_List_Head)
31895 + BBR_IO_List_Tail = BBR_IO_List_Head;
31897 + spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
31899 + /* Now lets process the I/O request. */
31900 + complete = do_os2_bbr_io(Current_IO->Partition_Data,
31901 + Current_IO->Write_Flag,
31902 + Current_IO->bh->b_rsector,
31904 + b_size >> EVMS_VSECTOR_SIZE_SHIFT,
31905 + Current_IO->bh->b_data);
31907 + /* We need to do the callback. */
31908 + Current_IO->bh->b_end_io(Current_IO->bh, (complete == 0));
31910 + /* Now cleanup */
31911 + evms_cs_deallocate_to_pool(BBR_Transfer_Pool, Current_IO);
31914 + return; /* Go to sleep. */
31919 + * Sector_Is_Remapped
31921 + * This function returns 1 if the specified sector has been remapped, 0 if it has not
31923 + * If the sector has been remapped, then the new sector is returned in Replacement_Sector
31927 +Sector_Is_Remapped(struct os2_dl_entry * io_dlentry,
31928 + u64 Source_Sector, u64 * Replacement_Sector)
31930 + LVM_BBR_Feature *Feature_Data =
31931 + (LVM_BBR_Feature *) io_dlentry->bbr_data;
31932 + unsigned int Sector_Index; /* The BBR Table is spread across several sectors. This tracks which sector we are looking at. */
31933 + unsigned int BBR_Table_Index; /* This tracks the actual entry in the BBR Table that we are examining. */
31934 + unsigned int BBR_Table_Entries_In_Use =
31935 + Feature_Data->control.Table_Entries_In_Use;
31936 + struct bbr_table_entry * table_entry;
31937 + unsigned int guard1;
31939 + /* Default value is no remap. */
31940 + *Replacement_Sector = Source_Sector;
31943 + guard1 = io_dlentry->guard1; /* Lamport's Theorem */
31945 + for (BBR_Table_Index = 0;
31946 + BBR_Table_Index < BBR_Table_Entries_In_Use;
31947 + BBR_Table_Index++) {
31949 + BBR_Table_Index / BBR_TABLE_ENTRIES_PER_SECTOR;
31951 + &(Feature_Data->remap[Sector_Index].
31952 + BBR_Table[BBR_Table_Index -
31954 + BBR_TABLE_ENTRIES_PER_SECTOR)]);
31955 + if (table_entry->BadSector == (u32)Source_Sector) {
31956 + *Replacement_Sector =
31957 + (u64)table_entry->ReplacementSector;
31962 + } while (guard1 != io_dlentry->guard2); /* Lamport's Theorem */
31964 + if (*Replacement_Sector != Source_Sector)
31971 + * Invalidate_Mapping
31973 + * This function either frees a replacement sector to be reused, or it
31974 + * marks the replacement sector as bad.
31978 +Invalidate_Mapping(struct os2_dl_entry * dlentry,
31979 + u64 Source_Sector, int Replacement_Sector_Is_Bad)
31981 + LVM_BBR_Feature *Feature_Data = (LVM_BBR_Feature *) dlentry->bbr_data;
31982 + unsigned int Sector_Index; /* The BBR Table is spread across several sectors. This tracks which sector we are looking at. */
31983 + unsigned int BBR_Table_Index; /* This tracks the actual entry in the BBR Table that we are examining. */
31984 + unsigned int BBR_Table_Entries_In_Use =
31985 + Feature_Data->control.Table_Entries_In_Use;
31986 + struct bbr_table_entry * table_entry = NULL;
31988 + /* Lock for the BBR Table. */
31989 + down(&(dlentry->bbr_table_lock));
31991 + /* Find the entry to invalidate. */
31992 + for (BBR_Table_Index = 0; BBR_Table_Index < BBR_Table_Entries_In_Use;
31993 + BBR_Table_Index++) {
31994 + Sector_Index = BBR_Table_Index / BBR_TABLE_ENTRIES_PER_SECTOR;
31996 + &(Feature_Data->remap[Sector_Index].
31997 + BBR_Table[BBR_Table_Index -
31998 + (Sector_Index * BBR_TABLE_ENTRIES_PER_SECTOR)]);
31999 + if (table_entry->BadSector == Source_Sector) {
32004 + /* Now that we have found the entry, we must invalidate it. */
32005 + if (Replacement_Sector_Is_Bad) {
32006 + table_entry->BadSector = (u32) - 1;
32008 + /* OS/2 supported a method for clearing out bad block remappings if the filesystem on the volume supported
32009 + the tracking of bad blocks. We don't support that under Linux, so there is no else case here. */
32011 + /* Unlock the BBR Table */
32012 + up(&(dlentry->bbr_table_lock));
32018 + * Create_New_BBR_Table_Entry
32020 + * Finds bad blocks within the range specified, allocates replacement sectors,
32021 + * writes the data to the replacement sectors, and updates the BBR metadata on
32022 + * disk to reflect the new mapping. Returns 1 if successful, 0 otherwise.
32026 +Create_New_BBR_Table_Entry(struct os2_dl_entry * dlentry,
32027 + u64 starting_lsn, unsigned int count, void *buffer)
32030 + struct bbr_table_entry *Table_Entry;
32031 + unsigned int Sector_Index;
32032 + unsigned int Table_Index;
32035 + u32 New_Sequence_Number;
32036 + LVM_BBR_Feature *BBR_Data = (LVM_BBR_Feature *) dlentry->bbr_data;
32038 + for (lsn = starting_lsn; lsn < (starting_lsn + count); lsn++) {
32039 + rc = INIT_IO(dlentry->link_partition, 1, lsn, 1, buffer);
32042 + /* Lock for the BBR Table. */
32043 + down(&(dlentry->bbr_table_lock));
32045 + /* Increment the second guard value. This will cause those reading the BBR Table to spin. */
32046 + dlentry->guard2++;
32048 + /* Ensure that the bbr active flag is set. */
32049 + dlentry->bbr_is_active = 1;
32051 + /* Allocate a replacement sector */
32052 + if (BBR_Data->control.Table_Entries_In_Use <
32053 + BBR_Data->control.Table_Size) {
32055 + BBR_Data->control.Table_Entries_In_Use /
32056 + BBR_TABLE_ENTRIES_PER_SECTOR;
32058 + BBR_Data->control.Table_Entries_In_Use %
32059 + BBR_TABLE_ENTRIES_PER_SECTOR;
32060 + BBR_Data->control.Table_Entries_In_Use =
32061 + BBR_Data->control.Table_Entries_In_Use + 1;
32063 + (struct bbr_table_entry *) & (BBR_Data->
32064 + remap[Sector_Index].
32067 + Table_Entry->BadSector = lsn;
32069 + /* There are no more replacement sectors available! Time to bail ... */
32070 + up(&(dlentry->bbr_table_lock));
32074 + /* Now that we have a replacement sector, increment the first guard value. This will free any
32075 + threads reading the BBR Table. */
32076 + dlentry->guard1++;
32078 + /* Release the lock now that we have a replacement sector. */
32079 + up(&(dlentry->bbr_table_lock));
32081 + /* Test the replacement sector. */
32082 + rc = INIT_IO(dlentry->link_partition, 1,
32083 + Table_Entry->ReplacementSector, 1, buffer);
32085 + /* The replacement sector was bad. Lets mark it bad in the table and try again. */
32086 + Table_Entry->BadSector = (u32) - 1;
32089 + } /* End of processing for the current sector. */
32091 + } /* end of loop to test each sector in the I/O and remap any bad ones found. */
32093 + /* Need to write the modified BBR Table back to disk. This includes updating the sequence numbers and CRCs. */
32095 + /* Lock for the BBR Table. */
32096 + down(&(dlentry->bbr_table_lock));
32098 + /* Increment the sequence numbers. */
32099 + New_Sequence_Number = BBR_Data->control.Sequence_Number + 1;
32100 + BBR_Data->control.Sequence_Number = New_Sequence_Number;
32101 + for (Sector_Index = 0;
32102 + Sector_Index < BBR_Data->control.Sectors_Per_Table;
32103 + Sector_Index++) {
32104 + BBR_Data->remap[Sector_Index].Sequence_Number =
32105 + New_Sequence_Number;
32108 + /* Calculate the new CRC values. */
32109 + BBR_Data->control.CRC = 0;
32110 + BBR_Data->control.CRC =
32111 + evms_cs_calculate_crc(EVMS_INITIAL_CRC, &(BBR_Data->control),
32112 + OS2_BYTES_PER_SECTOR);
32113 + for (Sector_Index = 0;
32114 + Sector_Index < BBR_Data->control.Sectors_Per_Table;
32115 + Sector_Index++) {
32116 + BBR_Data->remap[Sector_Index].CRC = 0;
32117 + BBR_Data->remap[Sector_Index].CRC =
32118 + evms_cs_calculate_crc(EVMS_INITIAL_CRC,
32119 + &(BBR_Data->remap[Sector_Index]),
32120 + OS2_BYTES_PER_SECTOR);
32123 + /* Now we must write the table back to the partition from whence it came. */
32125 + /* Write the first copy. */
32126 + rc = INIT_IO(dlentry->link_partition, 1, dlentry->bbr_lsn1,
32127 + dlentry->bbr_feature_size, BBR_Data);
32129 + /* Write the second copy. */
32131 + INIT_IO(dlentry->link_partition, 1, dlentry->bbr_lsn2,
32132 + dlentry->bbr_feature_size, BBR_Data);
32134 + /* If both copies failed to reach the disk, then fail the I/O. */
32140 + /* Unlock the BBR Table */
32141 + up(&(dlentry->bbr_table_lock));
32143 + /* Indicate success. */
32148 + * Clone_Bufferhead
32150 + * Prepares a usable copy of an existing bufferhead.
32154 +Clone_Bufferhead(struct buffer_head *Source, struct buffer_head *Child)
32156 + Child->b_next = NULL;
32157 + Child->b_blocknr = Source->b_blocknr;
32158 + Child->b_size = Source->b_size;
32159 + Child->b_list = BUF_LOCKED;
32160 + Child->b_dev = Source->b_dev;
32161 + Child->b_count = (atomic_t) ATOMIC_INIT(0);
32162 + atomic_set(&Child->b_count, atomic_read(&Source->b_count));
32163 + Child->b_rdev = Source->b_rdev;
32164 + Child->b_state = Source->b_state;
32165 + Child->b_flushtime = 0;
32166 + Child->b_next_free = NULL;
32167 + Child->b_prev_free = NULL;
32168 + Child->b_this_page = (struct buffer_head *) 1;
32169 + Child->b_reqnext = NULL;
32170 + Child->b_pprev = NULL;
32171 + Child->b_data = Source->b_data;
32172 + Child->b_page = Source->b_page;
32173 + Child->b_end_io = Source->b_end_io;
32174 + Child->b_private = Source->b_private;
32175 + Child->b_rsector = Source->b_rsector;
32176 + Child->b_inode_buffers.next = NULL;
32177 + Child->b_inode_buffers.prev = NULL;
32180 diff -Naur linux-2002-09-30/drivers/evms/s390_part.c evms-2002-09-30/drivers/evms/s390_part.c
32181 --- linux-2002-09-30/drivers/evms/s390_part.c Wed Dec 31 18:00:00 1969
32182 +++ evms-2002-09-30/drivers/evms/s390_part.c Fri Sep 13 16:09:55 2002
32184 +/* -*- linux-c -*- */
32188 + * Copyright (c) International Business Machines Corp., 2000
32190 + * This program is free software; you can redistribute it and/or modify
32191 + * it under the terms of the GNU General Public License as published by
32192 + * the Free Software Foundation; either version 2 of the License, or
32193 + * (at your option) any later version.
32195 + * This program is distributed in the hope that it will be useful,
32196 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
32197 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
32198 + * the GNU General Public License for more details.
32200 + * You should have received a copy of the GNU General Public License
32201 + * along with this program; if not, write to the Free Software
32202 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32207 + * linux/drivers/evms/s390_part.c
32209 + * EVMS S/390 partition manager
32211 + * Partial code extracted from
32213 + * linux/fs/partitions/ibm.c
32217 +#include <linux/config.h>
32218 +#include <linux/module.h>
32219 +#include <linux/kernel.h>
32220 +#include <linux/config.h>
32221 +#include <linux/string.h>
32222 +#include <linux/blk.h>
32223 +#include <asm/ebcdic.h>
32224 +#include <asm/uaccess.h>
32225 +#include <asm/dasd.h>
32226 +#include <asm/vtoc.h>
32227 +#include <linux/evms/evms.h>
32229 +/* prefix used in logging messages */
32230 +#define LOG_PREFIX "s390_part: "
32232 +/* Private instance data structure for node we produced */
32233 +struct local_instance_data {
32234 + struct evms_logical_node *source_disk;
32235 + u64 start_sect; /* starting LBA */
32236 + u64 nr_sects; /* number of sectors */
32237 + unsigned char type; /* partition type or filesystem format indicator, can be set to 0 */
32240 +static int exported_nodes; /* total # of exported segments
32241 + * produced during this discovery.
32245 +static int s390_partition_discover(struct evms_logical_node **);
32246 +static int s390_partition_delete(struct evms_logical_node *);
32247 +static void s390_partition_read(struct evms_logical_node *,
32248 + struct buffer_head *);
32249 +static void s390_partition_write(struct evms_logical_node *,
32250 + struct buffer_head *);
32251 +static int s390_partition_ioctl(struct evms_logical_node *,
32253 + struct file *, unsigned int, unsigned long);
32254 +static int s390_partition_init_io(struct evms_logical_node *,
32255 + int, u64, u64, void *);
32257 +static struct evms_plugin_fops fops = {
32258 + .discover = s390_partition_discover,
32259 + .delete = s390_partition_delete,
32260 + .read = s390_partition_read,
32261 + .write = s390_partition_write,
32262 + .init_io = s390_partition_init_io,
32263 + .ioctl = s390_partition_ioctl
32266 +#define EVMS_S390_PARTITION_MANAGER_ID 2
32268 +static struct evms_plugin_header plugin_header = {
32269 + .id = SetPluginID(IBM_OEM_ID,
32270 + EVMS_SEGMENT_MANAGER,
32271 + EVMS_S390_PARTITION_MANAGER_ID),
32275 + .patchlevel = 0},
32276 + .required_services_version = {
32279 + .patchlevel = 0},
32283 +/***************************************************/
32284 +/* List Support - Typedefs, Variables, & Functions */
32285 +/***************************************************/
32289 +/* structure to keep status on
32292 +#define S390_DISK_OK 0
32293 +#define S390_DISK_FAILED 1
32294 +#define S390_FAILED_SKIP_COUNT 1024
32295 +struct disk_object {
32297 + atomic_t skipped_ios;
32298 + atomic_t pending_ios;
32299 + atomic_t total_ios;
32300 + atomic_t failed_ios;
32301 + struct evms_logical_node *disk;
32304 +/* structure to keep status
32305 + * on each device.
32307 +struct device_object {
32308 + unsigned char label[8];
32310 + struct evms_list_node *disk_object_list;
32311 + struct evms_list_node *segment_list;
32314 +/* structure used to track in-flight IOs,
32315 + * and to handle failover scenarios.
32318 + struct device_object *devo;
32319 + struct disk_object *dsko;
32320 + struct evms_logical_node *segment;
32323 + struct buffer_head *bh;
32324 + struct s390_io *next;
32326 +static spinlock_t s390_redrive_list_lock = SPIN_LOCK_UNLOCKED;
32327 +static struct s390_io *s390_redrive_list = NULL;
32328 +static struct evms_thread *s390_io_redrive_thread;
32329 +static struct evms_pool_mgmt *s390_io_track_pool = NULL;
32333 +static struct evms_list_node *my_device_object_list;
32335 +static struct evms_list_node **
32336 +lookup_device_object(struct evms_logical_node *disk)
32338 + struct evms_list_node **devoln;
32340 + devoln = &my_device_object_list;
32341 + while (*devoln) {
32342 + struct evms_list_node **dskoln;
32343 + struct device_object *devo;
32344 + devo = (struct device_object *) (*devoln)->item;
32345 + dskoln = &devo->disk_object_list;
32346 + while (*dskoln) {
32347 + struct disk_object *dsko;
32348 + dsko = (struct disk_object *) (*dskoln)->item;
32349 + if (dsko->disk == disk) {
32352 + dskoln = &(*dskoln)->next;
32354 + devoln = &(*devoln)->next;
32359 +static struct evms_list_node **
32360 +lookup_label(unsigned char *label, struct evms_list_node **devoln)
32363 + devoln = &my_device_object_list;
32365 + devoln = &(*devoln)->next;
32367 + while (*devoln) {
32368 + struct device_object *devo;
32369 + struct disk_object *dsko;
32370 + devo = (struct device_object *) (*devoln)->item;
32371 + dsko = (struct disk_object *) devo->disk_object_list->item;
32372 + LOG_DEBUG("comparing labels: new(%s), %s(%s)\n",
32373 + label, dsko->disk->name, devo->label);
32374 + if (!strncmp(devo->label, label, 6)) {
32375 + LOG_DEBUG("matching label found!\n");
32378 + devoln = &(*devoln)->next;
32383 +static struct evms_logical_node *
32384 +find_segment_on_disk(struct evms_logical_node *disk,
32385 + u64 start_sect, u64 nr_sects)
32387 + struct evms_logical_node *rc = NULL;
32388 + struct evms_list_node **devoln;
32390 + /* find disk object */
32391 + devoln = lookup_device_object(disk);
32393 + /* disk object found in list */
32394 + /* attempt to find segment */
32395 + struct evms_list_node **sln;
32396 + struct device_object *devo;
32398 + devo = (struct device_object *) (*devoln)->item;
32399 + sln = &devo->segment_list;
32401 + struct evms_logical_node *segment;
32402 + struct local_instance_data *lid;
32404 + segment = (struct evms_logical_node *) (*sln)->item;
32405 + lid = segment->private;
32406 + if (lid->start_sect == start_sect) {
32407 + if (lid->nr_sects == nr_sects) {
32412 + sln = &(*sln)->next;
32419 +add_segment_to_disk(struct evms_logical_node *disk,
32420 + unsigned char *label, struct evms_logical_node *segment)
32423 + struct evms_list_node **devoln;
32424 + struct device_object *devo;
32426 + devoln = lookup_device_object(disk);
32427 + if (*devoln == NULL) {
32428 + struct disk_object *dsko = NULL;
32429 + /* device object not in list, add device object */
32430 + devo = kmalloc(sizeof (*devo), GFP_KERNEL);
32432 + memset(devo, 0, sizeof (*devo));
32433 + strncpy(devo->label, label, 6);
32434 + rc = evms_cs_add_item_to_list(devoln, devo);
32439 + /* create a disk object */
32441 + dsko = kmalloc(sizeof (*dsko), GFP_KERNEL);
32447 + memset(dsko, 0, sizeof (*dsko));
32448 + /* add disk to disk object */
32449 + dsko->disk = disk;
32450 + /* add disk object to disk object list
32451 + * in device object */
32452 + rc = evms_cs_add_item_to_list(&devo->disk_object_list,
32456 + devo->total_paths++;
32458 + /* on error clean up allocations */
32463 + evms_cs_remove_item_from_list(devoln, devo);
32469 + devo = (struct device_object *) (*devoln)->item;
32472 + /* attempt to add segment */
32473 + rc = evms_cs_add_item_to_list(&devo->segment_list, segment);
32479 +remove_segment_from_disk(struct evms_logical_node *disk,
32480 + struct evms_logical_node *segment,
32481 + struct evms_list_node **empty_disk_object_list)
32484 + struct evms_list_node **devoln;
32486 + *empty_disk_object_list = NULL;
32487 + devoln = lookup_device_object(disk);
32489 + /* device object found in list */
32490 + /* attempt to remove segment */
32491 + struct device_object *devo;
32492 + devo = (struct device_object *) (*devoln)->item;
32493 + rc = evms_cs_remove_item_from_list(&devo->segment_list,
32496 + if (devo->segment_list == NULL) {
32497 + /* return disk object list to caller */
32498 + *empty_disk_object_list =
32499 + devo->disk_object_list;
32500 + /* remove device object from list */
32501 + rc = evms_cs_remove_item_from_list(devoln,
32503 + /* free device object */
32511 +/* function: s390_load_balance
32513 + * this function is used to route an IO to the appropriate
32514 + * paths of a multipath device.
32516 + * appropriate paths are determined using load-balancing
32517 + * techniques. load balancing is accomplished by monitoring
32518 + * pending or in-flight IOs to each path. when a new IO
32519 + * request is received, all paths are examined, and the path
32520 + * with the fewest IOs pending is selected to receive the
32523 + * this routine also utilizes some failed path recovery
32526 + * if a failed path has been skipped for a given number
32527 + * (timeout value) of IO requests. it is then tried again,
32528 + * and if the path has become functional again, it is returned
32529 + * to the active state and it becomes available for load
32532 + * if a new IO arrives and we find no currently active paths,
32533 + * each failed path will be attempted one time in the hopes
32534 + * that it may have become active from the time between when
32535 + * it was marked failed and now. only when all paths have
32536 + * been tried and found non-active, is the IO marked with
32537 + * an error and returned.
32539 + * this function works in concert with s390_end_io_callback
32540 + * function and the s390iod(aemon), to redrive failed IO
32545 +s390_load_balance(struct s390_io **piot, struct evms_logical_node *disk)
32547 + struct evms_list_node **dskoln;
32548 + struct disk_object *dsko, *selected_dsko = NULL;
32549 + int dskidx, path = 0;
32550 + struct s390_io *iot;
32552 + /* allocate and initialize an IO tracking structure
32553 + * if one was not passed in.
32556 + struct evms_list_node **devoln;
32557 + /* allocate IO Track struct */
32558 + *piot = evms_cs_allocate_from_pool(s390_io_track_pool,
32560 + memset(*piot, 0, sizeof (*iot));
32561 + /* find the device object */
32562 + devoln = lookup_device_object(disk);
32563 + (*piot)->devo = (*devoln)->item;
32567 + /* find next disk object based on current load */
32569 + /* check for failed paths that have timed-out */
32571 + dskoln = &iot->devo->disk_object_list;
32572 + while (*dskoln) {
32573 + dsko = (struct disk_object *) (*dskoln)->item;
32575 + /* skip paths tried earlier */
32576 + if (iot->paths_tried & dskidx) {
32579 + /* skip active disks */
32580 + if (dsko->flags == S390_DISK_OK) {
32583 + /* skip disks that haven't timed-out yet */
32584 + if (atomic_read(&dsko->skipped_ios)
32585 + < S390_FAILED_SKIP_COUNT) {
32588 + selected_dsko = dsko;
32592 + dskoln = &(*dskoln)->next;
32596 + /* if we have no timed-out paths, then check for the
32597 + * path with lowest pending io count. if that path
32598 + * happens to be a failed path and there is active
32599 + * paths, increment the skipped io count, mark this
32600 + * path as having been selected, then go back and run
32601 + * the loop again, looking for the next best choice.
32602 + * continue this process until the best active has
32603 + * been selected, or we end up with the best failed
32606 + if (!selected_dsko) {
32607 + int paths_selected, have_actives;
32608 + paths_selected = 0;
32609 + s390_repeat_active_search:
32611 + have_actives = FALSE;
32613 + dskoln = &iot->devo->disk_object_list;
32614 + while (*dskoln) {
32615 + dsko = (struct disk_object *) (*dskoln)->item;
32617 + /* skip paths tried earlier */
32618 + if (iot->paths_tried & dskidx) {
32621 + /* skip previously selected disks */
32622 + if (paths_selected & dskidx) {
32625 + /* remember if we have active disks */
32626 + if (dsko->flags == S390_DISK_OK) {
32627 + have_actives = TRUE;
32629 + /* look for disk with smallest
32630 + * pending IO count.
32632 + if (selected_dsko) {
32633 + if (atomic_read(&dsko->pending_ios)
32636 + (&selected_dsko->pending_ios))) {
32640 + selected_dsko = dsko;
32643 + dskoln = &(*dskoln)->next;
32646 + /* if we have unselected active paths
32647 + * and the currently selected path is
32648 + * failed, increment its skipped io count,
32649 + * and then go back to find an active path.
32651 + * this loop is structured this way so that
32652 + * we can accurately determine and track when
32653 + * a path has been skipped.
32655 + if (have_actives && selected_dsko) {
32656 + if (selected_dsko->flags & S390_DISK_FAILED) {
32657 + atomic_inc(&selected_dsko->skipped_ios);
32658 + paths_selected |= path;
32659 + selected_dsko = NULL;
32660 + goto s390_repeat_active_search;
32665 + /* if we have a selected path, perform the necessary
32666 + * bookkeeping on it.
32668 + if (selected_dsko) {
32669 + atomic_set(&selected_dsko->skipped_ios, 0);
32670 + atomic_inc(&selected_dsko->pending_ios);
32671 + atomic_inc(&selected_dsko->total_ios);
32672 + iot->paths_tried |= path;
32674 + /* store the selected path (disk object) in the
32675 + * IO tracking structure, for examination by the
32678 + iot->dsko = selected_dsko;
32682 +s390_end_io_callback(void *private,
32683 + struct buffer_head *bh, int uptodate, int *done)
32685 + struct s390_io *iot;
32690 + /* update the disk object's status */
32691 +// spin_lock_irqsave(iot->devo->device_object_lock, flags);
32692 + atomic_dec(&iot->dsko->pending_ios);
32693 + iot->dsko->flags = !uptodate;
32694 +// spin_unlock_irqrestore(iot->devo->device_object_lock, flags);
32697 + atomic_inc(&iot->dsko->failed_ios);
32698 + /* encountered error */
32700 + /* is this a multipath device? */
32701 + if (iot->devo->total_paths > 1) {
32702 + /* yes, its a multipath device */
32704 + /* determine alternate path */
32705 + s390_load_balance(&iot, NULL);
32707 + /* queue up redrive request */
32708 + spin_lock_irqsave(&s390_redrive_list_lock,
32710 + iot->next = s390_redrive_list;
32711 + s390_redrive_list = iot;
32712 + spin_unlock_irqrestore(&s390_redrive_list_lock,
32714 + /* wake up redrive daemon */
32715 + evms_cs_wakeup_thread(s390_io_redrive_thread);
32717 + /* prevent the end_io to caller of EVMS */
32722 + if (*done == FALSE) {
32723 + evms_cs_deallocate_to_pool(s390_io_track_pool, iot);
32727 +/****************************************************
32728 +* Function: s390iod
32730 +* This is a kernel thread that handles read/write of mirrors
32731 +* This shouldn't ever run on a non-mirrored LV read/write
32734 +*****************************************************/
32736 +s390iod(void *data)
32738 + struct s390_io *iot;
32739 + unsigned long flags;
32743 + spin_lock_irqsave(&s390_redrive_list_lock, flags);
32744 + if (s390_redrive_list == NULL) {
32745 + spin_unlock_irqrestore(&s390_redrive_list_lock, flags);
32748 + iot = s390_redrive_list;
32749 + s390_redrive_list = iot->next;
32750 + iot->next = NULL;
32751 + spin_unlock_irqrestore(&s390_redrive_list_lock, flags);
32753 + /* register for callback */
32754 + rc = evms_cs_register_for_end_io_notification(iot, iot->bh,
32755 + s390_end_io_callback);
32758 + ("error(%d): unable to register for end io callback!\n",
32762 + if (!iot->rw_flag) {
32763 + R_IO(iot->dsko->disk, iot->bh);
32765 + W_IO(iot->dsko->disk, iot->bh);
32772 + * Function: s390_process_segment
32775 +s390_process_segment(struct evms_logical_node **discover_list,
32776 + struct evms_logical_node *node,
32777 + unsigned char *label,
32779 + u64 nr_sects, unsigned char type, int part_num)
32781 + struct local_instance_data *InstData = NULL;
32782 + struct evms_logical_node *segment;
32785 + segment = find_segment_on_disk(node, start_sect, nr_sects);
32787 + LOG_DETAILS("exporting segment '%s'.\n", segment->name);
32789 + InstData = kmalloc(sizeof (*InstData), GFP_KERNEL);
32791 + memset(InstData, 0, sizeof (*InstData));
32792 + InstData->source_disk = node;
32793 + InstData->start_sect = start_sect;
32794 + InstData->nr_sects = nr_sects;
32795 + InstData->type = type;
32796 + rc = evms_cs_allocate_logical_node(&segment);
32802 + segment->plugin = &plugin_header;
32803 + segment->system_id = (unsigned int) type;
32804 + segment->total_vsectors = nr_sects;
32805 + segment->block_size = node->block_size;
32806 + segment->hardsector_size = node->hardsector_size;
32807 + segment->private = InstData;
32808 + segment->flags = node->flags;
32809 + strcpy(segment->name, node->name);
32810 + sprintf(segment->name + strlen(segment->name), "%d",
32812 + LOG_DETAILS("creating segment '%s'.\n", segment->name);
32813 + rc = add_segment_to_disk(node, label, segment);
32816 + ("%s: error(%d) adding segment '%s'!\n",
32817 + __FUNCTION__, rc, segment->name);
32820 + MOD_INC_USE_COUNT;
32827 + evms_cs_deallocate_logical_node(segment);
32831 + evms_cs_add_logical_node_to_list(discover_list, segment);
32832 + exported_nodes++;
32838 + ibm_partition_lnx1 = 0,
32839 + ibm_partition_vol1 = 1,
32840 + ibm_partition_cms1 = 2,
32841 + ibm_partition_none = 3
32842 +} ibm_partition_t;
32844 +static char *part_names[] = {
32845 + [ibm_partition_lnx1] = "LNX1",
32846 + [ibm_partition_vol1] = "VOL1",
32847 + [ibm_partition_cms1] = "CMS1",
32848 + [ibm_partition_none] = "(nonl)"
32851 +static ibm_partition_t
32852 +get_partition_type(char *type)
32855 + for (i = 0; i < 3; i++) {
32856 + if (!strncmp(type, part_names[i], 4))
32863 + * compute the block number from a
32864 + * cyl-cyl-head-head structure
32867 +cchh2blk(cchh_t * ptr, struct hd_geometry *geo)
32869 + return ptr->cc * geo->heads * geo->sectors + ptr->hh * geo->sectors;
32873 + * compute the block number from a
32874 + * cyl-cyl-head-head-block structure
32877 +cchhb2blk(cchhb_t * ptr, struct hd_geometry *geo)
32881 + block = ptr->cc * geo->heads * geo->sectors + ptr->hh * geo->sectors;
32883 + block += ptr->b - 1;
32890 +print_mem(void *buffer, int length)
32893 + unsigned char *bufptr;
32895 + bufptr = (unsigned char *) buffer;
32898 + if ((i % 16) == 0)
32899 + printk(KERN_INFO "\n0x%p->", buffer + i);
32900 + printk(KERN_INFO "%02x ", bufptr[i]);
32901 + if (++i >= length)
32904 + printk(KERN_INFO "\n");
32908 +s390_probe_multipath(struct evms_logical_node *disk,
32909 + unsigned char *label,
32910 + u64 label_lba, int label_offset, unsigned char *org_buf)
32913 + struct evms_list_node **devoln;
32914 + unsigned char *sector_buf = NULL;
32916 + LOG_ENTRY_EXIT("%s: Entry\n", __FUNCTION__);
32917 + /* check if this disk is already known.
32918 + * if it is already in our device list
32919 + * then we don't need to check for
32920 + * multipath associations.
32922 + devoln = lookup_device_object(disk);
32923 + /* is this disk in our list? */
32925 + struct device_object *devo;
32926 + struct disk_object *dsko;
32927 + /* yes, disk already known */
32929 + /* we need to determine if this
32930 + * is our first path to this
32933 + devo = (struct device_object *) (*devoln)->item;
32934 + /* if this is the first path to this
32935 + * device, return FALSE so the main
32936 + * routine will process its segments.
32937 + * if this is not the first path,
32938 + * return TRUE so the main routine
32939 + * will not process its segments.
32941 + dsko = (struct disk_object *) devo->disk_object_list->item;
32942 + if (dsko->disk != disk) {
32945 + /* only print multipath log msgs if its
32946 + * active on this device.
32948 + if (devo->total_paths > 1) {
32950 + ("skipping probe of known multipath device '%s'.\n",
32953 + LOG_ENTRY_EXIT("%s: Exit RC(%d)\n", __FUNCTION__, rc);
32957 + /* search device object list for a matching label */
32959 + while (*(devoln = lookup_label(label, devoln))) {
32960 + struct device_object *devo;
32961 + struct disk_object *dsko;
32962 + unsigned char org_label[6];
32963 +#define S390_TEST_LABEL "~!@#$"
32965 + /* yes, found matching label */
32966 + if (!sector_buf) {
32967 + /* allocate buffer for incoming label sector */
32968 + sector_buf = kmalloc(disk->hardsector_size, GFP_KERNEL);
32969 + if (!sector_buf) {
32975 + /* save original label */
32976 + memcpy(org_label, org_buf + label_offset, 6);
32977 + /* alter label to test pattern */
32978 + strcpy(org_buf + label_offset, S390_TEST_LABEL);
32979 + /* write test pattern to this disk */
32980 + LOG_DEBUG("writing test label to '%s'.\n", disk->name);
32981 + rc = INIT_IO(disk, WRITE, label_lba, 1, org_buf);
32983 + LOG_ERROR("error(%d) reading sector("PFU64") from '%s'.\n",
32984 + rc, label_lba, disk->name);
32988 + /* read label from device object with matching label */
32989 + devo = (struct device_object *) (*devoln)->item;
32990 + dsko = (struct disk_object *) devo->disk_object_list->item;
32991 + LOG_DEBUG("reading label from '%s'.\n", dsko->disk->name);
32992 + rc = INIT_IO(dsko->disk, READ, label_lba, 1, sector_buf);
32994 + LOG_ERROR("error(%d) writing sector("PFU64") to '%s'.\n",
32995 + rc, label_lba, dsko->disk->name);
32998 + /* restore original label */
32999 + memcpy(org_buf + label_offset, org_label, 6);
33000 + LOG_DEBUG("restoring original label to '%s'.\n", disk->name);
33001 + rc = INIT_IO(disk, WRITE, label_lba, 1, org_buf);
33003 + LOG_ERROR("error(%d) reading sector("PFU64") from '%s'.\n",
33004 + rc, label_lba, disk->name);
33008 + LOG_DEBUG("checking label: %s(%6s), reference(%6s).\n",
33009 + dsko->disk->name,
33010 + sector_buf + label_offset, S390_TEST_LABEL);
33011 + if (!strcmp(sector_buf + label_offset, S390_TEST_LABEL)) {
33012 + LOG_DETAILS("assigning '%s' as first path to device.\n",
33013 + dsko->disk->name);
33014 + LOG_DETAILS("assigning '%s' as next path to device.\n",
33016 + /* store this disk in the disk object's
33019 + /* create a disk object */
33022 + dsko = kmalloc(sizeof (*dsko), GFP_KERNEL);
33028 + memset(dsko, 0, sizeof (*dsko));
33029 + /* add disk to disk object */
33030 + dsko->disk = disk;
33031 + /* add disk object to disk object list
33032 + * in device object */
33033 + rc = evms_cs_add_item_to_list(&devo->
33034 + disk_object_list,
33038 + devo->total_paths++;
33046 + /* indicate we found a multipath device */
33051 + if (sector_buf) {
33052 + kfree(sector_buf);
33055 + LOG_ENTRY_EXIT("%s: Exit RC(%d)\n", __FUNCTION__, rc);
33060 +s390_probe_for_segments(struct evms_logical_node **discover_list,
33061 + struct evms_logical_node *disk)
33063 + char type[5] = { 0, }, name[7] = {
33065 + int rc, vsects_per_hardsect = 0;
33066 + unsigned int blk;
33067 + u64 io_start, label_lba = 3;
33068 + dasd_information_t *info = NULL;
33069 + struct hd_geometry *geo = NULL;
33070 + unchar *data = NULL;
33072 + /* allocate space for DASD ioctl packet
33074 + info = kmalloc(sizeof (dasd_information_t), GFP_KERNEL);
33076 + memset(info, 0, sizeof (dasd_information_t));
33077 + LOG_DEBUG("probing '%s' for 390 DASD info...\n", disk->name);
33078 + /* issue DASD info ioctl
33080 + rc = evms_cs_kernel_ioctl(disk, BIODASDINFO,
33081 + (unsigned long) info);
33083 + LOG_DEBUG("error(%d) from BIODASDINFO ioctl.\n", rc);
33084 + LOG_DEBUG("assuming '%s' is not a valid 390 device!\n",
33092 + /* if we successfully completed the previous
33093 + * get DASD info ioctl, we will assume that
33094 + * the device is a valid 390 disk.
33096 + * remove it from the discover list.
33098 + rc = evms_cs_remove_logical_node_from_list(discover_list, disk);
33101 + ("error(%d) removing disk(%s) from discover list.\n",
33106 + /* allocate space for the geometry packet
33108 + geo = kmalloc(sizeof (struct hd_geometry), GFP_KERNEL);
33114 + memset(geo, 0, sizeof (struct hd_geometry));
33115 + /* issue the Get GEO ioctl
33117 + rc = evms_cs_kernel_ioctl(disk, HDIO_GETGEO,
33118 + (unsigned long) geo);
33120 + LOG_ERROR("error(%d) from HDIO_GETGEO ioctl.\n", rc);
33124 + /* retrieve the vsects_per_hardsect (hardsector size)
33126 + vsects_per_hardsect = disk->hardsector_size;
33127 + vsects_per_hardsect >>= EVMS_VSECTOR_SIZE_SHIFT;
33128 + data = kmalloc(EVMS_VSECTOR_SIZE, GFP_KERNEL);
33134 + /* go read the 1st block on the disk
33136 + label_lba = info->label_block * vsects_per_hardsect;
33137 + io_start = label_lba;
33138 + rc = INIT_IO(disk, READ, io_start, 1, data);
33140 + LOG_ERROR("error(%d) reading sector("PFU64") from '%s'.\n",
33141 + rc, io_start, disk->name);
33143 +// print_mem(data, EVMS_VSECTOR_SIZE);
33147 + int offset, size, psize, counter = 0, label_offset;
33148 + int vstart = 0, vend = 0;
33149 + int vtoc_record_count, vtoc_index;
33150 + format1_label_t f1;
33151 + format4_label_t *f4;
33152 + volume_label_t vlabel;
33153 + ibm_partition_t partition_type;
33155 + /* determine the format type
33158 + strncpy(type, data, 4);
33159 + if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) {
33160 + label_offset = 8;
33162 + label_offset = 4;
33164 + strncpy(name, data + label_offset, 6);
33165 + memcpy(&vlabel, data, sizeof (volume_label_t));
33169 + partition_type = get_partition_type(type);
33170 + LOG_DETAILS("disk: raw type(%s), type(%s), name(%s)\n",
33171 + type, part_names[partition_type], name);
33173 + rc = s390_probe_multipath(disk, name, label_lba, label_offset,
33176 + switch (partition_type) {
33177 + case ibm_partition_cms1:
33178 + if (*((long *) data + 13) != 0) {
33179 + /* disk is reserved minidisk */
33180 + long *label = (long *) data;
33181 + vsects_per_hardsect =
33182 + label[3] >> EVMS_VSECTOR_SIZE_SHIFT;
33183 + offset = label[13];
33186 + 1) * vsects_per_hardsect;
33187 + LOG_DEBUG("(MDSK)");
33189 + offset = info->label_block + 1;
33190 + size = disk->total_vsectors;
33192 + offset *= vsects_per_hardsect;
33193 + /* adjust for 0 thru label block offset
33196 + rc = s390_process_segment(discover_list,
33199 + offset, size, 0, 1);
33201 + case ibm_partition_lnx1:
33202 + case ibm_partition_none:
33203 + offset = info->label_block + 1;
33204 + offset *= vsects_per_hardsect;
33205 + size = disk->total_vsectors;
33206 + /* adjust for 0 thru label block offset
33209 + rc = s390_process_segment(discover_list,
33212 + offset, size, 0, 1);
33214 + case ibm_partition_vol1:
33215 + /* set max dscb record count == single track till we see the vtoc descriptor */
33216 + vtoc_record_count = geo->sectors;
33217 + /* set current index into vtoc */
33219 + /* get block number and read then first dscb */
33220 + blk = cchhb2blk(&vlabel.vtoc, geo);
33221 + io_start = blk * vsects_per_hardsect;
33222 + rc = INIT_IO(disk, READ, io_start, 1, data);
33225 + ("error(%d) reading sector("PFU64") from '%s'.\n",
33226 + rc, io_start, disk->name);
33229 + // print_mem(data, EVMS_VSECTOR_SIZE);
33231 + memcpy(&f1, data, sizeof (format1_label_t));
33233 + // read vtoc records ... terminate when :
33234 + // (1) we hit first NULL record
33235 + // (2) we get an error processing a vtoc record
33236 + // (3) we run out of vtoc records to process
33237 + while (f1.DS1FMTID != 0x00 && rc == 0
33238 + && vtoc_index < vtoc_record_count) {
33239 + if (f1.DS1FMTID == 0xf4) { // vtoc descriptor
33240 + f4 = (format4_label_t *) data;
33242 + cchh2blk(&f4->DS4VTOCE.
33245 + cchh2blk(&f4->DS4VTOCE.
33247 + vtoc_record_count =
33248 + (vend - vstart) +
33250 + } else if (f1.DS1FMTID == 0xf1) { // dataset descriptor
33253 + cchh2blk(&f1.DS1EXT1.llimit,
33256 + cchh2blk(&f1.DS1EXT1.ulimit,
33261 + rc = s390_process_segment
33262 + (discover_list, disk, name,
33264 + vsects_per_hardsect,
33266 + vsects_per_hardsect, 0,
33269 + if (!rc) { // get next dscb
33273 + blk * vsects_per_hardsect;
33274 + rc = INIT_IO(disk, READ,
33275 + io_start, 1, data);
33278 + ("error(%d) reading sector("PFU64") from '%s'.\n",
33283 + // print_mem(data, EVMS_VSECTOR_SIZE);
33285 + memcpy(&f1, data,
33287 + (format1_label_t));
33292 + rc = s390_process_segment(discover_list,
33312 + * Function: s390_partition_discover
33316 +s390_partition_discover(struct evms_logical_node **discover_list)
33319 + struct evms_logical_node *node, *next_node;
33321 + MOD_INC_USE_COUNT;
33322 + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
33324 + /* initialize global variable */
33325 + exported_nodes = 0;
33327 + /* examine each node on the discover list */
33328 + next_node = *discover_list;
33329 + while (next_node) {
33330 + node = next_node;
33331 + next_node = node->next;
33332 + if (GetPluginType(node->plugin->id) != EVMS_DEVICE_MANAGER)
33333 + /* only process disk nodes
33336 + if (node->iflags & EVMS_TOP_SEGMENT)
33338 + s390_probe_for_segments(discover_list, node);
33341 + LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
33342 + __FUNCTION__, exported_nodes, rc);
33343 + if (exported_nodes)
33344 + rc = exported_nodes;
33345 + MOD_DEC_USE_COUNT;
33350 + * Function: s390_partition_delete
33354 +s390_partition_delete(struct evms_logical_node *segment)
33357 + struct local_instance_data *LID;
33359 + LOG_DETAILS("deleting segment '%s'.\n", segment->name);
33364 + struct evms_list_node *empty_disk_object_list = NULL;
33365 + LID = segment->private;
33367 + /* remove the segment from the
33368 + * disk's segment list
33370 + rc = remove_segment_from_disk(LID->source_disk,
33372 + &empty_disk_object_list);
33373 + /* free the local instance data */
33376 + /* free the segment node */
33377 + evms_cs_deallocate_logical_node(segment);
33378 + MOD_DEC_USE_COUNT;
33379 + /* if the last segment on the disk was
33380 + * deleted, delete the disk node(s) too
33382 + while (empty_disk_object_list) {
33383 + struct disk_object *dsko;
33385 + (struct disk_object *) empty_disk_object_list->item;
33386 + rc = evms_cs_remove_item_from_list
33387 + (&empty_disk_object_list, dsko);
33389 + rc = DELETE(dsko->disk);
33392 + ("error(%d): attempting to delete '%s'.\n",
33393 + rc, dsko->disk->name);
33404 + * function: s390_partition_io_error
33406 + * this function was primarily created because the function
33407 + * buffer_IO_error is inline and kgdb doesn't allow breakpoints
33408 + * to be set on inline functions. Since this was an error path
33409 + * and not mainline, I decided to add a trace statement to help
33410 + * report on the failing condition.
33414 +s390_partition_io_error(int rc,
33415 + struct evms_logical_node *node,
33416 + int io_flag, struct buffer_head *bh)
33421 + ("attempt to %s beyond partition boundary("PFU64") on (%s), rsector(%ld).\n",
33422 + (io_flag) ? "WRITE" : "READ", node->total_vsectors - 1,
33423 + node->name, bh->b_rsector);
33427 + ("%s error(no active paths) on '%s' to drive the I/O.\n",
33428 + (io_flag) ? "WRITE" : "READ", node->name);
33432 + bh->b_end_io(bh, 0);
33436 + * Function: s390_partition_read
33440 +s390_partition_read(struct evms_logical_node *partition, struct buffer_head *bh)
33442 + struct local_instance_data *LID = partition->private;
33443 + struct s390_io *iot = NULL;
33446 + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <=
33447 + partition->total_vsectors) {
33448 + bh->b_rsector += LID->start_sect;
33450 + s390_load_balance(&iot, LID->source_disk);
33452 + iot->segment = partition;
33454 + iot->rw_flag = READ;
33455 + /* register the callback */
33456 + evms_cs_register_for_end_io_notification(iot, bh,
33457 + s390_end_io_callback);
33458 + /* drive the IO */
33459 + R_IO(iot->dsko->disk, bh);
33465 + s390_partition_io_error(rc, partition, READ, bh);
33469 + * Function: s390_partition_write
33473 +s390_partition_write(struct evms_logical_node *partition,
33474 + struct buffer_head *bh)
33476 + struct local_instance_data *LID = partition->private;
33477 + struct s390_io *iot = NULL;
33480 + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <=
33481 + partition->total_vsectors) {
33482 + bh->b_rsector += LID->start_sect;
33484 + s390_load_balance(&iot, LID->source_disk);
33486 + iot->segment = partition;
33488 + iot->rw_flag = WRITE;
33489 + /* register the callback */
33490 + evms_cs_register_for_end_io_notification(iot, bh,
33491 + s390_end_io_callback);
33492 + /* drive the IO */
33493 + W_IO(iot->dsko->disk, bh);
33499 + s390_partition_io_error(rc, partition, WRITE, bh);
33503 + * Function: s390_partition_init_io
33507 +s390_partition_init_io(struct evms_logical_node *partition, int io_flag, /* 0=read, 1=write */
33508 + u64 sect_nr, /* disk LBA */
33509 + u64 num_sects, /* # of sectors */
33511 +{ /* buffer address */
33513 + struct local_instance_data *LID = partition->private;
33514 + struct s390_io *iot = NULL;
33516 + if ((sect_nr + num_sects) <= partition->total_vsectors) {
33518 + s390_load_balance(&iot, LID->source_disk);
33519 + if (!iot->dsko) {
33523 + rc = INIT_IO(iot->dsko->disk, io_flag,
33524 + sect_nr + LID->start_sect, num_sects,
33526 + /* do disk object IO bookkeeping */
33527 + atomic_dec(&iot->dsko->pending_ios);
33528 + if (rc == -EIO) {
33529 + atomic_inc(&iot->dsko->failed_ios);
33530 + iot->dsko->flags = S390_DISK_FAILED;
33532 + iot->dsko->flags = S390_DISK_OK;
33534 + } while (rc == -EIO);
33535 + evms_cs_deallocate_to_pool(s390_io_track_pool, iot);
33538 + ("init_io: attempt to %s beyond partition(%s) boundary("PFU64") at sector("PFU64") for count("PFU64").\n",
33539 + (io_flag) ? "WRITE" : "READ", partition->name,
33540 + (LID->nr_sects - 1), sect_nr, num_sects);
33548 + * Function: s390_partition_ioctl
33552 +s390_partition_ioctl(struct evms_logical_node *partition,
33553 + struct inode *inode,
33554 + struct file *file, unsigned int cmd, unsigned long arg)
33556 + struct local_instance_data *LID;
33557 + struct hd_geometry hd_geo;
33561 + LID = partition->private;
33565 + case HDIO_GETGEO:
33567 + rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
33570 + if (copy_from_user
33571 + (&hd_geo, (void *) arg,
33572 + sizeof (struct hd_geometry)))
33576 + hd_geo.start = LID->start_sect;
33578 + ((void *) arg, &hd_geo,
33579 + sizeof (struct hd_geometry)))
33583 + case EVMS_GET_BMAP:
33585 + struct evms_get_bmap_pkt *bmap =
33586 + (struct evms_get_bmap_pkt *) arg;
33587 + bmap->rsector += LID->start_sect;
33588 + /* intentionally fall thru to
33589 + * default ioctl down to device
33594 + rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
33600 + * Function: s390_part_init
33604 +s390_part_init(void)
33606 + const char *name = "evms_s390iod";
33608 + /* create s390 IODaemon thread */
33609 + s390_io_redrive_thread = evms_cs_register_thread(s390iod, NULL, name);
33610 + /* create pool of IO tracking structures */
33611 + s390_io_track_pool =
33612 + evms_cs_create_pool(sizeof (struct s390_io), "EVMS_s390_IO", NULL,
33615 + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
33618 +static void __exit
33619 +s390_part_exit(void)
33621 + evms_cs_unregister_plugin(&plugin_header);
33624 +module_init(s390_part_init);
33625 +module_exit(s390_part_exit);
33626 +#ifdef MODULE_LICENSE
33627 +MODULE_LICENSE("GPL");
33629 diff -Naur linux-2002-09-30/drivers/evms/snapshot.c evms-2002-09-30/drivers/evms/snapshot.c
33630 --- linux-2002-09-30/drivers/evms/snapshot.c Wed Dec 31 18:00:00 1969
33631 +++ evms-2002-09-30/drivers/evms/snapshot.c Wed Sep 25 16:53:00 2002
33633 +/* -*- linux-c -*- */
33635 + * Copyright (c) International Business Machines Corp., 2000
33637 + * This program is free software; you can redistribute it and/or modify
33638 + * it under the terms of the GNU General Public License as published by
33639 + * the Free Software Foundation; either version 2 of the License, or
33640 + * (at your option) any later version.
33642 + * This program is distributed in the hope that it will be useful,
33643 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
33644 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
33645 + * the GNU General Public License for more details.
33647 + * You should have received a copy of the GNU General Public License
33648 + * along with this program; if not, write to the Free Software
33649 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
33652 + * linux/drivers/evms/snapshot.c
33654 + * EVMS SnapShot Feature.
33656 + * This feature provides the ability to Snapshot ANY existing EVMS volume
33657 + * (including compatibility) to a new EVMS volume that is created when the
33658 + * SnapShot is enabled. This feature will appear in the call stack for both
33659 + * the original and the snapshot volume.
33662 +#define LOG_PREFIX "snapshot: "
33664 +#include <linux/kernel.h>
33665 +#include <linux/module.h>
33666 +#include <linux/compiler.h>
33667 +#include <linux/mempool.h>
33668 +#include <linux/version.h>
33669 +#include <linux/vmalloc.h>
33670 +#include <asm/uaccess.h>
33671 +#include <linux/evms/evms.h>
33672 +#include <linux/evms/evms_snapshot.h>
33674 +static struct proc_dir_entry * snap_proc = NULL;
33675 +static unsigned int snapshot_count = 0; /* Number of active snapshots and originals. */
33677 +/* Memory pools. */
33678 +static kmem_cache_t * snap_page_slab = NULL;
33679 +static mempool_t * snap_page_pool = NULL;
33680 +static kmem_cache_t * snap_buffer_slab = NULL;
33681 +static mempool_t * snap_buffer_pool = NULL;
33682 +static kmem_cache_t * snap_async_org_io_slab = NULL;
33683 +static mempool_t * snap_async_org_io_pool = NULL;
33684 +static kmem_cache_t * snap_async_snap_io_slab = NULL;
33685 +static mempool_t * snap_async_snap_io_pool = NULL;
33686 +static kmem_cache_t * snap_hash_entry_slab = NULL;
33687 +static mempool_t * snap_hash_entry_pool = NULL;
33689 +#ifdef SNAPSHOT_DEBUG
33690 +static struct async_org_io * debug_async_org_io_list = NULL;
33691 +static spinlock_t debug_async_org_io_list_lock = SPIN_LOCK_UNLOCKED;
33694 +/* API prototypes */
33695 +static int snap_discover_volumes(struct evms_logical_node ** evms_node_list);
33696 +static int snap_delete_volume(struct evms_logical_node * node);
33697 +static void snap_read(struct evms_logical_node * node,
33698 + struct buffer_head * bh);
33699 +static void snap_write(struct evms_logical_node * node,
33700 + struct buffer_head * bh);
33701 +static int snap_init_io(struct evms_logical_node * node, int rw,
33702 + u64 sect_nr, u64 num_sects, void * buf_addr);
33703 +static int snap_ioctl(struct evms_logical_node * node,
33704 + struct inode * inode, struct file * file,
33705 + unsigned int cmd, unsigned long arg);
33707 +/* Other functions that require prototypes. */
33708 +static int add_snapshot(struct evms_logical_node * node,
33709 + struct snapshot_metadata * metadata,
33710 + struct evms_logical_node ** evms_node_list);
33711 +static int snap_proc_read(char * page, char ** start, off_t off,
33712 + int count, int * eof, void * data);
33713 +static void snapshot_do_rollback(void * volume);
33714 +static void snap_async_io_thread(void * volume);
33715 +void snap_read_chunk_cb(struct buffer_head * bh, int uptodate);
33716 +void snap_write_chunk_cb(struct buffer_head * bh, int uptodate);
33717 +void snap_cow_table_cb(struct buffer_head * bh, int uptodate);
33719 +/* Snapshot plugin's function table and header. */
33720 +static struct evms_plugin_fops function_table = {
33721 + .discover = snap_discover_volumes,
33722 + .delete = snap_delete_volume,
33723 + .read = snap_read,
33724 + .write = snap_write,
33725 + .init_io = snap_init_io,
33726 + .ioctl = snap_ioctl
33729 +static struct evms_plugin_header plugin_header = {
33730 + .id = SetPluginID(IBM_OEM_ID,
33731 + EVMS_ASSOCIATIVE_FEATURE,
33732 + EVMS_SNAPSHOT_FEATURE_ID),
33734 + .major = EVMS_SNAPSHOT_VERSION_MAJOR,
33735 + .minor = EVMS_SNAPSHOT_VERSION_MINOR,
33736 + .patchlevel = EVMS_SNAPSHOT_VERSION_PATCHLEVEL
33738 + .required_services_version = {
33739 + .major = EVMS_COMMON_SERVICES_MAJOR,
33740 + .minor = EVMS_COMMON_SERVICES_MINOR,
33741 + .patchlevel = EVMS_COMMON_SERVICES_PATCHLEVEL
33743 + .fops = &function_table
33747 + * convert_metadata - Perform endian conversion on a metadata sector.
33748 + * @metadata: snapshot metadata sector
33750 +static void convert_metadata(struct snapshot_metadata * metadata)
33752 + metadata->signature = le32_to_cpup(&metadata->signature);
33753 + metadata->CRC = le32_to_cpup(&metadata->CRC);
33754 + metadata->version.major = le32_to_cpup(&metadata->version.major);
33755 + metadata->version.minor = le32_to_cpup(&metadata->version.minor);
33756 + metadata->version.patchlevel = le32_to_cpup(&metadata->version.patchlevel);
33757 + metadata->flags = le32_to_cpup(&metadata->flags);
33758 + metadata->original_size = le64_to_cpup(&metadata->original_size);
33759 + metadata->lba_of_COW_table = le64_to_cpup(&metadata->lba_of_COW_table);
33760 + metadata->lba_of_first_chunk = le64_to_cpup(&metadata->lba_of_first_chunk);
33761 + metadata->chunk_size = le32_to_cpup(&metadata->chunk_size);
33762 + metadata->total_chunks = le32_to_cpup(&metadata->total_chunks);
33765 +static void *slab_pool_alloc(int gfp_mask, void * data)
33767 + return kmem_cache_alloc(data, gfp_mask);
33770 +static void slab_pool_free(void * ptr, void * data)
33772 + kmem_cache_free(data, ptr);
33776 + * allocate_snapshot_hash_entry
33777 + * @volume: Snapshot volume to get a new entry for.
33778 + * @org_chunk: Number of original chunk.
33779 + * @snap_chunk: Number of remap chunk.
33780 + * @chunk_state: see SNAP_CHUNK_*
33782 + * Get a snapshot_hash_entry from the pool and initialize. Accessing the
33783 + * free_hash_list is safe, since we only call this function while holding
33784 + * the snap_semaphore.
33786 +static struct snapshot_hash_entry *
33787 +allocate_snapshot_hash_entry(struct snapshot_volume * volume,
33792 + struct snapshot_hash_entry * hash_entry;
33794 + hash_entry = volume->free_hash_list;
33795 + if (hash_entry) {
33796 + volume->free_hash_list = hash_entry->next;
33797 + hash_entry->org_chunk = org_chunk;
33798 + hash_entry->snap_chunk = snap_chunk;
33799 + hash_entry->chunk_state = chunk_state;
33800 + hash_entry->snap_io = NULL;
33801 + hash_entry->next = NULL;
33802 + hash_entry->prev = NULL;
33803 + spin_lock_init(&hash_entry->chunk_state_lock);
33805 + /* Should never happen, since hash entries are max
33806 + * allocated at discovery time.
33811 + return hash_entry;
33815 + * insert_snapshot_hash_entry
33817 + * Insert a new entry into a snapshot hash chain, immediately following the
33818 + * specified entry. This function should not be used to add an entry into an
33819 + * empty list, or as the first entry in an existing list. For that case, use
33820 + * insert_snapshot_map_entry_at_head().
33822 +static int insert_snapshot_hash_entry(struct snapshot_hash_entry * entry,
33823 + struct snapshot_hash_entry * base)
33825 + entry->next = base->next;
33826 + entry->prev = base;
33827 + base->next = entry;
33828 + if ( entry->next ) {
33829 + entry->next->prev = entry;
33835 + * insert_snapshot_hash_entry_at_head
33837 + * Insert a new entry into a snapshot chain as the first entry in the chain.
33839 +static int insert_snapshot_hash_entry_at_head(struct snapshot_hash_entry * entry,
33840 + struct snapshot_hash_entry ** head)
33842 + entry->next = *head;
33843 + entry->prev = NULL;
33845 + if ( entry->next ) {
33846 + entry->next->prev = entry;
33852 + * set_snapshot_flags
33854 + * @set_flag: Flags to turn "on" in the metadata sector.
33855 + * @unset_flag: Flags to turn "off" in the metadata sector.
33857 + * Set the flags field in the metadata and write the metadata sector to
33858 + * the snapshot volume. The node passed in to this function should be the
33859 + * "lower" of the snapshot nodes, meaning the one consumed by the snapshot
33860 + * plugin, not the one exported from the plugin.
33862 + * Appropriate values for the two flag parameters are:
33863 + * EVMS_SNAPSHOT_DISABLED
33864 + * EVMS_SNAPSHOT_FULL
33865 + * EVMS_SNAPSHOT_ROLLBACK
33866 + * EVMS_SNAPSHOT_ROLLBACK_COMP
33868 +static int set_snapshot_flags(struct evms_logical_node * snap_node,
33872 + unsigned char data[EVMS_VSECTOR_SIZE] = {0};
33873 + struct snapshot_metadata * metadata = (struct snapshot_metadata*)data;
33875 + /* Read the metadata sector */
33876 + if ( INIT_IO(snap_node, 0, snap_node->total_vsectors-3, 1, data) ) {
33880 + /* Set the appropriate flags. Do endian conversion on the fly. */
33881 + metadata->flags |= cpu_to_le32p(&set_flag);
33882 + metadata->flags &= ~(cpu_to_le32p(&unset_flag));
33883 + metadata->CRC = 0;
33884 + metadata->CRC = cpu_to_le32(evms_cs_calculate_crc(EVMS_INITIAL_CRC,
33886 + sizeof(struct snapshot_metadata)));
33888 + /* Write the metadata sector back to the volume. */
33889 + if ( INIT_IO(snap_node, 1, snap_node->total_vsectors-3, 1, data) ) {
33897 + * disable_snapshot
33899 +static void disable_snapshot(struct snapshot_volume * snap_volume,
33900 + int update_metadata)
33902 + LOG_ERROR("Disabling snapshot volume '%s'.\n",
33903 + snap_volume->exported_node->name);
33904 + snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
33905 + if ( update_metadata ) {
33906 + set_snapshot_flags(snap_volume->logical_node,
33907 + EVMS_SNAPSHOT_DISABLED, 0);
33909 + snap_volume->flags |= EVMS_SNAPSHOT_DISABLED_PENDING;
33910 + evms_cs_wakeup_thread(snap_volume->snapshot_org->async_io_thread);
33915 + * snap_discover_volumes
33917 + * Inspect the global node list, looking for volumes with a valid
33918 + * snapshot metadata sector.
33920 +static int snap_discover_volumes(struct evms_logical_node ** evms_node_list)
33922 + struct evms_logical_node * node, * next_node;
33923 + struct snapshot_metadata * metadata = NULL;
33924 + int org_crc, final_crc, rc = 0;
33926 + MOD_INC_USE_COUNT;
33928 + /* A buffer for reading the metadata. */
33929 + metadata = kmalloc(EVMS_VSECTOR_SIZE, GFP_KERNEL);
33931 + MOD_DEC_USE_COUNT;
33935 + /* Check every node on the discovery list. */
33936 + for ( node = *evms_node_list; node && !rc; node = next_node ) {
33937 + next_node = node->next;
33939 + /* This node must not be one we put back on the list already,
33940 + * and must have a feature header with snapshot's ID.
33942 + if ( node->plugin->id == plugin_header.id ||
33943 + ! node->feature_header ||
33944 + node->feature_header->feature_id != plugin_header.id ) {
33948 + /* Read third-to-last sector for the snapshot metadata. */
33949 + rc = INIT_IO(node, 0, node->total_vsectors-3, 1, metadata);
33951 + LOG_ERROR("IO error reading sector "PFU64" on '%s'.\n",
33952 + node->total_vsectors-3, node->name);
33953 + rc = -EVMS_FEATURE_FATAL_ERROR;
33954 + evms_cs_remove_logical_node_from_list(evms_node_list,
33960 + /* Check for a valid snapshot signature. */
33961 + if ( le32_to_cpup(&metadata->signature) !=
33962 + EVMS_SNAPSHOT_SIGNATURE ) {
33965 + evms_cs_remove_logical_node_from_list(evms_node_list, node);
33967 + /* Check for a valid CRC. */
33968 + org_crc = le32_to_cpup(&metadata->CRC);
33969 + metadata->CRC = 0;
33970 + final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, metadata,
33971 + sizeof(struct snapshot_metadata));
33972 + if ( final_crc != org_crc ) {
33973 + LOG_ERROR("CRC error in feature data on '%s'.\n",
33975 + rc = -EVMS_FEATURE_FATAL_ERROR;
33980 + /* Check for correct metadata version. */
33981 + convert_metadata(metadata);
33982 + if ( metadata->version.major > plugin_header.version.major ) {
33983 + LOG_ERROR("ERROR: unsupported metadata version on '%s'.\n",
33985 + rc = -EVMS_FEATURE_FATAL_ERROR;
33990 + rc = add_snapshot(node, metadata, evms_node_list);
33994 + MOD_DEC_USE_COUNT;
34001 + * Make sure an original volume and all of its snapshots are quiesced.
34003 +static int check_quiesce(struct snapshot_volume * org_volume)
34005 + struct snapshot_volume * next_vol;
34007 + for ( next_vol = org_volume;
34009 + next_vol = next_vol->snapshot_next ) {
34010 + if ( ! (next_vol->flags & EVMS_SNAPSHOT_QUIESCED) ) {
34011 + LOG_ERROR("Can't delete snapshot, volume '%s' not quiesced.\n",
34012 + next_vol->logical_node->name);
34020 + * remove_snapshot_from_chain
34022 + * Remove the specified snapshot volume from its original's chain of snapshots.
34024 +static int remove_snapshot_from_chain(struct snapshot_volume * snap_volume)
34026 + struct snapshot_volume ** p_volume;
34028 + if ( snap_volume->snapshot_org ) {
34029 + down_write(&snap_volume->snapshot_org->snap_semaphore);
34030 + for ( p_volume = &snap_volume->snapshot_org->snapshot_next;
34032 + p_volume = &(*p_volume)->snapshot_next ) {
34033 + if ( *p_volume == snap_volume ) {
34034 + *p_volume = (*p_volume)->snapshot_next;
34038 + up_write(&snap_volume->snapshot_org->snap_semaphore);
34040 + snap_volume->snapshot_org = NULL;
34041 + snap_volume->snapshot_next = NULL;
34046 + * delete_snapshot_hash_chain
34048 + * Delete all items in a single chain in the hash table.
34050 +static int delete_snapshot_hash_chain(struct snapshot_hash_entry * head)
34052 + struct snapshot_hash_entry * next;
34055 + next = head->next;
34056 + mempool_free(head, snap_hash_entry_pool);
34063 + * snapshot_delete_pools
34065 + * Delete all memory pools after all snapshots have been deleted.
34066 + * Also shutdown the daemon thread.
34068 +static void snapshot_delete_pools(void)
34070 + /* The pool of data pages. */
34071 + if (snap_page_slab) {
34072 + if (snap_page_pool) {
34073 + mempool_destroy(snap_page_pool);
34074 + snap_page_pool = NULL;
34076 + kmem_cache_destroy(snap_page_slab);
34077 + snap_page_slab = NULL;
34080 + /* The pool of snap_io_buffer's. */
34081 + if (snap_buffer_slab) {
34082 + if (snap_buffer_pool) {
34083 + mempool_destroy(snap_buffer_pool);
34084 + snap_buffer_pool = NULL;
34086 + kmem_cache_destroy(snap_buffer_slab);
34087 + snap_buffer_slab = NULL;
34090 + /* The pool of async_org_io's. */
34091 + if (snap_async_org_io_slab) {
34092 + if (snap_async_org_io_pool) {
34093 + mempool_destroy(snap_async_org_io_pool);
34094 + snap_async_org_io_pool = NULL;
34096 + kmem_cache_destroy(snap_async_org_io_slab);
34097 + snap_async_org_io_slab = NULL;
34100 + /* The pool of async_snap_io's. */
34101 + if (snap_async_snap_io_slab) {
34102 + if (snap_async_snap_io_pool) {
34103 + mempool_destroy(snap_async_snap_io_pool);
34104 + snap_async_snap_io_pool = NULL;
34106 + kmem_cache_destroy(snap_async_snap_io_slab);
34107 + snap_async_snap_io_slab = NULL;
34110 + /* The pool of hash table entries. */
34111 + if (snap_hash_entry_slab) {
34112 + if (snap_hash_entry_pool) {
34113 + mempool_destroy(snap_hash_entry_pool);
34114 + snap_hash_entry_pool = NULL;
34116 + kmem_cache_destroy(snap_hash_entry_slab);
34117 + snap_hash_entry_slab = NULL;
34122 + * snapshot_create_pools
34124 + * Allocate all of the memory pools when the first snapshot is created.
34125 + * Also start up the daemon thread for processing async I/O's.
34127 +static int snapshot_create_pools(void)
34129 + /* Pool of data pages. */
34130 + if (!snap_page_slab) {
34131 + snap_page_slab = kmem_cache_create("snap_page_slab",
34133 + SLAB_HWCACHE_ALIGN,
34135 + if (snap_page_slab) {
34136 + snap_page_pool = mempool_create(1, slab_pool_alloc,
34142 + /* Pool of snap_io_buffer's. */
34143 + if (!snap_buffer_slab) {
34144 + snap_buffer_slab = kmem_cache_create("snap_bh_slab",
34145 + sizeof(struct snap_io_buffer),
34146 + 0, SLAB_HWCACHE_ALIGN,
34148 + if (snap_buffer_slab) {
34149 + snap_buffer_pool = mempool_create(1, slab_pool_alloc,
34151 + snap_buffer_slab);
34155 + /* Pool of async_org_io's. */
34156 + if (!snap_async_org_io_slab) {
34157 + snap_async_org_io_slab = kmem_cache_create("async_org_io_slab",
34158 + sizeof(struct async_org_io),
34159 + 0, SLAB_HWCACHE_ALIGN,
34161 + if (snap_async_org_io_slab) {
34162 + snap_async_org_io_pool = mempool_create(1, slab_pool_alloc,
34164 + snap_async_org_io_slab);
34168 + /* Pool of async_snap_io's. */
34169 + if (!snap_async_snap_io_slab) {
34170 + snap_async_snap_io_slab = kmem_cache_create("async_snap_io_slab",
34171 + sizeof(struct async_snap_io),
34172 + 0, SLAB_HWCACHE_ALIGN,
34174 + if (snap_async_snap_io_slab) {
34175 + snap_async_snap_io_pool = mempool_create(1, slab_pool_alloc,
34177 + snap_async_snap_io_slab);
34181 + /* Pool of hash table entries. */
34182 + if (!snap_hash_entry_slab) {
34183 + snap_hash_entry_slab = kmem_cache_create("snap_hash_slab",
34184 + sizeof(struct snapshot_hash_entry),
34185 + 0, SLAB_HWCACHE_ALIGN,
34187 + if (snap_hash_entry_slab) {
34188 + snap_hash_entry_pool = mempool_create(1, slab_pool_alloc,
34190 + snap_hash_entry_slab);
34194 + if ( ! snap_page_slab || ! snap_page_pool ||
34195 + ! snap_buffer_slab || ! snap_buffer_pool ||
34196 + ! snap_async_org_io_slab || ! snap_async_org_io_pool ||
34197 + ! snap_async_snap_io_slab || ! snap_async_snap_io_pool ||
34198 + ! snap_hash_entry_slab || ! snap_hash_entry_pool ) {
34199 + LOG_CRITICAL("No memory available to create snapshot pools.\n");
34200 + snapshot_delete_pools();
34208 + * snap_delete_volume
34210 + * Delete the in-memory representation of a volume. The specified node
34211 + * can actually be either a snapshot or an original. Deleting a snapshot
34212 + * causes it to be removed from its original's chain of snapshots.
34214 + * For async snapshots, we will need to flush the COW table and mark the
34215 + * snapshot clean in the metadata.
34217 +static int snap_delete_volume(struct evms_logical_node * node)
34219 + struct snapshot_volume * volume = node->private;
34220 + struct snapshot_volume * org_volume = volume->snapshot_org;
34221 + struct snapshot_volume * next_vol;
34224 + /* Don't delete a snapshot that's rolling back. */
34225 + if ( volume->flags & EVMS_SNAPSHOT_ROLLBACK &&
34226 + ! (volume->flags & EVMS_SNAPSHOT_DISABLED) ) {
34227 + LOG_ERROR("Can't delete '%s' during snapshot rollback.\n",
34232 + /* Delete the instance data. */
34234 + if ( volume->flags & EVMS_SNAPSHOT ) {
34235 + /* This node is a snapshot. Check that this snapshot and
34236 + * its original have been quiesced. For async snapshots,
34237 + * make sure there are no outstanding remaps in
34238 + * progress. Then remove it from the original's chain of
34241 + if ( ! (volume->flags & EVMS_SNAPSHOT_QUIESCED) ) {
34242 + LOG_ERROR("Can't delete snapshot, snapshot volume '%s' not quiesced.\n",
34243 + volume->logical_node->name);
34246 + if ( org_volume &&
34247 + ! (org_volume->flags & EVMS_SNAPSHOT_QUIESCED) ) {
34248 + LOG_ERROR("Can't delete snapshot, original volume '%s' not quiesced.\n",
34249 + org_volume->logical_node->name);
34253 + remove_snapshot_from_chain(volume);
34255 + /* If we just deleted the only/last snapshot for this
34256 + * original, the original will not be modified. It is
34257 + * the engine's responsibility to delete the original
34258 + * and rediscover in order to clear it of its snapshot
34259 + * information. Even if that doesn't happen, the state
34260 + * of the kernel will still be safe. I/O's coming into
34261 + * this plugin for the original will just be passed
34262 + * down without any other action or modification.
34265 + /* Unregister the proc-fs entry for this node. */
34267 + remove_proc_entry(node->volume_info->volume_name,
34271 + /* This is an original. It's the engine's responsibility
34272 + * to delete all snapshots before deleting an original.
34273 + * Otherwise, a snapshot could be left pointing to an
34274 + * original that no longer exists. Thus, we just need to
34275 + * make sure there are no snapshots in the chain.
34277 + rc = check_quiesce(volume);
34282 + /* Shut down the async I/O thread. */
34283 + if (volume->async_io_thread) {
34284 + evms_cs_unregister_thread(volume->async_io_thread);
34287 + /* Loop through all snapshots left on this original,
34288 + * and NULL out their org pointer, in case they don't
34291 + for ( next_vol = volume->snapshot_next; next_vol;
34292 + next_vol = next_vol->snapshot_next ) {
34293 + next_vol->snapshot_org = NULL;
34297 + /* Free up all memory used by the instance data, including
34298 + * the underlying node, the hash table, and the data buffer.
34300 + if (volume->logical_node) {
34301 + rc = DELETE(volume->logical_node);
34306 + if (volume->snapshot_map) {
34307 + /* Delete all of the hash chains,
34308 + * then the actual table.
34310 + for ( i = 0; i < volume->hash_table_size; i++ ) {
34311 + delete_snapshot_hash_chain(volume->snapshot_map[i]);
34313 + delete_snapshot_hash_chain(volume->free_hash_list);
34314 + vfree(volume->snapshot_map);
34316 + if (volume->chunk_data_buffer) {
34317 + kfree(volume->chunk_data_buffer);
34319 + if (volume->rollback_thread) {
34320 + evms_cs_unregister_thread(volume->rollback_thread);
34326 + evms_cs_deallocate_logical_node(node);
34327 + snapshot_count--;
34329 + /* If there are no more snapshot objects, free up the memory pools. */
34330 + if ( snapshot_count == 0 ) {
34331 + snapshot_delete_pools();
34334 + MOD_DEC_USE_COUNT;
34340 + * search_snapshot_hash_chain
34342 + * Search the hash chain that is anchored at the specified head pointer. If the
34343 + * chunk number is found, a pointer to that entry in the chain is set, and a 1
34344 + * is returned. If the chunk is not found, a pointer to the previous entry is
34345 + * set and 0 is returned. If the return pointer is NULL, this means either the
34346 + * list is empty, or the specified chunk should become the first list item.
34348 +static int search_snapshot_hash_chain(u64 chunk,
34349 + struct snapshot_hash_entry * head,
34350 + struct snapshot_hash_entry ** result)
34352 + struct snapshot_hash_entry * curr = head;
34353 + struct snapshot_hash_entry * prev = head;
34354 + while ( curr && curr->org_chunk < chunk ) {
34356 + curr = curr->next;
34359 + /* Either an empty chain or went off the end of the chain. */
34362 + } else if ( curr->org_chunk != chunk ) {
34363 + *result = curr->prev;
34372 + * snapshot_remap_chunk
34374 + * Perform a sector remap on a snapshot volume. This should be called from the
34375 + * I/O read path. It first determines the base sector of the chunk containing
34376 + * the specified sector, and saves the remainder. Then it performs a search
34377 + * through the snapshot map for the specified volume. If a match is found, the
34378 + * sector number is changed to the new value. If no match is found, the value
34379 + * is left the same, meaning the read should proceed down the original volume.
34381 +static int snapshot_remap_chunk(struct snapshot_volume * snap_volume,
34382 + struct buffer_head * bh)
34384 + struct snapshot_hash_entry * result;
34385 + u64 chunk, sector = bh->b_rsector;
34386 + unsigned long remainder, hash_value;
34387 + unsigned long flags, queued = FALSE;
34389 + remainder = sector & (u64)(snap_volume->chunk_size - 1);
34390 + chunk = sector >> snap_volume->chunk_shift;
34391 + hash_value = ((unsigned long)chunk) % snap_volume->hash_table_size;
34393 + if ( search_snapshot_hash_chain(chunk,
34394 + snap_volume->snapshot_map[hash_value],
34396 + bh->b_rsector = (result->snap_chunk << snap_volume->chunk_shift)
34398 + if ( result->chunk_state != SNAP_CHUNK_COPIED ) {
34399 + /* If this chunk is in the middle of being copied,
34400 + * place this request on the pending list.
34402 + spin_lock_irqsave(&result->chunk_state_lock, flags);
34403 + if ( result->chunk_state != SNAP_CHUNK_COPIED ) {
34404 + bh->b_reqnext = result->snap_io->pending_reads;
34405 + result->snap_io->pending_reads = bh;
34406 + if (!result->snap_io->dev) {
34407 + result->snap_io->dev = bh->b_rdev;
34409 + evms_cs_volume_request_in_progress(result->snap_io->dev,
34413 + spin_unlock_irqrestore(&result->chunk_state_lock, flags);
34428 +static void snap_read(struct evms_logical_node * node,
34429 + struct buffer_head * bh)
34431 + struct snapshot_volume * volume = node->private;
34435 + /* Size check. */
34436 + if ( bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
34437 + node->total_vsectors ) {
34438 + bh->b_end_io(bh, 0);
34442 + /* Can't read if rollback is in progress. */
34443 + if ( volume->flags & EVMS_SNAPSHOT_ROLLBACK ) {
34444 + LOG_ERROR("Cannot read from snapshot '%s' during rollback.\n",
34445 + volume->logical_node->name);
34446 + bh->b_end_io(bh, 0);
34450 + /* On a read to the original, we can just pass it through completely
34451 + * untouched. Only reads to the snapshot can be remapped.
34453 + if ( volume->flags & EVMS_SNAPSHOT_ORG ) {
34454 + R_IO(volume->logical_node, bh);
34458 + /* Lock the snapshot before processing the request. */
34459 + down_read(&volume->snap_semaphore);
34461 + /* Make sure the snapshot is not full/disabled, and that
34462 + * the original is present.
34464 + if ( (volume->flags & (EVMS_SNAPSHOT_DISABLED|EVMS_SNAPSHOT_FULL)) ||
34465 + (! volume->snapshot_org) ) {
34466 + bh->b_end_io(bh, 0);
34467 + up_read(&volume->snap_semaphore);
34471 + /* Check for unaligned I/O. This is mostly to prevent XFS from
34472 + * sending a request that spans a chunk.
34474 + alignment = bh->b_rsector;
34475 + alignment <<= EVMS_VSECTOR_SIZE_SHIFT;
34476 + if ( unlikely(alignment & (bh->b_size - 1)) ) {
34477 + LOG_ERROR("Unaligned request [rsector(%lx), size(%x)] rejected on snapshot %s.\n",
34478 + bh->b_rsector, bh->b_size, node->name);
34479 + bh->b_end_io(bh, 0);
34480 + up_read(&volume->snap_semaphore);
34484 + /* Check if this sector has been remapped. */
34485 + rc = snapshot_remap_chunk(volume, bh);
34487 + /* Sector was remapped. Send IO to the snapshot. */
34488 + up_read(&volume->snap_semaphore);
34489 + R_IO(volume->logical_node, bh);
34490 + } else if ( rc < 0 ) {
34491 + /* Sector was remapped but queued to be driven later. */
34492 + up_read(&volume->snap_semaphore);
34494 + /* Has not been remapped. Send IO to the original. */
34495 + R_IO(volume->snapshot_org->logical_node, bh);
34496 + up_read(&volume->snap_semaphore);
34501 +/********** Asynchronous Snapshot I/O Code **********/
34505 + * snap_deallocate_buffer
34507 +static void snap_deallocate_buffer(struct snap_io_buffer * buf,
34508 + int deallocate_page)
34511 + if (deallocate_page) {
34512 + mempool_free(buf->bh->b_data, snap_page_pool);
34514 + mempool_free(buf, snap_buffer_pool);
34519 + * snap_allocate_buffer
34521 + * Allocate a snap_io_buffer and a data page from the respective memory
34522 + * pools. Initialize as appropriate.
34524 +static struct snap_io_buffer * snap_allocate_buffer(int allocate_page)
34526 + struct snap_io_buffer * buf;
34527 + struct buffer_head * bh;
34529 + /* Grab a snap_io_buffer from the pool. */
34530 + buf = mempool_alloc(snap_buffer_pool, GFP_NOIO);
34534 + memset(buf, 0, sizeof(struct snap_io_buffer));
34535 + bh = buf->bh = &buf->_bh;
34537 + /* Grab a data page from the pool. */
34538 + if (allocate_page) {
34539 + bh->b_data = mempool_alloc(snap_page_pool, GFP_NOIO);
34540 + if (!bh->b_data) {
34541 + snap_deallocate_buffer(buf, FALSE);
34544 + bh->b_page = virt_to_page(bh->b_data);
34547 + /* Initialize the rest of the buffer. */
34548 + bh->b_size = PAGE_SIZE;
34549 + bh->b_list = BUF_LOCKED;
34550 + bh->b_count = (atomic_t)ATOMIC_INIT(1);
34551 + bh->b_this_page = (struct buffer_head *)1;
34552 + bh->b_private = buf;
34553 + set_bit(BH_Dirty, &bh->b_state);
34554 + set_bit(BH_Lock, &bh->b_state);
34555 + set_bit(BH_Req, &bh->b_state);
34556 + set_bit(BH_Mapped, &bh->b_state);
34557 + set_bit(BH_Uptodate, &bh->b_state);
34558 + init_waitqueue_head(&bh->b_wait);
34559 + INIT_LIST_HEAD(&buf->chunk_write_list);
34565 + * snap_deallocate_buffer_list
34567 + * Free each buffer in the specified list.
34569 +static void snap_deallocate_buffer_list(struct snap_io_buffer * buf_list,
34570 + int deallocate_pages)
34572 + struct snap_io_buffer * buf, * buf_next;
34574 + for ( buf = buf_list; buf; buf = buf_next ) {
34575 + buf_next = buf->buffer_next;
34576 + snap_deallocate_buffer(buf, deallocate_pages);
34581 + * snap_allocate_buffer_list
34583 + * Allocate a list of snap_io_buffer's which will be used to copy a chunk
34584 + * from the original to the snapshot.
34586 +static struct snap_io_buffer *
34587 +snap_allocate_buffer_list(unsigned int count,
34588 + u64 starting_lba,
34589 + void (*callback)(struct buffer_head *, int),
34590 + int allocate_pages)
34592 + struct snap_io_buffer * buf, * head = NULL;
34593 + struct snap_io_buffer ** tail = &head;
34596 + for ( i = 0; i < count; i++ ) {
34597 + /* Get a buffer from the pool. */
34598 + buf = snap_allocate_buffer(allocate_pages);
34600 + snap_deallocate_buffer_list(head, allocate_pages);
34604 + /* Set the callback function and the sector value. */
34605 + buf->bh->b_end_io = callback;
34606 + buf->bh->b_rsector = starting_lba + i *
34607 + (PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT);
34609 + /* Add this buffer to the list to return. */
34611 + tail = &buf->buffer_next;
34618 + * deallocate_async_snap_io
34620 +static void deallocate_async_snap_io(struct async_snap_io * async_snap_io)
34622 + DEBUG_CHECK_SNAP_IO(async_snap_io);
34624 + snap_deallocate_buffer(async_snap_io->cow_table_buffer, TRUE);
34625 + snap_deallocate_buffer_list(async_snap_io->copy_buffers, FALSE);
34626 + mempool_free(async_snap_io, snap_async_snap_io_pool);
34630 + * allocate_async_snap_io
34631 + * @snap_volume: The snapshot volume this chunk belongs to.
34632 + * @hash_entry: The entry in the hash table representing this chunk.
34633 + * @async_org_io: The parent async I/O structure.
34634 + * @snap_chunk_lba: The starting LBA on the snapshot for this chunk.
34635 + * @buffer_count: The number of buffers needed to copy this chunk.
34637 + * Allocate an async_snap_io structure from the pool and initialize.
34638 + * Create a list of buffer heads to use for the copy.
34640 +static struct async_snap_io *
34641 +allocate_async_snap_io(struct snapshot_volume * snap_volume,
34642 + struct snapshot_hash_entry * hash_entry,
34643 + struct async_org_io * async_org_io,
34644 + u64 snap_chunk_lba,
34645 + unsigned int buffer_count)
34647 + struct async_snap_io * async_snap_io;
34649 + async_snap_io = mempool_alloc(snap_async_snap_io_pool, GFP_NOIO);
34650 + if (async_snap_io) {
34651 + memset(async_snap_io, 0, sizeof(struct async_snap_io));
34652 + async_snap_io->snap_volume = snap_volume;
34653 + async_snap_io->hash_table_entry = hash_entry;
34654 + async_snap_io->org_io = async_org_io;
34655 + INIT_LIST_HEAD(&async_snap_io->snap_pending_io_list);
34656 + INIT_LIST_HEAD(&async_snap_io->cow_write_list);
34657 + async_snap_io->write_count = (atomic_t)ATOMIC_INIT(buffer_count);
34659 + async_snap_io->cow_table_buffer = snap_allocate_buffer(TRUE);
34660 + if (async_snap_io->cow_table_buffer) {
34661 + /* The buffer for the COW table needs to be adjusted. */
34662 + struct snap_io_buffer * buf = async_snap_io->cow_table_buffer;
34663 + buf->bh->b_size = EVMS_VSECTOR_SIZE;
34664 + buf->bh->b_end_io = snap_cow_table_cb;
34665 + buf->buffer_private = async_snap_io;
34667 + async_snap_io->copy_buffers =
34668 + snap_allocate_buffer_list(buffer_count,
34670 + snap_write_chunk_cb,
34672 + if (!async_snap_io->copy_buffers) {
34673 + deallocate_async_snap_io(async_snap_io);
34674 + async_snap_io = NULL;
34677 + deallocate_async_snap_io(async_snap_io);
34678 + async_snap_io = NULL;
34681 + return async_snap_io;
34685 + * deallocate_async_org_io
34687 +static void deallocate_async_org_io(struct async_org_io * async_org_io)
34689 + DEBUG_REMOVE_ORG_IO_FROM_LIST(async_org_io);
34691 + snap_deallocate_buffer_list(async_org_io->copy_buffers, TRUE);
34692 + mempool_free(async_org_io, snap_async_org_io_pool);
34696 + * allocate_async_org_io
34698 + * Allocate an async_org_io structure from the pool and initialize.
34699 + * Create a list of buffer heads to use for the copy.
34701 +static struct async_org_io *
34702 +allocate_async_org_io(struct snapshot_volume * org_volume,
34703 + u64 org_chunk_lba,
34704 + unsigned int buffer_count)
34706 + struct async_org_io * async_org_io;
34708 + async_org_io = mempool_alloc(snap_async_org_io_pool, GFP_NOIO);
34709 + if (async_org_io) {
34710 + DEBUG_ADD_ORG_IO_TO_LIST(async_org_io);
34712 + memset(async_org_io, 0, sizeof(struct async_org_io));
34713 + async_org_io->org_volume = org_volume;
34714 + spin_lock_init(&async_org_io->pending_writes_lock);
34715 + INIT_LIST_HEAD(&async_org_io->org_pending_io_list);
34716 + async_org_io->copy_count = (atomic_t)ATOMIC_INIT(0);
34717 + async_org_io->ref_count = (atomic_t)ATOMIC_INIT(1);
34719 + async_org_io->copy_buffers =
34720 + snap_allocate_buffer_list(buffer_count,
34722 + snap_read_chunk_cb,
34724 + if (!async_org_io->copy_buffers) {
34725 + deallocate_async_org_io(async_org_io);
34726 + async_org_io = NULL;
34729 + return async_org_io;
34733 + * deallocate_async_io
34735 + * This function deletes the entire async I/O structure, including the
34736 + * async_org_io, all async_snap_io's, and all buffer heads and pages.
34738 +static void deallocate_async_io(struct async_org_io * async_org_io)
34740 + struct async_snap_io * async_snap_io, * next_snap_io;
34742 + for ( async_snap_io = async_org_io->snap_io_list;
34744 + async_snap_io = next_snap_io ) {
34745 + next_snap_io = async_snap_io->snap_io_list_next;
34746 + deallocate_async_snap_io(async_snap_io);
34748 + deallocate_async_org_io(async_org_io);
34752 + * process_org_pending_io_list
34754 + * Grab the first item from the org_pending_io_list and send all
34755 + * waiting write requests to the original.
34757 +static void process_org_pending_io_list(struct snapshot_volume * org_volume,
34760 + struct async_org_io * async_org_io;
34761 + struct buffer_head * bh;
34762 + unsigned long flags;
34764 + spin_lock_irqsave(&org_volume->org_pending_io_list_lock, flags);
34765 + if ( list_empty(&org_volume->org_pending_io_list) ) {
34766 + spin_unlock_irqrestore(&org_volume->org_pending_io_list_lock,
34770 + async_org_io = ORG_PENDING_IO_ENTRY(org_volume->org_pending_io_list.next);
34771 + list_del(&async_org_io->org_pending_io_list);
34772 + spin_unlock_irqrestore(&org_volume->org_pending_io_list_lock,
34775 + for ( bh = async_org_io->pending_writes; bh;
34776 + bh = async_org_io->pending_writes ) {
34777 + async_org_io->pending_writes = bh->b_reqnext;
34778 + bh->b_reqnext = NULL;
34779 + W_IO(async_org_io->org_volume->logical_node, bh);
34780 + evms_cs_volume_request_in_progress(async_org_io->dev,
34784 + if ( atomic_dec_and_test(&async_org_io->ref_count) ) {
34785 + deallocate_async_io(async_org_io);
34793 + * process_snap_pending_io_list
34795 + * Grab the first item from the snap_pending_io_list and send
34796 + * all waiting read and write requests to the snapshot.
34798 +static void process_snap_pending_io_list(struct snapshot_volume * org_volume,
34801 + struct async_snap_io * async_snap_io;
34802 + struct buffer_head * bh;
34803 + unsigned long flags;
34805 + spin_lock_irqsave(&org_volume->snap_pending_io_list_lock, flags);
34806 + if ( list_empty(&org_volume->snap_pending_io_list) ) {
34807 + spin_unlock_irqrestore(&org_volume->snap_pending_io_list_lock,
34810 + async_snap_io = SNAP_PENDING_IO_ENTRY(org_volume->snap_pending_io_list.next);
34811 + list_del(&async_snap_io->snap_pending_io_list);
34812 + spin_unlock_irqrestore(&org_volume->snap_pending_io_list_lock,
34816 + for ( bh = async_snap_io->pending_reads; bh;
34817 + bh = async_snap_io->pending_reads ) {
34818 + async_snap_io->pending_reads = bh->b_reqnext;
34819 + bh->b_reqnext = NULL;
34820 + R_IO(async_snap_io->snap_volume->logical_node, bh);
34821 + evms_cs_volume_request_in_progress(async_snap_io->dev,
34825 + for ( bh = async_snap_io->pending_writes; bh;
34826 + bh = async_snap_io->pending_writes ) {
34827 + async_snap_io->pending_writes = bh->b_reqnext;
34828 + bh->b_reqnext = NULL;
34829 + W_IO(async_snap_io->snap_volume->logical_node, bh);
34830 + evms_cs_volume_request_in_progress(async_snap_io->dev,
34834 + if ( atomic_dec_and_test(&async_snap_io->org_io->ref_count) ) {
34835 + deallocate_async_io(async_snap_io->org_io);
34842 + * process_chunk_write_list
34844 + * Grab the first item from the chunk_write_list and send down
34845 + * writes to each snapshot of this original.
34847 +static void process_chunk_write_list(struct snapshot_volume * org_volume,
34850 + struct snap_io_buffer * buf, * snap_buf;
34851 + struct async_snap_io * async_snap_io;
34852 + unsigned long flags;
34854 + spin_lock_irqsave(&org_volume->chunk_write_list_lock, flags);
34855 + if ( list_empty(&org_volume->chunk_write_list) ) {
34856 + spin_unlock_irqrestore(&org_volume->chunk_write_list_lock,
34859 + buf = CHUNK_WRITE_ENTRY(org_volume->chunk_write_list.next);
34860 + list_del(&buf->chunk_write_list);
34861 + spin_unlock_irqrestore(&org_volume->chunk_write_list_lock,
34864 + for ( snap_buf = buf->copy_next; snap_buf;
34865 + snap_buf = snap_buf->copy_next ) {
34866 + async_snap_io = snap_buf->buffer_private;
34867 + W_IO(async_snap_io->snap_volume->logical_node,
34875 + * write_cow_table
34877 + * On S/390 machines, the hardsector size is usually 4k, and the driver won't
34878 + * accept I/O requests that are less than 4k in size. Thus, the COW table
34879 + * cannot be written as a single sector. We must first read in the entire
34880 + * 4k hardsector, overlay the 512 byte COW table, and then write out the entire
34883 + * On machines with hardsector size of 512, the COW table write will be
34884 + * processed just as it was before.
34886 + * If an error occurs in this function, we will send down the buffer as a
34887 + * read instead of a write. This will ensure that the callback function still
34888 + * runs and cleans up the async_io structures and releases all pending I/Os.
34890 +static inline void write_cow_table(struct snapshot_volume * snap,
34891 + struct buffer_head * bh)
34893 + if ( snap->logical_node->hardsector_size > bh->b_size ) {
34896 + unsigned short b_size = bh->b_size;
34899 + offset = bh->b_rsector & ((snap->logical_node->hardsector_size >>
34900 + EVMS_VSECTOR_SIZE_SHIFT) - 1);
34901 + bh->b_rsector -= offset;
34902 + bh->b_size = snap->logical_node->hardsector_size;
34904 + /* Need a buffer to temporarily hold the COW table sector. */
34905 + buffer = kmalloc(b_size, GFP_NOIO);
34907 + disable_snapshot(snap, TRUE);
34908 + R_IO(snap->logical_node, bh);
34911 + memcpy(buffer, bh->b_data, b_size);
34913 + /* Read in the entire hardsector from disk. */
34914 + rc = INIT_IO(snap->logical_node, READ, bh->b_rsector,
34915 + snap->logical_node->hardsector_size >>
34916 + EVMS_VSECTOR_SIZE_SHIFT,
34919 + disable_snapshot(snap, TRUE);
34920 + R_IO(snap->logical_node, bh);
34924 + /* Copy the COW table back into the buffer. */
34925 + memcpy(bh->b_data + (offset << EVMS_VSECTOR_SIZE_SHIFT),
34929 + W_IO(snap->logical_node, bh);
34933 + * process_cow_table_write_lists
34935 +static void process_cow_table_write_lists(struct snapshot_volume * org_volume,
34938 + struct snapshot_volume * snap_volume;
34939 + struct async_snap_io * async_snap_io, * async_snap_io2;
34940 + struct list_head * lh;
34941 + unsigned long flags;
34943 + /* Check the chunk_write_list for each snapshot on this original. */
34944 + down_read(&org_volume->snap_semaphore);
34945 + for ( snap_volume = org_volume->snapshot_next;
34947 + snap_volume = snap_volume->snapshot_next ) {
34949 + /* While we are here, see if the DISABLED bit needs to
34950 + * be written to disk.
34952 + if ( snap_volume->flags & EVMS_SNAPSHOT_DISABLED &&
34953 + snap_volume->flags & EVMS_SNAPSHOT_DISABLED_PENDING ) {
34954 + disable_snapshot(snap_volume, TRUE);
34955 + snap_volume->flags &= ~EVMS_SNAPSHOT_DISABLED_PENDING;
34958 + spin_lock_irqsave(&snap_volume->cow_table_write_list_lock,
34961 + if ( list_empty(&snap_volume->cow_table_write_list) ) {
34962 + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock,
34967 + /* Check for an in-flight COW-table-write. */
34968 + async_snap_io = COW_WRITE_ENTRY(snap_volume->cow_table_write_list.next);
34969 + if ( atomic_read(&async_snap_io->write_count) != 0 ) {
34970 + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock,
34975 + /* See if there are any COW-table-writes that can be skipped. */
34976 + list_for_each(lh, &snap_volume->cow_table_write_list) {
34977 + /* No need to check the first list element, since
34978 + * we've already examined it.
34980 + if ( lh->prev != &snap_volume->cow_table_write_list ) {
34981 + async_snap_io = COW_WRITE_ENTRY(lh);
34982 + async_snap_io2 = COW_WRITE_ENTRY(lh->prev);
34983 + if ( atomic_read(&async_snap_io->write_count) != 0 ) {
34984 + async_snap_io = async_snap_io2;
34987 + if ( async_snap_io->cow_table_buffer->bh->b_rsector !=
34988 + async_snap_io2->cow_table_buffer->bh->b_rsector ) {
34989 + async_snap_io = async_snap_io2;
34995 + /* We have the buffer to send down. Now mark all
34996 + * previous COW-table buffers as in-flight.
34998 + list_for_each(lh, &snap_volume->cow_table_write_list) {
34999 + async_snap_io2 = COW_WRITE_ENTRY(lh);
35000 + atomic_dec(&async_snap_io2->write_count);
35001 + if ( async_snap_io2 == async_snap_io ) {
35005 + DEBUG_INC_COW_TABLE_OVERLAPS(snap_volume);
35009 + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock, flags);
35011 + /* Write the COW table. */
35012 + DEBUG_INC_COW_TABLE_WRITES(snap_volume);
35013 + write_cow_table(snap_volume, async_snap_io->cow_table_buffer->bh);
35017 + up_read(&org_volume->snap_semaphore);
35021 + * snap_async_io_thread
35023 + * This is the async I/O thread function. It processes requests from four
35024 + * lists, which are embedded in the original volume structure passed to the
35027 + * The first list, org_pending_io_list, contains async_org_io's, each of which
35028 + * contain a list of write requests to the original volume that are waiting on
35029 + * the completion of a chunk copy.
35031 + * The second list, snap_pending_io_list, contains async_snap_io's, each of
35032 + * which contain a list of read requests and a list of write requests to the
35033 + * snapshot volume that are waiting on the completion of a chunk copy.
35035 + * The third list, chunk_write_list, contains buffers that were used to read
35036 + * part of a chunk from the original volume. Those buffers are linked to other
35037 + * buffers which are used to write the same part of that chunk to the snapshot.
35039 + * The fourth list is actually the list of snapshots for this original. Each
35040 + * snapshot then has a list of COW-table buffers that have to be written. The
35041 + * processing of this list is optimized to eliminate unnecessary, overlapping
35042 + * writes of the COW table.
35044 + * The loop will continue as long as there is an item on at least one of
35045 + * the four lists. When they are all empty, the loop exits and the thread
35046 + * goes back to sleep.
35048 +static void snap_async_io_thread(void * volume)
35050 + struct snapshot_volume * org_volume = volume;
35051 + int done = FALSE;
35054 + process_org_pending_io_list(org_volume, &done);
35056 + process_snap_pending_io_list(org_volume, &done);
35058 + process_chunk_write_list(org_volume, &done);
35060 + process_cow_table_write_lists(org_volume, &done);
35063 + run_task_queue(&tq_disk);
35067 + * schedule_org_pending_io
35069 + * Place the async_org_io on the thread's processing list.
35071 +static void schedule_org_pending_io(struct async_org_io * async_org_io)
35073 + struct snapshot_volume * org_volume = async_org_io->org_volume;
35074 + unsigned long flags;
35076 + spin_lock_irqsave(&org_volume->org_pending_io_list_lock, flags);
35077 + list_add_tail(&async_org_io->org_pending_io_list,
35078 + &org_volume->org_pending_io_list);
35079 + spin_unlock_irqrestore(&org_volume->org_pending_io_list_lock, flags);
35080 + evms_cs_wakeup_thread(org_volume->async_io_thread);
35084 + * schedule_snap_pending_io
35086 + * Place the async_snap_io on the thread's processing list.
35088 +static void schedule_snap_pending_io(struct async_snap_io * async_snap_io)
35090 + struct snapshot_volume * org_volume = async_snap_io->org_io->org_volume;
35091 + unsigned long flags;
35093 + spin_lock_irqsave(&org_volume->snap_pending_io_list_lock, flags);
35094 + list_add_tail(&async_snap_io->snap_pending_io_list,
35095 + &org_volume->snap_pending_io_list);
35096 + spin_unlock_irqrestore(&org_volume->snap_pending_io_list_lock, flags);
35097 + evms_cs_wakeup_thread(org_volume->async_io_thread);
35101 + * schedule_chunk_write
35103 + * Place the buffer on the chunk_write_list for the thread to process. This
35104 + * list uses the chunk_write_list field in the snap_io_buffer.
35106 +static void schedule_chunk_write(struct snap_io_buffer * buf)
35108 + struct async_org_io * org_io = buf->buffer_private;
35109 + struct snapshot_volume * org_volume = org_io->org_volume;
35110 + unsigned long flags;
35112 + spin_lock_irqsave(&org_volume->chunk_write_list_lock, flags);
35113 + list_add_tail(&buf->chunk_write_list, &org_volume->chunk_write_list);
35114 + spin_unlock_irqrestore(&org_volume->chunk_write_list_lock, flags);
35115 + evms_cs_wakeup_thread(org_volume->async_io_thread);
35119 + * schedule_cow_table_write
35121 + * Place the async_snap_io on the thread's processing list.
35123 +static void schedule_cow_table_write(struct async_snap_io * async_snap_io)
35125 + struct snapshot_volume * snap_volume = async_snap_io->snap_volume;
35126 + unsigned long flags;
35128 + spin_lock_irqsave(&snap_volume->cow_table_write_list_lock, flags);
35129 + list_add_tail(&async_snap_io->cow_write_list,
35130 + &snap_volume->cow_table_write_list);
35131 + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock, flags);
35135 + * snap_read_chunk_cb
35137 + * This is the callback function for reading chunks from the original.
35138 + * When each read completes, we have to decrement the read_count in the
35139 + * async_org_io. If this count reaches zero, we can decrement the
35140 + * chunk_lock's in all of the hash entries, and send the original write
35141 + * request down. Finally, send this buffer head over to the thread to
35142 + * send the writes down to the snapshots.
35144 +void snap_read_chunk_cb(struct buffer_head * bh,
35147 + struct snap_io_buffer * buf = bh->b_private;
35150 + /* Error reading the chunk. Disable all snapshots on this org. */
35151 + struct async_org_io * async_org_io = buf->buffer_private;
35152 + struct snapshot_volume * snap_volume;
35153 + LOG_ERROR("Error reading chunk from original '%s'.\n",
35154 + async_org_io->org_volume->exported_node->name);
35155 + for ( snap_volume = async_org_io->org_volume->snapshot_next;
35157 + snap_volume = snap_volume->snapshot_next ) {
35158 + disable_snapshot(snap_volume, FALSE);
35162 + schedule_chunk_write(buf);
35166 + * snap_write_chunk_cb
35168 + * This is the callback function for writing chunks to the snapshot. When
35169 + * each write completes, decrement the write_count in the async_snap_io.
35170 + * If this count reaches zero, decrement the chunk_lock in the hash entry,
35171 + * and decrement the remap_count in the async_org_io. If the remap_count
35172 + * reaches zero, then everybody is done, and we can free up the entire
35173 + * async_io structure.
35175 +void snap_write_chunk_cb(struct buffer_head * bh,
35178 + struct snap_io_buffer * buf = bh->b_private;
35179 + struct async_snap_io * async_snap_io = buf->buffer_private;
35182 + /* Error writing chunk. Disable this snapshot. */
35183 + LOG_ERROR("Error writing chunk to snapshot '%s'.\n",
35184 + async_snap_io->snap_volume->exported_node->name);
35185 + disable_snapshot(async_snap_io->snap_volume, FALSE);
35188 + atomic_dec(&async_snap_io->write_count);
35189 + evms_cs_wakeup_thread(async_snap_io->org_io->org_volume->async_io_thread);
35193 + * snap_cow_table_cb
35195 + * This is the callback function for writing out the COW table.
35197 +void snap_cow_table_cb(struct buffer_head * bh,
35200 + struct snap_io_buffer * buf = bh->b_private;
35201 + struct async_snap_io * async_snap_io = buf->buffer_private;
35202 + struct async_snap_io * async_snap_io2;
35203 + struct async_org_io * async_org_io;
35204 + struct snapshot_volume * snap_volume = async_snap_io->snap_volume;
35205 + struct list_head * lh, * tmp;
35206 + unsigned long flags, flags2;
35209 + /* Error writing the COW table sector. Disable the snapshot. */
35210 + struct snapshot_volume * snap_volume = buf->buffer_private;
35211 + LOG_ERROR("Error writing COW table to snapshot '%s'.\n",
35212 + snap_volume->exported_node->name);
35213 + disable_snapshot(snap_volume, FALSE);
35216 + spin_lock_irqsave(&snap_volume->cow_table_write_list_lock, flags);
35218 + list_for_each_safe(lh, tmp, &snap_volume->cow_table_write_list) {
35220 + async_snap_io2 = COW_WRITE_ENTRY(lh);
35221 + async_org_io = async_snap_io2->org_io;
35223 + /* Mark the chunk as copied in the hash table. */
35224 + spin_lock_irqsave(&async_snap_io2->hash_table_entry->chunk_state_lock,
35226 + async_snap_io2->hash_table_entry->chunk_state = SNAP_CHUNK_COPIED;
35227 + async_snap_io2->hash_table_entry->snap_io = NULL;
35228 + spin_unlock_irqrestore(&async_snap_io2->hash_table_entry->chunk_state_lock,
35231 + /* Release any pending I/Os waiting on this chunk. */
35232 + schedule_snap_pending_io(async_snap_io2);
35233 + if ( atomic_dec_and_test(&async_org_io->copy_count) ) {
35234 + schedule_org_pending_io(async_org_io);
35237 + if ( async_snap_io2 == async_snap_io ) {
35242 + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock, flags);
35246 + * snap_queue_original_request
35248 + * An existing remap was found for the chunk for this write request.
35249 + * If the chunk has been fully copied, then the request can go through
35250 + * normally. If the chunk is still being processed, this request must
35251 + * be queued up to be driven after the chunk has been copied.
35253 +static void snap_queue_original_request(struct snapshot_volume * snap_volume,
35254 + struct buffer_head * org_bh,
35255 + struct snapshot_hash_entry * target_entry,
35257 + int * queued_org_bh,
35258 + int write_to_snapshot)
35260 + struct async_org_io * org_io;
35261 + unsigned long flags, flags2;
35263 + if (write_to_snapshot) {
35264 + org_bh->b_rsector = (target_entry->snap_chunk <<
35265 + snap_volume->chunk_shift) +
35269 + if ( ! *queued_org_bh &&
35270 + target_entry->chunk_state != SNAP_CHUNK_COPIED ) {
35271 + spin_lock_irqsave(&target_entry->chunk_state_lock, flags);
35272 + if (write_to_snapshot) {
35273 + /* A write to the snapshot. */
35274 + if ( target_entry->chunk_state != SNAP_CHUNK_COPIED ) {
35275 + org_bh->b_reqnext =
35276 + target_entry->snap_io->pending_writes;
35277 + target_entry->snap_io->pending_writes = org_bh;
35278 + if (!target_entry->snap_io->dev) {
35279 + target_entry->snap_io->dev =
35282 + evms_cs_volume_request_in_progress(target_entry->snap_io->dev,
35284 + *queued_org_bh = TRUE;
35287 + /* A write to the original. */
35288 + if ( target_entry->chunk_state != SNAP_CHUNK_COPIED ) {
35289 + org_io = target_entry->snap_io->org_io;
35290 + spin_lock_irqsave(&org_io->pending_writes_lock,
35292 + org_bh->b_reqnext = org_io->pending_writes;
35293 + org_io->pending_writes = org_bh;
35294 + if (!org_io->dev) {
35295 + org_io->dev = org_bh->b_rdev;
35297 + spin_unlock_irqrestore(&org_io->pending_writes_lock,
35299 + evms_cs_volume_request_in_progress(org_io->dev,
35301 + *queued_org_bh = TRUE;
35304 + spin_unlock_irqrestore(&target_entry->chunk_state_lock, flags);
35309 + * snapshot_copy_1
35311 + * Check this snapshot node to see if the given sector/chunk has been
35312 + * remapped yet. If it hasn't, create a new hash table entry, update the
35313 + * in-memory COW table, write the COW table to disk if it is full, and
35314 + * then start the process of copying the chunk from the original to the
35317 +static int snapshot_copy_1(struct snapshot_volume * snap_volume,
35318 + struct buffer_head * org_bh,
35319 + struct async_org_io ** async_org_io,
35320 + int * queued_org_bh,
35321 + int write_to_snapshot)
35323 + struct snapshot_volume * org_volume = snap_volume->snapshot_org;
35324 + struct snapshot_hash_entry * target_entry, * new_map_entry;
35325 + struct snap_io_buffer * cow_buf, *buf1, *buf2;
35326 + struct async_snap_io * async_snap_io;
35327 + u64 org_sector = org_bh->b_rsector;
35328 + u64 org_chunk_lba, snap_chunk_lba;
35330 + u64 chunk, remainder;
35331 + unsigned long hash_value, buffer_count, sectors_in_chunk;
35333 + /* Grab the read-lock when checking for an existing remap. */
35334 + down_read(&snap_volume->snap_semaphore);
35336 + /* Make sure the snapshot has not been disabled. */
35337 + if ( snap_volume->flags & (EVMS_SNAPSHOT_DISABLED|EVMS_SNAPSHOT_FULL) ||
35339 + up_read(&snap_volume->snap_semaphore);
35343 + /* Check for unaligned I/O. This is mostly to prevent XFS from
35344 + * sending a request that spans a chunk.
35346 + alignment = org_sector << EVMS_VSECTOR_SIZE_SHIFT;
35347 + if ( unlikely(alignment & (org_bh->b_size - 1)) ) {
35348 + LOG_ERROR("Unaligned request [rsector(%lx), size(%x)] rejected on snapshot %s.\n",
35349 + org_bh->b_rsector, org_bh->b_size,
35350 + snap_volume->logical_node->name);
35351 + if (!write_to_snapshot) {
35352 + disable_snapshot(snap_volume, TRUE);
35354 + up_read(&snap_volume->snap_semaphore);
35358 + /* Search the hash table to see if this sector has already been
35359 + * remapped on this snapshot.
35361 + chunk = org_sector >> snap_volume->chunk_shift;
35362 + remainder = org_sector & (u64)(snap_volume->chunk_size - 1);
35363 + hash_value = (unsigned long)chunk % snap_volume->hash_table_size;
35365 + if ( search_snapshot_hash_chain(chunk,
35366 + snap_volume->snapshot_map[hash_value],
35367 + &target_entry) ) {
35368 + /* Chunk is already remapped. If the remap is still in progress,
35369 + * queue up this request to be handled later. If the remap is
35370 + * complete, we can just keep going.
35372 + up_read(&snap_volume->snap_semaphore);
35373 + snap_queue_original_request(snap_volume, org_bh,
35374 + target_entry, remainder,
35375 + queued_org_bh, write_to_snapshot);
35379 + /* Convert to a write-lock and check again for a remap.
35380 + * (Same search and check as just before).
35382 + up_read(&snap_volume->snap_semaphore);
35383 + down_write(&snap_volume->snap_semaphore);
35384 + if ( search_snapshot_hash_chain(chunk,
35385 + snap_volume->snapshot_map[hash_value],
35386 + &target_entry) ) {
35387 + /* Chunk is already remapped. If the remap is still in progress,
35388 + * queue up this request to be handled later. If the remap is
35389 + * complete, we can just keep going.
35391 + up_write(&snap_volume->snap_semaphore);
35392 + snap_queue_original_request(snap_volume, org_bh,
35393 + target_entry, remainder,
35394 + queued_org_bh, write_to_snapshot);
35398 + /* Is there enough room left on this snapshot to remap this chunk? */
35399 + if ( snap_volume->next_free_chunk >= snap_volume->num_chunks ) {
35400 + /* Once the snapshot becomes full, further writes to the
35401 + * original can't be remapped, and thus this snapshot
35402 + * will become "corrupted".
35404 + snap_volume->flags |= EVMS_SNAPSHOT_FULL;
35405 + set_snapshot_flags(snap_volume->logical_node,
35406 + EVMS_SNAPSHOT_FULL, EVMS_SNAPSHOT_DISABLED);
35407 + up_write(&snap_volume->snap_semaphore);
35411 + /* Create and initialize a new hash table entry for the new remap.
35412 + * The value SNAP_CHUNK_COPYING indicates that this chunk still has to
35413 + * be read from the original and written to the snapshot.
35415 + new_map_entry = allocate_snapshot_hash_entry(snap_volume,
35417 + snap_volume->next_free_chunk,
35418 + SNAP_CHUNK_COPYING);
35419 + if (!new_map_entry) {
35420 + /* Can't get memory for map entry. Disable this snapshot. */
35421 + LOG_ERROR("Memory error allocating hash table entry for snapshot '%s'.\n",
35422 + snap_volume->exported_node->name);
35423 + disable_snapshot(snap_volume, TRUE);
35424 + up_write(&snap_volume->snap_semaphore);
35428 + /* Add the entry to the hash table. */
35429 + if (target_entry) {
35430 + insert_snapshot_hash_entry(new_map_entry, target_entry);
35432 + insert_snapshot_hash_entry_at_head(new_map_entry,
35433 + &(snap_volume->snapshot_map[hash_value]));
35436 + /* Calculate the number of buffers that will be needed to copy this
35437 + * chunk, and the starting LBAs for both the org and the snap.
35439 + org_chunk_lba = chunk * org_volume->chunk_size;
35440 + snap_chunk_lba = snap_volume->next_free_chunk * org_volume->chunk_size;
35441 + snap_volume->next_free_chunk++;
35442 + sectors_in_chunk = min(((u64)org_volume->chunk_size),
35443 + org_volume->logical_node->total_vsectors -
35445 + buffer_count = (sectors_in_chunk +
35446 + (PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT) - 1) /
35447 + (PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT);
35449 + /* Create the parent async_org_io structure if it hasn't been done yet. */
35450 + if (!*async_org_io) {
35451 + *async_org_io = allocate_async_org_io(org_volume,
35454 + if (!*async_org_io) {
35455 + // BUGBUG: Disable the snapshot?
35459 + /* If we are only reading a partial chunk from the original,
35460 + * may need to readjust the size in the last buffer.
35462 + if ( (sectors_in_chunk < org_volume->chunk_size) &&
35463 + (sectors_in_chunk &
35464 + ((PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT) - 1)) ) {
35465 + for ( buf1 = (*async_org_io)->copy_buffers;
35466 + buf1->buffer_next;
35467 + buf1 = buf1->buffer_next ) {
35470 + buf1->bh->b_size = (sectors_in_chunk <<
35471 + EVMS_VSECTOR_SIZE_SHIFT) &
35476 + /* Create an async_snap_io structure for this snapshot and attach to
35477 + * the org io structure.
35479 + async_snap_io = allocate_async_snap_io(snap_volume, new_map_entry,
35480 + *async_org_io, snap_chunk_lba,
35482 + if (!async_snap_io) {
35483 + // BUGBUG: Disable the snapshot?
35487 + /* Fill in the next entry in the COW table. Copy the COW table to the
35488 + * buffer to be written out later.
35490 + snap_volume->cow_table[snap_volume->next_cow_entry] = cpu_to_le64p(&chunk);
35491 + snap_volume->next_cow_entry++;
35492 + cow_buf = async_snap_io->cow_table_buffer;
35493 + cow_buf->bh->b_rdev = org_bh->b_rdev;
35494 + cow_buf->bh->b_rsector = snap_volume->current_cow_sector;
35495 + memcpy(cow_buf->bh->b_data, snap_volume->cow_table, EVMS_VSECTOR_SIZE);
35497 + /* If the COW table is full, reinitialize for the next sector. */
35498 + if ( snap_volume->next_cow_entry >= (EVMS_VSECTOR_SIZE/sizeof(u64)) ) {
35499 + snap_volume->next_cow_entry = 0;
35500 + snap_volume->current_cow_sector++;
35501 + memset(snap_volume->cow_table, 0xff, EVMS_VSECTOR_SIZE);
35504 + /* Attach the original buffer head, if it hasn't been queued
35505 + * already on a different copy.
35507 + if (!*queued_org_bh) {
35508 + org_bh->b_reqnext = NULL;
35509 + if (write_to_snapshot) {
35510 + /* Write to the snapshot. Attach to the async_snap_io. */
35511 + org_bh->b_rsector = (new_map_entry->snap_chunk <<
35512 + snap_volume->chunk_shift) +
35514 + async_snap_io->pending_writes = org_bh;
35515 + async_snap_io->dev = org_bh->b_rdev;
35517 + /* Write to the original. Attach to the async_org_io. */
35518 + (*async_org_io)->pending_writes = org_bh;
35519 + (*async_org_io)->dev = org_bh->b_rdev;
35521 + evms_cs_volume_request_in_progress(org_bh->b_rdev, +1, NULL);
35522 + *queued_org_bh = TRUE;
35525 + /* Point the hash table entry at this async_snap_io. Then add this
35526 + * async_snap_io to the list in the async_org_io, as well as to the
35527 + * list in the snapshot volume.
35529 + new_map_entry->snap_io = async_snap_io;
35531 + async_snap_io->snap_io_list_next = (*async_org_io)->snap_io_list;
35532 + (*async_org_io)->snap_io_list = async_snap_io;
35533 + atomic_inc(&(*async_org_io)->copy_count);
35534 + atomic_inc(&(*async_org_io)->ref_count);
35536 + schedule_cow_table_write(async_snap_io);
35538 + /* Parallel walk through the copy_buffer's in the org and the snap,
35539 + * updating all necessary pointers and lists.
35541 + for ( buf1 = (*async_org_io)->copy_buffers,
35542 + buf2 = async_snap_io->copy_buffers;
35544 + buf1 = buf1->buffer_next, buf2 = buf2->buffer_next ) {
35545 + buf2->copy_next = buf1->copy_next;
35546 + buf2->buffer_private = async_snap_io;
35547 + buf2->bh->b_rdev = org_bh->b_rdev;
35548 + buf2->bh->b_data = buf1->bh->b_data;
35549 + buf2->bh->b_page = buf1->bh->b_page;
35551 + buf1->bh->b_rdev = org_bh->b_rdev;
35552 + buf1->copy_next = buf2;
35553 + buf1->buffer_private = *async_org_io;
35556 + /* We're done modifying snapshot volume info, so we can release the
35557 + * lock. We can't start any reads until all snapshots for this original
35558 + * have been checked. Return and start the reads later.
35560 + up_write(&snap_volume->snap_semaphore);
35566 + * snapshot_copy_data
35568 +static void snapshot_copy_data(struct snapshot_volume * org_volume,
35569 + struct buffer_head * org_bh)
35571 + struct snapshot_volume * snap_volume, * next_volume;
35572 + struct async_org_io * async_org_io = NULL;
35573 + struct snap_io_buffer * buf;
35574 + int queued_org_bh = FALSE;
35576 + /* Check each snapshot on this original
35577 + * to see which ones need a remap.
35579 + for ( snap_volume = org_volume->snapshot_next;
35580 + snap_volume; snap_volume = next_volume ) {
35581 + next_volume = snap_volume->snapshot_next;
35582 + snapshot_copy_1(snap_volume, org_bh, &async_org_io,
35583 + &queued_org_bh, FALSE);
35586 + if (async_org_io) {
35587 + /* One or more snapshots need a remap. The async_io structures
35588 + * have been built. Now we just need to run through them and
35589 + * start all of the reads.
35591 + for ( buf = async_org_io->copy_buffers;
35592 + buf; buf = buf->buffer_next ) {
35593 + R_IO(org_volume->logical_node, buf->bh);
35595 + } else if (!queued_org_bh) {
35596 + /* None of the snapshots needed a remap, and we didn't have to
35597 + * queue this request to be processed later due to a copy in
35598 + * progress. The write can be sent down normally.
35600 + W_IO(org_volume->logical_node, org_bh);
35605 + * writeable_snapshot_copy_data
35607 +static void writeable_snapshot_copy_data(struct snapshot_volume * snap_volume,
35608 + struct buffer_head * org_bh)
35610 + struct snapshot_volume * org_volume = snap_volume->snapshot_org;
35611 + struct async_org_io * async_org_io = NULL;
35612 + struct snap_io_buffer * buf;
35613 + int rc, queued_org_bh = FALSE;
35615 + rc = snapshot_copy_1(snap_volume, org_bh, &async_org_io,
35616 + &queued_org_bh, TRUE);
35618 + org_bh->b_end_io(org_bh, 0);
35622 + if (async_org_io) {
35623 + /* Need to remap this chunk to the snapshot. The async_io
35624 + * structures have been built. Just need to run through them
35625 + * and start all of the reads.
35627 + for ( buf = async_org_io->copy_buffers; buf;
35628 + buf = buf->buffer_next ) {
35629 + R_IO(org_volume->logical_node, buf->bh);
35631 + } else if (!queued_org_bh) {
35632 + /* No remap. The write can be sent down immediately. */
35633 + W_IO(snap_volume->logical_node, org_bh);
35640 +static void snap_write(struct evms_logical_node * node,
35641 + struct buffer_head * bh)
35643 + struct snapshot_volume * volume = node->private;
35645 + /* Size check. */
35646 + if ( bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
35647 + node->total_vsectors) {
35648 + bh->b_end_io(bh, 0);
35652 + /* Can't write if rollback is in progress. */
35653 + if ( volume->flags & EVMS_SNAPSHOT_ROLLBACK ) {
35654 + LOG_ERROR("Cannot write to snapshot '%s' during rollback.\n",
35655 + volume->logical_node->name);
35656 + bh->b_end_io(bh, 0);
35660 + if ( volume->flags & EVMS_SNAPSHOT ) {
35662 + if ( volume->flags & EVMS_SNAPSHOT_WRITEABLE ) {
35663 + writeable_snapshot_copy_data(volume, bh);
35665 + bh->b_end_io(bh, 0);
35669 + snapshot_copy_data(volume, bh);
35676 +static int snap_ioctl(struct evms_logical_node * logical_node,
35677 + struct inode * inode,
35678 + struct file * file,
35679 + unsigned int cmd,
35680 + unsigned long arg)
35682 + struct snapshot_volume * volume = logical_node->private;
35683 + struct evms_quiesce_vol_pkt * quiesce;
35684 + struct evms_plugin_ioctl_pkt pkt, * user_pkt;
35685 + int percent_full, rc = 0;
35688 + case EVMS_QUIESCE_VOLUME:
35689 + quiesce = (struct evms_quiesce_vol_pkt*)arg;
35690 + if (quiesce->command) {
35692 + volume->flags |= EVMS_SNAPSHOT_QUIESCED;
35695 + volume->flags &= ~EVMS_SNAPSHOT_QUIESCED;
35699 + case EVMS_GET_BMAP:
35700 + if ( volume->flags & EVMS_SNAPSHOT_ORG ) {
35701 + rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
35707 + case EVMS_PLUGIN_IOCTL:
35708 + user_pkt = (struct evms_plugin_ioctl_pkt *)arg;
35710 + /* Copy user's parameters to kernel space. */
35711 + if ( copy_from_user(&pkt, user_pkt, sizeof(pkt)) ) {
35716 + if ( pkt.feature_id != logical_node->plugin->id ) {
35717 + /* This ioctl is not targeted at snapshotting, so
35718 + * broadcast the command to all children.
35720 + rc = IOCTL(logical_node, inode, file, cmd, arg);
35724 + switch (pkt.feature_command) {
35725 + case SNAPSHOT_QUERY_PERCENT_FULL:
35726 + if ( volume->flags & EVMS_SNAPSHOT_FULL ) {
35727 + percent_full = -1;
35728 + } else if ( volume->flags & EVMS_SNAPSHOT_DISABLED ) {
35729 + percent_full = -2;
35731 + percent_full = (volume->next_free_chunk * 100) /
35732 + volume->num_chunks;
35734 + rc = copy_to_user(pkt.feature_ioctl_data,
35736 + sizeof(percent_full));
35739 + case SNAPSHOT_START_ROLLBACK:
35740 + if ( volume->flags & EVMS_SNAPSHOT_FULL ) {
35742 + } else if ( volume->flags & EVMS_SNAPSHOT_DISABLED ) {
35744 + } else if ( ! (volume->flags & EVMS_SNAPSHOT) ) {
35747 + set_snapshot_flags(volume->logical_node,
35748 + EVMS_SNAPSHOT_ROLLBACK, 0);
35752 + case SNAPSHOT_CHECK_STATE:
35753 + rc = copy_to_user(pkt.feature_ioctl_data,
35755 + sizeof(volume->flags));
35763 + case EVMS_CHECK_MEDIA_CHANGE:
35764 + case EVMS_REVALIDATE_DISK:
35765 + case EVMS_GET_DISK_LIST:
35766 + case EVMS_CHECK_DEVICE_STATUS:
35767 + /* Broadcast these to all children. */
35768 + if ( ! (volume->flags & EVMS_SNAPSHOT_ORG) ) {
35769 + volume = volume->snapshot_org;
35772 + rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
35773 + volume = volume->snapshot_next;
35777 + case EVMS_OPEN_VOLUME:
35778 + /* Disallow opens on rollback in progress.
35779 + * Otherwise fall through.
35781 + if ( volume->flags & EVMS_SNAPSHOT_ROLLBACK) {
35782 + LOG_ERROR("Cannot open snapshot volume '%s' during rollback\n",
35783 + volume->logical_node->name);
35789 + rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
35798 +static int snap_init_io(struct evms_logical_node * node,
35804 + struct snapshot_volume * volume = node->private;
35806 + /* No init io access to snapshot, and no writes allowed to original
35807 + * since they would not be snapshotted.
35809 + if ( rw || (volume->flags & EVMS_SNAPSHOT) ) {
35812 + return INIT_IO(volume->logical_node, rw,
35813 + sect_nr, num_sects, buf_addr);
35817 + * add_cow_entry_to_snapshot_map
35819 + * This function takes a cow table entry (from the on-disk data), and
35820 + * converts it into an appropriate entry for the snapshot map, and
35821 + * inserts it into the appropriate map for the specified volume.
35823 +static int add_cow_entry_to_snapshot_map(u64 org_chunk,
35825 + struct snapshot_volume * volume)
35827 + struct snapshot_hash_entry * new_entry, * target_entry;
35828 + unsigned long hash_value;
35830 + new_entry = allocate_snapshot_hash_entry(volume, org_chunk,
35831 + snap_chunk, SNAP_CHUNK_COPIED);
35832 + if (!new_entry) {
35836 + hash_value = (long)org_chunk % volume->hash_table_size;
35837 + if ( search_snapshot_hash_chain(org_chunk,
35838 + volume->snapshot_map[hash_value],
35839 + &target_entry) ) {
35840 + /* A duplicate mapping was found. This should never happen. */
35842 + if (target_entry) {
35843 + insert_snapshot_hash_entry(new_entry, target_entry);
35845 + insert_snapshot_hash_entry_at_head(new_entry,
35846 + &(volume->snapshot_map[hash_value]));
35853 + * build_snapshot_maps
35855 + * Construct the initial hash table state based on
35856 + * existing COW tables on the disk.
35858 +static int build_snapshot_maps(struct snapshot_volume * volume)
35861 + int done = FALSE;
35863 + /* Read in one sector's worth of COW tables. */
35864 + if ( INIT_IO(volume->logical_node, 0,
35865 + volume->current_cow_sector, 1,
35866 + volume->cow_table) ) {
35870 + /* Translate every valid COW table entry into
35871 + * a snapshot map entry.
35873 + for ( volume->next_cow_entry = 0;
35874 + volume->next_cow_entry < (EVMS_VSECTOR_SIZE/sizeof(u64)) &&
35875 + volume->cow_table[volume->next_cow_entry] != 0xffffffffffffffff;
35876 + volume->next_cow_entry++, volume->next_free_chunk++ ) {
35877 + rc = add_cow_entry_to_snapshot_map(le64_to_cpup(&volume->cow_table[volume->next_cow_entry]),
35878 + volume->next_free_chunk,
35885 + /* Move on to the next sector if necessary. */
35886 + if ( volume->next_cow_entry ==
35887 + (EVMS_VSECTOR_SIZE/sizeof(u64)) ) {
35888 + volume->current_cow_sector++;
35897 + * initialize_snapshot_node
35899 +static int initialize_snapshot_node(struct evms_logical_node * snap_node,
35900 + struct evms_logical_node * new_snap_node,
35901 + struct evms_logical_node * org_node,
35902 + struct snapshot_metadata * metadata)
35904 + struct snapshot_volume * snap_volume;
35905 + struct snapshot_hash_entry * new_entry;
35908 + /* Instance data for the snapshot. */
35909 + snap_volume = kmalloc(sizeof(struct snapshot_volume), GFP_KERNEL);
35910 + if (!snap_volume) {
35911 + set_snapshot_flags(snap_node, EVMS_SNAPSHOT_DISABLED, 0);
35912 + snap_delete_volume(new_snap_node);
35913 + DELETE(snap_node);
35916 + memset(snap_volume, 0, sizeof(struct snapshot_volume));
35918 + /* Initialize the snapshot node. */
35919 + new_snap_node->total_vsectors = org_node->total_vsectors;
35920 + new_snap_node->plugin = &plugin_header;
35921 + new_snap_node->private = snap_volume;
35922 + new_snap_node->flags = snap_node->flags |
35923 + (org_node->flags & (EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL)) |
35924 + ((metadata->flags & EVMS_SNAPSHOT_WRITEABLE) ? 0 : EVMS_VOLUME_READ_ONLY);
35925 + new_snap_node->hardsector_size = snap_node->hardsector_size;
35926 + new_snap_node->block_size = snap_node->block_size;
35927 + new_snap_node->system_id = EVMS_SNAPSHOT_SIGNATURE;
35928 + new_snap_node->volume_info = snap_node->volume_info;
35929 + /* Get the new node's name from the consumed node's feature header. */
35930 + strcpy(new_snap_node->name, snap_node->feature_header->object_name);
35932 + /* Initialize the private data. */
35933 + snap_volume->logical_node = snap_node;
35934 + snap_volume->exported_node = new_snap_node;
35935 + init_rwsem(&snap_volume->snap_semaphore);
35936 + snap_volume->chunk_size = metadata->chunk_size;
35937 + snap_volume->chunk_shift = evms_cs_log2((u64)metadata->chunk_size);
35938 + snap_volume->num_chunks = metadata->total_chunks;
35939 + snap_volume->current_cow_sector = metadata->lba_of_COW_table;
35940 + snap_volume->hash_table_size = metadata->total_chunks / MAX_HASH_CHAIN_ENTRIES + 1;
35941 + snap_volume->flags = EVMS_SNAPSHOT |
35942 + (metadata->flags & EVMS_SNAPSHOT_WRITEABLE) |
35943 + (metadata->flags & EVMS_SNAPSHOT_ASYNC);
35944 + INIT_LIST_HEAD(&snap_volume->cow_table_write_list);
35945 + spin_lock_init(&snap_volume->cow_table_write_list_lock);
35947 +#ifdef SNAPSHOT_DEBUG
35948 + snap_volume->cow_table_writes = (atomic_t)ATOMIC_INIT(0);
35949 + snap_volume->cow_table_overlaps = (atomic_t)ATOMIC_INIT(0);
35952 + if ( metadata->flags & EVMS_SNAPSHOT_ROLLBACK ) {
35954 + /* Buffer for reading rollback data. */
35955 + snap_volume->chunk_data_buffer = kmalloc(SNAPSHOT_CHUNK_BUFFER_SIZE <<
35956 + EVMS_VSECTOR_SIZE_SHIFT,
35958 + if (!snap_volume->chunk_data_buffer) {
35959 + disable_snapshot(snap_volume, TRUE);
35960 + snap_delete_volume(new_snap_node);
35964 + /* Create the rollback thread. */
35965 + snap_volume->rollback_thread =
35966 + evms_cs_register_thread(snapshot_do_rollback,
35968 + "evms_snapshot_rollback");
35969 + if (!snap_volume->rollback_thread){
35970 + LOG_SERIOUS("Could not start rollback thread for snapshot '%s'.\n",
35971 + snap_node->name);
35972 + disable_snapshot(snap_volume, TRUE);
35973 + snap_delete_volume(new_snap_node);
35977 + /* Snapshot hash table. */
35978 + snap_volume->snapshot_map = vmalloc(snap_volume->hash_table_size *
35979 + sizeof(struct snapshot_hash_entry*));
35980 + if (!snap_volume->snapshot_map) {
35981 + disable_snapshot(snap_volume, TRUE);
35982 + snap_delete_volume(new_snap_node);
35985 + memset(snap_volume->snapshot_map, 0,
35986 + snap_volume->hash_table_size *
35987 + sizeof(struct snapshot_hash_entry*));
35989 + /* Pre-allocate all of the hash entries we will need and
35990 + * store them in the free list in the volume.
35992 + for ( i = 0; i < snap_volume->num_chunks; i++ ) {
35993 + new_entry = mempool_alloc(snap_hash_entry_pool,
35995 + if (!new_entry) {
35996 + disable_snapshot(snap_volume, TRUE);
35997 + snap_delete_volume(new_snap_node);
36000 + new_entry->next = snap_volume->free_hash_list;
36001 + snap_volume->free_hash_list = new_entry;
36004 + rc = build_snapshot_maps(snap_volume);
36006 + disable_snapshot(snap_volume, TRUE);
36007 + snap_delete_volume(new_snap_node);
36016 + * initialize_original_node
36018 +static int initialize_original_node(struct evms_logical_node * snap_node,
36019 + struct evms_logical_node * new_snap_node,
36020 + struct evms_logical_node * org_node,
36021 + struct evms_logical_node * new_org_node)
36023 + struct snapshot_volume * snap_volume = new_snap_node->private;
36024 + struct snapshot_volume * org_volume;
36026 + /* Instance data for the original. */
36027 + org_volume = kmalloc(sizeof(struct snapshot_volume), GFP_KERNEL);
36028 + if (!org_volume) {
36029 + disable_snapshot(snap_volume, TRUE);
36030 + snap_delete_volume(new_snap_node);
36031 + snap_delete_volume(new_org_node);
36034 + memset(org_volume, 0, sizeof(struct snapshot_volume));
36036 + /* Initialize the new node. */
36037 + new_org_node->total_vsectors = org_node->total_vsectors;
36038 + new_org_node->plugin = &plugin_header;
36039 + new_org_node->private = org_volume;
36040 + new_org_node->flags = org_node->flags |
36041 + (snap_node->flags &
36042 + (EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
36043 + new_org_node->hardsector_size = org_node->hardsector_size;
36044 + new_org_node->block_size = org_node->block_size;
36045 + new_org_node->system_id = EVMS_ORIGINAL_SIGNATURE;
36046 + new_org_node->volume_info = org_node->volume_info;
36047 + /* Must reuse the original node's name. */
36048 + strcpy(new_org_node->name, org_node->name);
36050 + /* Initialize the private data. */
36051 + org_volume->logical_node = org_node;
36052 + org_volume->exported_node = new_org_node;
36053 + init_rwsem(&org_volume->snap_semaphore);
36054 + org_volume->chunk_size = snap_volume->chunk_size;
36055 + org_volume->chunk_shift = snap_volume->chunk_shift;
36056 + org_volume->flags = EVMS_SNAPSHOT_ORG |
36057 + (snap_volume->flags & EVMS_SNAPSHOT_ASYNC);
36058 + INIT_LIST_HEAD(&org_volume->chunk_write_list);
36059 + spin_lock_init(&org_volume->chunk_write_list_lock);
36060 + INIT_LIST_HEAD(&org_volume->org_pending_io_list);
36061 + spin_lock_init(&org_volume->org_pending_io_list_lock);
36062 + INIT_LIST_HEAD(&org_volume->snap_pending_io_list);
36063 + spin_lock_init(&org_volume->snap_pending_io_list_lock);
36065 + /* Start the async I/O thread for this original. */
36066 + org_volume->async_io_thread =
36067 + evms_cs_register_thread(snap_async_io_thread, org_volume,
36068 + "evms_async_snapshot");
36069 + if (!org_volume->async_io_thread) {
36070 + disable_snapshot(snap_volume, TRUE);
36071 + snap_delete_volume(new_snap_node);
36072 + snap_delete_volume(new_org_node);
36082 + * Initializes a snapshot instance and exports an evms_logical_node to
36083 + * the global list.
36085 +static int add_snapshot(struct evms_logical_node * snap_node,
36086 + struct snapshot_metadata * metadata,
36087 + struct evms_logical_node ** evms_node_list)
36089 + struct evms_logical_node * new_snap_node;
36090 + struct evms_logical_node * new_org_node;
36091 + struct evms_logical_node * org_node;
36092 + struct snapshot_volume * snap_volume;
36093 + struct snapshot_volume * org_volume;
36094 + struct snapshot_volume * tmp_volume;
36097 + /* Make sure the snapshot is not full or disabled. */
36098 + if ( metadata->flags & (EVMS_SNAPSHOT_DISABLED | EVMS_SNAPSHOT_FULL) ) {
36099 + LOG_WARNING("Error: Snapshot %s discovered as disabled/full.\n",
36100 + snap_node->name);
36101 + LOG_WARNING(" Deleting from further use.\n");
36102 + DELETE(snap_node);
36106 + /* Inspect the global list until a node is found with the name of
36107 + * this snapshot's original. There can only be one original for
36110 + for ( org_node = *evms_node_list;
36111 + org_node && strncmp(EVMS_GET_NODE_NAME(org_node),
36112 + metadata->original_volume,
36113 + EVMS_VOLUME_NAME_SIZE);
36114 + org_node = org_node->next ) {
36118 + /* No original was found. Disable and delete the snapshot. */
36119 + LOG_ERROR("Error: No original found for snapshot %s, looking for %s\n",
36120 + snap_node->name, metadata->original_volume);
36121 + set_snapshot_flags(snap_node, EVMS_SNAPSHOT_DISABLED, 0);
36122 + DELETE(snap_node);
36126 + LOG_DEBUG("Adding snapshot for '%s'\n", org_node->name);
36128 + /* We found the original on the list. Verify the size to be sure the
36129 + * name didn't change for compatibility. For non-512-byte hardsector
36130 + * sizes, round down org node to a hardsector multiple to be the same
36131 + * as what was stored in the metadata.
36133 + if ( (org_node->total_vsectors &
36134 + (~((org_node->hardsector_size/EVMS_VSECTOR_SIZE)-1))) !=
36135 + metadata->original_size ) {
36136 + /* The snapshot no longer points at a valid original.
36137 + * Disable and delete the snapshot.
36139 + LOG_ERROR("Error: Original volume size does not match for snapshot '%s'!\n",
36140 + snap_node->name);
36141 + LOG_ERROR(" volume=%s: org_size="PFU64", current size="PFU64"\n",
36142 + org_node->name, metadata->original_size,
36143 + org_node->total_vsectors);
36144 + set_snapshot_flags(snap_node, EVMS_SNAPSHOT_DISABLED, 0);
36145 + DELETE(snap_node);
36149 + /* New EVMS node for the snapshot. */
36150 + if ( evms_cs_allocate_logical_node(&new_snap_node) ) {
36151 + set_snapshot_flags(snap_node, EVMS_SNAPSHOT_DISABLED, 0);
36152 + DELETE(snap_node);
36156 + MOD_INC_USE_COUNT;
36157 + snapshot_count++;
36159 + snapshot_create_pools();
36161 + rc = initialize_snapshot_node(snap_node, new_snap_node,
36162 + org_node, metadata);
36166 + snap_volume = new_snap_node->private;
36168 + /* Check to see if the node we found is one we put back on the list due
36169 + * to another snapshot of the original, if so then don't allocate a new
36170 + * node and volume info, just get the old one.
36172 + if ( org_node->plugin->id != plugin_header.id ) {
36174 + /* New EVMS node for the original. */
36175 + if ( evms_cs_allocate_logical_node(&new_org_node) ) {
36176 + disable_snapshot(snap_volume, TRUE);
36177 + snap_delete_volume(new_snap_node);
36181 + MOD_INC_USE_COUNT;
36182 + snapshot_count++;
36184 + rc = initialize_original_node(snap_node, new_snap_node,
36185 + org_node, new_org_node);
36189 + org_volume = new_org_node->private;
36191 + /* Remove the original volume from the global list, then
36192 + * add the new version of the original to the global list.
36194 + evms_cs_remove_logical_node_from_list(evms_node_list, org_node);
36195 + evms_cs_add_logical_node_to_list(evms_node_list, new_org_node);
36197 + /* There is already at least one snapshot for this original. */
36198 + new_org_node = org_node;
36199 + org_volume = new_org_node->private;
36200 + org_node = org_volume->logical_node;
36202 + /* Make sure this snapshot matches the current
36203 + * chunk size if we have async snapshots.
36205 + if ( snap_volume->chunk_size != org_volume->chunk_size ) {
36206 + LOG_ERROR("Cannot add snapshot '%s' with chunk size %u to original '%s' with chunk size %u.\n",
36207 + new_snap_node->name, snap_volume->chunk_size,
36208 + new_org_node->name, org_volume->chunk_size);
36209 + disable_snapshot(snap_volume, TRUE);
36210 + snap_delete_volume(new_snap_node);
36214 + /* If the new snapshot is Removable or Partial, propagate
36215 + * the flags to the original and all other snapshots.
36217 + for ( tmp_volume = org_volume;
36219 + tmp_volume = tmp_volume->snapshot_next) {
36220 + tmp_volume->exported_node->flags |=
36221 + (snap_node->flags &
36222 + (EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
36226 + /* Create a proc-fs entry for this snapshot. */
36228 + create_proc_read_entry(snap_node->feature_header->volume_name,
36229 + S_IFREG, snap_proc,
36230 + snap_proc_read, new_snap_node);
36233 + /* Insert the new snapshot at the start of the original's chain. */
36234 + down_write(&org_volume->snap_semaphore);
36235 + snap_volume->snapshot_next = org_volume->snapshot_next;
36236 + org_volume->snapshot_next = snap_volume;
36237 + snap_volume->snapshot_org = org_volume;
36238 + up_write(&org_volume->snap_semaphore);
36240 + /* Place the new snapshot on the global list. */
36241 + evms_cs_add_logical_node_to_list(evms_node_list, new_snap_node);
36243 + if ( metadata->flags & EVMS_SNAPSHOT_ROLLBACK ) {
36244 + org_volume->flags |= EVMS_SNAPSHOT_ROLLBACK;
36245 + snap_volume->flags |= EVMS_SNAPSHOT_ROLLBACK;
36246 + evms_cs_wakeup_thread(snap_volume->rollback_thread);
36255 +void snapshot_do_rollback(void * volume)
36257 + struct snapshot_volume * snap_volume = volume;
36258 + struct snapshot_volume * org_volume = snap_volume->snapshot_org;
36259 + u32 io_size = snap_volume->chunk_size;
36260 + u32 sectors = io_size;
36261 + int done = FALSE;
36262 + int i, iterations = 1;
36264 + evms_cs_invalidate_volume(org_volume->exported_node);
36265 + evms_cs_invalidate_volume(snap_volume->exported_node);
36267 + /* Safety to start at chunk 0. */
36268 + snap_volume->next_free_chunk = 0;
36271 + if ( SNAPSHOT_CHUNK_BUFFER_SIZE < snap_volume->chunk_size ) {
36272 + iterations = snap_volume->chunk_size /
36273 + org_volume->chunk_size;
36274 + sectors = io_size = org_volume->chunk_size;
36277 + /* Read in one sector's worth of COW tables. */
36278 + if ( INIT_IO(snap_volume->logical_node, 0,
36279 + snap_volume->current_cow_sector, 1,
36280 + snap_volume->cow_table) ) {
36281 + LOG_ERROR("Error reading COW table from snapshot during rollback, aborting rollback\n");
36285 + /* Translate every valid COW table entry into
36286 + * a snapshot map entry.
36288 + for ( snap_volume->next_cow_entry = 0;
36289 + snap_volume->next_cow_entry <
36290 + (EVMS_VSECTOR_SIZE/sizeof(u64)) &&
36291 + snap_volume->cow_table[snap_volume->next_cow_entry] !=
36292 + 0xffffffffffffffff;
36293 + snap_volume->next_cow_entry++,
36294 + snap_volume->next_free_chunk++ ) {
36295 + for ( i = 0; i < iterations; i++ ) {
36297 + /* Don't go off the end of the original. */
36299 + org_volume->logical_node->total_vsectors -
36300 + (snap_volume->cow_table[snap_volume->next_cow_entry] *
36301 + snap_volume->chunk_size + i * io_size) ) {
36302 + sectors = org_volume->logical_node->total_vsectors -
36303 + (snap_volume->cow_table[snap_volume->next_cow_entry] *
36304 + snap_volume->chunk_size + i * io_size);
36307 + /* Read the chunk from the snapshot volume. */
36308 + if ( INIT_IO(snap_volume->logical_node, READ,
36309 + (snap_volume->next_free_chunk *
36310 + snap_volume->chunk_size +
36313 + snap_volume->chunk_data_buffer) ) {
36314 + LOG_ERROR("Error reading chunk %u from snapshot '%s'. Continuing.\n",
36315 + snap_volume->next_free_chunk,
36316 + snap_volume->logical_node->name);
36319 + /* Write the chunk to the original volume. */
36320 + if ( INIT_IO(org_volume->logical_node, WRITE,
36321 + snap_volume->cow_table[snap_volume->next_cow_entry] *
36322 + snap_volume->chunk_size + i*io_size,
36324 + snap_volume->chunk_data_buffer) ) {
36325 + LOG_ERROR("Error writing chunk %u to original '%s' during rollback. Continuing.\n",
36326 + snap_volume->next_free_chunk,
36327 + org_volume->logical_node->name);
36330 + if ( sectors < io_size ) {
36336 + /* Move on to the next COW table sector if necessary. */
36337 + if ( snap_volume->next_cow_entry ==
36338 + (EVMS_VSECTOR_SIZE/sizeof(u64)) ) {
36339 + snap_volume->current_cow_sector++;
36342 + snap_volume->flags |= EVMS_SNAPSHOT_DISABLED |
36343 + EVMS_SNAPSHOT_ROLLBACK_COMP;
36344 + snap_volume->flags &= ~EVMS_SNAPSHOT_ROLLBACK;
36345 + org_volume->flags &= ~EVMS_SNAPSHOT_ROLLBACK;
36346 + set_snapshot_flags(snap_volume->logical_node,
36347 + EVMS_SNAPSHOT_DISABLED |
36348 + EVMS_SNAPSHOT_ROLLBACK_COMP,
36349 + EVMS_SNAPSHOT_ROLLBACK);
36350 + LOG_DEFAULT("Rollback complete from snapshot %s\n",
36351 + snap_volume->exported_node->name);
36359 + * Callback function for the proc-fs entry for each snapshot node.
36360 + * Print out pertinent information about this snapshot. The "data"
36361 + * parameter is a pointer to an EVMS logical node.
36363 +static int snap_proc_read(char * page, char ** start, off_t off,
36364 + int count, int * eof, void * data)
36366 + struct evms_logical_node * snap_node = data;
36367 + struct snapshot_volume * snap_volume = snap_node->private;
36370 + PROCPRINT("Snapshot of : %s\n", (snap_volume->snapshot_org) ? EVMS_GET_NODE_NAME(snap_volume->snapshot_org->logical_node) : (u8 *)"Unknown");
36371 + PROCPRINT("Size (KB) : %u\n", (snap_volume->num_chunks * snap_volume->chunk_size)/2);
36372 + PROCPRINT("Chunk Size (KB): %u\n", (snap_volume->chunk_size)/2);
36373 + PROCPRINT("Writeable : %s\n", (snap_volume->flags & EVMS_SNAPSHOT_WRITEABLE) ? "Yes" : "No");
36374 + PROCPRINT("Usage : %u%%\n", (snap_volume->next_free_chunk * 100) / snap_volume->num_chunks);
36375 + PROCPRINT("Status : %s\n", (snap_volume->flags & EVMS_SNAPSHOT_FULL) ? "Full / Disabled" : (snap_volume->flags & EVMS_SNAPSHOT_DISABLED) ? "Disabled" : "Active");
36376 +#ifdef SNAPSHOT_DEBUG
36377 + PROCPRINT("Next free chunk: %u\n", snap_volume->next_free_chunk);
36378 + PROCPRINT("COW Writes : %u\n", atomic_read(&snap_volume->cow_table_writes));
36379 + PROCPRINT("COW Overlaps : %u\n", atomic_read(&snap_volume->cow_table_overlaps));
36383 + *start = page + off;
36387 + return sz > count ? count : sz;
36393 +int __init snapshot_init(void)
36395 + struct proc_dir_entry * pde;
36397 + /* Register a directory in proc-fs. */
36398 + pde = evms_cs_get_evms_proc_dir();
36400 + snap_proc = create_proc_entry("snapshot", S_IFDIR, pde);
36403 + /* Register with EVMS. */
36404 + return evms_cs_register_plugin(&plugin_header);
36410 +void __exit snapshot_exit(void)
36412 + struct proc_dir_entry * pde;
36414 + /* Unregister the directory in proc-fs. */
36415 + pde = evms_cs_get_evms_proc_dir();
36417 + remove_proc_entry("snapshot", pde);
36420 + evms_cs_unregister_plugin(&plugin_header);
36423 +module_init(snapshot_init);
36424 +module_exit(snapshot_exit);
36425 +#ifdef MODULE_LICENSE
36426 +MODULE_LICENSE("GPL");
36429 diff -Naur linux-2002-09-30/include/linux/evms/evms.h evms-2002-09-30/include/linux/evms/evms.h
36430 --- linux-2002-09-30/include/linux/evms/evms.h Wed Dec 31 18:00:00 1969
36431 +++ evms-2002-09-30/include/linux/evms/evms.h Thu Sep 26 11:55:45 2002
36433 +/* -*- linux-c -*- */
36435 + * Copyright (c) International Business Machines Corp., 2000
36437 + * This program is free software; you can redistribute it and/or modify
36438 + * it under the terms of the GNU General Public License as published by
36439 + * the Free Software Foundation; either version 2 of the License, or
36440 + * (at your option) any later version.
36442 + * This program is distributed in the hope that it will be useful,
36443 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
36444 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
36445 + * the GNU General Public License for more details.
36447 + * You should have received a copy of the GNU General Public License
36448 + * along with this program; if not, write to the Free Software
36449 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
36452 + * linux/include/linux/evms/evms.h
36454 + * EVMS kernel header file
36458 +#ifndef __EVMS_INCLUDED__
36459 +#define __EVMS_INCLUDED__
36461 +#include <linux/blk.h>
36462 +#include <linux/genhd.h>
36463 +#include <linux/fs.h>
36464 +#include <linux/iobuf.h>
36465 +#include <linux/kdev_t.h>
36466 +#include <linux/hdreg.h>
36467 +#include <linux/slab.h>
36468 +#include <linux/proc_fs.h>
36469 +#include <linux/major.h>
36474 +#define EVMS_MAJOR_VERSION 1
36475 +#define EVMS_MINOR_VERSION 2
36476 +#define EVMS_PATCHLEVEL_VERSION 0
36479 + * general defines section
36484 +#define MAX_EVMS_VOLUMES 256
36485 +#define EVMS_VOLUME_NAME_SIZE 127
36486 +#define IBM_OEM_ID 8112
36487 +#define EVMS_INITIAL_CRC 0xFFFFFFFF
36488 +#define EVMS_MAGIC_CRC 0x31415926
36489 +#define EVMS_VSECTOR_SIZE 512
36490 +#define EVMS_VSECTOR_SIZE_SHIFT 9
36492 +#define DEV_PATH "/dev"
36493 +#define EVMS_DIR_NAME "evms"
36494 +#define EVMS_DEV_NAME "block_device"
36495 +#define EVMS_DEV_NODE_PATH DEV_PATH "/" EVMS_DIR_NAME "/"
36496 +#define EVMS_DEVICE_NAME DEV_PATH "/" EVMS_DIR_NAME "/" EVMS_DEV_NAME
36499 + * kernel logging levels defines
36501 +#define EVMS_INFO_CRITICAL 0
36502 +#define EVMS_INFO_SERIOUS 1
36503 +#define EVMS_INFO_ERROR 2
36504 +#define EVMS_INFO_WARNING 3
36505 +#define EVMS_INFO_DEFAULT 5
36506 +#define EVMS_INFO_DETAILS 6
36507 +#define EVMS_INFO_DEBUG 7
36508 +#define EVMS_INFO_EXTRA 8
36509 +#define EVMS_INFO_ENTRY_EXIT 9
36510 +#define EVMS_INFO_EVERYTHING 10
36513 + * kernel logging level variable
36515 +extern int evms_info_level;
36518 + * kernel logging macros
36520 +#define evmsLOG(info_level,prspec) { if (evms_info_level >= info_level) printk prspec; }
36521 +#define evmsLOG2(info_level,statement) { if (evms_info_level >= info_level) statement; }
36524 + * LOG MACROS to make evms log messages
36525 + * look much cleaner in the source.
36527 +#define EVMS_LOG_PREFIX "evms: "
36528 +#define LOG_CRITICAL(msg, args...) evmsLOG(EVMS_INFO_CRITICAL, (KERN_CRIT EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36529 +#define LOG_SERIOUS(msg, args...) evmsLOG(EVMS_INFO_SERIOUS, (KERN_ERR EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36530 +#define LOG_ERROR(msg, args...) evmsLOG(EVMS_INFO_ERROR, (KERN_ERR EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36531 +#define LOG_WARNING(msg, args...) evmsLOG(EVMS_INFO_WARNING, (KERN_WARNING EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36532 +#define LOG_DEFAULT(msg, args...) evmsLOG(EVMS_INFO_DEFAULT, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36533 +#define LOG_DETAILS(msg, args...) evmsLOG(EVMS_INFO_DETAILS, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36534 +#define LOG_DEBUG(msg, args...) evmsLOG(EVMS_INFO_DEBUG, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36535 +#define LOG_EXTRA(msg, args...) evmsLOG(EVMS_INFO_EXTRA, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36536 +#define LOG_ENTRY_EXIT(msg, args...) evmsLOG(EVMS_INFO_ENTRY_EXIT, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36537 +#define LOG_EVERYTHING(msg, args...) evmsLOG(EVMS_INFO_EVERYTHING, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36540 + * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
36541 + * Use these in place of %Ld, %Lu, and %Lx.
36543 +#if BITS_PER_LONG > 32
36544 +#define PFD64 "%ld"
36545 +#define PFU64 "%lu"
36546 +#define PFX64 "%lx"
36548 +#define PFD64 "%Ld"
36549 +#define PFU64 "%Lu"
36550 +#define PFX64 "%Lx"
36554 + * helpful PROCFS macro
36556 +#ifdef CONFIG_PROC_FS
36557 +#define PROCPRINT(msg, args...) (sz += sprintf(page + sz, msg, ## args));\
36559 + off -= sz, sz = 0;\
36560 + else if (sz >= off + count)\
36565 + * PluginID convenience macros
36567 + * An EVMS PluginID is a 32-bit number with the following bit positions:
36568 + * Top 16 bits: OEM identifier. See IBM_OEM_ID.
36569 + * Next 4 bits: Plugin type identifier. See evms_plugin_code.
36570 + * Lowest 12 bits: Individual plugin identifier within a given plugin type.
36572 +#define SetPluginID(oem, type, id) ((oem << 16) | (type << 12) | id)
36573 +#define GetPluginOEM(pluginid) (pluginid >> 16)
36574 +#define GetPluginType(pluginid) ((pluginid >> 12) & 0xf)
36575 +#define GetPluginID(pluginid) (pluginid & 0xfff)
36578 + * enum evms_plugin_type - evms plugin types
36580 +enum evms_plugin_code {
36581 + EVMS_NO_PLUGIN = 0,
36582 + EVMS_DEVICE_MANAGER,
36583 + EVMS_SEGMENT_MANAGER,
36584 + EVMS_REGION_MANAGER,
36586 + EVMS_ASSOCIATIVE_FEATURE,
36587 + EVMS_FILESYSTEM_INTERFACE_MODULE,
36588 + EVMS_CLUSTER_MANAGER_INTERFACE_MODULE,
36589 + EVMS_DISTRIBUTED_LOCK_MANAGER_INTERFACE_MODULE
36593 + * struct evms_version -
36594 + * @major: changes when incompatible differences are introduced
36595 + * @minor: changes when additions are made
36596 + * @patchlevel: reflects bug level fixes within a particular major/minor pair
36598 + * generic versioning info used by EVMS
36600 +struct evms_version {
36607 + * struct evms_plugin_header - kernel plugin header record
36609 + * @version: plugin version
36610 + * @required_services_version: required common services version
36611 + * @fops: table of function operations
36613 + * kernel plugin header record
36615 +struct evms_plugin_header {
36617 + struct evms_version version;
36618 + struct evms_version required_services_version;
36619 + struct evms_plugin_fops *fops;
36623 + * struct evms_feature_header - EVMS generic on-disk header for features
36624 + * @signature: unique magic number
36625 + * @crc: structure's crc
36626 + * @version: feature header version
36627 + * @engine_version: created by this evms engine version
36628 + * @flags: feature characteristics, bit definitions below.
36629 + * @feature_id: indicates which feature this header is describing
36630 + * @sequence_number: describes most recent copy of redundant metadata
36631 + * @alignment_padding: used when objects are moved between different sized devices
36632 + * @feature_data1_start_lsn: object relative start of 1st copy feature data
36633 + * @feature_data1_size: size of 1st copy of feature data
36634 + * @feature_data2_start_lsn: object relative start of 2nd copy feature data
36635 + * @feature_data2_size: size of 2nd copy of feature data
36636 + * @volume_serial_number: unique/persistent volume identifier
36637 + * @volume_system_id: unique/persistent minor number
36638 + * @object_depth: depth of object in volume tree
36639 + * @object_name: object's name
36640 + * @volume_name: volume name object is a part of
36641 + * @pad: padding to make structure be 512 byte aligned
36643 + * generic on-disk header used to describe any EVMS feature
36644 + * NOTE: 2nd copy of feature data is optional, if used set start_lsn to 0.
36646 +struct evms_feature_header {
36649 + struct evms_version version;
36650 + struct evms_version engine_version;
36653 + u64 sequence_number;
36654 + u64 alignment_padding;
36655 + u64 feature_data1_start_lsn;
36656 + u64 feature_data1_size;
36657 + u64 feature_data2_start_lsn;
36658 + u64 feature_data2_size;
36659 + u64 volume_serial_number;
36660 + u32 volume_system_id;
36661 + u32 object_depth;
36662 + u8 object_name[EVMS_VOLUME_NAME_SIZE + 1];
36663 + u8 volume_name[EVMS_VOLUME_NAME_SIZE + 1];
36668 + * field evms_feature_header.signature magic number
36670 +#define EVMS_FEATURE_HEADER_SIGNATURE 0x54414546 /* FEAT */
36672 + * field evms_feature_header.flags defines
36674 +#define EVMS_FEATURE_ACTIVE (1<<0)
36675 +#define EVMS_FEATURE_VOLUME_COMPLETE (1<<1)
36676 +#define EVMS_VOLUME_DATA_OBJECT (1<<16)
36677 +#define EVMS_VOLUME_DATA_STOP (1<<17)
36679 + * struct evms_feature_header version info
36681 +#define EVMS_FEATURE_HEADER_MAJOR 3
36682 +#define EVMS_FEATURE_HEADER_MINOR 0
36683 +#define EVMS_FEATURE_HEADER_PATCHLEVEL 0
36686 + * EVMS specific error codes
36688 +#define EVMS_FEATURE_FATAL_ERROR 257
36689 +#define EVMS_VOLUME_FATAL_ERROR 258
36690 +#define EVMS_FEATURE_INCOMPLETE_ERROR 259
36693 + * struct evms_volume_info - exported volume info
36694 + * @volume_sn: unique volume identifier
36695 + * @volume_minor: persistent device minor assigned to this volume
36696 + * @volume_name: persistent name assigned to this volume
36698 + * a collection of volume specific info
36700 +struct evms_volume_info {
36702 + u32 volume_minor;
36703 + u8 volume_name[EVMS_VOLUME_NAME_SIZE + 1];
36707 + * struct evms_logical_node - generic kernel storage object
36708 + * @total_vsectors: 0 size of this object in 512 byte units
36709 + * @plugin: 8 plugin that created/owns/manages this storage object
36710 + * @private: 12 location for owner to store private info
36711 + * @flags: 16 storage object characteristics (set/used by plugins)
36712 + * bit definitions located in evms_common.h
36713 + * @iflags: 20 internal flags (used exclusively by the framework, not for plugins to use/set)
36714 + * bit definitions below.
36715 + * @hardsector_size: 24 assumed physical sector size of underlying device
36716 + * @block_size: 28 default block size for this object
36717 + * @system_id: 32 system indicator (set by the segment manager)
36718 + * @volume_info: 36 persistent volume info, used only by EVMS volumes
36719 + * @feature_header: 40 generic on-disk metadata describing any EVMS feature
36720 + * @next: 44 linked list field
36721 + * @name: 48 storage object name
36724 + * generic kernel storage object
36726 +struct evms_logical_node {
36727 + u64 total_vsectors;
36728 + struct evms_plugin_header *plugin;
36732 + int hardsector_size;
36735 + struct evms_volume_info *volume_info;
36736 + struct evms_feature_header *feature_header;
36737 + struct evms_logical_node *next;
36738 + u8 name[EVMS_VOLUME_NAME_SIZE + 1];
36742 + * fields evms_logical_node.flags & evms_logical_volume.flags defines
36744 +#define EVMS_FLAGS_WIDTH 32
36745 +#define EVMS_VOLUME_FLAG (1<<0)
36746 +#define EVMS_VOLUME_PARTIAL_FLAG (1<<1)
36747 +#define EVMS_VOLUME_PARTIAL (1<<1)
36748 +#define EVMS_VOLUME_SET_READ_ONLY (1<<2)
36749 +#define EVMS_VOLUME_READ_ONLY (1<<2)
36751 + * these bits define volume status
36753 +#define EVMS_MEDIA_CHANGED (1<<20)
36754 +#define EVMS_DEVICE_UNPLUGGED (1<<21)
36756 + * these bits used for removable status
36758 +#define EVMS_DEVICE_MEDIA_PRESENT (1<<24)
36759 +#define EVMS_DEVICE_PRESENT (1<<25)
36760 +#define EVMS_DEVICE_LOCKABLE (1<<26)
36761 +#define EVMS_DEVICE_REMOVABLE (1<<27)
36764 + * fields evms_logical_node.iflags defines
36766 +#define EVMS_FEATURE_BOTTOM (1<<0)
36767 +#define EVMS_TOP_SEGMENT (1<<1)
36770 + * macro to obtain a node's name from either EVMS or compatibility volumes
36772 +#define EVMS_GET_NODE_NAME(node) \
36773 + ((node->flags & EVMS_VOLUME_FLAG) ? \
36774 + node->volume_info->volume_name : \
36778 + * macro used to transform to/from userland device handles and device storage object nodes
36780 +#define EVMS_HANDLE_KEY 0x0123456789ABCDEF
36781 +#define DEV_HANDLE_TO_NODE(handle) ((struct evms_logical_node *)(unsigned long)((handle) ^ EVMS_HANDLE_KEY))
36782 +#define NODE_TO_DEV_HANDLE(node) (((u64)(unsigned long)(node)) ^ EVMS_HANDLE_KEY)
36785 + * struct evms_logical_volume - logical volume info
36786 + * @name: logical volume name
36787 + * @node: logical volume storage object
36788 + * @flags: characteristics of logical volume
36789 + * @quiesced: quiesce state info
36790 + * @vfs_quiesced: vfs quiesce state info
36791 + * @requests_in_progress: count of in-flight I/Os
36792 + * @wait_queue: used when volume is quiesced
36793 + * @devfs_handle: handle for devfs
36794 + * @request_queue: unique request queue
36795 + * @request_lock: unique request queue lock
36797 + * contains all the fields needed to manage to a logical volume
36799 +struct evms_logical_volume {
36801 + struct evms_logical_node *node;
36804 + int vfs_quiesced;
36806 + atomic_t requests_in_progress;
36807 + wait_queue_head_t wait_queue;
36808 + devfs_handle_t devfs_handle;
36810 + request_queue_t request_queue;
36811 + spinlock_t request_lock;
36816 + * field evms_logical_volume.flags defines
36819 + * queued flags bits
36821 +#define EVMS_REQUESTED_DELETE (1<<5)
36822 +#define EVMS_REQUESTED_QUIESCE (1<<6)
36823 +#define EVMS_REQUESTED_VFS_QUIESCE (1<<7)
36825 + * this bit indicates corruption
36827 +#define EVMS_VOLUME_CORRUPT (1<<8)
36829 + * these bits define the source of the corruption
36831 +#define EVMS_VOLUME_SOFT_DELETED (1<<9)
36832 +#define EVMS_DEVICE_UNAVAILABLE (1<<10)
36835 + * The following function table is used for all plugins.
36838 + * struct evms_plugin_fops - evms plugin's table of function operations
36839 + * @discover: volume discovery entry point
36840 + * @end_discover: final discovery entry point
36841 + * @delete: delete volume entry point
36842 + * @read: asynchronous read entry point
36843 + * @write: asynchronous write entry point
36844 + * @init_io: synchronous io entry point
36845 + * @ioctl: generic ioctl entry point
36846 + * @direct_ioctl: non-generic ioctl entry point
36848 + * evms plugin's table of function operations
36850 +struct evms_plugin_fops {
36851 + int (*discover) (struct evms_logical_node **);
36852 + int (*end_discover) (struct evms_logical_node **);
36853 + int (*delete) (struct evms_logical_node *);
36854 + void (*read) (struct evms_logical_node *, struct buffer_head *);
36855 + void (*write) (struct evms_logical_node *, struct buffer_head *);
36856 + int (*init_io) (struct evms_logical_node *, int, u64,
36858 + int (*ioctl) (struct evms_logical_node *, struct inode *,
36859 + struct file *, u32, unsigned long);
36860 + int (*direct_ioctl) (struct inode *, struct file *,
36861 + u32, unsigned long);
36865 + * convenience macros to use plugin's fops entry points
36867 +#define DISCOVER(node, list) ((node)->plugin->fops->discover(list))
36868 +#define END_DISCOVER(node, list) ((node)->plugin->fops->end_discover(list))
36869 +#define DELETE(node) ((node)->plugin->fops->delete(node))
36870 +#define R_IO(node, bh) ((node)->plugin->fops->read(node, bh))
36871 +#define W_IO(node, bh) ((node)->plugin->fops->write(node, bh))
36872 +#define INIT_IO(node, rw_flag, start_sec, num_secs, buf_addr) ((node)->plugin->fops->init_io(node, rw_flag, start_sec, num_secs, buf_addr))
36873 +#define IOCTL(node, inode, file, cmd, arg) ((node)->plugin->fops->ioctl(node, inode, file, cmd, arg))
36874 +#define DIRECT_IOCTL(reg_record, inode, file, cmd, arg) ((reg_record)->plugin->fops->direct_ioctl(inode, file, cmd, arg))
36877 + * struct evms_list_node - generic non-embedded list node object
36878 + * @item: ptr to object in list
36879 + * @next: ptr to next item in list
36881 + * light weight generic non-embedded list object definition
36883 +struct evms_list_node {
36885 + struct evms_list_node *next;
36889 + * struct evms_pool_mgmt - anchor block for private pool management
36890 + * @cachep: kmem_cache_t variable
36891 + * @member_size: size of each element in the pool
36893 + * @waiters: count of waiters
36894 + * @wait_queue: list of waiters
36895 + * @name: name of the pool (must be less than 20 chars)
36897 + * anchor block for private pool management
36899 +struct evms_pool_mgmt {
36900 + kmem_cache_t *cachep;
36903 + atomic_t waiters;
36904 + wait_queue_head_t wait_queue;
36910 + * All of the following kernel thread functions belong to EVMS base.
36911 + * These functions were copied from md_core.c
36913 +#define EVMS_THREAD_WAKEUP 0
36915 + * struct evms_thread
36918 + * @wqueue: thread wait queue
36919 + * @flags: thread attributes
36920 + * @event: event completion
36921 + * @tsk: task info
36922 + * @name: thread name
36924 + * data structure for creating/managing a kernel thread
36926 +struct evms_thread {
36927 + void (*run) (void *data);
36929 + wait_queue_head_t wqueue;
36930 + unsigned long flags;
36931 + struct completion *event;
36932 + struct task_struct *tsk;
36937 + * EVMS (common services) exported functions prototypes
36939 + * since these function names are global, evms_cs_ has been prepended
36940 + * to each function name, to ensure they do not collide with any
36941 + * other global functions in the kernel.
36943 +#define EVMS_COMMON_SERVICES_MAJOR 0
36944 +#define EVMS_COMMON_SERVICES_MINOR 6
36945 +#define EVMS_COMMON_SERVICES_PATCHLEVEL 0
36947 +void evms_cs_get_version(int *, int *);
36948 +int evms_cs_check_version(struct evms_version *, struct evms_version *);
36949 +int evms_cs_register_plugin(struct evms_plugin_header *);
36950 +int evms_cs_unregister_plugin(struct evms_plugin_header *);
36951 +#ifdef EVMS_MEM_DEBUG
36952 +int evms_cs_verify_memory_integrity(int);
36954 +int evms_cs_allocate_logical_node(struct evms_logical_node **);
36955 +void evms_cs_deallocate_volume_info(struct evms_logical_node *);
36956 +void evms_cs_deallocate_logical_node(struct evms_logical_node *);
36957 +int evms_cs_add_logical_node_to_list(struct evms_logical_node **,
36958 + struct evms_logical_node *);
36959 +int evms_cs_remove_logical_node_from_list(struct evms_logical_node **,
36960 + struct evms_logical_node *);
36961 +int evms_cs_kernel_ioctl(struct evms_logical_node *, u32,
36963 +inline unsigned long evms_cs_size_in_vsectors(long long);
36964 +inline int evms_cs_log2(long long);
36965 +u32 evms_cs_calculate_crc(u32, void *, u32);
36966 +int evms_cs_register_for_end_io_notification(void *,
36967 + struct buffer_head *,
36968 + void *callback_function);
36969 +struct evms_pool_mgmt *evms_cs_create_pool(int,
36971 + void (*ctor) (void *, kmem_cache_t *,
36973 + void (*dtor) (void *, kmem_cache_t *,
36975 +#define EVMS_BLOCKABLE TRUE
36976 +void *evms_cs_allocate_from_pool(struct evms_pool_mgmt *, int);
36977 +void evms_cs_deallocate_to_pool(struct evms_pool_mgmt *, void *);
36978 +void evms_cs_destroy_pool(struct evms_pool_mgmt *);
36979 +struct evms_list_node **evms_cs_lookup_item_in_list(struct evms_list_node **,
36981 +int evms_cs_add_item_to_list(struct evms_list_node **, void *);
36982 +int evms_cs_remove_item_from_list(struct evms_list_node **, void *);
36983 +int evms_cs_register_device(struct evms_logical_node *);
36984 +int evms_cs_unregister_device(struct evms_logical_node *);
36985 +int evms_cs_find_next_device(struct evms_logical_node *,
36986 + struct evms_logical_node **);
36987 +void evms_cs_signal_event(int);
36988 +struct evms_thread *evms_cs_register_thread(void (*run) (void *),
36989 + void *data, const u8 *name);
36990 +void evms_cs_unregister_thread(struct evms_thread *thread);
36991 +void evms_cs_wakeup_thread(struct evms_thread *thread);
36992 +void evms_cs_interrupt_thread(struct evms_thread *thread);
36993 +struct proc_dir_entry *evms_cs_get_evms_proc_dir(void);
36994 +int evms_cs_volume_request_in_progress(kdev_t, int, int *);
36995 +void evms_cs_invalidate_volume(struct evms_logical_node *topmost_node);
36997 +/* EVMS exported global variables */
36998 +extern struct evms_pool_mgmt *evms_bh_pool;
36999 +extern u8 *evms_primary_string;
37000 +extern u8 *evms_secondary_string;
37002 +/* Have to include this at the end, since it depends
37003 + * on structures and definitions in this file.
37005 +#include <linux/evms/evms_ioctl.h>
37008 diff -Naur linux-2002-09-30/include/linux/evms/evms_aix.h evms-2002-09-30/include/linux/evms/evms_aix.h
37009 --- linux-2002-09-30/include/linux/evms/evms_aix.h Wed Dec 31 18:00:00 1969
37010 +++ evms-2002-09-30/include/linux/evms/evms_aix.h Mon Sep 23 15:11:41 2002
37013 +* The following structures are nested within the structures used by the
37014 +* system management routines. These structures and sizes were pulled from the AIX
37017 +#define LVM_MAXLPS 65535 /* max number of logical partitions allowed */
37018 +#define LVM_NAMESIZ 64 /* maximum size for the logical volume name */
37019 +#define LVM_NUMCOPIES 3 /* max number of copies allowed of a logical partition */
37020 +#define LVM_MAXVGS 255
37021 +#define LVM_MAXPVS 32
37022 +#define LVM_MAXLVS 256
37023 +#define AIX_MIN_BLOCK_SIZE 4096
37024 +#define VGSA_BT_PV 127
37027 +#define OFFSET_CONSTANT 144
37028 +#define SLEEP_TIME 0
37029 +#define MAXLVS_OFFSET 16
37030 +#define PHYS_VOL_OFFSET 34
37031 +#define AIX_PVHPP_LENGTH PHYS_VOL_OFFSET
37032 +#define MAX_SECTORS_NAMELIST 32
37033 +#define AIX_DEFAULT_MIRRORING 1
37034 +#define AIX_FIRST_MIRROR 2
37035 +#define AIX_MAX_MIRRORS 3 // AIX defines ALL copies as mirrors - 3 mirrors MAX - 1 orig and 2 copies
37037 +#define EVMS_AIX_FEATURE_ID 3
37039 +#define EVMS_AIX_RESYNC_MIRRORS 1
37041 +#define PSN_LVM_REC 7
37042 +#define PSN_VGSA_REC 128
37043 +#define PSN_NAMELIST_REC 2065
37044 +#define PSN_VGT_TRAILER 135
37045 +#define PSN_LVE_REC 1
37046 +#define PSN_PPH_OFFSET 17
37047 +#define PSN_PVH_INCREMENT 17
37048 +#define AIX_MIN_PVH_SIZE 271 // used to find the PV header info for Pv's other than 0
37049 +#define AIX_SECTOR_SIZE 512
37050 +#define MAX_PPENT_SECTOR 16
37051 +#define NAME_LEN 128 /* don't change!!! */
37052 +#define UUID_LEN 32 /* don't change!!! */
37053 +#define MAX_SECTORS_LV_ENTRIES 16
37054 +#define AIX_MIN_MIRROR_POOL 10
37055 +#define AIX_MIRROR_POOL_CHANGE 10
37057 +#define LV_SET_ACCESS _IOW ( 0xfe, 0x28, 1)
37058 +#define LV_SET_ALLOCATION _IOW ( 0xfe, 0x29, 1)
37059 +#define LV_SET_STATUS _IOW ( 0xfe, 0x2a, 1)
37060 +#define LV_BMAP _IOWR ( 0xfe, 0x30, 1)
37062 +#define LV_ACTIVE 0x01 /* lv_status */
37063 +#define LV_SPINDOWN 0x02 /* " */
37064 +#define LV_ERROR 0x99 /* " */
37066 +#define VG_ACTIVE 0x01 /* vg_status */
37068 +#define AIX_LV_READ 0x00 /* lv_access */
37069 +#define AIX_LV_WRITE 0x01 /* " */
37070 +#define EVMS_LV_NEW 0x10 // volume was created during the current discovery pass
37071 +#define EVMS_LV_INCOMPLETE 0x20 // volume has an incomplete LE map
37072 +#define EVMS_LV_INVALID 0x40 // volume has a memory-corruption problem
37075 +#define AIX_VG_DIRTY 0x01 // group has had a new PV added during this discovery
37076 +#define AIX_VG_INCOMPLETE 0x20 // volume group is incomplete
37078 +#define AIX_LVM_LVUNDEF 0 /* the logical volume is not defined to a */
37079 +/* volume group */
37080 +#define AIX_LVM_LVDEFINED 1 /* the logical volume is defined to a */
37081 +/* volume group */
37082 +#define AIX_LVM_LVSTALE 2 /* the logical volume has stale logical */
37084 +#define AIX_LVM_LVMIRBKP 4 /* the logical volume is an online mirror backup */
37085 +/* We are skipping '3' since it is used by CMDLVM_LVSTALE */
37086 +/* as an addition of LVM_LVDEFINE + LVM_LVSTALE, and is */
37087 +/* defined in src/bos/usr/sbin/lvm/include/ls.h */
37091 +#define LOG_PREFIX "--AIXlvm: "
37093 +// Entries in the list of physical volumes (PV)
37094 +// in a volume group (VG)
37096 +struct unique_id {
37103 +struct partition_list_entry {
37104 + struct evms_logical_node * logical_node;
37106 + u32 block_size; // bytes
37107 + u32 hard_sect_size; // bytes
37108 + struct partition_list_entry * next;
37112 +// Table for mapping logical extents (LE) to physical extents (PE)
37113 +struct pe_table_entry {
37114 + struct partition_list_entry * owning_pv;
37115 + u64 pe_sector_offset;
37119 +// Logical volumes (LV) in a volume group (VG)
37120 +struct aix_logical_volume {
37122 + u64 lv_size; // Sectors
37123 + u32 lv_access; // Flags: LV_READ, LV_WRITE, LN_NEW
37124 + u32 lv_status; // Flags: LV_ACTIVE, LV_SPINDOWN
37125 +// u32 lv_minor; // Device minor number
37126 + u32 mirror_copies; // Do we have mirroring and how many ?
37127 +// u32 mirror_number; // mirror number - which copy is this ?
37128 +// u32 mirror_iterations; // Which mirror should we be writing to ?
37130 + u32 stripe_size; // Sectors
37131 + u32 stripe_size_shift; // Number of bits to shift right instead of dividing by stripe_size
37132 + u32 pe_size; // Sectors
37133 + u32 pe_size_shift; // Number of bits to shift right instead of dividing by pe_size
37134 + u32 num_le; // Number of entries in the le_to_pe_map
37135 +// u32 new_volume; // Flag to indicate if this volume needs to be exported
37136 + struct aix_volume_group * group; // Pointer back to parent volume group
37137 + unsigned char name[EVMS_VOLUME_NAME_SIZE+1]; // Dev-tree volume name (eg: /dev/group0/vol0)
37138 + struct pe_table_entry * le_to_pe_map; // Mapping of logical to physical extents
37139 + struct pe_table_entry * le_to_pe_map_mir1; // Mapping of logical to physical extents for mirror 1
37140 + struct pe_table_entry * le_to_pe_map_mir2; // Mapping of logical to physical extents for mirror 2
37141 + struct evms_logical_node * volume_node; // Pointer to the parent EVMS node representing this volume
37145 +// Volume groups (VG)
37146 +struct aix_volume_group {
37147 + struct unique_id vg_id; // volume group number */
37148 + struct partition_list_entry * partition_list; // List of partitions/segments/PVs that make up this VG
37149 + struct aix_logical_volume ** volume_list; // Array of volumes found in this VG.
37150 + struct aix_volume_group * next; // Pointer to the next VG
37151 + struct vg_header * AIXvgh; // Pointer to valid data area on disk for the VG
37152 + s32 vgda_psn; // Which VGDA we should use
37153 +// u32 numpvs; // Number of PVs found on this VG.
37154 + u32 numlvs; // Number of LVs found on this VG.
37155 + u32 hard_sect_size; // The largest hard_sect_size and block_size
37156 + u32 block_size; // values of all partitions in this group.
37158 +// u32 lv_max; // maximum logical volumes */
37159 + u32 pe_size; // physical extent size in sectors */
37160 + u32 partition_count; // actual partitions found for this group
37161 + u32 CleanVGInfo; // Do we have a clean VG Info to work with ?
37162 + u32 vgda_len; // length of the volume group descriptor area */
37165 +struct aix_resync_struct {
37166 + u64 master_offset;
37167 + u64 slave1_offset;
37168 + u64 slave2_offset;
37169 + struct partition_list_entry * master_part; //
37170 + struct partition_list_entry * slave1_part; //
37171 + struct partition_list_entry * slave2_part; //
37172 + struct aix_logical_volume * resync_vol;
37173 + struct aix_logical_volume * next_resync_vol;
37176 +struct aix_mirror_bh {
37177 + atomic_t remaining;
37178 + s32 iteration; // 'have we finished' count, used from IRQ handlers
37179 + u32 le; // In case we have to flag this pp as stale later.
37183 + struct buffer_head *master_bh;
37184 + struct buffer_head bh_req;
37185 + struct aix_mirror_bh *mirror_bh_list;
37186 + struct evms_logical_node *node; // map to evms node (READ only)
37187 + struct evms_logical_node *mir_node1; //
37188 + struct evms_logical_node *mir_node2; //
37189 + struct aix_mirror_bh *next_r1; // next for retry or in free list
37190 + char sync_flag; // Flag for resyncing of mirrored PPs
37193 +struct aix_volume_resync_ioctl {
37194 + char object_name[EVMS_VOLUME_NAME_SIZE+1]; // Input - Name of bbr object from feature header
37198 +struct timestruc {
37204 +struct aix_ipl_rec_area {
37205 + u32 IPL_record_id; /* This physical volume contains a */
37206 + /* valid IPL record if and only if */
37207 + /* this field contains IPLRECID */
37209 +#define IPLRECID 0xc9c2d4c1 /* Value is EBCDIC 'IBMA' */
37211 + char reserved1[20];
37212 + u32 formatted_cap; /* Formatted capacity. The number of */
37213 + /* sectors available after formatting*/
37214 + /* The presence or absence of bad */
37215 + /* blocks does not alter this value. */
37217 + char last_head; /* THIS IS DISKETTE INFORMATION */
37218 + /* The number of heads minus 1. Heads*/
37219 + /* are numbered from 0 to last_head. */
37221 + char last_sector; /* THIS IS DISKETTE INFORMATION */
37222 + /* The number of sectors per track. */
37223 + /* Sectors are numbered from 1 to */
37224 + /* last_sector. */
37226 + char reserved2[6];
37228 + u32 boot_code_length; /* Boot code length in sectors. A 0 */
37229 + /* value implies no boot code present*/
37231 + u32 boot_code_offset; /* Boot code offset. Must be 0 if no */
37232 + /* boot code present, else contains */
37233 + /* byte offset from start of boot */
37234 + /* code to first instruction. */
37236 + u32 boot_lv_start; /* Contains the PSN of the start of */
37239 + u32 boot_prg_start; /* Boot code start. Must be 0 if no */
37240 + /* boot code present, else contains */
37241 + /* the PSN of the start of boot code.*/
37243 + u32 boot_lv_length; /* BLV length in sectors. */
37245 + u32 boot_load_add; /* 512 byte boundary load address for*/
37248 + char boot_frag; /* Boot code fragmentation flag. Must*/
37249 + /* be 0 if no fragmentation allowed, */
37250 + /* else must be 0x01. */
37252 + char boot_emulation; /* ROS network emulation flag */
37253 + /* 0x0 => not an emul support image */
37254 + /* 0x1 => ROS network emulation code */
37255 + /* 0x2 => AIX code supporting ROS emul*/
37257 + char reserved3[2];
37259 + u16 basecn_length; /* Number of sectors for base */
37260 + /* customization. Normal mode. */
37262 + u16 basecs_length; /* Number of sectors for base */
37263 + /* customization. Service mode. */
37265 + u32 basecn_start; /* Starting PSN value for base */
37266 + /* customization. Normal mode. */
37268 + u32 basecs_start; /* Starting PSN value for base */
37269 + /* customization. Service mode. */
37271 + char reserved4[24];
37273 + u32 ser_code_length; /* Service code length in sectors. */
37274 + /* A 0 value implies no service code */
37277 + u32 ser_code_offset; /* Service code offset. Must be 0 if */
37278 + /* no service code is present, else */
37279 + /* contains byte offset from start of*/
37280 + /* service code to first instruction.*/
37282 + u32 ser_lv_start; /* Contains the PSN of the start of */
37285 + u32 ser_prg_start; /* Service code start. Must be 0 if */
37286 + /* service code is not present, else */
37287 + /* contains the PSN of the start of */
37288 + /* service code. */
37290 + u32 ser_lv_length; /* SLV length in sectors. */
37292 + u32 ser_load_add; /* 512 byte boundary load address for*/
37293 + /* service code. */
37295 + char ser_frag; /* Service code fragmentation flag. */
37296 + /* Must be 0 if no fragmentation */
37297 + /* allowed, else must be 0x01. */
37299 + char ser_emulation; /* ROS network emulation flag */
37300 + /* 0x0 => not an emul support image */
37301 + /* 0x1 => ROS network emulation code */
37302 + /* 0x2 => AIX code supporting ROS emul*/
37304 + char reserved5[2];
37306 + struct unique_id pv_id; /* The unique identifier for this */
37307 + /* physical volume. */
37308 + char dummy[512 - 128 - sizeof(struct unique_id)];
37313 +/* structure which describes the physical volume LVM record */ {
37314 + u32 lvm_id; /* LVM id field which identifies whether the PV is a member of a volume group */
37316 +#define AIX_LVM_LVMID 0x5F4C564D /* LVM id field of ASCII "_LVM" */
37318 + struct unique_id vg_id; /* the id of the volume group to which this physical volume belongs */
37319 + u32 lvmarea_len; /* the length of the LVM reserved area */
37320 + u32 vgda_len; /* length of the volume group descriptor area */
37321 + s32 vgda_psn [2]; /* the physical sector numbers of the beginning of the volume group descriptor area copies on this disk */
37322 + s32 reloc_psn; /* the physical sector number of the beginning of a pool of blocks */
37323 + /* (located at the end of the PV) which are reserved for the relocation of bad blocks */
37324 + u32 reloc_len; /* the length in number of sectors of the pool of bad block relocation blocks */
37325 + s16 pv_num; /* the physical volume number within the volume group of this physical volume */
37326 + s16 pp_size; /* the size in bytes for the partition, expressed as a power of 2 (i.e., the partition size is 2 to the power pp_size) */
37327 + u32 vgsa_len; /* length of the volume group status area */
37328 + s32 vgsa_psn [2]; /* the physical sector numbers of the beginning of the volume group status area copies on this disk */
37329 + s16 version; /* the version number of this volume group descriptor and status area */
37331 +#define LVM_VERSION_1 1 /* first version - AIX 3.0 */
37332 +#define LVM_STRIPE_ENHANCE 2 /* version with striped lv's - AIX 4.1 */
37333 +#define LVM_1024_PPSIZE 3 /* ppsizes of 512 and 1024 */
37334 +#define LVM_GT_1016 4 /* version with support for > 1016 pps/pv */
37335 +#define LVM_MAX_VERSION LVM_GT_1016 /* max version # */
37337 + char res1 [450]; /* reserved area */
37343 +/* II.Volume Group Descriptor Area */
37345 +struct vgsa_area {
37346 + struct timestruc b_tmstamp; /* Beginning timestamp */
37347 + u32 pv_missing [(LVM_MAXPVS + (NBPI -1)) / NBPI]; /* Bit per PV */
37348 + unsigned char stalepp [LVM_MAXPVS] [VGSA_BT_PV];
37350 + char resv[10]; /* Padding */
37351 + struct timestruc e_tmstamp; /* Ending timestamp */
37355 +struct vg_header {
37356 + struct timestruc vg_timestamp; /* time of last update */
37357 + struct unique_id vg_id; /* unique id for volume group */
37358 + s16 numlvs; /* number of lvs in vg */
37359 + s16 maxlvs; /* max number of lvs allowed in vg */
37360 + s16 pp_size; /* size of pps in the vg */
37361 + s16 numpvs; /* number of pvs in the vg */
37362 + s16 total_vgdas; /* number of copies of vg */
37363 + /* descriptor area on disk */
37364 + s16 vgda_size; /* size of volume group descriptor */
37372 +struct lv_entries {
37373 + s16 lvname; /* name of LV */
37374 + s16 res1; /* reserved area */
37375 + s32 maxsize; /* maximum number of partitions allowed */
37376 + char lv_state; /* state of logical volume */
37377 + char mirror; /* none,single, or double */
37378 + s16 mirror_policy; /* type of writing used to write */
37379 + s32 num_lps; /* number of logical partitions on the lv */
37381 + char permissions; /* read write or read only */
37382 + char bb_relocation; /* specifies if bad block */
37383 + /* relocation is desired */
37384 + char write_verify; /* verify all writes to the LV */
37385 + char mirwrt_consist; /* mirror write consistency flag */
37386 + u16 stripe_exp; /* stripe size in exponent value */
37387 + u16 striping_width; /* stripe width */
37389 + u16 child_minor_num;
37390 + char res4[4]; /* reserved area on disk */
37394 +struct pv_header {
37395 + struct unique_id pv_id; /* unique identifier of PV */
37396 + u16 pp_count; /* number of physical partitions */
37398 + char pv_state; /* state of physical volume */
37399 + char res1; /* reserved area on disk */
37400 + s32 psn_part1; /* physical sector number of 1st pp */
37401 + s16 pvnum_vgdas;/* number of vg descriptor areas */
37402 + /* on the physical volume */
37403 + s16 pv_num; /* PV number */
37404 + u32 res2; /* reserved area on disk */
37408 +struct pp_entries {
37409 + s16 lv_index; /* index to lv pp is on */
37410 + s16 res_1; /* reserved area on disk */
37411 + u32 lp_num; /* log. part. number */
37412 + char copy; /* the copy of the logical partition */
37413 + /* that this pp is allocated for */
37414 + char pp_state; /* current state of pp */
37415 + char fst_alt_vol; /* pv where partition allocation for*/
37416 + /* first mirror begins */
37417 + char snd_alt_vol; /* pv where partition allocation for*/
37418 + /* second mirror begins */
37419 + s16 fst_alt_part; /* partition to begin first mirror */
37420 + s16 snd_alt_part; /*partition to begin second mirror */
37421 + u64 res_3; /* reserved area on disk */
37422 + u64 res_4; /* reserved area on disk */
37426 + char name[LVM_MAXLVS][LVM_NAMESIZ];
37429 +struct vg_trailer {
37430 + struct timestruc timestamp; /* time of last update */
37432 + /* MS Nibble = concurrent capable */
37433 + /* LS Nibble = concurrent auto-varyon */
37435 + s32 res_3; /* reserved area on disk */
37436 + u64 res_4; /* reserved area on disk */
37437 + u64 res_5; /* reserved area on disk */
37440 diff -Naur linux-2002-09-30/include/linux/evms/evms_bbr_k.h evms-2002-09-30/include/linux/evms/evms_bbr_k.h
37441 --- linux-2002-09-30/include/linux/evms/evms_bbr_k.h Wed Dec 31 18:00:00 1969
37442 +++ evms-2002-09-30/include/linux/evms/evms_bbr_k.h Wed Sep 25 15:04:22 2002
37445 + * Copyright (c) International Business Machines Corp., 2000
37447 + * This program is free software; you can redistribute it and/or modify
37448 + * it under the terms of the GNU General Public License as published by
37449 + * the Free Software Foundation; either version 2 of the License, or
37450 + * (at your option) any later version.
37452 + * This program is distributed in the hope that it will be useful,
37453 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
37454 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
37455 + * the GNU General Public License for more details.
37457 + * You should have received a copy of the GNU General Public License
37458 + * along with this program; if not, write to the Free Software
37459 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
37461 +/* linux/include/linux/evms/evms_bbr_k.h
37463 + * Kernel header file for Bad Block Relocation (BBR) Feature
37465 + * BBR feature is designed to remap I/O write failures to another safe location
37466 + * on disk. Note that most disk drives have BBR built into them; this means
37467 + * that our software BBR will be only activated when all hardware BBR
37468 + * replacement sectors have been used.
37471 +#ifndef __EVMS_BBR_K__
37472 +#define __EVMS_BBR_K__
37474 +#define EVMS_BBR_VERSION_MAJOR 1
37475 +#define EVMS_BBR_VERSION_MINOR 1
37476 +#define EVMS_BBR_VERSION_PATCHLEVEL 1
37478 +#define EVMS_BBR_COMMON_SERVICES_MAJOR 0
37479 +#define EVMS_BBR_COMMON_SERVICES_MINOR 6
37480 +#define EVMS_BBR_COMMON_SERVICES_PATCHLEVEL 0
37482 +#define EVMS_BBR_FEATURE_ID 6
37483 +#define EVMS_BBR_SIGNATURE 0x42627246 /* BbrF */
37484 +#define EVMS_BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
37486 +#define EVMS_BBR_ENTRIES_PER_SECT 31
37487 +#define BBR_POOL_NAME_LENGTH 20
37488 +#define BBR_STOP_REMAP (1<<0)
37489 +#define BBR_BH_USE_EVMS_CALLBACK (1<<0)
37491 +/* BBR direct ioctl commands.
37493 + * BBR_GET_INFO_CMD: Return total number of sectors that are currently
37494 + * remapped for the specified BBR object.
37495 + * BBR_STOP_REMAP_CMD: Stop remapping. Do not remap any new sectors or even
37496 + * honor any existing remaps for the specified BBR object
37497 + * until the next rediscover command is received.
37498 + * BBR_SECTOR_IO_CMD: Process an I/O from the engine directly through the
37499 + * specified BBR object in the kernel.
37501 +#define BBR_GET_INFO_CMD 1
37502 +#define BBR_STOP_REMAP_CMD 2
37503 +#define BBR_SECTOR_IO_CMD 3
37506 + * struct evms_bbr_table_entry
37507 + * @bad_sect: LBA of bad location.
37508 + * @replacement_sect: LBA of new location.
37510 + * Structure to describe one BBR remap.
37512 +struct evms_bbr_table_entry {
37514 + u64 replacement_sect;
37518 + * struct evms_bbr_table
37519 + * @signature: Signature on each BBR table sector.
37520 + * @crc: CRC for this table sector.
37521 + * @sequence_number: Used to resolve conflicts when primary and secondary
37522 + * tables do not match.
37523 + * @in_use_cnt: Number of in-use table entries.
37524 + * @entries: Actual table of remaps.
37526 + * Structure to describe each sector of the metadata table. Each sector in this
37527 + * table can describe 31 remapped sectors.
37529 +struct evms_bbr_table {
37532 + u32 sequence_number;
37534 + struct evms_bbr_table_entry entries[EVMS_BBR_ENTRIES_PER_SECT];
37538 + * struct evms_bbr_metadata
37539 + * @signature: 0 EVMS_BBR_SIGNATURE
37541 + * @block_size: 8 Block size in bytes.
37542 + * @flags: 12 Global flags used by BBR.
37543 + * @sequence_number: 16
37544 + * @start_sect_bbr_table: 24 LBA of start of BBR table.
37545 + * @nr_sects_bbr_table: 32 Number of sectors in the BBR table.
37546 + * @start_replacement_sect: 40 LBA of start of replacement sectors.
37547 + * @nr_replacement_blks: 48 Number of replacement sectors.
37550 + * On-disk metadata identifying an object as a BBR object.
37552 +struct evms_bbr_metadata {
37557 + u64 sequence_number;
37558 + u64 start_sect_bbr_table;
37559 + u64 nr_sects_bbr_table;
37560 + u64 start_replacement_sect;
37561 + u64 nr_replacement_blks;
37566 + * struct evms_notify_bbr
37567 + * @object_name: Input - Name of BBR object from feature header.
37568 + * @count: Output - Number of remapped sectors.
37569 + * @start_sect: Input - Start sector for sector_io.
37570 + * @nr_sect: Input - Number of sectors for sector_io.
37571 + * @buffer: Input/Output - Pointer to data buffer for sector_io.
37572 + * @rw: Input - READ or WRITE for sector_io.
37574 +struct evms_notify_bbr {
37575 + u8 object_name[EVMS_VOLUME_NAME_SIZE+1];
37584 + * struct bbr_runtime_remap
37586 + * Node in the binary tree used to keep track of remaps.
37588 +struct bbr_runtime_remap {
37589 + struct evms_bbr_table_entry remap;
37590 + struct bbr_runtime_remap * left;
37591 + struct bbr_runtime_remap * right;
37595 + * struct bbr_private
37596 + * @next: List of all bbr_private structures.
37597 + * @node: Output node.
37598 + * @source: Consumed node.
37599 + * @bbr_table: Copy of metadata table.
37600 + * @lba_table1: LBA of primary BBR table.
37601 + * @lba_table2: LBA of secondary BBR table.
37602 + * @nr_sects_bbr_table: Size of each BBR table.
37603 + * @nr_replacement_blks: Number of replacement sectors.
37604 + * @start_replacement_sect: LBA of start of replacement sectors.
37605 + * @blksize_in_sects: Size of each sector.
37606 + * @in_use_replacement_blks: Current number of remaps.
37607 + * @remap_root: Binary tree containing all remaps.
37608 + * @bbr_id_lock: Lock for the binary tree.
37609 + * @flags: BBR_STOP_REMAP
37611 +struct bbr_private {
37612 + struct bbr_private * next;
37613 + struct evms_logical_node * node;
37614 + struct evms_logical_node * source;
37615 + struct evms_bbr_table * bbr_table;
37618 + u64 nr_sects_bbr_table;
37619 + u64 nr_replacement_blks;
37620 + u64 start_replacement_sect;
37621 + u32 blksize_in_sects;
37622 + atomic_t in_use_replacement_blks;
37623 + struct bbr_runtime_remap * remap_root;
37624 + spinlock_t bbr_id_lock;
37629 + * struct bbr_io_buffer
37630 + * @bbr_io_list: Thread's list of bbr_io_buf's.
37631 + * @bbr_id: Object for this request.
37632 + * @bh: Original buffer_head.
37633 + * @org_end_io: Saved callback address from original buffer_head.
37634 + * @org_private: Saved private data address from original buffer_head.
37635 + * @org_rsector: Saved sector value from original buffer_head.
37636 + * @org_dev: Saved b_rdev field from original buffer_head.
37637 + * @complete: Completion structure used by init_io.
37638 + * @rw: READ or WRITE.
37639 + * @rc: Return code from bbr_io_handler.
37641 + * Structure used to track each write request.
37643 +struct bbr_io_buffer {
37644 + struct list_head bbr_io_list;
37645 + struct bbr_private * bbr_id;
37646 + struct buffer_head * bh;
37647 + void (* org_end_io)(struct buffer_head *bh, int uptodate);
37648 + void * org_private;
37650 + struct completion * complete;
37656 +#ifdef EVMS_BBR_DEBUG
37657 +static void print_meta_data(struct evms_bbr_metadata * md);
37658 +static void print_bbr_table_sector(struct evms_bbr_table * bbr_table);
37659 +static void print_remap_list(struct bbr_private * bbr_id);
37660 +#define BBR_DEBUG_PRINT_META_DATA(md) print_meta_data(md)
37661 +#define BBR_DEBUG_PRINT_TABLE_SECTOR(table) print_bbr_table_sector(table)
37662 +#define BBR_DEBUG_PRINT_REMAP_LIST(bbr_id) print_remap_list(bbr_id)
37664 +#define BBR_DEBUG_PRINT_META_DATA(md)
37665 +#define BBR_DEBUG_PRINT_TABLE_SECTOR(table)
37666 +#define BBR_DEBUG_PRINT_REMAP_LIST(bbr_id)
37670 diff -Naur linux-2002-09-30/include/linux/evms/evms_drivelink.h evms-2002-09-30/include/linux/evms/evms_drivelink.h
37671 --- linux-2002-09-30/include/linux/evms/evms_drivelink.h Wed Dec 31 18:00:00 1969
37672 +++ evms-2002-09-30/include/linux/evms/evms_drivelink.h Fri Aug 16 16:43:11 2002
37674 +/* -*- linux-c -*- */
37677 + * Copyright (c) International Business Machines Corp., 2000
37679 + * This program is free software; you can redistribute it and/or modify
37680 + * it under the terms of the GNU General Public License as published by
37681 + * the Free Software Foundation; either version 2 of the License, or
37682 + * (at your option) any later version.
37684 + * This program is distributed in the hope that it will be useful,
37685 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
37686 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
37687 + * the GNU General Public License for more details.
37689 + * You should have received a copy of the GNU General Public License
37690 + * along with this program; if not, write to the Free Software
37691 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
37694 + * linux/include/linux/evms_drvlink.h
37696 + * EVMS DriveLink Feature kernel header file
37700 +#ifndef __EVMS_DRIVELINK_INCLUDED__
37701 +#define __EVMS_DRIVELINK_INCLUDED__
37703 +#define EVMS_DRIVELINK_FEATURE_ID 1
37704 +#define EVMS_DRIVELINK_SIGNATURE 0x4C767244 //DrvL
37705 +#define EVMS_DRIVELINK_MAX_ENTRIES 60
37708 + * feature data version defines
37710 +#define DRIVELINK_METADATA_MAJOR 2
37711 +#define DRIVELINK_METADATA_MINOR 0
37712 +#define DRIVELINK_METADATA_PATCHLEVEL 0
37714 +static struct evms_version metadata_ver = {
37715 + .major = DRIVELINK_METADATA_MAJOR,
37716 + .minor = DRIVELINK_METADATA_MINOR,
37717 + .patchlevel = DRIVELINK_METADATA_PATCHLEVEL
37721 + * struct evms_dl_ordering_table_entry - ordering table entry structure definition
37722 + * @child_sn: child serial number
37723 + * @child_size: in sectors
37725 + * ordering table entry structure definition
37727 +struct evms_dl_ordering_table_entry {
37728 + u64 child_serial_number;
37733 + * struct evms_drivelink_metadata - on-disk metadata definition
37734 + * @signature: drivelink metadata magic number
37735 + * @crc: crc of entire structure
37736 + * @version: drivelink metadata version
37738 + * @sequence_number: used to determine most recent redundant data
37739 + * @child_sn: child object serial number
37740 + * @parent_sn: parent object serial number
37741 + * @child_count: count of child objects of parent
37742 + * @pad: used for alignment of following table
37743 + * @ordering_table: table of child ordering entries
37745 + * drivelink on-disk metadata definition
37747 +struct evms_drivelink_metadata {
37750 + struct evms_version version;
37752 + u64 sequence_number;
37753 + u64 child_serial_number;
37754 + u64 parent_serial_number;
37757 + struct evms_dl_ordering_table_entry
37758 + ordering_table[EVMS_DRIVELINK_MAX_ENTRIES];
37763 + * struct runtime_entry - in-memory metadata entry description
37764 + * @block_size: largest block size of all children
37765 + * @voffset: relative offset of child object within parent object (in 512 byte units)
37766 + * @vsize: child object size (in 512 byte units)
37767 + * @child_node: child storage object
37768 + * @child_metadata: child's on-disk metadata
37770 + * drivelink's in-memory metadata entry description
37772 +struct runtime_entry {
37776 + struct evms_logical_node *child_node;
37777 + struct evms_drivelink_metadata *child_metadata;
37781 + * struct runtime_data - in-memory metadata description
37782 + * @block_size: largest block size of all children
37783 + * @voffset: relative offset of child object within parent object (in 512 byte units)
37784 + * @vsize: child object size (in 512 byte units)
37785 + * @child_node: child storage object
37786 + * @child_metadata: child's on-disk metadata
37788 + * drivelink's in-memory metadata description
37790 +struct runtime_data {
37794 + struct runtime_entry *child_table;
37799 diff -Naur linux-2002-09-30/include/linux/evms/evms_ecr.h evms-2002-09-30/include/linux/evms/evms_ecr.h
37800 --- linux-2002-09-30/include/linux/evms/evms_ecr.h Wed Dec 31 18:00:00 1969
37801 +++ evms-2002-09-30/include/linux/evms/evms_ecr.h Fri Aug 16 16:19:56 2002
37805 + * Copyright (c) International Business Machines Corp., 2000
37807 + * This program is free software; you can redistribute it and/or modify
37808 + * it under the terms of the GNU General Public License as published by
37809 + * the Free Software Foundation; either version 2 of the License, or
37810 + * (at your option) any later version.
37812 + * This program is distributed in the hope that it will be useful,
37813 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
37814 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
37815 + * the GNU General Public License for more details.
37817 + * You should have received a copy of the GNU General Public License
37818 + * along with this program; if not, write to the Free Software
37819 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
37823 + * linux/include/linux/evms_ecr.h
37825 + * EVMS Cluster enablement kernel header file
37829 +#ifndef __EVMS_ECR__
37831 +#define __EVMS_ECR__
37833 +#define ECR_SUCCESS 0
37834 +#define ECR_FAIL -1
37837 + * Beginning of group messaging API
37839 +typedef int ecr_group_t;
37840 +typedef int ecr_nodeid_t;
37841 +typedef void ecr_cred_t;
37842 +typedef void ecr_instance_t;
37843 +typedef void ecr_message_t;
37845 +typedef enum ecr_type_s {
37846 + ECR_GROUP_START, /* 0th entry is reserved */
37847 + ECR_P2P, /* Point to Point message type */
37848 + ECR_BROADCAST, /* Broadcast message type */
37849 + ECR_ATOMIC_EXECUTE, /* Atomic execute type */
37850 + ECR_GROUP_LAST /* Just a last enum type, not a message type */
37853 +typedef struct ecr_table_s {
37854 + void (*join) (ecr_nodeid_t, uint, ecr_nodeid_t *, ecr_instance_t *);
37855 + int (*can_join)(ecr_nodeid_t, ecr_cred_t *, size_t, ecr_instance_t *);
37856 + void (*leave) (ecr_nodeid_t, ecr_instance_t *);
37857 + void (*recover)(ecr_nodeid_t, ecr_instance_t *);
37858 + void (*message)(ecr_message_t *, ecr_type_t, ecr_nodeid_t,
37859 + void *, size_t, ecr_instance_t *);
37860 + void (*vol_leave)(ecr_nodeid_t, ecr_instance_t *);
37864 +#define ECR_GROUPNAME_MAX_SIZE NAME_SIZE /* maximum size of a group name */
37866 +ecr_group_t ecr_group_join(char *, ecr_table_t *, ecr_cred_t *, size_t,
37867 + ecr_instance_t *);
37868 +void ecr_group_leave(ecr_group_t);
37869 +int ecr_group_send(ecr_group_t, ecr_nodeid_t, void *, size_t,
37870 + ecr_instance_t *,
37871 + void callback(int, ecr_instance_t *));
37872 +int ecr_group_send_wait(ecr_group_t, ecr_nodeid_t, void *, size_t,
37874 +int ecr_group_broadcast(ecr_group_t, void *, size_t, ecr_instance_t *,
37875 + void callback(u_char, ecr_instance_t *));
37876 +int ecr_group_broadcast_wait(ecr_group_t, void *, size_t, u_char *);
37877 +int ecr_group_atomic_execute(ecr_group_t, void *, size_t,
37878 + ecr_instance_t *,
37879 + void callback(ecr_instance_t *));
37880 +int ecr_group_atomic_execute_wait(ecr_group_t, void *, size_t);
37881 +void ecr_group_success_response(ecr_message_t *);
37882 +void ecr_group_failure_response(ecr_message_t *, int);
37887 + * Beginning of distributed lock API
37890 +typedef int ecr_lock_t;
37891 +typedef enum ecr_lock_mode_s {
37892 + ECR_LOCK_START, /* 0th entry is reserved */
37893 + ECR_LOCK_CONCURRENT, /* concurrent access */
37894 + ECR_LOCK_EXCLUSIVE, /* exclusive access */
37895 + ECR_LOCK_LAST /* Just a last enum type, not a lock type */
37896 +} ecr_lock_mode_t;
37898 +typedef u_char ecr_mode_t;
37901 +#define ECR_LOCKNAME_MAX_SIZE NAME_SIZE /* maximum size of a lock name */
37902 +#define ECR_BLOCK 1 /* waitflag set */
37904 +ecr_lock_t ecr_lock_create(char * /* lock name */);
37905 +int ecr_lock(ecr_lock_t, u64, u64, ecr_lock_mode_t,
37906 + u_char /*waitflag*/);
37907 +int ecr_unlock(ecr_lock_t, u64, u64);
37909 +#endif /* __EVMS_ECR__ */
37910 diff -Naur linux-2002-09-30/include/linux/evms/evms_ioctl.h evms-2002-09-30/include/linux/evms/evms_ioctl.h
37911 --- linux-2002-09-30/include/linux/evms/evms_ioctl.h Wed Dec 31 18:00:00 1969
37912 +++ evms-2002-09-30/include/linux/evms/evms_ioctl.h Thu Sep 26 11:55:45 2002
37914 +/* -*- linux-c -*- */
37917 + * Copyright (c) International Business Machines Corp., 2000
37919 + * This program is free software; you can redistribute it and/or modify
37920 + * it under the terms of the GNU General Public License as published by
37921 + * the Free Software Foundation; either version 2 of the License, or
37922 + * (at your option) any later version.
37924 + * This program is distributed in the hope that it will be useful,
37925 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
37926 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
37927 + * the GNU General Public License for more details.
37929 + * You should have received a copy of the GNU General Public License
37930 + * along with this program; if not, write to the Free Software
37931 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
37934 + * linux/include/linux/evms.h
37936 + * EVMS public kernel header file
37940 +#ifndef __EVMS_IOCTL_INCLUDED__
37941 +#define __EVMS_IOCTL_INCLUDED__
37943 +#include <linux/hdreg.h>
37945 +/* IOCTL interface version definitions */
37946 +#define EVMS_IOCTL_INTERFACE_MAJOR 11
37947 +#define EVMS_IOCTL_INTERFACE_MINOR 3
37948 +#define EVMS_IOCTL_INTERFACE_PATCHLEVEL 0
37950 +/* IOCTL definitions */
37951 +enum evms_ioctl_cmds {
37952 + /* version commands */
37953 + EVMS_GET_IOCTL_VERSION_NUMBER = 0,
37954 + EVMS_GET_VERSION_NUMBER,
37956 + /* EVMS internal commands */
37957 + EVMS_GET_DISK_LIST_NUMBER = 0x40,
37958 + EVMS_CHECK_MEDIA_CHANGE_NUMBER,
37959 + EVMS_REVALIDATE_DISK_NUMBER,
37960 + EVMS_OPEN_VOLUME_NUMBER,
37961 + EVMS_CLOSE_VOLUME_NUMBER,
37962 + EVMS_QUIESCE_VOLUME_NUMBER,
37963 + EVMS_CHECK_DEVICE_STATUS_NUMBER,
37964 + EVMS_UPDATE_DEVICE_INFO_NUMBER,
37966 + /* configuration commands */
37967 + EVMS_GET_INFO_LEVEL_NUMBER = 0x80,
37968 + EVMS_SET_INFO_LEVEL_NUMBER,
37969 + EVMS_REDISCOVER_VOLUMES_NUMBER,
37970 + EVMS_DELETE_VOLUME_NUMBER,
37971 + EVMS_PLUGIN_IOCTL_NUMBER,
37972 + EVMS_PROCESS_NOTIFY_EVENT_NUMBER,
37973 + /* query info commands */
37974 + EVMS_GET_LOGICAL_DISK_NUMBER = 0xC0,
37975 + EVMS_GET_LOGICAL_DISK_INFO_NUMBER,
37976 + EVMS_SECTOR_IO_NUMBER,
37977 + EVMS_GET_MINOR_NUMBER,
37978 + EVMS_GET_VOLUME_DATA_NUMBER,
37979 + EVMS_GET_PLUGIN_NUMBER,
37980 + EVMS_COMPUTE_CSUM_NUMBER,
37981 + EVMS_GET_BMAP_NUMBER,
37982 + EVMS_CHECK_MOUNT_STATUS_NUMBER,
37983 + EVMS_CHECK_OPEN_STATUS_NUMBER,
37984 + /* commands for non-EVMS apps */
37985 + EVMS_GET_VOL_STRIPE_INFO_NUMBER = 0xF0,
37988 +/* version commands */
37989 +#define EVMS_GET_IOCTL_VERSION_STRING "EVMS_GET_IOCTL_VERSION"
37990 +#define EVMS_GET_IOCTL_VERSION _IOR(EVMS_MAJOR, EVMS_GET_IOCTL_VERSION_NUMBER, struct evms_version)
37992 +#define EVMS_GET_VERSION_STRING "EVMS_GET_VERSION"
37993 +#define EVMS_GET_VERSION _IOR(EVMS_MAJOR, EVMS_GET_VERSION_NUMBER, struct evms_version)
37997 +/* EVMS internal commands */
37998 +#define EVMS_GET_DISK_LIST_STRING "EVMS_GET_DISK_LIST"
37999 +#define EVMS_GET_DISK_LIST _IOWR(EVMS_MAJOR, EVMS_GET_DISK_LIST_NUMBER, struct evms_list_node **)
38001 +#define EVMS_CHECK_MEDIA_CHANGE_STRING "EVMS_CHECK_MEDIA_CHANGE"
38002 +#define EVMS_CHECK_MEDIA_CHANGE _IO(EVMS_MAJOR, EVMS_CHECK_MEDIA_CHANGE_NUMBER)
38004 +#define EVMS_REVALIDATE_DISK_STRING "EVMS_REVALIDATE_DISK"
38005 +#define EVMS_REVALIDATE_DISK _IO(EVMS_MAJOR, EVMS_REVALIDATE_DISK_NUMBER)
38007 +#define EVMS_OPEN_VOLUME_STRING "EVMS_OPEN_VOLUME"
38008 +#define EVMS_OPEN_VOLUME _IO(EVMS_MAJOR, EVMS_OPEN_VOLUME_NUMBER)
38010 +#define EVMS_CLOSE_VOLUME_STRING "EVMS_CLOSE_VOLUME"
38011 +#define EVMS_CLOSE_VOLUME _IO(EVMS_MAJOR, EVMS_CLOSE_VOLUME_NUMBER)
38014 + * struct evms_quiesce_vol_pkt - ioctl packet definition
38015 + * @command: 0 = unquiesce, 1 = quiesce
38016 + * @minor: minor device number of target volume
38017 + * @do_vfs: 0 = do nothing, 1 = also perform equivalent VFS operation
38018 + * @status: returned operation status
38020 + * ioctl packet definition for EVMS_QUIESCE_VOLUME
38022 +struct evms_quiesce_vol_pkt {
38029 + * defines for evms_quiesce_vol_pkt.command field
38031 +#define EVMS_UNQUIESCE 0
38032 +#define EVMS_QUIESCE 1
38034 + * defines for evms_quiesce_vol_pkt.do_vfs field
38035 + * located below struct evms_delete_vol_pkt definition
38038 +#define EVMS_QUIESCE_VOLUME_STRING "EVMS_QUIESCE_VOLUME"
38039 +#define EVMS_QUIESCE_VOLUME _IOR(EVMS_MAJOR, EVMS_QUIESCE_VOLUME_NUMBER, struct evms_quiesce_vol_pkt)
38041 +#define EVMS_CHECK_DEVICE_STATUS_STRING "EVMS_CHECK_DEVICE_STATUS"
38042 +#define EVMS_CHECK_DEVICE_STATUS _IOR(EVMS_MAJOR, EVMS_CHECK_DEVICE_STATUS_NUMBER, int)
38044 +#define EVMS_UPDATE_DEVICE_INFO_STRING "EVMS_UPDATE_DEVICE_INFO"
38045 +#define EVMS_UPDATE_DEVICE_INFO _IO(EVMS_MAJOR, EVMS_UPDATE_DEVICE_INFO_NUMBER)
38049 +/* configuration commands */
38050 +#define EVMS_GET_INFO_LEVEL_STRING "EVMS_GET_INFO_LEVEL"
38051 +#define EVMS_GET_INFO_LEVEL _IOR(EVMS_MAJOR, EVMS_GET_INFO_LEVEL_NUMBER, int)
38053 +#define EVMS_SET_INFO_LEVEL_STRING "EVMS_SET_INFO_LEVEL"
38054 +#define EVMS_SET_INFO_LEVEL _IOW(EVMS_MAJOR, EVMS_SET_INFO_LEVEL_NUMBER, int)
38057 + * struct evms_rediscover_pkt - rediscover volume ioctl packet definition
38058 + * @status: return operation status
38059 + * @drive_count: count of drives being probed, 0xffffffff for all disks
38060 + * @drive_array: array of drive handles to be probed
38062 + * ioctl packet definition for EVMS_REDISCOVER_VOLUMES ioctl
38064 +struct evms_rediscover_pkt {
38067 + u64 *drive_array;
38070 + * defines for evms_delete_vol_pkt.command field
38072 +#define EVMS_SOFT_DELETE 0
38073 +#define EVMS_HARD_DELETE 1
38075 + * defines evms_rediscover_pkt.drive_count field
38077 +#define REDISCOVER_ALL_DEVICES 0xFFFFFFFF
38079 +#define EVMS_REDISCOVER_VOLUMES_STRING "EVMS_REDISCOVER_VOLUMES"
38080 +#define EVMS_REDISCOVER_VOLUMES _IOWR(EVMS_MAJOR, EVMS_REDISCOVER_VOLUMES_NUMBER, struct evms_rediscover_pkt)
38082 +/* field: command: defines */
38085 + * struct evms_delete_vol_pkt - delete volume ioctl packet definition
38086 + * @command: 0 = soft delete, 1 = hard delete
38087 + * @minor: minor device num of target volume
38088 + * @do_vfs: 0 = do nothing, 1 = perform VFS operation(s)
38089 + * @associative_minor: optional minor device num of associative volume, 0 when unused
38090 + * @status: returned operation status
38092 + * ioctl packet definition for EVMS_DELETE_VOLUME ioctl
38094 +struct evms_delete_vol_pkt {
38098 + s32 associative_minor;
38102 + * field evms_delete_vol_pkt defines
38103 + * @EVMS_VFS_DO_NOTHING:
38106 + * NOTE: these defines are also used with evms_quiesce_vol_pkt.
38108 +#define EVMS_VFS_DO_NOTHING 0
38109 +#define EVMS_VFS_DO 1
38111 +#define EVMS_DELETE_VOLUME_STRING "EVMS_DELETE_VOLUME"
38112 +#define EVMS_DELETE_VOLUME _IOR(EVMS_MAJOR, EVMS_DELETE_VOLUME_NUMBER, struct evms_delete_vol_pkt)
38115 + * struct evms_plugin_ioctl_pkt - generic plugin ioctl packet definition
38116 + * @feature_id: plugin ID of feature to receive this ioctl
38117 + * @feature_command: feature specific ioctl command
38118 + * @status: 0 = completed, 0 != error
38119 + * @feature_ioctl_data: ptr to feature specific ioctl struct
38121 + * ioctl packet definition for EVMS_PLUGIN_IOCTL ioctl
38123 +struct evms_plugin_ioctl_pkt {
38124 + ulong feature_id;
38125 + s32 feature_command;
38127 + void *feature_ioctl_data;
38130 +#define EVMS_PLUGIN_IOCTL_STRING "EVMS_PLUGIN_IOCTL"
38131 +#define EVMS_PLUGIN_IOCTL _IOR(EVMS_MAJOR, EVMS_PLUGIN_IOCTL_NUMBER, struct evms_plugin_ioctl_pkt)
38134 + * struct evms_event - evms event structure
38135 + * @pid: PID to act on
38136 + * @eventid: event id to respond to
38137 + * @signo: signal # to send when event occurs
38139 + * contains process event notification info
38141 +struct evms_event {
38147 + * field evms_event_pkt.eventid defines
38149 +#define EVMS_EVENT_END_OF_DISCOVERY 0
38152 + * struct evms_notify_pkt - evms event notification ioctl packet definition
38153 + * @command: 0 = unregister, 1 = register
38154 + * @eventry: event structure
38155 + * @status: returned operation status
38157 + * ioctl packet definition for EVMS_PROCESS_NOTIFY_EVENT ioctl
38159 +struct evms_notify_pkt {
38161 + struct evms_event eventry;
38165 + * field evms_notify_pkt.command defines
38167 +#define EVMS_EVENT_UNREGISTER 0
38168 +#define EVMS_EVENT_REGISTER 1
38170 +#define EVMS_PROCESS_NOTIFY_EVENT_STRING "EVMS_PROCESS_NOTIFY_EVENT"
38171 +#define EVMS_PROCESS_NOTIFY_EVENT _IOWR(EVMS_MAJOR, EVMS_PROCESS_NOTIFY_EVENT_NUMBER, struct evms_notify_pkt)
38173 +/* query info commands */
38176 + * struct evms_user_disk_pkt - get disk handle ioctl packet definition
38177 + * @command: 0 = first disk, 1 = next disk
38178 + * @status: 0 = no more disks, 1 = valid disk info
38179 + * @disk_handle: only valid when status == 1
38181 + * ioctl packet definition for EVMS_GET_LOGICAL_DISK ioctl
38183 +struct evms_user_disk_pkt {
38189 + * field evms_user_disk_pkt.command defines
38191 +#define EVMS_FIRST_DISK 0
38192 +#define EVMS_NEXT_DISK 1
38194 + * field evms_user_disk_pkt.status defines
38196 +#define EVMS_DISK_INVALID 0
38197 +#define EVMS_DISK_VALID 1
38199 +#define EVMS_GET_LOGICAL_DISK_STRING "EVMS_GET_LOGICAL_DISK"
38200 +#define EVMS_GET_LOGICAL_DISK _IOWR(EVMS_MAJOR, EVMS_GET_LOGICAL_DISK_NUMBER, struct evms_user_disk_pkt)
38203 + * evms_user_disk_info_pkt - disk info packet definition
38204 + * @status: return operation status
38205 + * @flags: device characteristics
38206 + * @disk_handle: kernel handle to specified device
38207 + * @disk_dev: kernel device info, used by MD plugin
38208 + * @geometry: reported device geometry
38209 + * @block_size: reported block size
38210 + * @hardsect_size: reported physical sector size
38211 + * @total_sectors: size of device in 512 byte units
38212 + * @disk_name: legacy name for the device
38214 + * ioctl packet definition for EVMS_GET_LOGICAL_DISK_INFO ioctl
38216 +struct evms_user_disk_info_pkt {
38223 + u64 geo_cylinders;
38225 + u32 hardsect_size;
38226 + u64 total_sectors;
38227 + u8 disk_name[EVMS_VOLUME_NAME_SIZE + 1];
38230 + * field evms_user_disk_info_pkt.flags define in evms.h
38233 +#define EVMS_GET_LOGICAL_DISK_INFO_STRING "EVMS_GET_LOGICAL_DISK_INFO"
38234 +#define EVMS_GET_LOGICAL_DISK_INFO _IOWR(EVMS_MAJOR, EVMS_GET_LOGICAL_DISK_INFO_NUMBER, struct evms_user_disk_info_pkt)
38237 + * struct evms_sector_io_pkt - sector io ioctl packet definition
38238 + * @disk_handle: disk handle of target device
38239 + * @io_flag: 0 = read, 1 = write
38240 + * @starting_sector: disk relative starting sector
38241 + * @sector_count: count of sectors
38242 + * @buffer_address: user buffer address
38243 + * @status: return operation status
38245 + * ioctl packet definition for EVMS_SECTOR_IO ioctl
38247 +struct evms_sector_io_pkt {
38250 + u64 starting_sector;
38251 + u64 sector_count;
38252 + u8 *buffer_address;
38256 + * field evms_sector_io_pkt.io_flag defines
38258 +#define EVMS_SECTOR_IO_READ 0
38259 +#define EVMS_SECTOR_IO_WRITE 1
38261 +#define EVMS_SECTOR_IO_STRING "EVMS_SECTOR_IO"
38262 +#define EVMS_SECTOR_IO _IOWR(EVMS_MAJOR, EVMS_SECTOR_IO_NUMBER, struct evms_sector_io_pkt)
38265 + * struct evms_user_minor_pkt - get a list of device minors, one at a time
38266 + * @command: 0 = first volume, 1 = next volume
38267 + * @status: returned operation status
38268 + * @minor: returned minor number, only valid when status == 1
38270 + * ioctl packet definition for EVMS_GET_MINOR ioctl
38272 +struct evms_user_minor_pkt {
38278 + * field evms_user_minor_pkt.command defines
38280 +#define EVMS_FIRST_VOLUME 0
38281 +#define EVMS_NEXT_VOLUME 1
38283 + * field evms_user_minor_pkt.status defines
38285 +#define EVMS_VOLUME_INVALID 0
38286 +#define EVMS_VOLUME_VALID 1
38288 +#define EVMS_GET_MINOR_STRING "EVMS_GET_MINOR"
38289 +#define EVMS_GET_MINOR _IOWR(EVMS_MAJOR, EVMS_GET_MINOR_NUMBER, struct evms_user_minor_pkt)
38292 + * struct evms_volume_data_pkt - volume data packet definition
38293 + * @minor: minor device number of target volume
38294 + * @flags: returned volume characteristics
38295 + * @volume_name: returned volume name
38296 + * @status: returned operation status
38298 + * ioctl packet definition for EVMS_GET_VOLUME_DATA ioctl
38300 +struct evms_volume_data_pkt {
38303 + u8 volume_name[EVMS_VOLUME_NAME_SIZE + 1];
38307 + * field evms_volume_data_pkt.flags defines found in evms_common.h
38310 +#define EVMS_GET_VOLUME_DATA_STRING "EVMS_GET_VOLUME_DATA"
38311 +#define EVMS_GET_VOLUME_DATA _IOWR(EVMS_MAJOR, EVMS_GET_VOLUME_DATA_NUMBER, struct evms_volume_data_pkt)
38314 + * struct evms_kernel_plugin_pkt - get kernel plugin ioctl packet definition
38315 + * @command: 0 = first plugin, 1 = next plugin
38316 + * @id: returned plugin id
38317 + * @version: returned plugin version info
38318 + * @status: returned operation status
38320 + * ioctl packet definition for EVMS_GET_PLUGIN ioctl
38322 +struct evms_kernel_plugin_pkt {
38325 + struct evms_version version;
38329 + * field evms_kernel_plugin_pkt.command defines
38331 +#define EVMS_FIRST_PLUGIN 0
38332 +#define EVMS_NEXT_PLUGIN 1
38334 + * field evms_kernel_plugin_pkt.status defines
38336 +#define EVMS_PLUGIN_INVALID 0
38337 +#define EVMS_PLUGIN_VALID 1
38339 +#define EVMS_GET_PLUGIN_STRING "EVMS_GET_PLUGIN"
38340 +#define EVMS_GET_PLUGIN _IOWR(EVMS_MAJOR, EVMS_GET_PLUGIN_NUMBER, struct evms_kernel_plugin_pkt)
38343 + * struct evms_compute_csum_pkt - compute checksum ioctl packet definition
38344 + * @buffer_address:
38350 + * ioctl packet definition for EVMS_COMPUTE_CSUM ioctl
38352 +struct evms_compute_csum_pkt {
38353 + u8 *buffer_address;
38360 +#define EVMS_COMPUTE_CSUM_STRING "EVMS_COMPUTE_CSUM"
38361 +#define EVMS_COMPUTE_CSUM _IOWR(EVMS_MAJOR, EVMS_COMPUTE_CSUM_NUMBER, struct evms_compute_csum_pkt)
38364 + * struct evms_get_bmap_pkt - get bmap data ioctl packet definition
38365 + * @rsector: input, volume relative rsector value
38366 + * output, disk relative rsector value
38367 + * @dev: output, physical device
38368 + * @status: output, operation status
38370 + * ioctl packet definition for EVMS_GET_BMAP ioctl
38372 +struct evms_get_bmap_pkt {
38378 +#define EVMS_GET_BMAP_STRING "EVMS_GET_BMAP"
38379 +#define EVMS_GET_BMAP _IOWR(EVMS_MAJOR, EVMS_GET_BMAP_NUMBER, struct evms_get_bmap_pkt)
38382 + * struct evms_mount_status_pkt - ioctl packet definition
38383 + * @minor: input, minor of volume to check
38384 + * @mounted: output, TRUE if mounted, FALSE if not
38385 + * @status: output, operation completion status
38387 + * ioctl packet definition for EVMS_CHECK_MOUNT_STATUS ioctl.
38389 +struct evms_mount_status_pkt {
38395 +#define EVMS_CHECK_MOUNT_STATUS_STRING "EVMS_CHECK_MOUNT_STATUS"
38396 +#define EVMS_CHECK_MOUNT_STATUS _IOWR(EVMS_MAJOR, EVMS_CHECK_MOUNT_STATUS_NUMBER, struct evms_mount_status_pkt)
38399 + * struct evms_open_status_pkt - ioctl packet definition
38400 + * @minor: input, minor of volume to check
38401 + * @opens: output, 0 (FALSE) if not, count (TRUE) of opens
38402 + * @status: output, operation completion status
38404 + * ioctl packet definition for EVMS_CHECK_OPEN_STATUS ioctl.
38406 +struct evms_open_status_pkt {
38412 +#define EVMS_CHECK_OPEN_STATUS_STRING "EVMS_CHECK_OPEN_STATUS"
38413 +#define EVMS_CHECK_OPEN_STATUS _IOWR(EVMS_MAJOR, EVMS_CHECK_OPEN_STATUS_NUMBER, struct evms_open_status_pkt)
38416 + * struct evms_vol_stripe_info_pkt - ioctl packet definition
38417 + * @size: the stripe unit specified in 512 byte block units
38418 + * @width: the number of stripe members or RAID data disks
38420 + * ioctl packet definition for EVMS_GET_VOL_STRIPE_INFO ioctl.
38422 +struct evms_vol_stripe_info_pkt {
38427 +#define EVMS_GET_VOL_STRIPE_INFO_STRING "EVMS_GET_VOL_STRIPE_INFO"
38428 +#define EVMS_GET_VOL_STRIPE_INFO _IOR(EVMS_MAJOR, EVMS_GET_VOL_STRIPE_INFO_NUMBER, struct evms_vol_stripe_info_pkt)
38430 diff -Naur linux-2002-09-30/include/linux/evms/evms_linear.h evms-2002-09-30/include/linux/evms/evms_linear.h
38431 --- linux-2002-09-30/include/linux/evms/evms_linear.h Wed Dec 31 18:00:00 1969
38432 +++ evms-2002-09-30/include/linux/evms/evms_linear.h Tue Aug 6 01:03:24 2002
38434 +#ifndef __EVMS_LINEAR_H
38435 +#define __EVMS_LINEAR_H
38437 +#include <linux/evms/evms_md.h>
38440 + struct evms_logical_node *node;
38442 + unsigned long size;
38443 + unsigned long offset;
38446 +typedef struct dev_info dev_info_t;
38448 +struct linear_hash
38450 + dev_info_t *dev0, *dev1;
38453 +struct linear_private_data
38455 + struct linear_hash *hash_table;
38456 + dev_info_t disks[MD_SB_DISKS];
38457 + dev_info_t *smallest;
38462 +typedef struct linear_private_data linear_conf_t;
38464 +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
38467 diff -Naur linux-2002-09-30/include/linux/evms/evms_lvm.h evms-2002-09-30/include/linux/evms/evms_lvm.h
38468 --- linux-2002-09-30/include/linux/evms/evms_lvm.h Wed Dec 31 18:00:00 1969
38469 +++ evms-2002-09-30/include/linux/evms/evms_lvm.h Mon Aug 26 10:01:08 2002
38471 +/* -*- linux-c -*- */
38473 + * Copyright (c) International Business Machines Corp., 2000
38475 + * This program is free software; you can redistribute it and/or modify
38476 + * it under the terms of the GNU General Public License as published by
38477 + * the Free Software Foundation; either version 2 of the License, or
38478 + * (at your option) any later version.
38480 + * This program is distributed in the hope that it will be useful,
38481 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
38482 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
38483 + * the GNU General Public License for more details.
38485 + * You should have received a copy of the GNU General Public License
38486 + * along with this program; if not, write to the Free Software
38487 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
38490 + * linux/include/linux/evms_lvm.h
38492 + * EVMS LVM VGE kernel header file
38495 +#ifndef __EVMS_LVM_H__
38496 +#define __EVMS_LVM_H__
38498 +#define EVMS_LVM_VERSION_MAJOR 1
38499 +#define EVMS_LVM_VERSION_MINOR 1
38500 +#define EVMS_LVM_VERSION_PATCH 1
38502 +/* The following definitions and data structures are copied from lvm.h and
38503 + * liblvm.h from the LVM 0.9.1beta8 distribution. Since the metadata format
38504 + * changed in beta8, lvm.h changed significantly enough that this module would
38505 + * no longer compile. Instead of requiring evms users to install the latest lvm
38506 + * release, the required definitions and data structures will now be included
38507 + * in this header file.
38511 +#define MAX_LV 256
38512 +#define MAX_PV 256
38513 +#define NAME_LEN 128
38514 +#define UUID_LEN 32
38515 +#define LVM_VGDA_ALIGN 4096UL
38516 +#define LVM_PV_DISK_BASE 0L
38517 +#define LVM_PV_DISK_SIZE 1024L
38518 +#define LVM_VG_DISK_BASE round_up(LVM_PV_DISK_BASE + LVM_PV_DISK_SIZE, \
38520 +#define LVM_VG_DISK_SIZE (8*512L)
38525 +/* lv->lv_status */
38526 +#define LV_ACTIVE 0x01
38527 +/* lv->lv_access */
38528 +#define LV_READ 0x01
38529 +#define LV_WRITE 0x02
38530 +#define LV_SNAPSHOT 0x04
38531 +#define LV_SNAPSHOT_ORG 0x08
38534 + * struct lv_COW_table_disk_v1
38535 + * @pv_org_number:
38536 + * @pv_org_rsector:
38537 + * @pv_snap_number:
38538 + * @pv_snap_rsector:
38540 + * Copy-On-Write tables in disk format (version 1).
38542 +struct lv_COW_table_disk {
38543 + u64 pv_org_number;
38544 + u64 pv_org_rsector;
38545 + u64 pv_snap_number;
38546 + u64 pv_snap_rsector;
38554 + * Disk stored PE map entry definition.
38562 + * struct lvm_disk_data
38566 + * Disk stored PV, VG, LV and PE size and offset information.
38568 +struct lvm_disk_data {
38579 + * @pv_uuidlist_on_disk:
38584 + * @system_id: used by vgexport/vgimport
38588 + * @pv_allocatable:
38594 + * @pe_start: in sectors (new in version 2)
38596 + * Physical volume on disk metadata definition (version 2).
38601 + struct lvm_disk_data pv_on_disk;
38602 + struct lvm_disk_data vg_on_disk;
38603 + struct lvm_disk_data pv_uuidlist_on_disk;
38604 + struct lvm_disk_data lv_on_disk;
38605 + struct lvm_disk_data pe_on_disk;
38606 + u8 pv_uuid[NAME_LEN];
38607 + u8 vg_name[NAME_LEN];
38608 + u8 system_id[NAME_LEN];
38612 + u32 pv_allocatable;
38617 + u32 pe_allocated;
38630 + * @lv_mirror_copies:
38634 + * @lv_snapshot_minor: minor number of original
38635 + * @lv_chunk_size: chunk size for snapshots
38637 + * @lv_allocated_le:
38639 + * @lv_stripesize:
38642 + * @lv_io_timeout:
38643 + * @lv_read_ahead:
38645 + * Logical volume metadata definition (version 3).
38648 + u8 lv_name[NAME_LEN];
38649 + u8 vg_name[NAME_LEN];
38655 + u32 lv_mirror_copies;
38659 + u32 lv_snapshot_minor;
38660 + u16 lv_chunk_size;
38662 + u32 lv_allocated_le;
38664 + u32 lv_stripesize;
38666 + u32 lv_allocation;
38667 + u32 lv_io_timeout;
38668 + u32 lv_read_ahead;
38673 + * @vg_uuid: Volume group UUID
38674 + * @vg_name_dummy: Remainder of version 1 VG name
38675 + * @vg_number: Volume group number
38676 + * @vg_access: Read/Write
38677 + * @vg_status: Active or not
38678 + * @lv_max: Maximum logical volumes
38679 + * @lv_cur: Current logical volumes
38680 + * @lv_open: Open logical volumes
38681 + * @pv_max: Maximum physical volumes
38682 + * @pv_cur: Current physical volumes
38683 + * @pv_act: Active physical volumes
38685 + * @vgda: Volume group descriptor arrays
38686 + * @pe_size: Physical extent size in sectors
38687 + * @pe_total: Total of physical extents
38688 + * @pe_allocated: Allocated physical extents
38689 + * @pvg_total: Physical volume groups
38691 + * Volume group metadata definition (version 2).
38694 + u8 vg_uuid[UUID_LEN];
38695 + u8 vg_name_dummy[NAME_LEN - UUID_LEN];
38709 + u32 pe_allocated;
38713 +/* Useful inlines */
38714 +static inline ulong round_up(ulong n, ulong size)
38717 + return (n + size) & ~size;
38720 +static inline ulong div_up(ulong n, ulong size)
38722 + return round_up(n, size) / size;
38725 +/* End of lvm.h imported data structures. */
38727 +#define DEV_DIRECTORY "/dev/"
38728 +#define LVM_DEV_DIRECTORY "lvm/"
38729 +#define LVM_PROC_NAME "lvm"
38730 +#define LVM_PROC_VG_NAME "VGs"
38731 +#define LVM_PROC_LV_NAME "LVs"
38732 +#define LVM_PROC_PV_NAME "PVs"
38733 +#define LVM_PROC_GLOBAL_NAME "global"
38734 +#define IO_BUFFER_SECTORS 8
38736 +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,9)
38737 +#define max(a,b) (((a)>(b))?(a):(b))
38740 +/* Structure for doing PV remove ioctls. */
38742 +#define EVMS_LVM_PV_REMOVE_IOCTL 0x01
38743 +#define EVMS_LVM_SNAPSHOT_STAT_IOCTL 0x02
38746 + * struct lvm_pv_remove_ioctl
38747 + * @vg_uuid: Volume group UUID
38748 + * @pv_number: Physical volume number
38749 + * @next: Link to next packet (engine-use only)
38751 + * PV remove ioctl packet definition.
38753 +struct lvm_pv_remove_ioctl {
38754 + u8 vg_uuid[UUID_LEN];
38756 + struct lvm_pv_remove_ioctl * next;
38760 + * struct lvm_snapshot_stat_ioctl
38761 + * @vg_uuid: Volume group UUID
38762 + * @lv_number: Logical volume number
38763 + * @next_free_chunk:
38766 + * Snapshot statistics ioctl packet definition.
38768 +struct lvm_snapshot_stat_ioctl {
38769 + u8 vg_uuid[UUID_LEN];
38771 + u64 next_free_chunk;
38776 + * struct lvm_physical_volume
38777 + * @logical_node: Storage object
38778 + * @pv: Copy of on-disk PV struct
38781 + * @next: Pointer to next entry
38783 + * Entries in the list of physical volumes (PV) in a volume group (VG).
38785 +struct lvm_physical_volume {
38786 + struct evms_logical_node * logical_node;
38787 + struct pv_disk * pv;
38788 + struct pe_disk * pe_map;
38790 + struct lvm_physical_volume * next;
38794 + * struct le_table_entry
38796 + * @pe_sector_offset:
38798 + * Table entry definition for mapping logical
38799 + * extents (LE) to physical extents (PE).
38801 +struct le_table_entry {
38802 + struct lvm_physical_volume * owning_pv;
38803 + u64 pe_sector_offset;
38807 + * struct snapshot_map_entry
38814 + * Snapshot remapping entry structure definition.
38816 +struct snapshot_map_entry {
38819 + struct lvm_physical_volume * snap_pv;
38820 + struct snapshot_map_entry * next;
38821 + struct snapshot_map_entry * prev;
38824 +#define MAX_HASH_CHAIN_ENTRIES 10
38825 +#define CHUNK_DATA_BUFFER_SIZE 128
38828 + * struct lvm_logical_volume
38830 + * @lv_size: In sectors
38831 + * @lv_access: Flags: LV_READ, LV_WRITE, LV_SNAPSHOT,
38832 + * LV_SNAPSHOT_ORG, EVMS_LV*
38833 + * @lv_status: Flags: LV_ACTIVE, LV_SPINDOWN
38834 + * @lv_minor: Device minor number
38836 + * @stripe_size: In sectors
38837 + * @stripe_size_shift: # of bits to shift right instead of dividing by stripe_size
38838 + * @pe_size: In sectors
38839 + * @pe_size_shift: Number of bits to shift right instead of dividing by pe_size
38840 + * @num_le: Number of entries in the le-to-pe map
38841 + * @group: Pointer back to parent volume group
38842 + * @name: Dev-tree volume name (eg. /dev/group0/vol0)
38843 + * @le_map: Mapping of logical to physical extents
38844 + * @volume_node: Pointer to parent EVMS object representing this volume
38845 + * @chunk_size: In sectors
38846 + * @num_chunks: lv_size / chunk_size
38847 + * @snap_org_minor: Minor number of snapshot original
38848 + * @next_cow_entry: Index into current COW table
38849 + * @current_cow_sector: Logical sector of current COW table
38850 + * @next_free_chunk: Starting logical sector of next free chunk
38851 + * @hash_table_size: Number of pointers in each hash table
38852 + * @cow_table: Pointer to one sector's worth of COW tables.
38853 + * @chunk_data_buffer: Buffer reading data when doing copy-on-write
38854 + * @snap_semaphore: For locking during snapshot IO operations
38855 + * @snapshot_map: Pointer to remapping hash tables
38856 + * @snapshot_next: Linked list of volumes being snapshotted
38857 + * @snapshot_org: Pointer to volume being snapshotted
38859 + * In-memory representation of an LVM LV.
38861 +struct lvm_logical_volume {
38869 + u32 stripe_size_shift;
38871 + u32 pe_size_shift;
38873 + struct lvm_volume_group * group;
38874 + u8 name[NAME_LEN];
38875 + struct le_table_entry * le_map;
38876 + struct evms_logical_node * volume_node;
38879 + u32 snap_org_minor;
38880 + u32 next_cow_entry;
38881 + u64 current_cow_sector;
38882 + u64 next_free_chunk;
38883 + u32 hash_table_size;
38884 + struct lv_COW_table_disk * cow_table;
38885 + u8 * chunk_data_buffer;
38886 + struct semaphore snap_semaphore;
38887 + struct snapshot_map_entry *** snapshot_map;
38888 + struct lvm_logical_volume * snapshot_next;
38889 + struct lvm_logical_volume * snapshot_org;
38893 + * EVMS_LV_NEW: Volume was created during the current discovery pass.
38894 + * EVMS_LV_INCOMPLETE: Volume has an incomplete LE map.
38895 + * EVMS_LV_INVALID: Volume has a memory-corruption problem.
38896 + * EVMS_LV_QUIESCED: Volume is in quiesced state.
38897 + * EVMS_LV_EXPORTED: Volume has been exported during this EVMS discovery pass.
38899 +#define EVMS_LV_NEW 0x10
38900 +#define EVMS_LV_INCOMPLETE 0x20
38901 +#define EVMS_LV_INVALID 0x40
38902 +#define EVMS_LV_QUIESCED 0x80
38903 +#define EVMS_LV_EXPORTED 0x100
38906 + * struct lvm_volume_group
38907 + * @vg: Copy of on-disk VG metadata
38908 + * @pv_list: List of PVs that make up this group
38909 + * @volume_list: Array of volumes
38910 + * @lv_array: Array of LV metadata
38911 + * @uuid_list: List of PV UUIDs
38912 + * @vg_uuid: UUID from the VG metadata
38913 + * @vg_name: Name from the PV metadata
38914 + * @pv_count: # of PVs found in this group
38915 + * @volume_count: # of LVs found in this group
38916 + * @hard_sect_size: Largest hardsector size of all PVs in this group
38917 + * @block_size: Largest block size of all PVs in this group
38918 + * @flags: EVMS_VG*
38919 + * @next_group: Linked list
38921 + * In-memory representation of an LVM VG.
38923 +struct lvm_volume_group {
38924 + struct vg_disk * vg;
38925 + struct lvm_physical_volume * pv_list;
38926 + struct lvm_logical_volume * volume_list[MAX_LV + 1];
38927 + struct lv_disk * lv_array;
38929 + u8 vg_uuid[UUID_LEN];
38930 + u8 vg_name[NAME_LEN];
38932 + u32 volume_count;
38933 + s32 hard_sect_size;
38936 + struct lvm_volume_group * next_group;
38940 + * EVMS_VG_DIRTY: Group is new or has had a PV added
38941 + * during this discovery.
38942 + * EVMS_VG_PARTIAL_PVS: Group contains at least one partial PV.
38943 + * EVMS_VG_REMOVABLE_PVS: Group contains at least one removable PV.
38945 +#define EVMS_VG_DIRTY (1 << 0)
38946 +#define EVMS_VG_PARTIAL_PVS (1 << 1)
38947 +#define EVMS_VG_REMOVABLE_PVS (1 << 2)
38950 diff -Naur linux-2002-09-30/include/linux/evms/evms_md.h evms-2002-09-30/include/linux/evms/evms_md.h
38951 --- linux-2002-09-30/include/linux/evms/evms_md.h Wed Dec 31 18:00:00 1969
38952 +++ evms-2002-09-30/include/linux/evms/evms_md.h Fri Aug 16 11:10:59 2002
38955 + * Copyright (c) International Business Machines Corp., 2000
38957 + * This program is free software; you can redistribute it and/or modify
38958 + * it under the terms of the GNU General Public License as published by
38959 + * the Free Software Foundation; either version 2 of the License, or
38960 + * (at your option) any later version.
38962 + * This program is distributed in the hope that it will be useful,
38963 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
38964 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
38965 + * the GNU General Public License for more details.
38967 + * You should have received a copy of the GNU General Public License
38968 + * along with this program; if not, write to the Free Software
38969 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
38971 + * linux/include/linux/evms/evms_md.h
38973 + * EVMS Linux MD Region Manager Public Header File
38975 + * 'evms_md.h' is an EVMS version of linux/include/linux/raid/md.h modified
38976 + * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
38980 +#ifndef __EVMS_MD_INCLUDED
38981 +#define __EVMS_MD_INCLUDED
38983 +#include <linux/mm.h>
38984 +#include <linux/fs.h>
38985 +#include <linux/blkdev.h>
38986 +#include <asm/semaphore.h>
38987 +#include <linux/ioctl.h>
38988 +#include <linux/types.h>
38989 +#include <asm/bitops.h>
38990 +#include <linux/module.h>
38991 +#include <linux/hdreg.h>
38992 +#include <linux/proc_fs.h>
38993 +#include <linux/smp_lock.h>
38994 +#include <linux/delay.h>
38995 +#include <net/checksum.h>
38996 +#include <linux/random.h>
38997 +#include <linux/locks.h>
38998 +#include <linux/kernel_stat.h>
38999 +#include <asm/io.h>
39000 +#include <linux/completion.h>
39002 +#include <linux/evms/evms.h>
39004 +#include <linux/raid/md_compatible.h>
39006 + * 'md_p.h' holds the 'physical' layout of RAID devices
39007 + * 'md_u.h' holds the user <=> kernel API
39009 + * 'md_k.h' holds kernel internal definitions
39012 +#include <linux/evms/evms_md_p.h>
39013 +#include <linux/evms/evms_md_u.h>
39014 +#include <linux/evms/evms_md_k.h>
39017 + * Different major versions are not compatible.
39018 + * Different minor versions are only downward compatible.
39019 + * Different patchlevel versions are downward and upward compatible.
39021 +#define EVMS_MD_MAJOR_VERSION 1
39022 +#define EVMS_MD_MINOR_VERSION 1
39023 +#define EVMS_MD_PATCHLEVEL_VERSION 1
39025 +#define MD_MAJOR_VERSION 0
39026 +#define MD_MINOR_VERSION 90
39027 +#define MD_PATCHLEVEL_VERSION 0
39029 +#define EVMS_MD_COMMON_SERVICES_MAJOR 0
39030 +#define EVMS_MD_COMMON_SERVICES_MINOR 5
39031 +#define EVMS_MD_COMMON_SERVICES_PATCHLEVEL 0
39034 +extern int evms_md_size[MAX_MD_DEVS];
39036 +extern void evms_md_add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
39037 +extern void evms_md_del_mddev_mapping (mddev_t *mddev, kdev_t dev);
39038 +extern char * evms_md_partition_name (struct evms_logical_node *node);
39039 +extern int evms_register_md_personality (int p_num, mdk_personality_t *p);
39040 +extern int evms_unregister_md_personality (int p_num);
39042 +extern int evms_md_update_sb (mddev_t *mddev);
39043 +extern int evms_md_check_ordering (mddev_t *mddev);
39044 +extern void evms_md_print_devices (void);
39046 +extern int evms_md_sync_io(
39047 + struct evms_logical_node *node, /* evms node for the MD array */
39048 + int rw, /* READ / WRITE */
39049 + u64 sector, /* starting sector */
39050 + u64 total_nr_sects, /* total number of sectors */
39051 + void *data ); /* pointer to buffer */
39053 +extern int evms_md_partial_sync_io(
39054 + struct evms_logical_node *node, /* evms node for the MD array */
39055 + int rw, /* READ / WRITE */
39056 + u64 sector, /* starting sector */
39057 + u32 *nsects, /* on input: the total number of sectors for the request */
39058 + /* on output, number of sectors completed */
39059 + void *data); /* pointer to buffer */
39062 +extern int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
39063 +extern void evms_md_done_sync(mddev_t *mddev, int blocks, int ok);
39064 +extern void evms_md_sync_acct(kdev_t dev, unsigned long nr_sectors);
39065 +extern void evms_md_recover_arrays (void);
39066 +extern int evms_md_error (mddev_t *mddev, struct evms_logical_node *node);
39067 +extern int evms_md_error_dev(mddev_t *mddev, kdev_t dev);
39069 +#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); evms_md_print_devices(); }
39074 diff -Naur linux-2002-09-30/include/linux/evms/evms_md_k.h evms-2002-09-30/include/linux/evms/evms_md_k.h
39075 --- linux-2002-09-30/include/linux/evms/evms_md_k.h Wed Dec 31 18:00:00 1969
39076 +++ evms-2002-09-30/include/linux/evms/evms_md_k.h Tue Aug 6 01:03:24 2002
39079 + * Copyright (c) International Business Machines Corp., 2000
39081 + * This program is free software; you can redistribute it and/or modify
39082 + * it under the terms of the GNU General Public License as published by
39083 + * the Free Software Foundation; either version 2 of the License, or
39084 + * (at your option) any later version.
39086 + * This program is distributed in the hope that it will be useful,
39087 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
39088 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
39089 + * the GNU General Public License for more details.
39091 + * You should have received a copy of the GNU General Public License
39092 + * along with this program; if not, write to the Free Software
39093 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
39096 + * linux/include/linux/evms/evms_md_k.h
39098 + * EVMS Linux MD Region Manager Public Header File
39100 + * 'evms_md_k.h' is an EVMS version of linux/include/linux/raid/md_k.h modified
39101 + * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
39105 +#ifndef __EVMS_MD_K_INC__
39106 +#define __EVMS_MD_K_INC__
39108 +#define EVMS_MD_SECTS_PER_PAGE (PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT)
39109 +#define EVMS_MD_SECTS_PER_PAGE_MASK (~(EVMS_MD_SECTS_PER_PAGE-1))
39111 +#define MD_RESERVED 0UL
39112 +#define LINEAR 1UL
39116 +#define TRANSLUCENT 5UL
39118 +#define MULTIPATH 7UL
39119 +#define MAX_PERSONALITY 8UL
39121 +static inline int pers_to_level (int pers)
39124 + case MULTIPATH: return -4;
39125 + case HSM: return -3;
39126 + case TRANSLUCENT: return -2;
39127 + case LINEAR: return -1;
39128 + case RAID0: return 0;
39129 + case RAID1: return 1;
39130 + case RAID5: return 5;
39133 + return MD_RESERVED;
39136 +static inline int level_to_pers (int level)
39139 + case -3: return HSM;
39140 + case -2: return TRANSLUCENT;
39141 + case -1: return LINEAR;
39142 + case 0: return RAID0;
39143 + case 1: return RAID1;
39145 + case 5: return RAID5;
39147 + return MD_RESERVED;
39150 +typedef struct mddev_s mddev_t;
39151 +typedef struct mdk_rdev_s mdk_rdev_t;
39153 +#if (MINORBITS != 8)
39154 +#error MD doesnt handle bigger kdev yet
39157 +#define MAX_MD_DEVS (1<<MINORBITS) /* Max number of md dev */
39160 + * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
39161 + * the personality. (eg. HSM uses this to identify individual LVs)
39163 +struct dev_mapping {
39168 +extern struct dev_mapping evms_mddev_map [MAX_MD_DEVS];
39169 +static inline mddev_t * kdev_to_mddev (kdev_t dev)
39171 + if (MAJOR(dev) != MD_MAJOR)
39173 + return evms_mddev_map[MINOR(dev)].mddev;
39177 + * options passed in raidrun:
39180 +#define MAX_CHUNK_SIZE (4096*1024)
39183 + * default readahead
39185 +#define MD_READAHEAD vm_max_readahead
39187 +static inline int disk_faulty(mdp_disk_t * d)
39189 + return d->state & (1 << MD_DISK_FAULTY);
39192 +static inline int disk_active(mdp_disk_t * d)
39194 + return d->state & (1 << MD_DISK_ACTIVE);
39197 +static inline int disk_sync(mdp_disk_t * d)
39199 + return d->state & (1 << MD_DISK_SYNC);
39202 +static inline int disk_spare(mdp_disk_t * d)
39204 + return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
39207 +static inline int disk_removed(mdp_disk_t * d)
39209 + return d->state & (1 << MD_DISK_REMOVED);
39212 +static inline void mark_disk_faulty(mdp_disk_t * d)
39214 + d->state |= (1 << MD_DISK_FAULTY);
39217 +static inline void mark_disk_active(mdp_disk_t * d)
39219 + d->state |= (1 << MD_DISK_ACTIVE);
39220 + d->state &= ~(1 << MD_DISK_PENDING_ACTIVE);
39223 +static inline void mark_disk_sync(mdp_disk_t * d)
39225 + d->state |= (1 << MD_DISK_SYNC);
39228 +static inline void mark_disk_spare(mdp_disk_t * d)
39233 +static inline void mark_disk_removed(mdp_disk_t * d)
39235 + d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
39238 +static inline void mark_disk_inactive(mdp_disk_t * d)
39240 + d->state &= ~(1 << MD_DISK_ACTIVE);
39243 +static inline void mark_disk_nonsync(mdp_disk_t * d)
39245 + d->state &= ~(1 << MD_DISK_SYNC);
39249 + * MD's 'extended' device
39253 + struct md_list_head same_set; /* RAID devices within the same set */
39254 + struct md_list_head all; /* all RAID devices */
39255 + struct md_list_head pending; /* undetected RAID devices */
39256 + struct evms_logical_node *node; /* EVMS device node */
39257 + kdev_t dev; /* Device number */
39258 + kdev_t old_dev; /* "" when it was last imported */
39259 + unsigned long size; /* Device size (in blocks) */
39260 + mddev_t *mddev; /* RAID array if running */
39261 + unsigned long last_events; /* IO event timestamp */
39263 + struct block_device *bdev; /* block device handle */
39266 + unsigned long sb_offset; /* in blocks */
39268 + int virtual_spare; /* "virtual" spare added via IOCTL */
39269 + int alias_device; /* device alias to the same disk */
39270 + int faulty; /* if faulty do not issue IO requests */
39271 + int desc_nr; /* descriptor index in the superblock */
39276 + * disk operations in a working array:
39278 +#define DISKOP_SPARE_INACTIVE 0
39279 +#define DISKOP_SPARE_WRITE 1
39280 +#define DISKOP_SPARE_ACTIVE 2
39281 +#define DISKOP_HOT_SPARE_ACTIVE 3
39282 +#define DISKOP_HOT_REMOVE_SPARE 4
39283 +#define DISKOP_HOT_REMOVE_DISK 5
39284 +#define DISKOP_HOT_ADD_DISK 6
39285 +#define DISKOP_HOT_DEACTIVATE_DISK 7
39287 +typedef struct mdk_personality_s mdk_personality_t;
39292 + mdk_personality_t *pers;
39293 + struct evms_logical_node *node;
39294 + unsigned long flag;
39295 + int nr_raid_disks;
39300 + struct md_list_head disks;
39303 + unsigned long curr_resync; /* blocks scheduled */
39304 + unsigned long resync_mark; /* a recent timestamp */
39305 + unsigned long resync_mark_cnt;/* blocks written at resync_mark */
39307 + int recovery_running;
39308 + struct semaphore reconfig_sem;
39309 + struct semaphore recovery_sem;
39310 + struct semaphore resync_sem;
39313 + atomic_t recovery_active; /* blocks scheduled, but not written */
39314 + md_wait_queue_head_t recovery_wait;
39316 + struct md_list_head all_mddevs;
39317 + struct md_list_head incomplete_mddevs;
39318 + struct md_list_head running_mddevs;
39321 +struct mdk_personality_s
39324 + int (*sync_io) (mddev_t *mddev, int rw, u64 LSN, u64 nr_sects, void *data);
39325 + void (*read)(struct evms_logical_node *node, struct buffer_head *bh);
39326 + void (*write)(struct evms_logical_node *node, struct buffer_head *bh);
39327 + int (*run)(mddev_t *mddev);
39328 + int (*stop)(mddev_t *mddev);
39329 + int (*status)(char *page, mddev_t *mddev);
39330 + int (*error_handler)(mddev_t *mddev, struct evms_logical_node *node);
39333 + * Some personalities (RAID-1, RAID-5) can have disks hot-added and
39334 + * hot-removed. Hot removal is different from failure. (failure marks
39335 + * a disk inactive, but the disk is still part of the array) The interface
39336 + * to such operations is the 'pers->diskop()' function, can be NULL.
39338 + * the diskop function can change the pointer pointing to the incoming
39339 + * descriptor, but must do so very carefully. (currently only
39340 + * SPARE_ACTIVE expects such a change)
39342 + int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
39344 + int (*stop_resync)(mddev_t *mddev);
39345 + int (*restart_resync)(mddev_t *mddev);
39346 + int (*sync_request)(mddev_t *mddev, unsigned long block_nr);
39347 + int (*evms_ioctl)(mddev_t *mddev, struct inode *inode, struct file *file,
39348 + unsigned int cmd, unsigned long arg);
39349 + int (*md_pers_ioctl)(mddev_t *mddev, int cmd, void* pers_arg);
39353 + * EVMS MD instance data structure definition
39357 + struct evms_plugin_header instance_plugin_hdr;
39360 +#define EVMS_MD_NODE_TO_MDDEV(node) ((struct evms_md *)(node->private))->mddev
39362 +static inline int evms_md_check_boundary(struct evms_logical_node *node, struct buffer_head *bh)
39364 + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) > node->total_vsectors) {
39365 + bh->b_end_io(bh, 0);
39372 + * This structure is used for synchronous I/O
39373 + * @rc : error code
39374 + * @io_count: number of I/Os
39375 + * @wait: wait queue
39377 +struct evms_md_sync_cb {
39379 + atomic_t io_count;
39380 + wait_queue_head_t wait;
39385 + * This structure is required for activating a spare device
39386 + * @next: next spare
39387 + * @mddev: target md device
39388 + * @spare: spare to activate
39390 +struct evms_md_activate_spare {
39391 + struct evms_md_activate_spare *next;
39393 + mdp_disk_t *spare;
39396 +static inline int incomplete_mddev(mddev_t * mddev)
39398 + return (mddev->incomplete_mddevs.next != &mddev->incomplete_mddevs);
39402 + * Currently we index md_array directly, based on the minor
39403 + * number. This will have to change to dynamic allocation
39404 + * once we start supporting partitioning of md devices.
39406 +static inline int mdidx (mddev_t * mddev)
39408 + return mddev->__minor;
39411 +static inline kdev_t mddev_to_kdev(mddev_t * mddev)
39413 + return MKDEV(MD_MAJOR, mdidx(mddev));
39416 +extern mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev);
39417 +extern mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr);
39418 +extern mdp_disk_t *get_spare(mddev_t *mddev);
39421 + * iterates through some rdev ringlist. It's safe to remove the
39422 + * current 'rdev'. Don't touch 'tmp' though.
39424 +#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \
39426 + for (tmp = head.next; \
39427 + rdev = md_list_entry(tmp, mdk_rdev_t, field), \
39428 + tmp = tmp->next, tmp->prev != &head \
39431 + * iterates through the 'same array disks' ringlist
39433 +#define ITERATE_RDEV(mddev,rdev,tmp) \
39434 + ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
39437 + * Same as above, but assumes that the device has rdev->desc_nr numbered
39438 + * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order.
39440 +#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \
39441 + for (i = 0; rdev = evms_md_find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
39445 + * Iterates through all 'RAID managed disks'
39447 +#define ITERATE_RDEV_ALL(rdev,tmp) \
39448 + ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
39451 + * Iterates through 'pending RAID disks'
39453 +#define ITERATE_RDEV_PENDING(rdev,tmp) \
39454 + ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
39457 + * iterates through all used mddevs in the system.
39459 +#define ITERATE_MDDEV(mddev,tmp) \
39461 + for (tmp = all_mddevs.next; \
39462 + mddev = md_list_entry(tmp, mddev_t, all_mddevs), \
39463 + tmp = tmp->next, tmp->prev != &all_mddevs \
39467 + * iterates through all incomplete mddevs in the system.
39469 +#define ITERATE_INCOMPLETE_MDDEV(mddev,tmp) \
39471 + for (tmp = incomplete_mddevs.next; \
39472 + mddev = list_entry(tmp, mddev_t, incomplete_mddevs), \
39473 + tmp = tmp->next, tmp->prev != &incomplete_mddevs\
39476 + * iterates through all running mddevs in the system.
39478 +#define ITERATE_RUNNING_MDDEV(mddev,tmp) \
39480 + for (tmp = running_mddevs.next; \
39481 + mddev = list_entry(tmp, mddev_t, running_mddevs), \
39482 + tmp = tmp->next, tmp->prev != &running_mddevs \
39485 +static inline int lock_mddev (mddev_t * mddev)
39487 + return down_interruptible(&mddev->reconfig_sem);
39490 +static inline void unlock_mddev (mddev_t * mddev)
39492 + up(&mddev->reconfig_sem);
39495 +#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
39496 + x = y; y = __tmp; } while (0)
39498 +#define MAX_DISKNAME_LEN 64
39500 +typedef struct dev_name_s {
39501 + struct md_list_head list;
39503 + char namebuf [MAX_DISKNAME_LEN];
39508 +#define __wait_event_lock_irq(wq, condition, lock) \
39510 + wait_queue_t __wait; \
39511 + init_waitqueue_entry(&__wait, current); \
39513 + add_wait_queue(&wq, &__wait); \
39515 + set_current_state(TASK_UNINTERRUPTIBLE); \
39518 + spin_unlock_irq(&lock); \
39519 + run_task_queue(&tq_disk); \
39521 + spin_lock_irq(&lock); \
39523 + current->state = TASK_RUNNING; \
39524 + remove_wait_queue(&wq, &__wait); \
39527 +#define wait_event_lock_irq(wq, condition, lock) \
39531 + __wait_event_lock_irq(wq, condition, lock); \
39535 +#define __wait_disk_event(wq, condition) \
39537 + wait_queue_t __wait; \
39538 + init_waitqueue_entry(&__wait, current); \
39540 + add_wait_queue(&wq, &__wait); \
39542 + set_current_state(TASK_UNINTERRUPTIBLE); \
39545 + run_task_queue(&tq_disk); \
39548 + current->state = TASK_RUNNING; \
39549 + remove_wait_queue(&wq, &__wait); \
39552 +#define wait_disk_event(wq, condition) \
39556 + __wait_disk_event(wq, condition); \
39561 diff -Naur linux-2002-09-30/include/linux/evms/evms_md_p.h evms-2002-09-30/include/linux/evms/evms_md_p.h
39562 --- linux-2002-09-30/include/linux/evms/evms_md_p.h Wed Dec 31 18:00:00 1969
39563 +++ evms-2002-09-30/include/linux/evms/evms_md_p.h Tue Mar 26 18:58:57 2002
39566 + * Copyright (c) International Business Machines Corp., 2000
39568 + * This program is free software; you can redistribute it and/or modify
39569 + * it under the terms of the GNU General Public License as published by
39570 + * the Free Software Foundation; either version 2 of the License, or
39571 + * (at your option) any later version.
39573 + * This program is distributed in the hope that it will be useful,
39574 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
39575 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
39576 + * the GNU General Public License for more details.
39578 + * You should have received a copy of the GNU General Public License
39579 + * along with this program; if not, write to the Free Software
39580 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
39583 + * linux/include/linux/evms/evms_md_p.h
39585 + * EVMS Linux MD Region Manager Public Header File
39587 + * 'evms_md_p.h' is an EVMS version of linux/include/linux/raid/md_p.h modified
39588 + * by Cuong (Mike) Tran <miketran@us.ibm.com>, March 2002.
39592 +#ifndef __EVMS_MD_P_INC__
39593 +#define __EVMS_MD_P_INC__
39596 + * RAID superblock.
39598 + * The RAID superblock maintains some statistics on each RAID configuration.
39599 + * Each real device in the RAID set contains it near the end of the device.
39600 + * Some of the ideas are copied from the ext2fs implementation.
39602 + * We currently use 4096 bytes as follows:
39604 + * word offset function
39606 + * 0 - 31 Constant generic RAID device information.
39607 + * 32 - 63 Generic state information.
39608 + * 64 - 127 Personality specific information.
39609 + * 128 - 991 27 32-word descriptors of the disks in the raid set.
39610 + * 992 - 1023 Disk specific descriptor.
39611 + * (With MD_SB_DISKS = 27 no reserved words remain between the two.)
39615 + * If x is the real device size in bytes, we return an apparent size of:
39617 + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
39619 + * and place the 4kB superblock at offset y.
39621 +#define MD_RESERVED_BYTES (64 * 1024)
39622 +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
39623 +#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
39625 +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
39626 +#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
39628 +#define MD_SB_BYTES 4096
39629 +#define MD_SB_WORDS (MD_SB_BYTES / 4)
39630 +#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
39631 +#define MD_SB_SECTORS (MD_SB_BYTES / 512)
39634 + * The following are counted in 32-bit words
39636 +#define MD_SB_GENERIC_OFFSET 0
39637 +#define MD_SB_PERSONALITY_OFFSET 64
39638 +#define MD_SB_DISKS_OFFSET 128
39639 +#define MD_SB_DESCRIPTOR_OFFSET 992
39641 +#define MD_SB_GENERIC_CONSTANT_WORDS 32
39642 +#define MD_SB_GENERIC_STATE_WORDS 32
39643 +#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
39644 +#define MD_SB_PERSONALITY_WORDS 64
39645 +#define MD_SB_DESCRIPTOR_WORDS 32
39646 +#define MD_SB_DISKS 27
39647 +#define MD_SB_DISKS_WORDS (MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS)
39648 +#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
39649 +#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
39652 + * Device "operational" state bits
39654 +#define MD_DISK_FAULTY 0 /* disk is faulty / operational */
39655 +#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */
39656 +#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
39657 +#define MD_DISK_REMOVED 3 /* disk has kind of been removed, but not really or it would not be here */
39658 +#define MD_DISK_NEW 4 /* disk has just been added to the raid set */
39659 +#define MD_DISK_PENDING_ACTIVE 5 /* disk was spare, but should be activated */
39661 +typedef struct mdp_device_descriptor_s {
39662 + __u32 number; /* 0 Device number in the entire set */
39663 + __u32 major; /* 1 Device major number */
39664 + __u32 minor; /* 2 Device minor number */
39665 + __u32 raid_disk; /* 3 The role of the device in the raid set */
39666 + __u32 state; /* 4 Operational state */
39667 + __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
39670 +#define MD_SB_MAGIC 0xa92b4efc
39673 + * Superblock state bits
39675 +#define MD_SB_CLEAN 0
39676 +#define MD_SB_ERRORS 1
39678 +typedef struct mdp_superblock_s {
39680 + * Constant generic information
39682 + __u32 md_magic; /* 0 MD identifier */
39683 + __u32 major_version; /* 1 major version to which the set conforms */
39684 + __u32 minor_version; /* 2 minor version ... */
39685 + __u32 patch_version; /* 3 patchlevel version ... */
39686 + __u32 gvalid_words; /* 4 Number of used words in this section */
39687 + __u32 set_uuid0; /* 5 Raid set identifier */
39688 + __u32 ctime; /* 6 Creation time */
39689 + __u32 level; /* 7 Raid personality */
39690 + __u32 size; /* 8 Apparent size of each individual disk */
39691 + __u32 nr_disks; /* 9 total disks in the raid set */
39692 + __u32 raid_disks; /* 10 disks in a fully functional raid set */
39693 + __u32 md_minor; /* 11 preferred MD minor device number */
39694 + __u32 not_persistent; /* 12 does it have a persistent superblock */
39695 + __u32 set_uuid1; /* 13 Raid set identifier #2 */
39696 + __u32 set_uuid2; /* 14 Raid set identifier #3 */
39697 + __u32 set_uuid3; /* 15 Raid set identifier #4 */
39698 + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
39701 + * Generic state information
39703 + __u32 utime; /* 0 Superblock update time */
39704 + __u32 state; /* 1 State bits (clean, ...) */
39705 + __u32 active_disks; /* 2 Number of currently active disks */
39706 + __u32 working_disks; /* 3 Number of working disks */
39707 + __u32 failed_disks; /* 4 Number of failed disks */
39708 + __u32 spare_disks; /* 5 Number of spare disks */
39709 + __u32 sb_csum; /* 6 checksum of the whole superblock */
39711 +#ifdef __BIG_ENDIAN
39712 + __u32 events_hi; /* 7 high-order of superblock update count */
39713 + __u32 events_lo; /* 8 low-order of superblock update count */
39715 + __u32 events_lo; /* 7 low-order of superblock update count */
39716 + __u32 events_hi; /* 8 high-order of superblock update count */
39719 +#if __BYTE_ORDER == __BIG_ENDIAN
39720 + __u32 events_hi; /* 7 high-order of superblock update count */
39721 + __u32 events_lo; /* 8 low-order of superblock update count */
39723 + __u32 events_lo; /* 7 low-order of superblock update count */
39724 + __u32 events_hi; /* 8 high-order of superblock update count */
39727 + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9];
39730 + * Personality information
39732 + __u32 layout; /* 0 the array's physical layout */
39733 + __u32 chunk_size; /* 1 chunk size in bytes */
39734 + __u32 root_pv; /* 2 LV root PV */
39735 + __u32 root_block; /* 3 LV root block */
39736 + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
39739 + * Disks information
39741 + mdp_disk_t disks[MD_SB_DISKS];
39746 + __u32 reserved[MD_SB_RESERVED_WORDS];
39749 + * Active descriptor
39751 + mdp_disk_t this_disk;
39755 +static inline __u64 md_event(mdp_super_t *sb) {
39756 + __u64 ev = sb->events_hi;
39757 + return (ev<<32)| sb->events_lo;
39762 diff -Naur linux-2002-09-30/include/linux/evms/evms_md_u.h evms-2002-09-30/include/linux/evms/evms_md_u.h
39763 --- linux-2002-09-30/include/linux/evms/evms_md_u.h Wed Dec 31 18:00:00 1969
39764 +++ evms-2002-09-30/include/linux/evms/evms_md_u.h Fri Aug 16 16:19:56 2002
39767 + * Copyright (c) International Business Machines Corp., 2000
39769 + * This program is free software; you can redistribute it and/or modify
39770 + * it under the terms of the GNU General Public License as published by
39771 + * the Free Software Foundation; either version 2 of the License, or
39772 + * (at your option) any later version.
39774 + * This program is distributed in the hope that it will be useful,
39775 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
39776 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
39777 + * the GNU General Public License for more details.
39779 + * You should have received a copy of the GNU General Public License
39780 + * along with this program; if not, write to the Free Software
39781 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
39784 + * linux/include/linux/evms/evms_md_u.h
39786 + * EVMS MD Region Manager, User <-> Kernel common file
39790 +#ifndef _EVMS_MD_U_INC_
39791 +#define _EVMS_MD_U_INC_
39793 +#define EVMS_MD_ID 4
39794 +#define MD_SET_PLUGIN_ID SetPluginID(IBM_OEM_ID,EVMS_REGION_MANAGER,EVMS_MD_ID)
39796 +#define EVMS_MD_PERS_IOCTL_CMD 1 /* personality specific ioctl command */
39797 +#define EVMS_MD_ADD 2
39798 +#define EVMS_MD_REMOVE 3
39799 +#define EVMS_MD_ACTIVATE 4
39800 +#define EVMS_MD_DEACTIVATE 5
39801 +#define EVMS_MD_GET_ARRAY_INFO 6
39804 + * structure definition to use with MD_ADD, MD_REMOVE, MD_ACTIVATE
39806 +struct evms_md_kdev {
39812 + * structure definition to use with MD_GET_ARRAY_INFO
39814 +#define EVMS_MD_ARRAY_DEGRADED (1<<0)
39815 +#define EVMS_MD_ARRAY_SYNCING (1<<1)
39816 +struct evms_md_array_info {
39822 + * EVMS MD user/kernel communication
39823 + * @mddev_idx: md minor
39824 + * @cmd: command for personality
39825 + * @arg: specific command structure
39827 +struct evms_md_ioctl {
39835 diff -Naur linux-2002-09-30/include/linux/evms/evms_os2.h evms-2002-09-30/include/linux/evms/evms_os2.h
39836 --- linux-2002-09-30/include/linux/evms/evms_os2.h Wed Dec 31 18:00:00 1969
39837 +++ evms-2002-09-30/include/linux/evms/evms_os2.h Thu Aug 8 17:40:37 2002
39841 + * Copyright (c) International Business Machines Corp., 2000
39843 + * This program is free software; you can redistribute it and/or modify
39844 + * it under the terms of the GNU General Public License as published by
39845 + * the Free Software Foundation; either version 2 of the License, or
39846 + * (at your option) any later version.
39848 + * This program is distributed in the hope that it will be useful,
39849 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
39850 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
39851 + * the GNU General Public License for more details.
39853 + * You should have received a copy of the GNU General Public License
39854 + * along with this program; if not, write to the Free Software
39855 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
39857 + * Module: linux/include/linux/evms_os2.h
39861 + * Change History:
39866 + * Description: This module defines the disk structures used by the OS/2
39867 + * Logical Volume Manager, including that of the Master
39868 + * Boot Record (MBR) and Extended Boot Records (EBR).
39870 + * Notes: LVM Drive Letter Assignment Tables (DLA_Tables) appear on the
39871 + * last sector of each track containing a valid MBR or EBR. Since
39872 + * partitions must be track aligned, any track containing an MBR or
39873 + * EBR will be almost all empty sectors. We will grab the last
39874 + * of these empty sectors for our DLT_Tables.
39878 +#ifndef OS2LVM_INCLUDED__
39879 +#define OS2LVM_INCLUDED__
39881 +/* The following define the values used to indicate that a partition table entry is for an EBR, not a partition. */
39882 +#define EBR_BOOT_INDICATOR 0
39883 +#define EBR_FORMAT_INDICATOR 5
39885 +/* The following define is used as the default Format_Indicator for new non-primary partitions. */
39886 +#define NEW_LOGICAL_DRIVE_FORMAT_INDICATOR 0x6
39888 +/* The following define is used as the default Format_Indicator for a new non-active primary partitions. */
39889 +#define NEW_PRIMARY_PARTITION_FORMAT_INDICATOR 0x16
39891 +/* The following define is used as the default Format_Indicator for a new active primary partition. */
39892 +#define NEW_ACTIVE_PRIMARY_PARTITION_FORMAT_INDICATOR 0x06
39894 +/* The following define is used to hold the value of the Boot_Indicator for active partitions. */
39895 +#define ACTIVE_PARTITION 0x80
39897 +/* Define the size of a Partition Name. Partition Names are user defined names given to a partition. */
39898 +#define PARTITION_NAME_SIZE 20
39900 +/* Define the size of a volume name. Volume Names are user defined names given to a volume. */
39901 +#define VOLUME_NAME_SIZE 20
39903 +/* Define the size of a disk name. Disk Names are user defined names given to physical disk drives in the system. */
39904 +#define DISK_NAME_SIZE 20
39906 +/* The name of the filesystem in use on a partition. This name may be up to 12 ( + NULL terminator) characters long. */
39907 +#define FILESYSTEM_NAME_SIZE 20
39909 +/* The comment field is reserved but is not currently used. This is for future expansion and use. */
39910 +#define COMMENT_SIZE 81
39912 +/* Define the minimum number of sectors to reserve on the disk for Boot Manager. */
39913 +#define BOOT_MANAGER_SIZE 2048
39915 +#define OS2_BYTES_PER_SECTOR 512
39916 +#define OS2_SECTOR_SHIFT 9
39918 +/*--------------------------------------------------
39919 + * Type definitions
39920 + --------------------------------------------------*/
39922 +/* The following definitions define the drive letter assignment table used by LVM.
39923 + For each partition table on the disk, there will be a drive letter assignment table in the last sector
39924 + of the track containing the partition table. */
39926 +/* NOTE: DLA stands for Drive Letter Assignment. */
39928 +#define DLA_TABLE_SIGNATURE1 0x424D5202L
39929 +#define DLA_TABLE_SIGNATURE2 0x44464D50L
39931 +struct dla_entry { /* DE */
39932 + u32 Volume_Serial_Number; /* The serial number of the volume that this partition belongs to. */
39933 + u32 partition_serial; /* The serial number of this partition. */
39934 + u32 Partition_Size; /* The size of the partition, in sectors. */
39935 + u32 Partition_Start; /* The starting sector of the partition. */
39936 + unsigned char On_Boot_Manager_Menu; /* Set to TRUE if this volume/partition is on the Boot Manager Menu. */
39937 + unsigned char Installable; /* Set to TRUE if this volume is the one to install the operating system on. */
39938 + char Drive_Letter; /* The drive letter assigned to the partition. */
39939 + unsigned char Reserved;
39940 + char Volume_Name[VOLUME_NAME_SIZE]; /* The name assigned to the volume by the user. */
39941 + char Partition_Name[PARTITION_NAME_SIZE]; /* The name assigned to the partition. */
39944 +struct dla_table_sector { /* DTS */
39945 + u32 DLA_Signature1; /* The magic signature (part 1) of a Drive Letter Assignment Table. */
39946 + u32 DLA_Signature2; /* The magic signature (part 2) of a Drive Letter Assignment Table. */
39947 + u32 DLA_CRC; /* The 32 bit CRC for this sector. Calculated assuming that this field and all unused space in the sector is 0. */
39948 + u32 Disk_Serial_Number; /* The serial number assigned to this disk. */
39949 + u32 Boot_Disk_Serial_Number; /* The serial number of the disk used to boot the system. This is for conflict resolution when multiple volumes
39950 + want the same drive letter. Since LVM.EXE will not let this situation happen, the only way to get this situation
39951 + is for the disk to have been altered by something other than LVM.EXE, or if a disk drive has been moved from one
39952 + machine to another. If the drive has been moved, then it should have a different Boot_Disk_Serial_Number. Thus,
39953 + we can tell which disk drive is the "foreign" drive and therefore reject its claim for the drive letter in question.
39954 + If we find that all of the claimants have the same Boot_Disk_Serial_Number, then we must assign drive letters on
39955 + a first come, first serve basis. */
39956 + u32 Install_Flags; /* Used by the Install program. */
39958 + u32 Heads_Per_Cylinder;
39959 + u32 Sectors_Per_Track;
39960 + char Disk_Name[DISK_NAME_SIZE]; /* The name assigned to the disk containing this sector. */
39961 + unsigned char Reboot; /* For use by Install. Used to keep track of reboots initiated by install. */
39962 + unsigned char Reserved[3]; /* Alignment. */
39963 + struct dla_entry DLA_Array[4]; /* These are the four entries which correspond to the entries in the partition table. */
39966 +/* The following definitions define the LVM signature sector which will appear as the last sector in an LVM partition. */
39968 +#define OS2LVM_PRIMARY_SIGNATURE 0x4A435332L
39969 +#define OS2LVM_SECONDARY_SIGNATURE 0x4252444BL
39971 +#define CURRENT_OS2LVM_MAJOR_VERSION_NUMBER 2 /* Define as appropriate. */
39972 +#define CURRENT_OS2LVM_MINOR_VERSION_NUMBER 0 /* Define as appropriate. */
39974 +/* The following definitions limit the number of LVM features that can be applied to a volume, as well as defining a "NULL" feature for use in feature table entries that are not being used. */
39975 +#define OS2LVM_MAX_FEATURES_PER_VOLUME 10 /* The maximum number of LVM features that can be applied to a volume. */
39976 +#define OS2LVM_NULL_FEATURE 0 /* No feature. Used in all unused entries of the feature array in the LVM Signature sector. */
39978 +/* The following structure is used to hold the location of the feature specific data for LVM features. */
39979 +typedef struct _LVM_Feature_Data { /* LFD */
39980 + u32 Feature_ID; /* The ID of the feature. */
39981 + u32 Location_Of_Primary_Feature_Data; /* The u32 of the starting sector of the private data for this feature. */
39982 + u32 Location_Of_Secondary_Feature_Data; /* The u32 of the starting sector of the backup copy of the private data for this feature. */
39983 + u32 Feature_Data_Size; /* The number of sectors used by this feature for its private data. */
39984 + u16 Feature_Major_Version_Number; /* The integer portion of the version number of this feature. */
39985 + u16 Feature_Minor_Version_Number; /* The decimal portion of the version number of this feature. */
39986 + unsigned char Feature_Active; /* TRUE if this feature is active on this partition/volume, FALSE otherwise. */
39987 + unsigned char Reserved[3]; /* Alignment. */
39988 +} LVM_Feature_Data;
39990 +/* The following structure defines the LVM Signature Sector. This is the last sector of every partition which is part of an LVM volume. It gives vital
39991 + information about the version of LVM used to create the LVM volume that it is a part of, as well as which LVM features (BBR, drive linking, etc.) are
39992 + active on the volume that this partition is a part of. */
39993 +typedef struct _LVM_Signature_Sector { /* LSS */
39994 + u32 LVM_Signature1; /* The first part of the magic LVM signature. */
39995 + u32 LVM_Signature2; /* The second part of the magic LVM signature. */
39996 + u32 Signature_Sector_CRC; /* 32 bit CRC for this sector. Calculated using 0 for this field. */
39997 + u32 partition_serial; /* The LVM assigned serial number for this partition. */
39998 + u32 Partition_Start; /* u32 of the first sector of this partition. */
39999 + u32 Partition_End; /* u32 of the last sector of this partition. */
40000 + u32 Partition_Sector_Count; /* The number of sectors in this partition. */
40001 + u32 LVM_Reserved_Sector_Count; /* The number of sectors reserved for use by LVM. */
40002 + u32 Partition_Size_To_Report_To_User; /* The size of the partition as the user sees it - i.e. (the actual size of the partition - LVM reserved sectors) rounded to a track boundary. */
40003 + u32 Boot_Disk_Serial_Number; /* The serial number of the boot disk for the system. If the system contains Boot Manager, then this is the serial number of the disk containing the active copy of Boot Manager. */
40004 + u32 Volume_Serial_Number; /* The serial number of the volume that this partition belongs to. */
40005 + u32 Fake_EBR_Location; /* The location, on disk, of a Fake EBR, if one has been allocated. */
40006 + u16 LVM_Major_Version_Number; /* Major version number of the LVM that created this partition. */
40007 + u16 LVM_Minor_Version_Number; /* Minor version number of the LVM that created this partition. */
40008 + char Partition_Name[PARTITION_NAME_SIZE]; /* User defined partition name. */
40009 + char Volume_Name[VOLUME_NAME_SIZE]; /* The name of the volume that this partition belongs to. */
40010 + LVM_Feature_Data LVM_Feature_Array[OS2LVM_MAX_FEATURES_PER_VOLUME]; /* The feature array. This indicates which LVM features, if any, are active on this volume
40011 + and what order they should be applied in. */
40012 + char Drive_Letter; /* The drive letter assigned to the volume that this partition is part of. */
40013 + unsigned char Fake_EBR_Allocated; /* If TRUE, then a fake EBR has been allocated. */
40014 + char Comment[COMMENT_SIZE]; /* User comment. */
40015 + char Disk_Name[DISK_NAME_SIZE]; /* Added to allow BBR to report the name of a disk when bad sectors are encountered on that disk. */
40016 + u32 Sequence_Number; /* This indicates the order that partitions within a volume are used. This number is 1 based. A 0 here indicates that the volume was made by LVM Ver. 1. */
40017 + u32 Next_Aggregate_Number; /* Used during volume creation and expansion when creating unique names for aggregates. */
40018 + /* The remainder of the sector is reserved for future use and should be all zero or else the CRC will not come out correctly. */
40019 +} LVM_Signature_Sector;
40021 +/* The following definitions define the format of a partition table and the Master Boot Record (MBR). */
40022 +typedef struct _Partition_Record { /* PR */
40023 + unsigned char Boot_Indicator; /* 80h = active partition. */
40024 + unsigned char Starting_Head;
40025 + unsigned char Starting_Sector; /* Bits 0-5 are the sector. Bits 6 and 7 are the high order bits of the starting cylinder. */
40026 + unsigned char Starting_Cylinder; /* The cylinder number is a 10 bit value. The high order bits of the 10 bit value come from bits 6 & 7 of the Starting_Sector field. */
40027 + unsigned char Format_Indicator; /* An indicator of the format/operation system on this partition. */
40028 + unsigned char Ending_Head;
40029 + unsigned char Ending_Sector;
40030 + unsigned char Ending_Cylinder;
40031 + u32 Sector_Offset; /* The number of sectors on the disk which are prior to the start of this partition. */
40032 + u32 Sector_Count; /* The number of sectors in this partition. */
40033 +} Partition_Record;
40035 +typedef struct _Master_Boot_Record { /* MBR */
40036 + unsigned char Reserved[446];
40037 + Partition_Record Partition_Table[4];
40038 + u16 Signature; /* AA55h in this field indicates that this is a valid partition table/MBR. */
40039 +} Master_Boot_Record;
40041 +typedef Master_Boot_Record Extended_Boot_Record;
40043 +/* The following definition covers the Boot Manager Alias Table in the EBR.
40045 + The Alias Table in the EBR has 2 entries in it, although only the first one is actually used. */
40046 +#define ALIAS_NAME_SIZE 8
40047 +typedef struct _AliasTableEntry { /* ATE */
40048 + unsigned char On_Boot_Manager_Menu;
40049 + char Name[ALIAS_NAME_SIZE];
40050 +} AliasTableEntry;
40052 +#define ALIAS_TABLE_OFFSET 0x18A
40055 +/* The following text is used for the Boot Manager Alias for items that were placed on the Boot Manager Menu by FDISK and
40056 + which have since been migrated to the new LVM format. This text is put into the Name field of an AliasTableEntry so
40057 + that, if FDISK ( or another program which understands the old Boot Manager Menu format) is run, it will display
40058 + something for those partitions/volumes which are on the Boot Manager Menu.
40060 + NOTE: This text must be exactly ALIAS_NAME_SIZE characters in length! */
40061 +#define ALIAS_TABLE_ENTRY_MIGRATION_TEXT "--> LVM "
40062 +#define ALIAS_TABLE_ENTRY_MIGRATION_TEXT2 "--> LVM*"
40066 +/* The following is the signature used for an Master Boot Record, an Extended Boot Record, and a Boot Sector. */
40067 +#define MBR_EBR_SIGNATURE 0xAA55
40069 +/* The following list of definitions defines the values of interest for the Format_Indicator in a Partition_Record. */
40070 +#define EBR_INDICATOR 0x5
40071 +#define WINDOZE_EBR_INDICATOR 0xF
40072 +#define UNUSED_INDICATOR 0x0
40073 +#define IFS_INDICATOR 0x7
40074 +#define FAT12_INDICATOR 0x1
40075 +#define FAT16_SMALL_PARTITION_INDICATOR 0x4
40076 +#define FAT16_LARGE_PARTITION_INDICATOR 0x6
40077 +#define BOOT_MANAGER_HIDDEN_PARTITION_FLAG 0x10
40078 +#define LVM_PARTITION_INDICATOR 0x35
40079 +#define BOOT_MANAGER_INDICATOR 0x0A
40081 +/* The following is the signature used in the Boot Sector for Boot Manager. */
40082 +#define OS2LVM_BOOT_MANAGER_SIGNATURE "APJ&WN"
40084 +/* The following is used for determining the synthetic geometry reported for Volumes employing drive linking. */
40085 +#define OS2LVM_SYNTHETIC_SECTORS_PER_TRACK 63
40087 +/*--------------------------------------------------
40088 + * Declares for Drive Linking feature:
40089 + *--------------------------------------------------*/
40091 +/* The following defines uniquely identify Drive Linking. */
40092 +#define DRIVE_LINKING_FEATURE_ID 100
40093 +#define DRIVE_LINKING_MAJOR_VERSION 1
40094 +#define DRIVE_LINKING_MINOR_VERSION 0
40096 +/* The following definitions are used for the disk structures supporting drive linking. */
40098 +#define LINK_TABLE_MASTER_SIGNATURE 0x434E4157L
40099 +#define LINK_TABLE_SIGNATURE 0X4D4D5652L
40101 +#define MAXIMUM_LINKS 246
40103 +#define DRIVE_LINKING_RESERVED_SECTOR_COUNT 4
40105 +#define LINKS_IN_FIRST_SECTOR 60
40107 +#define LINKS_IN_NEXT_SECTOR 62
40109 +struct drive_link {
40110 + u32 drive_serial;
40111 + u32 partition_serial;
40114 +struct link_table_first_sector {
40115 + u32 Link_Table_Signature; /* Use the LINK_TABLE_MASTER_SIGNATURE here. */
40116 + u32 Link_Table_CRC;
40117 + u32 Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match. */
40118 + u32 Links_In_Use;
40119 + struct drive_link Link_Table[LINKS_IN_FIRST_SECTOR];
40122 +struct link_table_sector {
40123 + u32 Link_Table_Signature; /* Use LINK_TABLE_SIGNATURE here. */
40124 + u32 Link_Table_CRC;
40125 + u32 Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match. */
40126 + struct drive_link Link_Table[LINKS_IN_NEXT_SECTOR];
40129 +/*--------------------------------------------------
40130 + * Declares for Bad Block Relocation feature:
40131 + *--------------------------------------------------*/
40133 +/* The following definition is the numeric ID for Bad Block Relocation. */
40134 +#define BBR_FEATURE_ID 101
40136 +#define BBR_FEATURE_MAJOR_VERSION 0x0001
40137 +#define BBR_FEATURE_MINOR_VERSION 0x0000
40139 +/* The following definitions are used for the disk structures supporting bad block relocation. */
40141 +/* NOTE: BBR stands for Bad Block Relocation. */
40143 +#define BBR_TABLE_MASTER_SIGNATURE 0x00726D62
40144 +#define BBR_TABLE_SIGNATURE 0x01726276
40146 +struct bbr_table_entry {
40148 + u32 ReplacementSector;
40151 +typedef struct _LVM_BBR_Table_First_Sector {
40152 +	u32 Signature;	/* Identifies the first sector of the BBR Table; must be BBR_TABLE_MASTER_SIGNATURE. */
40153 +	u32 CRC;	/* CRC for this sector. */
40154 +	u32 Sequence_Number;	/* Used to resolve conflicts when the primary and secondary tables do not match. */
40155 +	u32 Table_Size;	/* The total number of bbr_table_entry slots in the BBR Table. */
40156 +	u32 Table_Entries_In_Use;	/* The number of BBR Table entries which currently hold remappings. */
40157 +	u32 Sectors_Per_Table;	/* The number of LVM_BBR_Table_Sectors used to hold the BBR Table. */
40158 +	u32 First_Replacement_Sector;	/* The location of the first replacement sector. */
40159 +	u32 Last_Replacement_Sector;	/* The location of the last replacement sector. */
40160 +	u32 Replacement_Sector_Count;	/* The number of replacement sectors available. */
40161 +	u32 Flags;	/* Flags global to the Bad Block Relocation Feature (see the BBR_Flag_* definitions). */
40162 +} LVM_BBR_Table_First_Sector;
40164 +/* Flags for LVM_BBR_Table_First_Sector */
40165 +#define BBR_Flag_Write_Verify 0x00000001 /* Indicate convert Write I/O to Write/Verify */
40167 +#define BBR_TABLE_ENTRIES_PER_SECTOR 62
40169 +typedef struct _LVM_BBR_Table_Sector {
40170 +	u32 Signature;	/* Identifies a sector of the BBR_Table which is not the first sector of the BBR Table; must be BBR_TABLE_SIGNATURE. */
40171 +	u32 CRC;	/* CRC for this sector of the BBR Table. */
40172 +	u32 Sequence_Number;	/* Used to resolve conflicts when the primary and secondary tables do not match. */
40173 +	struct bbr_table_entry BBR_Table[BBR_TABLE_ENTRIES_PER_SECTOR];	/* The remapping entries held in this sector. */
40174 +	u32 reserved1;	/* Unused; present only for block alignment. */
40175 +} LVM_BBR_Table_Sector;
40178 +// Combined structure to hold entire BBR feature data as it exists on disk.
40179 +typedef struct _LVM_BBR_Feature {
40180 + LVM_BBR_Table_First_Sector control;
40181 + char reserved1[OS2_BYTES_PER_SECTOR -
40182 + sizeof (LVM_BBR_Table_First_Sector)];
40183 + LVM_BBR_Table_Sector remap[1];
40187 +/* The following defines establish the minimum and maximum number of replacement sectors which can be allocated for
40188 + Bad Block Relocation. Otherwise, 1 replacement sector per MB of disk space is allocated. */
40189 +#define BBR_FLOOR 62
40190 +#define BBR_LIMIT 4096
40192 +// In-memory Meta Data for Bad Block Relocation
40193 +// In-memory Meta Data for Drive Linking
40194 +struct os2_dl_entry {
40195 + u64 start_sector;
40196 + u64 sector_count;
40197 + u64 dl_lsn1; /* LSN of first on-disk copy of drive linking data. */
40198 + u64 dl_lsn2; /* LSN of the second on-disk copy of drive linking data. */
40200 + u32 partition_serial;
40201 + u64 bbr_lsn1; /* LSN of the first on-disk copy of the BBR data. */
40202 + u64 bbr_lsn2; /* LSN of the second on-disk copy of the BBR data. */
40203 + u32 bbr_feature_size; /* # of sectors of BBR data. */
40204 + u32 bbr_is_active;
40205 + struct semaphore bbr_table_lock; /* Used to serialize writers */
40206 + unsigned int guard1; /* Lamport's Theorem for mutual exclusion */
40208 + unsigned int guard2; /* Lamport's Theorem for mutual exclusion */
40209 + struct evms_logical_node *link_partition;
40210 + struct os2_dl_entry *next;
40213 +// In-memory Meta Data for each OS/2 LVM Volume:
40214 +typedef struct os2_volume_runtime_entry_s {
40216 + u32 Export_Needed;
40217 + u64 size_in_sectors;
40218 + u32 Volume_Serial_Number;
40219 + u32 drive_link_count;
40220 + struct os2_dl_entry *drive_link;
40221 + struct evms_logical_node *next_os2lvm_node;
40222 +} os2_volume_runtime_entry_t;
40225 diff -Naur linux-2002-09-30/include/linux/evms/evms_raid0.h evms-2002-09-30/include/linux/evms/evms_raid0.h
40226 --- linux-2002-09-30/include/linux/evms/evms_raid0.h Wed Dec 31 18:00:00 1969
40227 +++ evms-2002-09-30/include/linux/evms/evms_raid0.h Tue Aug 6 01:03:24 2002
40229 +#ifndef _EVMS_RAID0_INCL_
40230 +#define _EVMS_RAID0_INCL_
40232 +#include <linux/evms/evms_md.h>
40236 + unsigned long zone_offset; /* Zone offset (in sectors) in md_dev */
40237 + unsigned long dev_offset; /* Zone offset (in sectors) in real dev */
40238 + unsigned long size_in_sects; /* Zone size in sectors */
40239 + int nb_dev; /* # of devices attached to the zone */
40240 + struct evms_logical_node *node[MD_SB_DISKS]; /* EVMS nodes attached to the zone */
40245 + struct strip_zone *zone0, *zone1;
40248 +struct raid0_private_data
40250 + struct raid0_hash *hash_table; /* Dynamically allocated */
40251 + struct strip_zone *strip_zone; /* This one too */
40252 + int nr_strip_zones;
40253 + struct strip_zone *smallest;
40257 +typedef struct raid0_private_data raid0_conf_t;
40259 +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
40262 diff -Naur linux-2002-09-30/include/linux/evms/evms_raid1.h evms-2002-09-30/include/linux/evms/evms_raid1.h
40263 --- linux-2002-09-30/include/linux/evms/evms_raid1.h Wed Dec 31 18:00:00 1969
40264 +++ evms-2002-09-30/include/linux/evms/evms_raid1.h Tue Aug 6 01:03:24 2002
40266 +#ifndef _EVMS_RAID1_H
40267 +#define _EVMS_RAID1_H
40269 +#include <linux/evms/evms_md.h>
40271 +struct mirror_info {
40274 + struct evms_logical_node *node;
40277 + int head_position;
40289 +struct raid1_private_data {
40291 + struct mirror_info mirrors[MD_SB_DISKS];
40294 + int working_disks;
40296 + unsigned long next_sect;
40298 + struct evms_thread *thread, *resync_thread;
40299 + int resync_mirrors;
40300 + struct mirror_info *spare;
40301 + md_spinlock_t device_lock;
40303 + /* buffer pool */
40304 + /* buffer_heads that we have pre-allocated have b_pprev -> &freebh
40305 + * and are linked into a stack using b_next
40306 + * raid1_bh that are pre-allocated have R1BH_PreAlloc set.
40307 + * All these variable are protected by device_lock
40309 + struct buffer_head *freebh;
40310 + int freebh_cnt; /* how many are on the list */
40311 + int freebh_blocked;
40312 + struct raid1_bh *freer1;
40313 + int freer1_blocked;
40315 + struct raid1_bh *freebuf; /* each bh_req has a page allocated */
40316 + md_wait_queue_head_t wait_buffer;
40318 + /* for use when syncing mirrors: */
40319 + unsigned long start_active, start_ready,
40320 + start_pending, start_future;
40321 + int cnt_done, cnt_active, cnt_ready,
40322 + cnt_pending, cnt_future;
40325 + md_wait_queue_head_t wait_done;
40326 + md_wait_queue_head_t wait_ready;
40327 + md_spinlock_t segment_lock;
40330 +typedef struct raid1_private_data raid1_conf_t;
40333 + * this is the only point in the RAID code where we violate
40334 + * C type safety. mddev->private is an 'opaque' pointer.
40336 +#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private)
40339 + * this is our 'private' 'collective' RAID1 buffer head.
40340 + * it contains information about what kind of IO operations were started
40341 + * for this RAID1 operation, and about their status:
40345 + atomic_t remaining; /* 'have we finished' count,
40346 + * used from IRQ handlers
40349 + unsigned long state;
40351 + struct buffer_head *master_bh;
40352 + struct buffer_head *mirror_bh_list;
40353 + struct buffer_head bh_req;
40354 + struct evms_logical_node *node; /* map to evms node (READ only) */
40355 + struct raid1_bh *next_r1; /* next for retry or in free list */
40358 +typedef struct raid1_sync_cb_s {
40360 + atomic_t io_count;
40361 + md_wait_queue_head_t wait;
40362 +} raid1_sync_cb_t;
40364 +/* bits for raid1_bh.state */
40365 +#define R1BH_Uptodate 1
40366 +#define R1BH_SyncPhase 2
40367 +#define R1BH_PreAlloc 3 /* this was pre-allocated, add to free list */
40369 diff -Naur linux-2002-09-30/include/linux/evms/evms_raid5.h evms-2002-09-30/include/linux/evms/evms_raid5.h
40370 --- linux-2002-09-30/include/linux/evms/evms_raid5.h Wed Dec 31 18:00:00 1969
40371 +++ evms-2002-09-30/include/linux/evms/evms_raid5.h Tue Aug 6 01:03:23 2002
40376 +#include <linux/evms/evms_md.h>
40377 +#include <linux/evms/evms_xor.h>
40381 + * Each stripe contains one buffer per disc. Each buffer can be in
40382 + * one of a number of states determined by bh_state. Changes between
40383 + * these states happen *almost* exclusively under a per-stripe
40384 + * spinlock. Some very specific changes can happen in b_end_io, and
40385 + * these are not protected by the spin lock.
40387 + * The bh_state bits that are used to represent these states are:
40388 + * BH_Uptodate, BH_Lock
40390 + * State Empty == !Uptodate, !Lock
40391 + * We have no data, and there is no active request
40392 + * State Want == !Uptodate, Lock
40393 + * A read request is being submitted for this block
40394 + * State Dirty == Uptodate, Lock
40395 + * Some new data is in this buffer, and it is being written out
40396 + * State Clean == Uptodate, !Lock
40397 + * We have valid data which is the same as on disc
40399 + * The possible state transitions are:
40401 + * Empty -> Want - on read or write to get old data for parity calc
40402 + * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
40403 + * Empty -> Clean - on compute_block when computing a block for failed drive
40404 + * Want -> Empty - on failed read
40405 + * Want -> Clean - on successful completion of read request
40406 + * Dirty -> Clean - on successful completion of write request
40407 + * Dirty -> Clean - on failed write
40408 + * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
40410 + * The Want->Empty, Want->Clean, Dirty->Clean, transitions
40411 + * all happen in b_end_io at interrupt time.
40412 + * Each sets the Uptodate bit before releasing the Lock bit.
40413 + * This leaves one multi-stage transition:
40414 + * Want->Dirty->Clean
40415 + * This is safe because thinking that a Clean buffer is actually dirty
40416 + * will at worst delay some action, and the stripe will be scheduled
40417 + * for attention after the transition is complete.
40419 + * There is one possibility that is not covered by these states. That
40420 + * is if one drive has failed and there is a spare being rebuilt. We
40421 + * can't distinguish between a clean block that has been generated
40422 + * from parity calculations, and a clean block that has been
40423 + * successfully written to the spare ( or to parity when resyncing).
40424 + * To distinguish these states we have a stripe bit STRIPE_INSYNC that
40425 + * is set whenever a write is scheduled to the spare, or to the parity
40426 + * disc if there is no spare. A sync request clears this bit, and
40427 + * when we find it set with no buffers locked, we know the sync is
40430 + * Buffers for the md device that arrive via make_request are attached
40431 + * to the appropriate stripe in one of two lists linked on b_reqnext.
40432 + * One list (bh_read) for read requests, one (bh_write) for write.
40433 + * There should never be more than one buffer on the two lists
40434 + * together, but we are not guaranteed of that so we allow for more.
40436 + * If a buffer is on the read list when the associated cache buffer is
40437 + * Uptodate, the data is copied into the read buffer and its b_end_io
40438 + * routine is called. This may happen in the end_request routine only
40439 + * if the buffer has just successfully been read. end_request should
40440 + * remove the buffers from the list and then set the Uptodate bit on
40441 + * the buffer. Other threads may do this only if they first check
40442 + * that the Uptodate bit is set. Once they have checked that they may
40443 + * take buffers off the read queue.
40445 + * When a buffer on the write list is committed for write it is copied
40446 + * into the cache buffer, which is then marked dirty, and moved onto a
40447 + * third list, the written list (bh_written). Once both the parity
40448 + * block and the cached buffer are successfully written, any buffer on
40449 + * a written list can be returned with b_end_io.
40451 + * The write list and read list both act as fifos. The read list is
40452 + * protected by the device_lock. The write and written lists are
40453 + * protected by the stripe lock. The device_lock, which can be
40454 + * claimed while the stripe lock is held, is only for list
40455 + * manipulations and will only be held for a very short time. It can
40456 + * be claimed from interrupts.
40459 + * Stripes in the stripe cache can be on one of two lists (or on
40460 + * neither). The "inactive_list" contains stripes which are not
40461 + * currently being used for any request. They can freely be reused
40462 + * for another stripe. The "handle_list" contains stripes that need
40463 + * to be handled in some way. Both of these are fifo queues. Each
40464 + * stripe is also (potentially) linked to a hash bucket in the hash
40465 + * table so that it can be found by sector number. Stripes that are
40466 + * not hashed must be on the inactive_list, and will normally be at
40467 + * the front. All stripes start life this way.
40469 + * The inactive_list, handle_list and hash bucket lists are all protected by the
40471 + * - stripes on the inactive_list never have their stripe_lock held.
40472 + * - stripes have a reference counter. If count==0, they are on a list.
40473 + * - If a stripe might need handling, STRIPE_HANDLE is set.
40474 + * - When refcount reaches zero, then if STRIPE_HANDLE it is put on
40475 + * handle_list else inactive_list
40477 + * This, combined with the fact that STRIPE_HANDLE is only ever
40478 + * cleared while a stripe has a non-zero count means that if the
40479 + * refcount is 0 and STRIPE_HANDLE is set, then it is on the
40480 + * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then
40481 + * the stripe is on inactive_list.
40483 + * The possible transitions are:
40484 + * activate an unhashed/inactive stripe (get_active_stripe())
40485 + * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
40486 + * activate a hashed, possibly active stripe (get_active_stripe())
40487 + * lockdev check-hash if(!cnt++)unlink-stripe unlockdev
40488 + * attach a request to an active stripe (add_stripe_bh())
40489 + * lockdev attach-buffer unlockdev
40490 + * handle a stripe (handle_stripe())
40491 + * lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io
40492 + * release an active stripe (release_stripe())
40493 + * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
40495 + * The refcount counts each thread that have activated the stripe,
40496 + * plus raid5d if it is handling it, plus one for each active request
40497 + * on a cached buffer.
40499 +struct stripe_head {
40500 + struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
40501 + struct list_head lru; /* inactive_list or handle_list */
40502 + struct raid5_private_data *raid_conf;
40503 + struct buffer_head *bh_cache[MD_SB_DISKS]; /* buffered copy */
40504 + struct buffer_head *bh_read[MD_SB_DISKS]; /* read request buffers of the MD device */
40505 + struct buffer_head *bh_write[MD_SB_DISKS]; /* write request buffers of the MD device */
40506 + struct buffer_head *bh_written[MD_SB_DISKS]; /* write request buffers of the MD device that have been scheduled for write */
40507 + struct page *bh_page[MD_SB_DISKS]; /* saved bh_cache[n]->b_page when reading around the cache */
40508 + struct evms_logical_node *node[MD_SB_DISKS]; /* the target device node */
40509 + unsigned long sector; /* sector of this row */
40510 + int size; /* buffers size */
40511 + int pd_idx; /* parity disk index */
40512 + unsigned long state; /* state flags */
40513 + atomic_t count; /* nr of active thread/requests */
40522 +#define RECONSTRUCT_WRITE 1
40523 +#define READ_MODIFY_WRITE 2
40524 +/* not a write method, but a compute_parity mode */
40525 +#define CHECK_PARITY 3
40530 +#define STRIPE_ERROR 1
40531 +#define STRIPE_HANDLE 2
40532 +#define STRIPE_SYNCING 3
40533 +#define STRIPE_INSYNC 4
40534 +#define STRIPE_PREREAD_ACTIVE 5
40535 +#define STRIPE_DELAYED 6
40540 + * To improve write throughput, we need to delay the handling of some
40541 + * stripes until there has been a chance that several write requests
40542 + * for the one stripe have all been collected.
40543 + * In particular, any write request that would require pre-reading
40544 + * is put on a "delayed" queue until there are no stripes currently
40545 + * in a pre-read phase. Further, if the "delayed" queue is empty when
40546 + * a stripe is put on it then we "plug" the queue and do not process it
40547 + * until an unplug call is made. (the tq_disk list is run).
40549 + * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
40550 + * it to the count of prereading stripes.
40551 + * When write is initiated, or the stripe refcnt == 0 (just in case) we
40552 + * clear the PREREAD_ACTIVE flag and decrement the count
40553 + * Whenever the delayed queue is empty and the device is not plugged, we
40554 + * move any stripes from delayed to handle and clear the DELAYED flag and set PREREAD_ACTIVE.
40555 + * In stripe_handle, if we find pre-reading is necessary, we do it if
40556 + * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
40557 + * HANDLE gets cleared if stripe_handle leaves nothing locked.
40561 +struct disk_info {
40563 + struct evms_logical_node *node;
40572 +struct raid5_private_data {
40573 + struct stripe_head **stripe_hashtbl;
40575 + struct evms_thread *thread, *resync_thread;
40576 + struct disk_info disks[MD_SB_DISKS];
40577 + struct disk_info *spare;
40579 + int chunk_size, level, algorithm;
40580 + int raid_disks, working_disks, failed_disks;
40581 + int resync_parity;
40582 + int max_nr_stripes;
40584 + struct list_head handle_list; /* stripes needing handling */
40585 + struct list_head delayed_list; /* stripes that have plugged requests */
40586 + atomic_t preread_active_stripes; /* stripes with scheduled io */
40588 + * Free stripes pool
40590 + atomic_t active_stripes;
40591 + struct list_head inactive_list;
40592 + md_wait_queue_head_t wait_for_stripe;
40593 + int inactive_blocked; /* release of inactive stripes blocked,
40594 + * waiting for 25% to be free
40596 + md_spinlock_t device_lock;
40599 + struct tq_struct plug_tq;
40602 +typedef struct raid5_private_data raid5_conf_t;
40604 +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
40607 + * Our supported algorithms
40609 +#define ALGORITHM_LEFT_ASYMMETRIC 0
40610 +#define ALGORITHM_RIGHT_ASYMMETRIC 1
40611 +#define ALGORITHM_LEFT_SYMMETRIC 2
40612 +#define ALGORITHM_RIGHT_SYMMETRIC 3
40615 +#define EVMS_MD_RAID5_INIT_IO 1
40617 +struct r5_sync_io {
40624 diff -Naur linux-2002-09-30/include/linux/evms/evms_snapshot.h evms-2002-09-30/include/linux/evms/evms_snapshot.h
40625 --- linux-2002-09-30/include/linux/evms/evms_snapshot.h Wed Dec 31 18:00:00 1969
40626 +++ evms-2002-09-30/include/linux/evms/evms_snapshot.h Wed Sep 25 15:05:19 2002
40628 +/* -*- linux-c -*- */
40630 + * Copyright (c) International Business Machines Corp., 2000
40632 + * This program is free software; you can redistribute it and/or modify
40633 + * it under the terms of the GNU General Public License as published by
40634 + * the Free Software Foundation; either version 2 of the License, or
40635 + * (at your option) any later version.
40637 + * This program is distributed in the hope that it will be useful,
40638 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
40639 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
40640 + * the GNU General Public License for more details.
40642 + * You should have received a copy of the GNU General Public License
40643 + * along with this program; if not, write to the Free Software
40644 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
40647 + * linux/include/linux/evms_snapshot.h
40649 + * EVMS Snapshot Feature kernel header file
40652 +#ifndef __EVMS_SNAPSHOT_INCLUDED__
40653 +#define __EVMS_SNAPSHOT_INCLUDED__
40655 +#define EVMS_SNAPSHOT_VERSION_MAJOR 2
40656 +#define EVMS_SNAPSHOT_VERSION_MINOR 1
40657 +#define EVMS_SNAPSHOT_VERSION_PATCHLEVEL 1
40659 +#define EVMS_SNAPSHOT_FEATURE_ID 104
40661 +#define EVMS_SNAPSHOT_SIGNATURE 0x536e4170 /* SnAp */
40662 +#define EVMS_ORIGINAL_SIGNATURE 0x4f724967 /* OrIg */
40663 +#define MAX_HASH_CHAIN_ENTRIES 10
40665 +/* Status flags */
40666 +#define EVMS_SNAPSHOT 0x001
40667 +#define EVMS_SNAPSHOT_ORG 0x002
40668 +#define EVMS_SNAPSHOT_DISABLED 0x004
40669 +#define EVMS_SNAPSHOT_FULL 0x008
40670 +#define EVMS_SNAPSHOT_QUIESCED 0x010
40671 +#define EVMS_SNAPSHOT_WRITEABLE 0x020
40672 +#define EVMS_SNAPSHOT_ASYNC 0x040
40673 +#define EVMS_SNAPSHOT_ROLLBACK 0x080
40674 +#define EVMS_SNAPSHOT_ROLLBACK_COMP 0x100
40675 +#define EVMS_SNAPSHOT_DISABLED_PENDING 0x200
40677 +/* Private ioctl commands */
40678 +#define SNAPSHOT_QUERY_PERCENT_FULL 1
40679 +#define SNAPSHOT_START_ROLLBACK 2
40680 +#define SNAPSHOT_CHECK_STATE 3
40682 +/* Chunk states - for async mode */
40683 +#define SNAP_CHUNK_COPYING 1 /* Chunk is being copied from org to snap. */
40684 +#define SNAP_CHUNK_COPIED 0 /* Chunk has been copied from org to snap. */
40686 +#define SNAPSHOT_DEFAULT_CHUNK_SIZE 128 /* sectors == 64k */
40687 +#define SNAPSHOT_MIN_CHUNK_SIZE 16 /* 8kB */
40688 +#define SNAPSHOT_MAX_CHUNK_SIZE 2048 /* 1MB */
40689 +#define SNAPSHOT_CHUNK_BUFFER_SIZE 128 /* copy buffer */
40691 +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,9)
40692 +#define min(a,b) (((a)<(b))?(a):(b))
40696 + * struct snapshot_metadata
40698 + * @signature: 0 : EVMS_SNAPSHOT_SIGNATURE
40700 + * @version: 8 : Major, minor, patchlevel
40701 + * @flags: 20 : EVMS_SNAPSHOT_*
40702 + * @original_volume: 24 : Name of volume being snapshotted.
40703 + * @original_size: 152: In sectors.
40704 + * @lba_of_COW_table: 160:
40705 + * @lba_of_first_chunk: 168:
40706 + * @chunk_size: 176: In sectors
40707 + * @total_chunks: 180:
40709 + * On-disk metadata sector for EVMS Snapshot feature.
40711 +struct snapshot_metadata {
40714 + struct evms_version version;
40716 + u8 original_volume[128];
40717 + u64 original_size;
40718 + u64 lba_of_COW_table;
40719 + u64 lba_of_first_chunk;
40721 + u32 total_chunks;
40725 + * struct snapshot_hash_entry
40727 + * @org_chunk: Chunk number, not LBA.
40728 + * @snap_chunk: Chunk_number, not LBA.
40729 + * @chunk_state: SNAP_CHUNK_*
40730 + * @chunk_state_lock: Protects access to chunk_state
40731 + * @snap_io: In async mode, the control-block for copying this chunk.
40735 + * Entries in the snapshot remapping hash-table.
40737 +struct snapshot_hash_entry {
40741 + spinlock_t chunk_state_lock;
40742 + struct async_snap_io * snap_io;
40743 + struct snapshot_hash_entry * next;
40744 + struct snapshot_hash_entry * prev;
40748 + * struct snapshot_volume
40750 + * @logical_node: Node below us.
40751 + * @exported_node: Node above us.
40752 + * @snapshot_org: The volume being snapshotted.
40753 + * @snapshot_next: List of volumes snapshotting this original.
40754 + * @snap_semaphore: On snapshots: protects access to the snapshot
40755 + * volume structure.
40756 + * On originals: protects the list of snapshots.
40757 + * @snapshot_map: Hash table of remapped chunks.
40758 + * @free_hash_list: List of pre-allocated hash entries.
40759 + * @chunk_size: In sectors.
40760 + * @chunk_shift: Shift value for chunk_size.
40761 + * @num_chunks: In this volume.
40762 + * @next_cow_entry: Index into current COW table sector.
40763 + * @current_cow_sector: Logical sector of current COW table.
40764 + * @next_free_chunk: Index of next free chunk (not LBA!).
40765 + * @hash_table_size: Size of the hash table for the remap.
40766 + * @flags: Status flags. EVMS_SNAPSHOT_*
40767 + * @cow_table: One sector's worth of COW tables.
40768 + * @async_io_thread: Thread for async copy-on-writes. Only on originals.
40769 + * @chunk_write_list: Lists and locks attached to the original.
40770 + * @chunk_write_list_lock:
40771 + * @org_pending_io_list:
40772 + * @org_pending_io_list_lock:
40773 + * @snap_pending_io_list:
40774 + * @snap_pending_io_list_lock:
40775 + * @cow_table_write_list: List and lock attached to the snapshot.
40776 + * @cow_table_write_list_lock:
40777 + * @rollback_thread: Thread for rollbacks. Only on snapshots.
40778 + * @chunk_data_buffer: Buffer for copying data during rollbacks.
40780 + * Private data for one snapshot volume or one original volume.
40782 +struct snapshot_volume {
40783 + struct evms_logical_node * logical_node;
40784 + struct evms_logical_node * exported_node;
40785 + struct snapshot_volume * snapshot_org;
40786 + struct snapshot_volume * snapshot_next;
40787 + struct rw_semaphore snap_semaphore;
40788 + struct snapshot_hash_entry ** snapshot_map;
40789 + struct snapshot_hash_entry * free_hash_list;
40793 + u32 next_cow_entry;
40794 + u64 current_cow_sector;
40795 + u32 next_free_chunk;
40796 + u32 hash_table_size;
40798 + u64 cow_table[64];
40799 + struct evms_thread * async_io_thread;
40800 + struct list_head chunk_write_list;
40801 + spinlock_t chunk_write_list_lock;
40802 + struct list_head org_pending_io_list;
40803 + spinlock_t org_pending_io_list_lock;
40804 + struct list_head snap_pending_io_list;
40805 + spinlock_t snap_pending_io_list_lock;
40806 + struct list_head cow_table_write_list;
40807 + spinlock_t cow_table_write_list_lock;
40808 +#ifdef SNAPSHOT_DEBUG
40809 + atomic_t cow_table_writes;
40810 + atomic_t cow_table_overlaps;
40812 + struct evms_thread * rollback_thread;
40813 + u8 * chunk_data_buffer;
40817 + * struct snap_io_buffer
40819 + * @bh: A pointer to the embedded buffer_head at the end.
40820 + * @buffer_private: Private data associated with this buffer.
40821 + * @buffer_next: List of snap_io_buffer's for one async_[org|snap]_io.
40822 + * @copy_next: List of buffers that will write the data that this
40823 + * buffer just read.
40824 + * @chunk_write_list: List for the thread to use to drive writes to the
40825 + * snapshot as part of a copy.
40826 + * @_bh: An embedded buffer_head. The b_private field will
40827 + * always point back at the snap_io_buffer.
40829 + * A wrapper around a buffer_head, to allow for the buffer to exist on the
40830 + * variety of lists used by snapshotting.
40832 +struct snap_io_buffer {
40833 + struct buffer_head * bh;
40834 + void * buffer_private;
40835 + struct snap_io_buffer * buffer_next;
40836 + struct snap_io_buffer * copy_next;
40837 + struct list_head chunk_write_list;
40838 + struct buffer_head _bh;
40841 +#define CHUNK_WRITE_ENTRY(lh) list_entry((lh), \
40842 + struct snap_io_buffer, \
40843 + chunk_write_list)
40846 + * struct async_snap_io
40848 + * @snap_volume: Snapshot volume that this chunk belongs to.
40849 + * @hash_table_entry: Hash table entry that this chunk belongs to.
40850 + * @org_io: Parent async I/O structure that contains list
40851 + * of read buffers.
40852 + * @pending_reads: List of pending read requests to the snapshot.
40853 + * @pending_writes: List of pending write requests to the snapshot.
40854 + * @copy_buffers: List of buffers to use to write this chunk to the
40856 + * @cow_table_buffer: Buffer for writing the cow table to disk.
40857 + * @snap_io_list_next: List of async_snap_io's for the parent async_org_io.
40858 + * @snap_pending_io_list: List of async_snap_io's to be processed by the thread.
40859 + * For each of these, the thread will process the contents
40860 + * of the pending_[reads|writes] lists.
40861 + * @cow_write_list: List of cow table writes to be processed by the thread.
40862 + * For each of these, the thread will process the
40863 + * cow_table_buffer.
40864 + * @write_count: Number of buffers remaining to write for this chunk
40865 + * (equal to the length of the copy_buffers list).
40866 + * @dev: Copy of the b_rdev field for this volume. Needed in
40867 + * order to tell EVMS about pending I/Os.
40869 + * Control structure that handles writing a single chunk to the snapshot during
40870 + * a copy-on-write.
40872 +struct async_snap_io {
40873 + struct snapshot_volume * snap_volume;
40874 + struct snapshot_hash_entry * hash_table_entry;
40875 + struct async_org_io * org_io;
40876 + struct buffer_head * pending_reads;
40877 + struct buffer_head * pending_writes;
40878 + struct snap_io_buffer * copy_buffers;
40879 + struct snap_io_buffer * cow_table_buffer;
40880 + struct async_snap_io * snap_io_list_next;
40881 + struct list_head snap_pending_io_list;
40882 + struct list_head cow_write_list;
40883 + atomic_t write_count;
40887 +#define SNAP_PENDING_IO_ENTRY(lh) list_entry((lh), \
40888 + struct async_snap_io, \
40889 + snap_pending_io_list)
40890 +#define COW_WRITE_ENTRY(lh) list_entry((lh), \
40891 + struct async_snap_io, \
40895 + * struct async_org_io
40897 + * @org_volume: Original volume that this chunk belongs to.
40898 + * @pending_writes: List of pending write requests to the original.
40899 + * @pending_writes_lock:Protect the pending_writes list.
40900 + * @copy_buffers:	List of buffers to use to read this chunk from the
40902 + * @snap_io_list: List of async_snap_io's that will write this chunk to
40904 + * @org_pending_io_list:List of async_org_io's to be processed by the thread.
40905 + * For each of these, the thread will process the contents
40906 + * of the pending_writes list.
40907 + * @copy_count: Number of snapshots remaining to write this chunk.
40908 + * @ref_count: = copy_count + 1. Needed to determine when the entire
40909 + * async I/O structure can be deallocated.
40910 + * @dev: Copy of the b_rdev field for this volume. Needed in
40911 + * order to tell EVMS about pending I/Os.
40913 +struct async_org_io {
40914 + struct snapshot_volume * org_volume;
40915 + struct buffer_head * pending_writes;
40916 + spinlock_t pending_writes_lock;
40917 + struct snap_io_buffer * copy_buffers;
40918 + struct async_snap_io * snap_io_list;
40919 + struct list_head org_pending_io_list;
40920 + atomic_t copy_count;
40921 + atomic_t ref_count;
40922 +#ifdef SNAPSHOT_DEBUG
40923 + struct async_org_io * debug_next_org_io;
40928 +#define ORG_PENDING_IO_ENTRY(lh) list_entry((lh), \
40929 + struct async_org_io, \
40930 + org_pending_io_list)
40932 +/* Debugging code */
40933 +#ifdef SNAPSHOT_DEBUG
40935 +#define DEBUG_CHECK_SNAP_IO(async_snap_io) \
40937 + if ( (async_snap_io)->pending_reads || \
40938 + (async_snap_io)->pending_writes ) { \
40943 +#define DEBUG_REMOVE_ORG_IO_FROM_LIST(async_org_io) \
40945 + struct async_org_io ** p_org_io; \
40946 + unsigned long flags; \
40947 + if ((async_org_io)->pending_writes) { \
40950 + spin_lock_irqsave(&debug_async_org_io_list_lock, flags); \
40951 + for ( p_org_io = &debug_async_org_io_list; *p_org_io; \
40952 + p_org_io = &(*p_org_io)->debug_next_org_io ) { \
40953 + if ( *p_org_io == (async_org_io) ) { \
40954 + *p_org_io = (async_org_io)->debug_next_org_io; \
40958 + (async_org_io)->debug_next_org_io = NULL; \
40959 + spin_unlock_irqrestore(&debug_async_org_io_list_lock, flags); \
40962 +#define DEBUG_ADD_ORG_IO_TO_LIST(async_org_io) \
40964 + unsigned long flags; \
40965 + spin_lock_irqsave(&debug_async_org_io_list_lock, flags); \
40966 + (async_org_io)->debug_next_org_io = debug_async_org_io_list; \
40967 + debug_async_org_io_list = (async_org_io); \
40968 + spin_unlock_irqrestore(&debug_async_org_io_list_lock, flags); \
40971 +#define DEBUG_INC_COW_TABLE_OVERLAPS(snap_volume) \
40972 + atomic_inc(&(snap_volume)->cow_table_overlaps)
40974 +#define DEBUG_INC_COW_TABLE_WRITES(snap_volume) \
40975 + atomic_inc(&(snap_volume)->cow_table_writes)
40977 +#else /* SNAPSHOT_DEBUG */
40979 +#define DEBUG_CHECK_SNAP_IO(async_snap_io)
40980 +#define DEBUG_REMOVE_ORG_IO_FROM_LIST(async_org_io)
40981 +#define DEBUG_ADD_ORG_IO_TO_LIST(async_org_io)
40982 +#define DEBUG_INC_COW_TABLE_OVERLAPS(snap_volume)
40983 +#define DEBUG_INC_COW_TABLE_WRITES(snap_volume)
40985 +#endif /* SNAPSHOT_DEBUG */
40987 +#endif /* __EVMS_SNAPSHOT_INCLUDED__ */
40989 diff -Naur linux-2002-09-30/include/linux/evms/evms_xor.h evms-2002-09-30/include/linux/evms/evms_xor.h
40990 --- linux-2002-09-30/include/linux/evms/evms_xor.h Wed Dec 31 18:00:00 1969
40991 +++ evms-2002-09-30/include/linux/evms/evms_xor.h Mon Feb 4 09:58:43 2002
40996 +#include <linux/evms/evms_md.h>
40998 +#define MAX_XOR_BLOCKS 5
41000 +extern void evms_md_xor_block(unsigned int count, struct buffer_head **bh_ptr);
41002 +struct xor_block_template {
41003 + struct xor_block_template *next;
41004 + const char *name;
41006 + void (*do_2)(unsigned long, unsigned long *, unsigned long *);
41007 + void (*do_3)(unsigned long, unsigned long *, unsigned long *,
41008 + unsigned long *);
41009 + void (*do_4)(unsigned long, unsigned long *, unsigned long *,
41010 + unsigned long *, unsigned long *);
41011 + void (*do_5)(unsigned long, unsigned long *, unsigned long *,
41012 + unsigned long *, unsigned long *, unsigned long *);
41016 diff -Naur linux-2002-09-30/include/linux/evms/ldev_mgr.h evms-2002-09-30/include/linux/evms/ldev_mgr.h
41017 --- linux-2002-09-30/include/linux/evms/ldev_mgr.h Wed Dec 31 18:00:00 1969
41018 +++ evms-2002-09-30/include/linux/evms/ldev_mgr.h Wed Aug 28 14:30:51 2002
41021 +/* -*- linux-c -*- */
41024 + * Copyright (c) International Business Machines Corp., 2000
41026 + * This program is free software; you can redistribute it and/or modify
41027 + * it under the terms of the GNU General Public License as published by
41028 + * the Free Software Foundation; either version 2 of the License, or
41029 + * (at your option) any later version.
41031 + * This program is distributed in the hope that it will be useful,
41032 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
41033 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
41034 + * the GNU General Public License for more details.
41036 + * You should have received a copy of the GNU General Public License
41037 + * along with this program; if not, write to the Free Software
41038 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
41041 +/* linux/driver/evms/ldev_mgr.h
41043 + * EVMS - Local Device (Hard Drive) Manager
41047 +/* plugin feature ID */
41048 +#define EVMS_LOCAL_DEVICE_MANAGER_ID 1
41050 +/* plugin ioctl feature command defines */
41051 +#define LDEV_MGR_BROADCAST_IOCTL_CMD 1
41054 + * struct ldev_plugin_ioctl - ldev mgr direct ioctl packet definition
41055 + * @disk_handle: handle identifying target disk
41056 + * @cmd: ioctl cmd
41057 + * @arg: ioctl argument
41059 + * local device manager direct ioctl packet definition
41061 +struct ldev_plugin_ioctl {