1diff -Naur linux-2002-09-30/drivers/evms/AIXlvm_vge.c evms-2002-09-30/drivers/evms/AIXlvm_vge.c
2--- linux-2002-09-30/drivers/evms/AIXlvm_vge.c Wed Dec 31 18:00:00 1969
3+++ evms-2002-09-30/drivers/evms/AIXlvm_vge.c Fri Sep 27 14:55:45 2002
4@@ -0,0 +1,3681 @@
5+/* -*- linux-c -*- */
6+
7+/*
8+ *
9+ *
10+ * Copyright (c) International Business Machines Corp., 2000
11+ *
12+ * This program is free software; you can redistribute it and/or modify
13+ * it under the terms of the GNU General Public License as published by
14+ * the Free Software Foundation; either version 2 of the License, or
15+ * (at your option) any later version.
16+ *
17+ * This program is distributed in the hope that it will be useful,
18+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
19+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
20+ * the GNU General Public License for more details.
21+ *
22+ * You should have received a copy of the GNU General Public License
23+ * along with this program; if not, write to the Free Software
24+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25+ *
26+ *
27+ */
28+/*
29+ * linux/drivers/evms/AIXlvm_vge.c
30+ *
31+ * EVMS AIX LVM Volume Group Emulator
32+ *
33+ *
34+ */
35+
36+#define EVMS_DEBUG 1
37+#define EVMS_AIX_DEBUG 1
38+
39+#define AIX_COMMON_SERVICES_MAJOR 0 // Required common services levels for the AIX kernel plugin
40+#define AIX_COMMON_SERVICES_MINOR 5 // These must be incremented if a new function is added to common
41+#define AIX_COMMON_SERVICES_PATCHLEVEL 0 // services and the AIX kernel plugin uses the new function.
42+#define AIX_INCREMENT_REQUEST 1
43+#define AIX_DECREMENT_REQUEST -1
44+#define AIX_RESYNC_BLOCKSIZE 512
45+#define AIX_SYNC_INCOMPLETE 0x01
46+#define AIX_SYNC_COMPLETE 0x00
47+#define AIX_MASTER 0
48+#define AIX_SLAVE_1 1
49+#define AIX_SLAVE_2 2
50+
51+#include <linux/module.h>
52+#include <linux/kernel.h>
53+#include <linux/config.h>
54+
55+#include <linux/genhd.h>
56+#include <linux/string.h>
57+#include <linux/blk.h>
58+#include <linux/init.h>
59+#include <linux/slab.h>
60+
61+#include <linux/evms/evms.h>
62+#include <linux/evms/evms_aix.h>
63+#include <asm/system.h>
64+#include <asm/uaccess.h>
65+
66+#include <linux/sched.h>
67+#include <linux/smp_lock.h>
68+#include <linux/locks.h>
69+#include <linux/delay.h>
70+#include <linux/reboot.h>
71+#include <linux/completion.h>
72+#include <linux/vmalloc.h>
73+
74+#ifdef EVMS_AIX_DEBUG
75+static int AIX_volume_group_dump(void);
76+#endif
77+
78+static struct aix_volume_group *AIXVolumeGroupList = NULL;
79+static struct evms_thread *AIX_mirror_read_retry_thread;
80+static struct evms_thread *AIX_mirror_resync_thread;
81+static struct evms_pool_mgmt *AIX_BH_list_pool = NULL;
82+static struct aix_mirror_bh *AIX_retry_list = NULL;
83+static struct aix_mirror_bh **AIX_retry_tail = NULL;
84+static spinlock_t AIX_retry_list_lock = SPIN_LOCK_UNLOCKED;
85+static spinlock_t AIX_resync_list_lock = SPIN_LOCK_UNLOCKED;
86+static spinlock_t AIX_resync_pp_lock = SPIN_LOCK_UNLOCKED;
87+static int AIXResyncInProgress = FALSE;
88+static struct aix_resync_struct *AIX_resync_list = NULL;
89+
90+// Plugin API prototypes
91+
92+static void AIXiod(void *data);
93+static void AIXresync(void *data);
94+static int discover_aix(struct evms_logical_node **evms_logical_disk_head);
95+static int discover_volume_groups(struct evms_logical_node **);
96+static int discover_logical_volumes(void);
97+static int end_discover_aix(struct evms_logical_node **evms_logical_disk_head);
98+static void read_aix(struct evms_logical_node *node, struct buffer_head *bh);
99+static void write_aix(struct evms_logical_node *node, struct buffer_head *bh);
100+static int ioctl_aix(struct evms_logical_node *logical_node,
101+ struct inode *inode,
102+ struct file *file, unsigned int cmd, unsigned long arg);
103+
104+static int aix_direct_ioctl(struct inode *inode,
105+ struct file *file,
106+ unsigned int cmd, unsigned long args);
107+
108+static int AIX_remap_sector(struct evms_logical_node *node, u64 org_sector, // logical sector to remap
109+ u64 size, // size (in sectors) of request to remap
110+ u64 * new_sector, // remapped sector
111+ u64 * new_size, // new size (in sectors)
112+ struct partition_list_entry **partition, // new node for which new_sector is relative
113+ u32 * le, u32 * offset_in_le);
114+
115+static int validate_build_volume_group_disk_info(struct evms_logical_node
116+ *logical_node,
117+ struct AIXlvm_rec *AIXlvm);
118+
119+static int add_VG_data_to_VG_list(struct evms_logical_node *logical_node,
120+ struct aix_volume_group *new_group,
121+ short int pvNum);
122+static int add_PV_to_volume_group(struct aix_volume_group *group,
123+ struct evms_logical_node *evms_partition,
124+ int pvNum);
125+static struct aix_volume_group *AIX_create_volume_group(struct evms_logical_node
126+ *logical_node,
127+ struct AIXlvm_rec
128+ *AIXlvm);
129+
130+static int AIX_update_volume_group(struct aix_volume_group *AIXVGLptr,
131+ struct evms_logical_node *logical_node,
132+ struct AIXlvm_rec *AIXlvm);
133+
134+static int AIX_evms_cs_notify_lv_io_error(struct evms_logical_node *node);
135+
136+static int AIX_pvh_data_posn(u32 vgda_psn, u32 * pvh_posn, struct partition_list_entry *partition, u32 numpvs);
137+
138+static int AIX_resync_lv_mirrors(struct aix_logical_volume *volume, int force);
139+
140+static int AIX_copy_on_read(struct aix_logical_volume *volume,
141+ struct partition_list_entry *master_part,
142+ struct partition_list_entry *slave1_part,
143+ struct partition_list_entry *slave2_part,
144+ u64 master_offset,
145+ u64 slave1_offset,
146+ u64 slave2_offset, u32 pe_size, int le);
147+
148+static int export_volumes(struct evms_logical_node **evms_logical_disk_head);
149+static int lvm_cleanup(void);
150+static int AIX_copy_header_info(struct vg_header *AIXvgh,
151+ struct vg_header *AIXvgh2);
152+static int build_pe_maps(struct aix_volume_group *volume_group);
153+
154+static struct aix_logical_volume *new_logical_volume(struct lv_entries
155+ *AIXlvent,
156+ struct aix_volume_group
157+ *group, char *lv_name,
158+ u32 stripesize);
159+
160+static int check_log_volume_and_pe_maps(struct aix_volume_group *group);
161+static int check_volume_groups(void);
162+static int init_io_aix(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */
163+ u64 sect_nr, /* disk LBA */
164+ u64 num_sects, /* # of sectors */
165+ void *buf_addr); /* buffer address */
166+
167+static int delete_logical_volume(struct aix_logical_volume *volume);
168+static int delete_aix_node(struct evms_logical_node *logical_node);
169+static int deallocate_volume_group(struct aix_volume_group *group);
170+
171+static void AIX_handle_read_mirror_drives(struct buffer_head *bh, int uptodate);
172+
173+static void AIX_handle_write_mirror_drives(struct buffer_head *bh,
174+ int uptodate);
175+
176+static void aix_notify_cache_ctor(void *foo, kmem_cache_t * cachep,
177+ unsigned long flags);
178+
179+static void AIX_schedule_resync(struct aix_logical_volume *resync_volume,
180+ int force);
181+static struct aix_logical_volume *AIX_get_volume_data(char *object_name);
182+
183+static void AIX_sync_mirrored_partitions(struct buffer_head *bh, int uptodate);
184+
185+static int AIX_get_set_mirror_offset(struct aix_mirror_bh *tmp_bh,
186+ int index, int offset);
187+
188+static struct aix_mirror_bh *AIX_alloc_rbh(struct evms_logical_node *node,
189+ struct buffer_head *bh,
190+ u32 mirror_copies,
191+ u32 le, u64 org_sector, int cmd);
192+
193+static struct aix_mirror_bh *AIX_alloc_wbh(struct evms_logical_node *node,
194+ struct evms_logical_node *node2,
195+ struct evms_logical_node *node3,
196+ struct buffer_head *bh,
197+ u32 mirror_copies,
198+ u32 le,
199+ u64 new_sector2, u64 new_sector3);
200+
201+static struct aix_mirror_bh *AIX_alloc_sbh(struct aix_logical_volume *volume,
202+ struct partition_list_entry
203+ *master_part,
204+ struct partition_list_entry
205+ *slave1_part,
206+ struct partition_list_entry
207+ *slave2_part, u64 master_offset,
208+ u64 slave1_offset, u64 slave2_offset,
209+ u32 pe_size);
210+
211+static void AIX_free_headers(struct vg_header *AIXvgh,
212+ struct vg_header *AIXvgh2,
213+ struct vg_trailer *AIXvgt,
214+ struct vg_trailer *AIXvgt2);
215+
216+static int remove_group_from_list(struct aix_volume_group *group);
217+
218+//****************************************************************************************************
219+
220+/* END of PROTOTYPES */
221+
222+#define GET_PHYSICAL_PART_SIZE(v1) (1 << v1)
223+
224+#define COMPARE_TIMESTAMPS(t1, t2) ( (t1).tv_sec == (t2).tv_sec && \
225+ (t1).tv_nsec == (t2).tv_nsec )
226+
227+#define COMPARE_UNIQUE_IDS(id1, id2) ( (id1).word1 == (id2).word1 && \
228+ (id1).word2 == (id2).word2 && \
229+ (id1).word3 == (id2).word3 && \
230+ (id1).word4 == (id2).word4 )
231+
232+#define SECTOR_IN_RANGE(s1, s2) ((s2 > s1) && (s2 < s1 + AIX_RESYNC_BLOCKSIZE))
233+
234+#define AIX_PV_STATE_VALID 0 // Both VGDAs are valid and match.
235+#define AIX_PV_STATE_FIRST_VGDA 1 // Only the first VGDA is valid.
236+#define AIX_PV_STATE_SECOND_VGDA 2 // Only the second VGDA is valid.
237+#define AIX_PV_STATE_EITHER_VGDA -1 // Both VGDAs are valid, but do not match each other.
238+#define AIX_PV_STATE_INVALID -2 // We're in an invalid state but there are more PVs in this group
239+
240+#ifndef EVMS_AIX_DEBUG
241+#define AIX_VOLUME_GROUP_DUMP()
242+#else
243+#define AIX_VOLUME_GROUP_DUMP() LOG_DEBUG("Called line:%d \n",__LINE__); \
244+ AIX_volume_group_dump()
245+#endif
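
As an aside on GET_PHYSICAL_PART_SIZE() above: AIX records the physical-partition size as a base-2 exponent of the size in bytes, so the macro is just a shift. A minimal user-space sketch of the conversion AIX_create_volume_group performs later, assuming the usual 512-byte AIX_SECTOR_SIZE (the pp_size value of 24 is hypothetical):

#include <stdio.h>

int main(void)
{
	unsigned int pp_size = 24;		/* hypothetical on-disk exponent (16 MB PPs) */
	unsigned int bytes = 1U << pp_size;	/* GET_PHYSICAL_PART_SIZE(pp_size) */
	unsigned int pe_size = bytes / 512;	/* sectors per PE */

	printf("PP = %u bytes = %u sectors\n", bytes, pe_size);	/* 16777216, 32768 */
	return 0;
}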
246+
247+// Global LVM data structures
248+
249+static struct evms_plugin_fops AIXlvm_fops = {
250+ .discover = discover_aix,
251+ .end_discover = end_discover_aix,
252+ .delete = delete_aix_node,
253+ .read = read_aix,
254+ .write = write_aix,
255+ .init_io = init_io_aix,
256+ .ioctl = ioctl_aix,
257+ .direct_ioctl = aix_direct_ioctl
258+};
259+
260+static struct evms_plugin_header plugin_header = {
261+ .id = SetPluginID(IBM_OEM_ID,
262+ EVMS_REGION_MANAGER,
263+ EVMS_AIX_FEATURE_ID),
264+ .version = {
265+ .major = 1,
266+ .minor = 1,
267+ .patchlevel = 1},
268+ .required_services_version = {
269+ .major = AIX_COMMON_SERVICES_MAJOR,
270+ .minor = AIX_COMMON_SERVICES_MINOR,
271+ .patchlevel =
272+ AIX_COMMON_SERVICES_PATCHLEVEL},
273+ .fops = &AIXlvm_fops
274+};
275+
276+/*
277+ * Function: remap sector
278+ * Common function to remap a volume LBA to a partition LBA in the appropriate PE
279+ */
280+static int
281+AIX_remap_sector(struct evms_logical_node *node, u64 org_sector, // logical sector to remap
282+ u64 size, // size (in sectors) of request to remap
283+ u64 * new_sector, // remapped sector
284+ u64 * new_size, // new size (in sectors)
285+ struct partition_list_entry **partition, // new node for which new_sector is relative
286+ u32 * le, u32 * offset_in_le)
287+{
288+ struct aix_logical_volume *volume;
289+
290+ u32 sectors_per_stripe;
291+ u32 partition_to_use;
292+ u32 column;
293+ u32 stripe_in_column;
294+
295+ u32 org_sector32; // Until striping is 64-bit enabled.
296+
297+ volume = (struct aix_logical_volume *) node->private;
298+
299+#ifdef EVMS_DEBUG
300+ LOG_DEBUG("-- %s volume:%p lv:%d size:" PFU64 " Name:%s\n",
301+ __FUNCTION__, volume, volume->lv_number, size, volume->name);
302+ LOG_DEBUG(" node %p node_name [%s] org_sector:" PFU64 "\n", node,
303+ node->name, org_sector);
304+ LOG_DEBUG(" mirror_copies:%d volume->lv_size:" PFU64 "\n",
305+ volume->mirror_copies, volume->lv_size);
306+#endif
307+
308+ org_sector32 = org_sector;
309+
310+ *(new_size) = size;
311+
312+ // Check if volume is striped. Reset the size if the request
313+ // crosses a stripe boundary.
314+ if (volume->stripes > 1) {
315+#ifdef EVMS_DEBUG
316+ LOG_DEBUG(" *** STRIPED ***\n");
317+ LOG_DEBUG(" ------- volume->stripe_size:%d org_sector:%d volume_stripes:%d\n",
318+ volume->stripe_size, org_sector32, volume->stripes);
319+#endif
320+
321+ *(le) = org_sector >> volume->pe_size_shift; // 64-bit safe
322+ *(offset_in_le) = org_sector & (volume->pe_size - 1); // 64-bit safe
323+
324+#ifdef EVMS_DEBUG
325+ LOG_DEBUG("OLD - le:%d -- offset_in_le:%d \n", *(le),
326+ *(offset_in_le));
327+#endif
328+
329+ sectors_per_stripe = volume->stripe_size / AIX_SECTOR_SIZE;
330+ partition_to_use =
331+ (org_sector32 / sectors_per_stripe) % volume->stripes;
332+ stripe_in_column =
333+ ((((org_sector32 / volume->stripe_size) / volume->stripes) *
334+ volume->stripe_size) +
335+ (org_sector32 % sectors_per_stripe));
336+ column =
337+ ((org_sector32 / sectors_per_stripe) / volume->stripes) *
338+ sectors_per_stripe;
339+
340+#ifdef EVMS_DEBUG
341+ LOG_DEBUG("offset_in_le:%d org_sector:" PFU64
342+ " pe_shift:%d stripe_shift:%d\n", *(offset_in_le),
343+ org_sector, volume->pe_size_shift,
344+ volume->stripe_size_shift);
345+
346+ LOG_DEBUG(" org_sector:%d sectors_per_stripe:%d partition_to_use:%d stripe_in_column:%d column:%d\n",
347+ org_sector32, sectors_per_stripe, partition_to_use,
348+ stripe_in_column, column);
349+ LOG_DEBUG(" offset_in_le + size:" PFU64
350+ " volume->pe_size:%d volume->lv_size:" PFU64 "\n",
351+ (*(offset_in_le) + size), volume->pe_size,
352+ volume->lv_size);
353+#endif
354+
355+ if (*(offset_in_le) + size > volume->pe_size) {
356+ *new_size = volume->pe_size - *(offset_in_le);
357+ LOG_DEBUG(" new_size " PFU64 "\n", *new_size);
358+ }
359+
360+ }
361+ // Non-striped volume. Just find LE and offset. Reset the size
362+ // if the request crosses an LE boundary.
363+ else {
364+#ifdef EVMS_DEBUG
365+ LOG_DEBUG(" *** NON-STRIPED ***\n");
366+#endif
367+
368+ *(le) = org_sector >> volume->pe_size_shift; // 64-bit safe
369+ *(offset_in_le) = org_sector & (volume->pe_size - 1); // 64-bit safe
370+ // Reset the size here as well so init_io_aix can split requests
 // that span an LE boundary (mirrors the clamp in the striped branch).
 if (*(offset_in_le) + size > volume->pe_size) {
 *new_size = volume->pe_size - *(offset_in_le);
 }
371+ }
372+
373+#ifdef EVMS_DEBUG
374+ LOG_DEBUG(" offset_in_le:%d org_sector:" PFU64 " shift:%d\n",
375+ *(offset_in_le), org_sector, volume->pe_size_shift);
376+
377+ if (*(le) >= volume->num_le) {
378+ LOG_DEBUG(" le Memory Overwrite !! le:%d vs volume->num_le:%d\n",
379+ *(le), volume->num_le);
380+ return -EINVAL;
381+ }
382+#endif
383+
384+ *(new_sector) = volume->le_to_pe_map[*(le)].pe_sector_offset + *(offset_in_le);
385+ *(partition) = volume->le_to_pe_map[*(le)].owning_pv;
386+
387+#ifdef EVMS_DEBUG
388+ LOG_DEBUG(" new_sector:" PFU64 "\n", *(new_sector));
389+ LOG_DEBUG(" Owning Part %p\n", *(partition));
390+ LOG_DEBUG(" End %s\n", __FUNCTION__);
391+#endif
392+
393+ return (0);
394+}
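
A worked pass through the striped-branch arithmetic above can make it easier to audit. The sketch below recomputes le, offset_in_le, and partition_to_use for one request using assumed geometry (32768-sector PEs, 128 KB stripes, 4 stripes); it mirrors the formulas in AIX_remap_sector rather than defining them authoritatively, and leaves out the stripe_in_column/column bookkeeping:

#include <stdio.h>

int main(void)
{
	unsigned long long org_sector = 70000;	/* request start, chosen for illustration */
	unsigned int pe_size = 32768;		/* sectors per PE (assumed) */
	unsigned int pe_size_shift = 15;	/* log2(pe_size) */
	unsigned int stripe_size = 131072;	/* stripe size in bytes (assumed) */
	unsigned int stripes = 4;
	unsigned int sectors_per_stripe = stripe_size / 512;

	unsigned int le = org_sector >> pe_size_shift;			/* 2 */
	unsigned int offset_in_le = org_sector & (pe_size - 1);		/* 4464 */
	unsigned int partition_to_use =
	    (org_sector / sectors_per_stripe) % stripes;		/* 1 */

	printf("le=%u offset_in_le=%u partition_to_use=%u\n",
	       le, offset_in_le, partition_to_use);
	return 0;
}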
395+
396+/*
397+ * Function: read_aix
398+ */
399+static void
400+read_aix(struct evms_logical_node *node, struct buffer_head *bh)
401+{
402+ struct partition_list_entry *partition;
403+ u64 org_sector;
404+ u64 new_sector;
405+ u64 new_size;
406+ struct aix_logical_volume *volume;
407+ struct aix_mirror_bh *tmp_bh;
408+ u32 le, offset_in_le, count;
409+ int flags = 0;
410+
411+ volume = (struct aix_logical_volume *) node->private;
412+//#ifdef EVMS_DEBUG
413+// LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n", __FUNCTION__, bh,
414+// volume->mirror_iterations);
415+//#endif
416+
417+#ifdef EVMS_DEBUG
418+ LOG_DEBUG(" node->total_vsectors:" PFU64 "\n", node->total_vsectors);
419+ LOG_DEBUG(" rsector:%lu rsize:%u node_flags:%u\n", bh->b_rsector,
420+ bh->b_size, node->flags);
421+#endif
422+
423+ // Check if I/O goes past end of logical volume.
424+ if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
425+ node->total_vsectors) {
426+ LOG_CRITICAL(" read_aix ERROR %d\n", __LINE__);
427+ buffer_IO_error(bh);
428+ return;
429+ }
430+
431+ // Logical-to-physical remapping.
432+ if (AIX_remap_sector
433+ (node, bh->b_rsector, (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT),
434+ &new_sector, &new_size, &partition, &le, &offset_in_le)
435+ || (!partition || !new_sector)) {
436+ LOG_CRITICAL(" read_aix bh: ERROR %d\n", __LINE__);
437+ buffer_IO_error(bh);
438+ return;
439+ }
440+
441+ org_sector = bh->b_rsector;
442+ bh->b_rsector = new_sector;
443+ //bh->b_size = new_size;
444+
445+#ifdef EVMS_DEBUG
446+ LOG_DEBUG(" read_aix Mirror_Copies:%d\n", volume->mirror_copies);
447+#endif
448+
449+ if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
450+
451+ tmp_bh =
452+ AIX_alloc_rbh(node, bh, 1, le, new_sector, AIX_LV_READ);
453+
454+ if (!tmp_bh) {
455+ buffer_IO_error(bh);
456+ return;
457+ }
458+
459+ if (volume->le_to_pe_map_mir1) {
460+ tmp_bh->mir_node1 =
461+ volume->le_to_pe_map_mir1[le].owning_pv->
462+ logical_node;
463+ tmp_bh->mir_sector1 =
464+ volume->le_to_pe_map_mir1[le].pe_sector_offset +
465+ offset_in_le;
466+ }
467+
468+ if (volume->mirror_copies == AIX_MAX_MIRRORS) {
469+ tmp_bh->mir_node2 =
470+ volume->le_to_pe_map_mir2[le].owning_pv->
471+ logical_node;
472+ tmp_bh->mir_sector2 =
473+ volume->le_to_pe_map_mir2[le].pe_sector_offset +
474+ offset_in_le;
475+ }
476+
477+ if (evms_cs_volume_request_in_progress
478+ (tmp_bh->bh_req.b_rdev, AIX_INCREMENT_REQUEST, &count)) {
479+ buffer_IO_error(bh);
480+ return;
481+ }
482+
483+ if (AIXResyncInProgress) {
484+ if (SECTOR_IN_RANGE
485+ (tmp_bh->bh_req.b_rsector,
486+ AIX_resync_list->master_offset)) {
487+ spin_lock_irqsave(&AIX_resync_list_lock, flags);
488+ }
489+ }
490+
491+ R_IO(partition->logical_node, &tmp_bh->bh_req);
492+
493+ if (AIXResyncInProgress) {
494+ if (SECTOR_IN_RANGE
495+ (tmp_bh->bh_req.b_rsector,
496+ AIX_resync_list->master_offset)) {
497+ spin_unlock_irqrestore(&AIX_resync_list_lock,
498+ flags);
499+ }
500+ }
501+
502+ } else {
503+
504+ R_IO(partition->logical_node, bh);
505+ }
506+
507+#ifdef EVMS_DEBUG
508+ LOG_DEBUG(" ***** %s ***** returning\n", __FUNCTION__);
509+#endif
510+ return;
511+}
512+
513+/*
514+ * Function: write_aix
515+ */
516+static void
517+write_aix(struct evms_logical_node *node, struct buffer_head *bh)
518+{
519+ struct partition_list_entry *partition;
520+ u64 new_sector, new_sector2 = 0, new_sector3 = 0;
521+ u64 org_sector;
522+ u64 new_size;
523+ struct aix_logical_volume *volume;
524+ struct aix_mirror_bh *tmp_bh;
525+ struct evms_logical_node *node2 = NULL, *node3 = NULL;
526+ u32 le, offset_in_le, count;
527+ int flags = 0;
528+
529+ volume = (struct aix_logical_volume *) node->private;
530+
531+#ifdef EVMS_DEBUG
532+// LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n", __FUNCTION__, bh,
533+// volume->mirror_iterations);
534+ LOG_DEBUG(" write_aix rsector:%lu rsize:%u\n", bh->b_rsector,
535+ bh->b_size);
536+ LOG_DEBUG(" write_aix total_sectors:" PFU64 "\n", node->total_vsectors);
537+#endif
538+
539+ if (volume->lv_access & EVMS_LV_INCOMPLETE) { //No writes allowed on incomplete volumes
540+ LOG_CRITICAL(" write_aix incomplete volume ERROR %d\n",
541+ __LINE__);
542+ buffer_IO_error(bh);
543+ return;
544+ }
545+
546+ // Check if I/O goes past end of logical volume.
547+ if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
548+ node->total_vsectors) {
549+ LOG_CRITICAL(" write_aix ERROR %d\n", __LINE__);
550+ buffer_IO_error(bh);
551+ return;
552+ }
553+ // Logical-to-Physical remapping
554+ if (AIX_remap_sector
555+ (node, bh->b_rsector, (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT),
556+ &new_sector, &new_size, &partition, &le, &offset_in_le)
557+ || (!new_sector || !partition)) {
558+ LOG_CRITICAL(" write_aix ERROR %d\n", __LINE__);
559+ buffer_IO_error(bh);
560+ return;
561+ }
562+
563+ org_sector = bh->b_rsector;
564+ bh->b_rsector = new_sector;
565+ //bh->b_size = new_size;
566+
567+#ifdef EVMS_DEBUG
568+ LOG_DEBUG(" write_aix Mirror_Copies:%d\n", volume->mirror_copies);
569+#endif
570+
571+ if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
572+
573+ if (volume->le_to_pe_map_mir1) {
574+ new_sector2 =
575+ volume->le_to_pe_map_mir1[le].pe_sector_offset +
576+ offset_in_le;
577+ node2 =
578+ volume->le_to_pe_map_mir1[le].owning_pv->
579+ logical_node;
580+ }
581+
582+ if (volume->mirror_copies == AIX_MAX_MIRRORS) {
583+
584+ new_sector3 =
585+ volume->le_to_pe_map_mir2[le].pe_sector_offset +
586+ offset_in_le;
587+ node3 =
588+ volume->le_to_pe_map_mir2[le].owning_pv->
589+ logical_node;
590+ }
591+
592+ tmp_bh =
593+ AIX_alloc_wbh(partition->logical_node, node2, node3, bh,
594+ volume->mirror_copies, le, new_sector2,
595+ new_sector3);
596+
597+ if (!tmp_bh) {
598+ buffer_IO_error(bh);
599+ return;
600+ }
601+ tmp_bh->node = node;
602+
603+ tmp_bh = tmp_bh->mirror_bh_list;
604+
605+ if (evms_cs_volume_request_in_progress
606+ (tmp_bh->bh_req.b_rdev, AIX_INCREMENT_REQUEST, &count)) {
607+ buffer_IO_error(bh);
608+ // free memory here
609+ return;
610+ }
611+
612+ if (AIXResyncInProgress) {
613+ if (SECTOR_IN_RANGE
614+ (tmp_bh->bh_req.b_rsector,
615+ AIX_resync_list->master_offset)) {
616+ spin_lock_irqsave(&AIX_resync_list_lock, flags);
617+ }
618+ }
619+
620+ W_IO(tmp_bh->node, &tmp_bh->bh_req);
621+
622+ if (AIXResyncInProgress) {
623+ if (SECTOR_IN_RANGE
624+ (tmp_bh->bh_req.b_rsector,
625+ AIX_resync_list->master_offset)) {
626+ spin_unlock_irqrestore(&AIX_resync_list_lock,
627+ flags);
628+ }
629+ }
630+
631+ tmp_bh = tmp_bh->next_r1;
632+
633+ if (tmp_bh) {
634+ W_IO(tmp_bh->node, &tmp_bh->bh_req);
635+ tmp_bh = tmp_bh->next_r1;
636+ }
637+
638+ if (tmp_bh) {
639+ W_IO(tmp_bh->node, &tmp_bh->bh_req);
640+ }
641+
642+ } else {
643+
644+ W_IO(partition->logical_node, bh);
645+ }
646+
647+#ifdef EVMS_DEBUG
648+ LOG_DEBUG(" ***** %s returning *****\n", __FUNCTION__);
649+#endif
650+ return;
651+}
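
The mirrored branch of write_aix fans one logical write out to as many as three physical writes by walking the next_r1 chain that AIX_alloc_wbh builds. A schematic of that traversal; the struct and submit() here are stand-ins for the real aix_mirror_bh and W_IO(), not the plugin's types:

#include <stddef.h>

struct wbh {
	struct wbh *next_r1;	/* one element per mirror copy */
};

static void submit(struct wbh *req)	/* stands in for W_IO(node, &req->bh_req) */
{
	(void) req;
}

static void fan_out(struct wbh *head)
{
	/* write_aix unrolls this walk by hand because the chain never
	 * holds more than AIX_MAX_MIRRORS (three) entries */
	struct wbh *cur;

	for (cur = head; cur; cur = cur->next_r1)
		submit(cur);
}

int main(void)
{
	struct wbh third = { NULL }, second = { &third }, first = { &second };

	fan_out(&first);	/* issues all three copies */
	return 0;
}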
652+
653+/*
654+ * Function: ioctl_aix
655+ *
656+ */
657+static int
658+ioctl_aix(struct evms_logical_node *logical_node,
659+ struct inode *inode,
660+ struct file *file, unsigned int cmd, unsigned long arg)
661+{
662+ struct aix_logical_volume *volume =
663+ (struct aix_logical_volume *) (logical_node->private);
664+ int rc = 0;
665+
666+ LOG_EXTRA(" Ioctl %u\n", cmd);
667+
668+ switch (cmd) {
669+
670+ case HDIO_GETGEO:
671+ {
672+ // Fixed geometry for all LVM volumes
673+ unsigned char heads = 64;
674+ unsigned char sectors = 32;
675+ long start = 0;
676+ struct hd_geometry *hd = (struct hd_geometry *) arg;
677+ short cylinders;
678+ // Divide before narrowing to short; truncating first made the
 // result wrong for any volume larger than 16 MB.
 cylinders = ((unsigned long) logical_node->total_vsectors / heads) / sectors;
680+
681+ if (hd == NULL) {
682+ return -EINVAL;
683+ }
684+
685+ if (copy_to_user
686+ ((char *) (&hd->heads), &heads, sizeof (heads)) != 0
687+ || copy_to_user((char *) (&hd->sectors), &sectors,
688+ sizeof (sectors)) != 0
689+ || copy_to_user((short *) (&hd->cylinders),
690+ &cylinders, sizeof (cylinders)) != 0
691+ || copy_to_user((long *) (&hd->start), &start,
692+ sizeof (start)) != 0) {
693+ return -EFAULT;
694+ }
695+ }
696+ break;
697+
698+ case EVMS_QUIESCE_VOLUME:
699+ break;
700+
701+ case EVMS_GET_DISK_LIST:
702+ case EVMS_CHECK_MEDIA_CHANGE:
703+ case EVMS_REVALIDATE_DISK:
704+ case EVMS_OPEN_VOLUME:
705+ case EVMS_CLOSE_VOLUME:
706+ case EVMS_CHECK_DEVICE_STATUS:
707+ {
708+ // These six ioctls all need to be broadcast to all PVs.
709+ struct aix_volume_group *group = volume->group;
710+ struct partition_list_entry *partition;
711+ for (partition = group->partition_list; partition;
712+ partition = partition->next) {
713+ rc |=
714+ IOCTL(partition->logical_node, inode, file,
715+ cmd, arg);
716+ }
717+ }
718+ break;
719+
720+ default:
721+ // Currently the VGE does not send any ioctl's down to the
722+ // partitions. Which partition would they go to?
723+ rc = -ENOTTY;
724+ }
725+
726+ return rc;
727+}
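
HDIO_GETGEO above always reports a synthetic 64-head, 32-sector geometry, so one cylinder is 2048 sectors (1 MiB) and the short cylinders field tops out near 32 GiB. A quick check of the arithmetic with an illustrative 1 GiB volume:

#include <stdio.h>

int main(void)
{
	unsigned long long total_vsectors = 2097152;	/* 1 GiB volume, for illustration */
	unsigned int heads = 64, sectors = 32;
	unsigned long long cylinders = total_vsectors / heads / sectors;

	printf("cylinders = %llu\n", cylinders);	/* 1024 */
	return 0;
}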
728+
729+/* Function: aix_direct_ioctl
730+ *
731+ * This function provides a method for user-space to communicate directly
732+ * with a plugin in the kernel.
733+ */
734+static int
735+aix_direct_ioctl(struct inode *inode,
736+ struct file *file, unsigned int cmd, unsigned long args)
737+{
738+ struct aix_logical_volume *volume = NULL;
739+ struct evms_plugin_ioctl_pkt argument;
740+ int rc = 0;
741+
742+ MOD_INC_USE_COUNT;
743+ LOG_DEBUG(" Function:%s cmd:%d \n", __FUNCTION__, cmd);
744+
745+ // Copy user's parameters to kernel space
746+ if (copy_from_user
747+ (&argument, (struct evms_plugin_ioctl *) args, sizeof (argument))) {
748+ MOD_DEC_USE_COUNT;
749+ return -EFAULT;
750+ }
751+ // Make sure this is supposed to be our ioctl.
752+ if (argument.feature_id != plugin_header.id) {
753+ MOD_DEC_USE_COUNT;
754+ return -EINVAL;
755+ }
756+
758+
759+ switch (argument.feature_command) {
760+
761+ case EVMS_AIX_RESYNC_MIRRORS:
762+ {
763+ struct aix_volume_resync_ioctl aix_lv_resync;
764+
765+ if (copy_from_user
766+ (&aix_lv_resync,
767+ (struct aix_volume_resync_ioctl *) argument.
768+ feature_ioctl_data, sizeof (aix_lv_resync))) {
769+ rc = -EINVAL;
770+ break;
771+ }
772+
773+ volume = AIX_get_volume_data(aix_lv_resync.object_name);
774+
775+ if (volume) {
776+ AIX_schedule_resync(volume, FALSE);
777+ } else {
778+ LOG_DEBUG
779+ (" Function:%s object_name:%s -- no match found\n",
780+ __FUNCTION__, aix_lv_resync.object_name);
781+ rc = -EINVAL;
782+ }
783+
784+ }
785+ break;
786+
787+ default:
788+ rc = -EINVAL;
789+ break;
790+ }
791+
792+ argument.status = rc;
793+ if (copy_to_user((struct evms_plugin_ioctl *) args, &argument,
794+ sizeof (argument))) {
 rc = -EFAULT;
 }
795+ MOD_DEC_USE_COUNT;
796+ return rc;
797+}
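
For orientation, the user-space side of aix_direct_ioctl looks roughly like the sketch below. It is a hedged reconstruction: the packet fields match what the function reads (feature_id, feature_command, status, feature_ioctl_data), but the ioctl number, the 128-byte name buffer, and the device path are placeholders for whatever the real EVMS user headers define:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* All of the definitions below are illustrative stand-ins; in a real
 * build they come from the EVMS headers. */
#define EVMS_AIX_RESYNC_MIRRORS	1
#define EVMS_PLUGIN_IOCTL	_IOWR('E', 26, struct evms_plugin_ioctl_pkt)

struct aix_volume_resync_ioctl {
	char object_name[128];
};

struct evms_plugin_ioctl_pkt {
	unsigned long feature_id;	/* must equal the AIX plugin_header.id */
	unsigned long feature_command;
	int status;			/* the plugin writes its rc back here */
	void *feature_ioctl_data;
};

int main(int argc, char **argv)
{
	struct aix_volume_resync_ioctl resync = { "" };
	struct evms_plugin_ioctl_pkt pkt = { 0 };
	int fd;

	if (argc < 2)
		return 1;
	strncpy(resync.object_name, argv[1], sizeof (resync.object_name) - 1);
	pkt.feature_command = EVMS_AIX_RESYNC_MIRRORS;
	pkt.feature_ioctl_data = &resync;

	fd = open("/dev/evms/block_device", O_RDONLY);	/* path is illustrative */
	if (fd < 0)
		return 1;
	if (ioctl(fd, EVMS_PLUGIN_IOCTL, &pkt) < 0 || pkt.status)
		fprintf(stderr, "resync request failed\n");
	close(fd);
	return 0;
}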
798+
799+/* Function: AIX_get_volume_data
800+ *
801+ * Look up a discovered logical volume by object name, searching the
802+ * volume list of every group on the global list.
803+ */
804+static struct aix_logical_volume *
805+AIX_get_volume_data(char *object_name)
806+{
807+
808+ struct aix_volume_group *VG_ptr;
809+ struct aix_logical_volume *volume = NULL;
810+ int i;
811+
812+ LOG_DEBUG(" Function:%s object_name:%s \n", __FUNCTION__, object_name);
813+
814+ if (!object_name || !strlen(object_name)) {
815+ return NULL;
816+ }
817+
818+ for (VG_ptr = AIXVolumeGroupList; VG_ptr; VG_ptr = VG_ptr->next) {
819+ for (i = 0; VG_ptr->volume_list[i]; i++) {
820+ if (!strcmp(VG_ptr->volume_list[i]->name, object_name)) {
821+ LOG_DEBUG
822+ (" Function:%s FOUND!! volume_name:%s \n",
823+ __FUNCTION__,
824+ VG_ptr->volume_list[i]->name);
825+ volume = VG_ptr->volume_list[i];
826+ break;
827+ }
828+ }
829+ }
830+
831+ if (!volume) {
832+ LOG_DEBUG(" Function:%s object_name:%s NOT FOUND !! volume:%p \n",
833+ __FUNCTION__, object_name, volume);
834+ }
835+
836+ return volume;
837+}
838+
839+/*
840+ * Function: init_io_aix
841+ *
842+ */
843+static int
844+init_io_aix(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */
845+ u64 sect_nr, /* disk LBA */
846+ u64 num_sects, /* # of sectors */
847+ void *buf_addr)
848+{ /* buffer address */
849+ struct partition_list_entry *partition;
850+ u64 new_sector = 0;
851+ u64 new_size = 0;
852+ int rc = 0;
853+ u32 le, offset;
854+
855+ LOG_DEBUG(" ************ init_io_aix() num_sects:" PFU64
856+ " node:%p sect_nr:" PFU64 "\n", num_sects, node, sect_nr);
857+
858+ // Init IO needs to deal with the possibility that a request can come
859+ // in that spans PEs or stripes. This is possible because there is no
860+ // limit on num_sects. To fix this, we loop through AIX_remap_sector and
861+ // INIT_IO until num_sects reaches zero.
862+
863+ while (num_sects > 0) {
864+
865+ if (AIX_remap_sector(node, sect_nr, num_sects, &new_sector, &new_size,
866+ &partition, &le, &offset) || (!new_sector || !partition)) {
867+ LOG_CRITICAL("--- Error returned from AIX_remap_sector %d\n",
868+ __LINE__);
869+ return -EIO;
870+ }
871+
872+ LOG_DEBUG(" init_io_aix() line:%d logical_node:%p io_flag:%d new_sector:"
873+ PFU64 " new_size:" PFU64 "\n", __LINE__,
874+ partition->logical_node, io_flag, new_sector, new_size);
875+
876+ rc = INIT_IO(partition->logical_node, io_flag, new_sector,
877+ new_size, buf_addr);
878+ num_sects -= new_size;
879+ sect_nr += new_size;
880+ buf_addr = (void *) (((unsigned long) buf_addr) +
881+ (unsigned long) (new_size << EVMS_VSECTOR_SIZE_SHIFT));
882+ }
883+
884+ return rc;
885+}
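
To see why init_io_aix loops: with 32768-sector PEs (an assumed size), a 1000-sector request starting 100 sectors short of a PE boundary has to become two INIT_IO calls. A user-space trace of the same split, using the clamp AIX_remap_sector applies:

#include <stdio.h>

int main(void)
{
	unsigned long long sect_nr = 32668, num_sects = 1000;	/* crosses one PE boundary */
	unsigned int pe_size = 32768;				/* sectors per PE (assumed) */

	while (num_sects > 0) {
		unsigned long long offset = sect_nr & (pe_size - 1);
		unsigned long long chunk = num_sects;

		if (offset + chunk > pe_size)	/* same clamp as AIX_remap_sector */
			chunk = pe_size - offset;
		printf("INIT_IO at %llu for %llu sectors\n", sect_nr, chunk);
		sect_nr += chunk;
		num_sects -= chunk;
	}
	return 0;	/* prints 32668/100, then 32768/900 */
}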
886+
887+/*
888+ * Function: AIXlvm_vge_init
889+ *
890+ */
891+int __init
892+AIXlvm_vge_init(void)
893+{
894+
895+ LOG_DEBUG(" %s --------\n", __FUNCTION__);
896+
897+ MOD_INC_USE_COUNT;
898+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
899+}
900+
901+module_init(AIXlvm_vge_init);
902+
903+/********** Required Plugin Functions **********/
904+
905+/*
906+ * Function: discover_aix
907+ *
908+ * This is the entry point into the LVM discovery process.
909+ */
910+static int
911+discover_aix(struct evms_logical_node **evms_logical_disk_head)
912+{
913+ int rc = 0, count = 0;
914+
915+ MOD_INC_USE_COUNT;
916+ LOG_DEBUG("[%s] discover_volume_groups\n", __FUNCTION__);
917+
918+ rc = discover_volume_groups(evms_logical_disk_head);
919+
920+ if (rc) {
921+ LOG_ERROR("[%s] discover_volume_groups rc=%d\n", __FUNCTION__,rc);
922+ }
923+
924+ if (AIXVolumeGroupList && !rc) {
925+
926+ LOG_DEBUG("[%s] discover_logical_volumes\n", __FUNCTION__);
927+
928+ rc = discover_logical_volumes();
929+
930+ if (rc) {
931+ LOG_ERROR("[%s] discover_logical_volumes rc=%d\n",
932+ __FUNCTION__, rc);
933+ }
934+
935+ LOG_DEBUG("[%s] export_volumes\n", __FUNCTION__);
936+
937+ count = export_volumes(evms_logical_disk_head);
938+
939+ LOG_DEBUG("[%s] export_volumes count=%d\n", __FUNCTION__,
940+ count);
941+ }
942+
943+ MOD_DEC_USE_COUNT;
944+ return (count);
945+}
946+
947+static int
948+discover_volume_groups(struct evms_logical_node **evms_logical_disk_head)
949+{
950+ struct evms_logical_node *logical_node;
951+ struct evms_logical_node *next_node;
952+ struct aix_ipl_rec_area *AIXpv;
953+ struct AIXlvm_rec *AIXlvm; // Temp holder for the LVM on disk rec
954+
955+ LOG_DEBUG(" Begin %s\n", __FUNCTION__);
956+
957+ AIXpv = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
958+ if (!AIXpv) {
959+ return -ENOMEM;
960+ }
961+
962+ // We'll create at least one volume entry; if we don't find any AIX volumes, we'll clean it up later
963+
964+ AIXlvm = kmalloc(sizeof (struct AIXlvm_rec), GFP_KERNEL);
965+ if (!AIXlvm) {
966+ kfree(AIXpv);
967+ return -ENOMEM;
968+ }
969+
970+ for (logical_node = *evms_logical_disk_head; logical_node;
971+ logical_node = next_node) {
972+
973+ // Grab the next list item in case we remove this partition from the global list.
974+ next_node = logical_node->next;
975+
976+ // Read the first sector and see if it has a valid AIX PV signature.
977+
978+ if (INIT_IO(logical_node, 0, 0, 1, AIXpv)) {
979+ // On an I/O error, continue on to the next
980+ // partition. The group that this partition
981+ // belongs to will be incomplete, but we still
982+ // need to discover any other groups.
983+
984+ LOG_ERROR(" Error reading PV [%p]\n", logical_node);
985+ continue;
986+ }
987+
988+ if (AIXpv->IPL_record_id == IPLRECID) {
989+
990+ // This partition is definitely a PV,
991+ // but is it part of a valid VG?
992+ LOG_DEBUG(" DVG removing node from list logical_node %p\n",
993+ logical_node);
994+
995+ if (INIT_IO(logical_node, 0, PSN_LVM_REC, 1, AIXlvm)) {
996+ LOG_ERROR(" Error reading PV [%p]\n",logical_node);
997+ continue;
998+ }
999+
1000+ if (AIXlvm->lvm_id == AIX_LVM_LVMID) {
1001+
1002+ if (validate_build_volume_group_disk_info(
1003+ logical_node, AIXlvm)) {
1004+ // Again, continue on and we'll
1005+ // clean up later.
1006+ continue;
1007+ }
1008+
1009+ evms_cs_remove_logical_node_from_list(
1010+ evms_logical_disk_head, logical_node);
1011+
1012+ } else {
1013+ LOG_DEBUG(" Found an AIX PV with no parent LVM (LVM ID: %d)\n",
1014+ AIXlvm->lvm_id);
1015+ continue;
1016+ }
1017+ } else {
1018+ LOG_DEBUG(" Found a PV not belonging to AIX [%p]\n",
1019+ logical_node);
1020+ }
1021+ }
1022+
1023+ AIX_VOLUME_GROUP_DUMP();
1024+
1025+ kfree(AIXpv);
1026+ kfree(AIXlvm);
1027+
1028+ if (check_volume_groups()) {
1029+ return -EINVAL;
1030+ }
1031+
1032+ return 0;
1033+}
1034+
1035+/*
1036+ * Function: validate_build_volume_group_disk_info
1037+ *
1038+ * Creates and validates the volume groups found on the disk structures.
1039+ *
1040+ */
1041+static int
1042+validate_build_volume_group_disk_info(struct evms_logical_node *logical_node,
1043+ struct AIXlvm_rec *AIXlvm)
1044+{
1045+
1046+ struct aix_volume_group *AIXVGLptr = AIXVolumeGroupList;
1047+
1048+ LOG_DEBUG(" VBVGDI pv_num:%d\n", AIXlvm->pv_num);
1049+
1050+ while (AIXVGLptr) {
1051+ if (COMPARE_UNIQUE_IDS(AIXlvm->vg_id, AIXVGLptr->vg_id)) {
1052+ break;
1053+ }
1054+ AIXVGLptr = AIXVGLptr->next; // There is more than one so walk the list
1055+ }
1056+
1057+ if (!AIXVGLptr) {
1058+ LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
1059+ AIXVGLptr = AIX_create_volume_group(logical_node, AIXlvm);
1060+ if (AIXVGLptr) {
1061+ AIXVGLptr->next = AIXVolumeGroupList;
1062+ AIXVolumeGroupList = AIXVGLptr;
1063+ }
1064+ } else {
1065+ LOG_DEBUG(" VBVGDI Rediscover AIXVGLptr:%p line:%d\n",
1066+ AIXVGLptr, __LINE__);
1067+
1068+ if (AIX_update_volume_group(AIXVGLptr, logical_node, AIXlvm)) {
1069+ LOG_DEBUG
1070+ (" VBVGDI ERROR on Rediscover AIXVGLptr:%p line:%d\n",
1071+ AIXVGLptr, __LINE__);
1072+ }
1073+ }
1074+
1075+ if (!AIXVGLptr) {
1076+
1077+ LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,
1078+ __LINE__);
1079+ LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags);
1080+ LOG_CRITICAL("Unable to allocate volume group data struct Volume Group Corruption !!\n");
1081+ return -EINVAL;
1082+ } else {
1083+
1084+ LOG_DEBUG(" VBVGDI AIXVolumeGroupList:%p line:%d\n",
1085+ AIXVolumeGroupList, __LINE__);
1086+ LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,
1087+ __LINE__);
1088+ LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags);
1089+
1090+ if (add_PV_to_volume_group(AIXVGLptr, logical_node, AIXlvm->pv_num)) {
1091+ return -EINVAL;
1092+ }
1093+ }
1094+
1095+ return 0;
1096+}
1097+
1098+/*
1099+ * Function: add_VG_data_to_VG_list
1100+ *
1101+ * Allocate space for a new LVM volume group and all of its sub-fields.
1102+ * Initialize the appropriate fields.
1103+ */
1104+
1105+static int
1106+add_VG_data_to_VG_list(struct evms_logical_node *logical_node,
1107+ struct aix_volume_group *new_group, short int pvNum)
1108+{
1109+// int pvh_pos;
1110+
1111+// struct pv_header *AIXpvh;
1112+
1113+ // The array of pointer to the logical volumes.
1114+ // Leave this allocation at the max permitted; the LV numbering may not be sequential, so there may be
1115+ // gaps in the array, e.g. 1,2,3,4,5,6,7,8,11,15,21,33, even though there are only 12 LVs.
1116+
1117+ LOG_DEBUG(" AVGDVGL Entering pvNum:%d vgda_PSN:%d\n", pvNum,
1118+ new_group->vgda_psn);
1119+
1120+// pvh_pos = AIX_PVH_DATA_PSN(new_group->vgda_psn, pvNum);
1121+
1122+/* AIXpvh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1123+ if (!AIXpvh) {
1124+ return -ENOMEM;
1125+ }
1126+
1127+ memset(AIXpvh, 0, AIX_SECTOR_SIZE);
1128+
1129+ LOG_DEBUG(" AVGDVGL pvh_pos:%d\n", pvh_pos);
1130+
1131+ if (INIT_IO(logical_node, 0, pvh_pos, 1, AIXpvh)) {
1132+ return -EIO;
1133+ }
1134+
1135+ LOG_DEBUG(" AVGDVGL AIXpvh->pv_num:%d\n", pvNum);
1136+*/
1137+ if (!new_group->volume_list) {
1138+ new_group->volume_list =
1139+ kmalloc(LVM_MAXLVS * sizeof (struct aix_logical_volume *),
1140+ GFP_KERNEL);
1141+ if (!new_group->volume_list) {
1142+// kfree(AIXpvh);
1143+ return -ENOMEM;
1144+ }
1145+ memset(new_group->volume_list, 0,
1146+ (LVM_MAXLVS * sizeof (struct aix_logical_volume *)));
1147+ }
1148+
1149+ new_group->vg_id.word1 = new_group->AIXvgh->vg_id.word1;
1150+ new_group->vg_id.word2 = new_group->AIXvgh->vg_id.word2;
1151+ new_group->vg_id.word3 = new_group->AIXvgh->vg_id.word3;
1152+ new_group->vg_id.word4 = new_group->AIXvgh->vg_id.word4;
1153+// new_group->numpvs = new_group->AIXvgh->numpvs;
1154+// new_group->numlvs = new_group->AIXvgh->numlvs;
1155+// new_group->lv_max = new_group->AIXvgh->maxlvs;
1156+ new_group->pe_size = GET_PHYSICAL_PART_SIZE(new_group->AIXvgh->pp_size) /
1157+ AIX_SECTOR_SIZE;
1158+
1159+// new_group->block_size = 0;
1160+// new_group->hard_sect_size = 0;
1161+ new_group->flags |= AIX_VG_DIRTY;
1162+
1163+// kfree(AIXpvh);
1164+
1165+ LOG_DEBUG(" AVGDVGL Vol Group ID %x\n", new_group->vg_id.word2);
1166+
1167+ return 0;
1168+}
1169+
1170+/*
1171+ * Function: add_PV_to_volume_group
1172+ *
1173+ * Create a new partition_list_entry for the specified volume group.
1174+ * Initialize the new partition with the evms node and lvm pv information,
1175+ * and add the new partition to the group's list.
1176+ */
1177+
1178+static int
1179+add_PV_to_volume_group(struct aix_volume_group *group,
1180+ struct evms_logical_node *evms_partition, int pvNum)
1181+{
1182+ struct partition_list_entry *new_partition;
1183+
1184+ LOG_DEBUG(" APVVG Entering pvNum:%d\n", pvNum);
1185+
1186+ group->flags |= AIX_VG_DIRTY;
1187+
1188+ for (new_partition = group->partition_list; new_partition != NULL;
1189+ new_partition = new_partition->next) {
1190+ if (new_partition->logical_node == evms_partition) {
1191+ return 0;
1192+ }
1193+ }
1194+
1195+ new_partition =
1196+ kmalloc(sizeof (struct partition_list_entry), GFP_KERNEL);
1197+ if (!new_partition) {
1198+ return -ENOMEM;
1199+ }
1200+
1201+ memset(new_partition, 0, sizeof (struct partition_list_entry));
1202+
1203+ // Add this partition to this group's list.
1204+ new_partition->logical_node = evms_partition;
1205+ new_partition->pv_number = pvNum;
1206+
1207+ if (evms_partition->hardsector_size > group->hard_sect_size) {
1208+ group->hard_sect_size = evms_partition->hardsector_size;
1209+ }
1210+ if (evms_partition->block_size > group->block_size) {
1211+ group->block_size = evms_partition->block_size;
1212+ }
1213+
1214+ // Add this partition to the beginning of its group's list.
1215+ new_partition->next = group->partition_list;
1216+ group->partition_list = new_partition;
1217+ group->partition_count++;
1218+
1219+ LOG_DEBUG(" APVVG partition_count:%d pv_num:%d\n",
1220+ group->partition_count, pvNum);
1221+
1222+ return 0;
1223+}
1224+
1225+/****************************************************
1226+*
1227+*
1228+*
1229+*****************************************************/
1230+static struct aix_volume_group *
1231+AIX_create_volume_group(struct evms_logical_node *logical_node,
1232+ struct AIXlvm_rec *AIXlvm)
1233+{
1234+ struct vg_header *AIXvgh = NULL, *AIXvgh2 = NULL;
1235+ struct vg_trailer *AIXvgt = NULL, *AIXvgt2 = NULL;
1236+ struct aix_volume_group *AIXVGLptr;
1237+
1238+ AIXvgh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1239+ if (!AIXvgh) {
1240+ return NULL;
1241+ }
1242+
1243+ AIXvgh2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1244+ if (!AIXvgh2) {
1245+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1246+ return NULL;
1247+ }
1248+
1249+ AIXvgt = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1250+ if (!AIXvgt) {
1251+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1252+ return NULL;
1253+ }
1254+
1255+ AIXvgt2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1256+ if (!AIXvgt2) {
1257+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1258+ return NULL;
1259+ }
1260+
1261+ memset(AIXvgh, 0, AIX_SECTOR_SIZE);
1262+ memset(AIXvgh2, 0, AIX_SECTOR_SIZE);
1263+ memset(AIXvgt, 0, AIX_SECTOR_SIZE);
1264+ memset(AIXvgt2, 0, AIX_SECTOR_SIZE);
1265+
1266+ // First time through we want to read this in; we may have only one PV in this group, and all
1267+ // others may be corrupt, etc. If the info were clean we wouldn't get here.
1268+
1269+ if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) {
1270+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1271+ return NULL;
1272+ }
1273+
1274+ if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) {
1275+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1276+ return NULL;
1277+ }
1278+
1279+ if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1), 1,
1280+ AIXvgt)) {
1281+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1282+ return NULL;
1283+ }
1284+
1285+ if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1), 1,
1286+ AIXvgt2)) {
1287+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1288+ return NULL;
1289+ }
1290+
1291+ LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]);
1292+ LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]);
1293+ LOG_DEBUG("CVG AIXvgt psn[%d]:%d\n", 0,(AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1));
1294+ LOG_DEBUG("CVG AIXvgt psn[%d]:%d\n", 1,(AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1));
1295+ LOG_DEBUG("CVG Allocating AIXVGLptr:size:%d \n",(int) sizeof (struct aix_volume_group));
1296+
1297+ AIXVGLptr = kmalloc(sizeof (struct aix_volume_group), GFP_KERNEL);
1298+ if (!AIXVGLptr) {
1299+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1300+ return NULL;
1301+ }
1302+ memset(AIXVGLptr, 0, sizeof (struct aix_volume_group));
1303+
1304+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1305+ AIXVGLptr->flags |= AIX_VG_DIRTY;
1306+
1307+ LOG_DEBUG("CVG AIXVGLptr:%p line %d\n", AIXVGLptr, __LINE__);
1308+
1309+ AIXVGLptr->AIXvgh = kmalloc(sizeof (struct vg_header), GFP_KERNEL);
1310+ if (!AIXVGLptr->AIXvgh) {
1311+ kfree(AIXVGLptr);
1312+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1313+ return NULL;
1314+ }
1315+ memset(AIXVGLptr->AIXvgh, 0, sizeof (struct vg_header));
1316+
1317+ LOG_DEBUG("CVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",
1318+ AIXVGLptr->CleanVGInfo);
1319+
1320+ if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) {
1321+ if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp, AIXvgt->timestamp)) {
1322+ if (COMPARE_TIMESTAMPS
1323+ (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) {
1324+ if (COMPARE_TIMESTAMPS
1325+ (AIXvgh->vg_timestamp,
1326+ AIXvgh2->vg_timestamp)) {
1327+ // All timestamps match. Yea!
1328+ AIXVGLptr->CleanVGInfo =
1329+ AIX_PV_STATE_VALID;
1330+ } else {
1331+ // Both VGDAs are good, but timestamps are
1332+ // different. Can't tell yet which one is
1333+ // correct.
1334+ AIXVGLptr->CleanVGInfo =
1335+ AIX_PV_STATE_EITHER_VGDA;
1336+ }
1337+ } else {
1338+ // First VGDA is good, second is bad.
1339+ AIXVGLptr->CleanVGInfo =
1340+ AIX_PV_STATE_FIRST_VGDA;
1341+ }
1342+ } else {
1343+ if (COMPARE_TIMESTAMPS
1344+ (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) {
1345+ // First VGDA is bad, second is good.
1346+ AIXVGLptr->CleanVGInfo =
1347+ AIX_PV_STATE_SECOND_VGDA;
1348+ } else if (AIXvgh->numpvs == 1) { // We only have 1 PV in this group, mismatch or not this will have to do
1349+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1350+ } else {
1351+ // This should never happen.
1352+ LOG_DEBUG("All four VG timestamps for %d are different. What happened?!?\n",
1353+ AIXVGLptr->vg_id.word2);
1354+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1355+
1356+ }
1357+ }
1358+
1359+ LOG_DEBUG("CVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n",
1360+ AIXVGLptr->CleanVGInfo);
1361+
1362+ switch (AIXVGLptr->CleanVGInfo) {
1363+ case AIX_PV_STATE_VALID:
1364+ case AIX_PV_STATE_FIRST_VGDA:
1365+
1366+ LOG_DEBUG("CVG SWITCH VALID %d size:%d\n",
1367+ AIXVGLptr->CleanVGInfo,
1368+ (int) sizeof (struct vg_header));
1369+
1370+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1371+
1372+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1373+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1374+ break;
1375+
1376+ case AIX_PV_STATE_SECOND_VGDA:
1377+ LOG_DEBUG("CVG SWITCH SECOND VGDA %d size:%d\n",
1378+ AIXVGLptr->CleanVGInfo,
1379+ (int) sizeof (struct vg_header));
1380+
1381+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2); // Get the info. we need
1382+
1383+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1];
1384+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1385+ break;
1386+
1387+ case AIX_PV_STATE_EITHER_VGDA:
1388+ LOG_DEBUG("CVG SWITCH EITHER VGDA %d size:%d\n",
1389+ AIXVGLptr->CleanVGInfo,(int) sizeof (struct vg_header));
1390+ if (COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id)) {
1391+
1392+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1393+
1394+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1395+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1396+ } else {
1397+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1398+ // Not sure where this PV belongs. It thinks it is
1399+ // supposed to be in two different containers. We will
1400+ // probably need to put this on a separate, temporary
1401+ // list, and determine later which container is missing
1402+ // a PV.
1403+ }
1404+ break;
1405+
1406+ default:
1407+ LOG_ERROR("Invalid PV state (%d) for %d\n",
1408+ AIXVGLptr->CleanVGInfo,
1409+ AIXVGLptr->vg_id.word2);
1410+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1411+ break;
1412+ }
1413+
1414+ }
1415+
1416+ // Currently AIX Big VGDA is not supported - cleanup and return NULL so this VG doesn't get added
1417+
1418+ if (AIXVGLptr->AIXvgh->bigvg != 0) {
1419+ LOG_SERIOUS("Error creating Volume Group AIX Big VGDA is not currently supported\n");
1420+ if (AIXVGLptr->AIXvgh) {
1421+ kfree(AIXVGLptr->AIXvgh);
1422+ AIXVGLptr->AIXvgh = NULL;
1423+ }
1424+
1425+ if (AIXVGLptr) {
1426+ kfree(AIXVGLptr);
1427+ AIXVGLptr = NULL;
1428+ }
1429+
1430+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1431+ return NULL;
1432+ }
1433+
1434+ add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num);
1435+
1436+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1437+
1438+ LOG_DEBUG("CVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo);
1439+
1440+ return AIXVGLptr;
1441+}
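
The nested timestamp comparisons in AIX_create_volume_group (and repeated in AIX_update_volume_group below) are easier to audit as a table. This restates the logic above without changing it, with H1/T1 the first VGDA's header/trailer timestamps and H2/T2 the second's:

/*
 *  H1==T1  H2==T2  H1==H2   resulting CleanVGInfo
 *  ------  ------  ------   ---------------------
 *   yes     yes     yes     AIX_PV_STATE_VALID        (first VGDA used)
 *   yes     yes     no      AIX_PV_STATE_EITHER_VGDA  (first used if vg_ids match)
 *   yes     no       -      AIX_PV_STATE_FIRST_VGDA
 *   no      yes      -      AIX_PV_STATE_SECOND_VGDA
 *   no      no       -      VALID if numpvs == 1, otherwise INVALID
 */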
1442+
1443+/****************************************************
1444+*
1445+*
1446+*
1447+*****************************************************/
1448+static int
1449+AIX_update_volume_group(struct aix_volume_group *AIXVGLptr,
1450+ struct evms_logical_node *logical_node,
1451+ struct AIXlvm_rec *AIXlvm)
1452+{
1453+ struct vg_header *AIXvgh = NULL, *AIXvgh2 = NULL;
1454+ struct vg_trailer *AIXvgt = NULL, *AIXvgt2 = NULL;
1455+
1456+ AIXvgh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1457+ if (!AIXvgh) {
1458+ return -ENOMEM;
1459+ }
1460+
1461+ AIXvgh2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1462+ if (!AIXvgh2) {
1463+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1464+ return -ENOMEM;
1465+ }
1466+
1467+ AIXvgt = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1468+ if (!AIXvgt) {
1469+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1470+ return -ENOMEM;
1471+ }
1472+
1473+ AIXvgt2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1474+ if (!AIXvgt2) {
1475+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1476+ return -ENOMEM;
1477+ }
1478+
1479+ // First time through we want to read this in; we may have only one PV in this group, and all
1480+ // others may be corrupt, etc. If the info were clean we wouldn't get here.
1481+
1482+ if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) {
1483+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1484+ return -ENOMEM;
1485+ }
1486+
1487+ if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) {
1488+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1489+ return -ENOMEM;
1490+ }
1491+
1492+ if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1), 1,
1493+ AIXvgt)) {
1494+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1495+ return -ENOMEM;
1496+ }
1497+
1498+ if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1), 1,
1499+ AIXvgt2)) {
1500+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1501+ return -ENOMEM;
1502+ }
1503+
1504+ LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]);
1505+ LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]);
1506+ LOG_DEBUG("UVG AIXvgt psn[%d]:%d\n", 0,(AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1));
1507+ LOG_DEBUG("UVG AIXvgt psn[%d]:%d\n", 1,(AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1));
1508+
1509+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1510+ AIXVGLptr->flags |= AIX_VG_DIRTY;
1511+
1512+ LOG_DEBUG("UVG AIXVGLptr:%p line %d\n", AIXVGLptr, __LINE__);
1513+
1514+ AIXVGLptr->AIXvgh = kmalloc(sizeof (struct vg_header), GFP_KERNEL);
1515+ if (!AIXVGLptr->AIXvgh) {
1516+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1517+ return -ENOMEM;
1518+ }
1519+ memset(AIXVGLptr->AIXvgh, 0, sizeof (struct vg_header));
1520+
1521+ LOG_DEBUG("UVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1522+
1523+ if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) {
1524+ if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp, AIXvgt->timestamp)) {
1525+ if (COMPARE_TIMESTAMPS
1526+ (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) {
1527+ if (COMPARE_TIMESTAMPS
1528+ (AIXvgh->vg_timestamp,
1529+ AIXvgh2->vg_timestamp)) {
1530+ // All timestamps match. Yea!
1531+ AIXVGLptr->CleanVGInfo =
1532+ AIX_PV_STATE_VALID;
1533+ } else {
1534+ // Both VGDAs are good, but timestamps are
1535+ // different. Can't tell yet which one is
1536+ // correct.
1537+ AIXVGLptr->CleanVGInfo =
1538+ AIX_PV_STATE_EITHER_VGDA;
1539+ }
1540+ } else {
1541+ // First VGDA is good, second is bad.
1542+ AIXVGLptr->CleanVGInfo =
1543+ AIX_PV_STATE_FIRST_VGDA;
1544+ }
1545+ } else {
1546+ if (COMPARE_TIMESTAMPS
1547+ (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) {
1548+ // First VGDA is bad, second is good.
1549+ AIXVGLptr->CleanVGInfo =
1550+ AIX_PV_STATE_SECOND_VGDA;
1551+ } else if (AIXvgh->numpvs == 1) { // We only have 1 PV in this group, mismatch or not this will have to do
1552+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1553+ } else {
1554+ // This should never happen.
1555+ LOG_DEBUG
1556+ ("All four VG timestamps for %d are different. What happened?!?\n",
1557+ AIXVGLptr->vg_id.word2);
1558+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1559+
1560+ }
1561+ }
1562+
1563+ LOG_DEBUG("UVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n",
1564+ AIXVGLptr->CleanVGInfo);
1565+
1566+ switch (AIXVGLptr->CleanVGInfo) {
1567+ case AIX_PV_STATE_VALID:
1568+ case AIX_PV_STATE_FIRST_VGDA:
1569+
1570+ LOG_DEBUG("UVG SWITCH VALID %d size:%d\n",
1571+ AIXVGLptr->CleanVGInfo,
1572+ (int) sizeof (struct vg_header));
1573+
1574+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1575+
1576+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1577+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1578+ break;
1579+
1580+ case AIX_PV_STATE_SECOND_VGDA:
1581+ LOG_DEBUG("UVG SWITCH SECOND VGDA %d size:%d\n",
1582+ AIXVGLptr->CleanVGInfo,
1583+ (int) sizeof (struct vg_header));
1584+
1585+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2); // Get the info. we need
1586+
1587+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1];
1588+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1589+ break;
1590+
1591+ case AIX_PV_STATE_EITHER_VGDA:
1592+ LOG_DEBUG("UVG SWITCH EITHER VGDA %d size:%d\n",
1593+ AIXVGLptr->CleanVGInfo,
1594+ (int) sizeof (struct vg_header));
1595+ if (COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id)) {
1596+
1597+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1598+
1599+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1600+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1601+ } else {
1602+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1603+ // Not sure where this PV belongs. It thinks it is
1604+ // supposed to be in two different containers. We will
1605+ // probably need to put this on a separate, temporary
1606+ // list, and determine later which container is missing
1607+ // a PV.
1608+ }
1609+ break;
1610+
1611+ default:
1612+ LOG_ERROR("UVG Invalid PV state (%d) for %d\n",
1613+ AIXVGLptr->CleanVGInfo,
1614+ AIXVGLptr->vg_id.word2);
1615+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1616+ break;
1617+ }
1618+
1619+ }
1620+
1621+// add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num);
1622+ AIXVGLptr->flags |= AIX_VG_DIRTY;
1623+
1624+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1625+
1626+ LOG_DEBUG("UVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo);
1627+
1628+ return 0;
1629+}
1630+
1631+/****************************************************
1632+* Function: check_volume_groups
1633+*
1634+* We just want to make sure the volume groups have found
1635+* all their drives.
1636+*
1637+* If not, we'll continue and build what we can
1638+*****************************************************/
1639+static int
1640+check_volume_groups(void)
1641+{
1642+ struct aix_volume_group *group;
1643+ struct aix_volume_group *next_group;
1644+// struct partition_list_entry *partitions;
1645+// int NumPVS = 0;
1646+
1647+ LOG_DEBUG("CHVG Checking volume groups:\n");
1648+
1649+
1650+ for (group = AIXVolumeGroupList; group; group = next_group) {
1651+ next_group = group->next;
1652+
1653+ if (group->flags & AIX_VG_DIRTY){
1654+ if (group->AIXvgh->numlvs == 0) {
1655+ remove_group_from_list(group);
1656+ deallocate_volume_group(group);
1657+ } else {
1658+ if (group->partition_count != group->AIXvgh->numpvs) {
1659+ group->flags |= AIX_VG_INCOMPLETE;
1660+ LOG_ERROR("CHVG Found incomplete VG !! flags:%x\n",
1661+ group->flags);
1662+ LOG_ERROR("CHVG Found %d PVs should have %d PVs\n",
1663+ group->partition_count, group->AIXvgh->numpvs);
1664+ }
1665+ }
1666+ }
1667+ }
1668+
1669+ LOG_DEBUG("CHVG Finished Checking volume groups:\n");
1670+ return 0;
1671+
1672+}
1673+
1674+/************************************************************************
1675+ * Function: discover_logical_volumes
1676+ *
1677+ * After all PVs have been claimed and added to the appropriate VG list,
1678+ * the volumes for each VG must be constructed.
1679+ *
1680+ *
1681+ */
1682+static int
1683+discover_logical_volumes(void)
1684+{
1685+
1686+ struct aix_volume_group *AIXVGLPtr;
1687+ struct aix_logical_volume *new_LV;
1688+ struct partition_list_entry *partition;
1689+ struct evms_logical_node *node;
1690+ struct lv_entries *AIXlvent, *AIXlventHead;
1691+ int j, lv_found, all_lvs_found, rc;
1692+ struct namelist *AIXnamelist;
1693+ char *NameBuffer;
1694+
1695+ AIXlventHead =
1696+ kmalloc(MAX_SECTORS_LV_ENTRIES * AIX_SECTOR_SIZE, GFP_KERNEL);
1697+ if (!AIXlventHead) {
1698+ return -ENOMEM;
1699+ }
1700+
1701+ memset(AIXlventHead, 0, (MAX_SECTORS_LV_ENTRIES * AIX_SECTOR_SIZE));
1702+
1703+ NameBuffer =
1704+ kmalloc(MAX_SECTORS_NAMELIST * AIX_SECTOR_SIZE, GFP_KERNEL);
1705+ if (!NameBuffer) {
1706+ kfree(AIXlventHead);
1707+ return -ENOMEM;
1708+ }
1709+
1710+ memset(NameBuffer, 0, (MAX_SECTORS_NAMELIST * AIX_SECTOR_SIZE));
1711+
1712+ for (AIXVGLPtr = AIXVolumeGroupList; AIXVGLPtr;
1713+ AIXVGLPtr = AIXVGLPtr->next ) {
1714+
1715+ partition = AIXVGLPtr->partition_list;
1716+
1717+ if (!(AIXVGLPtr->flags & AIX_VG_DIRTY)) {
1718+ continue;
1719+ }
1720+
1721+ if (partition == NULL) {
1722+ continue;
1723+ }
1724+
1725+ node = partition->logical_node;
1726+
1727+ if (node == NULL) {
1728+ continue;
1729+ }
1730+
1731+ LOG_DEBUG("DLV INIT_IO AIXNameList position:%d\n",
1732+ ((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 -
1733+ MAX_SECTORS_NAMELIST));
1734+ LOG_DEBUG("AIXVGLPTR:%p partition:%p node:%p \n", AIXVGLPtr,
1735+ partition, node);
1736+
1737+ if (INIT_IO(node, 0,
1738+ ((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 -
1739+ MAX_SECTORS_NAMELIST), MAX_SECTORS_NAMELIST,
1740+ NameBuffer)) {
1741+ continue;
1742+ }
1743+
1744+ LOG_DEBUG("DLV INIT_IO AIXNameList\n");
1745+
1746+ if (INIT_IO(node, 0, AIXVGLPtr->vgda_psn + PSN_LVE_REC,
1747+ MAX_SECTORS_LV_ENTRIES, AIXlventHead)) {
1748+ continue;
1749+ }
1750+ AIXlvent = AIXlventHead;
1751+ AIXnamelist = (struct namelist *) NameBuffer;
1752+
1753+ LOG_DEBUG("DLV INIT_IO AIXlvent\n");
1754+ // Search through the LV structs for valid LV entries
1755+ // We're just going to search until all valid LVs are found
1756+ // The max. allowable LVs is 256, and we don't want to
1757+ // search all 256 slots if only 8 are defined; however, there
1758+ // could be gaps in the LV numbering, e.g. 1,2,3,4,5,6,7,8,27,43, etc.
1759+
1760+ for (j = 0, lv_found = 0, all_lvs_found = FALSE;
1761+ !all_lvs_found && j < LVM_MAXLVS; j++, AIXlvent++) {
1762+
1763+ LOG_DEBUG(" ** DVIG:lv_size:%d lvname:[%s] j:%d lv_number:%d ** \n",
1764+ AIXlvent->num_lps, AIXnamelist->name[j], j,
1765+ AIXlvent->lvname);
1766+ LOG_DEBUG(" DVIG:stripe_exp:%u stripesize:%u lv_status:%d\n",
1767+ AIXlvent->striping_width,
1768+ GET_PHYSICAL_PART_SIZE(AIXlvent->stripe_exp),
1769+ AIXlvent->lv_state);
1770+ LOG_DEBUG(" DVIG Group:%x.Access:%x\n",
1771+ (unsigned int) AIXVGLPtr->vg_id.word2,
1772+ AIXlvent->permissions);
1773+ LOG_DEBUG(" DVIG mirror:%d mirror_policy:%d mirwrt:%d \n",
1774+ AIXlvent->mirror, AIXlvent->mirror_policy,
1775+ AIXlvent->mirwrt_consist);
1776+
1777+ // This is the same check we used in "diskedit" and "readdisk"
1778+ if (AIXlvent->lv_state == 0 ||
1779+ AIXlvent->permissions > 0x10) {
1780+ continue;
1781+ }
1782+
1783+ lv_found++;
1784+ if (lv_found == AIXVGLPtr->AIXvgh->numlvs) {
1785+ all_lvs_found = TRUE;
1786+ }
1787+
1788+ LOG_DEBUG(" DVIG lv_found:%d all_lvs_found:%d \n",
1789+ lv_found, all_lvs_found);
1790+
1791+ // Create a new logical volume and place it in the appropriate
1792+ // spot in this VG's volume list. For re-discovery, make sure
1793+ // this volume does not already exist.
1794+ if (!AIXVGLPtr->volume_list[AIXlvent->lvname]) {
1795+				new_LV = new_logical_volume(AIXlvent,
1796+						AIXVGLPtr,
1797+						AIXnamelist->name[j],
1798+						GET_PHYSICAL_PART_SIZE(AIXlvent->stripe_exp));
1803+ if (!new_LV) {
1804+ continue;
1805+ }
1806+ LOG_DEBUG(" DVIG Adding new logical volume %d to group:%x \n",
1807+ new_LV->lv_number,AIXVGLPtr->vg_id.word2);
1808+
1809+ AIXVGLPtr->volume_list[new_LV->lv_number] = new_LV;
1810+ } else {
1811+ LOG_DEBUG("DVIG Updating Vol Exists\n");
1812+ }
1813+ }
1814+
1815+ // Build the le_to_pe_map for each volume that was discovered above.
1816+ // This has to be done after all volumes in the group are discovered
1817+ if ((rc = build_pe_maps(AIXVGLPtr))) {
1818+ continue;
1819+ }
1820+
1821+ check_log_volume_and_pe_maps(AIXVGLPtr);
1822+ }
1823+
1824+ kfree(NameBuffer);
1825+ kfree(AIXlventHead);
1826+
1827+ return 0;
1828+}
1829+
1830+/*
1831+ * Function: new_logical_volume
1832+ *
1833+ * Allocate space for a new LVM logical volume, including space for the
1834+ * PE map
1835+ */
1836+static struct aix_logical_volume *
1837+new_logical_volume(struct lv_entries *AIXlvent,
1838+ struct aix_volume_group *volume_group,
1839+ char *lv_name, u32 stripesize)
1840+{
1841+
1842+ struct aix_logical_volume *new_volume;
1843+ const char *name = "evms_AIXiod";
1844+ const char *resync_name = "evms_AIXresync";
1845+
1846+ LOG_DEBUG(" NLV: lv_number:%d lv_allocated_le:%d lv_size:%d\n",
1847+ AIXlvent->lvname, AIXlvent->num_lps,
1848+ AIXlvent->num_lps * volume_group->pe_size);
1849+
1850+ // Allocate space for the new logical volume.
1851+ new_volume = kmalloc(sizeof (struct aix_logical_volume), GFP_KERNEL);
1852+ if (!new_volume) {
1853+ return NULL;
1854+ }
1855+ memset(new_volume, 0, sizeof (struct aix_logical_volume));
1856+
1857+ // Allocate space for the LE to PE mapping table
1858+ // We add 1 for the allocated le to ease mapping later on, all AIX le are 1 based
1859+ new_volume->le_to_pe_map =
1860+ kmalloc((AIXlvent->num_lps + 1) * sizeof (struct pe_table_entry),
1861+ GFP_KERNEL);
1862+ if (!new_volume->le_to_pe_map) {
1863+ delete_logical_volume(new_volume);
1864+ return NULL;
1865+ }
1866+
1867+ memset(new_volume->le_to_pe_map, 0,
1868+ (AIXlvent->num_lps + 1) * sizeof (struct pe_table_entry));
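+	// Illustrative note: the extra entry gives the table num_lps + 1
+	// slots, so an LV with, say, 4 LPs allocates slots 0..4. The
+	// build_pe_maps() pass below actually stores LEs 0-based
+	// (lp_num - 1), leaving the spare slot as slack.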
1869+
1870+ if (AIXlvent->mirror > AIX_DEFAULT_MIRRORING) {
1871+ new_volume->le_to_pe_map_mir1 =
1872+ kmalloc((AIXlvent->num_lps +
1873+ 1) * sizeof (struct pe_table_entry), GFP_KERNEL);
1874+ if (!new_volume->le_to_pe_map_mir1) {
1875+ delete_logical_volume(new_volume);
1876+ return NULL;
1877+ }
1878+ memset(new_volume->le_to_pe_map_mir1, 0,
1879+ (AIXlvent->num_lps +
1880+ 1) * sizeof (struct pe_table_entry));
1881+ }
1882+
1883+ if (AIXlvent->mirror == AIX_MAX_MIRRORS) {
1884+ new_volume->le_to_pe_map_mir2 =
1885+ kmalloc((AIXlvent->num_lps + 1)
1886+ * sizeof (struct pe_table_entry), GFP_KERNEL);
1887+ if (!new_volume->le_to_pe_map_mir2) {
1888+ delete_logical_volume(new_volume);
1889+ return NULL;
1890+ }
1891+ memset(new_volume->le_to_pe_map_mir2, 0,
1892+ (AIXlvent->num_lps +1)
1893+ * sizeof (struct pe_table_entry));
1894+ }
1895+
1896+ // Initialize the rest of the new volume.
1897+ new_volume->lv_number = AIXlvent->lvname;
1898+ new_volume->lv_size = AIXlvent->num_lps * (volume_group->pe_size);
1899+ new_volume->lv_access = AIXlvent->permissions | EVMS_LV_NEW; // All volumes start new.
1900+ new_volume->lv_status = AIXlvent->lv_state;
1901+ //new_volume->lv_minor = MINOR(1);
1902+ new_volume->mirror_copies = AIXlvent->mirror;
1903+// new_volume->mirror_iterations = AIX_DEFAULT_MIRRORING;
1904+ new_volume->stripes = AIXlvent->striping_width;
1905+ new_volume->stripe_size = stripesize;
1906+ new_volume->stripe_size_shift = evms_cs_log2(stripesize);
1907+ new_volume->pe_size = volume_group->pe_size;
1908+ new_volume->pe_size_shift = evms_cs_log2(volume_group->pe_size);
1909+ new_volume->num_le = AIXlvent->num_lps;
1910+// new_volume->new_volume = TRUE;
1911+ new_volume->group = volume_group;
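+	// Note on the *_shift fields above: when stripe_size and pe_size are
+	// powers of two (e.g. a hypothetical 32768-sector / 16MB PE giving
+	// pe_size_shift = 15), the I/O path can use "sector >> shift" and
+	// "sector & (size - 1)" in place of divide and modulo.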
1912+
1913+ volume_group->numlvs++;
1914+
1915+ sprintf(new_volume->name, "aix/%s", lv_name);
1916+
1917+ if (!AIX_BH_list_pool
1918+ && new_volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
1919+
1920+ // We only need the ReSync thread if we have at least one mirrored LV.
1921+ // You can't ReSync a non-mirrored drive
1922+
1923+ AIX_BH_list_pool =
1924+ evms_cs_create_pool(sizeof (struct aix_mirror_bh),
1925+ "EVMS_AIX_BH", aix_notify_cache_ctor,
1926+ NULL);
1927+		if (!AIX_BH_list_pool) {
1928+			return NULL;
1929+		}
1930+
1931+		AIX_mirror_read_retry_thread =
1932+		    evms_cs_register_thread(AIXiod, NULL, name);
1933+
1934+		AIX_mirror_resync_thread =
1935+		    evms_cs_register_thread(AIXresync, NULL,
1936+					    resync_name);
1937+ }
1938+
1939+ LOG_DEBUG("NLV lv_number:%d name:%s lv_size " PFU64 " \n",
1940+ new_volume->lv_number, new_volume->name, new_volume->lv_size);
1941+ LOG_DEBUG("NLV stripe_size:%d stripe_size_shift:%d\n",
1942+ new_volume->stripe_size, new_volume->stripe_size_shift);
1943+
1944+ return new_volume;
1945+}
1946+
1947+/*
1948+ * Function: aix_notify_cache_ctor
1949+ * this function initializes the b_wait field in the buffer heads
1950+ * in our private buffer head pool.
1951+ */
1952+static void
1953+aix_notify_cache_ctor(void *foo, kmem_cache_t * cachep, unsigned long flags)
1954+{
1955+ if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
1956+ SLAB_CTOR_CONSTRUCTOR) {
1957+ struct aix_mirror_bh *rbh = (struct aix_mirror_bh *) foo;
1958+ memset(rbh, 0, sizeof (struct aix_mirror_bh));
1959+ rbh->remaining = (atomic_t) ATOMIC_INIT(0);
1960+ init_waitqueue_head(&rbh->bh_req.b_wait);
1961+ }
1962+}
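+/*
+ * Under the 2.4 slab interface the constructor can also be invoked for
+ * cache verification passes, so the SLAB_CTOR_VERIFY/SLAB_CTOR_CONSTRUCTOR
+ * test above makes sure we only initialize objects when a real
+ * construction pass is taking place.
+ */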
1963+
1964+/*
1965+ * Function: build_pe_maps
1966+ *
1967+ * After all logical volumes have been discovered, the mappings from
1968+ * logical extents to physical extents must be constructed. Each PV
1969+ * contains a map on-disk of its PEs. Each PE map entry contains the
1970+ * logical volume number and the logical extent number on that volume.
1971+ * Our internal map is the reverse of this map for each volume, listing
1972+ * the PV node and sector offset for every logical extent on the volume.
1973+ */
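+/*
+ * Sketch of the inversion (hypothetical numbers): if PV 2's on-disk PP
+ * map says PP #7 belongs to lv_index 3 at lp_num 5, then this pass sets
+ * volume_list[2]->le_to_pe_map[4].owning_pv to PV 2 and records PP #7's
+ * sector offset; one entry per LE, per mirror copy.
+ */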
1974+static int
1975+ build_pe_maps(struct aix_volume_group *volume_group)
1976+{
1977+ struct partition_list_entry *partition;
1978+ struct partition_list_entry *mirror_partition;
1979+ struct pp_entries *AIXppent, *AIXppent_buff;
1980+ struct pv_header *AIXpvh;
1981+ u64 offset;
1982+ u32 le_number;
1983+ u32 j, pp_count, pvh_pos;
1984+ u32 MirrorFound;
1985+ u32 pvh_posn[LVM_MAXPVS];
1986+ u32 rc;
1987+#ifdef EVMS_DEBUG_MIRRORS
1988+ u32 lv_found, all_lvs_found;
1989+ u32 mirs = 0;
1990+#endif
1991+
1992+ LOG_DEBUG(" *** BPEM ***\n");
1993+ // For every partition in this VG
1994+
1995+ AIXppent_buff = kmalloc(AIX_SECTOR_SIZE * PHYS_VOL_OFFSET, GFP_KERNEL);
1996+ if (!AIXppent_buff) {
1997+ return -ENOMEM;
1998+ }
1999+
2000+ memset(AIXppent_buff, 0, AIX_SECTOR_SIZE * PHYS_VOL_OFFSET);
2001+	memset(pvh_posn, 0, LVM_MAXPVS * sizeof(u32));
2002+
2003+ AIXpvh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
2004+ if (!AIXpvh) {
2005+ kfree(AIXppent_buff);
2006+ return -ENOMEM;
2007+ }
2008+
2009+ memset(AIXpvh, 0, AIX_SECTOR_SIZE);
2010+
2011+ LOG_DEBUG(" BPEM AIXppent_buff:%d \n",
2012+ (AIX_SECTOR_SIZE * PHYS_VOL_OFFSET));
2013+
2014+	// This next section calculates the sector spacing between the PV
2015+	// info areas for the VG. AIX doesn't always space the info the same
2016+	// way; it can be 17 or 34 sectors apart depending on the PE size selected.
2017+
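+	// (The spacing itself falls out of AIX_pvh_data_posn() below, which
+	// rounds each PV's pp_entries area up to a whole number of 512-byte
+	// sectors, so the per-PV slot size changes with the PP count a given
+	// PE size implies.)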
2018+ rc = AIX_pvh_data_posn(volume_group->vgda_psn, pvh_posn, volume_group->partition_list, volume_group->AIXvgh->numpvs);
2019+
2020+ if (rc != 0) {
2021+ kfree(AIXppent_buff);
2022+ kfree(AIXpvh);
2023+ return (rc);
2024+ }
2025+
2026+ for (partition = volume_group->partition_list; partition;
2027+ partition = partition->next) {
2028+
2029+ LOG_DEBUG(" BPEM partition:%p next:%p\n", partition,
2030+ partition->next);
2031+
2032+ pvh_pos = pvh_posn[partition->pv_number];
2033+
2034+ LOG_DEBUG(" BPEM pvh_pos:%d pv_number:%d\n", pvh_pos, partition->pv_number);
2035+
2036+ if (INIT_IO(partition->logical_node, 0, pvh_pos, 1, AIXpvh)) {
2037+ kfree(AIXppent_buff);
2038+ kfree(AIXpvh);
2039+ return -EIO;
2040+ }
2041+ // For every entry in the PE map, calculate the PE's sector offset
2042+ // and update the correct LV's PE map. LV number of 0 marks an unused PE.
2043+ // For re-discovery, only compute entries for new volumes.
2044+
2045+ if (INIT_IO(partition->logical_node, 0, pvh_pos, AIX_PVHPP_LENGTH,
2046+ AIXppent_buff)) {
2047+ kfree(AIXppent_buff);
2048+ kfree(AIXpvh);
2049+ return -EIO;
2050+ }
2051+
2052+ AIXppent = AIXppent_buff;
2053+ AIXppent++;
2054+
2055+ pp_count = AIXpvh->pp_count;
2056+
2057+ LOG_DEBUG("BPEM AIXpvh data: pp_count:%d psn_part1:%d pv_id1:%d pv_id2:%d pv_id3:%d pv_id4:%d pv_num:%d pv_state:%d vgdas:%d res1:%d res2:%d\n", AIXpvh->pp_count,
2058+ AIXpvh->psn_part1,
2059+ AIXpvh->pv_id.word1,
2060+ AIXpvh->pv_id.word2,
2061+ AIXpvh->pv_id.word3,
2062+ AIXpvh->pv_id.word4,
2063+ AIXpvh->pv_num,
2064+ AIXpvh->pv_state, AIXpvh->pvnum_vgdas, AIXpvh->res1, AIXpvh->res2);
2065+
2066+ LOG_DEBUG(" PE Map: volgrp:%x AIXpvh->pv_num:%d partition:%p next:%p lv_index:%d pp_count:%d\n",
2067+ volume_group->vg_id.word2, AIXpvh->pv_num, partition,
2068+ partition->next, AIXppent->lv_index, pp_count);
2069+
2070+ for (j = 0; j < pp_count; j++,AIXppent++) {
2071+ if (!AIXppent->lv_index || AIXppent->pp_state == AIX_LVM_LVUNDEF) {
2072+ continue;
2073+ }
2074+
2075+			if (!volume_group->volume_list[AIXppent->lv_index - 1]) {
2076+				LOG_SERIOUS("Failed attempt to access volume without memory allocation lv:%d\n",
2077+					    AIXppent->lv_index - 1);
2078+				continue;
2079+			}
2080+
2081+			LOG_EXTRA(" -- pv:%x pp:%d st:%d nm:%s lv:%d lp:%d cp:%d fst v:%d fst p:%d snd v:%d snd p:%d \n",
2082+				  volume_group->vg_id.word2, j + 1,
2083+				  AIXppent->pp_state,
2084+				  volume_group->volume_list[AIXppent->lv_index -1]->name,
2085+				  AIXppent->lv_index, AIXppent->lp_num,
2086+				  AIXppent->copy, AIXppent->fst_alt_vol,
2087+				  AIXppent->fst_alt_part,
2088+				  AIXppent->snd_alt_vol,
2089+				  AIXppent->snd_alt_part);
2090+
2091+			le_number = AIXppent->lp_num - 1;	// AIX lp's start @ 1, we want a 0 index
2092+			offset = ((j * (volume_group->pe_size)) + AIXpvh->psn_part1);
2093+
2094+			LOG_DEBUG(" PE Map: le_number:%d partition:%p lv_index:%d lv_name:%s\n",
2095+				  le_number, partition, AIXppent->lv_index,
2096+				  volume_group->volume_list[AIXppent->lv_index -1]->name);
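+			// Worked example (hypothetical numbers): with a PE of
+			// 32768 sectors and psn_part1 = 4608, PP j=2 lives at
+			// sector 2 * 32768 + 4608 = 70144 on this PV, and the
+			// LV's LE (lp_num - 1) is pointed at that offset.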
2097+
2098+ if (volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map
2099+ && le_number <= volume_group->volume_list[AIXppent->lv_index - 1]->num_le) {
2100+
2101+ volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map[le_number].owning_pv = partition;
2102+ volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map[le_number].pe_sector_offset = offset;
2103+ volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map[le_number].pp_state = AIXppent->pp_state;
2104+ }
2105+
2106+ if (volume_group->volume_list[AIXppent->lv_index -1]->mirror_copies >
2107+ AIX_DEFAULT_MIRRORING) {
2108+
2109+ LOG_EXTRA(" PE Map: Mirror found lv:%d -- \n",
2110+ AIXppent->lv_index);
2111+
2112+ for (mirror_partition = volume_group->partition_list,
2113+ MirrorFound = FALSE;
2114+ mirror_partition && !MirrorFound;
2115+ mirror_partition = mirror_partition->next) {
2116+
2117+ if (mirror_partition->pv_number == AIXppent->fst_alt_vol) {
2118+
2119+ offset = (((AIXppent->fst_alt_part - 1) * (volume_group->pe_size)) + AIXpvh->psn_part1);
2120+
2121+ volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map_mir1[le_number].owning_pv = mirror_partition;
2122+ volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map_mir1[le_number].pe_sector_offset = offset;
2123+ volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map_mir1[le_number].pp_state = AIXppent->pp_state;
2124+
2125+ LOG_EXTRA(" PE Map: mirror_partition:%p \n",
2126+ mirror_partition);
2127+					LOG_EXTRA(" PE Map: mirror_sector_offset:%d\n",
2128+ AIXppent->fst_alt_part);
2129+
2130+ MirrorFound = TRUE;
2131+ }
2132+ }
2133+
2134+ if (volume_group->volume_list[AIXppent->lv_index -1]->mirror_copies == AIX_MAX_MIRRORS) {
2135+
2136+ for (mirror_partition = volume_group->partition_list,
2137+ MirrorFound = FALSE;
2138+ mirror_partition && !MirrorFound;
2139+ mirror_partition = mirror_partition->next) {
2140+
2141+ if (mirror_partition->pv_number == AIXppent->snd_alt_vol) {
2142+
2143+ offset = (((AIXppent->snd_alt_part - 1) * (volume_group->pe_size)) + AIXpvh->psn_part1);
2144+
2145+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].owning_pv = mirror_partition;
2146+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].pe_sector_offset = offset;
2147+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].pp_state = AIXppent->pp_state;
2148+
2149+ LOG_EXTRA(" PE Map: mirror_partition2:%p \n",
2150+ mirror_partition);
2151+						LOG_EXTRA(" PE Map: mirror_sector_offset2:%d\n",
2152+ AIXppent->snd_alt_part);
2153+
2154+ MirrorFound = TRUE;
2155+ }
2156+ }
2157+ }
2158+
2159+ } // End of if mirroring is enabled
2160+ }
2161+ }
2162+
2163+// LOG_EXTRA(" PE Map: PE maps:%d Mirror count:%d -- \n", lvs, mirs);
2164+
2165+#ifdef EVMS_DEBUG_MIRRORS
2166+ for (mirs = 0, lv_found = 0, all_lvs_found = FALSE;
2167+ !all_lvs_found && mirs < LVM_MAXLVS; mirs++) {
2168+
2169+ if (volume_group->volume_list[mirs] != NULL) {
2170+ if (volume_group->volume_list[mirs]->lv_status ==
2171+ LV_ACTIVE) {
2172+
2173+ lv_found++;
2174+
2175+ LOG_DEBUG(" PE Map: owning part lv %d -- %p\n",
2176+ mirs,
2177+ volume_group->volume_list[mirs]->
2178+ le_to_pe_map[0].owning_pv);
2179+ if (volume_group->volume_list[mirs]->
2180+ mirror_copies > AIX_DEFAULT_MIRRORING) {
2181+ LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n",
2182+ mirs,
2183+ volume_group->volume_list[mirs]->
2184+ le_to_pe_map_mir1[0].owning_pv);
2185+ }
2186+ if (volume_group->volume_list[mirs]->
2187+ mirror_copies == AIX_MAX_MIRRORS) {
2188+ LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n",
2189+ mirs,
2190+ volume_group->volume_list[mirs]->
2191+ le_to_pe_map_mir2[0].owning_pv);
2192+ }
2193+ }
2194+ if (lv_found == volume_group->AIXvgh->numlvs) {
2195+ all_lvs_found = TRUE;
2196+ LOG_DEBUG(" PE Map: all_lvs_found\n");
2197+ }
2198+ }
2199+ }
2200+#endif
2201+
2202+ kfree(AIXpvh);
2203+ kfree(AIXppent_buff);
2204+
2205+ return 0;
2206+}
2207+
2208+/*
2209+ * Function: check_log_volume_and_pe_maps
2210+ *
2211+ * Make sure all volumes in this group have valid LE-to-PE maps.
2212+ * Any volume that doesn't is deleted. This is safe for re-discovery
2213+ * because only new volumes could have corrupted PE maps.
2214+ */
2215+static int
2216+check_log_volume_and_pe_maps(struct aix_volume_group *group)
2217+{
2218+ struct aix_logical_volume *volume;
2219+ int i, j, lv_found, all_lvs_found;
2220+
2221+ LOG_DEBUG(" check_pe_map.\n");
2222+
2223+ for (i = 0, all_lvs_found = FALSE, lv_found = 0;
2224+ !all_lvs_found && i < LVM_MAXLVS; i++) {
2225+ if (!group->volume_list[i]) {
2226+ LOG_DEBUG(" CPEM No Volume %d found \n", i);
2227+ continue;
2228+ }
2229+
2230+ volume = group->volume_list[i];
2231+ if (!volume->le_to_pe_map) {
2232+ LOG_DEBUG(" CPEM Volume %s has no PE map.\n",
2233+ volume->name);
2234+ delete_logical_volume(volume);
2235+ continue;
2236+ }
2237+
2238+ LOG_DEBUG(" CPEM volume %s num_le: %d \n", volume->name,
2239+ volume->num_le);
2240+
2241+ lv_found++;
2242+
2243+ if (lv_found == group->AIXvgh->numlvs) {
2244+ all_lvs_found = TRUE;
2245+ }
2246+
2247+ for (j = 0; j < volume->num_le; j++) {
2248+ if (!volume->le_to_pe_map[j].owning_pv ||
2249+ !volume->le_to_pe_map[j].pe_sector_offset) {
2250+ LOG_SERIOUS(" CPEM Volume (%s) incomplete PE map (LE %d) \n",
2251+ volume->name, j);
2252+ volume->lv_access |= EVMS_LV_INCOMPLETE;
2253+ }
2254+
2255+ if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
2256+			if (!volume->le_to_pe_map_mir1[j].owning_pv ||
2257+			    !volume->le_to_pe_map_mir1[j].pe_sector_offset) {
2259+ LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 1 (LE %d) \n",
2260+ volume->name, j);
2261+ volume->lv_access |= EVMS_LV_INCOMPLETE;
2262+ }
2263+
2264+ if (volume->mirror_copies == AIX_MAX_MIRRORS) {
2265+				if (!volume->le_to_pe_map_mir2[j].owning_pv ||
2266+				    !volume->le_to_pe_map_mir2[j].pe_sector_offset) {
2269+ LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 2 (LE %d) \n",
2270+ volume->name, j);
2271+ volume->lv_access |= EVMS_LV_INCOMPLETE;
2272+ }
2273+ }
2274+ }
2275+ }
2276+ }
2277+
2278+ LOG_EXTRA(" Leaving check_pe_map.\n");
2279+ return 0;
2280+}
2281+
2282+/*
2283+ * Function: export_volumes
2284+ *
2285+ * The last thing this VGE must do is take each constructed volume and
2286+ * place it back on the evms logical partition list.
2287+ */
2288+static int
2289+export_volumes(struct evms_logical_node **evms_partition_list)
2290+{
2291+ struct aix_volume_group *AIXVGLPtr;
2292+ struct evms_logical_node *new_node;
2293+ struct aix_logical_volume *volume;
2294+ int j, lv_found, all_lvs_found;
2295+ int count = 0;
2296+
2297+ for (AIXVGLPtr = AIXVolumeGroupList; AIXVGLPtr; AIXVGLPtr = AIXVGLPtr->next) {
2298+
2299+ if (!(AIXVGLPtr->flags & AIX_VG_DIRTY)) {
2300+ LOG_DEBUG(" EV Existing group(%d), not dirty, skipping\n",
2301+ AIXVGLPtr->vg_id.word2);
2302+ continue;
2303+ }
2304+ LOG_DEBUG(" Exporting all new volumes numpvs:%d numlvs:%d \n",
2305+ AIXVGLPtr->AIXvgh->numpvs, AIXVGLPtr->numlvs);
2306+
2307+ // Export every valid volume in the group. For re-discovery,
2308+ // make sure we are only exporting "new" volumes.
2309+
2310+ for (j = 0, all_lvs_found = FALSE, lv_found = 0;
2311+ !all_lvs_found && j < LVM_MAXLVS; j++) {
2312+ if (AIXVGLPtr->volume_list[j] != NULL) {
2313+ if (AIXVGLPtr->volume_list[j]->lv_access & EVMS_LV_NEW) {
2314+
2315+ LOG_DEBUG(" EV Checking LV:[%d] volume:%p\n",
2316+ j,AIXVGLPtr->volume_list[j]);
2317+
2318+ volume = AIXVGLPtr->volume_list[j];
2319+ lv_found++;
2320+
2321+ if (lv_found == AIXVGLPtr->AIXvgh->numlvs) {
2322+ all_lvs_found = TRUE;
2323+ }
2324+ // For new volumes, create a new EVMS node and
2325+ // initialize the appropriate fields.
2326+ if (evms_cs_allocate_logical_node(&new_node)) {
2327+ LOG_DEBUG(" Export Vol Error allocating node !!\n");
2328+ continue;
2329+ } else {
2330+ LOG_DEBUG(" EV Node allocated OK\n");
2331+ }
2332+
2333+// volume->new_volume = 0;
2334+ volume->volume_node = new_node;
2335+ volume->lv_access &= (~EVMS_LV_NEW);
2336+ new_node->hardsector_size = AIXVGLPtr->hard_sect_size;
2337+ new_node->block_size = AIXVGLPtr->block_size;
2338+ new_node->plugin = &plugin_header;
2339+ new_node->private = volume;
2340+ new_node->total_vsectors = volume->lv_size;
2341+
2342+ LOG_DEBUG(" EV volume->name:[%s]\n",
2343+ volume->name);
2344+
2345+ strncpy(new_node->name,volume->name,
2346+ EVMS_VOLUME_NAME_SIZE + 1);
2347+
2348+ // Is the volume read-only?
2349+ if (!(volume->lv_access & AIX_LV_WRITE)
2350+ || volume->lv_access & EVMS_LV_INCOMPLETE)
2351+ {
2352+ new_node->flags |= EVMS_VOLUME_SET_READ_ONLY;
2353+ LOG_DEBUG(" EV Read Only volume->lv_access:%d\n",
2354+ volume->lv_access);
2355+ }
2356+
2357+ evms_cs_add_logical_node_to_list(evms_partition_list,
2358+ new_node);
2359+ count++;
2360+
2361+ LOG_DEBUG(" Exporting LVM volume %p new_node:%p ESD->volume_name[%s]\n",
2362+ volume, new_node,new_node->name);
2363+ } else {
2364+ evms_cs_add_logical_node_to_list(evms_partition_list,
2365+ AIXVGLPtr->volume_list[j]->volume_node);
2366+ count++;
2367+ LOG_DEBUG(" ELV vol_list[%d]%p\n", j,
2368+ AIXVGLPtr->volume_list[j]);
2369+ }
2370+ } else {
2371+ LOG_DEBUG(" EV Checking LV:[%d] == NULL\n",j);
2372+ }
2373+ } // end checking all lvs
2374+
2375+ AIXVGLPtr->flags &= ~AIX_VG_DIRTY;
2376+ }
2377+
2378+ return count;
2379+}
2380+
2381+/*
2382+ * Function: delete_logical_volume
2383+ *
2384+ * This function deletes the in-memory representation of a single LVM
2385+ * logical volume, including its PE map and any snapshot data. It does
2386+ * not alter the parent volume group, except to remove this volume from
2387+ * its volume list.
2388+ */
2389+static int
2390+delete_logical_volume(struct aix_logical_volume *volume)
2391+{
2392+ struct aix_volume_group *group = volume->group;
2393+
2394+ LOG_DEBUG(" Deleting volume %s\n", volume->name);
2395+
2396+ // Now free up all the memory. This includes the LE-to-PE map, any
2397+ // mirror PEs, etc.
2398+ if (volume->le_to_pe_map) {
2399+ kfree(volume->le_to_pe_map);
2400+ volume->le_to_pe_map = NULL;
2401+ }
2402+
2403+ if (volume->le_to_pe_map_mir1) {
2404+ kfree(volume->le_to_pe_map_mir1);
2405+ volume->le_to_pe_map_mir1 = NULL;
2406+ }
2407+
2408+ if (volume->le_to_pe_map_mir2) {
2409+ kfree(volume->le_to_pe_map_mir2);
2410+ volume->le_to_pe_map_mir2 = NULL;
2411+ }
2412+ // Remove this volume from the volume-group's list.
2413+ if (group && group->volume_list[volume->lv_number] == volume) {
2414+ group->volume_list[volume->lv_number] = NULL;
2415+ group->numlvs--;
2416+ }
2417+
2418+ kfree(volume);
2419+
2420+ return 0;
2421+}
2422+
2423+/* Function: remove_group_from_list
2424+ *
2425+ * Remove an LVM volume group from the global LVM list.
2426+ */
2427+static int
2428+remove_group_from_list(struct aix_volume_group *group)
2429+{
2430+ struct aix_volume_group **p_group;
2431+
2432+ for (p_group = &AIXVolumeGroupList; *p_group;
2433+ p_group = &(*p_group)->next) {
2434+ if (*p_group == group) {
2435+ *p_group = (*p_group)->next;
2436+ group->next = NULL;
2437+ break;
2438+ }
2439+ }
2440+ return 0;
2441+}
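+/*
+ * The double-pointer walk above lets the head and interior cases share
+ * one unlink path: p_group always addresses the link that points at the
+ * current group, so "*p_group = (*p_group)->next" also works when the
+ * group is AIXVolumeGroupList itself.
+ */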
2442+
2443+/*
2444+ * Function: delete_aix_node
2445+ *
2446+ * This function deletes the in-memory representation of an LVM
2447+ * logical volume. Right now it makes a lot of assumptions about
2448+ * the data in the group not being corrupted. It would be possible
2449+ * to put in a lot of consistency checks before deleting everything
2450+ * to indicate if problems have occurred during the lifetime of the
2451+ * volume and its volume group.
2452+ */
2453+static int
2454+delete_aix_node(struct evms_logical_node *logical_node)
2455+{
2456+ struct aix_logical_volume *volume =
2457+ (struct aix_logical_volume *) (logical_node->private);
2458+ struct aix_volume_group *group = volume->group;
2459+
2460+ if (delete_logical_volume(volume)) {
2461+ return -EINVAL;
2462+ }
2463+ // If we just removed the last volume from this group, the entire group
2464+ // can also be deleted.
2465+ if (group && group->numlvs == 0) {
2466+ remove_group_from_list(group);
2467+ deallocate_volume_group(group);
2468+ }
2469+ // Free the logical node.
2470+ evms_cs_deallocate_logical_node(logical_node);
2471+
2472+ return 0;
2473+}
2474+
2475+/* Function: deallocate_volume_group
2476+ *
2477+ * This function deletes the entire in-memory representation of an LVM
2478+ * volume group, including all partitions and logical volumes. If this
2479+ * group is on the VGE's volume group list, it is removed.
2480+ */
2481+static int
2482+deallocate_volume_group(struct aix_volume_group *group)
2483+{
2484+ struct partition_list_entry *partition;
2485+ struct partition_list_entry *next_part;
2486+ int i;
2487+
2488+ LOG_DEBUG(" Deleting volume group %x\n", group->vg_id.word2);
2489+
2490+ // Delete all partitions from the group's list.
2491+ for (partition = group->partition_list; partition;
2492+ partition = next_part) {
2493+
2494+ next_part = partition->next;
2495+
2496+ if (partition->logical_node) {
2497+ // Send a delete command down to the partition manager.
2498+ LOG_DEBUG(" Deleting PV %d from group %x\n",
2499+ partition->pv_number, group->vg_id.word2);
2500+ DELETE(partition->logical_node);
2501+ }
2502+ kfree(partition);
2503+ }
2504+
2505+ // Delete all logical volumes, and the array of pointers.
2506+ for (i = 0; i < LVM_MAXLVS; i++) {
2507+ if (group->volume_list[i]) {
2508+ delete_logical_volume(group->volume_list[i]);
2509+ }
2510+ }
2511+
2512+ kfree(group);
2513+
2514+ return 0;
2515+}
2516+
2517+/* Function: end_discover_aix
2518+ *
2519+ * The discovery process at the region-manager level is now iterative,
2520+ * much like the EVMS feature level. To accomplish this correctly, and
2521+ * also to accomplish partial volume discovery, a second discover
2522+ * entry point is needed, so EVMS can tell the region managers that
2523+ * discovery is over, and to finish up any discovery that is not yet
2524+ * complete. When this function is called, it should be assumed that
2525+ * the node list has had nothing new added to it since the last call
2526+ * of the regular discover function. Therefore, when this function is
2527+ * called, we do not need to try to discovery any additional volume
2528+ * groups. We will, however, look for logical volumes once more. This
2529+ * gives us the ability to export (read-only) volumes that have
2530+ * partially corrupted LE maps due to missing PVs in their VG.
2531+ */
2532+static int
2533+end_discover_aix(struct evms_logical_node **evms_logical_disk_head)
2534+{
2535+
2536+ int rc;
2537+
2538+ MOD_INC_USE_COUNT;
2539+ LOG_DEBUG("Final Discovery:\n");
2540+
2541+ rc = discover_logical_volumes();
2542+
2543+ if (!rc) {
2544+ rc = export_volumes(evms_logical_disk_head);
2545+
2546+ lvm_cleanup();
2547+ }
2548+
2549+ MOD_DEC_USE_COUNT;
2550+ return rc;
2551+}
2552+
2553+/****************************************************
2554+* Function: AIX_alloc_wbh
2555+*
2556+* Allocate buffer heads from the pool and return them as a linked list
2557+*
2558+*
2559+*****************************************************/
2560+static struct aix_mirror_bh *
2561+AIX_alloc_wbh(struct evms_logical_node *node,
2562+ struct evms_logical_node *node2,
2563+ struct evms_logical_node *node3,
2564+ struct buffer_head *bh,
2565+ u32 mirror_copies, u32 le, u64 new_sector2, u64 new_sector3)
2566+{
2567+ struct aix_mirror_bh *tmp_bh = NULL, *head_bh = NULL;
2568+ int i;
2569+
2570+ head_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2571+
2572+ if (!head_bh) {
2573+ LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",
2574+ __LINE__);
2575+ return NULL;
2576+ }
2577+
2578+ head_bh->master_bh = bh;
2579+ head_bh->mirror_bh_list = NULL;
2580+ head_bh->remaining = (atomic_t) ATOMIC_INIT(0);
2581+
2582+ for (i = AIX_DEFAULT_MIRRORING; i <= mirror_copies; i++) {
2583+
2584+ tmp_bh =
2585+ evms_cs_allocate_from_pool(AIX_BH_list_pool,
2586+ EVMS_BLOCKABLE);
2587+		if (!tmp_bh) {
2588+			LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",
2589+				    __LINE__);
2590+			// Give back anything already chained before bailing out.
2591+			while ((tmp_bh = head_bh->mirror_bh_list)) {
+				head_bh->mirror_bh_list = tmp_bh->next_r1;
+				evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
+			}
+			evms_cs_deallocate_to_pool(AIX_BH_list_pool, head_bh);
+			return NULL;
+		}
2592+
2593+ tmp_bh->next_r1 = head_bh->mirror_bh_list;
2594+ head_bh->mirror_bh_list = tmp_bh;
2595+ atomic_inc(&head_bh->remaining);
2596+
2597+ memcpy(&tmp_bh->bh_req, bh, sizeof (struct buffer_head));
2598+ tmp_bh->remaining = (atomic_t) ATOMIC_INIT(0);
2599+ init_waitqueue_head(&tmp_bh->bh_req.b_wait);
2600+ //tmp_bh->bh_req.b_size = bh->b_size;
2601+
2602+ switch (i) {
2603+
2604+ case AIX_DEFAULT_MIRRORING:
2605+ tmp_bh->node = node;
2606+ tmp_bh->bh_req.b_rsector = bh->b_rsector;
2607+ break;
2608+
2609+ case AIX_FIRST_MIRROR:
2610+ tmp_bh->node = node2;
2611+ tmp_bh->bh_req.b_rsector = new_sector2;
2612+ break;
2613+
2614+ case AIX_MAX_MIRRORS:
2615+ tmp_bh->node = node3;
2616+ tmp_bh->bh_req.b_rsector = new_sector3;
2617+ break;
2618+ }
2619+
2620+ tmp_bh->bh_req.b_end_io = AIX_handle_write_mirror_drives; //setup callback routine
2621+ tmp_bh->bh_req.b_private = (void *) head_bh;
2622+
2623+ }
2624+
2625+ return head_bh;
2626+
2627+}
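+/*
+ * Resulting layout (sketch): head_bh keeps the caller's master_bh and a
+ * `remaining' count, with one cloned buffer head per mirror copy pushed
+ * onto mirror_bh_list (most recently built copy first). Each clone's
+ * b_end_io is AIX_handle_write_mirror_drives, which completes the master
+ * once `remaining' reaches zero.
+ */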
2628+
2629+/****************************************************
2630+* Function: AIX_handle_write_mirror_drives
2631+*
2632+* Handles a write from a set of mirrored AIX LVs
2633+*
2634+*
2635+*
2636+*****************************************************/
2637+static void
2638+AIX_handle_write_mirror_drives(struct buffer_head *bh, int uptodate)
2639+{
2640+ struct aix_logical_volume *volume;
2641+ struct evms_logical_node *node;
2642+ struct aix_mirror_bh *tmp_bh = NULL, *tmp_bh2 = NULL;
2643+ kdev_t tmp_b_rdev;
2644+ u32 count, le = 0;
2645+
2646+ tmp_bh = (struct aix_mirror_bh *) bh->b_private;
2647+ tmp_b_rdev = tmp_bh->master_bh->b_rdev;
2648+ node = tmp_bh->node;
2649+ volume = (struct aix_logical_volume *) node->private;
2650+
2651+ LOG_DEBUG("AHWMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n",
2652+ node, bh->b_state, uptodate, volume->mirror_copies);
2653+
2654+ if (!uptodate) {
2655+ le = tmp_bh->le;
2656+
2657+ switch (tmp_bh->iteration) {
2658+ case AIX_DEFAULT_MIRRORING:
2659+ volume->le_to_pe_map[le].pp_state += AIX_LVM_LVSTALE;
2660+ break;
2661+
2662+ case AIX_FIRST_MIRROR:
2663+ volume->le_to_pe_map_mir1[le].pp_state +=
2664+ AIX_LVM_LVSTALE;
2665+ break;
2666+
2667+ case AIX_MAX_MIRRORS:
2668+ volume->le_to_pe_map_mir2[le].pp_state +=
2669+ AIX_LVM_LVSTALE;
2670+ break;
2671+ }
2672+
2673+ AIX_evms_cs_notify_lv_io_error(node);
2674+ }
2675+
2676+ if (atomic_dec_and_test(&tmp_bh->remaining)) {
2677+ tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate);
2678+ tmp_bh2 = tmp_bh->mirror_bh_list;
2679+ evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
2680+
2681+ while (tmp_bh2) {
2682+ tmp_bh = tmp_bh2->next_r1;
2683+ evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh2);
2684+ tmp_bh2 = tmp_bh;
2685+ }
2686+
2687+ evms_cs_volume_request_in_progress(tmp_b_rdev,
2688+ AIX_DECREMENT_REQUEST,
2689+ &count);
2690+ }
2691+
2692+ return;
2693+}
2694+
2695+/****************************************************
2696+* Function: AIX_alloc_rbh
2697+*
2698+* Allocate a buffer head from the pool for a mirrored read or resync request
2699+*
2700+*
2701+*****************************************************/
2702+static struct aix_mirror_bh *
2703+AIX_alloc_rbh(struct evms_logical_node *node,
2704+ struct buffer_head *bh,
2705+ u32 mirror_copies, u32 le, u64 org_sector, int cmd)
2706+{
2707+ struct aix_mirror_bh *tmp_bh = NULL;
2708+
2709+ tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2710+
2711+ if (!tmp_bh) {
2712+ LOG_SERIOUS
2713+ ("Unable to allocate memory for mirror pool line:%d\n",
2714+ __LINE__);
2715+ return NULL;
2716+ }
2717+
2718+ memcpy(&tmp_bh->bh_req, bh, sizeof (struct buffer_head));
2719+ tmp_bh->remaining = (atomic_t) ATOMIC_INIT(0);
2720+ tmp_bh->node = node;
2721+ tmp_bh->master_bh = bh;
2722+ tmp_bh->iteration = AIX_FIRST_MIRROR;
2723+ //tmp_bh->eio.rsector = eio->rsector;
2724+ //tmp_bh->eio.rsize = eio->rsize;
2725+ tmp_bh->le = le;
2726+ //tmp_bh->eio.bh = &tmp_bh->bh_req;
2727+
2728+ if (cmd == AIX_LV_READ) {
2729+ tmp_bh->bh_req.b_end_io = AIX_handle_read_mirror_drives; //setup callback routine
2730+ } else {
2731+ tmp_bh->bh_req.b_end_io = AIX_sync_mirrored_partitions; //setup callback routine
2732+ }
2733+
2734+ tmp_bh->bh_req.b_private = (void *) tmp_bh;
2735+
2736+ tmp_bh->cmd = cmd;
2737+ tmp_bh->next_r1 = NULL;
2738+ tmp_bh->node = node;
2739+
2740+ return tmp_bh;
2741+
2742+}
2743+
2744+/****************************************************
2745+* Function: AIX_reschedule_retry
2746+*
2747+* reschedule a read of one of our mirror copies
2748+*
2749+*
2750+*****************************************************/
2751+static void
2752+AIX_reschedule_retry(struct aix_mirror_bh *aix_bh)
2753+{
2754+ unsigned long flags;
2755+
2756+ spin_lock_irqsave(&AIX_retry_list_lock, flags);
2757+ if (AIX_retry_list == NULL)
2758+ AIX_retry_tail = &AIX_retry_list;
2759+ *AIX_retry_tail = aix_bh;
2760+ AIX_retry_tail = &aix_bh->next_r1;
2761+ aix_bh->next_r1 = NULL;
2762+ spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2763+ evms_cs_wakeup_thread(AIX_mirror_read_retry_thread);
2764+}
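+/*
+ * The retry queue is a head/tail-pointer FIFO: AIX_retry_tail always
+ * addresses the last node's next_r1 link (or AIX_retry_list itself when
+ * the queue is empty), so enqueueing is O(1) and AIXiod pops from the
+ * head under the same spinlock.
+ */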
2765+
2766+/****************************************************
2767+* Function: AIX_handle_read_mirror_drives
2768+*
2769+* Handles a read from a set of mirrored AIX LVs
2770+*
2771+*
2772+*
2773+*****************************************************/
2774+static void
2775+AIX_handle_read_mirror_drives(struct buffer_head *bh, int uptodate)
2776+{
2777+ struct aix_logical_volume *volume;
2778+ struct evms_logical_node *node;
2779+ struct aix_mirror_bh *tmp_bh;
2780+ kdev_t tmp_b_rdev;
2781+ u32 count, le = 0;
2782+
2783+ tmp_bh = (struct aix_mirror_bh *) bh->b_private;
2784+ tmp_b_rdev = tmp_bh->master_bh->b_rdev;
2785+ volume = (struct aix_logical_volume *) tmp_bh->node->private;
2786+ node = tmp_bh->node;
2787+ le = tmp_bh->le;
2788+
2789+ LOG_DEBUG("AHRMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n",
2790+ node, bh->b_state, uptodate, volume->mirror_copies);
2791+
2792+	// Read the PP state from the map that matches the copy we tried.
2793+	switch (tmp_bh->iteration) {
2794+	case AIX_DEFAULT_MIRRORING:
2795+		count = volume->le_to_pe_map[le].pp_state;
2796+		break;
2797+
2798+	case AIX_FIRST_MIRROR:
2799+		count = volume->le_to_pe_map_mir1[le].pp_state;
2800+		break;
2801+
2802+	case AIX_MAX_MIRRORS:
2803+		count = volume->le_to_pe_map_mir2[le].pp_state;
2804+		break;
2805+	}
2805+
2806+ if (count == (AIX_LVM_LVSTALE + AIX_LVM_LVDEFINED)) {
2807+ uptodate = 0;
2808+ count = 0;
2809+ }
2810+
2811+ if (!uptodate && tmp_bh->iteration < volume->mirror_copies) {
2812+ AIX_evms_cs_notify_lv_io_error(node);
2813+ AIX_reschedule_retry(tmp_bh);
2814+ } else {
2815+ tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate);
2816+ evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
2817+ evms_cs_volume_request_in_progress(tmp_b_rdev,
2818+ AIX_DECREMENT_REQUEST,
2819+ &count);
2820+
2821+ }
2822+
2823+ return;
2824+}
2825+
2826+/****************************************************
2827+* This is a temporary function until a common EVMS
2828+* notification function can be created.
2829+*
2830+*****************************************************/
2831+static int
2832+AIX_evms_cs_notify_lv_io_error(struct evms_logical_node *node)
2833+{
2834+ struct aix_logical_volume *volume;
2835+
2836+ volume = (struct aix_logical_volume *) node->private;
2837+
2838+ LOG_CRITICAL("Notify_ERROR !! node:%p volume->lv_status:%d volume->name:[%s]\n",
2839+ node, volume->lv_status, volume->name);
2840+
2841+ return 0;
2842+}
2843+
2844+/* Function: lvm_cleanup
2845+ *
2846+ * This function runs through the entire lvm data structure, removing
2847+ * all items that are not needed at runtime. Currently, this is just the
2848+ * vg_disk_t structure and the pv_disk_t structure for each PV. Also, any
2849+ * groups that don't contain any volumes are deleted. All of the other
2850+ * volume_group, logical_volume and evms_logical_node structures will be
2851+ * kept around at run-time.
2852+ */
2853+static int
2854+lvm_cleanup(void)
2855+{
2856+ struct aix_volume_group *group;
2857+
2858+ group = AIXVolumeGroupList;
2859+
2860+ while (group) {
2861+
2862+ if (group->AIXvgh) {
2863+ kfree(group->AIXvgh);
2864+ group->AIXvgh = NULL;
2865+ }
2866+
2867+ group = group->next;
2868+ }
2869+
2870+ return 0;
2871+}
2872+
2873+/****************************************************
2874+* Function: AIX_copy_header_info
2875+*
2876+* Copy the disk header info into the volume struct
2877+* so we can use it later.
2878+*
2879+*
2880+*
2881+*****************************************************/
2882+static int
2883+AIX_copy_header_info(struct vg_header *AIXvgh, struct vg_header *AIXvgh2)
2884+{
2885+
2886+ LOG_DEBUG("CHI AIXvgh:%p AIXvgh2:%p\n", AIXvgh, AIXvgh2);
2887+
2888+ if (AIXvgh) {
2889+
2890+ AIXvgh->vg_timestamp.tv_sec = AIXvgh2->vg_timestamp.tv_sec;
2891+ AIXvgh->vg_timestamp.tv_nsec = AIXvgh2->vg_timestamp.tv_nsec;
2892+ AIXvgh->vg_id.word1 = AIXvgh2->vg_id.word1;
2893+ AIXvgh->vg_id.word2 = AIXvgh2->vg_id.word2;
2894+ AIXvgh->vg_id.word3 = AIXvgh2->vg_id.word3;
2895+ AIXvgh->vg_id.word4 = AIXvgh2->vg_id.word4;
2896+ AIXvgh->numlvs = AIXvgh2->numlvs;
2897+ AIXvgh->maxlvs = AIXvgh2->maxlvs;
2898+ AIXvgh->pp_size = AIXvgh2->pp_size;
2899+ AIXvgh->numpvs = AIXvgh2->numpvs;
2900+ AIXvgh->total_vgdas = AIXvgh2->total_vgdas;
2901+ AIXvgh->vgda_size = AIXvgh2->vgda_size;
2902+ AIXvgh->bigvg = AIXvgh2->bigvg;
2903+ AIXvgh->quorum = AIXvgh2->quorum;
2904+ AIXvgh->auto_varyon = AIXvgh2->auto_varyon;
2905+ AIXvgh->checksum = AIXvgh2->checksum;
2906+ AIXvgh->bigda_size = AIXvgh2->bigda_size;
2907+
2908+ } else {
2909+ return -ENOMEM;
2910+ }
2911+
2912+ LOG_DEBUG("Returning CHI AIXvgh:%p AIXvgh2:%p\n", AIXvgh, AIXvgh2);
2913+
2914+ return 0;
2915+}
2916+
2917+/****************************************************
2918+* Function: AIX_free_headers
2919+*
2920+* Free the temporary VG header/trailer buffers used during discovery.
2921+*
2922+*
2923+*
2924+*****************************************************/
2925+static void
2926+AIX_free_headers(struct vg_header *AIXvgh, struct vg_header *AIXvgh2,
2927+ struct vg_trailer *AIXvgt, struct vg_trailer *AIXvgt2)
2928+{
2929+
2930+ if (AIXvgh) {
2931+ kfree(AIXvgh);
2932+ AIXvgh = NULL;
2933+ }
2934+
2935+ if (AIXvgh2) {
2936+ kfree(AIXvgh2);
2937+ AIXvgh2 = NULL;
2938+ }
2939+
2940+ if (AIXvgt) {
2941+ kfree(AIXvgt);
2942+ AIXvgt = NULL;
2943+ }
2944+
2945+ if (AIXvgt2) {
2946+ kfree(AIXvgt2);
2947+ AIXvgt2 = NULL;
2948+ }
2949+
2950+}
2951+
2952+/****************************************************
2953+* Function: AIXiod
2954+*
2955+* This is a kernel thread that handles read of mirrors
2956+* This shouldn't ever run on a non-mirrored LV read
2957+*
2958+*
2959+*****************************************************/
2960+static void
2961+AIXiod(void *data)
2962+{
2963+ struct aix_mirror_bh *r1_bh;
2964+ struct evms_logical_node *node;
2965+ unsigned long flags;
2966+
2967+ while (1) {
2968+
2969+ spin_lock_irqsave(&AIX_retry_list_lock, flags);
2970+ if (AIX_retry_list == NULL) {
2971+ spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2972+ break;
2973+ }
2974+ r1_bh = AIX_retry_list;
2975+ AIX_retry_list = r1_bh->next_r1;
2976+ spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2977+ r1_bh->next_r1 = NULL; // for mark
2978+
2979+ switch (r1_bh->cmd) {
2980+ case AIX_LV_READ:
2981+
2982+ r1_bh->iteration++;
2983+ LOG_DEBUG("Report from thread AIXiod READ\n");
2984+
2985+ if (r1_bh->iteration == AIX_FIRST_MIRROR) {
2986+ node = r1_bh->mir_node1;
2987+ r1_bh->bh_req.b_rsector = r1_bh->mir_sector1;
2988+ } else {
2989+ node = r1_bh->mir_node2;
2990+ r1_bh->bh_req.b_rsector = r1_bh->mir_sector2;
2991+ }
2992+
2993+ R_IO(node, &r1_bh->bh_req);
2994+
2995+ break;
2996+
2997+ default:
2998+ LOG_DEBUG("AIXiod unknown cmd passed to thread:%d\n",
2999+ r1_bh->cmd);
3000+ break;
3001+ }
3002+
3003+ }
3004+ return;
3005+}
3006+
3007+/****************************************************
3008+* Function: AIX_schedule_resync
3009+*
3010+* schedule a resync of one of our lv mirror copies
3011+*
3012+*
3013+*****************************************************/
3014+static void
3015+AIX_schedule_resync(struct aix_logical_volume *resync_volume, int force)
3016+{
3017+ unsigned long flags;
3018+
3019+ LOG_DEBUG("Function %s volume: %s \n", __FUNCTION__,
3020+ resync_volume->name);
3021+
3022+ spin_lock_irqsave(&AIX_resync_list_lock, flags);
3023+
3024+ if (!AIX_resync_list) {
3025+ AIX_resync_list =
3026+ kmalloc(sizeof (struct aix_resync_struct), GFP_ATOMIC);
3027+		if (!AIX_resync_list) {
3028+			spin_unlock_irqrestore(&AIX_resync_list_lock, flags);
3029+			return;
+		}
3030+ memset(AIX_resync_list, 0, sizeof (struct aix_resync_struct));
3031+ }
3032+
3033+ AIX_resync_list->resync_vol = resync_volume;
3034+ AIX_resync_list->next_resync_vol = NULL;
3035+
3036+ spin_unlock_irqrestore(&AIX_resync_list_lock, flags);
3037+ evms_cs_wakeup_thread(AIX_mirror_resync_thread);
3038+}
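+/*
+ * Note that despite the next_resync_vol link, only a single pending
+ * request is kept here: a second call before the resync thread runs
+ * simply overwrites resync_vol in the same list element.
+ */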
3039+
3040+/****************************************************
3041+* Function: AIXresync
3042+*
3043+* This is a kernel thread that handles resync of mirrors
3044+* This shouldn't ever run on a non-mirrored LV
3045+*
3046+*
3047+*****************************************************/
3048+static void
3049+AIXresync(void *data)
3050+{
3051+
3052+ struct aix_logical_volume *volume = NULL;
3053+ int force = FALSE; // Currently we don't force a resync of non-stale pe's
3054+
3055+ if (AIX_resync_list == NULL) {
3056+ LOG_ERROR("No Volumes on list to resync\n");
3057+ return;
3058+ }
3059+
3060+	volume = AIX_resync_list->resync_vol;
3061+
3062+	if (!volume) {
3063+		LOG_ERROR("Invalid volume passed to sync\n");
3064+		return;
3065+	}
3066+	LOG_DEBUG("Function %s volume: %s \n", __FUNCTION__, volume->name);
3067+
3068+ if (AIXResyncInProgress) {
3069+ LOG_ERROR("Unable to resync multiple LVs concurrently %s\n",
3070+ volume->name);
3071+ return;
3072+ }
3073+
3074+ if (volume->mirror_copies == AIX_DEFAULT_MIRRORING) {
3075+ LOG_ERROR("Unable to resync non-mirrored LV %s \n",
3076+ volume->name);
3077+ return;
3078+ }
3079+
3080+ AIXResyncInProgress = TRUE;
3081+
3082+ AIX_resync_lv_mirrors(volume, force);
3083+
3084+ return;
3085+}
3086+
3087+/****************************************************
3088+* Function: AIX_resync_lv_mirrors
3089+*
3090+*
3091+*
3092+*
3093+*
3094+*****************************************************/
3095+static int
3096+AIX_resync_lv_mirrors(struct aix_logical_volume *volume, int force)
3097+{
3098+
3099+ int i;
3100+ char pp_stale = FALSE;
3101+
3102+ struct partition_list_entry *master_part = NULL;
3103+ struct partition_list_entry *slave1_part = NULL;
3104+ struct partition_list_entry *slave2_part = NULL;
3105+
3106+ u64 master_offset = 0;
3107+ u64 slave1_offset = 0;
3108+ u64 slave2_offset = 0;
3109+
3110+ LOG_DEBUG("Function %s volume: %s \n", __FUNCTION__, volume->name);
3111+
3112+ for (i = 0; i < volume->num_le; i++, pp_stale = FALSE) {
3113+
3114+ // We need to see which mirror has a valid non-stale copy.
3115+ // The first non-stale copy will be our master and we'll
3116+ // copy to the slave(s).
3117+
3118+ if ((volume->le_to_pe_map[i].pp_state & AIX_LVM_LVSTALE)) {
3119+ pp_stale = TRUE;
3120+ }
3121+
3122+ if (volume->le_to_pe_map_mir1 != NULL) {
3123+ if ((volume->le_to_pe_map_mir1[i].
3124+ pp_state & AIX_LVM_LVSTALE)) {
3125+ pp_stale = TRUE;
3126+ }
3127+ }
3128+
3129+ if (volume->le_to_pe_map_mir2 != NULL) {
3130+ if ((volume->le_to_pe_map_mir2[i].
3131+ pp_state & AIX_LVM_LVSTALE)) {
3132+ pp_stale = TRUE;
3133+ }
3134+ }
3135+
3136+ LOG_DEBUG("Function %s pp_stale:%d force:%d \n", __FUNCTION__,
3137+ pp_stale, force);
3138+
3139+ if (pp_stale || force) {
3140+ if (!(volume->le_to_pe_map[i].pp_state & AIX_LVM_LVSTALE)) {
3141+
3142+ master_part = volume->le_to_pe_map[i].owning_pv;
3143+ master_offset = volume->le_to_pe_map[i].pe_sector_offset;
3144+
3145+ if (volume->le_to_pe_map_mir1 != NULL) {
3146+ slave1_part = volume->le_to_pe_map_mir1[i].owning_pv;
3147+ slave1_offset = volume->le_to_pe_map_mir1[i].pe_sector_offset;
3148+ }
3149+
3150+ if (volume->le_to_pe_map_mir2 != NULL) {
3151+ slave2_part = volume->le_to_pe_map_mir2[i].owning_pv;
3152+ slave2_offset = volume->le_to_pe_map_mir2[i].pe_sector_offset;
3153+ }
3154+			} else if (volume->le_to_pe_map_mir1 &&
3155+			    !(volume->le_to_pe_map_mir1[i].pp_state & AIX_LVM_LVSTALE)) {
3156+ master_part = volume->le_to_pe_map_mir1[i].owning_pv;
3157+ master_offset = volume->le_to_pe_map_mir1[i].pe_sector_offset;
3158+
3159+ if (volume->le_to_pe_map != NULL) {
3160+ slave1_part = volume->le_to_pe_map[i].owning_pv;
3161+ slave1_offset = volume->le_to_pe_map[i].pe_sector_offset;
3162+ }
3163+
3164+ if (volume->le_to_pe_map_mir2 != NULL) {
3165+ slave2_part = volume->le_to_pe_map_mir2[i].owning_pv;
3166+ slave2_offset = volume->le_to_pe_map_mir2[i].pe_sector_offset;
3167+ }
3168+			} else if (volume->le_to_pe_map_mir2 &&
3169+			    !(volume->le_to_pe_map_mir2[i].pp_state & AIX_LVM_LVSTALE)) {
3170+ master_part = volume->le_to_pe_map_mir2[i].owning_pv;
3171+ master_offset = volume->le_to_pe_map_mir2[i].pe_sector_offset;
3172+
3173+ if (volume->le_to_pe_map != NULL) {
3174+ slave1_part = volume->le_to_pe_map[i].owning_pv;
3175+ slave1_offset = volume->le_to_pe_map[i].pe_sector_offset;
3176+ }
3177+
3178+ if (volume->le_to_pe_map_mir1 != NULL) {
3179+ slave2_part = volume->le_to_pe_map_mir1[i].owning_pv;
3180+ slave2_offset = volume->le_to_pe_map_mir1[i].pe_sector_offset;
3181+ }
3182+ }
3183+
3184+ if (AIX_copy_on_read(volume, master_part, slave1_part, slave2_part,
3185+ master_offset, slave1_offset, slave2_offset,
3186+ volume->pe_size, i)) {
3187+
3188+ LOG_CRITICAL("ReSync of logical Volume %s FAILED !!\n",
3189+ volume->name);
3190+ AIX_evms_cs_notify_lv_io_error(volume->
3191+ volume_node);
3192+ break;
3193+ }
3194+
3195+ }
3196+
3197+ }
3198+
3199+ return 0;
3200+}
3201+
3202+/****************************************************
3203+* Function: AIX_copy_on_read
3204+*
3205+*
3206+*
3207+*
3208+*
3209+*****************************************************/
3210+static int
3211+AIX_copy_on_read(struct aix_logical_volume *volume,
3212+ struct partition_list_entry *master_part,
3213+ struct partition_list_entry *slave1_part,
3214+ struct partition_list_entry *slave2_part,
3215+ u64 master_offset,
3216+ u64 slave1_offset, u64 slave2_offset, u32 pe_size, int le)
3217+{
3218+ unsigned long flags;
3219+ struct aix_mirror_bh *tmp_bh = NULL;
3220+
3221+ // Check for valid partitions we need at least 2 good partitions so slave2 doesn't have to be valid
3222+
3223+ if (!master_part || !slave1_part) {
3224+ LOG_ERROR("Invalid partitions for resync master part:%p slave1_part:%p slave2_part:%p\n",
3225+ master_part, slave1_part, slave2_part);
3226+ return -EINVAL;
3227+ }
3228+
3229+	LOG_DEBUG("Function %s volume:%s master_part:%d, slave1_part:%d, slave2_part:%d master_offset:"
3230+		  PFU64 ", slave1_offset:" PFU64 " slave2_offset:" PFU64 ", \n",
3231+		  __FUNCTION__, volume->name, master_part->pv_number,
3232+		  slave1_part->pv_number,
+		  slave2_part ? slave2_part->pv_number : -1,
3233+		  master_offset, slave1_offset, slave2_offset);
3234+
3235+ LOG_DEBUG("pe_size:%d le:%d\n", pe_size, le);
3236+
3237+ tmp_bh =
3238+ AIX_alloc_sbh(volume, master_part, slave1_part, slave2_part,
3239+ master_offset, slave1_offset, slave2_offset, pe_size);
3240+
3241+	if (!tmp_bh) {
3242+		// tmp_bh is NULL here, so there is no buffer head to fail
3243+		// with buffer_IO_error(); just report the allocation error.
3244+		return -ENOMEM;
+	}
3245+
3246+/* if (evms_cs_volume_request_in_progress
3247+ (tmp_bh->bh_req.b_rdev, AIX_INCREMENT_REQUEST, &count)) {
3248+ buffer_IO_error(&tmp_bh->bh_req);
3249+ return -EIO;
3250+ } */
3251+
3252+ spin_lock_irqsave(&AIX_resync_pp_lock, flags);
3253+
3254+ LOG_DEBUG("Function:%s kicking off read node:%p\n", __FUNCTION__,
3255+ master_part->logical_node);
3256+
3257+ R_IO(master_part->logical_node, &tmp_bh->bh_req);
3258+
3259+ spin_unlock_irqrestore(&AIX_resync_pp_lock, flags);
3260+
3261+ return 0;
3262+}
3263+
3264+/****************************************************
3265+* Function: AIX_alloc_sbh
3266+*
3267+* Allocate the chain of buffer heads used to resync one block across the mirror copies
3268+*
3269+*
3270+*****************************************************/
3271+static struct aix_mirror_bh *
3272+AIX_alloc_sbh(struct aix_logical_volume *volume,
3273+ struct partition_list_entry *master_part,
3274+ struct partition_list_entry *slave1_part,
3275+ struct partition_list_entry *slave2_part,
3276+ u64 master_offset,
3277+ u64 slave1_offset, u64 slave2_offset, u32 pe_size)
3278+{
3279+ struct aix_mirror_bh *tmp_bh = NULL, *head_bh = NULL;
3280+ unsigned long flags;
3281+
3282+ LOG_DEBUG("Function:%s Enter\n", __FUNCTION__);
3283+
3284+ head_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
3285+ if (!head_bh) {
3286+ LOG_SERIOUS
3287+ ("Unable to allocate memory for mirror pool line:%d\n",
3288+ __LINE__);
3289+ return NULL;
3290+ }
3291+ // Update buffer so we block on a read/write on the normal IO path
3292+ // if we're trying to sync the same sector on the disk
3293+ // We don't want to block if it's different sectors
3294+
3295+ spin_lock_irqsave(&AIX_resync_list_lock, flags);
3296+
3297+ AIX_resync_list->master_part = master_part;
3298+ AIX_resync_list->slave1_part = slave1_part;
3299+ AIX_resync_list->slave2_part = slave2_part;
3300+ AIX_resync_list->master_offset = master_offset;
3301+ AIX_resync_list->slave1_offset = slave1_offset;
3302+ AIX_resync_list->slave2_offset = slave2_offset;
3303+
3304+ head_bh->bh_req.b_data = kmalloc(AIX_RESYNC_BLOCKSIZE + 1, GFP_NOIO);
3305+	if (!head_bh->bh_req.b_data) {
3306+		evms_cs_deallocate_to_pool(AIX_BH_list_pool, head_bh);
3307+		LOG_SERIOUS
3308+		    ("Unable to allocate memory for mirror pool line:%d\n",
3309+		     __LINE__);
3310+		spin_unlock_irqrestore(&AIX_resync_list_lock, flags);
3311+		return NULL;
+	}
3312+
3313+ memset(head_bh->bh_req.b_data, 0, AIX_RESYNC_BLOCKSIZE + 1);
3314+
3315+ head_bh->remaining = (atomic_t) ATOMIC_INIT(0);
3316+ head_bh->bh_req.b_rsector = master_offset;
3317+ head_bh->bh_req.b_size = AIX_RESYNC_BLOCKSIZE;
3318+ head_bh->sync_flag = AIX_SYNC_INCOMPLETE;
3319+ head_bh->bh_req.b_end_io = AIX_sync_mirrored_partitions;
3320+ head_bh->bh_req.b_page = virt_to_page(head_bh->bh_req.b_data);
3321+ head_bh->bh_req.b_state = 0;
3322+ set_bit(BH_Dirty, &head_bh->bh_req.b_state);
3323+ set_bit(BH_Lock, &head_bh->bh_req.b_state);
3324+ set_bit(BH_Req, &head_bh->bh_req.b_state);
3325+ set_bit(BH_Mapped, &head_bh->bh_req.b_state);
3326+ head_bh->master_bh = NULL;
3327+ head_bh->mirror_bh_list = NULL;
3328+
3329+ tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
3330+	if (!tmp_bh) {
3331+		LOG_SERIOUS
3332+		    ("Unable to allocate memory for mirror pool line:%d\n",
3333+		     __LINE__);
3334+		kfree(head_bh->bh_req.b_data);
3335+		evms_cs_deallocate_to_pool(AIX_BH_list_pool, head_bh);
+		spin_unlock_irqrestore(&AIX_resync_list_lock, flags);
+		return NULL;
+	}
3336+
3337+ head_bh->next_r1 = tmp_bh;
3338+	memcpy(&tmp_bh->bh_req, &head_bh->bh_req, sizeof (struct buffer_head));
3339+ tmp_bh->remaining = (atomic_t) ATOMIC_INIT(0);
3340+ tmp_bh->bh_req.b_end_io = NULL;
3341+
3342+ if (volume->mirror_copies == AIX_MAX_MIRRORS) {
3343+ tmp_bh->next_r1 =
3344+ evms_cs_allocate_from_pool(AIX_BH_list_pool,
3345+ EVMS_BLOCKABLE);
3346+		if (!tmp_bh->next_r1) {
3347+			LOG_SERIOUS
3348+			    ("Unable to allocate memory for mirror pool line:%d\n",
3349+			     __LINE__);
3350+			kfree(head_bh->bh_req.b_data);
3351+			evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
+			evms_cs_deallocate_to_pool(AIX_BH_list_pool, head_bh);
+			spin_unlock_irqrestore(&AIX_resync_list_lock, flags);
+			return NULL;
+		}
3352+
3353+		memcpy(&tmp_bh->next_r1->bh_req, &head_bh->bh_req,
3354+		       sizeof (struct buffer_head));
3355+ tmp_bh->next_r1->bh_req.b_end_io = NULL;
3356+ tmp_bh->next_r1->remaining = (atomic_t) ATOMIC_INIT(0);
3357+ }
3358+
3359+ init_waitqueue_head(&head_bh->bh_req.b_wait);
3360+
3361+ spin_unlock_irqrestore(&AIX_resync_list_lock, flags);
3362+
3363+ LOG_DEBUG("Function:%s Exit head_bh:%p\n", __FUNCTION__, head_bh);
3364+
3365+ return head_bh;
3366+}
3367+
3368+/****************************************************
3369+* Function: AIX_sync_mirrored_partitions
3370+*
3371+*
3372+*
3373+*
3374+*
3375+*****************************************************/
3376+static void
3377+AIX_sync_mirrored_partitions(struct buffer_head *bh, int uptodate)
3378+{
3379+ struct aix_logical_volume *volume = NULL;
3380+ struct aix_mirror_bh *tmp_bh, *head_bh;
3381+
3382+ head_bh = tmp_bh = (struct aix_mirror_bh *) bh->b_private;
3383+ volume = (struct aix_logical_volume *) tmp_bh->node->private;
3384+
3385+ LOG_DEBUG("Function:%s Enter uptodate:%d\n", __FUNCTION__, uptodate);
3386+
3387+ if (!uptodate) {
3388+
3389+ AIX_evms_cs_notify_lv_io_error(tmp_bh->node);
3390+ }
3391+
3392+ tmp_bh = head_bh->next_r1;
3393+
3394+ LOG_DEBUG("Function:%s line:%d write to mirror:%p\n", __FUNCTION__,
3395+ __LINE__, tmp_bh);
3396+
3397+ if (tmp_bh) {
3398+ W_IO(tmp_bh->node, &tmp_bh->bh_req);
3399+ AIX_get_set_mirror_offset(tmp_bh, AIX_SLAVE_1,
3400+ AIX_RESYNC_BLOCKSIZE);
3401+ }
3402+
3403+	tmp_bh = tmp_bh ? tmp_bh->next_r1 : NULL;
3404+ LOG_DEBUG("Function:%s line:%d write to mirror:%p\n", __FUNCTION__,
3405+ __LINE__, tmp_bh);
3406+
3407+ if (tmp_bh) {
3408+ W_IO(tmp_bh->node, &tmp_bh->bh_req);
3409+ AIX_get_set_mirror_offset(tmp_bh, AIX_SLAVE_2,
3410+ AIX_RESYNC_BLOCKSIZE);
3411+ }
3412+
3413+ LOG_DEBUG("Function:%s line:%d read from master:%p\n", __FUNCTION__,
3414+ __LINE__, head_bh);
3415+
3416+ if (head_bh && head_bh->sync_flag) {
3417+ AIX_get_set_mirror_offset(head_bh, AIX_MASTER,
3418+ AIX_RESYNC_BLOCKSIZE);
3419+ if (head_bh->sync_flag == AIX_SYNC_INCOMPLETE) {
3420+ R_IO(head_bh->node, &head_bh->bh_req);
3421+ }
3422+ }
3423+
3424+ LOG_DEBUG("Function:%s line:%d head_bh->sync_flag:%d\n", __FUNCTION__,
3425+ __LINE__, head_bh->sync_flag);
3426+
3427+ if (!head_bh->sync_flag) {
3428+		tmp_bh = head_bh;
3429+
3430+		while (tmp_bh != NULL) {
3431+			head_bh = tmp_bh->next_r1;
3432+			evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
3433+			tmp_bh = head_bh;
3434+		}
3435+
3436+ AIXResyncInProgress = FALSE;
3437+/* evms_cs_volume_request_in_progress(tmp_bh->bh_req.b_rdev,
3438+ AIX_DECREMENT_REQUEST,
3439+ &count); */
3440+
3441+		if (AIX_resync_list) {
3442+			kfree(AIX_resync_list);
3443+			AIX_resync_list = NULL;
+		}
3444+ }
3445+
3446+ return;
3447+}
3448+
3449+/****************************************************
3450+* Function: AIX_get_set_mirror_offset
3451+*
3452+*
3453+*
3454+*
3455+*
3456+*****************************************************/
3457+static int
3458+AIX_get_set_mirror_offset(struct aix_mirror_bh *tmp_bh, int index, int offset)
3459+{
3460+	unsigned long flags;	/* spin_lock_irqsave() needs unsigned long */
3461+
3462+ if (!tmp_bh) {
3463+ return -EINVAL;
3464+ }
3465+
3466+ LOG_DEBUG("Function:%s Enter offset:%d\n", __FUNCTION__, offset);
3467+
3468+	tmp_bh->bh_req.b_rsector += offset;
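+	// Each completed pass advances the window by `offset' (callers pass
+	// AIX_RESYNC_BLOCKSIZE), so the resync walks the mirror copies in
+	// fixed-size steps until b_rsector runs past total_vsectors, which
+	// is checked just below.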
3469+
3470+ if (tmp_bh->bh_req.b_rsector > tmp_bh->node->total_vsectors) {
3471+ tmp_bh->sync_flag = AIX_SYNC_COMPLETE;
3472+ return -EIO;
3473+ }
3474+ // Update buffer so we block on a read/write on the normal IO path
3475+ // if we're trying to sync the same sector on the disk
3476+ // We don't want to block if it's different sectors
3477+
3478+ spin_lock_irqsave(&AIX_resync_list_lock, flags);
3479+
3480+ if (AIX_resync_list->master_part->logical_node == tmp_bh->node) {
3481+ AIX_resync_list->master_offset += offset;
3482+ }
3483+
3484+ if (AIX_resync_list->slave1_part->logical_node == tmp_bh->node) {
3485+ AIX_resync_list->slave1_offset += offset;
3486+ }
3487+
3488+	if (AIX_resync_list->slave2_part &&
3489+	    AIX_resync_list->slave2_part->logical_node == tmp_bh->node) {
3490+		AIX_resync_list->slave2_offset += offset;
+	}
3491+
3492+ spin_unlock_irqrestore(&AIX_resync_list_lock, flags);
3493+
3494+ return 0;
3495+
3496+}
3497+
3498+static int AIX_pvh_data_posn(u32 vgda_psn, u32 * pvh_posn, struct partition_list_entry *partition, u32 numpvs)
3499+{
3500+ struct partition_list_entry * pv;
3501+ struct pv_header * AIXpvh;
3502+ int posn = 0;
3503+ int num_pps;
3504+ int tmp,i;
3505+
3506+ LOG_DEBUG("APDP - vgda_psn:%d numpvs:%d \n", vgda_psn, numpvs);
3507+
3508+ AIXpvh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
3509+ if (!AIXpvh) {
3510+ return -ENOMEM;
3511+ }
3512+
3513+	memset(AIXpvh, 0, AIX_SECTOR_SIZE);
3514+
3515+ // Adjust this because when AIX VGs/Volumes are created on Intel platforms, the
3516+ // pp_count could be anything since we don't give up the entire physical drive.
3517+ // This is for calculation purposes only.
3518+
3519+ pvh_posn[0] = 0;
3520+ pv = partition;
3521+
3522+ for (i = 1; i <= numpvs; i++) {
3523+		for (pv = partition; pv && pv->pv_number != i; pv = pv->next);
3524+		if (!pv) {
+			// No partition carries this pv_number; bail out rather
+			// than walking off the end of the list.
+			kfree(AIXpvh);
+			return -EINVAL;
+		}
+
3525+ LOG_DEBUG("APDP line:%d pp_count:%d \n", __LINE__, AIXpvh->pp_count);
3526+
3527+ num_pps = AIXpvh->pp_count;
3528+ num_pps++; // Account for the pv_header on the front
3529+
3530+ while ((num_pps * sizeof(struct pp_entries)) % AIX_SECTOR_SIZE) {
3531+ LOG_EXTRA("num_pps:%d \n", num_pps);
3532+ num_pps++;
3533+ }
3534+
3535+ tmp = (num_pps * sizeof(struct pp_entries)) / AIX_SECTOR_SIZE;
3536+
3537+ LOG_DEBUG("APDP tmp:%d num_pps:%d \n", tmp,num_pps);
3538+
3539+ posn = ((vgda_psn + PSN_PPH_OFFSET) + ((pv->pv_number -1) * tmp));
3540+
3541+ pvh_posn[pv->pv_number] = posn;
3542+
3543+ if (INIT_IO(pv->logical_node, 0, posn, 1, AIXpvh)) {
3544+ kfree(AIXpvh);
3545+ return -EIO;
3546+ }
3547+
3548+ pv = partition;
3549+ }
3550+
3551+ kfree(AIXpvh);
3552+
3553+ return 0;
3554+}
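+/*
+ * Worked example (hypothetical struct sizes): if a pp_entries record is
+ * 32 bytes, 16 of them fill one 512-byte sector; a PV with pp_count = 15
+ * plus its leading pv_header rounds num_pps up to 16, giving tmp = 1
+ * sector per PV slot, so PV n's header lands at
+ * vgda_psn + PSN_PPH_OFFSET + (n - 1) * 1.
+ */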
3555+
3556+/****************************************************
3557+* Function: AIX_volume_group_dump
3558+*
3559+* This is for debug purposes and will walk the volume group list
3560+* and LV's within the volume groups
3561+*
3562+* It can be called at anytime however the output to the display is large
3563+*
3564+*****************************************************/
3565+#ifdef EVMS_AIX_DEBUG
3566+static int
3567+AIX_volume_group_dump(void)
3568+{
3569+ struct aix_volume_group *AIXVGLDebugPtr;
3570+ struct partition_list_entry *DebugPartitionList;
3571+ struct aix_logical_volume *DebugLVList;
3572+ int i;
3573+
3574+ AIXVGLDebugPtr = AIXVolumeGroupList;
3575+
3576+ if (!AIXVGLDebugPtr) {
3577+ LOG_DEBUG("***********************************************\n");
3578+ LOG_DEBUG("ERROR Nothing built in the list to check !!! \n");
3579+ LOG_DEBUG("***********************************************\n");
3580+ return 0;
3581+ }
3582+
3583+ LOG_DEBUG("*********************************************** \n");
3584+ LOG_DEBUG("Begin Volume Group Dump \n");
3585+ LOG_DEBUG("*********************************************** \n");
3586+
3587+ while (AIXVGLDebugPtr) {
3588+
3589+ LOG_DEBUG("vg_number %x\n", AIXVGLDebugPtr->vg_id.word2);
3590+		LOG_DEBUG("numpartitions %d\n", AIXVGLDebugPtr->partition_count);
3591+ LOG_DEBUG("numlvs %d\n", AIXVGLDebugPtr->numlvs);
3592+ LOG_DEBUG("hard_sect_size %d\n", AIXVGLDebugPtr->hard_sect_size);
3593+ LOG_DEBUG("block_size %d\n", AIXVGLDebugPtr->block_size);
3594+ LOG_DEBUG("flags %d\n", AIXVGLDebugPtr->flags);
3595+// LOG_DEBUG("lv_max %d\n", AIXVGLDebugPtr->lv_max);
3596+ LOG_DEBUG("pe_size %d\n", AIXVGLDebugPtr->pe_size);
3597+ LOG_DEBUG("CleanVGInfo %d\n", AIXVGLDebugPtr->CleanVGInfo);
3598+
3599+ DebugPartitionList = AIXVGLDebugPtr->partition_list;
3600+
3601+ LOG_DEBUG("********* Begin Volume Partition Dump ********* \n");
3602+
3603+ if (!DebugPartitionList) {
3604+ LOG_DEBUG("No partitions to check !! \n");
3605+ }
3606+
3607+ while (DebugPartitionList) {
3608+ LOG_DEBUG("logical_node %p\n",
3609+ DebugPartitionList->logical_node);
3610+ LOG_DEBUG("pv_number %d\n",
3611+ DebugPartitionList->pv_number);
3612+ LOG_DEBUG("block_size %d\n",
3613+ DebugPartitionList->block_size);
3614+ LOG_DEBUG("hard_sect_size %d\n",
3615+ DebugPartitionList->hard_sect_size);
3616+ LOG_DEBUG("-------------------------------------------------------------\n");
3617+ DebugPartitionList = DebugPartitionList->next;
3618+ }
3619+
3620+ LOG_DEBUG("********* End Volume Partition Dump **********\n");
3621+
3622+ LOG_DEBUG("********** Begin Logical Volume Partition Dump **********\n");
3623+
3624+ DebugLVList = AIXVGLDebugPtr->volume_list[0];
3625+
3626+ if (!DebugLVList) {
3627+ LOG_DEBUG("No logical volumes to check !! \n");
3628+ }
3629+
3630+ for (i = 0; i < LVM_MAXLVS && DebugLVList; i++) {
3631+
3632+ DebugLVList = AIXVGLDebugPtr->volume_list[i];
3633+
3634+ if (DebugLVList) {
3635+ LOG_DEBUG("volume_list # %d \n", i);
3636+ LOG_DEBUG("lv_number %d \n",
3637+ DebugLVList->lv_number);
3638+ LOG_DEBUG("LV name %s \n",
3639+ DebugLVList->name);
3640+ LOG_DEBUG("lv_size " PFU64 " \n",
3641+ DebugLVList->lv_size);
3642+ LOG_DEBUG("lv_access %d \n",
3643+ DebugLVList->lv_access);
3644+ LOG_DEBUG("lv_status %d \n",
3645+ DebugLVList->lv_status);
3646+// LOG_DEBUG("lv_minor %d \n",
3647+// DebugLVList->lv_minor);
3648+ LOG_DEBUG("mirror_copies %d \n",
3649+ DebugLVList->mirror_copies);
3650+// LOG_DEBUG("mirror_number %d \n",
3651+// DebugLVList->mirror_number);
3652+ LOG_DEBUG("stripes %d \n",
3653+ DebugLVList->stripes);
3654+ LOG_DEBUG("stripe_size %d \n",
3655+ DebugLVList->stripe_size);
3656+				LOG_DEBUG("stripe_size_shift %d \n",
3657+ DebugLVList->stripe_size_shift);
3658+ LOG_DEBUG("pe_size %d \n",
3659+ DebugLVList->pe_size);
3660+ LOG_DEBUG("pe_size_shift %d \n",
3661+ DebugLVList->pe_size_shift);
3662+ LOG_DEBUG("num_le %d \n",
3663+ DebugLVList->num_le);
3664+// LOG_DEBUG("new_volume %d \n",
3665+// DebugLVList->new_volume);
3666+ LOG_DEBUG("group %p \n",
3667+ DebugLVList->group);
3668+ }
3669+
3670+ }
3671+
3672+ AIXVGLDebugPtr = AIXVGLDebugPtr->next;
3673+
3674+ LOG_DEBUG("********** End Logical Volume Partition Dump **********\n");
3675+
3676+ }
3677+
3678+ LOG_DEBUG("***********************************************\n");
3679+ LOG_DEBUG("End Volume Group Dump \n");
3680+ LOG_DEBUG("***********************************************\n");
3681+
3682+ return 0;
3683+
3684+}
3685+#endif
3686diff -Naur linux-2002-09-30/drivers/evms/Config.in evms-2002-09-30/drivers/evms/Config.in
3687--- linux-2002-09-30/drivers/evms/Config.in Wed Dec 31 18:00:00 1969
3688+++ evms-2002-09-30/drivers/evms/Config.in Mon Sep 16 15:55:24 2002
3689@@ -0,0 +1,60 @@
3690+#
3691+# Copyright (c) International Business Machines Corp., 2000
3692+#
3693+# This program is free software; you can redistribute it and/or modify
3694+# it under the terms of the GNU General Public License as published by
3695+# the Free Software Foundation; either version 2 of the License, or
3696+# (at your option) any later version.
3697+#
3698+# This program is distributed in the hope that it will be useful,
3699+# but WITHOUT ANY WARRANTY; without even the implied warranty of
3700+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
3701+# the GNU General Public License for more details.
3702+#
3703+# You should have received a copy of the GNU General Public License
3704+# along with this program; if not, write to the Free Software
3705+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
3706+#
3707+#
3708+# EVMS driver configuration
3709+#
3710+
3711+mainmenu_option next_comment
3712+comment 'Enterprise Volume Management System'
3713+
3714+tristate 'EVMS Kernel Runtime' CONFIG_EVMS
3715+dep_tristate ' EVMS Local Device Manager' CONFIG_EVMS_LOCAL_DEV_MGR $CONFIG_EVMS
3716+dep_tristate ' EVMS DOS Segment Manager' CONFIG_EVMS_DOS_SEGMENT_MGR $CONFIG_EVMS
3717+dep_tristate ' EVMS GPT Segment Manager' CONFIG_EVMS_GPT_SEGMENT_MGR $CONFIG_EVMS
3718+if [ "$CONFIG_ARCH_S390" = "y" ]; then
3719+dep_tristate ' EVMS S/390 Segment Manager' CONFIG_EVMS_S390_SEGMENT_MGR $CONFIG_EVMS
3720+fi
3721+dep_tristate ' EVMS SnapShot Feature' CONFIG_EVMS_SNAPSHOT $CONFIG_EVMS
3722+dep_tristate ' EVMS DriveLink Feature' CONFIG_EVMS_DRIVELINK $CONFIG_EVMS
3723+dep_tristate ' EVMS Bad Block Relocation (BBR) Feature' CONFIG_EVMS_BBR $CONFIG_EVMS
3724+dep_tristate ' EVMS Linux LVM Package' CONFIG_EVMS_LVM $CONFIG_EVMS
3725+dep_tristate ' EVMS Linux MD Package' CONFIG_EVMS_MD $CONFIG_EVMS
3726+dep_tristate ' EVMS MD Linear (append) mode' CONFIG_EVMS_MD_LINEAR $CONFIG_EVMS_MD
3727+dep_tristate ' EVMS MD RAID-0 (stripe) mode' CONFIG_EVMS_MD_RAID0 $CONFIG_EVMS_MD
3728+dep_tristate ' EVMS MD RAID-1 (mirroring) mode' CONFIG_EVMS_MD_RAID1 $CONFIG_EVMS_MD
3729+dep_tristate ' EVMS MD RAID-4/RAID-5 mode' CONFIG_EVMS_MD_RAID5 $CONFIG_EVMS_MD
3730+dep_tristate ' EVMS AIX LVM Package' CONFIG_EVMS_AIX $CONFIG_EVMS
3731+dep_tristate ' EVMS OS/2 LVM Package' CONFIG_EVMS_OS2 $CONFIG_EVMS
3732+#dep_tristate ' EVMS Clustering Package' CONFIG_EVMS_ECR $CONFIG_EVMS
3733+
3734+if [ "$CONFIG_EVMS" != "n" ]; then
3735+ choice ' EVMS Debug Level' \
3736+ "Critical CONFIG_EVMS_INFO_CRITICAL \
3737+ Serious CONFIG_EVMS_INFO_SERIOUS \
3738+ Error CONFIG_EVMS_INFO_ERROR \
3739+ Warning CONFIG_EVMS_INFO_WARNING \
3740+ Default CONFIG_EVMS_INFO_DEFAULT \
3741+ Details CONFIG_EVMS_INFO_DETAILS \
3742+ Debug CONFIG_EVMS_INFO_DEBUG \
3743+ Extra CONFIG_EVMS_INFO_EXTRA \
3744+ Entry_Exit CONFIG_EVMS_INFO_ENTRY_EXIT \
3745+ Everything CONFIG_EVMS_INFO_EVERYTHING" Default
3746+fi
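#
# Note: the choice group above sets exactly one CONFIG_EVMS_INFO_* symbol
# to 'y'; the EVMS Makefile (next hunk) maps it onto
# EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_<level>, with Default as the
# fallback when no other level is selected.
#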
3747+
3748+endmenu
3749+
3750diff -Naur linux-2002-09-30/drivers/evms/Makefile evms-2002-09-30/drivers/evms/Makefile
3751--- linux-2002-09-30/drivers/evms/Makefile Wed Dec 31 18:00:00 1969
3752+++ evms-2002-09-30/drivers/evms/Makefile Mon Sep 16 15:55:24 2002
3753@@ -0,0 +1,64 @@
3754+#
3755+# Makefile for the kernel EVMS driver and modules.
3756+#
3757+# 08 March 2001, Mark Peloquin <peloquin@us.ibm.com>
3758+#
3759+
3760+O_TARGET := evmsdrvr.o
3761+
3762+export-objs := evms.o evms_passthru.o ldev_mgr.o dos_part.o lvm_vge.o \
3763+ snapshot.o evms_drivelink.o evms_bbr.o AIXlvm_vge.o \
3764+ os2lvm_vge.o evms_ecr.o md_core.o md_linear.o md_raid0.o \
3765+ md_raid1.o md_raid5.o md_xor.o s390_part.o gpt_part.o
3766+
3767+# Link order is important! Plugins must come first, then the EVMS core.
3768+
3769+obj-$(CONFIG_EVMS_LOCAL_DEV_MGR) += ldev_mgr.o
3770+obj-$(CONFIG_EVMS_DOS_SEGMENT_MGR) += dos_part.o
3771+obj-$(CONFIG_EVMS_GPT_SEGMENT_MGR) += gpt_part.o
3772+obj-$(CONFIG_EVMS_S390_SEGMENT_MGR) += s390_part.o
3773+obj-$(CONFIG_EVMS_MD) += md_core.o
3774+obj-$(CONFIG_EVMS_MD_LINEAR) += md_linear.o
3775+obj-$(CONFIG_EVMS_MD_RAID0) += md_raid0.o
3776+obj-$(CONFIG_EVMS_MD_RAID1) += md_raid1.o
3777+obj-$(CONFIG_EVMS_MD_RAID5) += md_raid5.o md_xor.o
3778+obj-$(CONFIG_EVMS_LVM) += lvm_vge.o
3779+obj-$(CONFIG_EVMS_AIX) += AIXlvm_vge.o
3780+obj-$(CONFIG_EVMS_OS2) += os2lvm_vge.o
3781+obj-$(CONFIG_EVMS_DRIVELINK) += evms_drivelink.o
3782+obj-$(CONFIG_EVMS_BBR) += evms_bbr.o
3783+obj-$(CONFIG_EVMS_SNAPSHOT) += snapshot.o
3784+obj-$(CONFIG_EVMS_ECR) += evms_ecr.o
3785+obj-$(CONFIG_EVMS) += evms_passthru.o evms.o
3786+
3787+EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEFAULT
3788+ifeq ($(CONFIG_EVMS_INFO_CRITICAL),y)
3789+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_CRITICAL
3790+endif
3791+ifeq ($(CONFIG_EVMS_INFO_SERIOUS),y)
3792+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_SERIOUS
3793+endif
3794+ifeq ($(CONFIG_EVMS_INFO_ERROR),y)
3795+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ERROR
3796+endif
3797+ifeq ($(CONFIG_EVMS_INFO_WARNING),y)
3798+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_WARNING
3799+endif
3800+ifeq ($(CONFIG_EVMS_INFO_DETAILS),y)
3801+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DETAILS
3802+endif
3803+ifeq ($(CONFIG_EVMS_INFO_DEBUG),y)
3804+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEBUG
3805+endif
3806+ifeq ($(CONFIG_EVMS_INFO_EXTRA),y)
3807+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EXTRA
3808+endif
3809+ifeq ($(CONFIG_EVMS_INFO_ENTRY_EXIT),y)
3810+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ENTRY_EXIT
3811+endif
3812+ifeq ($(CONFIG_EVMS_INFO_EVERYTHING),y)
3813+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EVERYTHING
3814+endif
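#
# Each matching ifeq above overwrites EXTRA_CFLAGS outright; since the
# Config.in choice group allows only one CONFIG_EVMS_INFO_* to be 'y',
# the cascade yields a single -DEVMS_INFO_LEVEL definition, with
# EVMS_INFO_DEFAULT as the initial fallback.
#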
3815+
3816+include $(TOPDIR)/Rules.make
3817+
3818diff -Naur linux-2002-09-30/drivers/evms/dos_part.c evms-2002-09-30/drivers/evms/dos_part.c
3819--- linux-2002-09-30/drivers/evms/dos_part.c Wed Dec 31 18:00:00 1969
3820+++ evms-2002-09-30/drivers/evms/dos_part.c Fri Sep 13 16:09:55 2002
3821@@ -0,0 +1,1452 @@
3822+/* -*- linux-c -*- */
3823+/*
3824+ *
3825+ *
3826+ * Copyright (c) International Business Machines Corp., 2000
3827+ *
3828+ * This program is free software; you can redistribute it and/or modify
3829+ * it under the terms of the GNU General Public License as published by
3830+ * the Free Software Foundation; either version 2 of the License, or
3831+ * (at your option) any later version.
3832+ *
3833+ * This program is distributed in the hope that it will be useful,
3834+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
3835+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
3836+ * the GNU General Public License for more details.
3837+ *
3838+ * You should have received a copy of the GNU General Public License
3839+ * along with this program; if not, write to the Free Software
3840+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
3841+ *
3842+ *
3843+ */
3844+/*
3845+ * linux/drivers/evms/dos_part.c
3846+ *
3847+ * EVMS DOS partition manager
3848+ *
3849+ * Partial code extracted from
3850+ *
3851+ * linux/fs/partitions/msdos.c
3852+ *
3853+ */
3854+
3855+#include <linux/config.h>
3856+#include <linux/module.h>
3857+#include <linux/kernel.h>
3858+#include <linux/config.h>
3859+#include <linux/fs.h>
3860+#include <linux/genhd.h>
3861+#include <linux/string.h>
3862+#include <linux/blk.h>
3863+#include <linux/init.h>
3864+#include <linux/iobuf.h> /* for kiobuf stuffs */
3865+
3866+#ifdef CONFIG_BLK_DEV_IDE
3867+#include <linux/ide.h> /* IDE xlate */
3868+#endif /* CONFIG_BLK_DEV_IDE */
3869+
3870+#include <linux/evms/evms.h>
3871+#include <linux/evms/evms_os2.h>
3872+
3873+#include <asm/system.h>
3874+#include <asm/uaccess.h>
3875+
3876+/* prefix used in logging messages */
3877+#define LOG_PREFIX "dos_part: "
3878+
3879+/* #include "msdos.h" */
3880+#define MSDOS_LABEL_MAGIC 0xAA55
3881+#define GPT_ENTIRE_DISK_INDICATOR 0xEE
3882+#define GPT_ESP_INDICATOR 0xEF
3883+
3884+/**
3885+ * struct mbr_ebr - Skeletal MBR/EBR structure useful for our purposes
3886+ * @unused1: skip IPL record code
3887+ * @partitions: partition table
3888+ * @signature: DOS magic
3889+ *
3890+ * skeletal access to partition table in MBR/EBR
3891+ **/
3892+struct mbr_ebr {
3893+ u8 unused1[0x1be];
3894+ struct partition partitions[4];
3895+ u16 signature;
3896+};
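/* Editorial note: assuming the kernel's packed struct partition layout,
 * this overlay matches a 512-byte boot sector exactly: 0x1be (446) bytes
 * of IPL/boot code, four 16-byte partition entries at offset 446, and the
 * 2-byte 0xAA55 signature at offset 510. */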
3897+
3898+/**
3899+ * struct dos_private - Private data structure for this plugin
3900+ * @source_disk: object this IO will get remapped to
3901+ * @start_sect: source object relative starting address in 512 byte units
3902+ * @nr_sects: partition size in 512 byte units
3903+ * @type: partition type or filesystem format indicator
3904+ *
3905+ * private copy of the just the fields we require to remap IO requests
3906+ * to the underlying object.
3907+ **/
3908+struct dos_private {
3909+ struct evms_logical_node *source_disk;
3910+ u64 start_sect;
3911+ u64 nr_sects;
3912+ unsigned char type;
3913+};
3914+
3915+/**
3916+ * struct extended_part - Structure used to track progress traversing an EBR chain
3917+ * @extended: partition table in the extended boot record
3918+ * @start_sect: address of the extended boot record in 512 byte units
3919+ * @next_ebr_start: address of next ebr in the chain
3920+ * @done: progress flag
3921+ *
3922+ * struct used to track extended boot record chain traversals.
3923+ **/
3924+struct extended_part {
3925+ struct partition *extended;
3926+ u64 start_sect;
3927+ u64 next_ebr_start;
3928+ int done;
3929+};
3930+
3931+/* Global variables */
3932+static int cur_comp_part_num; /* used to track non-primary
3933+ * partition numbers
3934+ */
3935+static int exported_nodes; /* total # of exported segments
3936+ * produced during this discovery.
3937+ */
3938+
3939+/* External references */
3940+#if defined(CONFIG_BLK_DEV_MD) && defined(CONFIG_AUTODETECT_RAID)
3941+extern void md_autodetect_dev(kdev_t dev);
3942+#endif
3943+
3944+/* Prototypes */
3945+static int mbr_ebr_partition_discover(struct evms_logical_node **);
3946+static int mbr_ebr_partition_delete(struct evms_logical_node *);
3947+static void mbr_ebr_partition_read(struct evms_logical_node *,
3948+ struct buffer_head *);
3949+static void mbr_ebr_partition_write(struct evms_logical_node *,
3950+ struct buffer_head *);
3951+static int mbr_ebr_partition_ioctl(struct evms_logical_node *, struct inode *,
3952+ struct file *, unsigned int, unsigned long);
3953+static int mbr_ebr_partition_init_io(struct evms_logical_node *,
3954+ int, u64, u64, void *);
3955+
3956+static struct evms_plugin_fops fops = {
3957+ .discover = mbr_ebr_partition_discover,
3958+ .delete = mbr_ebr_partition_delete,
3959+ .read = mbr_ebr_partition_read,
3960+ .write = mbr_ebr_partition_write,
3961+ .init_io = mbr_ebr_partition_init_io,
3962+ .ioctl = mbr_ebr_partition_ioctl
3963+};
3964+
3965+#define EVMS_MSDOS_PARTITION_MANAGER_ID 1
3966+
3967+static struct evms_plugin_header plugin_header = {
3968+ .id = SetPluginID(IBM_OEM_ID,
3969+ EVMS_SEGMENT_MANAGER,
3970+ EVMS_MSDOS_PARTITION_MANAGER_ID),
3971+ .version = {
3972+ .major = 1,
3973+ .minor = 1,
3974+ .patchlevel = 1
3975+ },
3976+ .required_services_version = {
3977+ .major = 0,
3978+ .minor = 5,
3979+ .patchlevel = 0
3980+ },
3981+ .fops = &fops
3982+};
3983+
3984+/*
3985+ * Many architectures don't like unaligned accesses, which is
3986+ * frequently the case with the nr_sects and start_sect partition
3987+ * table entries.
3988+ */
3989+#include <asm/unaligned.h>
3990+
3991+#define SYS_IND(p) (get_unaligned(&p->sys_ind))
3992+#define NR_SECTS(p) (u64)({ __typeof__(p->nr_sects) __a = \
3993+ get_unaligned(&p->nr_sects); \
3994+ le32_to_cpu(__a); \
3995+ })
3996+
3997+#define START_SECT(p) (u64)({ __typeof__(p->start_sect) __a = \
3998+ get_unaligned(&p->start_sect); \
3999+ le32_to_cpu(__a); \
4000+ })
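/* Editorial note: partition entries begin at byte offset 446, so their
 * 32-bit start_sect/nr_sects fields are not naturally aligned;
 * get_unaligned() keeps the loads safe on architectures that trap on
 * unaligned access, and le32_to_cpu() converts from the on-disk
 * little-endian byte order. */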
4001+
4002+/******************************************/
4003+/* List Support - Variables, & Functions */
4004+/******************************************/
4005+
4006+/* Typedefs */
4007+
4008+struct segment_list_node {
4009+ struct evms_logical_node *segment;
4010+ struct segment_list_node *next;
4011+};
4012+
4013+struct disk_list_node {
4014+ struct evms_logical_node *disk;
4015+ struct segment_list_node *segment_list;
4016+ struct disk_list_node *next;
4017+};
4018+
4019+/* Variables */
4020+
4021+static struct disk_list_node *my_disk_list;
4022+
4023+/* Functions */
4024+
4025+static struct disk_list_node **
4026+lookup_disk(struct evms_logical_node *disk)
4027+{
4028+ struct disk_list_node **ldln;
4029+
4030+ ldln = &my_disk_list;
4031+ while (*ldln) {
4032+ if ((*ldln)->disk == disk)
4033+ break;
4034+ ldln = &(*ldln)->next;
4035+ }
4036+ return (ldln);
4037+}
4038+
4039+static struct segment_list_node **
4040+lookup_segment(struct disk_list_node *disk, struct evms_logical_node *segment)
4041+{
4042+ struct segment_list_node **lsln;
4043+
4044+ lsln = &disk->segment_list;
4045+ while (*lsln) {
4046+ if ((*lsln)->segment == segment)
4047+ break;
4048+ lsln = &(*lsln)->next;
4049+ }
4050+ return (lsln);
4051+}
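/* Editorial note: both lookup helpers return the address of the link that
 * points at the matching node (or at the terminating NULL when no match
 * exists). Callers can therefore insert with '*ldln = new_disk' or unlink
 * with '*lsln = (*lsln)->next' without re-walking the list, as
 * add_segment_to_disk() and remove_segment_from_disk() do below. */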
4052+
4053+static struct evms_logical_node *
4054+find_segment_on_disk(struct evms_logical_node *disk,
4055+ u64 start_sect, u64 nr_sects)
4056+{
4057+ struct evms_logical_node *rc = NULL;
4058+ struct disk_list_node **ldln;
4059+ struct segment_list_node **lsln;
4060+ struct dos_private *dos_prv;
4061+
4062+ ldln = lookup_disk(disk);
4063+ if (*ldln) {
4064+ /* disk found in list */
4065+ /* attempt to find segment */
4066+
4067+ lsln = &(*ldln)->segment_list;
4068+ while (*lsln) {
4069+ dos_prv = (*lsln)->segment->private;
4070+ if (dos_prv->start_sect == start_sect)
4071+ if (dos_prv->nr_sects == nr_sects)
4072+ break;
4073+ lsln = &(*lsln)->next;
4074+ }
4075+ if (*lsln)
4076+ rc = (*lsln)->segment;
4077+ }
4078+ return (rc);
4079+}
4080+
4081+/* function description: add_segment_to_disk
4082+ *
4083+ * this function attempts to add a segment to the segment
4084+ * list of a disk. if the specified disk is not found, it
4085+ * will first be added to the global disk list.
4086+ *
4087+ * return values:
4088+ *   0       - the specified segment was added to the
4089+ *             disk's segment list
4090+ *   -ENOMEM - a disk or segment list node could not
4091+ *             be allocated
4092+ *   -1      - the segment was already present in the
4093+ *             disk's segment list; in this case the
4094+ *             caller's duplicate segment can be thrown away
4095+ */
4096+static int
4097+add_segment_to_disk(struct evms_logical_node *disk,
4098+ struct evms_logical_node *segment)
4099+{
4100+ int rc = 0;
4101+ struct disk_list_node **ldln, *new_disk;
4102+ struct segment_list_node **lsln, *new_segment;
4103+
4104+ ldln = lookup_disk(disk);
4105+ if (*ldln == NULL) {
4106+ /* disk not in list, add disk */
4107+ new_disk = kmalloc(sizeof (*new_disk), GFP_KERNEL);
4108+ if (new_disk) {
4109+ memset(new_disk, 0, sizeof (*new_disk));
4110+ new_disk->disk = disk;
4111+ *ldln = new_disk;
4112+ } else {
4113+ rc = -ENOMEM;
4114+ }
4115+ }
4116+ if (!rc) {
4117+ /* attempt to add segment */
4118+ lsln = lookup_segment(*ldln, segment);
4119+ if (*lsln == NULL) {
4120+ /* segment not in list, add segment */
4121+ new_segment =
4122+ kmalloc(sizeof (*new_segment), GFP_KERNEL);
4123+ if (new_segment) {
4124+ memset(new_segment, 0, sizeof (*new_segment));
4125+ new_segment->segment = segment;
4126+ *lsln = new_segment;
4127+ } else {
4128+ rc = -ENOMEM;
4129+ }
4130+ } else
4131+ rc = -1;
4132+ }
4133+ return (rc);
4134+}
4135+
4136+static int
4137+remove_segment_from_disk(struct evms_logical_node *disk,
4138+ struct evms_logical_node *segment,
4139+ struct evms_logical_node **empty_disk)
4140+{
4141+ int rc = 0;
4142+ struct disk_list_node **ldln, *tmp_disk_node;
4143+ struct segment_list_node **lsln, *tmp_segment_node;
4144+
4145+ *empty_disk = NULL;
4146+ ldln = lookup_disk(disk);
4147+ if (*ldln == NULL) {
4148+ rc = -1;
4149+ } else {
4150+ /* disk found in list */
4151+ /* attempt to add segment */
4152+ lsln = lookup_segment(*ldln, segment);
4153+ if (*lsln == NULL) {
4154+ rc = -2;
4155+ } else {
4156+ tmp_segment_node = *lsln;
4157+ /* remove segment from list */
4158+ *lsln = (*lsln)->next;
4159+ /* free the segment list node */
4160+ kfree(tmp_segment_node);
4161+
4162+ if ((*ldln)->segment_list == NULL) {
4163+ tmp_disk_node = *ldln;
4164+ *empty_disk = tmp_disk_node->disk;
4165+ /* remove disk from list */
4166+ *ldln = (*ldln)->next;
4167+ /* free the disk list node */
4168+ kfree(tmp_disk_node);
4169+ }
4170+ }
4171+ }
4172+ return (rc);
4173+}
4174+
4175+static inline int
4176+is_extended_partition(struct partition *p)
4177+{
4178+ return (SYS_IND(p) == DOS_EXTENDED_PARTITION ||
4179+ SYS_IND(p) == WIN98_EXTENDED_PARTITION ||
4180+ SYS_IND(p) == LINUX_EXTENDED_PARTITION);
4181+}
4182+
4183+static inline u64
4184+part_start(struct partition *part, u64 ext_start, u64 ebr_start)
4185+{
4186+ u64 pstart = START_SECT(part);
4187+ pstart += (is_extended_partition(part)) ? ext_start : ebr_start;
4188+ return (pstart);
4189+}
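/* Editorial note: DOS partition-table addressing is relative. An entry
 * describing a nested extended partition is relative to the start of the
 * outermost extended partition (ext_start), while an entry describing a
 * logical drive is relative to the EBR that contains it (ebr_start);
 * part_start() folds both cases into a disk-relative LBA. */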
4190+
4191+static int
4192+validate_mbr_ebr(struct evms_logical_node *node,
4193+ struct mbr_ebr *mbr_ebr, u64 ext_start,
4194+ u64 ebr_start)
4195+{
4196+ int valid_mbr_ebr, i, j, mbr_flag;
4197+ struct partition *pi, *pj;
4198+ u64 pi_start, pi_end, pj_start, pj_end;
4199+
4200+ /* assume an MBR */
4201+ mbr_flag = TRUE;
4202+
4203+ /* assume its valid */
4204+ valid_mbr_ebr = TRUE;
4205+
4206+ /* check for valid signature */
4207+ if (mbr_ebr->signature != cpu_to_le16(MSDOS_LABEL_MAGIC)) {
4208+ LOG_DEBUG("%s: invalid signature on '%s'!\n",
4209+ __FUNCTION__, node->name);
4210+ valid_mbr_ebr = FALSE;
4211+ }
4212+
4213+ /* check for an AIX IPL signature */
4214+#define IPLRECID 0xc9c2d4c1 /* Value is EBCDIC 'IBMA' */
4215+ if (*(unsigned int *) mbr_ebr == IPLRECID) {
4216+ LOG_DEBUG("%s: found an AIX IPL signature on '%s'\n",
4217+ __FUNCTION__, node->name);
4218+ valid_mbr_ebr = FALSE;
4219+ }
4220+
4221+ /* check for boot sector fields */
4222+
4223+#if 0 //Remove checking of the first byte
4224+
4225+ /* attempt to make some initial assumptions about
4226+ * what type of data structure this could be. we
4227+ * start by checking the 1st byte. we can tell a
4228+ * few things based on what is or isn't there.
4229+ */
4230+ if (valid_mbr_ebr == TRUE)
4231+ switch (*(u_char *) mbr_ebr) {
4232+ /* check for JMP as 1st instruction
4233+ * if found, assume (for now), that
4234+ * this is a boot sector.
4235+ */
4236+ /* Removed the JMP opcode check because it's not enough to determine
4237+ * that this sector does not have a valid MBR.
4238+ * Note: To avoid going thru validation process of partition table,
4239+ * it's necessary to have a better boot sector check
4240+ * (eg. JMP opcode && other conditions) */
4241+ /*
4242+ case 0xEB:
4243+ LOG_DEBUG("%s: boot sector detected!\n", __FUNCTION__);
4244+ valid_mbr_ebr = FALSE;
4245+ */
4246+ /* let this fall thru to pick up the
4247+ * mbr_flag == FALSE.
4248+ */
4249+
4250+ /* the MBR should contain boot strap
4251+ * code, so we don't expect the 1st
4252+ * byte to be a 0x0. If the 1st byte
4253+ * IS 0x0, its assumed (for now) to
4254+ * be an EBR.
4255+ */
4256+ case 0:
4257+ mbr_flag = FALSE;
4258+ break;
4259+ }
4260+#endif //Remove checking of the first byte
4261+
4262+ if (valid_mbr_ebr == TRUE) {
4263+ /* dump the partition table entries in debug mode */
4264+ LOG_DEBUG
4265+ ("%s: disk relative starts: ext_part("PFU64"), ebr("PFU64").\n",
4266+ __FUNCTION__, ext_start, ebr_start);
4267+ for (i = 0; i < 4; i++) {
4268+ pi = &mbr_ebr->partitions[i];
4269+ LOG_DEBUG
4270+ ("%s: Partition: index(%d), start("PFU64"), size("PFU64"), sys(0x%x).\n",
4271+ __FUNCTION__, i, START_SECT(pi), NR_SECTS(pi),
4272+ SYS_IND(pi));
4273+ }
4274+
4275+ /* check for PMBR (Protected Master Boot Record)
4276+ * and skip this node if found
4277+ */
4278+ for (i = 0; i < 4; i++) {
4279+ pi = &mbr_ebr->partitions[i];
4280+
4281+ if (SYS_IND(pi) == 0xEE) {
4282+ valid_mbr_ebr = FALSE;
4283+ LOG_DETAILS
4284+ ("%s: detected PMBR on '%s', skipping.\n",
4285+ __FUNCTION__, node->name);
4286+ break;
4287+ }
4288+ }
4289+
4290+		/* check if this segment is marked as non-divisible
4291+		 * and skip it if so
4292+		 */
4293+ if (node->iflags & EVMS_TOP_SEGMENT) {
4294+ valid_mbr_ebr = FALSE;
4295+ }
4296+ }
4297+
4298+ if (valid_mbr_ebr == TRUE) {
4299+ /* check for mbr/ebr partition table validity */
4300+ for (i = 0; i < 4; i++) {
4301+ pi = &mbr_ebr->partitions[i];
4302+ if (NR_SECTS(pi)) {
4303+ /* check for partition extending past end of node */
4304+ pi_start = part_start(pi, ext_start, ebr_start);
4305+ pi_end = pi_start + NR_SECTS(pi) - 1;
4306+ if (pi_end >= node->total_vsectors) {
4307+ LOG_DEBUG
4308+ ("%s: partition(%d) ends("PFU64") beyond the end of the disk(%s,"PFU64")!\n",
4309+ __FUNCTION__, i, pi_end,
4310+ node->name, node->total_vsectors);
4311+ valid_mbr_ebr = FALSE;
4312+ }
4313+ if (valid_mbr_ebr == FALSE)
4314+ break;
4315+
4316+ /* check for partition overlap */
4317+ for (j = i + 1; j < 4; j++) {
4318+ pj = &mbr_ebr->partitions[j];
4319+ if (NR_SECTS(pj)) {
4320+ pj_start =
4321+ part_start(pj, ext_start,
4322+ ebr_start);
4323+ pj_end =
4324+ pj_start + NR_SECTS(pj) - 1;
4325+ if (pi_start == pj_start) {
4326+ valid_mbr_ebr = FALSE;
4327+ } else if (pi_start < pj_start) {
4328+ if (pi_end >= pj_start)
4329+ valid_mbr_ebr =
4330+ FALSE;
4331+ } else if (pi_start <= pj_end)
4332+ valid_mbr_ebr = FALSE;
4333+
4334+ if (valid_mbr_ebr == FALSE) {
4335+ LOG_DEBUG
4336+ ("%s: overlapping partitions(%d,%d) detected on '%s'!\n",
4337+ __FUNCTION__, i, j,
4338+ node->name);
4339+ break;
4340+ }
4341+ }
4342+ }
4343+ if (valid_mbr_ebr == FALSE)
4344+ break;
4345+ }
4346+ }
4347+ }
4348+ if (valid_mbr_ebr == TRUE) {
4349+ LOG_DEBUG("%s: valid %cBR detected on '%s'!\n", __FUNCTION__,
4350+ (mbr_flag == TRUE) ? 'M' : 'E', node->name);
4351+ } else {
4352+ LOG_DEBUG("%s: no valid MBR/EBR detected on '%s'!\n",
4353+ __FUNCTION__, node->name);
4354+ }
4355+ return (valid_mbr_ebr);
4356+}
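/* Editorial sketch, not part of the original patch: the pairwise overlap
 * test unrolled in validate_mbr_ebr() above is equivalent to the usual
 * closed-interval intersection check.
 */
static inline int lba_ranges_overlap(u64 s1, u64 e1, u64 s2, u64 e2)
{
	/* [s1,e1] and [s2,e2] (inclusive, non-empty) intersect iff each
	 * range starts no later than the other one ends */
	return (s1 <= e2) && (s2 <= e1);
}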
4357+
4358+/*
4359+ * Function: add_segment
4360+ */
4361+static int
4362+mbr_ebr_process_segment(struct evms_logical_node **discover_list,
4363+ struct evms_logical_node *node,
4364+ u64 start_sect,
4365+ u64 nr_sects,
4366+ unsigned char type, int part_num, char *partition_name)
4367+{
4368+ struct dos_private *dos_prv = NULL;
4369+ struct evms_logical_node *segment;
4370+ int rc = 0;
4371+
4372+ segment = find_segment_on_disk(node, start_sect, nr_sects);
4373+ if (segment) {
4374+ LOG_DETAILS("exporting segment '%s'.\n", segment->name);
4375+ } else {
4376+ dos_prv = kmalloc(sizeof (*dos_prv), GFP_KERNEL);
4377+ if (dos_prv) {
4378+ memset(dos_prv, 0, sizeof (*dos_prv));
4379+ dos_prv->source_disk = node;
4380+ dos_prv->start_sect = start_sect;
4381+ dos_prv->nr_sects = nr_sects;
4382+ dos_prv->type = type;
4383+ rc = evms_cs_allocate_logical_node(&segment);
4384+ } else {
4385+ rc = -ENOMEM;
4386+ }
4387+ if (!rc) {
4388+ segment->plugin = &plugin_header;
4389+ segment->system_id = (unsigned int) type;
4390+ segment->total_vsectors = nr_sects;
4391+ segment->block_size = node->block_size;
4392+ segment->hardsector_size = node->hardsector_size;
4393+ segment->private = dos_prv;
4394+ segment->flags = node->flags;
4395+ if (partition_name)
4396+ strcpy(segment->name, partition_name);
4397+ else {
4398+ strcpy(segment->name, node->name);
4399+ if (GetPluginType(node->plugin->id) ==
4400+ EVMS_SEGMENT_MANAGER) {
4401+ strcat(segment->name, ".");
4402+ }
4403+ sprintf(segment->name + strlen(segment->name),
4404+ "%d", part_num);
4405+ }
4406+			/* watch for a super-floppy-format GPT system partition
4407+			 * and don't let it be subdivided
4408+			 */
4409+ if (segment->system_id == GPT_ESP_INDICATOR) {
4410+ node->iflags |= EVMS_TOP_SEGMENT;
4411+ }
4412+ LOG_DETAILS("creating segment '%s'.\n", segment->name);
4413+ rc = add_segment_to_disk(node, segment);
4414+ if (rc) {
4415+ LOG_ERROR
4416+ ("%s: error(%d) adding segment '%s'!\n",
4417+ __FUNCTION__, rc, segment->name);
4418+ rc = 0;
4419+ } else {
4420+ MOD_INC_USE_COUNT;
4421+ }
4422+ }
4423+ if (rc) {
4424+ if (dos_prv)
4425+ kfree(dos_prv);
4426+ if (segment)
4427+ evms_cs_deallocate_logical_node(segment);
4428+ }
4429+ }
4430+ if (!rc) {
4431+ evms_cs_add_logical_node_to_list(discover_list, segment);
4432+ exported_nodes++;
4433+ }
4434+ return rc;
4435+}
4436+
4437+static void
4438+print_partition_info(char *leading_comment, struct partition *p)
4439+{
4440+ LOG_EXTRA
4441+ ("%s: boot_ind(0x%02x), sys_ind(0x%02x), startCHS(%u,%u,%u), endCHS(%u,%u,%u), startLBA("PFU64"), sizeLBA("PFU64")\n",
4442+ leading_comment, p->boot_ind, p->sys_ind, p->cyl, p->head,
4443+ p->sector, p->end_cyl, p->end_head, p->end_sector, START_SECT(p),
4444+ NR_SECTS(p));
4445+}
4446+
4447+#ifdef CONFIG_BSD_DISKLABEL
4448+#define BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET 1
4449+static void
4450+print_bsd_partition_info(char *leading_comment, struct bsd_partition *p)
4451+{
4452+ LOG_EXTRA
4453+ ("%s: p_size(%u), p_offset(%u), p_fsize(%u), p_fstype(0x%02X), p_frag(0x%02X), p_cpg(%u)\n",
4454+ leading_comment, p->p_size, p->p_offset, p->p_fsize, p->p_fstype,
4455+ p->p_frag, p->p_cpg);
4456+}
4457+
4458+/*
4459+ * bsd_disklabel_partition
4460+ *
4461+ * Return:
4462+ * - 0 for 0 partition
4463+ * - (positive) number for number of BSD partitions found
4464+ * - (negative) error code
4465+ */
4466+static int
4467+bsd_disklabel_partition(struct evms_logical_node **discover_list,
4468+ struct evms_logical_node *node, struct partition *bsd)
4469+{
4470+ struct bsd_disklabel *l;
4471+ struct bsd_partition *p;
4472+ int max_partitions;
4473+ char *data;
4474+ int rc = 0;
4475+ int count = 0;
4476+
4477+ data = kmalloc(node->hardsector_size, GFP_KERNEL);
4478+ if (data)
4479+ rc = INIT_IO(node,
4480+ 0,
4481+ START_SECT(bsd) +
4482+ BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET, 1, data);
4483+ else
4484+ rc = -ENOMEM;
4485+ if (!rc) {
4486+
4487+ l = (struct bsd_disklabel *) data;
4488+ if (l->d_magic == BSD_DISKMAGIC) {
4489+
4490+ max_partitions =
4491+ ((SYS_IND(bsd) ==
4492+ OPENBSD_PARTITION) ? OPENBSD_MAXPARTITIONS :
4493+ BSD_MAXPARTITIONS);
4494+ if (l->d_npartitions < max_partitions)
4495+ max_partitions = l->d_npartitions;
4496+ for (p = l->d_partitions;
4497+ p - l->d_partitions < max_partitions; p++) {
4498+ if (p->p_fstype != BSD_FS_UNUSED) {
4499+ evmsLOG2(EVMS_INFO_EXTRA,
4500+ (print_bsd_partition_info
4501+ (__FUNCTION__, p)));
4502+ rc = mbr_ebr_process_segment
4503+ (discover_list, node,
4504+ (u64) p->p_offset,
4505+ (u64) p->p_size, p->p_fstype,
4506+ cur_comp_part_num++, NULL);
4507+ if (rc)
4508+ break;
4509+ count++;
4510+ }
4511+ }
4512+ }
4513+ }
4514+ if (data)
4515+ kfree(data);
4516+ if (!rc)
4517+ rc = count;
4518+ LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc);
4519+ return rc;
4520+}
4521+#endif
4522+
4523+#ifdef CONFIG_UNIXWARE_DISKLABEL
4524+#define UNIXWARE_PART_TABLE_SECTOR_OFFSET 29
4525+
4526+/*
4527+ * unixware_partition
4528+ *
4529+ * Return:
4530+ * - 0 for 0 partition
4531+ * - (positive) number for number of UNIXWARE partitions found
4532+ * - (negative) error code
4533+ */
4534+static int
4535+unixware_partition(struct evms_logical_node **discover_list,
4536+ struct evms_logical_node *node,
4537+ struct partition *unixware_part)
4538+{
4539+ struct unixware_disklabel *l;
4540+ struct unixware_slice *p;
4541+ char *data = NULL;
4542+ int rc = 0;
4543+ int count = 0;
4544+
4545+ data = kmalloc(node->hardsector_size, GFP_KERNEL);
4546+ if (data)
4547+ rc = INIT_IO(node,
4548+ 0,
4549+ START_SECT(unixware_part) +
4550+ UNIXWARE_PART_TABLE_SECTOR_OFFSET, 1, data);
4551+ else
4552+ rc = -ENOMEM;
4553+ if (!rc) {
4554+ l = (struct unixware_disklabel *) data;
4555+ if (le32_to_cpu(l->d_magic) == UNIXWARE_DISKMAGIC &&
4556+ le32_to_cpu(l->vtoc.v_magic) == UNIXWARE_DISKMAGIC2) {
4557+ p = &l->vtoc.v_slice[1]; /* The 0th slice is the same as whole disk. */
4558+ while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
4559+ if (p->s_label != UNIXWARE_FS_UNUSED) {
4560+ rc = mbr_ebr_process_segment
4561+ (discover_list, node, START_SECT(p),
4562+ NR_SECTS(p), UNIXWARE_PARTITION,
4563+ cur_comp_part_num++, NULL);
4564+ if (rc)
4565+ break;
4566+ count++;
4567+ }
4568+ p++;
4569+ }
4570+ }
4571+ }
4572+ if (data)
4573+ kfree(data);
4574+ if (!rc)
4575+ rc = count;
4576+ LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc);
4577+ return rc;
4578+}
4579+#endif
4580+
4581+#ifdef CONFIG_SOLARIS_X86_PARTITION
4582+#define SOLARIS_X86_PART_TABLE_SECTOR_OFFSET 1
4583+/*
4584+ * solaris_x86_partition
4585+ *
4586+ * Return:
4587+ * - 0 for 0 partition
4588+ * - (positive) number for number of solaris partitions found
4589+ * - (negative) error code
4590+ */
4591+static int
4592+solaris_x86_partition(struct evms_logical_node **discover_list,
4593+ struct evms_logical_node *node,
4594+ struct partition *solaris_x86, int probe_only)
4595+{ /* if TRUE, do not add segments */
4596+ long offset = START_SECT(solaris_x86);
4597+ struct solaris_x86_vtoc *v;
4598+ struct solaris_x86_slice *s;
4599+ int i;
4600+ char *data = NULL;
4601+ int rc = 0;
4602+ int count = 0;
4603+
4604+ data = kmalloc(node->hardsector_size, GFP_KERNEL);
4605+ if (data)
4606+ rc = INIT_IO(node,
4607+ 0,
4608+ START_SECT(solaris_x86) +
4609+ SOLARIS_X86_PART_TABLE_SECTOR_OFFSET, 1, data);
4610+ else
4611+ rc = -ENOMEM;
4612+ if (!rc) {
4613+
4614+ v = (struct solaris_x86_vtoc *) data;
4615+
4616+ if (v->v_sanity == SOLARIS_X86_VTOC_SANE) {
4617+ if (v->v_version != 1) {
4618+ LOG_WARNING
4619+				    ("%s: cannot handle version %d vtoc\n",
4620+ __FUNCTION__, v->v_version);
4621+ } else {
4622+ for (i = 0; i < v->v_nparts; i++) {
4623+ s = &v->v_slice[i];
4624+ LOG_EXTRA
4625+ ("s[%d] s_tag(%u), s_flag(%u), s_start(%u), s_size(%u), last_sector(%u)\n",
4626+ i, s->s_tag, s->s_flag, s->s_start,
4627+ s->s_size,
4628+ s->s_start + s->s_size - 1);
4629+
4630+ if ((s->s_size == 0)
4631+ || (s->s_tag == 0x05))
4632+ continue;
4633+ if (!probe_only) {
4634+ rc = mbr_ebr_process_segment
4635+ (discover_list, node,
4636+ (u64) (s->s_start +
4637+ offset),
4638+ (u64) s->s_size,
4639+ SOLARIS_X86_PARTITION,
4640+ cur_comp_part_num++, NULL);
4641+ if (rc)
4642+ break;
4643+ }
4644+ count++;
4645+ }
4646+ }
4647+ }
4648+ }
4649+ if (data)
4650+ kfree(data);
4651+ if (!rc)
4652+ rc = count;
4653+ LOG_DETAILS("%s: %s (%d) partitions\n",
4654+		    __FUNCTION__, probe_only ? "probed" : "exported", rc);
4655+ return rc;
4656+}
4657+#endif
4658+
4659+/*
4660+ * os2lvm_partition() looks for DLAT at last sector of the track containing MBR/EBR
4661+ *
4662+ * Returns: 1 - os2 DLAT was found
4663+ * 0 otherwise
4664+ *
4665+ */
4666+static int
4667+os2lvm_partition(u64 MBR_EBR_sect,
4668+ struct evms_logical_node *node, struct dla_table_sector *dlat)
4669+{
4670+ struct hd_geometry geometry;
4671+ int rc;
4672+ u32 crc_hold;
4673+
4674+ rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO, (unsigned long) &geometry);
4675+ if (rc) {
4676+ LOG_SERIOUS("%s: ioctl failed(%u) on '%s'\n",
4677+ __FUNCTION__, rc, node->name);
4678+ } else
4679+ if (!INIT_IO(node, 0, MBR_EBR_sect + geometry.sectors - 1, 1, dlat))
4680+ {
4681+ if ((dlat->DLA_Signature1 == cpu_to_le32(DLA_TABLE_SIGNATURE1))
4682+ && (dlat->DLA_Signature2 ==
4683+ cpu_to_le32(DLA_TABLE_SIGNATURE2))) {
4684+ crc_hold = le32_to_cpu(dlat->DLA_CRC);
4685+ dlat->DLA_CRC = 0;
4686+ if (evms_cs_calculate_crc
4687+ (EVMS_INITIAL_CRC, (void *) dlat,
4688+ node->hardsector_size) == crc_hold)
4689+ return 1;
4690+ }
4691+ }
4692+ return 0;
4693+}
4694+
4695+static int
4696+mbr_ebr_process_logical_drive(struct evms_logical_node **discover_list,
4697+ struct evms_logical_node *node,
4698+ struct extended_part *ext_info,
4699+ int i,
4700+ struct partition *p,
4701+ int os2lvm, struct dla_table_sector *dlat)
4702+{
4703+ int rc = 0;
4704+ char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name;
4705+
4706+ LOG_EXTRA("%s: PartitionTableIndex(%i), Start("PFU64"), Size("PFU64")\n",
4707+ __FUNCTION__, i, START_SECT(p), NR_SECTS(p));
4708+
4709+ if (NR_SECTS(p)) {
4710+ if (is_extended_partition(p)) {
4711+ ext_info->next_ebr_start =
4712+ (u64) (START_SECT(p) +
4713+ START_SECT(ext_info->extended));
4714+ ext_info->done = FALSE; /* not done yet */
4715+ } else {
4716+ partition_name = NULL;
4717+ if (os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR &&
4718+ le32_to_cpu(dlat->DLA_Array[i].Partition_Start) ==
4719+ (ext_info->start_sect + START_SECT(p))
4720+ && le32_to_cpu(dlat->DLA_Array[i].Partition_Size) ==
4721+ NR_SECTS(p)
4722+ && dlat->DLA_Array[i].Drive_Letter != '\0') {
4723+ sprintf(tmp_buf, "os2/%c",
4724+ dlat->DLA_Array[i].Drive_Letter);
4725+ partition_name = tmp_buf;
4726+ }
4727+ evmsLOG2(EVMS_INFO_EXTRA,
4728+ (print_partition_info(__FUNCTION__, p)));
4729+
4730+ rc = mbr_ebr_process_segment(discover_list,
4731+ node,
4732+ ext_info->start_sect +
4733+ START_SECT(p), NR_SECTS(p),
4734+ p->sys_ind,
4735+ cur_comp_part_num++,
4736+ partition_name);
4737+ }
4738+ }
4739+ return (rc);
4740+}
4741+
4742+static int
4743+mbr_ebr_process_ebr(struct evms_logical_node **discover_list,
4744+ struct evms_logical_node *node,
4745+ struct extended_part *ext_info, struct mbr_ebr *ebr)
4746+{
4747+ int rc = 0, i, os2lvm;
4748+ struct partition *p;
4749+ struct dla_table_sector *dlat = NULL;
4750+
4751+ /* allocate space for the OS2 DLAT info */
4752+ dlat = kmalloc(node->hardsector_size, GFP_KERNEL);
4753+ if (dlat) {
4754+ /* read the dlat for this mbr */
4755+ os2lvm = os2lvm_partition(ext_info->start_sect, node, dlat);
4756+
4757+ /* walk thru the partition table in the mbr
4758+ * processing each partition record.
4759+ */
4760+ for (i = 0; i < 4; i++) {
4761+ p = &ebr->partitions[i];
4762+ rc = mbr_ebr_process_logical_drive(discover_list,
4763+ node,
4764+ ext_info,
4765+ i, p, os2lvm, dlat);
4766+ }
4767+ } else {
4768+ rc = -ENOMEM;
4769+ }
4770+
4771+ /* free the space used for OS2 DLAT info */
4772+ if (dlat)
4773+ kfree(dlat);
4774+
4775+ return (rc);
4776+}
4777+
4778+static int
4779+mbr_ebr_probe_for_ebr(struct evms_logical_node **discover_list,
4780+ struct evms_logical_node *node,
4781+ struct extended_part *ext_info)
4782+{
4783+ int rc = 0;
4784+ u_char *sector_buffer = NULL;
4785+ struct mbr_ebr *ebr = NULL;
4786+
4787+ /* allocate a sector size buffer */
4788+ sector_buffer = kmalloc(node->hardsector_size, GFP_KERNEL);
4789+ if (sector_buffer)
4790+ /* read the location of the mbr sector */
4791+ rc = INIT_IO(node, 0, ext_info->start_sect, 1, sector_buffer);
4792+ else
4793+ rc = -ENOMEM;
4794+
4795+ if (!rc) {
4796+ ebr = (struct mbr_ebr *) sector_buffer;
4797+ if (validate_mbr_ebr(node, ebr,
4798+ START_SECT(ext_info->extended),
4799+ ext_info->start_sect) == TRUE)
4800+ rc = mbr_ebr_process_ebr(discover_list,
4801+ node, ext_info, ebr);
4802+ }
4803+
4804+ if (sector_buffer)
4805+ kfree(sector_buffer);
4806+
4807+ return (rc);
4808+}
4809+
4810+static int
4811+mbr_ebr_process_extended_partition(struct evms_logical_node **discover_list,
4812+ struct evms_logical_node *node,
4813+ struct partition *p)
4814+{
4815+ int rc = 0;
4816+ struct extended_part ext_info;
4817+
4818+ memset(&ext_info, 0, sizeof (ext_info));
4819+ ext_info.done = FALSE;
4820+ ext_info.extended = p;
4821+ ext_info.next_ebr_start = START_SECT(p);
4822+ while (ext_info.done == FALSE) {
4823+ ext_info.done = TRUE; /* assume done, unless we find another EBR */
4824+ ext_info.start_sect = ext_info.next_ebr_start;
4825+ rc = mbr_ebr_probe_for_ebr(discover_list, node, &ext_info);
4826+ }
4827+ return rc;
4828+}
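/* Editorial note: logical drives are stored as a singly linked chain of
 * EBRs. Each EBR typically carries one logical-drive entry plus one link
 * entry pointing at the next EBR; the loop above assumes it is done, then
 * keeps probing as long as mbr_ebr_process_logical_drive() finds another
 * link entry and clears ext_info.done. */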
4829+
4830+/*
4831+ * is_non_dos_extended
4832+ *
4833+ * This function returns TRUE if the partition entry represents a non-DOS
4834+ * extended partition such as UnixWare, Solaris x86 and BSD
4835+ */
4836+static int
4837+is_non_dos_extended(struct evms_logical_node **discover_list,
4838+ struct evms_logical_node *node, struct partition *p)
4839+{
4840+ if (NR_SECTS(p)) {
4841+#ifdef CONFIG_BSD_DISKLABEL
4842+ if (SYS_IND(p) == BSD_PARTITION ||
4843+ SYS_IND(p) == NETBSD_PARTITION ||
4844+ SYS_IND(p) == OPENBSD_PARTITION)
4845+ return TRUE;
4846+#endif
4847+
4848+#ifdef CONFIG_UNIXWARE_DISKLABEL
4849+ if (SYS_IND(p) == UNIXWARE_PARTITION)
4850+ return TRUE;
4851+#endif
4852+
4853+#ifdef CONFIG_SOLARIS_X86_PARTITION
4854+ if ((SYS_IND(p) == SOLARIS_X86_PARTITION) &&
4855+ (solaris_x86_partition(discover_list, node, p, TRUE) > 0))
4856+ return TRUE;
4857+#endif
4858+ }
4859+ return (FALSE);
4860+}
4861+
4862+/*
4863+ * mbr_ebr_process_other_primary_partition
4864+ * This function processes other (non-DOS) primary partitions such as
4865+ * UnixWare, Solaris x86 and BSD
4866+ */
4867+static int
4868+mbr_ebr_process_other_primary_partition(struct evms_logical_node
4869+ **discover_list,
4870+ struct evms_logical_node *node,
4871+ struct partition *p)
4872+{
4873+ if (NR_SECTS(p)) {
4874+#ifdef CONFIG_BSD_DISKLABEL
4875+ if (SYS_IND(p) == BSD_PARTITION ||
4876+ SYS_IND(p) == NETBSD_PARTITION ||
4877+ SYS_IND(p) == OPENBSD_PARTITION)
4878+ return bsd_disklabel_partition(discover_list, node, p);
4879+#endif
4880+
4881+#ifdef CONFIG_UNIXWARE_DISKLABEL
4882+ if (SYS_IND(p) == UNIXWARE_PARTITION)
4883+ return unixware_partition(discover_list, node, p);
4884+#endif
4885+
4886+#ifdef CONFIG_SOLARIS_X86_PARTITION
4887+ if (SYS_IND(p) == SOLARIS_X86_PARTITION)
4888+ return solaris_x86_partition(discover_list, node, p,
4889+ FALSE);
4890+#endif
4891+ }
4892+ return (0);
4893+}
4894+
4895+static int
4896+mbr_ebr_process_dos_primary_partition(struct evms_logical_node **discover_list,
4897+ struct evms_logical_node *node,
4898+ int i,
4899+ struct partition *p,
4900+ int os2lvm, struct dla_table_sector *dlat)
4901+{
4902+ int rc = 0;
4903+ char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name;
4904+
4905+ LOG_EVERYTHING("%s: PartitionTableIndex(%i), Start("PFU64"), Size("PFU64")\n",
4906+ __FUNCTION__, i, START_SECT(p), NR_SECTS(p));
4907+
4908+ if (NR_SECTS(p)) {
4909+
4910+ if (is_extended_partition(p))
4911+ rc = mbr_ebr_process_extended_partition(discover_list,
4912+ node, p);
4913+
4914+ else {
4915+ partition_name = NULL;
4916+ if (os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR &&
4917+ le32_to_cpu(dlat->DLA_Array[i].Partition_Start) ==
4918+ START_SECT(p)
4919+ && le32_to_cpu(dlat->DLA_Array[i].Partition_Size) ==
4920+ NR_SECTS(p)
4921+ && dlat->DLA_Array[i].Drive_Letter != '\0') {
4922+ sprintf(tmp_buf, "os2/%c",
4923+ dlat->DLA_Array[i].Drive_Letter);
4924+ partition_name = tmp_buf;
4925+ }
4926+ evmsLOG2(EVMS_INFO_EXTRA,
4927+ (print_partition_info(__FUNCTION__, p)));
4928+
4929+ rc = mbr_ebr_process_segment(discover_list,
4930+ node,
4931+ START_SECT(p),
4932+ NR_SECTS(p),
4933+ p->sys_ind,
4934+ i + 1, partition_name);
4935+ }
4936+ }
4937+ return (rc);
4938+}
4939+
4940+static int
4941+mbr_ebr_process_mbr(struct evms_logical_node **discover_list,
4942+ struct evms_logical_node *node, struct mbr_ebr *mbr)
4943+{
4944+ int rc = 0, i, os2lvm;
4945+ struct partition *p;
4946+ struct dla_table_sector *dlat = NULL;
4947+
4948+ cur_comp_part_num = 5; /* set this value for each disk */
4949+
4950+ /* allocate space for the OS2 DLAT info */
4951+ dlat = kmalloc(node->hardsector_size, GFP_KERNEL);
4952+ if (dlat) {
4953+ /* read the dlat for this mbr */
4954+ os2lvm = os2lvm_partition(0, node, dlat);
4955+
4956+ /* Pass 1: walk thru the partition table in the mbr
4957+ * processing each partition record.
4958+ */
4959+ for (i = 0; i < 4; i++) {
4960+ p = &mbr->partitions[i];
4961+ if (is_non_dos_extended(discover_list, node, p)) {
4962+ LOG_DETAILS
4963+				    (" Found and skipped a non-DOS extended partition.\n");
4964+ continue;
4965+ }
4966+
4967+ mbr_ebr_process_dos_primary_partition(discover_list,
4968+ node,
4969+ i,
4970+ p, os2lvm, dlat);
4971+ }
4972+
4973+ /* Pass 2: walk thru the partition table in the mbr
4974+ * processing each partition record for non-DOS extended partitions
4975+ */
4976+ for (i = 0; i < 4; i++) {
4977+ p = &mbr->partitions[i];
4978+ mbr_ebr_process_other_primary_partition(discover_list,
4979+ node, p);
4980+ }
4981+
4982+ } else {
4983+ rc = -ENOMEM;
4984+ }
4985+
4986+ /* free the space used for OS2 DLAT info */
4987+ if (dlat)
4988+ kfree(dlat);
4989+
4990+ return (rc);
4991+}
4992+
4993+static int
4994+mbr_ebr_probe_for_mbr(struct evms_logical_node **discover_list,
4995+ struct evms_logical_node *node)
4996+{
4997+ int rc = 0;
4998+ u_char *sector_buffer = NULL;
4999+ struct mbr_ebr *mbr = NULL;
5000+
5001+ LOG_DEBUG("%s: probing (%s).\n", __FUNCTION__, node->name);
5002+
5003+ /* allocate a sector size buffer */
5004+ sector_buffer = kmalloc(node->hardsector_size, GFP_KERNEL);
5005+ if (sector_buffer)
5006+ /* read the location of the mbr sector */
5007+ rc = INIT_IO(node, 0, 0, 1, sector_buffer);
5008+ else
5009+ rc = -ENOMEM;
5010+ if (rc) {
5011+ LOG_ERROR("%s: read error(%d) on '%s'.\n",
5012+ __FUNCTION__, rc, node->name);
5013+ } else {
5014+ mbr = (struct mbr_ebr *) sector_buffer;
5015+ if (validate_mbr_ebr(node, mbr, 0, 0) == TRUE) {
5016+ /* since it looks like this disk has a
5017+ * valid MBR, remove the disk node from
5018+ * the discover list. it may already be
5019+ * on the global list, or it will be
5020+ * added to it. in the case of an mbr
5021+ * with no partitions, it is simply
5022+ * removed and forgotten. when one or
5023+ * more partitions are created, the
5024+ * disk will be examined and handled
5025+ * properly during the following
5026+ * rediscover operation.
5027+ */
5028+ evms_cs_remove_logical_node_from_list(discover_list,
5029+ node);
5030+
5031+ rc = mbr_ebr_process_mbr(discover_list, node, mbr);
5032+ }
5033+ }
5034+
5035+ if (sector_buffer)
5036+ kfree(sector_buffer);
5037+
5038+ return (rc);
5039+}
5040+
5041+/*
5042+ * Function: mbr_ebr_partition_discover
5043+ *
5044+ */
5045+static int
5046+mbr_ebr_partition_discover(struct evms_logical_node **discover_list)
5047+{
5048+ int rc = 0;
5049+ struct evms_logical_node *node, *next_node;
5050+
5051+ MOD_INC_USE_COUNT;
5052+ LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
5053+
5054+ /* initialize global variable */
5055+ exported_nodes = 0;
5056+
5057+ /* examine each node on the discover list */
5058+ next_node = *discover_list;
5059+ while (next_node) {
5060+ node = next_node;
5061+ next_node = node->next;
5062+ if (node->plugin->id == plugin_header.id)
5063+ /* don't recurse into our own objects
5064+ */
5065+ continue;
5066+ mbr_ebr_probe_for_mbr(discover_list, node);
5067+ }
5068+
5069+ LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
5070+ __FUNCTION__, exported_nodes, rc);
5071+ if (exported_nodes)
5072+ rc = exported_nodes;
5073+ MOD_DEC_USE_COUNT;
5074+ return (rc);
5075+}
5076+
5077+/*
5078+ * Function: mbr_ebr_partition_delete
5079+ *
5080+ */
5081+static int
5082+mbr_ebr_partition_delete(struct evms_logical_node *segment)
5083+{
5084+ int rc = 0;
5085+ struct dos_private *dos_prv;
5086+ struct evms_logical_node *empty_disk = NULL;
5087+
5088+	if (!segment) {
5089+		rc = -ENODEV;
5090+	} else {
5091+		/* log only after the NULL check; segment is dereferenced here */
5092+		LOG_DETAILS("deleting segment '%s'.\n", segment->name);
5093+ dos_prv = segment->private;
5094+ if (dos_prv) {
5095+ /* remove the segment from the
5096+ * disk's segment list
5097+ */
5098+ rc = remove_segment_from_disk(dos_prv->source_disk,
5099+ segment, &empty_disk);
5100+ /* free the local instance data */
5101+ kfree(dos_prv);
5102+ }
5103+ /* free the segment node */
5104+ evms_cs_deallocate_logical_node(segment);
5105+ MOD_DEC_USE_COUNT;
5106+ /* if the last segment on the disk was
5107+ * deleted, delete the disk node too
5108+ */
5109+ if (empty_disk)
5110+ DELETE(empty_disk);
5111+ }
5112+ return (rc);
5113+}
5114+
5115+/*
5116+ * function: mbr_ebr_partition_io_error
5117+ *
5118+ * this function was primarily created because the function
5119+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
5120+ * to be set on inline functions. Since this was an error path
5121+ * and not mainline, I decided to add a trace statement to help
5122+ * report on the failing condition.
5123+ *
5124+ */
5125+static void
5126+mbr_ebr_partition_io_error(struct evms_logical_node *node,
5127+ int io_flag, struct buffer_head *bh)
5128+{
5129+ LOG_SERIOUS
5130+ ("attempt to %s beyond partition boundary("PFU64") on (%s), rsector("PFU64").\n",
5131+ (io_flag) ? "WRITE" : "READ", node->total_vsectors - 1, node->name,
5132+ (u64) bh->b_rsector);
5133+
5134+ bh->b_end_io(bh, 0);
5135+}
5136+
5137+/*
5138+ * Function: mbr_ebr_partition_read
5139+ *
5140+ */
5141+static void
5142+mbr_ebr_partition_read(struct evms_logical_node *partition,
5143+ struct buffer_head *bh)
5144+{
5145+ struct dos_private *dos_prv = partition->private;
5146+
5147+ if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
5148+ partition->total_vsectors) {
5149+ bh->b_rsector += dos_prv->start_sect;
5150+ R_IO(dos_prv->source_disk, bh);
5151+ } else
5152+ mbr_ebr_partition_io_error(partition, READ, bh);
5153+}
5154+
5155+/*
5156+ * Function: mbr_ebr_partition_write
5157+ *
5158+ */
5159+static void
5160+mbr_ebr_partition_write(struct evms_logical_node *partition,
5161+ struct buffer_head *bh)
5162+{
5163+ struct dos_private *dos_prv = partition->private;
5164+
5165+ if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
5166+ partition->total_vsectors) {
5167+ bh->b_rsector += dos_prv->start_sect;
5168+ W_IO(dos_prv->source_disk, bh);
5169+ } else
5170+ mbr_ebr_partition_io_error(partition, WRITE, bh);
5171+}
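/* Editorial note: both I/O paths above remap in place after a bounds
 * check: a request at partition-relative sector N is forwarded to the
 * underlying disk at sector N + dos_prv->start_sect, while requests that
 * would run past total_vsectors are failed via
 * mbr_ebr_partition_io_error(). */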
5172+
5173+/*
5174+ * Function: mbr_ebr_partition_init_io
5175+ *
5176+ */
5177+static int
5178+mbr_ebr_partition_init_io(struct evms_logical_node *partition, int io_flag, /* 0=read, 1=write */
5179+ u64 sect_nr, /* disk LBA */
5180+ u64 num_sects, /* # of sectors */
5181+ void *buf_addr)
5182+{ /* buffer address */
5183+ int rc;
5184+ struct dos_private *dos_prv = partition->private;
5185+
5186+ if ((sect_nr + num_sects) <= partition->total_vsectors) {
5187+ rc = INIT_IO(dos_prv->source_disk, io_flag,
5188+ sect_nr + dos_prv->start_sect, num_sects,
5189+ buf_addr);
5190+ } else {
5191+ LOG_SERIOUS
5192+ ("init_io: attempt to %s beyond partition(%s) boundary("PFU64") at sector("PFU64") for count("PFU64").\n",
5193+ (io_flag) ? "WRITE" : "READ", partition->name,
5194+ (dos_prv->nr_sects - 1), sect_nr, num_sects);
5195+ rc = -EINVAL;
5196+ }
5197+
5198+ return (rc);
5199+}
5200+
5201+/*
5202+ * Function: mbr_ebr_partition_ioctl
5203+ *
5204+ */
5205+static int
5206+mbr_ebr_partition_ioctl(struct evms_logical_node *partition,
5207+ struct inode *inode,
5208+ struct file *file, unsigned int cmd, unsigned long arg)
5209+{
5210+ struct dos_private *dos_prv;
5211+ struct hd_geometry hd_geo;
5212+ int rc;
5213+
5214+ rc = 0;
5215+ dos_prv = partition->private;
5216+ if (!inode)
5217+ return -EINVAL;
5218+ switch (cmd) {
5219+ case HDIO_GETGEO:
5220+ {
5221+ rc = IOCTL(dos_prv->source_disk, inode, file, cmd, arg);
5222+ if (rc)
5223+ break;
5224+ if (copy_from_user
5225+ (&hd_geo, (void *) arg,
5226+ sizeof (struct hd_geometry)))
5227+ rc = -EFAULT;
5228+ if (rc)
5229+ break;
5230+ hd_geo.start = dos_prv->start_sect;
5231+ if (copy_to_user
5232+ ((void *) arg, &hd_geo,
5233+ sizeof (struct hd_geometry)))
5234+ rc = -EFAULT;
5235+ }
5236+ break;
5237+ case EVMS_GET_BMAP:
5238+ {
5239+ struct evms_get_bmap_pkt *bmap =
5240+ (struct evms_get_bmap_pkt *) arg;
5241+ bmap->rsector += dos_prv->start_sect;
5242+ /* intentionally fall thru to
5243+ * default ioctl down to device
5244+ * manager.
5245+ */
5246+ }
5247+ default:
5248+ rc = IOCTL(dos_prv->source_disk, inode, file, cmd, arg);
5249+ }
5250+ return rc;
5251+}
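/* Editorial note: for HDIO_GETGEO the geometry is obtained from the
 * underlying disk, but hd_geo.start is then overwritten with the
 * partition's own offset; EVMS_GET_BMAP adjusts rsector by the same
 * offset and deliberately falls through to the default forwarding case. */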
5252+
5253+/*
5254+ * Function: dos_part_init
5255+ *
5256+ */
5257+static int __init
5258+dos_part_init(void)
5259+{
5260+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
5261+}
5262+
5263+static void __exit
5264+dos_part_exit(void)
5265+{
5266+ evms_cs_unregister_plugin(&plugin_header);
5267+}
5268+
5269+module_init(dos_part_init);
5270+module_exit(dos_part_exit);
5271+#ifdef MODULE_LICENSE
5272+MODULE_LICENSE("GPL");
5273+#endif
5274diff -Naur linux-2002-09-30/drivers/evms/evms.c evms-2002-09-30/drivers/evms/evms.c
5275--- linux-2002-09-30/drivers/evms/evms.c Wed Dec 31 18:00:00 1969
5276+++ evms-2002-09-30/drivers/evms/evms.c Thu Sep 26 11:55:45 2002
5277@@ -0,0 +1,5865 @@
5278+/* -*- linux-c -*- */
5279+/*
5280+ *
5281+ *
5282+ * Copyright (c) International Business Machines Corp., 2000
5283+ *
5284+ * This program is free software; you can redistribute it and/or modify
5285+ * it under the terms of the GNU General Public License as published by
5286+ * the Free Software Foundation; either version 2 of the License, or
5287+ * (at your option) any later version.
5288+ *
5289+ * This program is distributed in the hope that it will be useful,
5290+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
5291+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
5292+ * the GNU General Public License for more details.
5293+ *
5294+ * You should have received a copy of the GNU General Public License
5295+ * along with this program; if not, write to the Free Software
5296+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
5297+ *
5298+ *
5299+ */
5300+/*
5301+ *
5302+ * linux/drivers/evms/evms.c
5303+ *
5304+ * EVMS Base and Common Services
5305+ *
5306+ */
5307+
5308+#define DEVICE_NR(device) MINOR(device) /* evms has no partition bits */
5309+#define DEVICE_NAME "evms" /* name for messaging */
5310+#define DEVICE_NO_RANDOM /* no entropy to contribute */
5311+#define DEVICE_OFF(d) /* do nothing */
5312+
5313+//#define LOCAL_DEBUG 1
5314+
5315+#include <linux/config.h>
5316+#include <linux/module.h>
5317+#include <linux/errno.h>
5318+#include <linux/kernel.h>
5319+#include <linux/init.h>
5320+#include <linux/fs.h>
5321+#include <linux/slab.h>
5322+#include <asm/uaccess.h>
5323+#include <linux/blk.h> /* must be included by all block drivers */
5324+#include <linux/blkdev.h>
5325+#include <linux/blkpg.h>
5326+#include <linux/iobuf.h>
5327+#include <linux/genhd.h>
5328+#include <linux/sched.h>
5329+#include <linux/completion.h>
5330+#include <linux/version.h>
5331+#include <linux/swap.h>
5332+#include <net/checksum.h>
5333+#include <linux/sysctl.h>
5334+#include <linux/smp_lock.h>
5335+#include <linux/reboot.h>
5336+#include <linux/compiler.h>
5337+#include <linux/evms/evms.h>
5338+
5339+//#define VFS_PATCH_PRESENT
5340+
5341+/* prefix used in logging messages */
5342+#define LOG_PREFIX
5343+
5344+struct evms_registered_plugin {
5345+ struct evms_plugin_header *plugin;
5346+ struct evms_registered_plugin *next;
5347+};
5348+static struct evms_registered_plugin *registered_plugin_head = NULL;
5349+
5350+static struct evms_list_node *evms_global_device_list = NULL;
5351+static struct evms_list_node *evms_global_feature_node_list = NULL;
5352+static struct evms_list_node *evms_global_notify_list = NULL;
5353+
5354+int evms_info_level = EVMS_INFO_LEVEL;
5355+struct proc_dir_entry *evms_proc_dir = NULL;
5356+EXPORT_SYMBOL(evms_info_level);
5357+static struct evms_logical_volume *evms_logical_volumes;
5358+static int evms_volumes = 0;
5359+/* a few variables to aid in detecting memory leaks.
5360+ * these variables are always in use, regardless of
5361+ * the state of EVMS_MEM_DEBUG.
5362+ */
5363+static atomic_t evms_allocs = (atomic_t) ATOMIC_INIT(0);
5364+static atomic_t evms_logical_nodes = (atomic_t) ATOMIC_INIT(0);
5365+
5366+u8 *evms_primary_string = "primary";
5367+EXPORT_SYMBOL(evms_primary_string);
5368+u8 *evms_secondary_string = "secondary";
5369+EXPORT_SYMBOL(evms_secondary_string);
5370+
5371+static struct evms_version evms_svc_version = {
5372+ .major = EVMS_COMMON_SERVICES_MAJOR,
5373+ .minor = EVMS_COMMON_SERVICES_MINOR,
5374+ .patchlevel = EVMS_COMMON_SERVICES_PATCHLEVEL
5375+};
5376+
5377+/* Handles for "private" EVMS object pools */
5378+static struct evms_pool_mgmt *evms_io_notify_pool;
5379+
5380+/* Handles for "public" EVMS object pools */
5381+struct evms_pool_mgmt *evms_bh_pool;
5382+EXPORT_SYMBOL(evms_bh_pool);
5383+
5384+/* Handle for the devfs directory entry */
5385+devfs_handle_t evms_dir_devfs_handle;
5386+devfs_handle_t evms_blk_devfs_handle;
5387+
5388+/**********************************************************/
5389+/* SYSCTL - EVMS folder */
5390+/**********************************************************/
5391+
5392+#ifdef CONFIG_PROC_FS
5393+static struct ctl_table_header *evms_table_header;
5394+static int evms_info_level_min = EVMS_INFO_CRITICAL;
5395+static int evms_info_level_max = EVMS_INFO_EVERYTHING;
5396+
5397+static ctl_table evms_table[] = {
5398+ {DEV_EVMS_INFO_LEVEL, "evms_info_level",
5399+ &evms_info_level, sizeof (int), 0644, NULL,
5400+ &proc_dointvec_minmax, &sysctl_intvec,
5401+ NULL, &evms_info_level_min, &evms_info_level_max},
5402+ {0}
5403+};
5404+
5405+static ctl_table evms_dir_table[] = {
5406+ {DEV_EVMS, "evms", NULL, 0, 0555, evms_table},
5407+ {0}
5408+};
5409+
5410+static ctl_table dev_dir_table[] = {
5411+ {CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table},
5412+ {0}
5413+};
5414+#endif
5415+
5416+/**********************************************************/
5417+/* START -- arch ioctl32 support */
5418+/**********************************************************/
5419+#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64)
5420+#include <linux/evms/evms_bbr_k.h>
5421+#include <linux/raid/md.h>
5422+
5423+extern asmlinkage long
5424+sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg);
5425+
5426+extern int
5427+register_ioctl32_conversion(unsigned int cmd, void *handler);
5428+
5429+extern int
5430+unregister_ioctl32_conversion(unsigned int cmd);
5431+
5432+#define uvirt_to_kernel(__x) ((unsigned long)(__x))
5433+typedef unsigned int __uvirt_addr;
5434+
5435+struct evms_sector_io32 {
5436+ u64 disk_handle;
5437+ s32 io_flag;
5438+ u64 starting_sector;
5439+ u64 sector_count;
5440+ __uvirt_addr buffer_address;
5441+ s32 status;
5442+};
5443+
5444+struct evms_rediscover32 {
5445+ s32 status;
5446+ u32 drive_count;
5447+ __uvirt_addr drive_array;
5448+};
5449+
5450+struct evms_compute_csum32 {
5451+ __uvirt_addr buffer_address;
5452+ s32 buffer_size;
5453+ u32 insum;
5454+ u32 outsum;
5455+ s32 status;
5456+};
5457+
5458+struct evms_plugin_ioctl32 {
5459+ u32 feature_id;
5460+ s32 feature_command;
5461+ s32 status;
5462+ __uvirt_addr feature_ioctl_data;
5463+};
5464+
5465+struct evms_notify_bbr32 {
5466+ char object_name[EVMS_VOLUME_NAME_SIZE+1];
5467+ u64 count;
5468+ u64 start_sect;
5469+ u64 nr_sect;
5470+ __uvirt_addr buffer;
5471+ s32 rw;
5472+};
5473+
5474+#define EVMS_MD_ID 4
5475+#define EVMS_MD_PERS_IOCTL_CMD 1
5476+#define EVMS_MD_ADD 2
5477+#define EVMS_MD_REMOVE 3
5478+#define EVMS_MD_ACTIVATE 4
5479+#define EVMS_MD_DEACTIVATE 5
5480+#define EVMS_MD_GET_ARRAY_INFO 6
5481+#define EVMS_MD_RAID5_INIT_IO 1
5482+
5483+struct evms_md_ioctl {
5484+ int mddev_idx;
5485+ int cmd;
5486+ void *arg;
5487+};
5488+
5489+struct evms_md_ioctl32 {
5490+ u32 mddev_idx;
5491+ u32 cmd;
5492+ __uvirt_addr arg;
5493+};
5494+
5495+struct evms_md_array_info {
5496+ unsigned long state;
5497+ mdp_super_t *sb;
5498+};
5499+
5500+struct evms_md_array_info32 {
5501+ u32 state;
5502+ __uvirt_addr sb;
5503+};
5504+
5505+struct raid5_ioctl_init_io {
5506+ int rw;
5507+ u64 lsn;
5508+ u64 nr_sects;
5509+ void *data;
5510+};
5511+
5512+struct raid5_ioctl_init_io32 {
5513+ s32 rw;
5514+ u64 lsn;
5515+ u64 nr_sects;
5516+ __uvirt_addr data;
5517+};
5518+
5519+#define EVMS_MD_PLUGIN_ID ((IBM_OEM_ID << 16) | \
5520+ (EVMS_REGION_MANAGER << 12) | EVMS_MD_ID)
5521+#define EVMS_BBR_PLUGIN_ID ((IBM_OEM_ID << 16) | \
5522+ (EVMS_FEATURE << 12) | EVMS_BBR_FEATURE_ID)
5523+
5524+#define EVMS_SECTOR_IO_32 _IOWR(EVMS_MAJOR, \
5525+ EVMS_SECTOR_IO_NUMBER, \
5526+ struct evms_sector_io32)
5527+#define EVMS_REDISCOVER_VOLUMES_32 _IOWR(EVMS_MAJOR, \
5528+ EVMS_REDISCOVER_VOLUMES_NUMBER, \
5529+ struct evms_rediscover32)
5530+#define EVMS_COMPUTE_CSUM_32 _IOWR(EVMS_MAJOR, \
5531+ EVMS_COMPUTE_CSUM_NUMBER, \
5532+ struct evms_compute_csum32)
5533+#define EVMS_PLUGIN_IOCTL_32 _IOR(EVMS_MAJOR, \
5534+ EVMS_PLUGIN_IOCTL_NUMBER, \
5535+ struct evms_plugin_ioctl32)
5536+
5537+static int evms_sector_io(unsigned int fd,
5538+ unsigned int cmd,
5539+ unsigned long arg)
5540+{
5541+ mm_segment_t old_fs = get_fs();
5542+ struct evms_sector_io32 parms32;
5543+ struct evms_sector_io_pkt parms;
5544+ unsigned int kcmd;
5545+ void *karg;
5546+ int rc = 0;
5547+
5548+ if (copy_from_user(&parms32, (struct evms_sector_io32 *)arg,
5549+ sizeof(struct evms_sector_io32)))
5550+ return -EFAULT;
5551+
5552+ parms.disk_handle = parms32.disk_handle;
5553+ parms.io_flag = parms32.io_flag;
5554+ parms.starting_sector = parms32.starting_sector;
5555+ parms.sector_count = parms32.sector_count;
5556+ parms.buffer_address = (u8 *)uvirt_to_kernel(parms32.buffer_address);
5557+ parms.status = 0;
5558+
5559+ kcmd = EVMS_SECTOR_IO;
5560+ karg = &parms;
5561+
5562+ set_fs(KERNEL_DS);
5563+ rc = sys_ioctl(fd, kcmd, (unsigned long)karg);
5564+ set_fs(old_fs);
5565+
5566+ parms32.status = parms.status;
5567+
5568+ if (copy_to_user((struct evms_sector_io32 *)arg, &parms32,
5569+ sizeof(struct evms_sector_io32)))
5570+ return -EFAULT;
5571+
5572+ return rc;
5573+}
5574+
5575+static int evms_rediscover(unsigned int fd,
5576+ unsigned int cmd,
5577+ unsigned long arg)
5578+{
5579+ mm_segment_t old_fs = get_fs();
5580+ struct evms_rediscover32 parms32;
5581+ struct evms_rediscover_pkt parms;
5582+ unsigned int kcmd;
5583+ void *karg;
5584+ int rc = 0;
5585+
5586+ if (copy_from_user(&parms32, (struct evms_rediscover32 *)arg,
5587+ sizeof(struct evms_rediscover32)))
5588+ return -EFAULT;
5589+
5590+ parms.drive_count = parms32.drive_count;
5591+ parms.drive_array = (void *)uvirt_to_kernel(parms32.drive_array);
5592+ parms.status = 0;
5593+
5594+ kcmd = EVMS_REDISCOVER_VOLUMES;
5595+ karg = &parms;
5596+
5597+ set_fs(KERNEL_DS);
5598+ rc = sys_ioctl(fd, kcmd, (unsigned long)karg);
5599+ set_fs(old_fs);
5600+
5601+ parms32.status = parms.status;
5602+
5603+ if (copy_to_user((struct evms_rediscover32 *)arg, &parms32,
5604+ sizeof(struct evms_rediscover32)))
5605+ return -EFAULT;
5606+
5607+ return rc;
5608+}
5609+
5610+static int evms_compute_csum(unsigned int fd,
5611+ unsigned int cmd,
5612+ unsigned long arg)
5613+{
5614+ mm_segment_t old_fs = get_fs();
5615+ struct evms_compute_csum32 parms32;
5616+ struct evms_compute_csum_pkt parms;
5617+ unsigned int kcmd;
5618+ void *karg;
5619+ int rc = 0;
5620+
5621+ if (copy_from_user(&parms32, (struct evms_compute_csum32 *)arg,
5622+ sizeof(struct evms_compute_csum32)))
5623+ return -EFAULT;
5624+
5625+ parms.insum = parms32.insum;
5626+ parms.outsum = parms32.outsum;
5627+ parms.buffer_size = parms32.buffer_size;
5628+ parms.buffer_address = (void *)uvirt_to_kernel(parms32.buffer_address);
5629+ parms.status = 0;
5630+
5631+ kcmd = EVMS_COMPUTE_CSUM;
5632+ karg = &parms;
5633+
5634+ set_fs(KERNEL_DS);
5635+ rc = sys_ioctl(fd, kcmd, (unsigned long)karg);
5636+ set_fs(old_fs);
5637+
5638+ parms32.status = parms.status;
5639+ parms32.outsum = parms.outsum;
5640+
5641+ if (copy_to_user((struct evms_compute_csum32 *)arg, &parms32,
5642+ sizeof(struct evms_compute_csum32)))
5643+ return -EFAULT;
5644+
5645+ return rc;
5646+}
5647+
5648+static int evms_bbr_plugin_ioctl(unsigned int fd,
5649+ unsigned int cmd,
5650+ unsigned long arg)
5651+{
5652+ mm_segment_t old_fs = get_fs();
5653+ struct evms_notify_bbr32 bbr_parms32;
5654+ struct evms_notify_bbr bbr_parms;
5655+ struct evms_plugin_ioctl_pkt *parms =
5656+ (struct evms_plugin_ioctl_pkt *)arg;
5657+ void *old_ptr = NULL;
5658+ int rc;
5659+
5660+ if (copy_from_user(&bbr_parms32,
5661+ (struct evms_notify_bbr32 *)parms->feature_ioctl_data,
5662+ sizeof(struct evms_notify_bbr32)))
5663+ return -EFAULT;
5664+
5665+ memcpy(&bbr_parms, &bbr_parms32, sizeof(struct evms_notify_bbr32));
5666+ bbr_parms.buffer = (void *)uvirt_to_kernel(bbr_parms32.buffer);
5667+ bbr_parms.rw = bbr_parms32.rw;
5668+ old_ptr = parms->feature_ioctl_data;
5669+ parms->feature_ioctl_data = &bbr_parms;
5670+
5671+ set_fs(KERNEL_DS);
5672+ rc = sys_ioctl(fd, cmd, arg);
5673+ set_fs(old_fs);
5674+
5675+ parms->feature_ioctl_data = old_ptr;
5676+
5677+ if (!rc) {
5678+ bbr_parms32.nr_sect = bbr_parms.nr_sect;
5679+ rc = copy_to_user((struct evms_notify_bbr32 *)parms->feature_ioctl_data,
5680+ &bbr_parms32,
5681+ sizeof(struct evms_notify_bbr32));
5682+ }
5683+
5684+ return rc;
5685+}
5686+
5687+static int evms_md_plugin_ioctl(unsigned int fd,
5688+ unsigned int cmd,
5689+ unsigned long arg)
5690+{
5691+ mm_segment_t old_fs = get_fs();
5692+ void *old_ptr = NULL;
5693+ void *old_md_ptr = NULL;
5694+ struct evms_md_ioctl32 md_parms32;
5695+ struct evms_md_ioctl md_parms;
5696+ struct evms_md_array_info32 md_array_parms32;
5697+ struct evms_md_array_info md_array_parms;
5698+ struct raid5_ioctl_init_io32 r5_init_io_parms32;
5699+ struct raid5_ioctl_init_io r5_init_io_parms;
5700+ struct evms_plugin_ioctl_pkt *parms =
5701+ (struct evms_plugin_ioctl_pkt *)arg;
5702+ int rc;
5703+
5704+ if (copy_from_user(&md_parms32,
5705+ (struct evms_md_ioctl*)parms->feature_ioctl_data,
5706+ sizeof(struct evms_md_ioctl32)))
5707+ return -EFAULT;
5708+
5709+ md_parms.mddev_idx = md_parms32.mddev_idx;
5710+ md_parms.cmd = md_parms32.cmd;
5711+ md_parms.arg = (void *)uvirt_to_kernel(md_parms32.arg);
5712+ old_ptr = parms->feature_ioctl_data;
5713+ parms->feature_ioctl_data = &md_parms;
5714+
5715+ if (parms->feature_command == EVMS_MD_GET_ARRAY_INFO) {
5716+ if (copy_from_user(&md_array_parms32,
5717+ (struct evms_md_array_info32*)md_parms.arg,
5718+ sizeof(struct evms_md_array_info32)))
5719+ return -EFAULT;
5720+
5721+ md_array_parms.state = md_array_parms32.state;
5722+ md_array_parms.sb =
5723+ (void *)uvirt_to_kernel(md_array_parms32.sb);
5724+ old_md_ptr = (void *)md_parms.arg;
5725+ md_parms.arg = &md_array_parms;
5726+ } else if (parms->feature_command == EVMS_MD_PERS_IOCTL_CMD) {
5727+ if (md_parms.cmd == EVMS_MD_RAID5_INIT_IO) {
5728+ if (copy_from_user(&r5_init_io_parms32,
5729+ (struct raid5_ioctl_init_io32*)md_parms.arg,
5730+ sizeof(struct raid5_ioctl_init_io32)))
5731+ return -EFAULT;
5732+
5733+ r5_init_io_parms.rw = r5_init_io_parms32.rw;
5734+ r5_init_io_parms.lsn = r5_init_io_parms32.lsn;
5735+ r5_init_io_parms.nr_sects = r5_init_io_parms32.nr_sects;
5736+ r5_init_io_parms.data =
5737+ (void *)uvirt_to_kernel(r5_init_io_parms32.data);
5738+ old_md_ptr = (void *)md_parms.arg;
5739+ md_parms.arg = &r5_init_io_parms;
5740+ }
5741+ }
5742+
5743+ set_fs(KERNEL_DS);
5744+ rc = sys_ioctl(fd, cmd, arg);
5745+ set_fs(old_fs);
5746+
5747+ parms->feature_ioctl_data = old_ptr;
5748+ md_parms.arg = old_md_ptr;
5749+
5750+ if (!rc) {
5751+ if (parms->feature_command == EVMS_MD_GET_ARRAY_INFO) {
5752+ md_array_parms32.state = md_array_parms.state;
5753+ rc = copy_to_user((struct evms_md_array_info32 *)md_parms.arg,
5754+ &md_array_parms32,
5755+ sizeof(struct evms_md_array_info32));
5756+ }
5757+ if (!rc) {
5758+ md_parms32.mddev_idx = md_parms.mddev_idx;
5759+ rc = copy_to_user((struct evms_md_ioctl*)parms->feature_ioctl_data,
5760+ &md_parms32,
5761+ sizeof(struct evms_md_ioctl32));
5762+ }
5763+ }
5764+
5765+ return rc;
5766+}
5767+
5768+static int evms_plugin_ioctl(unsigned int fd,
5769+ unsigned int cmd,
5770+ unsigned long arg)
5771+{
5772+ mm_segment_t old_fs = get_fs();
5773+ struct evms_plugin_ioctl32 parms32;
5774+ struct evms_plugin_ioctl_pkt parms;
5775+ unsigned int kcmd;
5776+ void *karg;
5777+ int rc;
5778+
5779+ if (copy_from_user(&parms32, (struct evms_plugin_ioctl32 *)arg,
5780+ sizeof(struct evms_plugin_ioctl32)))
5781+ return -EFAULT;
5782+
5783+ parms.feature_id = parms32.feature_id;
5784+ parms.feature_command = parms32.feature_command;
5785+ parms.status = parms32.status;
5786+ parms.feature_ioctl_data =
5787+ (void *)uvirt_to_kernel(parms32.feature_ioctl_data);
5788+
5789+ kcmd = EVMS_PLUGIN_IOCTL;
5790+ karg = &parms;
5791+
5792+ switch (parms.feature_id) {
5793+ case EVMS_MD_PLUGIN_ID:
5794+ rc = evms_md_plugin_ioctl(fd, kcmd, (unsigned long)karg);
5795+ break;
5796+ case EVMS_BBR_PLUGIN_ID:
5797+ rc = evms_bbr_plugin_ioctl(fd, kcmd, (unsigned long)karg);
5798+ break;
5799+ default:
5800+ set_fs(KERNEL_DS);
5801+ rc = sys_ioctl(fd, kcmd, (unsigned long)karg);
5802+ set_fs(old_fs);
5803+ }
5804+
5805+ if (!rc) {
5806+ parms32.status = parms.status;
5807+ rc = copy_to_user((struct evms_plugin_ioctl32 *)arg, &parms32,
5808+ sizeof(struct evms_plugin_ioctl32));
5809+ }
5810+
5811+ return rc;
5812+}
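+
+/* A minimal sketch (not part of this patch, compiled out) of how the
+ * 32-bit conversion handlers above would plausibly be wired up on
+ * PPC64/SPARC64, using the register_ioctl32_conversion() extern
+ * declared earlier. The function name evms_register_ioctl32() is
+ * hypothetical.
+ */
+#if 0
+static void evms_register_ioctl32(void)
+{
+ register_ioctl32_conversion(EVMS_SECTOR_IO_32, evms_sector_io);
+ register_ioctl32_conversion(EVMS_REDISCOVER_VOLUMES_32, evms_rediscover);
+ register_ioctl32_conversion(EVMS_COMPUTE_CSUM_32, evms_compute_csum);
+ register_ioctl32_conversion(EVMS_PLUGIN_IOCTL_32, evms_plugin_ioctl);
+}
+#endif
+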
5813+#endif
5814+
5815+/**********************************************************/
5816+/* START -- exported functions/Common Services */
5817+/**********************************************************/
5818+
5819+/*
5820+ * Function: evms_cs_get_version
5821+ * Description: This function returns the current EVMS version
5822+ */
5823+void
5824+evms_cs_get_version(int *major, int *minor)
5825+{
5826+ *major = EVMS_MAJOR_VERSION;
5827+ *minor = EVMS_MINOR_VERSION;
5828+}
5829+
5830+EXPORT_SYMBOL(evms_cs_get_version);
5831+
5832+int
5833+evms_cs_check_version(struct evms_version *required,
5834+ struct evms_version *actual)
5835+{
5836+ if (required->major != actual->major)
5837+ return -EINVAL;
5838+ else if (required->minor > actual->minor)
5839+ return -EINVAL;
5840+ else if (required->minor == actual->minor)
5841+ if (required->patchlevel > actual->patchlevel)
5842+ return -EINVAL;
5843+ return 0;
5844+}
5845+
5846+EXPORT_SYMBOL(evms_cs_check_version);
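+
+/* Worked example of the check above (comment only): a plugin that
+ * requires services 5.0.0 loads against actual services 5.1.2 (a newer
+ * minor is compatible), but fails against 4.9.9 (major mismatch) and
+ * against actual 5.0.0 when it requires 5.0.1 (patchlevel too old at
+ * an equal minor).
+ */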
5847+
5848+int
5849+evms_cs_allocate_logical_node(struct evms_logical_node **pp)
5850+{
5851+ *pp = kmalloc(sizeof (struct evms_logical_node), GFP_KERNEL);
5852+ if (*pp) {
5853+ memset(*pp, 0, sizeof (struct evms_logical_node));
5854+ atomic_inc(&evms_logical_nodes);
5855+ return 0;
5856+ }
5857+ return -ENOMEM;
5858+}
5859+
5860+EXPORT_SYMBOL(evms_cs_allocate_logical_node);
5861+
5862+void
5863+evms_cs_deallocate_volume_info(struct evms_logical_node *p)
5864+{
5865+ if (p->iflags & EVMS_FEATURE_BOTTOM) {
5866+ evms_cs_remove_item_from_list(&evms_global_feature_node_list,
5867+ p);
5868+ kfree(p->volume_info);
5869+ p->volume_info = NULL;
5870+ p->iflags &= ~EVMS_FEATURE_BOTTOM;
5871+ }
5872+}
5873+
5874+EXPORT_SYMBOL(evms_cs_deallocate_volume_info);
5875+
5876+void
5877+evms_cs_deallocate_logical_node(struct evms_logical_node *p)
5878+{
5879+ if (p->next) {
5880+ LOG_SERIOUS
5881+ ("Deallocating object whose NEXT ptr is not null!!\n");
5882+ }
5883+ evms_cs_deallocate_volume_info(p);
5884+ if (p->feature_header) {
5885+ kfree(p->feature_header);
5886+ p->feature_header = NULL;
5887+ }
5888+ kfree(p);
5889+ atomic_dec(&evms_logical_nodes);
5890+}
5891+
5892+EXPORT_SYMBOL(evms_cs_deallocate_logical_node);
5893+
5894+/*
5895+ * Function: evms_cs_register_plugin
5896+ * Description: This function is exported so that all plugins can register with EVMS
5897+ */
5898+int
5899+evms_cs_register_plugin(struct evms_plugin_header *plugin)
5900+{
5901+ int rc = 0;
5902+ struct evms_registered_plugin *reg_record, **pp;
5903+ struct evms_version *ver;
5904+
5905+ ver = &plugin->required_services_version;
5906+
5907+ LOG_EXTRA
5908+ ("registering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n",
5909+ GetPluginOEM(plugin->id), GetPluginType(plugin->id),
5910+ GetPluginID(plugin->id), plugin->version.major,
5911+ plugin->version.minor, plugin->version.patchlevel, ver->major,
5912+ ver->minor, ver->patchlevel);
5913+
5914+ /* check common services requirements */
5915+ rc = evms_cs_check_version(ver, &evms_svc_version);
5916+ if (rc) {
5917+ LOG_SERIOUS
5918+ ("plugin failed to load: common services (vers:%d,%d,%d) incompatibility!\n",
5919+ EVMS_COMMON_SERVICES_MAJOR, EVMS_COMMON_SERVICES_MINOR,
5920+ EVMS_COMMON_SERVICES_PATCHLEVEL);
5921+ }
5922+ if (!rc) {
5923+ /* ensure a plugin with this feature id is
5924+ * not already loaded.
5925+ */
5926+ for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next) {
5927+ if ((*pp)->plugin->id == plugin->id) {
5928+ rc = -EBUSY;
5929+ LOG_ERROR
5930+ ("error(%d) attempting to load another plugin with id(%x).\n",
5931+ rc, plugin->id);
5932+ }
5933+ }
5934+ }
5935+ if (!rc) {
5936+ /* ensure the plugin has provided functions for
5937+ * the mandatory entry points.
5938+ */
5939+ if (!plugin->fops->discover) {
5940+ rc = -EINVAL;
5941+ } else if (!plugin->fops->init_io) {
5942+ rc = -EINVAL;
5943+ } else if (!plugin->fops->ioctl) {
5944+ rc = -EINVAL;
5945+ } else if (!plugin->fops->read) {
5946+ rc = -EINVAL;
5947+ } else if (!plugin->fops->write) {
5948+ rc = -EINVAL;
5949+ } else if (!plugin->fops->delete) {
5950+ rc = -EINVAL;
5951+ }
5952+ }
5953+ if (!rc) {
5954+ /* allocate a new plugin registration record */
5955+ reg_record =
5956+ kmalloc(sizeof (struct evms_registered_plugin), GFP_KERNEL);
5957+ if (!reg_record) {
5958+ rc = -ENOMEM;
5959+ }
5960+ }
5961+ if (!rc) {
5962+ memset(reg_record, 0, sizeof (struct evms_registered_plugin));
5963+ /* store ptr to plugin header in new registration record */
5964+ reg_record->plugin = plugin;
5965+
5966+ /* terminate the record */
5967+ reg_record->next = NULL;
5968+
5969+ /* find end of the plugin registration list */
5970+ for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next) ;
5971+ /* add registration record to list */
5972+ *pp = reg_record;
5973+
5974+ /* increment the usage count */
5975+ MOD_INC_USE_COUNT;
5976+ }
5977+
5978+ return (rc);
5979+}
5980+
5981+EXPORT_SYMBOL(evms_cs_register_plugin);
5982+
5983+/*
5984+ * Function: evms_cs_unregister_plugin
5985+ * Description: This function is exported so that all plugins can
5986+ * unregister with EVMS
5987+ */
5988+int
5989+evms_cs_unregister_plugin(struct evms_plugin_header *plugin)
5990+{
5991+ int rc = 0, found = FALSE;
5992+ struct evms_registered_plugin **pp;
5993+ struct evms_version *ver;
5994+
5995+ ver = &plugin->required_services_version;
5996+
5997+ LOG_EXTRA
5998+ ("unregistering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n",
5999+ GetPluginOEM(plugin->id), GetPluginType(plugin->id),
6000+ GetPluginID(plugin->id), plugin->version.major,
6001+ plugin->version.minor, plugin->version.patchlevel, ver->major,
6002+ ver->minor, ver->patchlevel);
6003+ /* ensure a plugin with this feature id is
6004+ * currently loaded.
6005+ */
6006+ for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next) {
6007+ if ((*pp)->plugin->id == plugin->id) {
6008+ found = TRUE;
6009+ break;
6010+ }
6011+ }
6012+ if (!found) {
6013+ rc = -ENOPKG;
6014+ LOG_ERROR
6015+ ("error(%d) attempt to unload a non-loaded plugin with id(%x).\n",
6016+ rc, plugin->id);
6017+ }
6018+ /* actually unload the plugin now */
6019+ if (!rc) {
6020+ struct evms_registered_plugin *tmp = *pp;
6021+
6022+ /* remove the plugin record from our
6023+ * internal plugin list
6024+ */
6025+ *pp = (*pp)->next;
6026+ /* deallocate the plugin registration record
6027+ */
6028+ kfree(tmp);
6029+
6030+ /* decrement the usage count */
6031+ MOD_DEC_USE_COUNT;
6032+ }
6033+ return (rc);
6034+}
6035+
6036+EXPORT_SYMBOL(evms_cs_unregister_plugin);
6037+
6038+/* function: evms_cs_add_logical_node_to_list
6039+ *
6040+ * This function adds a new logical node to the end of a
6041+ * node list.
6042+ *
6043+ * NOTE: This function is only expected to be called at
6044+ * discovery time, which is single-threaded by nature,
6045+ * and therefore doesn't need to be made SMP safe.
6046+ */
6047+int
6048+evms_cs_add_logical_node_to_list(struct evms_logical_node **list_head,
6049+ struct evms_logical_node *node)
6050+{
6051+ int rc = 0;
6052+ struct evms_logical_node **pp = NULL;
6053+
6054+ /* check to make sure node is not already on a list */
6055+ if (node->next)
6056+ rc = 1;
6057+ else
6058+ /* check to make sure node being added is not already in the list */
6059+ for (pp = list_head; *pp; pp = &(*pp)->next)
6060+ if (*pp == node) {
6061+ rc = 2;
6062+ break;
6063+ }
6064+
6065+ /* add node to the end of the list */
6066+ if (!rc)
6067+ *pp = node;
6068+
6069+ return (rc);
6070+}
6071+
6072+EXPORT_SYMBOL(evms_cs_add_logical_node_to_list);
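+
+/* Usage sketch (illustrative only, compiled out): a discovery routine
+ * collecting nodes onto a private list; a non-zero return means the
+ * node was already on some list. example_collect() is hypothetical.
+ */
+#if 0
+static void example_collect(struct evms_logical_node **my_list,
+ struct evms_logical_node *node)
+{
+ if (evms_cs_add_logical_node_to_list(my_list, node))
+ LOG_DEBUG("node %p was already on a list.\n", node);
+}
+#endif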
6073+
6074+/* function: evms_cs_remove_logical_node_from_list
6075+ *
6076+ * This function removes a logical node from a node list.
6077+ *
6078+ * NOTE: This function is only expected to be called at
6079+ * discovery time, which is single-threaded by nature,
6080+ * and therefore doesn't need to be made SMP safe.
6081+ */
6082+int
6083+evms_cs_remove_logical_node_from_list(struct evms_logical_node **list_head,
6084+ struct evms_logical_node *node)
6085+{
6086+ /* remove this node from the list */
6087+ int rc = 1; /* assume failure until target node is found */
6088+ struct evms_logical_node **pp;
6089+ for (pp = list_head; *pp; pp = &(*pp)->next)
6090+ if (*pp == node) {
6091+ *pp = (*pp)->next;
6092+ node->next = NULL;
6093+ rc = 0;
6094+ break;
6095+ }
6096+ return (rc);
6097+}
6098+
6099+EXPORT_SYMBOL(evms_cs_remove_logical_node_from_list);
6100+
6101+int
6102+evms_cs_kernel_ioctl(struct evms_logical_node *node, unsigned int cmd,
6103+ unsigned long arg)
6104+{
6105+ int rc = 0;
6106+ struct inode tmp_inode;
6107+ mm_segment_t fs;
6108+
6109+ lock_kernel();
6110+ fs = get_fs();
6111+ set_fs(get_ds());
6112+ rc = IOCTL(node, &tmp_inode, NULL, cmd, arg);
6113+ set_fs(fs);
6114+ unlock_kernel();
6115+
6116+ return (rc);
6117+
6118+}
6119+
6120+EXPORT_SYMBOL(evms_cs_kernel_ioctl);
6121+
6122+/*
6123+ * function: evms_cs_size_in_vsectors
6124+ *
6125+ * In EVMS a V(irtual)Sector is 512 bytes in size.
6126+ * This function computes the number of VSECTORs a specified
6127+ * item size would require.
6128+ *
6129+ * NOTE: This function has been coded to work with 64 bit values.
6130+ */
6131+unsigned long
6132+evms_cs_size_in_vsectors(long long item_size)
6133+{
6134+ long long sectors;
6135+
6136+ sectors = item_size >> EVMS_VSECTOR_SIZE_SHIFT;
6137+ if (item_size & (EVMS_VSECTOR_SIZE - 1))
6138+ sectors++;
6139+
6140+ return (sectors);
6141+}
6142+
6143+EXPORT_SYMBOL(evms_cs_size_in_vsectors);
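+
+/* Worked example (comment only): a 1536-byte item needs exactly 3
+ * vsectors (1536 >> 9 == 3, no remainder), while a 1537-byte item
+ * rounds up to 4.
+ */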
6144+
6145+/*
6146+ * function: evms_cs_log2
6147+ *
6148+ * this function computes the power of 2 of the specified
6149+ * value. If the value is 0, a -1 is returned. If the value
6150+ * is NOT a power of 2, a -2 is returned. Otherwise the power
6151+ * of 2 is returned.
6152+ */
6153+int
6154+evms_cs_log2(long long value)
6155+{
6156+ int result = -1;
6157+ long long tmp;
6158+
6159+ if (value) {
6160+ tmp = value;
6161+ result++;
6162+ while (!(tmp & 1)) {
6163+ result++;
6164+ tmp >>= 1;
6165+ }
6166+ if (tmp != 1) {
6167+ result = -2;
6168+ }
6169+ }
6170+ return (result);
6171+}
6172+
6173+EXPORT_SYMBOL(evms_cs_log2);
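+
+/* Worked examples (comment only): evms_cs_log2(4096) == 12,
+ * evms_cs_log2(0) == -1, and evms_cs_log2(12) == -2 because 12 is
+ * not a power of 2.
+ */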
6174+
6175+/*
6176+ * Functions:
6177+ *
6178+ * build_crc_table()
6179+ * calculate_crc()
6180+ *
6181+ *
6182+ * Description: The functions in this module provide a means of calculating
6183+ * the 32 bit CRC for a block of data. build_crc_table must
6184+ * be called to initialize this module. calculate_crc must
6185+ * NOT be used until after build_crc_table has been called.
6186+ * Once build_crc_table has been called, calculate_crc can
6187+ * be used to calculate the crc of the data residing in a
6188+ * user specified buffer.
6189+ *
6190+ */
6191+
6192+#define CRC_POLYNOMIAL 0xEDB88320L
6193+
6194+static u32 crc_table[256];
6195+static u32 crc_table_built = FALSE;
6196+
6197+/*********************************************************************/
6198+/* */
6199+/* Function Name: build_crc_table */
6200+/* */
6201+/* Descriptive Name: This module implements the crc function using */
6202+/* a table driven method. The required table */
6203+/* must be setup before the calculate_crc */
6204+/* function can be used. This table only needs */
6205+/* to be set up once. This function sets up the */
6206+/* crc table needed by calculate_crc. */
6207+/* */
6208+/* Input: None */
6209+/* */
6210+/* Output: None */
6211+/* */
6212+/* Error Handling: N/A */
6213+/* */
6214+/* Side Effects: The internal crc table is initialized. */
6215+/* */
6216+/* Notes: None. */
6217+/* */
6218+/*********************************************************************/
6219+static void
6220+build_crc_table(void)
6221+{
6222+ u32 i, j, crc;
6223+
6224+ for (i = 0; i <= 255; i++) {
6225+ crc = i;
6226+ for (j = 8; j > 0; j--) {
6227+ if (crc & 1)
6228+ crc = (crc >> 1) ^ CRC_POLYNOMIAL;
6229+ else
6230+ crc >>= 1;
6231+ }
6232+ crc_table[i] = crc;
6233+ }
6234+ crc_table_built = TRUE;
6235+}
6236+
6237+/*********************************************************************/
6238+/* */
6239+/* Function Name: calculate_crc */
6240+/* */
6241+/* Descriptive Name: This function calculates the crc value for */
6242+/* the data in the buffer specified by Buffer. */
6243+/* */
6244+/* Input: u32 crc : This is the starting crc. If you are */
6245+/* starting a new crc calculation, then */
6246+/* this should be set to 0xFFFFFFFF. If */
6247+/* you are continuing a crc calculation */
6248+/* (i.e. all of the data did not fit in */
6249+/* the buffer so you could not calculate */
6250+/* the crc in a single operation), then */
6251+/* this is the crc output by the last */
6252+/* calculate_crc call. */
6253+/* */
6254+/* Output: The crc for the data in the buffer, based upon the value*/
6255+/* of the input parameter crc. */
6256+/* */
6257+/* Error Handling: None. */
6258+/* */
6259+/* Side Effects: None. */
6260+/* */
6261+/* Notes: None. */
6262+/* */
6263+/*********************************************************************/
6264+u32
6265+evms_cs_calculate_crc(u32 crc, void *buffer, u32 buffersize)
6266+{
6267+ unsigned char *current_byte;
6268+ u32 temp1, temp2, i;
6269+
6270+ current_byte = (unsigned char *) buffer;
6271+ /* Make sure the crc table is available */
6272+ if (crc_table_built == FALSE)
6273+ build_crc_table();
6274+ /* Process each byte in the buffer. */
6275+ for (i = 0; i < buffersize; i++) {
6276+ temp1 = (crc >> 8) & 0x00FFFFFF;
6277+ temp2 = crc_table[(crc ^ (u32) *current_byte) & (u32) 0xff];
6280+ current_byte++;
6281+ crc = temp1 ^ temp2;
6282+ }
6283+ return (crc);
6284+}
6285+
6286+EXPORT_SYMBOL(evms_cs_calculate_crc);
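+
+/* Usage sketch (illustrative only, compiled out): start a new crc
+ * with 0xFFFFFFFF and chain calls when the data arrives in pieces.
+ * example_crc() is a hypothetical caller.
+ */
+#if 0
+static u32 example_crc(void *buf1, u32 len1, void *buf2, u32 len2)
+{
+ u32 crc = 0xFFFFFFFF; /* starting value for a new calculation */
+
+ crc = evms_cs_calculate_crc(crc, buf1, len1);
+ return evms_cs_calculate_crc(crc, buf2, len2);
+}
+#endif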
6287+
6288+#define EVMS_ORIGINAL_CALLBACK_FLAG (1<<0)
6289+typedef struct io_notify_s {
6290+ unsigned int flags;
6291+ void *private;
6292+ struct buffer_head *bh;
6293+ u64 rsector;
6294+ kdev_t rdev;
6295+ void *b_private;
6296+ void (*callback_function) (struct evms_logical_node * node,
6297+ struct buffer_head * bh,
6298+ int uptodate, int *redrive);
6299+ struct io_notify_s *next;
6300+} io_notify_t;
6301+
6302+struct evms_pool_mgmt *
6303+evms_cs_create_pool(int objsize,
6304+ u8 * pool_name,
6305+ void (*ctor) (void *, kmem_cache_t *, unsigned long),
6306+ void (*dtor) (void *, kmem_cache_t *, unsigned long))
6307+{
6308+ struct evms_pool_mgmt *pool;
6309+
6310+ /* create the pool management structure */
6311+ pool = kmalloc(sizeof (struct evms_pool_mgmt), GFP_KERNEL);
6312+ if (!pool) {
6313+ LOG_CRITICAL("Cannot create %s pool mgmt structure",
6314+ pool_name);
6315+ return NULL;
6316+ }
6317+ /* initialize various field in pool mgmt structure */
6318+ memset(pool, 0, sizeof (struct evms_pool_mgmt));
6319+ pool->member_size = objsize;
6320+ pool->name = pool_name;
6321+ pool->waiters = (atomic_t) ATOMIC_INIT(0);
6322+ init_waitqueue_head(&pool->wait_queue);
6323+ /* go create the pool */
6324+ pool->cachep = kmem_cache_create(pool->name,
6325+ pool->member_size,
6326+ 0, SLAB_HWCACHE_ALIGN, ctor, dtor);
6327+ if (!pool->cachep)
6328+ panic("Cannot create %s SLAB cache", pool->name);
6329+ return (pool);
6330+}
6331+
6332+EXPORT_SYMBOL(evms_cs_create_pool);
6333+
6334+void *
6335+evms_cs_allocate_from_pool(struct evms_pool_mgmt *pool, int blockable)
6336+{
6337+ void *objp;
6338+
6339+ while (1) {
6340+ objp = kmem_cache_alloc(pool->cachep, SLAB_NOIO);
6341+ if (objp || !blockable) {
6342+ return (objp);
6343+ } else {
6344+ /* block and wait for an object to
6345+ * be returned to the pool
6346+ */
6347+ atomic_inc(&pool->waiters);
6348+ wait_event(pool->wait_queue,
6349+ (!atomic_read(&pool->waiters)));
6350+ }
6351+ }
6352+ return (objp);
6353+}
6354+
6355+EXPORT_SYMBOL(evms_cs_allocate_from_pool);
6356+
6357+void
6358+evms_cs_deallocate_to_pool(struct evms_pool_mgmt *pool, void *objp)
6359+{
6360+ kmem_cache_free(pool->cachep, objp);
6361+ atomic_set(&pool->waiters, 0);
6362+ if (waitqueue_active(&pool->wait_queue)) {
6363+ wake_up(&pool->wait_queue);
6364+ }
6365+}
6366+
6367+EXPORT_SYMBOL(evms_cs_deallocate_to_pool);
6368+
6369+void
6370+evms_cs_destroy_pool(struct evms_pool_mgmt *pool)
6371+{
6372+ kmem_cache_destroy(pool->cachep);
6373+ kfree(pool);
6374+}
6375+
6376+EXPORT_SYMBOL(evms_cs_destroy_pool);
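+
+/* Pool lifecycle sketch (illustrative only, compiled out): create a
+ * pool for a private per-I/O structure, allocate with blocking
+ * allowed, free, and destroy. "struct my_io" and the pool name are
+ * hypothetical.
+ */
+#if 0
+struct my_io { int dummy; };
+
+static void example_pool_usage(void)
+{
+ struct evms_pool_mgmt *pool;
+ struct my_io *io;
+
+ pool = evms_cs_create_pool(sizeof(struct my_io), "EVMS_my_io",
+ NULL, NULL);
+ if (!pool)
+ return;
+ io = evms_cs_allocate_from_pool(pool, EVMS_BLOCKABLE);
+ if (io)
+ evms_cs_deallocate_to_pool(pool, io);
+ evms_cs_destroy_pool(pool);
+}
+#endif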
6377+
6378+/*
6379+ * function: evms_end_io
6380+ *
6381+ * This is a support function for
6382+ * evms_cs_register_for_end_io_notification.
6383+ * This function is called during I/O completion on any buffer
6384+ * head that was registered by a plugin. Control is passed here
6385+ * and this routine will, thru the use of the I/O notify entry
6386+ * stored in the b_private field of the buffer head, restore
6387+ * the b_rsector value the buffer head had at the time of
6388+ * registration and pass control to the registered callback
6389+ * address, with pointers to the buffer head and an optional
6390+ * plugin private data. Upon completion of the callback,
6391+ * control is returned back here. The io notify list entry
6392+ * is deleted. This process repeats until this routine
6393+ * detects that all registered plugins have been called back
6394+ * and the buffer head's original end_io function has been
6395+ * called. At this point the DONE flag is set, and we terminate
6396+ * the callback loop and exit.
6397+ *
6398+ * Plugins may desire to break or interrupt the callback
6399+ * sequence or chain. This may be useful to redrive I/O or
6400+ * to wait for other buffer heads to complete before
6401+ * allowing the original buffer head callback to occur.
6402+ * To interrupt the callback "chain", a registered
6403+ * plugin's callback must return with the DONE flag set.
6404+ *
6405+ * NOTE: If a plugin set the DONE flag, and wishes to redrive
6406+ * a buffer head, the plugin MUST reregister the buffer head
6407+ * to receive another callback on this buffer head. Also, the
6408+ * plugin MUST ensure that the original buffer head end_io
6409+ * function get called at some point, either by reregistering
6410+ * this buffer head and receiving another callback, or by
6411+ * means of buffer head aggregation triggered by the callbacks
6412+ * of other buffer heads.
6413+ *
6414+ */
6415+static void
6416+evms_end_io(struct buffer_head *bh, int uptodate)
6417+{
6418+ io_notify_t *entry;
6419+ int done;
6420+
6421+ done = FALSE;
6422+ while (!done) {
6423+ /* retrieve the io_notify_entry ptr from
6424+ * the b_private field in the buffer head.
6425+ */
6426+ entry = (io_notify_t *) bh->b_private;
6427+
6428+ /* restore the b_private value to
6429+ * the previous b_private value (which
6430+ * should be a previous io_notify_entry
6431+ * or the original b_private pointer).
6432+ */
6433+ bh->b_private = entry->b_private;
6434+
6435+ /* check for original callback for this bh */
6436+ if (entry->flags & EVMS_ORIGINAL_CALLBACK_FLAG) {
6437+ /* this is the original for bh */
6438+
6439+ /* turn off flag marking this as the original */
6440+ entry->flags &= ~EVMS_ORIGINAL_CALLBACK_FLAG;
6441+
6442+ /* decrement volume's requests_in_progress var */
6443+ atomic_dec(&evms_logical_volumes[MINOR(bh->b_rdev)].
6444+ requests_in_progress);
6445+
6446+ /* restore b_end_io to original value */
6447+ bh->b_end_io = (void *) entry->callback_function;
6448+ if (bh->b_end_io) {
6449+ /* invoke original callback function
6450+ * if it exists.
6451+ */
6452+ bh->b_end_io(bh, uptodate);
6453+ }
6454+ done = TRUE;
6455+ } else {
6456+ /* this is a plugin callback */
6457+
6458+ /* restore the rsector value to the
6459+ * value at the time of callback
6460+ * registration.
6461+ */
6462+ bh->b_rsector = entry->rsector;
6463+ bh->b_rdev = entry->rdev;
6464+ /* invoke plugin callback function */
6465+ entry->callback_function(entry->private, bh, uptodate,
6466+ &done);
6467+ }
6468+ /* free the io notify entry */
6469+ evms_cs_deallocate_to_pool(evms_io_notify_pool, entry);
6470+ }
6471+}
6472+
6473+/*
6474+ * function: evms_cs_register_for_end_io_notification
6475+ *
6476+ * This function is an evms common service.
6477+ * This routine allows a (plugin) function to register to
6478+ * participate in the io completion notification process.
6479+ * This is useful for plugins which alter data after it
6480+ * has been read from the disk (e.g. encryption or
6481+ * compression).
6482+ *
6483+ * This routine also records the rsector value at the time
6484+ * of registration, so that it can be restored to that value
6485+ * prior to the callback to a plugin, thus allowing that
6486+ * plugin to work with the value it had seen during the
6487+ * initiating I/O request.
6488+ *
6489+ * This routine also records a private data pointer at the
6490+ * time of registration, and is returned to the plugin
6491+ * at callback time. This private data pointer was designed
6492+ * to contain context/callback/buffer_head specific data, and
6493+ * frees the plugin from having to store and find associated
6494+ * data at the time of the callback. This field is not used
6495+ * by this function and is optional (NULL if unused). It is
6496+ * recorded and returned as a convenience for the plugins.
6497+ *
6498+ * DANGER!!! - WILL ROBINSON - DANGER!!!
6499+ * This routine uses the b_private field in the
6500+ * buffer_head structure. If any lower level driver uses this
6501+ * field and does NOT restore it, the I/O callback will fail!!
6502+ *
6503+ * Any plugin writers requiring a field for private storage
6504+ * should instead use the private field parameter in this
6505+ * function to store their private data.
6506+ *
6507+ */
6508+
6509+int
6510+evms_cs_register_for_end_io_notification(void *private,
6511+ struct buffer_head *bh,
6512+ void *callback_function)
6513+{
6514+ int rc = 0, done;
6515+ io_notify_t *new_entry;
6516+
6517+ done = FALSE;
6518+ while (!done) {
6519+ /* allocate a notify entry */
6520+ new_entry =
6521+ evms_cs_allocate_from_pool(evms_io_notify_pool,
6522+ EVMS_BLOCKABLE);
6523+ if (!new_entry) {
6524+ schedule();
6525+ continue;
6526+ }
6527+
6528+ /* initialize notify entry */
6529+ new_entry->private = private;
6530+ new_entry->bh = bh;
6531+ new_entry->rsector = bh->b_rsector;
6532+ new_entry->rdev = bh->b_rdev;
6533+ new_entry->b_private = bh->b_private;
6534+ new_entry->flags = 0;
6535+
6536+ /* is this the first callback for this bh? */
6537+ if (bh->b_end_io != evms_end_io) {
6538+ /* yes, first callback */
6539+ new_entry->flags |= EVMS_ORIGINAL_CALLBACK_FLAG;
6540+ new_entry->callback_function = (void *) bh->b_end_io;
6541+
6542+ /* increment volume's requests_in_progress var */
6543+ atomic_inc(&evms_logical_volumes[MINOR(bh->b_rdev)].
6544+ requests_in_progress);
6545+
6546+ /* set b_end_io so we get control */
6547+ bh->b_end_io = evms_end_io;
6548+ } else {
6549+ /* no, not first callback */
6550+ new_entry->callback_function = callback_function;
6551+ done = TRUE;
6552+ }
6553+ /* set b_private to aid in quick lookup */
6554+ bh->b_private = new_entry;
6555+ }
6556+ return (rc);
6557+}
6558+
6559+EXPORT_SYMBOL(evms_cs_register_for_end_io_notification);
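+
+/* Registration sketch (illustrative only, compiled out): a plugin that
+ * post-processes read data hooks completion like this. my_callback and
+ * my_context are hypothetical; note the callback's first argument is
+ * the private pointer passed at registration time.
+ */
+#if 0
+static void my_callback(void *private, struct buffer_head *bh,
+ int uptodate, int *done)
+{
+ /* e.g. decrypt bh->b_data here; leaving *done untouched lets
+ * the callback chain continue to the original b_end_io.
+ */
+}
+
+static void example_hook(void *my_context, struct buffer_head *bh)
+{
+ evms_cs_register_for_end_io_notification(my_context, bh, my_callback);
+ /* ...then submit bh down the stack as usual... */
+}
+#endif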
6560+
6561+/* function description: evms_cs_lookup_item_in_list
6562+ *
6563+ * this function searches for the specified item in the
6564+ * specified node list. it returns the address of the
6565+ * evms_list_node containing the specified item.
6566+ */
6567+struct evms_list_node **
6568+evms_cs_lookup_item_in_list(struct evms_list_node **node_list, void *item)
6569+{
6570+ struct evms_list_node **list_node;
6571+
6572+ list_node = node_list;
6573+ while (*list_node) {
6574+ if ((*list_node)->item == item)
6575+ break;
6576+ list_node = &(*list_node)->next;
6577+ }
6578+ return (list_node);
6579+}
6580+
6581+EXPORT_SYMBOL(evms_cs_lookup_item_in_list);
6582+
6583+/* function description: evms_cs_add_item_to_list
6584+ *
6585+ * this function adds an item to an item list. the node for
6586+ * the new item is added to the end of the list. the list is
6587+ * traversed to find the end; while the traversal occurs, the
6588+ * list is checked for the presence of the specified item.
6589+ *
6590+ * RC == 0 is returned for:
6591+ * a successful add of a new item
6592+ *
6593+ * RC == 1 is returned when:
6594+ * the item is already on the list
6595+ *
6596+ * RC < 0 is returned for an error attempting to add the item.
6597+ */
6604+int
6605+evms_cs_add_item_to_list(struct evms_list_node **list, void *item)
6606+{
6607+ int rc = 0;
6608+ struct evms_list_node **list_node, *new_node;
6609+
6610+ list_node = evms_cs_lookup_item_in_list(list, item);
6611+ if (*list_node == NULL) {
6612+ new_node = kmalloc(sizeof (struct evms_list_node), GFP_NOIO);
6613+ if (new_node) {
6614+ memset(new_node, 0, sizeof (struct evms_list_node));
6615+ new_node->item = item;
6616+ *list_node = new_node;
6617+ } else {
6618+ rc = -ENOMEM;
6619+ }
6620+ } else {
6621+ rc = 1;
6622+ LOG_DEBUG
6623+ ("warning: attempt to add duplicate item(%p) to list(%p).\n",
6624+ item, list);
6625+ }
6626+ return (rc);
6627+}
6628+
6629+EXPORT_SYMBOL(evms_cs_add_item_to_list);
6630+
6631+/* function description: evms_cs_remove_item_from_list
6632+ *
6633+ * this function removes a specified item from the
6634+ * specified list. if the specified item is not
6635+ * found in the list, an error is returned.
6636+ */
6637+int
6638+evms_cs_remove_item_from_list(struct evms_list_node **list, void *item)
6639+{
6640+ int rc = 0;
6641+ struct evms_list_node **list_node;
6642+
6643+ /* check to see if item is in the list */
6644+ list_node = evms_cs_lookup_item_in_list(list, item);
6645+
6646+ /* was the node found in the list? */
6647+ if (*list_node) {
6648+ /* yes, it was found */
6649+ struct evms_list_node *tmp_node;
6650+
6651+ /* save ptr to node being removed */
6652+ tmp_node = *list_node;
6653+ /* remove it from the global list */
6654+ *list_node = tmp_node->next;
6655+ /* delete removed node */
6656+ kfree(tmp_node);
6657+ } else {
6658+ /* no, it was not found */
6659+ rc = -1;
6660+ LOG_ERROR
6661+ ("error(%d): attempt to remove nonexistant node(%p) from list(%p).\n",
6662+ rc, item, list);
6663+ }
6664+ return (rc);
6665+}
6666+
6667+EXPORT_SYMBOL(evms_cs_remove_item_from_list);
6668+
6669+/* function description: evms_cs_register_device
6670+ *
6671+ * this function adds a device to the EVMS global device list.
6672+ *
6673+ * RC == 0 is returned for:
6674+ * a successful add of a new device
6675+ *
6676+ * RC == 1 is returned when:
6677+ * the device is already on the list
6678+ *
6679+ * RC < 0 is returned for an error attempting to add the device.
6680+ */
6681+int
6682+evms_cs_register_device(struct evms_logical_node *device)
6683+{
6684+ return (evms_cs_add_item_to_list(&evms_global_device_list, device));
6685+}
6686+
6687+EXPORT_SYMBOL(evms_cs_register_device);
6688+
6689+/* function description: evms_cs_unregister_device
6690+ *
6691+ * this function removes a device from the EVMS global device list.
6692+ *
6693+ * RC == 0 is returned for:
6694+ * a successful removal of the specified device
6695+ *
6696+ * RC < 0 is returned for an error attempting to remove the device.
6697+ * -ENODATA is returned if specified device is not found.
6698+ */
6699+int
6700+evms_cs_unregister_device(struct evms_logical_node *device)
6701+{
6702+ return (evms_cs_remove_item_from_list(&evms_global_device_list,
6703+ device));
6704+}
6705+
6706+EXPORT_SYMBOL(evms_cs_unregister_device);
6707+
6708+static struct evms_list_node *find_first_next_list_node = NULL;
6709+int
6710+evms_cs_find_next_device(struct evms_logical_node *in_device,
6711+ struct evms_logical_node **out_device)
6712+{
6713+ int rc = 0;
6714+ struct evms_list_node **list_node;
6715+
6716+ if (in_device == NULL)
6717+ find_first_next_list_node = evms_global_device_list;
6718+ else {
6719+ list_node =
6720+ evms_cs_lookup_item_in_list(&evms_global_device_list,
6721+ in_device);
6722+ find_first_next_list_node = *list_node;
6723+ if (find_first_next_list_node == NULL)
6724+ rc = -ENODATA;
6725+ else
6726+ find_first_next_list_node =
6727+ find_first_next_list_node->next;
6728+ }
6729+
6730+ if (find_first_next_list_node == NULL)
6731+ *out_device = NULL;
6732+ else
6733+ *out_device = (struct evms_logical_node *)
6734+ find_first_next_list_node->item;
6735+
6736+ return (rc);
6737+}
6738+
6739+EXPORT_SYMBOL(evms_cs_find_next_device);
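+
+/* Iteration sketch (illustrative only, compiled out): walking the
+ * global device list, starting the scan with NULL and stopping when
+ * no device is returned. example_walk_devices() is hypothetical.
+ */
+#if 0
+static void example_walk_devices(void)
+{
+ struct evms_logical_node *dev = NULL;
+
+ evms_cs_find_next_device(NULL, &dev);
+ while (dev) {
+ /* inspect dev here */
+ if (evms_cs_find_next_device(dev, &dev))
+ break;
+ }
+}
+#endif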
6740+
6741+void
6742+evms_cs_signal_event(int eventid)
6743+{
6744+ int rc;
6745+ struct evms_list_node **list_node;
6746+
6747+ /* signal PID(s) of specified event */
6748+ list_node = &evms_global_notify_list;
6749+ while (*list_node) {
6750+ struct evms_event *event;
6751+
6752+ event = (*list_node)->item;
6753+ if (event->eventid == eventid) {
6754+ struct task_struct *tsk;
6755+
6756+ tsk = find_task_by_pid(event->pid);
6757+ if (tsk) {
6758+ struct siginfo siginfo;
6759+
6760+ siginfo.si_signo = event->signo;
6761+ siginfo.si_errno = 0;
6762+ siginfo.si_code = 0;
6763+ rc = send_sig_info(event->signo, &siginfo, tsk);
6764+ } else {
6765+ /* TODO:
6766+ * unregister this stale
6767+ * notification record
6768+ */
6769+ }
6770+ }
6771+ list_node = &(*list_node)->next;
6772+ }
6773+}
6774+
6775+EXPORT_SYMBOL(evms_cs_signal_event);
6776+
6777+static inline void
6778+evms_flush_signals(void)
6779+{
6780+ spin_lock(&current->sigmask_lock);
6781+ flush_signals(current);
6782+ spin_unlock(&current->sigmask_lock);
6783+}
6784+
6785+static inline void
6786+evms_init_signals(void)
6787+{
6788+ current->exit_signal = SIGCHLD;
6789+ siginitsetinv(&current->blocked, sigmask(SIGKILL));
6790+}
6791+
6792+static int
6793+evms_thread(void *arg)
6794+{
6795+ struct evms_thread *thread = arg;
6796+ lock_kernel();
6797+
6798+ /*
6799+ * Detach thread
6800+ */
6801+
6802+ daemonize();
6803+
6804+ snprintf(current->comm, sizeof(current->comm), "%s", thread->name);
6805+ evms_init_signals();
6806+ evms_flush_signals();
6807+ thread->tsk = current;
6808+
6809+ current->policy = SCHED_OTHER;
6810+#ifdef O1_SCHEDULER
6811+ set_user_nice(current, -20);
6812+#else
6813+ current->nice = -20;
6814+#endif
6815+ unlock_kernel();
6816+
6817+ complete(thread->event);
6818+ while (thread->run) {
6819+ void (*run) (void *data);
6820+ DECLARE_WAITQUEUE(wait, current);
6821+
6822+ add_wait_queue(&thread->wqueue, &wait);
6823+#ifdef O1_SCHEDULER
6824+ set_current_state(TASK_INTERRUPTIBLE);
6825+#else
6826+ set_task_state(current, TASK_INTERRUPTIBLE);
6827+#endif
6828+ if (!test_bit(EVMS_THREAD_WAKEUP, &thread->flags)) {
6829+ schedule();
6830+ }
6831+#ifdef O1_SCHEDULER
6832+ set_current_state(TASK_RUNNING);
6833+#else
6834+ current->state = TASK_RUNNING;
6835+#endif
6836+ remove_wait_queue(&thread->wqueue, &wait);
6837+ clear_bit(EVMS_THREAD_WAKEUP, &thread->flags);
6838+
6839+ run = thread->run;
6840+ if (run) {
6841+ run(thread->data);
6842+ run_task_queue(&tq_disk);
6843+ }
6844+ if (signal_pending(current)) {
6845+ evms_flush_signals();
6846+ }
6847+ }
6848+ complete(thread->event);
6849+ return 0;
6850+}
6851+
6852+struct evms_thread *
6853+evms_cs_register_thread(void (*run) (void *), void *data, const u8 * name)
6854+{
6855+ struct evms_thread *thread;
6856+ int ret;
6857+ struct completion event;
6858+
6859+ thread = kmalloc(sizeof (struct evms_thread), GFP_KERNEL);
6860+ if (!thread) {
6861+ return NULL;
6862+ }
6863+ memset(thread, 0, sizeof (struct evms_thread));
6864+ init_waitqueue_head(&thread->wqueue);
6865+
6866+ init_completion(&event);
6867+ thread->event = &event;
6868+ thread->run = run;
6869+ thread->data = data;
6870+ thread->name = name;
6871+ ret = kernel_thread(evms_thread, thread, 0);
6872+ if (ret < 0) {
6873+ kfree(thread);
6874+ return NULL;
6875+ }
6876+ wait_for_completion(&event);
6877+ return thread;
6878+}
6879+
6880+EXPORT_SYMBOL(evms_cs_register_thread);
6881+
6882+void
6883+evms_cs_unregister_thread(struct evms_thread *thread)
6884+{
6885+ struct completion event;
6886+
6887+ init_completion(&event);
6888+
6889+ thread->event = &event;
6890+ thread->run = NULL;
6891+ thread->name = NULL;
6892+ evms_cs_interrupt_thread(thread);
6893+ wait_for_completion(&event);
6894+ kfree(thread);
6895+}
6896+
6897+EXPORT_SYMBOL(evms_cs_unregister_thread);
6898+
6899+void
6900+evms_cs_wakeup_thread(struct evms_thread *thread)
6901+{
6902+ set_bit(EVMS_THREAD_WAKEUP, &thread->flags);
6903+ wake_up(&thread->wqueue);
6904+}
6905+
6906+EXPORT_SYMBOL(evms_cs_wakeup_thread);
6907+
6908+void
6909+evms_cs_interrupt_thread(struct evms_thread *thread)
6910+{
6911+ if (!thread->tsk) {
6912+ LOG_ERROR("error: attempted to interrupt an invalid thread!\n");
6913+ return;
6914+ }
6915+ send_sig(SIGKILL, thread->tsk, 1);
6916+}
6917+
6918+EXPORT_SYMBOL(evms_cs_interrupt_thread);
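+
+/* Thread lifecycle sketch (illustrative only, compiled out): register
+ * a worker daemon, wake it when work is queued, and unregister it on
+ * unload. my_worker and the thread name are hypothetical.
+ */
+#if 0
+static void my_worker(void *data)
+{
+ /* drain a work queue here; called each time the thread is woken */
+}
+
+static void example_thread_usage(void)
+{
+ struct evms_thread *t;
+
+ t = evms_cs_register_thread(my_worker, NULL, "my_evmsd");
+ if (t) {
+ evms_cs_wakeup_thread(t);
+ evms_cs_unregister_thread(t);
+ }
+}
+#endif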
6919+
6920+struct proc_dir_entry *
6921+evms_cs_get_evms_proc_dir(void)
6922+{
6923+#ifdef CONFIG_PROC_FS
6924+ if (!evms_proc_dir) {
6925+ evms_proc_dir = create_proc_entry("evms", S_IFDIR, &proc_root);
6926+ }
6927+#endif
6928+ return (evms_proc_dir);
6929+}
6930+
6931+EXPORT_SYMBOL(evms_cs_get_evms_proc_dir);
6932+
6933+int
6934+evms_cs_volume_request_in_progress(kdev_t dev,
6935+ int operation, int *current_count)
6936+{
6937+ int rc = 0;
6938+ struct evms_logical_volume *volume;
6939+
6940+ volume = &evms_logical_volumes[MINOR(dev)];
6941+ if (volume->node) {
6942+ if (operation > 0) {
6943+ atomic_inc(&volume->requests_in_progress);
6944+ } else if (operation < 0) {
6945+ atomic_dec(&volume->requests_in_progress);
6946+ }
6947+ if (current_count) {
6948+ *current_count =
6949+ atomic_read(&volume->requests_in_progress);
6950+ }
6951+ } else {
6952+ rc = -ENODEV;
6953+ }
6954+ return (rc);
6955+}
6956+
6957+EXPORT_SYMBOL(evms_cs_volume_request_in_progress);
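+
+/* Usage sketch (illustrative only, compiled out): bump the in-flight
+ * count around a privately built request so a concurrent quiesce
+ * waits for it (cf. evms_quiesce_volume below). example_private_io()
+ * is hypothetical.
+ */
+#if 0
+static void example_private_io(kdev_t dev)
+{
+ evms_cs_volume_request_in_progress(dev, +1, NULL);
+ /* ...issue and wait for the private I/O... */
+ evms_cs_volume_request_in_progress(dev, -1, NULL);
+}
+#endif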
6958+
6959+void
6960+evms_cs_invalidate_volume(struct evms_logical_node *node)
6961+{
6962+ int i;
6963+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
6964+ if (evms_logical_volumes[i].node && node->name) {
6965+ if (!strcmp(evms_logical_volumes[i].node->name, node->name)) {
6969+ LOG_DETAILS
6970+ ("Invalidating EVMS device %s minor %d\n",
6971+ node->name, i);
6972+ invalidate_device(MKDEV(EVMS_MAJOR, i), 0);
6973+ break;
6974+ }
6975+ }
6976+ }
6977+}
6978+
6979+EXPORT_SYMBOL(evms_cs_invalidate_volume);
6980+
6981+static int
6982+is_open(int minor)
6983+{
6984+ return atomic_read(&evms_logical_volumes[minor].opens);
6985+}
6986+
6987+/**********************************************************/
6988+/* END -- exported functions/Common Services */
6989+/**********************************************************/
6990+
6991+/**********************************************************/
6992+/* START -- Proc FS Support functions */
6993+/**********************************************************/
6994+
6995+#ifdef CONFIG_PROC_FS
6996+static int
6997+evms_info_read_proc(char *page,
6998+ char **start, off_t off, int count, int *eof, void *data)
6999+{
7000+ int sz = 0;
7001+ char *info_level_text = NULL;
7002+
7003+ PROCPRINT("Enterprise Volume Management System: Info\n");
7004+ switch (evms_info_level) {
7005+ case EVMS_INFO_CRITICAL:
7006+ info_level_text = "critical";
7007+ break;
7008+ case EVMS_INFO_SERIOUS:
7009+ info_level_text = "serious";
7010+ break;
7011+ case EVMS_INFO_ERROR:
7012+ info_level_text = "error";
7013+ break;
7014+ case EVMS_INFO_WARNING:
7015+ info_level_text = "warning";
7016+ break;
7017+ case EVMS_INFO_DEFAULT:
7018+ info_level_text = "default";
7019+ break;
7020+ case EVMS_INFO_DETAILS:
7021+ info_level_text = "details";
7022+ break;
7023+ case EVMS_INFO_DEBUG:
7024+ info_level_text = "debug";
7025+ break;
7026+ case EVMS_INFO_EXTRA:
7027+ info_level_text = "extra";
7028+ break;
7029+ case EVMS_INFO_ENTRY_EXIT:
7030+ info_level_text = "entry exit";
7031+ break;
7032+ case EVMS_INFO_EVERYTHING:
7033+ info_level_text = "everything";
7034+ break;
7035+ default:
7036+ info_level_text = "unknown";
7037+ break;
7038+ }
7039+ PROCPRINT("EVMS info level: %d (%s).\n",
7040+ evms_info_level, info_level_text);
7041+
7042+ PROCPRINT("EVMS kernel version: %d.%d.%d\n",
7043+ EVMS_MAJOR_VERSION,
7044+ EVMS_MINOR_VERSION, EVMS_PATCHLEVEL_VERSION);
7045+
7046+ PROCPRINT("EVMS IOCTL interface version: %d.%d.%d\n",
7047+ EVMS_IOCTL_INTERFACE_MAJOR,
7048+ EVMS_IOCTL_INTERFACE_MINOR, EVMS_IOCTL_INTERFACE_PATCHLEVEL);
7049+
7050+ PROCPRINT("EVMS Common Services version: %d.%d.%d\n",
7051+ EVMS_COMMON_SERVICES_MAJOR,
7052+ EVMS_COMMON_SERVICES_MINOR, EVMS_COMMON_SERVICES_PATCHLEVEL);
7053+
7054+ *eof = 1;
7055+
7056+out:
7057+ *start = page + off;
7058+ sz -= off;
7059+ if (sz < 0)
7060+ sz = 0;
7061+ return sz > count ? count : sz;
7062+}
7063+
7064+static int
7065+evms_plugins_read_proc(char *page,
7066+ char **start, off_t off, int count, int *eof, void *data)
7067+{
7068+ int sz = 0;
7069+ struct evms_registered_plugin *rp = NULL;
7070+
7071+ PROCPRINT("Enterprise Volume Management System: Plugins\n");
7072+ /* 0 1 1 2 2 3 3 4 4 5 5 6 6 7 */
7073+ /* 1 5 0 5 0 5 0 5 0 5 0 5 0 5 0 */
7074+ PROCPRINT(" ---------Plugin---------- required services\n");
7075+ PROCPRINT(" ----id---- version version\n\n");
7076+ for (rp = registered_plugin_head; rp; rp = rp->next) {
7077+ PROCPRINT(" %x.%x.%x\t %d.%d.%d\t%d.%d.%d\n",
7078+ GetPluginOEM(rp->plugin->id),
7079+ GetPluginType(rp->plugin->id),
7080+ GetPluginID(rp->plugin->id),
7081+ rp->plugin->version.major,
7082+ rp->plugin->version.minor,
7083+ rp->plugin->version.patchlevel,
7084+ rp->plugin->required_services_version.major,
7085+ rp->plugin->required_services_version.minor,
7086+ rp->plugin->required_services_version.patchlevel);
7087+ }
7088+
7089+out:
7090+ *start = page + off;
7091+ sz -= off;
7092+ if (sz < 0)
7093+ sz = 0;
7094+ return sz > count ? count : sz;
7095+}
7096+
7097+static int
7098+evms_volumes_read_proc(char *page,
7099+ char **start, off_t off, int count, int *eof, void *data)
7100+{
7101+ int sz = 0, j;
7102+
7103+ PROCPRINT("Enterprise Volume Management System: Volumes\n");
7104+ PROCPRINT("major minor #blocks type flags name\n\n");
7105+ for (j = 1; j < MAX_EVMS_VOLUMES; j++) {
7106+ struct evms_logical_volume *volume;
7107+
7108+ volume = &evms_logical_volumes[j];
7109+ if (volume->node) {
7110+ PROCPRINT("%5d %7d %16Ld %s %s %s %s%s\n",
7111+ EVMS_MAJOR, j,
7112+ (long long)volume->node->total_vsectors >> 1,
7113+ (volume->flags & EVMS_VOLUME_FLAG) ? "evms " : "compat",
7114+ (volume->flags & EVMS_VOLUME_READ_ONLY) ? "ro" : "rw",
7115+ (volume->flags & EVMS_VOLUME_PARTIAL) ? "p " : " ",
7120+ EVMS_DEV_NODE_PATH, volume->name);
7121+ }
7122+ }
7123+out:
7124+ *start = page + off;
7125+ sz -= off;
7126+ if (sz < 0)
7127+ sz = 0;
7128+ return sz > count ? count : sz;
7129+
7130+}
7131+#endif
7132+
7133+/**********************************************************/
7134+/* END -- Proc FS Support functions */
7135+/**********************************************************/
7136+
7137+/**********************************************************/
7138+/* START -- FOPS functions definitions */
7139+/**********************************************************/
7140+
7141+/************************************************/
7142+/* START -- IOCTL commands -- EVMS specific */
7143+/************************************************/
7144+
7145+static int
7146+evms_ioctl_cmd_get_ioctl_version(void *arg)
7147+{
7148+ int rc = 0;
7149+ struct evms_version ver;
7150+
7151+ ver.major = EVMS_IOCTL_INTERFACE_MAJOR;
7152+ ver.minor = EVMS_IOCTL_INTERFACE_MINOR;
7153+ ver.patchlevel = EVMS_IOCTL_INTERFACE_PATCHLEVEL;
7154+
7155+ /* copy info to userspace */
7156+ if (copy_to_user(arg, &ver, sizeof (ver)))
7157+ rc = -EFAULT;
7158+
7159+ return (rc);
7160+}
7161+
7162+static int
7163+evms_ioctl_cmd_get_version(void *arg)
7164+{
7165+ int rc = 0;
7166+ struct evms_version ver;
7167+
7168+ ver.major = EVMS_MAJOR_VERSION;
7169+ ver.minor = EVMS_MINOR_VERSION;
7170+ ver.patchlevel = EVMS_PATCHLEVEL_VERSION;
7171+
7172+ /* copy info to userspace */
7173+ if (copy_to_user(arg, &ver, sizeof (ver)))
7174+ rc = -EFAULT;
7175+
7176+ return (rc);
7177+}
7178+
7179+static int
7180+evms_ioctl_cmd_get_info_level(void *arg)
7181+{
7182+ int rc = 0;
7183+
7184+ /* copy info to userspace */
7185+ if (copy_to_user(arg, &evms_info_level, sizeof (evms_info_level)))
7186+ rc = -EFAULT;
7187+
7188+ return (rc);
7189+}
7190+
7191+static int
7192+evms_ioctl_cmd_set_info_level(void *arg)
7193+{
7194+ int temp, rc = 0;
7195+
7196+ /* copy info from userspace */
7197+ if (copy_from_user(&temp, arg, sizeof (temp)))
7198+ rc = -EFAULT;
7199+ else
7200+ evms_info_level = temp;
7201+
7202+ return (rc);
7203+}
7204+
7205+/* function: evms_quiesce_volume
7206+ *
7207+ * this function performs the actual quiesce operation on
7208+ * a volume in kernel memory.
7209+ *
7210+ * when quiescing, all new I/Os to a volume are stopped,
7211+ * causing the calling thread to block. this thread then
7212+ * waits until all I/Os in progress are completed, before
7213+ * returning control to the caller.
7214+ *
7215+ * when unquiescing, all new I/Os are allowed to proceed
7216+ * unencumbered, and all threads waiting (blocked) on this
7217+ * volume are woken up and allowed to proceed.
7218+ *
7219+ */
7220+static int
7221+evms_quiesce_volume(struct evms_logical_volume *volume,
7222+ struct inode *inode,
7223+ struct file *file, struct evms_quiesce_vol_pkt *qv)
7224+{
7225+ int rc;
7226+
7227+ LOG_DEBUG("%squiescing %s.\n",
7228+ ((qv->command) ? "" : "un"), volume->name);
7229+
7230+#ifdef VFS_PATCH_PRESENT
7231+ if (qv->do_vfs) {
7232+ /* VFS function call to sync and lock the filesystem */
7233+ fsync_dev_lockfs(MKDEV(EVMS_MAJOR, qv->minor));
7234+ volume->vfs_quiesced = TRUE;
7235+ }
7236+#endif
7237+ volume->quiesced = qv->command;
7238+
7239+ /* Command specified was "quiesce". */
7240+ if (qv->command) {
7241+ /* After setting the volume to
7242+ * a quiesced state, there could
7243+ * be threads (on SMP systems)
7244+ * that are executing in the
7245+ * function, evms_handle_request,
7246+ * between the "wait_event" and the
7247+ * "atomic_inc" lines. We need to
7248+ * provide a "delay" sufficient
7249+ * to allow those threads to
7250+ * reach the atomic_inc's
7251+ * before executing the while loop
7252+ * below. The "schedule" call should
7253+ * provide this.
7254+ */
7255+ schedule();
7256+ /* wait for outstanding requests
7257+ * to complete
7258+ */
7259+ while (atomic_read(&volume->requests_in_progress) > 0)
7260+ schedule();
7261+ }
7262+ /* send this command down the stack so lower */
7263+ /* layers can know about this */
7264+ rc = IOCTL(volume->node, inode, file,
7265+ EVMS_QUIESCE_VOLUME, (unsigned long) qv);
7266+ if (!rc) {
7267+ /* Command specified was "unquiesce". */
7268+ if (!qv->command) {
7269+ /* "wakeup" any I/O requests waiting on
7270+ * this volume.
7271+ */
7272+ if (waitqueue_active(&volume->wait_queue))
7273+ wake_up(&volume->wait_queue);
7274+#ifdef VFS_PATCH_PRESENT
7275+ if (volume->vfs_quiesced) {
7276+ /* VFS function call to unlock the filesystem */
7277+ unlockfs(MKDEV(EVMS_MAJOR, qv->minor));
7278+ volume->vfs_quiesced = FALSE;
7279+ }
7280+#endif
7281+ }
7282+ } else {
7283+ LOG_ERROR("error(%d) %squiescing %s.\n",
7284+ rc, ((qv->command) ? "" : "un"), volume->name);
7285+ }
7286+ return (rc);
7287+}
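+
+/* Userspace-side sketch (hypothetical, compiled out): quiescing a
+ * volume through the ioctl interface that lands in the function
+ * above. Field names follow struct evms_quiesce_vol_pkt; evms_fd and
+ * target_minor are illustrative.
+ */
+#if 0
+static void example_quiesce(int evms_fd, int target_minor)
+{
+ struct evms_quiesce_vol_pkt qv;
+
+ memset(&qv, 0, sizeof(qv));
+ qv.command = EVMS_QUIESCE;
+ qv.do_vfs = EVMS_VFS_DO_NOTHING;
+ qv.minor = target_minor;
+ ioctl(evms_fd, EVMS_QUIESCE_VOLUME, &qv);
+}
+#endif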
7288+
7289+/* function: evms_delete_volume
7290+ *
7291+ * this function performs the actual delete operation on
7292+ * a volume to purge it from kernel memory. all structures
7293+ * and memory consumed by this volume will be freed, as well
7294+ * as clearing or unregistering any system services or
7295+ * global data arrays.
7296+ *
7297+ * NOTE: this function will return -EBUSY on attempts to
7298+ * delete mounted volumes.
7299+ *
7300+ */
7301+static int
7302+evms_delete_volume(struct evms_logical_volume *volume,
7303+ struct evms_delete_vol_pkt *dv)
7304+{
7305+ int rc = 0;
7306+
7307+ /* if this is a "permanent" delete */
7308+ /* check to make sure volume is not mounted */
7309+ if (dv->command) {
7310+ if (is_open(dv->minor)) {
7311+ rc = -EBUSY;
7312+ } else {
7313+ // invalidate the device since it is not coming back
7314+ // this is required in case we are re-using the minor number
7315+ invalidate_device(MKDEV(EVMS_MAJOR, dv->minor), 1);
7316+ }
7317+ }
7318+
7319+ /* invoke the delete ioctl at the top of the feature stack */
7320+ if (!rc) {
7321+ LOG_DETAILS("deleting '%s'.\n", volume->name);
7322+ rc = DELETE(volume->node);
7323+ }
7324+
7325+ /* the volume has been deleted, do any clean up work
7326+ * required.
7327+ */
7328+ if (!rc) {
7329+ devfs_unregister(volume->devfs_handle);
7330+ if (dv->command) {
7331+ /* if "permanent" delete, free the name
7332+ * and NULL the name field.
7333+ */
7334+ kfree(volume->name);
7335+ volume->name = NULL;
7336+ volume->flags = 0;
7337+ } else {
7338+ /* if "soft" delete, leave the name so
7339+ * we can use it to reassign the same
7340+ * minor to this volume after a
7341+ * rediscovery.
7342+ */
7343+ volume->flags = EVMS_VOLUME_SOFT_DELETED;
7344+ }
7345+ volume->node = NULL;
7346+ set_device_ro(MKDEV(EVMS_MAJOR, dv->minor), 0);
7347+ blk_size[EVMS_MAJOR][dv->minor] = 0;
7348+ blksize_size[EVMS_MAJOR][dv->minor] = 0;
7349+ hardsect_size[EVMS_MAJOR][dv->minor] = 0;
7350+ evms_volumes--;
7351+ } else {
7352+ LOG_ERROR("error(%d) %s deleting %s.\n",
7353+ rc, ((dv->command) ? "hard" : "soft"), volume->name);
7354+ }
7355+ return (rc);
7356+}
7357+
7358+/* function: evms_user_delete_volume
7359+ *
7360+ * this function, depending on the parameters, performs
7361+ * a "soft" or a "hard" delete. for a "soft" delete, a
7362+ * quiesce & delete request is queued up, to be executed
7363+ * at the beginning of the next rediscovery. for a
7364+ * "hard" delete, the target volume is quiesced and then
7365+ * deleted. if there are any errors attempting to delete
7366+ * the target, then the target is unquiesced. if an
7367+ * associative volume is specified it is quiesced before
7368+ * the target volume is quiesced, and is unquiesced
7369+ * after the attempt to delete the target volume.
7370+ *
7371+ */
7372+static int
7373+evms_user_delete_volume(struct evms_logical_volume *lvt,
7374+ struct inode *inode,
7375+ struct file *file, struct evms_delete_vol_pkt *dv)
7376+{
7377+ int rc = 0;
7378+
7379+ if (!dv->command) {
7380+ /* "soft delete" requested */
7381+ lvt->flags |= (EVMS_REQUESTED_QUIESCE | EVMS_REQUESTED_DELETE);
7382+ if (dv->do_vfs) {
7383+ lvt->flags |= EVMS_REQUESTED_VFS_QUIESCE;
7384+ }
7385+ } else {
7386+ /* "hard delete" requested */
7387+ int qa = FALSE;
7388+ struct evms_quiesce_vol_pkt qv;
7389+ struct evms_logical_volume *lva = NULL;
7390+
7391+ if (dv->associative_minor) {
7392+ /* associative volume specified
7393+ *
7394+ * quiesce it
7395+ */
7396+ lva = &evms_logical_volumes[dv->associative_minor];
7397+ /* quiesce associative volume */
7398+ qv.command = EVMS_QUIESCE;
7399+ qv.do_vfs = EVMS_VFS_DO_NOTHING;
7400+ qv.minor = dv->associative_minor;
7401+ rc = evms_quiesce_volume(lva, inode, file, &qv);
7402+ qa = (rc) ? FALSE : TRUE;
7403+ }
7404+ if (!rc) {
7405+ /* quiesce target volume */
7406+ qv.command = EVMS_QUIESCE;
7407+ qv.do_vfs = EVMS_VFS_DO_NOTHING;
7408+ qv.minor = dv->minor;
7409+ rc = evms_quiesce_volume(lvt, inode, file, &qv);
7410+ }
7411+ if (!rc) {
7412+ /* delete the target volume */
7413+ rc = evms_delete_volume(lvt, dv);
7414+ if (rc) {
7415+				/* got an error deleting...
7416+ *
7417+ * unquiesce the target
7418+ */
7419+ qv.command = EVMS_UNQUIESCE;
7420+ qv.do_vfs = EVMS_VFS_DO_NOTHING;
7421+ qv.minor = dv->minor;
7422+ evms_quiesce_volume(lvt, inode, file, &qv);
7423+ }
7424+ }
7425+ if (dv->associative_minor) {
7426+ /* associative volume specified
7427+ *
7428+ * unquiesce it
7429+ */
7430+ if (qa) {
7431+ /* only unquiesce associative
7432+ * if we successfully quiesced
7433+ * it previously.
7434+ */
7435+ qv.command = EVMS_UNQUIESCE;
7436+ qv.do_vfs = EVMS_VFS_DO_NOTHING;
7437+ qv.minor = dv->associative_minor;
7438+ evms_quiesce_volume(lva, inode, file, &qv);
7439+ }
7440+ }
7441+ }
7442+ return (rc);
7443+}
7444+
7445+/* function: evms_ioctl_cmd_delete_volume
7446+ *
7447+ * this function copies user data to/from the kernel and
7448+ * validates user parameters. after validation, control
7449+ * is passed to the worker routine evms_user_delete_volume.
7450+ *
7451+ */
7452+static int
7453+evms_ioctl_cmd_delete_volume(struct inode *inode,
7454+ struct file *file, unsigned long arg)
7455+{
7456+ int rc = 0;
7457+ struct evms_delete_vol_pkt tmp, *user_parms;
7458+ struct evms_logical_volume *volume = NULL;
7459+
7460+ user_parms = (struct evms_delete_vol_pkt *) arg;
7461+ /* copy user's parameters to kernel space */
7462+ if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
7463+ rc = -EFAULT;
7464+
7465+ /* check to make sure associative minor is in use */
7466+ if (!rc) {
7467+ if (tmp.associative_minor) {
7468+ volume = &evms_logical_volumes[tmp.associative_minor];
7469+ if (volume->node == NULL)
7470+ rc = -ENXIO;
7471+ }
7472+ }
7473+ /* check to make sure target minor is in use */
7474+ if (!rc) {
7475+ volume = &evms_logical_volumes[tmp.minor];
7476+ if (volume->node == NULL)
7477+ rc = -ENXIO;
7478+ else
7479+ rc = evms_user_delete_volume(volume, inode, file, &tmp);
7480+ }
7481+ /* copy the status value back to the user */
7482+ tmp.status = rc;
7483+ if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7484+ rc = -EFAULT;
7485+
7486+ return (rc);
7487+}
7488+
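+/* A minimal user-space sketch of driving EVMS_DELETE_VOLUME; the
+ * control-node path, descriptor handling, and error handling are
+ * assumptions, not part of this driver:
+ *
+ *	struct evms_delete_vol_pkt dv = { 0 };
+ *	int fd = open("/dev/evms/.control", O_RDONLY);	// assumed path
+ *	dv.command = EVMS_HARD_DELETE;	// EVMS_SOFT_DELETE defers instead
+ *	dv.minor = target_minor;	// minor of the volume to delete
+ *	dv.associative_minor = 0;	// no associative volume
+ *	if (ioctl(fd, EVMS_DELETE_VOLUME, &dv) < 0 || dv.status)
+ *		;	// dv.status is -EBUSY if the volume is still open
+ */
+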
7489+/* function: evms_full_rediscover_prep
7490+ *
7491+ * this function helps to prevent problems when evms is
7492+ * configured with the base built in statically and some
7493+ * plugins built as modules.
7494+ *
7495+ * in these cases, when the initial discovery is done,
7496+ * only the statically built modules are available for
7497+ * volume construction. as a result, some volumes that
7498+ * require the plugins built as modules (which haven't
7499+ * been loaded yet) in order to be fully reconstructed
7500+ * may come up as compatibility volumes or partial volumes.
7501+ *
7502+ * when parts of evms are built as modules, a user-space
7503+ * rediscover utility is used to perform a secondary
7504+ * rediscover, after all the plugins built as modules
7505+ * have been loaded, to construct all the volumes
7506+ * requiring these plugins.
7507+ *
7508+ * however, since some of the volumes requiring the plugins
7509+ * built as modules may have already been exported as
7510+ * compatibility or partial volumes, we need to purge these
7511+ * volumes from the kernel's memory, so that they can be
7512+ * rediscovered and claimed by the appropriate plugins, and
7513+ * reconstructed into the correct volumes.
7514+ *
7515+ * this function purges all compatibility volumes that are
7516+ * not in use (mounted) and all partial volumes, prior to
7517+ * doing the secondary rediscover, thus allowing volumes to
7518+ * be rediscovered correctly.
7519+ *
7520+ * NOTE: again, this is only required in cases when a
7521+ * combination of plugins are built statically and as
7522+ * modules.
7523+ *
7524+ */
7525+static void
7526+evms_full_rediscover_prep(struct inode *inode, struct file *file)
7527+{
7528+ int rc = 0, i;
7529+
7530+ LOG_DETAILS("%s: started.\n", __FUNCTION__);
7531+ /* check for acceptable volumes to be deleted */
7532+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7533+ struct evms_logical_volume *volume = NULL;
7534+ struct evms_delete_vol_pkt dv;
7535+ int volume_open, doit;
7536+
7537+ volume = &evms_logical_volumes[i];
7538+ if (!volume->node)
7539+ continue;
7540+ volume_open = is_open(i);
7541+ /* only proceed on volumes that are:
7542+ * partial volumes
7543+ * OR
7544+ * unopened compatibility volumes
7545+ */
7546+ doit = FALSE;
7547+ if (volume->flags & EVMS_VOLUME_PARTIAL) {
7548+ /* do all partial volumes
7549+ */
7550+ doit = TRUE;
7551+ } else if (!(volume->flags & EVMS_VOLUME_FLAG)) {
7552+ /* check all compatibility volumes
7553+ */
7554+ if (!volume_open && !is_swap_partition(MKDEV(EVMS_MAJOR, i))) {
7555+ /* only do unopened volumes
7556+ */
7557+ doit = TRUE;
7558+ }
7559+ }
7560+ if (doit == FALSE) {
7561+ continue;
7562+ }
7563+ /* delete the volume from memory.
7564+ * do a 'soft' delete if volume
7565+ * is mounted, and 'hard' delete
7566+ * if it is not.
7567+ *
7568+ * NOTE: the delete operation will
7569+ * clear the bits in the flags field.
7570+ */
7571+ dv.command = (volume_open) ?
7572+ EVMS_SOFT_DELETE : EVMS_HARD_DELETE;
7573+ dv.minor = i;
7574+ dv.associative_minor = 0;
7575+ dv.status = 0;
7576+ rc = evms_user_delete_volume(volume, inode, file, &dv);
7577+ }
7578+ LOG_DETAILS("%s: completed.\n", __FUNCTION__);
7579+}
7580+
7581+static int
7582+evms_ioctl_cmd_rediscover_volumes(struct inode *inode,
7583+ struct file *file,
7584+ unsigned int cmd, unsigned long arg)
7585+{
7586+ int rc, i;
7587+ struct evms_rediscover_pkt tmp, *user_parms;
7588+ u64 *array_ptr = NULL;
7589+ ulong array_size = 0;
7590+ struct evms_logical_volume *volume = NULL;
7591+
7592+ rc = tmp.drive_count = 0;
7593+ user_parms = (struct evms_rediscover_pkt *) arg;
7594+ /* copy user's parameters to kernel space */
7595+ if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
7596+ rc = -EFAULT;
7597+
7598+	if (!rc && tmp.drive_count == REDISCOVER_ALL_DEVICES) {
7599+ evms_full_rediscover_prep(inode, file);
7600+ }
7601+ /* quiesce all queued volumes */
7602+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7603+ struct evms_quiesce_vol_pkt qv;
7604+
7605+ volume = &evms_logical_volumes[i];
7606+ if (!volume->node) {
7607+ continue;
7608+ }
7609+ if (!(volume->flags & EVMS_REQUESTED_QUIESCE)) {
7610+ continue;
7611+ }
7612+ qv.command = EVMS_QUIESCE;
7613+ qv.minor = i;
7614+		qv.do_vfs = (volume->flags & EVMS_REQUESTED_VFS_QUIESCE) ?
7615+		    EVMS_VFS_DO : EVMS_VFS_DO_NOTHING;
+		qv.status = 0;
7616+ rc = evms_quiesce_volume(volume, inode, file, &qv);
7617+ }
7618+ /* "soft" delete all queued volumes */
7619+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7620+ struct evms_delete_vol_pkt dv;
7621+
7622+ volume = &evms_logical_volumes[i];
7623+ if (!volume->node) {
7624+ continue;
7625+ }
7626+ if (!(volume->flags & EVMS_REQUESTED_DELETE)) {
7627+ continue;
7628+ }
7629+ dv.command = EVMS_SOFT_DELETE;
7630+ dv.minor = i;
7631+ dv.associative_minor = 0;
7632+ dv.status = 0;
7633+ rc = evms_delete_volume(volume, &dv);
7634+ }
7635+
7636+	if (!rc && tmp.drive_count && (tmp.drive_count != REDISCOVER_ALL_DEVICES)) {
7637+ if (!rc) {
7638+ /* create space for userspace drive array */
7639+ array_size =
7640+ sizeof (*tmp.drive_array) * tmp.drive_count;
7641+ array_ptr = tmp.drive_array;
7642+ tmp.drive_array = kmalloc(array_size, GFP_KERNEL);
7643+ if (!tmp.drive_array) {
7644+ rc = -ENOMEM;
7645+ }
7646+ }
7647+ if (!rc)
7648+ /* copy rediscover drive array to kernel space */
7649+ if (copy_from_user
7650+ (tmp.drive_array, array_ptr, array_size))
7651+ rc = -EFAULT;
7652+ }
7653+
7654+ if (!rc) {
7655+		/* forward declaration; the function is defined later in this file */
+		static int evms_discover_volumes(struct evms_rediscover_pkt *);
7656+ /* perform the rediscovery operation */
7657+ rc = evms_discover_volumes(&tmp);
7658+ }
7659+
7660+	/* clean up after the operation: free the kernel copy
7661+	 * of the drive array only if it was actually allocated */
7662+	if (array_size && tmp.drive_array)
+		kfree(tmp.drive_array);
7663+
7664+ /* set return code and copy info to userspace */
7665+ tmp.status = rc;
7666+ if (copy_to_user(&user_parms->status, &tmp.status, sizeof (tmp.status)))
7667+ rc = -EFAULT;
7668+
7669+ return (rc);
7670+}
7671+
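+/* A minimal sketch of the two ways user space can drive this ioctl;
+ * fd and the handle array are assumptions:
+ *
+ *	struct evms_rediscover_pkt rd = { 0 };
+ *	rd.drive_count = REDISCOVER_ALL_DEVICES;	// full rediscovery
+ *	ioctl(fd, EVMS_REDISCOVER_VOLUMES, &rd);	// rd.status holds the result
+ *
+ *	// or target specific disks by handle (from EVMS_GET_LOGICAL_DISK):
+ *	rd.drive_count = n;
+ *	rd.drive_array = handles;	// array of n u64 device handles
+ *	ioctl(fd, EVMS_REDISCOVER_VOLUMES, &rd);
+ */
+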
7672+static struct evms_list_node *user_disk_ptr;
7673+static int
7674+evms_ioctl_cmd_get_logical_disk(void *arg)
7675+{
7676+ int rc = 0;
7677+ struct evms_user_disk_pkt tmp, *user_parms;
7678+
7679+ user_parms = (struct evms_user_disk_pkt *) arg;
7680+ /* copy user's parameters to kernel space */
7681+ if (copy_from_user
7682+ (&tmp.command, &user_parms->command, sizeof (tmp.command)))
7683+ rc = -EFAULT;
7684+
7685+ if (!rc) {
7686+ if (tmp.command == EVMS_FIRST_DISK)
7687+ user_disk_ptr = evms_global_device_list;
7688+ else /* tmp.command == EVMS_NEXT_DISK */
7689+ user_disk_ptr = user_disk_ptr->next;
7690+
7691+ if (user_disk_ptr == NULL)
7692+ tmp.status = EVMS_DISK_INVALID;
7693+ else {
7694+ tmp.status = EVMS_DISK_VALID;
7695+ tmp.disk_handle =
7696+ NODE_TO_DEV_HANDLE(user_disk_ptr->item);
7697+ }
7698+ /* copy info to userspace */
7699+ if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7700+ rc = -EFAULT;
7701+ }
7702+ return (rc);
7703+}
7704+
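+/* A minimal sketch of the enumeration loop this ioctl supports; fd is
+ * an assumed descriptor on the EVMS control node:
+ *
+ *	struct evms_user_disk_pkt dp;
+ *	dp.command = EVMS_FIRST_DISK;
+ *	while (!ioctl(fd, EVMS_GET_LOGICAL_DISK, &dp) &&
+ *	       dp.status == EVMS_DISK_VALID) {
+ *		// dp.disk_handle identifies this disk in later calls
+ *		dp.command = EVMS_NEXT_DISK;
+ *	}
+ */
+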
7705+static int
7706+evms_ioctl_cmd_get_logical_disk_info(void *arg)
7707+{
7708+ int rc = 0;
7709+ struct evms_user_disk_info_pkt tmp, *user_parms;
7710+ struct evms_list_node *p;
7711+ struct evms_logical_node *disk_node = NULL;
7712+
7713+ user_parms = (struct evms_user_disk_info_pkt *) arg;
7714+ /* copy user's parameters to kernel space */
7715+ if (copy_from_user
7716+ (&tmp.disk_handle, &user_parms->disk_handle,
7717+ sizeof (tmp.disk_handle)))
7718+ rc = -EFAULT;
7719+
7720+ /* check handle for validity */
7721+ if (!rc) {
7722+ rc = -EINVAL;
7723+ disk_node = DEV_HANDLE_TO_NODE(tmp.disk_handle);
7724+ for (p = evms_global_device_list; p; p = p->next)
7725+ if (p->item == disk_node) {
7726+ rc = 0;
7727+ user_disk_ptr = p;
7728+ break;
7729+ }
7730+ }
7731+
7732+ /* populate kernel copy of user's structure with appropriate info */
7733+ if (!rc) {
7734+ struct hd_geometry geo;
7735+ struct evms_logical_node *node =
7736+ (struct evms_logical_node *) user_disk_ptr->item;
7737+ tmp.flags = node->flags;
7738+ strcpy(tmp.disk_name, EVMS_DEV_NODE_PATH);
7739+ strcat(tmp.disk_name, node->name);
7740+ rc = evms_cs_kernel_ioctl(node, EVMS_UPDATE_DEVICE_INFO,
7741+ (ulong) NULL);
7742+ if (!rc) {
7743+ tmp.total_sectors = node->total_vsectors;
7744+ tmp.hardsect_size = node->hardsector_size;
7745+ tmp.block_size = node->block_size;
7746+ rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO,
7747+ (unsigned long) &geo);
7748+ }
7749+ if (!rc) {
7750+ tmp.geo_sectors = geo.sectors;
7751+ tmp.geo_heads = geo.heads;
7752+ tmp.geo_cylinders = geo.cylinders;
7753+ }
7754+ }
7755+
7756+ /* set return code and copy info to userspace */
7757+ tmp.status = rc;
7758+ if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7759+ rc = -EFAULT;
7760+
7761+ return (rc);
7762+}
7763+
7764+static int
7765+evms_ioctl_cmd_sector_io(void *arg)
7766+{
7767+ int rc;
7768+#define MAX_IO_SIZE 128	/* vsectors; 128 * 512 bytes == 64KB */
7769+ u64 io_size, max_io_size = MAX_IO_SIZE;
7770+#undef MAX_IO_SIZE
7771+ struct evms_sector_io_pkt tmp, *user_parms;
7772+ struct evms_logical_node *disk_node = NULL;
7773+ struct evms_list_node *list_node;
7774+ unsigned char *io_buffer;
7775+
7776+ rc = 0;
7777+ list_node = NULL;
7778+ io_buffer = NULL;
7779+
7780+ user_parms = (struct evms_sector_io_pkt *) arg;
7781+ /* copy user's parameters to kernel space */
7782+ if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
7783+ rc = -EFAULT;
7784+
7785+ /* check handle for validity */
7786+ if (!rc) {
7787+ rc = -EINVAL;
7788+ disk_node = DEV_HANDLE_TO_NODE(tmp.disk_handle);
7789+ for (list_node = evms_global_device_list; list_node;
7790+ list_node = list_node->next)
7791+ if (list_node->item == disk_node) {
7792+ rc = 0;
7793+ break;
7794+ }
7795+ }
7796+ if (!rc) {
7797+ int done;
7798+		/* allocate an I/O buffer up to 64KB in size */
7799+ if (tmp.sector_count < max_io_size)
7800+ max_io_size = tmp.sector_count;
7801+ do {
7802+ done = TRUE;
7803+			/* allocate a buffer large enough to hold max_io_size sectors */
7804+ io_buffer =
7805+ kmalloc(max_io_size << EVMS_VSECTOR_SIZE_SHIFT,
7806+ GFP_KERNEL);
7807+ if (!io_buffer) {
7808+ max_io_size >>= 1;
7809+ if (!max_io_size) {
7810+ rc = -ENOMEM;
7811+ } else {
7812+ done = FALSE;
7813+ }
7814+ }
7815+ } while (!done);
7816+ }
7817+ /* perform io with specified disk */
7818+ if (!rc) {
7819+ u64 io_sector_offset, io_remaining;
7820+ u64 io_bytes;
7821+ u_char *user_buffer_ptr;
7822+
7823+ io_remaining = tmp.sector_count;
7824+ io_sector_offset = 0;
7825+ user_buffer_ptr = tmp.buffer_address;
7826+ while (io_remaining) {
7827+ /* compute the io_size for this pass */
7828+ io_size = (io_remaining >= max_io_size) ?
7829+ max_io_size : io_remaining;
7830+
7831+ io_bytes = io_size << EVMS_VSECTOR_SIZE_SHIFT;
7832+			/* for writes, copy this pass's data from user space */
7833+			if (tmp.io_flag == EVMS_SECTOR_IO_WRITE) {
7834+				/* copy io_bytes from user data buffer */
7835+ if (copy_from_user(io_buffer,
7836+ user_buffer_ptr, io_bytes))
7837+ rc = -EFAULT;
7838+ }
7839+ if (rc)
7840+ break;
7841+
7842+			/* perform the I/O, io_size sectors per pass */
7843+ rc = INIT_IO(disk_node,
7844+ tmp.io_flag,
7845+ io_sector_offset + tmp.starting_sector,
7846+ io_size, io_buffer);
7847+
7848+ if (rc)
7849+ break;
7850+
7851+ if (tmp.io_flag != EVMS_SECTOR_IO_WRITE) {
7852+				/* copy io_bytes to user data buffer */
7853+ if (copy_to_user(user_buffer_ptr,
7854+ io_buffer, io_bytes))
7855+ rc = -EFAULT;
7856+ }
7857+ if (rc)
7858+ break;
7859+
7860+ user_buffer_ptr += io_bytes;
7861+ tmp.buffer_address += io_bytes;
7862+ io_sector_offset += io_size;
7863+ io_remaining -= io_size;
7864+ }
7865+ }
7866+
7867+	/* if the io_buffer was allocated, free it */
7868+ if (io_buffer)
7869+ kfree(io_buffer);
7870+
7871+ /* copy the status value back to the user */
7872+ tmp.status = rc;
7873+ if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7874+ rc = -EFAULT;
7875+
7876+ return (rc);
7877+}
7878+
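+/* A minimal sketch of a raw sector read through this ioctl; fd, the
+ * disk handle, and buf are assumptions:
+ *
+ *	struct evms_sector_io_pkt io = { 0 };
+ *	io.disk_handle = handle;	// from EVMS_GET_LOGICAL_DISK
+ *	io.io_flag = 0;			// anything but EVMS_SECTOR_IO_WRITE reads
+ *	io.starting_sector = 0;
+ *	io.sector_count = 1;
+ *	io.buffer_address = buf;	// user buffer, sector_count vsectors big
+ *	ioctl(fd, EVMS_SECTOR_IO, &io);	// io.status holds the result
+ */
+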
7879+static int user_minor;
7880+static int
7881+evms_ioctl_cmd_get_minor(void *arg)
7882+{
7883+ int rc = 0;
7884+ struct evms_user_minor_pkt tmp, *user_parms;
7885+
7886+ user_parms = (struct evms_user_minor_pkt *) arg;
7887+ /* copy user's parameters to kernel space */
7888+ if (copy_from_user
7889+ (&tmp.command, &user_parms->command, sizeof (tmp.command)))
7890+ rc = -EFAULT;
7891+
7892+ if (!rc) {
7893+ if (tmp.command == EVMS_FIRST_VOLUME)
7894+ user_minor = 1;
7895+ else /* tmp.command == EVMS_NEXT_VOLUME */
7896+ user_minor++;
7897+
7898+ tmp.status = EVMS_VOLUME_INVALID;
7899+ for (; user_minor < MAX_EVMS_VOLUMES; user_minor++) {
7900+ struct evms_logical_volume *lv;
7901+
7902+ lv = &evms_logical_volumes[user_minor];
7903+ /* see if any corrupt volumes have been
7904+ * unmounted. If so, clean up the
7905+ * evms_logical_volumes array entry, and
7906+ * don't report the volume to the user.
7907+ */
7908+ if (lv->flags & EVMS_VOLUME_CORRUPT) {
7909+ if (!is_open(user_minor)) {
7910+ /* clear logical volume structure
7911+ * for this volume so it may be
7912+ * reused.
7913+ */
7914+ LOG_WARNING
7915+ ("ioctl_get_minor: found unmounted %s volume(%u,%u,%s).\n",
7916+ ((lv->
7917+ flags & EVMS_VOLUME_SOFT_DELETED)
7918+ ? "'soft deleted'" : ""),
7919+ EVMS_MAJOR, user_minor, lv->name);
7920+ LOG_WARNING
7921+ (" releasing minor(%d) used by volume(%s)!\n",
7922+ user_minor, lv->name);
7923+ kfree(lv->name);
7924+ lv->name = NULL;
7925+ lv->flags = 0;
7926+ }
7927+ }
7928+ if (lv->node || (lv->flags & EVMS_VOLUME_CORRUPT)) {
7929+ tmp.status = EVMS_VOLUME_VALID;
7930+ tmp.minor = user_minor;
7931+ break;
7932+ }
7933+ }
7934+
7935+ /* copy info to userspace */
7936+ if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7937+ rc = -EFAULT;
7938+ }
7939+ return (rc);
7940+}
7941+
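+/* A minimal sketch of walking the volume minors via this ioctl; fd is
+ * an assumed control-node descriptor:
+ *
+ *	struct evms_user_minor_pkt mp;
+ *	mp.command = EVMS_FIRST_VOLUME;
+ *	while (!ioctl(fd, EVMS_GET_MINOR, &mp) &&
+ *	       mp.status == EVMS_VOLUME_VALID) {
+ *		// mp.minor is a live volume (or a still-open corrupt one)
+ *		mp.command = EVMS_NEXT_VOLUME;
+ *	}
+ */
+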
7942+static int
7943+evms_ioctl_cmd_get_volume_data(void *arg)
7944+{
7945+ int rc = 0;
7946+ struct evms_volume_data_pkt tmp, *user_parms;
7947+ struct evms_logical_volume *volume = NULL;
7948+ struct evms_logical_node *node = NULL;
7949+
7950+ user_parms = (struct evms_volume_data_pkt *) arg;
7951+ /* copy user's parameters to kernel space */
7952+ if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
7953+ rc = -EFAULT;
7954+
7955+ if (!rc) {
7956+ volume = &evms_logical_volumes[tmp.minor];
7957+ node = volume->node;
7958+ if (node == NULL)
7959+ rc = -ENODEV;
7960+ }
7961+ if (!rc) {
7962+ tmp.flags = volume->flags;
7963+ strcpy(tmp.volume_name, EVMS_DEV_NODE_PATH);
7964+ strcat(tmp.volume_name, volume->name);
7965+ }
7966+
7967+ /* copy return code and info to userspace */
7968+ tmp.status = rc;
7969+ if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7970+ rc = -EFAULT;
7971+ return (rc);
7972+}
7973+
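+/* A minimal sketch of fetching a volume's name and flags once its
+ * minor is known; fd and some_minor are assumptions:
+ *
+ *	struct evms_volume_data_pkt vd = { 0 };
+ *	vd.minor = some_minor;		// e.g. found via EVMS_GET_MINOR
+ *	ioctl(fd, EVMS_GET_VOLUME_DATA, &vd);
+ *	// on success vd.volume_name holds EVMS_DEV_NODE_PATH plus the
+ *	// volume name, and vd.flags mirrors the kernel volume flags
+ */
+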
7974+static struct evms_registered_plugin *ioctl_reg_record;
7975+static int
7976+evms_ioctl_cmd_get_plugin(void *arg)
7977+{
7978+ int rc = 0;
7979+ struct evms_kernel_plugin_pkt tmp, *user_parms;
7980+
7981+ user_parms = (struct evms_kernel_plugin_pkt *) arg;
7982+ /* copy user's parameters to kernel space */
7983+ if (copy_from_user
7984+ (&tmp.command, &user_parms->command, sizeof (tmp.command)))
7985+ rc = -EFAULT;
7986+
7987+ if (!rc) {
7988+ /* if the command is not 0, then verify
7989+ * that ioctl_reg_record is pointing to
7990+ * current and valid plugin header.
7991+ */
7992+ if (tmp.command) { /* tmp.command == EVMS_NEXT_PLUGIN */
7993+ struct evms_registered_plugin *tmp_reg_record;
7994+ tmp_reg_record = registered_plugin_head;
7995+ /* search the current plugin list */
7996+ while (tmp_reg_record) {
7997+ if (tmp_reg_record == ioctl_reg_record)
7998+ break;
7999+ tmp_reg_record = tmp_reg_record->next;
8000+ }
8001+ /* if the ioctl_reg_record is not in the
8002+ * current list, then start at the beginning.
8003+ */
8004+ if (!tmp_reg_record)
8005+ tmp.command = EVMS_FIRST_PLUGIN;
8006+ }
8007+
8008+ if (tmp.command == EVMS_FIRST_PLUGIN)
8009+ /* start at beginning of plugin list */
8010+ ioctl_reg_record = registered_plugin_head;
8011+ else /* tmp.command == EVMS_NEXT_PLUGIN */
8012+ /* continue from current position in list */
8013+ ioctl_reg_record = ioctl_reg_record->next;
8014+
8015+ tmp.status = EVMS_PLUGIN_INVALID;
8016+ tmp.id = 0;
8017+ if (ioctl_reg_record) {
8018+ tmp.id = ioctl_reg_record->plugin->id;
8019+ tmp.version = ioctl_reg_record->plugin->version;
8020+ tmp.status = EVMS_PLUGIN_VALID;
8021+ }
8022+
8023+ /* copy info to userspace */
8024+ if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8025+ rc = -EFAULT;
8026+ }
8027+ return (rc);
8028+}
8029+
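+/* A minimal sketch of enumerating the registered kernel plugins; fd
+ * is an assumed control-node descriptor:
+ *
+ *	struct evms_kernel_plugin_pkt kp;
+ *	kp.command = EVMS_FIRST_PLUGIN;
+ *	while (!ioctl(fd, EVMS_GET_PLUGIN, &kp) &&
+ *	       kp.status == EVMS_PLUGIN_VALID) {
+ *		// kp.id and kp.version describe one registered plugin
+ *		kp.command = EVMS_NEXT_PLUGIN;
+ *	}
+ */
+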
8030+static int
8031+evms_ioctl_cmd_plugin_ioctl(struct inode *inode,
8032+ struct file *file,
8033+ unsigned int cmd, unsigned long arg)
8034+{
8035+ int rc = 0, found = FALSE;
8036+ struct evms_plugin_ioctl_pkt tmp, *user_parms;
8037+ struct evms_registered_plugin *p;
8038+
8039+ user_parms = (struct evms_plugin_ioctl_pkt *) arg;
8040+ /* copy user's parameters to kernel space */
8041+ if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8042+ rc = -EFAULT;
8043+
8044+ if (!rc) {
8045+ /* search for the specified plugin */
8046+ for (p = registered_plugin_head; p; p = p->next)
8047+ /* check for the specified feature id */
8048+ if (p->plugin->id == tmp.feature_id) {
8049+ found = TRUE;
8050+				/* check that the entry point is implemented */
8051+ if (p->plugin->fops->direct_ioctl)
8052+ rc = DIRECT_IOCTL(p, inode, file, cmd,
8053+ arg);
8054+ else
8055+ rc = -ENOSYS;
8056+ break;
8057+ }
8058+ /* was the specified plugin found? */
8059+ if (found == FALSE)
8060+ rc = -ENOPKG;
8061+
8062+ /* copy the status value back to the user */
8063+ tmp.status = rc;
8064+ if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8065+ rc = -EFAULT;
8066+ }
8067+ return (rc);
8068+}
8069+
8070+#define MAX_BUFFER_SIZE 65536
8071+static int
8072+evms_ioctl_cmd_kernel_partial_csum(void *arg)
8073+{
8074+ int rc = 0;
8075+ u64 compute_size = MAX_BUFFER_SIZE;
8076+ struct evms_compute_csum_pkt tmp, *user_parms;
8077+ unsigned char *buffer = NULL;
8078+
8079+ user_parms = (struct evms_compute_csum_pkt *) arg;
8080+ /* copy user's parameters to kernel space */
8081+ if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8082+ rc = -EFAULT;
8083+
8084+ if (!rc) {
8085+		/* allocate an I/O buffer up to 64KB in size */
8086+ if (tmp.buffer_size < MAX_BUFFER_SIZE)
8087+ compute_size = tmp.buffer_size;
8088+
8089+		/* allocate a buffer large enough to hold compute_size bytes */
8090+ buffer = kmalloc(compute_size, GFP_KERNEL);
8091+ if (!buffer) {
8092+ rc = -ENOMEM;
8093+ }
8094+ }
8095+	/* compute the checksum over the user buffer */
8096+ if (!rc) {
8097+ u64 remaining_bytes;
8098+ u_char *user_buffer_ptr;
8099+ unsigned int insum = tmp.insum;
8100+
8101+ remaining_bytes = tmp.buffer_size;
8102+ user_buffer_ptr = tmp.buffer_address;
8103+ while (remaining_bytes) {
8104+ /* compute the compute_size for this pass */
8105+ compute_size = (remaining_bytes >= MAX_BUFFER_SIZE) ?
8106+ MAX_BUFFER_SIZE : remaining_bytes;
8107+
8108+ /* copy into kernel from user data buffer */
8109+ if (copy_from_user(buffer, user_buffer_ptr,
8110+ compute_size))
8111+ rc = -EFAULT;
8112+ if (rc)
8113+ break;
8114+			/* compute the checksum for this pass; only
8115+			 * compute_size bytes are in the kernel buffer */
8116+			tmp.outsum = csum_partial(buffer, compute_size, insum);
8117+ /* set up for another possible pass */
8118+ insum = tmp.outsum;
8119+ /* update loop progress variables */
8120+ user_buffer_ptr += compute_size;
8121+ tmp.buffer_address += compute_size;
8122+ remaining_bytes -= compute_size;
8123+ }
8124+ }
8125+
8126+	/* if the buffer was allocated, free it */
8127+ if (buffer)
8128+ kfree(buffer);
8129+
8130+ /* copy the status value back to the user */
8131+ tmp.status = rc;
8132+ if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8133+ rc = -EFAULT;
8134+
8135+ return (rc);
8136+}
8137+
8138+#undef MAX_BUFFER_SIZE
8139+
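+/* A minimal sketch of checksumming a region in two pieces; chaining
+ * outsum back into insum is the point. fd, part1/part2 and len1/len2
+ * are assumptions:
+ *
+ *	struct evms_compute_csum_pkt cs = { 0 };
+ *	cs.buffer_address = part1;  cs.buffer_size = len1;  cs.insum = 0;
+ *	ioctl(fd, EVMS_COMPUTE_CSUM, &cs);
+ *	cs.buffer_address = part2;  cs.buffer_size = len2;
+ *	cs.insum = cs.outsum;		// chain the partial checksum
+ *	ioctl(fd, EVMS_COMPUTE_CSUM, &cs);
+ *	// cs.outsum now covers part1 followed by part2
+ */
+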
8140+static int
8141+evms_ioctl_cmd_get_bmap(struct inode *inode,
8142+ struct file *file, unsigned int cmd, unsigned long arg)
8143+{
8144+ int rc = 0;
8145+ struct evms_get_bmap_pkt tmp, *user_parms;
8146+
8147+ user_parms = (struct evms_get_bmap_pkt *) arg;
8148+ /* copy user's parameters to kernel space */
8149+ if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8150+ rc = -EFAULT;
8151+
8152+ /* pass the ioctl down the volume stack */
8153+ if (!rc) {
8154+ struct evms_logical_volume *volume;
8155+
8156+ volume = &evms_logical_volumes[MINOR(inode->i_rdev)];
8157+ rc = IOCTL(volume->node, inode, file, cmd,
8158+ (unsigned long) &tmp);
8159+ }
8160+ /* copy the status value back to the user */
8161+ tmp.status = rc;
8162+ if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8163+ rc = -EFAULT;
8164+
8165+ return (rc);
8166+}
8167+
8168+static int
8169+evms_ioctl_cmd_process_notify_event(unsigned long arg)
8170+{
8171+ int rc = 0, found = FALSE;
8172+ struct evms_notify_pkt tmp, *user_parms;
8173+ struct evms_list_node **list_node = NULL;
8174+ struct evms_event *event = NULL;
8175+
8176+ user_parms = (struct evms_notify_pkt *) arg;
8177+ /* copy user's parameters to kernel space */
8178+ if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8179+ rc = -EFAULT;
8180+
8181+ /* check to see if PID has already been registered
8182+ * for this event.
8183+ */
8184+ if (!rc) {
8185+ list_node = &evms_global_notify_list;
8186+ while (*list_node) {
8187+ event = (*list_node)->item;
8188+ if ((event->pid == tmp.eventry.pid) &&
8189+ (event->eventid == tmp.eventry.eventid)) {
8190+ found = TRUE;
8191+ break;
8192+ }
8193+ list_node = &(*list_node)->next;
8194+ }
8195+ }
8196+	if (!rc && tmp.command) {	/* tmp.command == EVMS_REGISTER_EVENT */
8197+ /* registration code */
8198+ if (found) {
8199+ rc = -EBUSY;
8200+ LOG_ERROR
8201+			    ("error(%d) pid(%d) already registered to receive signal(%d) on event(%d).\n",
8202+ rc, tmp.eventry.pid, tmp.eventry.signo,
8203+ tmp.eventry.eventid);
8204+ } else {
8205+ /* register this pid/event type */
8206+ event = kmalloc(sizeof (struct evms_event), GFP_KERNEL);
8207+ if (!event) {
8208+ rc = -ENOMEM;
8209+ LOG_ERROR
8210+ ("error(%d) allocating event structure.\n",
8211+ rc);
8212+ } else {
8213+ memset(event, 0, sizeof (struct evms_event));
8214+ event->pid = tmp.eventry.pid;
8215+ event->eventid = tmp.eventry.eventid;
8216+ event->signo = tmp.eventry.signo;
8217+ rc = evms_cs_add_item_to_list
8218+ (&evms_global_notify_list, event);
8219+ }
8220+ }
8221+	} else if (!rc) {	/* tmp.command == EVMS_UNREGISTER_EVENT */
8222+ /* unregistration code */
8223+ if (!found) {
8224+ rc = -ENODATA;
8225+ LOG_ERROR
8226+ ("error(%d) attempting to unregister a non-registered pid(%d) on event(%d).\n",
8227+ rc, tmp.eventry.pid, tmp.eventry.eventid);
8228+ } else {
8229+ event = (*list_node)->item;
8230+ rc = evms_cs_remove_item_from_list
8231+ (&evms_global_notify_list, event);
8232+ if (!rc) {
8233+ kfree(event);
8234+ }
8235+ }
8236+ }
8237+ /* copy the status value back to the user */
8238+ tmp.status = rc;
8239+ if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8240+ rc = -EFAULT;
8241+
8242+ return (rc);
8243+}
8244+
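+/* A minimal sketch of registering for an event signal; fd is assumed
+ * and the available event ids are defined in the EVMS headers:
+ *
+ *	struct evms_notify_pkt np = { 0 };
+ *	np.command = EVMS_REGISTER_EVENT;	// EVMS_UNREGISTER_EVENT undoes this
+ *	np.eventry.pid = getpid();		// who receives the signal
+ *	np.eventry.signo = SIGUSR1;		// which signal to deliver
+ *	np.eventry.eventid = event_id;		// which event to watch
+ *	ioctl(fd, EVMS_PROCESS_NOTIFY_EVENT, &np);	// np.status holds result
+ */
+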
8245+static int
8246+evms_ioctl_cmd_check_mount_status(struct inode *inode, struct file *file,
8247+ ulong arg)
8248+{
8249+ int rc = 0;
8250+ struct evms_mount_status_pkt tmp, *user_parms;
8251+
8252+ user_parms = (struct evms_mount_status_pkt *) arg;
8253+ /* copy user's parameters to kernel space */
8254+ if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8255+ rc = -EFAULT;
8256+
8257+ if (!rc) {
8258+ tmp.mounted =
8259+ (is_mounted(MKDEV(EVMS_MAJOR, tmp.minor))) ? TRUE : FALSE;
8260+ }
8261+
8262+ /* copy the status value back to the user */
8263+ tmp.status = rc;
8264+ if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8265+ rc = -EFAULT;
8266+
8267+ return (rc);
8268+}
8269+
8270+static int
8271+evms_ioctl_cmd_check_open_status(struct inode *inode, struct file *file,
8272+ ulong arg)
8273+{
8274+ int rc = 0;
8275+ struct evms_open_status_pkt tmp, *user_parms;
8276+
8277+ user_parms = (struct evms_open_status_pkt *) arg;
8278+ /* copy user's parameters to kernel space */
8279+ if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8280+ rc = -EFAULT;
8281+
8282+ if (!rc) {
8283+ tmp.opens = is_open(tmp.minor);
8284+ }
8285+
8286+ /* copy the status value back to the user */
8287+ tmp.status = rc;
8288+ if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8289+ rc = -EFAULT;
8290+
8291+ return (rc);
8292+}
8293+
8294+/************************************************/
8295+/* END -- IOCTL commands -- EVMS specific */
8296+/************************************************/
8297+
8298+/************************************************/
8299+/* START -- IOCTL commands -- Volume specific */
8300+/************************************************/
8301+
8302+/************************************************/
8303+/* END -- IOCTL commands -- Volume specific */
8304+/************************************************/
8305+
8306+/************************************************/
8307+/* START -- IOCTL main */
8308+/************************************************/
8309+
8310+/*
8311+ * Function: evms_ioctl
8312+ *
8313+ * This function is the main ioctl entry point for all of evms.
8314+ */
8315+
8316+static int
8317+evms_ioctl(struct inode *inode,
8318+ struct file *file, unsigned int cmd, unsigned long arg)
8319+{
8320+ unsigned long minor = 0;
8321+ int rc = 0;
8322+ struct evms_logical_node *node = NULL;
8323+
8324+ /* check user access */
8325+ if (!capable(CAP_SYS_ADMIN))
8326+ rc = -EACCES;
8327+
8328+ if (!inode)
8329+ rc = -EINVAL;
8330+
8331+ if (!rc) {
8332+ /* get the minor */
8333+ minor = MINOR(inode->i_rdev);
8334+ LOG_EXTRA
8335+ ("ioctl: minor(%lu), dir(%d), size(%d), type(%d), nr(%d)\n",
8336+ minor, (cmd >> _IOC_DIRSHIFT) & _IOC_DIRMASK,
8337+ (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
8338+ (cmd >> _IOC_TYPESHIFT) & _IOC_TYPEMASK,
8339+ (cmd >> _IOC_NRSHIFT) & _IOC_NRMASK);
8340+
8341+		/* ensure this minor points to a valid volume */
8342+ if (minor) {
8343+ node = evms_logical_volumes[minor].node;
8344+ if (node == NULL)
8345+ rc = -ENXIO;
8346+ }
8347+ }
8348+
8349+ /* process the IOCTL commands */
8350+ if (!rc) {
8351+ if (!minor) {
8352+ /* process all EVMS specific commands */
8353+ switch (cmd) {
8354+ case EVMS_GET_IOCTL_VERSION:
8355+ rc = evms_ioctl_cmd_get_ioctl_version((void *)
8356+ arg);
8357+ break;
8358+ case EVMS_GET_VERSION:
8359+ rc = evms_ioctl_cmd_get_version((void *) arg);
8360+ break;
8361+ case EVMS_GET_INFO_LEVEL:
8362+ rc = evms_ioctl_cmd_get_info_level((void *)
8363+ arg);
8364+ break;
8365+ case EVMS_SET_INFO_LEVEL:
8366+ rc = evms_ioctl_cmd_set_info_level((void *)
8367+ arg);
8368+ break;
8369+ case EVMS_REDISCOVER_VOLUMES:
8370+ rc = evms_ioctl_cmd_rediscover_volumes(inode,
8371+ file,
8372+ cmd,
8373+ arg);
8374+ break;
8375+ case EVMS_GET_LOGICAL_DISK:
8376+ rc = evms_ioctl_cmd_get_logical_disk((void *)
8377+ arg);
8378+ break;
8379+ case EVMS_GET_LOGICAL_DISK_INFO:
8380+ rc = evms_ioctl_cmd_get_logical_disk_info((void
8381+ *)
8382+ arg);
8383+ break;
8384+ case EVMS_SECTOR_IO:
8385+ rc = evms_ioctl_cmd_sector_io((void *) arg);
8386+ break;
8387+ case EVMS_GET_MINOR:
8388+ rc = evms_ioctl_cmd_get_minor((void *) arg);
8389+ break;
8390+ case EVMS_GET_VOLUME_DATA:
8391+ rc = evms_ioctl_cmd_get_volume_data((void *)
8392+ arg);
8393+ break;
8394+ case EVMS_DELETE_VOLUME:
8395+ rc = evms_ioctl_cmd_delete_volume(inode, file,
8396+ arg);
8397+ break;
8398+ case EVMS_GET_PLUGIN:
8399+ rc = evms_ioctl_cmd_get_plugin((void *) arg);
8400+ break;
8401+ case EVMS_PLUGIN_IOCTL:
8402+ rc = evms_ioctl_cmd_plugin_ioctl(inode, file,
8403+ cmd, arg);
8404+ break;
8405+ case EVMS_COMPUTE_CSUM:
8406+ rc = evms_ioctl_cmd_kernel_partial_csum((void *)
8407+ arg);
8408+ break;
8409+ case EVMS_PROCESS_NOTIFY_EVENT:
8410+ rc = evms_ioctl_cmd_process_notify_event(arg);
8411+ break;
8412+ case EVMS_CHECK_MOUNT_STATUS:
8413+ rc = evms_ioctl_cmd_check_mount_status(inode,
8414+ file,
8415+ arg);
8416+ break;
8417+ case EVMS_CHECK_OPEN_STATUS:
8418+ rc = evms_ioctl_cmd_check_open_status(inode,
8419+ file,
8420+ arg);
8421+ break;
8422+ default:
8423+ rc = -EINVAL;
8424+ break;
8425+ }
8426+ } else {
8427+ /* process Volume specific commands */
8428+ switch (cmd) {
8429+ /* pick up standard blk ioctls */
8430+ case BLKFLSBUF:
8431+ case BLKROSET:
8432+ case BLKROGET:
8433+ case BLKRASET:
8434+ case BLKRAGET:
8435+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
8436+ case BLKBSZGET:
8437+ case BLKBSZSET:
8438+#endif
8439+ case BLKSSZGET:
8440+ rc = blk_ioctl(inode->i_rdev, cmd, arg);
8441+ break;
8442+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
8443+ case BLKGETSIZE:
8444+ {
8445+ /* casting size down to 32-bits until
8446+ * kernel allows return of 64-bit size
8447+ * values.
8448+ */
8449+ long size = node->total_vsectors;
8450+ if (copy_to_user
8451+ ((long *) arg, &size,
8452+ sizeof (long)))
8453+ rc = -EFAULT;
8454+ }
8455+ break;
8456+ case BLKGETSIZE64:
8457+ {
8458+ u64 size_in_bytes =
8459+ node->
8460+ total_vsectors <<
8461+ EVMS_VSECTOR_SIZE_SHIFT;
8462+ if (copy_to_user
8463+ ((u64 *) arg, &size_in_bytes,
8464+ sizeof (u64)))
8465+ rc = -EFAULT;
8466+ }
8467+ break;
8468+#endif
8469+ case EVMS_GET_IOCTL_VERSION:
8470+ rc = evms_ioctl_cmd_get_ioctl_version((void *)
8471+ arg);
8472+ break;
8473+ case EVMS_GET_BMAP:
8474+ rc = evms_ioctl_cmd_get_bmap(inode, file, cmd,
8475+ arg);
8476+ break;
8477+ case EVMS_GET_VOL_STRIPE_INFO:
8478+ {
8479+ struct evms_vol_stripe_info_pkt info;
8480+
8481+ info.size =
8482+ PAGE_SIZE >>
8483+ EVMS_VSECTOR_SIZE_SHIFT;
8484+ info.width = 1;
8485+ if (copy_to_user
8486+ ((struct evms_vol_stripe_info_pkt *)
8487+ arg, &info, sizeof (info)))
8488+ rc = -EFAULT;
8489+ }
8490+ break;
8491+
8492+ default:
8493+ rc = IOCTL(node, inode, file, cmd, arg);
8494+ break;
8495+ }
8496+ }
8497+ }
8498+ return rc;
8499+}
8500+
8501+/************************************************/
8502+/* END -- IOCTL main */
8503+/************************************************/
8504+
8505+/************************************************/
8506+/* START -- CHECK MEDIA CHANGE */
8507+/************************************************/
8508+
8509+static int
8510+evms_check_media_change(kdev_t dev)
8511+{
8512+ int rc = 0;
8513+ struct evms_logical_volume *volume = NULL;
8514+
8515+ /* check user access */
8516+ if (!capable(CAP_SYS_ADMIN))
8517+ rc = -EACCES;
8518+ if (!rc) {
8519+ int minor;
8520+ /* get the minor */
8521+ minor = MINOR(dev);
8522+		/* ensure this minor points to a valid volume */
8523+ volume = &evms_logical_volumes[minor];
8524+ if (volume->node == NULL) {
8525+ rc = -ENXIO;
8526+ }
8527+ }
8528+ if (!rc) {
8529+ if (volume->flags & EVMS_DEVICE_REMOVABLE) {
8530+ /* check for media change */
8531+ rc = evms_cs_kernel_ioctl(volume->node,
8532+ EVMS_CHECK_MEDIA_CHANGE,
8533+ (unsigned long) NULL);
8534+ if (rc < 0) {
8535+ LOG_ERROR
8536+ ("error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n",
8537+ rc, volume->name);
8538+ }
8539+ }
8540+ }
8541+ return (rc);
8542+}
8543+
8544+/************************************************/
8545+/* END -- CHECK MEDIA CHANGE */
8546+/************************************************/
8547+
8548+static int
8549+evms_check_for_device_changes(struct inode *inode, struct file *file)
8550+{
8551+ int rc = 0, something_changed = 0, i;
8552+ struct evms_rediscover_pkt kernel_rd_pckt = { 0, 0, NULL };
8553+ struct evms_list_node *disk_list = NULL, *lnode, *next_lnode;
8554+ struct evms_logical_node *disk, *new_device_list = NULL;
8555+ struct evms_logical_volume *volume = NULL;
8556+
8557+ /* check for new devices
8558+ *
8559+ * put all new devices on the disk list so they
8560+ * will be included in the rediscovery process.
8561+ */
+	/* forward declaration; the function is defined later in this file */
8562+	static void evms_discover_logical_disks(struct evms_logical_node **);
8563+ evms_discover_logical_disks(&new_device_list);
8564+ if (new_device_list) {
8565+ LOG_DETAILS("%s: new devices detected.\n", __FUNCTION__);
8566+ something_changed++;
8567+ /* put these new nodes on the disk list */
8568+ while (new_device_list) {
8569+ disk = new_device_list;
8570+ rc = evms_cs_remove_logical_node_from_list
8571+ (&new_device_list, disk);
8572+ if (rc) {
8573+ LOG_ERROR
8574+ ("%s: error(%d) removing device(%s) from list.\n",
8575+ __FUNCTION__, rc, disk->name);
8576+ }
8577+ rc = evms_cs_add_item_to_list(&disk_list, disk);
8578+ if (rc) {
8579+ LOG_ERROR
8580+				    ("%s: error(%d) adding device(%s) to list.\n",
8581+ __FUNCTION__, rc, disk->name);
8582+ }
8583+ }
8584+ }
8585+
8586+ /* check all devices for changed removable media
8587+ *
8588+ * scan the global device list and issue check
8589+ * media change on each removable media device.
8590+ * put all removable devices that indicate a
8591+ * media change on the disk list.
8592+ *
8593+ * also scan for devices that have been unplugged
8594+ * or contain corrupt volumes.
8595+ */
8596+ for (lnode = evms_global_device_list; lnode; lnode = lnode->next) {
8597+ int add_to_list = FALSE;
8598+ disk = (struct evms_logical_node *) lnode->item;
8599+ /* only really check removable media devices */
8600+ if (disk->flags & EVMS_DEVICE_REMOVABLE) {
8601+ /* check for media change */
8602+ rc = evms_cs_kernel_ioctl(disk,
8603+ EVMS_CHECK_MEDIA_CHANGE,
8604+ (unsigned long) NULL);
8605+ if (rc < 0) {
8606+ LOG_ERROR
8607+ ("%s: error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n",
8608+ __FUNCTION__, rc, disk->name);
8609+ } else if (rc == 1) {
8610+ add_to_list = TRUE;
8611+ }
8612+ }
8613+		/* check for devices that were present
8614+		 * before but are now gone (unplugged
8615+		 * device or unloaded driver).
8616+		 */
8617+		rc = IOCTL(disk, inode, file,
8618+			   EVMS_CHECK_DEVICE_STATUS, (ulong) NULL);
8619+		if (rc) {
8620+			LOG_ERROR
8621+			    ("error(%d) doing EVMS_CHECK_DEVICE_STATUS ioctl on '%s'.\n",
8622+			     rc, disk->name);
8623+ }
8624+ if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
8625+ add_to_list = TRUE;
8626+ }
8627+ if (add_to_list) {
8628+ something_changed++;
8629+ rc = evms_cs_add_item_to_list(&disk_list, disk);
8630+ }
8631+ }
8632+ /* log a statement that we detected changed media.
8633+ */
8634+ if (disk_list) {
8635+ LOG_DETAILS("%s: media change detected.\n", __FUNCTION__);
8636+ }
8637+
8638+ /* check for volumes with removed removable media.
8639+ * mark the volumes that reside on changed media.
8640+ */
8641+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8642+ volume = &evms_logical_volumes[i];
8643+ if (!volume->node)
8644+ continue;
8645+ if (!(volume->flags & EVMS_DEVICE_REMOVABLE))
8646+ continue;
8647+ if (evms_check_media_change(MKDEV(EVMS_MAJOR, i)) <= 0)
8648+ continue;
8649+ /* remember which volumes have changed media */
8650+ volume->flags |= EVMS_MEDIA_CHANGED;
8651+ something_changed++;
8652+ }
8653+
8654+ /* check for removed devices */
8655+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8656+ int status;
8657+ volume = &evms_logical_volumes[i];
8658+ if (!volume->node)
8659+ continue;
8660+ /* check for device status */
8661+ status = 0;
8662+ rc = IOCTL(volume->node, inode, file,
8663+ EVMS_CHECK_DEVICE_STATUS, (ulong) & status);
8664+ if (rc) {
8665+ LOG_ERROR
8666+ ("error(%d) doing EVMS_CHECK_DEVICE_STATUS ioctl on '%s'.\n",
8667+ rc, volume->name);
8668+ continue;
8669+ }
8670+ if (!(status & EVMS_DEVICE_UNAVAILABLE)) {
8671+ continue;
8672+ }
8673+		/* remember which volumes have unplugged devices */
8674+ volume->flags |= EVMS_DEVICE_UNPLUGGED;
8675+ something_changed++;
8676+ }
8677+
8678+ /* do we have some work to do? */
8679+ if (something_changed) {
8680+ /* check for volumes to be deleted */
8681+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8682+ struct evms_quiesce_vol_pkt qv;
8683+
8684+ volume = &evms_logical_volumes[i];
8685+ if (!volume->node)
8686+ continue;
8687+ /* only proceed on volumes with:
8688+ * changed media,
8689+ * hot-unplugged devices,
8690+ * & partial volumes
8691+ */
8692+ if (!(volume->flags &
8693+ (EVMS_MEDIA_CHANGED |
8694+ EVMS_VOLUME_PARTIAL | EVMS_DEVICE_UNPLUGGED)))
8695+ continue;
8696+			/* gather the disks needing to be
8697+ * rediscovered to rebuild this
8698+ * volume.
8699+ *
8700+ * this will locate other disks that
8701+ * the volume resides on that don't
8702+ * indicate media change.
8703+ */
8704+ rc = evms_cs_kernel_ioctl(volume->node,
8705+ EVMS_GET_DISK_LIST,
8706+ (unsigned long) &disk_list);
8707+ if (rc) {
8708+ LOG_ERROR
8709+ ("%s: error(%d) retrieving underlying disk list for '%s', skipping ...\n",
8710+ __FUNCTION__, rc, volume->name);
8711+ continue;
8712+ }
8713+ /* quiesce all the changed volumes
8714+ * prior to being deleted.
8715+ */
8716+			qv.command = EVMS_QUIESCE;
8717+			qv.minor = i;
8718+			qv.status = 0;	// reset status
8719+			qv.do_vfs = EVMS_VFS_DO_NOTHING;
8720+ rc = evms_quiesce_volume(volume, inode, file, &qv);
8721+ if (rc) {
8722+ LOG_ERROR
8723+ ("%s: error(%d) attempting to quiesce '%s%s'.\n",
8724+ __FUNCTION__, rc, EVMS_DEV_NODE_PATH,
8725+ volume->name);
8726+ }
8727+ }
8728+
8729+ /* we need to revalidate all the changed
8730+ * media. this is accomplished by issuing
8731+ * the revalidate disk ioctl to each device
8732+ * with changed media. the device manager
8733+ * remembers which devices indicated
8734+ * media changed (set by check media
8735+ * changed ioctl issued earlier), and will
8736+ * only issue the revalidate disk ioctl to
8737+ * those disks one time.
8738+ *
8739+ * NOTE:
8740+ * this needs to be done BEFORE deleting
8741+ * the volumes because deleting the
8742+ * last segment on disk will cause the
8743+		 * associated disk node to be freed, and we
8744+ * will not be able to issue the
8745+ * revalidate disk ioctl after that.
8746+ */
8747+ for (lnode = disk_list; lnode; lnode = lnode->next) {
8748+ disk = (struct evms_logical_node *) lnode->item;
8749+ /* only really do removable media devices */
8750+ if (disk->flags & EVMS_MEDIA_CHANGED) {
8751+				/* go revalidate the changed media */
8752+ rc = evms_cs_kernel_ioctl(disk,
8753+ EVMS_REVALIDATE_DISK,
8754+ (unsigned long) NULL);
8755+ if (rc) {
8756+ LOG_ERROR
8757+ ("%s: error(%d) attempting to revalidate '%s%s'.\n",
8758+ __FUNCTION__, rc,
8759+					     EVMS_DEV_NODE_PATH, disk->name);
8760+ }
8761+ }
8762+ }
8763+
8764+ /* delete all the affected volumes */
8765+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8766+ struct evms_delete_vol_pkt dv;
8767+
8768+ volume = &evms_logical_volumes[i];
8769+ if (!volume->node)
8770+ continue;
8771+ /* only proceed on volumes with:
8772+ * changed media,
8773+ * hot-unplugged devices,
8774+ * & partial volumes
8775+ */
8776+ if (!(volume->flags &
8777+ (EVMS_MEDIA_CHANGED |
8778+ EVMS_VOLUME_PARTIAL | EVMS_DEVICE_UNPLUGGED)))
8779+ continue;
8780+ /* only delete quiesced volumes */
8781+ if (!volume->quiesced)
8782+ continue;
8783+ /* delete the volume from memory.
8784+ * do a 'soft' delete if volume
8785+ * is mounted, and 'hard' delete
8786+ * if it is not.
8787+ *
8788+ * NOTE: the delete operation will
8789+ * clear the bits in the flags field.
8790+ */
8791+			dv.command = is_open(i) ?
+			    EVMS_SOFT_DELETE : EVMS_HARD_DELETE;
8792+ dv.minor = i;
8793+ dv.status = 0;
8794+ rc = evms_delete_volume(volume, &dv);
8795+ }
8796+
8797+ /* at this point all devices indicating
8798+ * media change that had volumes on them
8799+ * should be gone. however, we could still
8800+ * have devices indicating media change
8801+ * that had no volumes on them in the disk
8802+ * list. we need to delete these devices
8803+ * from kernel memory and the global device
8804+ * list.
8805+ */
8806+ for (lnode = evms_global_device_list; lnode; lnode = next_lnode) {
8807+ next_lnode = lnode->next;
8808+
8809+ disk = (struct evms_logical_node *) lnode->item;
8810+ if (disk->flags & EVMS_MEDIA_CHANGED) {
8811+ rc = DELETE(disk);
8812+ }
8813+ }
8814+
8815+ /* all the devices that indicated media
8816+ * change should be gone, both from kernel
8817+ * memory and global device list. we now
8818+ * need to remove any references to these
8819+ * devices from the disk list.
8820+ *
8821+ * when removable media is installed, it
8822+ * will get detected in the device manager's
8823+ * rediscovery as a new device and added to
8824+ * the discover list.
8825+ */
8826+ for (lnode = disk_list; lnode; lnode = next_lnode) {
8827+ struct evms_list_node *glnode;
8828+ int lnode_still_there;
8829+
8830+ next_lnode = lnode->next;
8831+
8832+ lnode_still_there = FALSE;
8833+ for (glnode = evms_global_device_list;
8834+ glnode; glnode = glnode->next) {
8835+ if (glnode->item == lnode->item) {
8836+ lnode_still_there = TRUE;
8837+ break;
8838+ }
8839+ }
8840+ if (lnode_still_there == FALSE) {
8841+ rc = evms_cs_remove_item_from_list(&disk_list,
8842+ lnode->item);
8843+ if (rc) {
8844+ LOG_ERROR
8845+ ("%s: error(%d) attempting to remove item(%p) from disk_list(%p).\n",
8846+ __FUNCTION__, rc, lnode->item,
8847+ &disk_list);
8848+ }
8849+ }
8850+ }
8851+
8852+ /* build the in-kernel rediscover packet */
8853+
8854+ /* allocate the space for the drive_array in
8855+ * the struct evms_rediscover_pkt packet. to do this
8856+ * we need to count the number of disk nodes,
8857+ * then allocate the necessary space.
8858+ */
8859+ /* count the disk nodes */
8860+ for (lnode = disk_list; lnode; lnode = lnode->next)
8861+ kernel_rd_pckt.drive_count++;
8862+ /* allocate the space */
8863+ if (kernel_rd_pckt.drive_count) {
8864+ kernel_rd_pckt.drive_array =
8865+ kmalloc(kernel_rd_pckt.drive_count *
8866+ sizeof (u64), GFP_KERNEL);
8867+ if (!kernel_rd_pckt.drive_array) {
8868+ rc = -ENOMEM;
8869+ LOG_ERROR
8870+ ("%s: error(%d) allocating rediscover drive array.\n",
8871+ __FUNCTION__, rc);
8872+ }
8873+ }
8874+ /* populate the drive array
8875+ *
8876+ * this also frees the disk_list which is useful
8877+ * if we had an error allocating the drive array.
8878+ */
8879+		for (i = 0, lnode = disk_list; lnode; lnode = next_lnode, i++) {
8880+			next_lnode = lnode->next;
8881+
8882+			/* remove this disk from the disk list;
8883+			 * on failure just skip the node */
8884+			disk = (struct evms_logical_node *) lnode->item;
8885+			if (evms_cs_remove_item_from_list(&disk_list, disk))
8886+				continue;
8887+			/* add this disk to the rediscover packet,
8888+			 * unless the drive array allocation failed
8889+			 * above (rc still holds -ENOMEM in that case).
8890+			 */
8891+			if (!rc) {
8892+				kernel_rd_pckt.drive_array[i] =
8893+				    NODE_TO_DEV_HANDLE(disk);
8894+			}
8895+		}
8893+ /* perform the rediscovery operation */
8894+ if (!rc) {
+			/* forward declaration; defined later in this file */
8895+			static int evms_discover_volumes(struct
8896+							 evms_rediscover_pkt *);
8897+ rc = evms_discover_volumes(&kernel_rd_pckt);
8898+ if (kernel_rd_pckt.drive_count) {
8899+ kfree(kernel_rd_pckt.drive_array);
8900+ }
8901+ }
8902+ LOG_DETAILS("%s: rediscover completed.\n", __FUNCTION__);
8903+ }
8904+
8905+ return (rc);
8906+}
8907+
8908+/************************************************/
8909+/* START -- REVALIDATE DISK */
8910+/************************************************/
8911+
8912+static int
8913+evms_revalidate_disk(kdev_t dev)
8914+{
8915+ int rc = 0;
8916+ struct evms_logical_volume *volume = NULL;
8917+
8918+ /* check user access */
8919+ if (!capable(CAP_SYS_ADMIN))
8920+ rc = -EACCES;
8921+ if (!rc) {
8922+ int minor;
8923+ /* get the minor */
8924+ minor = MINOR(dev);
8925+		/* ensure this minor points to a valid volume */
8926+ volume = &evms_logical_volumes[minor];
8927+ if (volume->node == NULL) {
8928+ rc = -ENXIO;
8929+ }
8930+ }
8931+ if (!rc) {
8932+		/* go revalidate the changed media */
8933+ rc = evms_cs_kernel_ioctl(volume->node,
8934+ EVMS_REVALIDATE_DISK,
8935+ (unsigned long) NULL);
8936+ }
8937+ return (rc);
8938+}
8939+
8940+/************************************************/
8941+/* END -- REVALIDATE DISK */
8942+/************************************************/
8943+
8944+/************************************************/
8945+/* START -- OPEN */
8946+/************************************************/
8947+
8948+static int
8949+evms_open(struct inode *inode, struct file *file)
8950+{
8951+ int rc = 0, minor = 0;
8952+ struct evms_logical_volume *volume = NULL;
8953+
8954+ /* check user access */
8955+ if (!capable(CAP_SYS_ADMIN))
8956+ rc = -EACCES;
8957+ if (!rc) {
8958+ if (!inode)
8959+ rc = -EINVAL;
8960+ }
8961+	if (!rc)
+		rc = evms_check_for_device_changes(inode, file);
8962+ if (!rc) {
8963+ /* get the minor */
8964+ minor = MINOR(inode->i_rdev);
8965+ if (minor) {
8966+			/* ensure this minor points to a valid volume */
8967+ volume = &evms_logical_volumes[minor];
8968+ if (volume->node == NULL) {
8969+ rc = -ENXIO;
8970+ }
8971+ }
8972+ }
8973+ /* go "open" the volume */
8974+ if (!rc && minor) {
8975+ atomic_inc(&volume->opens);
8976+ rc = IOCTL(volume->node, inode, file,
8977+ EVMS_OPEN_VOLUME, (unsigned long) NULL);
8978+ if (rc) {
8979+ LOG_ERROR
8980+ ("error(%d) doing EVMS_OPEN_VOLUME ioctl to '%s'.\n",
8981+ rc, volume->name);
8982+ atomic_dec(&volume->opens);
8983+ }
8984+ }
8985+ return (rc);
8986+}
8987+
8988+/************************************************/
8989+/* END -- OPEN */
8990+/************************************************/
8991+
8992+/************************************************/
8993+/* START -- RELEASE */
8994+/************************************************/
8995+
8996+static int
8997+evms_release(struct inode *inode, struct file *file)
8998+{
8999+ int rc = 0, minor = 0;
9000+ struct evms_logical_volume *volume = NULL;
9001+
9002+ if (!inode)
9003+ rc = -EINVAL;
9004+ if (!rc) {
9005+ /* get the minor */
9006+ minor = MINOR(inode->i_rdev);
9007+ if (minor) {
9008+			/* ensure this minor points to a valid volume */
9009+ volume = &evms_logical_volumes[minor];
9010+ if (volume->node == NULL) {
9011+ rc = -ENXIO;
9012+ }
9013+ }
9014+ }
9015+ /* go "close" the volume */
9016+ if (!rc && minor) {
9017+ rc = IOCTL(volume->node, inode, file,
9018+ EVMS_CLOSE_VOLUME, (unsigned long) NULL);
9019+ if (rc) {
9020+ LOG_ERROR
9021+ ("error(%d) doing EVMS_CLOSE_VOLUME ioctl to '%s'.\n",
9022+ rc, volume->name);
9023+ } else {
9024+ atomic_dec(&volume->opens);
9025+ }
9026+ }
9027+ return (rc);
9028+}
9029+
9030+/************************************************/
9031+/* END -- RELEASE */
9032+/************************************************/
9033+
9034+static struct block_device_operations evms_fops = {
9035+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,14)
9036+ owner:THIS_MODULE,
9037+#endif
9038+ open:evms_open,
9039+ release:evms_release,
9040+ ioctl:evms_ioctl,
9041+ check_media_change:evms_check_media_change,
9042+ revalidate:evms_revalidate_disk
9043+};
9044+
9045+/**********************************************************/
9046+/* END -- FOPS functions definitions */
9047+/**********************************************************/
9048+
9049+/**********************************************************/
9050+/* START -- RUNTIME support functions */
9051+/**********************************************************/
9052+
9053+static void
9054+evms_do_request_fn(request_queue_t * q)
9055+{
9056+ LOG_WARNING("This function should not be called.\n");
9057+}
9058+
9059+#ifdef CONFIG_SMP
9060+static request_queue_t *
9061+evms_find_queue(kdev_t dev)
9062+{
9063+ request_queue_t *rq = NULL;
9064+ struct evms_logical_volume *volume;
9065+
9066+ volume = &evms_logical_volumes[MINOR(dev)];
9067+ if (volume->node)
9068+ rq = &volume->request_queue;
9069+ return (rq);
9070+}
9071+#endif
9072+
9073+/*
9074+ * Function: evms_make_request_fn
9075+ *
9076+ */
9077+static int
9078+evms_make_request_fn(request_queue_t * q, int rw, struct buffer_head *bh)
9079+{
9080+ struct evms_logical_volume *volume;
9081+
9082+ volume = &evms_logical_volumes[MINOR(bh->b_rdev)];
9083+ wait_event(volume->wait_queue, (!volume->quiesced));
9084+ if (volume->node) {
9085+ switch (rw) {
9086+ case READ:
9087+ case READA:
9088+ atomic_inc(&volume->requests_in_progress);
9089+ R_IO(volume->node, bh);
9090+ atomic_dec(&volume->requests_in_progress);
9091+ return 0;
9092+ case WRITE:
9093+ atomic_inc(&volume->requests_in_progress);
9094+ W_IO(volume->node, bh);
9095+ atomic_dec(&volume->requests_in_progress);
9096+ return 0;
9097+ default:
9098+ buffer_IO_error(bh);
9099+ return 0;
9100+ }
9101+ } else {
9102+ LOG_ERROR("request for unknown logical volume [minor(%d)].\n",
9103+ MINOR(bh->b_rdev));
9104+ buffer_IO_error(bh);
9105+ }
9106+ return 0;
9107+}
9108+
9109+/**********************************************************/
9110+/* END -- RUNTIME support functions */
9111+/**********************************************************/
9112+
9113+/**********************************************************/
9114+/* START -- INIT/DISCOVERY support functions */
9115+/**********************************************************/
9116+
9117+#ifdef LOCAL_DEBUG
9118+static void
9119+display_discover_list(struct evms_logical_node *discover_list, char *text)
9120+{
9121+ struct evms_logical_node *node;
9122+
9123+ LOG_DETAILS("discover list:(%s)\n", text);
9124+ for (node = discover_list; node; node = node->next) {
9125+ LOG_DETAILS("\nnode info:\n");
9126+ LOG_DETAILS("node.....................(0x%p)\n", node);
9127+ LOG_DETAILS("name.....................(%s)\n", node->name);
9128+ LOG_DETAILS("plugin id................(0x%x)\n",
9129+ node->plugin->id);
9130+ LOG_DETAILS("size.....................("PFU64")\n",
9131+ node->total_vsectors);
9132+ LOG_DETAILS("flags....................(0x%x)\n", node->flags);
9133+ LOG_DETAILS("iflags...................(0x%x)\n", node->iflags);
9134+ LOG_DETAILS("sector size..............(%d)\n",
9135+ node->hardsector_size);
9136+ LOG_DETAILS("block size...............(%d)\n",
9137+ node->block_size);
9138+ LOG_DETAILS("sys id...................(0x%x)\n",
9139+ node->system_id);
9140+
9141+ if (node->feature_header) {
9142+ struct evms_feature_header *fh;
9143+
9144+ fh = node->feature_header;
9145+ LOG_DETAILS("\nfeature header:\n");
9146+ LOG_DETAILS("signature................(0x%x)\n",
9147+ fh->signature);
9148+ LOG_DETAILS("crc......................(0x%x)\n",
9149+ fh->crc);
9150+ LOG_DETAILS("feature header version...(%d.%d.%d)\n",
9151+ fh->version.major, fh->version.minor,
9152+ fh->version.patchlevel);
9153+ LOG_DETAILS("engine version...........(%d.%d.%d)\n",
9154+ fh->engine_version.major,
9155+ fh->engine_version.minor,
9156+ fh->engine_version.patchlevel);
9157+ LOG_DETAILS("flags....................(0x%x)\n",
9158+ fh->flags);
9159+ LOG_DETAILS("feature id...............(0x%x)\n",
9160+ fh->feature_id);
9161+ LOG_DETAILS("sequence#................("PFU64")\n",
9162+ fh->sequence_number);
9163+ LOG_DETAILS("alignment padding........("PFU64")\n",
9164+ fh->alignment_padding);
9165+ LOG_DETAILS("feature data1 lsn........("PFU64")\n",
9166+ fh->feature_data1_start_lsn);
9167+ LOG_DETAILS("feature data1 size.......("PFU64")\n",
9168+ fh->feature_data1_size);
9169+ LOG_DETAILS("feature data2 lsn........("PFU64")\n",
9170+ fh->feature_data2_start_lsn);
9171+ LOG_DETAILS("feature data2 size.......("PFU64")\n",
9172+ fh->feature_data2_size);
9173+ LOG_DETAILS("volume sn................("PFU64")\n",
9174+ fh->volume_serial_number);
9175+ LOG_DETAILS("volume minor#............(%d)\n",
9176+ fh->volume_system_id);
9177+ LOG_DETAILS("object depth.............(%d)\n",
9178+ fh->object_depth);
9179+ LOG_DETAILS("object name..............(%s)\n",
9180+ fh->object_name);
9181+ LOG_DETAILS("volume name..............(%s)\n",
9182+ fh->volume_name);
9183+ }
9184+
9185+ if (node->volume_info) {
9186+ struct evms_volume_info *vi;
9187+
9188+ vi = node->volume_info;
9189+ LOG_DETAILS("\nvolume info:\n");
9190+ LOG_DETAILS("volume name..............(%s)\n",
9191+ vi->volume_name);
9192+ LOG_DETAILS("volume sn................("PFU64")\n",
9193+ vi->volume_sn);
9194+ LOG_DETAILS("volume minor#............(%d)\n",
9195+ vi->volume_minor);
9196+ }
9197+ }
9198+ if (discover_list) {
9199+ LOG_DETAILS("\n");
9200+ }
9201+}
9202+#endif
9203+
9204+/*
9205+ * Function: evms_discover_logical_disks
9206+ * Description: Construct the logical disk list by calling all registered device managers.
9207+ */
9208+static void
9209+evms_discover_logical_disks(struct evms_logical_node **disk_list)
9210+{
9211+ struct evms_registered_plugin *p;
9212+ LOG_EXTRA("discovering logical disks...\n");
9213+ for (p = registered_plugin_head; p; p = p->next) {
9214+ if (GetPluginType(p->plugin->id) == EVMS_DEVICE_MANAGER) {
9215+ DISCOVER(p, disk_list);
9216+ }
9217+ }
9218+}
9219+
9220+/*
9221+ * Function: evms_discover_logical_partitions
9222+ * Description: Construct the logical partition list by calling all registered partition managers.
9223+ */
9224+static void
9225+evms_discover_logical_partitions(struct evms_logical_node **discover_list)
9226+{
9227+ int rc, done;
9228+
9229+ struct evms_registered_plugin *p;
9230+ LOG_EXTRA("discovering logical partitions...\n");
9231+ do {
9232+ done = TRUE;
9233+ for (p = registered_plugin_head; p; p = p->next) {
9234+ if (GetPluginType(p->plugin->id) ==
9235+ EVMS_SEGMENT_MANAGER) {
9236+ rc = DISCOVER(p, discover_list);
9237+ /* RC > 0 means the plugin
9238+ * added something to the
9239+ * discover list. This also
9240+ * means we must loop thru
9241+ * these plugins another time.
9242+ * RC == 0 means nothing was
9243+ * added to the discover list
9244+ * by this plugin.
9245+ * RC < 0 means the plugin
9246+ * encountered some error and
9247+ * nothing was added to the list.
9248+ * NOTE: If a plugin has both
9249+ * added something new to the
9250+ * discover list and encountered
9251+ * an error, RC > 0 must be
9252+ * returned.
9253+ */
9254+ if (rc > 0)
9255+ done = FALSE;
9256+ }
9257+ }
9258+ } while (done == FALSE);
9259+
9260+ /* send the end of discovery signal to each
9261+ * partition manager plugin.
9262+ */
9263+ for (p = registered_plugin_head; p; p = p->next)
9264+ if (GetPluginType(p->plugin->id) == EVMS_SEGMENT_MANAGER)
9265+ if (p->plugin->fops->end_discover)
9266+ rc = END_DISCOVER(p, discover_list);
9267+}
9268+
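+/* A skeletal sketch of the return convention a segment manager's
+ * discover() must follow, per the comment above; the body is
+ * hypothetical:
+ *
+ *	static int example_seg_discover(struct evms_logical_node **list)
+ *	{
+ *		int added = 0;
+ *		// walk *list, consume nodes this plugin recognizes, and
+ *		// add each newly built child node back to *list ...
+ *		return added;	// >0: run the loop again, 0: nothing
+ *				// added, <0: error and nothing added
+ *	}
+ */
+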
9269+/*
9270+ * Function: evms_discover_volume_groups
9271+ * Description: Find volume groups within the logical partitions list
9272+ */
9273+static void
9274+evms_discover_volume_groups(struct evms_logical_node **discover_list)
9275+{
9276+ int rc, done;
9277+
9278+ struct evms_registered_plugin *p;
9279+ LOG_EXTRA("discovering logical volume groups...\n");
9280+ do {
9281+ done = TRUE;
9282+ for (p = registered_plugin_head; p; p = p->next) {
9283+ if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER) {
9284+ rc = DISCOVER(p, discover_list);
9285+ /* RC > 0 means the plugin
9286+ * added something to the
9287+ * discover list. This also
9288+				 * means we must loop through
9289+ * these plugins another time.
9290+ * RC == 0 means nothing was
9291+ * added to the discover list
9292+ * by this plugin.
9293+ * RC < 0 means the plugin
9294+ * encountered some error and
9295+ * nothing was added to the list.
9296+ * NOTE: If a plugin has both
9297+ * added something new to the
9298+ * discover list and encountered
9299+ * an error, RC > 0 must be
9300+ * returned.
9301+ */
9302+ if (rc > 0)
9303+ done = FALSE;
9304+ }
9305+ }
9306+ } while (done == FALSE);
9307+
9308+ /* send the end of discovery signal to each volume
9309+ * group plugin.
9310+ */
9311+ for (p = registered_plugin_head; p; p = p->next)
9312+ if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER)
9313+ if (p->plugin->fops->end_discover)
9314+ rc = END_DISCOVER(p, discover_list);
9315+}
9316+
9317+/*
9318+ *
9319+ * convert all the feature header fields into cpu native format
9320+ * from the on-disk Little Endian format. From this point forward
9321+ * all plugins can deal with feature headers natively.
9322+ */
9323+void
9324+le_feature_header_to_cpu(struct evms_feature_header *fh)
9325+{
9326+ fh->signature = le32_to_cpup(&fh->signature);
9327+ fh->crc = le32_to_cpup(&fh->crc);
9328+ fh->version.major = le32_to_cpup(&fh->version.major);
9329+ fh->version.minor = le32_to_cpup(&fh->version.minor);
9330+ fh->version.patchlevel = le32_to_cpup(&fh->version.patchlevel);
9331+ fh->engine_version.major = le32_to_cpup(&fh->engine_version.major);
9332+ fh->engine_version.minor = le32_to_cpup(&fh->engine_version.minor);
9333+ fh->engine_version.patchlevel =
9334+ le32_to_cpup(&fh->engine_version.patchlevel);
9335+ fh->flags = le32_to_cpup(&fh->flags);
9336+ fh->feature_id = le32_to_cpup(&fh->feature_id);
9337+ fh->sequence_number = le64_to_cpup(&fh->sequence_number);
9338+ fh->alignment_padding = le64_to_cpup(&fh->alignment_padding);
9339+ fh->feature_data1_start_lsn =
9340+ le64_to_cpup(&fh->feature_data1_start_lsn);
9341+ fh->feature_data1_size = le64_to_cpup(&fh->feature_data1_size);
9342+ fh->feature_data2_start_lsn =
9343+ le64_to_cpup(&fh->feature_data2_start_lsn);
9344+ fh->feature_data2_size = le64_to_cpup(&fh->feature_data2_size);
9345+ fh->volume_serial_number = le64_to_cpup(&fh->volume_serial_number);
9346+ fh->volume_system_id = le32_to_cpup(&fh->volume_system_id);
9347+ fh->object_depth = le32_to_cpup(&fh->object_depth);
9348+}
9349+
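+/* Editorial sketch (not part of the original patch): writing a feature
+ * header back to disk would require the inverse conversion. A minimal
+ * sketch follows, assuming the same field layout; the function name is
+ * hypothetical and only a few representative fields are shown.
+ */
+#if 0
+static void
+cpu_feature_header_to_le(struct evms_feature_header *fh)
+{
+	fh->signature = cpu_to_le32(fh->signature);
+	fh->crc = cpu_to_le32(fh->crc);
+	fh->flags = cpu_to_le32(fh->flags);
+	fh->sequence_number = cpu_to_le64(fh->sequence_number);
+	/* ... remaining u32/u64 fields, mirroring
+	 * le_feature_header_to_cpu() above ...
+	 */
+}
+#endif
+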
9350+static int
9351+edef_load_feature_header(struct evms_logical_node *node)
9352+{
9353+ int i, rc = 0, rc_array[2] = { 0, 0 };
9354+ unsigned long size_in_bytes;
9355+ u64 size_in_sectors, starting_sector = 0;
9356+ struct evms_feature_header *fh = NULL, *fh1 = NULL, *fh2 = NULL;
9357+ char *location_name = NULL;
9358+ struct evms_version version = {
9359+ EVMS_FEATURE_HEADER_MAJOR,
9360+ EVMS_FEATURE_HEADER_MINOR,
9361+ EVMS_FEATURE_HEADER_PATCHLEVEL
9362+ };
9363+
9364+ if (!node->feature_header) {
9365+ size_in_sectors = evms_cs_size_in_vsectors(sizeof (*fh));
9366+ size_in_bytes = size_in_sectors << EVMS_VSECTOR_SIZE_SHIFT;
9367+ fh1 = kmalloc(size_in_bytes, GFP_KERNEL);
9368+ if (fh1) {
9369+ fh2 = kmalloc(size_in_bytes, GFP_KERNEL);
9370+ if (!fh2) {
9371+ kfree(fh1);
9372+ rc = -ENOMEM;
9373+ }
9374+ } else {
9375+ rc = -ENOMEM;
9376+ }
9377+
9378+ for (i = 0; i < 2; i++) {
9379+ if (i == 0) {
9380+ starting_sector =
9381+ node->total_vsectors - size_in_sectors;
9382+ fh = fh1;
9383+ location_name = evms_primary_string;
9384+ } else {
9385+ starting_sector--;
9386+ fh = fh2;
9387+ location_name = evms_secondary_string;
9388+ }
9389+ /* read header into buffer */
9390+ rc = INIT_IO(node,
9391+ 0, starting_sector, size_in_sectors, fh);
9392+ if (rc) {
9393+ LOG_ERROR
9394+ ("error(%d) probing for %s feature header(at "PFU64") on '%s'.\n",
9395+ rc, location_name, starting_sector,
9396+ node->name);
9397+ rc_array[i] = rc;
9398+ continue;
9399+ }
9400+ /* validate header signature */
9401+ if (cpu_to_le32(fh->signature) !=
9402+ EVMS_FEATURE_HEADER_SIGNATURE) {
9403+ rc = -ENODATA;
9404+ rc_array[i] = rc;
9405+ continue;
9406+ }
9407+ /* validate header CRC */
9408+ if (fh->crc != EVMS_MAGIC_CRC) {
9409+ u32 org_crc, final_crc;
9410+ org_crc = cpu_to_le32(fh->crc);
9411+ fh->crc = 0;
9412+ final_crc =
9413+ evms_cs_calculate_crc(EVMS_INITIAL_CRC, fh,
9414+ sizeof (*fh));
9415+ if (final_crc != org_crc) {
9416+ LOG_ERROR
9417+ ("CRC mismatch error [stored(%x), computed(%x)] in %s feature header(at "PFU64") on '%s'.\n",
9418+ org_crc, final_crc, location_name,
9419+ starting_sector, node->name);
9420+ rc = -EINVAL;
9421+ rc_array[i] = rc;
9422+ continue;
9423+ }
9424+ } else {
9425+ LOG_WARNING
9426+ ("CRC disabled in %s feature header(at "PFU64") on '%s'.\n",
9427+ location_name, starting_sector,
9428+ node->name);
9429+ }
9430+ /* convert the feature header from the
9431+ * on-disk format (Little Endian) to
9432+ * native cpu format.
9433+ */
9434+ le_feature_header_to_cpu(fh);
9435+ /* verify the system data version */
9436+ rc = evms_cs_check_version(&version, &fh->version);
9437+ if (rc) {
9438+ LOG_ERROR
9439+ ("error: obsolete version(%d,%d,%d) in %s feature header on '%s'.\n",
9440+ fh->version.major, fh->version.minor,
9441+ fh->version.patchlevel, location_name,
9442+ node->name);
9443+ rc_array[i] = rc;
9444+ }
9445+ }
9446+
9447+ /* getting same return code for both copies? */
9448+ if (rc_array[0] == rc_array[1]) {
9449+ rc = rc_array[0];
9450+ /* if no errors on both copies,
9451+ * check the sequence numbers.
9452+ * use the highest sequence number.
9453+ */
9454+ if (!rc) {
9455+ /* compare sequence numbers */
9456+ if (fh1->sequence_number ==
9457+ fh2->sequence_number) {
9458+ fh = fh1;
9459+ } else {
9460+ LOG_WARNING
9461+ ("%s feature header sequence number("PFU64") mismatches %s feature header sequence number("PFU64") on '%s'!\n",
9462+ evms_primary_string,
9463+ fh1->sequence_number,
9464+ evms_secondary_string,
9465+ fh2->sequence_number, node->name);
9466+ if (fh1->sequence_number >
9467+ fh2->sequence_number) {
9468+ fh = fh1;
9469+ location_name =
9470+ evms_primary_string;
9471+ /* indicate bad sequence number of secondary */
9472+ rc_array[1] = -1;
9473+ } else {
9474+ fh = fh2;
9475+ location_name =
9476+ evms_secondary_string;
9477+ /* indicate bad sequence number of primary */
9478+ rc_array[0] = -1;
9479+ }
9480+ }
9481+ }
9482+ /* getting different return codes for each copy */
9483+ } else
9484+ /* either primary or secondary copy is
9485+ * valid, so use the valid copy.
9486+ */
9487+ if ((rc_array[0] == 0) || (rc_array[1] == 0)) {
9488+ char *warn_name = NULL;
9489+
9490+ /* indicate success */
9491+ rc = 0;
9492+ /* set variables based on which copy is valid */
9493+ if (rc_array[0] == 0) {
9494+				/* use primary (rear) copy if it's good */
9495+ fh = fh1;
9496+ location_name = evms_primary_string;
9497+ warn_name = evms_secondary_string;
9498+ } else {
9499+				/* use secondary (front) copy if it's good */
9500+ fh = fh2;
9501+ location_name = evms_secondary_string;
9502+ warn_name = evms_primary_string;
9503+ }
9504+ /* warn the user about the invalid copy */
9505+ LOG_WARNING
9506+ ("warning: error(%d) probing/verifying the %s feature header on '%s'.\n",
9507+ rc_array[0] + rc_array[1], warn_name, node->name);
9508+ } else
9509+ /* both copies had a different error,
9510+ * and one was a fatal error, so
9511+ * indicate fatal error.
9512+ */
9513+ if ((rc_array[0] == -EINVAL) || (rc_array[1] == -EINVAL)) {
9514+ rc = -EINVAL;
9515+ }
9516+
9517+ /* on error, set fh to NULL */
9518+ if (rc)
9519+ fh = NULL;
9520+
9521+ /* deallocate metadata buffers appropriately */
9522+ if (fh != fh1)
9523+ kfree(fh1);
9524+ if (fh != fh2)
9525+ kfree(fh2);
9526+
9527+ /* save validated feature header pointer */
9528+ if (!rc) {
9529+ node->feature_header = fh;
9530+ if (rc_array[0] != rc_array[1]) {
9531+ LOG_DETAILS
9532+ ("using %s feature header on '%s'.\n",
9533+ location_name, node->name);
9534+ }
9535+ }
9536+
9537+ /* if no signature found, adjust return code */
9538+ if (rc == -ENODATA) {
9539+ rc = 0;
9540+ LOG_DEBUG("no feature header found on '%s'.\n",
9541+ node->name);
9542+ }
9543+ }
9544+ return (rc);
9545+}
9546+
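+/* Editorial sketch (not in the original patch): the CRC validation in
+ * edef_load_feature_header() distilled into a helper. It assumes the
+ * same evms_cs_calculate_crc()/EVMS_INITIAL_CRC services used above,
+ * and that the stored CRC was computed with the crc field zeroed. The
+ * helper name is hypothetical; note it clobbers fh->crc, just as the
+ * caller above does.
+ */
+#if 0
+static int
+feature_header_crc_ok(struct evms_feature_header *fh)
+{
+	u32 org_crc = le32_to_cpup(&fh->crc);
+
+	fh->crc = 0;
+	return (evms_cs_calculate_crc(EVMS_INITIAL_CRC,
+				      fh, sizeof (*fh)) == org_crc);
+}
+#endif
+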
9547+static int
9548+edef_find_first_features(struct evms_logical_node **discover_list)
9549+{
9550+ int rc;
9551+ struct evms_logical_node *node, *tmp_list_head;
9552+
9553+ tmp_list_head = *discover_list;
9554+ *discover_list = NULL;
9555+
9556+ while (tmp_list_head) {
9557+ struct evms_list_node **evms_node;
9558+
9559+ node = tmp_list_head;
9560+ rc = evms_cs_remove_logical_node_from_list(&tmp_list_head,
9561+ node);
9562+ if (rc)
9563+ BUG();
9564+
9565+		/* check for duplicate pointers by
9566+		 * searching for the node in the global list
9567+		 */
9568+ evms_node =
9569+ evms_cs_lookup_item_in_list(&evms_global_feature_node_list,
9570+ node);
9571+ /* already present? */
9572+ if (*evms_node) {
9573+ /* yes, already present */
9574+			rc = -ENODATA;	/* don't process this node further */
9575+ LOG_DETAILS("deleting duplicate reference to '%s'.\n",
9576+ node->name);
9577+ /* forget this node */
9578+ node = NULL;
9579+ } else {
9580+ /* load the feature header if present */
9581+ rc = edef_load_feature_header(node);
9582+			/* Does this node have a feature header?
9583+			 * It won't if there was no header to load
9584+			 * OR
9585+			 * a fatal error occurred while attempting to read it.
9586+			 */
9587+ if (node->feature_header) {
9588+ /* check for object flag */
9589+ if (node->feature_header->flags &
9590+ EVMS_VOLUME_DATA_OBJECT) {
9591+ LOG_DEFAULT
9592+ ("object detected, deleting '%s'.\n",
9593+ node->name);
9594+ rc = -EINVAL;
9595+ } else
9596+ /* check for stop-data flag */
9597+ if (node->feature_header->flags &
9598+ EVMS_VOLUME_DATA_STOP) {
9599+ LOG_DEFAULT
9600+ ("stop data detected, deleting '%s'.\n",
9601+ node->name);
9602+ rc = -EINVAL;
9603+ } else {
9604+ /* we have a valid feature header.
9605+ * initialize appropriate node fields
9606+ * to indicate this.
9607+ */
9608+ node->flags |= EVMS_VOLUME_FLAG;
9609+ node->iflags |= EVMS_FEATURE_BOTTOM;
9610+ node->volume_info =
9611+ kmalloc(sizeof
9612+ (struct evms_volume_info),
9613+ GFP_KERNEL);
9614+ if (node->volume_info) {
9615+ /* set up volume
9616+ * info struct
9617+ */
9618+ memset(node->volume_info, 0,
9619+ sizeof
9620+ (struct
9621+ evms_volume_info));
9622+ node->volume_info->volume_sn =
9623+ node->feature_header->
9624+ volume_serial_number;
9625+ node->volume_info->
9626+ volume_minor =
9627+ node->feature_header->
9628+ volume_system_id;
9629+ strcpy(node->volume_info->
9630+ volume_name,
9631+ node->feature_header->
9632+ volume_name);
9633+ /* register(add) node to
9634+ * the global list.
9635+ */
9636+ rc = evms_cs_add_item_to_list
9637+ (&evms_global_feature_node_list,
9638+ node);
9639+ } else {
9640+ rc = -ENOMEM;
9641+ }
9642+ }
9643+ }
9644+ }
9645+ /* if any errors, delete the node */
9646+ if (rc) {
9647+ if (node) {
9648+ DELETE(node);
9649+ }
9650+ } else
9651+ /* on successful processing of this node
9652+ * place it back on the discover list.
9653+ */
9654+ evms_cs_add_logical_node_to_list(discover_list, node);
9655+ }
9656+ return (0);
9657+}
9658+
9659+/* These defines describe the node types that can be isolated. */
9660+#define ISOLATE_ASSOCIATIVE_FEATURES 0
9661+#define ISOLATE_COMPATIBILITY_VOLUMES 1
9662+#define ISOLATE_EVMS_VOLUMES 2
9663+#define ISOLATE_EVMS_VOLUME_SERIAL_NUMBER 3
9664+#define ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH 4
9665+static int
9666+edef_isolate_nodes_by_type(unsigned int type,
9667+ struct evms_logical_node **src_list,
9668+ struct evms_logical_node **trg_list,
9669+ u32 compare32, u64 compare64)
9670+{
9671+ struct evms_logical_node *node, *next_node;
9672+ int rc = 0, found_node;
9673+ struct evms_feature_header *fh = NULL;
9674+
9675+ for (node = *src_list; node; node = next_node) {
9676+ next_node = node->next;
9677+
9678+ if (node->feature_header)
9679+ fh = node->feature_header;
9680+ found_node = FALSE;
9681+ switch (type) {
9682+ case ISOLATE_ASSOCIATIVE_FEATURES:
9683+ if (fh) {
9684+ if (GetPluginType(fh->feature_id) ==
9685+ EVMS_ASSOCIATIVE_FEATURE)
9686+ found_node = TRUE;
9687+ }
9688+ break;
9689+ case ISOLATE_COMPATIBILITY_VOLUMES:
9690+ if (!(node->flags & EVMS_VOLUME_FLAG))
9691+ found_node = TRUE;
9692+ break;
9693+ case ISOLATE_EVMS_VOLUMES:
9694+ if (node->flags & EVMS_VOLUME_FLAG)
9695+ found_node = TRUE;
9696+ break;
9697+ /* EVMS volumes with same serial # */
9698+ case ISOLATE_EVMS_VOLUME_SERIAL_NUMBER:
9699+ if (node->volume_info->volume_sn == compare64)
9700+ found_node = TRUE;
9701+ break;
9702+ case ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH:
9703+ if (fh)
9704+ if (fh->object_depth == compare64)
9705+ if (fh->feature_id == compare32)
9706+ found_node = TRUE;
9707+ break;
9708+ }
9709+ if (found_node == TRUE) {
9710+ rc = evms_cs_remove_logical_node_from_list(src_list,
9711+ node);
9712+ if (rc)
9713+ break;
9714+ rc = evms_cs_add_logical_node_to_list(trg_list, node);
9715+ if (rc)
9716+ break;
9717+ }
9718+ }
9719+ return (rc);
9720+}
9721+
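+/* Editorial usage sketch (illustrative fragment, not compiled): how
+ * the isolation types above split a discovery list. The list variable
+ * names are hypothetical.
+ */
+#if 0
+	struct evms_logical_node *compat_list = NULL, *evms_list = NULL;
+
+	/* move nodes without EVMS_VOLUME_FLAG onto compat_list */
+	edef_isolate_nodes_by_type(ISOLATE_COMPATIBILITY_VOLUMES,
+				   &discover_list, &compat_list, 0, 0);
+	/* move nodes with EVMS_VOLUME_FLAG onto evms_list */
+	edef_isolate_nodes_by_type(ISOLATE_EVMS_VOLUMES,
+				   &discover_list, &evms_list, 0, 0);
+#endif
+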
9722+static int
9723+edef_apply_feature(struct evms_logical_node *node,
9724+ struct evms_logical_node **volume_node_list)
9725+{
9726+ struct evms_registered_plugin *p;
9727+ int rc = -1;
9728+
9729+ for (p = registered_plugin_head; p; p = p->next) {
9730+ if (p->plugin->id == node->feature_header->feature_id) {
9731+ rc = DISCOVER(p, volume_node_list);
9732+ break;
9733+ }
9734+ }
9735+ return (rc);
9736+}
9737+
9738+static int
9739+edef_get_feature_plugin_header(u32 id, struct evms_plugin_header **header)
9740+{
9741+ int rc = -ENOPKG;
9742+ struct evms_registered_plugin *p;
9743+
9744+ for (p = registered_plugin_head; p; p = p->next) {
9745+ if (p->plugin->id == id) {
9746+ *header = p->plugin;
9747+ rc = 0;
9748+ break;
9749+ }
9750+ }
9751+ if (rc) {
9752+ LOG_SERIOUS("no plugin loaded for feature id(0x%x)\n", id);
9753+ }
9754+ return (rc);
9755+}
9756+
9757+typedef struct evms_volume_build_info_s {
9758+ int node_count;
9759+ int feature_header_count;
9760+ int feature_count;
9761+ int associative_feature_count;
9762+ u64 max_depth;
9763+ struct evms_plugin_header *plugin;
9764+ struct evms_logical_node *feature_node_list;
9765+} evms_volume_build_info_t;
9766+
9767+/*
9768+ * edef_evaluate_volume_node_list:
9769+ * does:
9770+ * 1) puts all nodes from the feature list back on the volume list
9771+ * 2) loads the node's feature headers
9772+ * 3) counts the node list's entries
9773+ * 4) builds the feature node list
9774+ * 5) counts the feature headers for associative features
9775+ * 6) sets feature count to >1 if >1 features are to be processed
9776+ */
9777+static int
9778+edef_evaluate_volume_node_list(struct evms_logical_node **volume_node_list,
9779+ evms_volume_build_info_t * vbi,
9780+ int volume_complete)
9781+{
9782+ int rc;
9783+ struct evms_logical_node *node;
9784+
9785+ vbi->node_count =
9786+ vbi->feature_count =
9787+ vbi->associative_feature_count = vbi->max_depth = 0;
9788+ vbi->plugin = NULL;
9789+
9790+ /* put all feature nodes back on the volume list */
9791+ rc = edef_isolate_nodes_by_type(ISOLATE_EVMS_VOLUMES,
9792+ &vbi->feature_node_list,
9793+ volume_node_list, 0, 0);
9794+ if (rc)
9795+ return (rc);
9796+
9797+ /* load all the feature headers */
9798+ if (!volume_complete) {
9799+ for (node = *volume_node_list; node; node = node->next) {
9800+ rc = edef_load_feature_header(node);
9801+ if (rc)
9802+ return (rc);
9803+ }
9804+ }
9805+
9806+ /* find the 1st max depth object:
9807+ * record the depth
9808+ * record the plugin
9809+ */
9810+ for (node = *volume_node_list; node; node = node->next) {
9811+ struct evms_plugin_header *plugin;
9812+ struct evms_feature_header *fh = node->feature_header;
9813+
9814+ /* count the nodes */
9815+ vbi->node_count++;
9816+
9817+ /* no feature header found, continue to next node */
9818+ if (!fh)
9819+ continue;
9820+
9821+ /* check the depth */
9822+ if (fh->object_depth > vbi->max_depth) {
9823+ /* record new max depth */
9824+ vbi->max_depth = fh->object_depth;
9825+ /* find the plugin header for this feature id */
9826+ rc = edef_get_feature_plugin_header(fh->feature_id,
9827+ &plugin);
9828+ if (rc)
9829+ return (rc);
9830+ /* check for >1 plugins */
9831+ if (vbi->plugin != plugin) {
9832+ vbi->feature_count++;
9833+ vbi->plugin = plugin;
9834+ }
9835+ }
9836+ /* check for "associative" feature indicator */
9837+ if (GetPluginType(vbi->plugin->id) == EVMS_ASSOCIATIVE_FEATURE)
9838+ vbi->associative_feature_count++;
9839+ }
9840+ /* build a list of max depth nodes for this feature */
9841+ if (vbi->max_depth) {
9842+ rc = edef_isolate_nodes_by_type
9843+ (ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH, volume_node_list,
9844+ &vbi->feature_node_list, vbi->plugin->id, vbi->max_depth);
9845+ if (rc)
9846+ return (rc);
9847+ if (!vbi->plugin)
9848+ return (-ENODATA);
9849+ if (!vbi->feature_node_list)
9850+ return (-ENODATA);
9851+ }
9852+
9853+ return (rc);
9854+}
9855+
9856+/* function: edef_check_feature_conditions
9857+ *
9858+ * This routine verifies the state of the volume based on the feature
9859+ * headers and nodes in the current discovery list. All detected
9860+ * errors are considered fatal.
9861+ */
9862+static int
9863+edef_check_feature_conditions(evms_volume_build_info_t * vbi)
9864+{
9865+ int rc = 0;
9866+
9867+ if (vbi->associative_feature_count) {
9868+ if (vbi->node_count > 1) {
9869+ rc = -EVMS_VOLUME_FATAL_ERROR;
9870+ LOG_ERROR
9871+ ("associative ERROR: > 1 nodes(%d) remaining to be processed!\n",
9872+ vbi->node_count);
9873+ } else if (vbi->max_depth != 1) {
9874+ rc = -EVMS_VOLUME_FATAL_ERROR;
9875+ LOG_ERROR
9876+ ("associative ERROR: associative feature found at node depth("PFU64") != 1!\n",
9877+ vbi->max_depth);
9878+ } else
9879+ rc = -EVMS_ASSOCIATIVE_FEATURE;
9880+ }
9881+ if (!rc) {
9882+ if (!vbi->max_depth) {
9883+ if (vbi->node_count > 1) {
9884+ rc = -EVMS_VOLUME_FATAL_ERROR;
9885+ LOG_ERROR
9886+ ("max depth ERROR: > 1 nodes(%d) remaining to be processed!\n",
9887+ vbi->node_count);
9888+ }
9889+ } else if (vbi->max_depth == 1) {
9890+ if (vbi->feature_count > 1) {
9891+ rc = -EVMS_VOLUME_FATAL_ERROR;
9892+ LOG_ERROR
9893+ ("max depth 1 ERROR: > 1 features remaining to be processed!\n");
9894+ }
9895+ }
9896+ }
9897+ return (rc);
9898+}
9899+
9900+/* function: edef_apply_features
9901+ *
9902+ * This routine applies none, one, or more features to an EVMS
9903+ * volume. The system data structure is first verified and then
9904+ * features are applied and verified recursively until the
9905+ * entire volume has been constructed. Fatal errors result in
9906+ * all nodes in the volume discovery list being deleted.
9907+ */
9908+static int
9909+edef_apply_features(struct evms_logical_node **volume_node_list)
9910+{
9911+ int rc = 1, done, top_feature_applying;
9912+ evms_volume_build_info_t vbi;
9913+
9914+ vbi.feature_node_list = NULL;
9915+ rc = edef_evaluate_volume_node_list(volume_node_list, &vbi, FALSE);
9916+
9917+ /* ensure we don't go into the next loop
9918+ * without having a target plugin to
9919+ * pass control to.
9920+ */
9921+ if (!rc) {
9922+ if (!vbi.plugin) {
9923+ rc = -ENODATA;
9924+ }
9925+ }
9926+
9927+ /* this loop should ONLY get used when
9928+ * there are features to process.
9929+ */
9930+ done = (rc) ? TRUE : FALSE;
9931+ while (!done) {
9932+ rc = edef_check_feature_conditions(&vbi);
9933+ if (rc)
9934+ break;
9935+ top_feature_applying = (vbi.max_depth == 1) ? TRUE : FALSE;
9936+ rc = vbi.plugin->fops->discover(&vbi.feature_node_list);
9937+ if (!rc) {
9938+ rc = edef_evaluate_volume_node_list(volume_node_list,
9939+ &vbi,
9940+ top_feature_applying);
9941+ if (top_feature_applying == TRUE) {
9942+ if (vbi.node_count > 1) {
9943+ rc = -EVMS_VOLUME_FATAL_ERROR;
9944+ LOG_ERROR
9945+ ("ERROR: detected > 1 node at volume completion!\n");
9946+ }
9947+ done = TRUE;
9948+ } else {
9949+ if (!vbi.plugin) {
9950+ rc = -EVMS_VOLUME_FATAL_ERROR;
9951+ LOG_ERROR
9952+ ("ERROR: depth("PFU64"): expected another feature!\n",
9953+ vbi.max_depth);
9954+ done = TRUE;
9955+ }
9956+ }
9957+ } else { /* rc != 0 */
9958+ rc = -EVMS_VOLUME_FATAL_ERROR;
9959+ done = TRUE;
9960+ }
9961+ }
9962+ if (rc)
9963+ /* put all feature nodes back on the volume list */
9964+ if (edef_isolate_nodes_by_type(ISOLATE_EVMS_VOLUMES,
9965+ &vbi.feature_node_list,
9966+ volume_node_list, 0, 0))
9967+ BUG();
9968+ return (rc);
9969+}
9970+
9971+static int
9972+edef_delete_node(struct evms_logical_node **node_list,
9973+ struct evms_logical_node *node, int return_code,
9974+ char *log_text)
9975+{
9976+ int rc;
9977+
9978+ rc = evms_cs_remove_logical_node_from_list(node_list, node);
9979+ if (!rc) {
9980+ LOG_ERROR("%s error(%d): deleting volume(%s), node(%s)\n",
9981+ log_text, return_code,
9982+ node->volume_info->volume_name, node->name);
9983+ rc = DELETE(node);
9984+ if (rc) {
9985+ LOG_ERROR("error(%d) while deleting node(%s)\n",
9986+ rc, node->name);
9987+ }
9988+ } else {
9989+ LOG_WARNING
9990+ ("%s error(%d): node gone, assumed deleted by plugin.\n",
9991+ log_text, return_code);
9992+ /* plugin must have cleaned up the node.
9993+ * So just reset the return code and leave.
9994+ */
9995+ rc = 0;
9996+ }
9997+
9998+ return (rc);
9999+}
10000+
10001+static int
10002+edef_process_evms_volumes(struct evms_logical_node **discover_list,
10003+ struct evms_logical_node **associative_feature_list)
10004+{
10005+ int rc = 0;
10006+ struct evms_logical_node *node, *evms_volumes_list, *volume_node_list;
10007+ u64 volume_sn;
10008+
10009+ /* put all EVMS volumes on their own list */
10010+ evms_volumes_list = NULL;
10011+ rc = edef_isolate_nodes_by_type(ISOLATE_EVMS_VOLUMES,
10012+ discover_list,
10013+ &evms_volumes_list, 0, 0);
10014+
10015+ /* apply features to each EVMS volume */
10016+ /* one volume at a time on each pass */
10017+ while (evms_volumes_list) {
10018+ node = evms_volumes_list;
10019+ /* put all nodes for one EVMS volume on separate list */
10020+ volume_node_list = NULL;
10021+ volume_sn = node->volume_info->volume_sn;
10022+ rc = edef_isolate_nodes_by_type
10023+ (ISOLATE_EVMS_VOLUME_SERIAL_NUMBER, &evms_volumes_list,
10024+ &volume_node_list, 0, volume_sn);
10025+ if (rc)
10026+ break;
10027+ /* go apply all the volume features now */
10028+ rc = edef_apply_features(&volume_node_list);
10029+ switch (rc) {
10030+ case 0: /* SUCCESS */
10031+ /* remove volume just processed */
10032+ node = volume_node_list;
10033+ rc = evms_cs_remove_logical_node_from_list
10034+ (&volume_node_list, node);
10035+ if (rc)
10036+ break;
10037+ /* put volume on global list */
10038+ rc = evms_cs_add_logical_node_to_list(discover_list,
10039+ node);
10040+ break;
10041+ case -EVMS_ASSOCIATIVE_FEATURE:
10042+ /* put all "associative" features on their own list */
10043+ rc = edef_isolate_nodes_by_type
10044+ (ISOLATE_ASSOCIATIVE_FEATURES, &volume_node_list,
10045+ associative_feature_list, 0, 0);
10046+ break;
10047+ default: /* FATAL ERROR */
10048+ /* delete each node remaining in the list */
10049+ if (volume_node_list) {
10050+ LOG_ERROR
10051+ ("encountered fatal error building volume '%s'\n",
10052+ volume_node_list->volume_info->
10053+ volume_name);
10054+ }
10055+ while (volume_node_list) {
10056+ node = volume_node_list;
10057+ edef_delete_node(&volume_node_list,
10058+ node, rc, "EVMS feature");
10059+ }
10060+ rc = 0;
10061+ break;
10062+ }
10063+ if (rc)
10064+ break;
10065+ }
10066+ return (rc);
10067+}
10068+
10069+static int
10070+edef_process_associative_volumes(struct evms_logical_node
10071+ **associative_feature_list,
10072+ struct evms_logical_node **discover_list)
10073+{
10074+ int rc = 0;
10075+ struct evms_logical_node *node;
10076+
10077+ while (*associative_feature_list) {
10078+ node = *associative_feature_list;
10079+ /* remove this node from associative feature list */
10080+ rc = evms_cs_remove_logical_node_from_list
10081+ (associative_feature_list, node);
10082+ if (rc)
10083+ break;
10084+ /* put volume on global list */
10085+ rc = evms_cs_add_logical_node_to_list(discover_list, node);
10086+ if (rc)
10087+ break;
10088+ rc = edef_load_feature_header(node);
10089+ if (rc)
10090+ break;
10091+ rc = edef_apply_feature(node, discover_list);
10092+ if (rc)
10093+ edef_delete_node(discover_list, node, rc,
10094+ "Associative feature");
10095+ }
10096+ return (rc);
10097+}
10098+
10099+static int
10100+edef_check_for_incomplete_volumes(struct evms_logical_node **discover_list)
10101+{
10102+ int rc = 0;
10103+ struct evms_logical_node *next_node, *node;
10104+
10105+ /* check to see if any incomplete volumes are left around */
10106+ /* if so, delete them. */
10107+ /* complete volumes should not have feature_headers */
10108+ /* hanging off them, if we find any, we know the volume */
10109+ /* is incomplete. */
10110+
10111+ for (node = *discover_list; node; node = next_node) {
10112+ next_node = node->next;
10113+
10114+ if (node->feature_header) {
10115+ edef_delete_node(discover_list, node, rc,
10116+ "Unexpected feature header");
10117+ }
10118+ }
10119+ return (rc);
10120+}
10121+
10122+/*
10123+ * Function: evms_discover_evms_features
10124+ * Description: Find features for nodes on the logical partitions list
10125+ */
10126+static int
10127+evms_discover_evms_features(struct evms_logical_node **discover_list)
10128+{
10129+ struct evms_logical_node *associative_feature_list;
10130+ int rc = 0;
10131+
10132+ LOG_EXTRA("discovering evms volume features...\n");
10133+
10134+ /* initialize "associative" features list */
10135+ associative_feature_list = NULL;
10136+
10137+ /* find the bottom features */
10138+ rc = edef_find_first_features(discover_list);
10139+#ifdef LOCAL_DEBUG
10140+ display_discover_list(*discover_list, "after 1st features hdr");
10141+#endif
10142+ if (!rc)
10143+ /* process EVMS volumes here */
10144+ rc = edef_process_evms_volumes(discover_list,
10145+ &associative_feature_list);
10146+#ifdef LOCAL_DEBUG
10147+ display_discover_list(*discover_list, "after evms volumes");
10148+#endif
10149+ if (!rc)
10150+ /* process "associative" features here */
10151+ rc = edef_process_associative_volumes(&associative_feature_list,
10152+ discover_list);
10153+#ifdef LOCAL_DEBUG
10154+ display_discover_list(*discover_list, "after associatives");
10155+#endif
10156+ if (!rc)
10157+ /* check for incomplete volumes */
10158+ rc = edef_check_for_incomplete_volumes(discover_list);
10159+
10160+ return (rc);
10161+}
10162+
10163+/*
10164+ * function: eelv_assign_volume_minor
10165+ *
10166+ * This is a support function for evms_export_logical_volumes.
10167+ * This routine assigns a specific minor number to a volume. It
10168+ * also performs the remaining steps to make this volume visible
10169+ * and usable to the kernel.
10170+ *
10171+ */
10172+static void
10173+eelv_assign_volume_minor(struct evms_logical_node *node, int minor)
10174+{
10175+ struct evms_logical_volume *volume;
10176+
10177+ /* initialize the logical_node entry in the volume array */
10178+ volume = &evms_logical_volumes[minor];
10179+ volume->node = node;
10180+ volume->name =
10181+ kmalloc(strlen(EVMS_GET_NODE_NAME(node)) + 1, GFP_KERNEL);
10182+ if (!volume->name)
10183+ BUG();
10184+ strcpy(volume->name, EVMS_GET_NODE_NAME(node));
10185+
10186+ /* copy flags from top level node into volume structure */
10187+ volume->flags = node->flags;
10188+
10189+ /* check for read-only volume */
10190+ if (volume->flags & EVMS_VOLUME_READ_ONLY) {
10191+ set_device_ro(MKDEV(EVMS_MAJOR, minor), 1);
10192+ }
10193+
10194+ /* adjust volume size based on hardsector size */
10195+ node->total_vsectors &=
10196+ ~((node->hardsector_size >> EVMS_VSECTOR_SIZE_SHIFT) - 1);
10197+
10198+ /* initialize the global device arrays */
10199+ blksize_size[EVMS_MAJOR][minor] = node->block_size;
10200+ hardsect_size[EVMS_MAJOR][minor] = node->hardsector_size;
10201+ blk_size[EVMS_MAJOR][minor] = (int) (node->total_vsectors >> 1);
10202+
10203+ /* register this volume with devfs */
10204+ volume->devfs_handle =
10205+ devfs_register(evms_dir_devfs_handle,
10206+ volume->name,
10207+ DEVFS_FL_DEFAULT,
10208+ EVMS_MAJOR, minor,
10209+ S_IFBLK | S_IRUGO | S_IWUGO, &evms_fops, NULL);
10210+
10211+ evms_volumes++;
10212+
10213+ LOG_DEFAULT("Exporting EVMS Volume(%u,%u) from \"%s%s\".\n",
10214+ EVMS_MAJOR, minor, EVMS_DEV_NODE_PATH, volume->name);
10215+}
10216+
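+/* Worked example (editorial; assumes 512-byte vsectors, i.e.
+ * EVMS_VSECTOR_SIZE_SHIFT == 9): with a 2048-byte hardsector,
+ * 2048 >> 9 == 4, so the alignment above rounds total_vsectors down
+ * to a multiple of 4; a 1003-vsector node exports 1000 vsectors, and
+ * blk_size reports 500 (1KB units, total_vsectors >> 1).
+ */
+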
10217+/*
10218+ * function: eelv_check_for_duplicity
10219+ *
10220+ * This is a support function for evms_export_logical_volumes.
10221+ * This routine compares each node on the discover list against
10222+ * the currently exported volumes. If the node pointer or the
10223+ * node name matches an exported volume, the node is a duplicate
10224+ * and is then deleted.
10225+ *
10226+ */
10227+static void
10228+eelv_check_for_duplicity(struct evms_logical_node **discover_list)
10229+{
10230+ struct evms_logical_node *next_node, *node;
10231+ struct evms_logical_volume *lv;
10232+ int i, is_dup;
10233+
10234+ for (node = *discover_list; node; node = next_node) {
10235+ next_node = node->next;
10236+
10237+ is_dup = FALSE;
10238+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10239+ lv = &evms_logical_volumes[i];
10240+ /* only check exported volumes */
10241+ if (lv->node) {
10242+ char *type_ptr = NULL;
10243+
10244+ /* check for duplicate pointer */
10245+ if (node == lv->node) {
10246+ is_dup = TRUE;
10247+ type_ptr = "pointer";
10248+ /* check for duplicate node */
10249+ } else if (!strcmp(node->name, lv->node->name)) {
10250+ is_dup = TRUE;
10251+ type_ptr = "node";
10252+ }
10253+ if (is_dup == TRUE) {
10254+ evms_cs_remove_logical_node_from_list
10255+ (discover_list, node);
10256+ LOG_DETAILS
10257+ ("deleting duplicate %s to EVMS volume(%u,%u,%s)...\n",
10258+ type_ptr, EVMS_MAJOR, i,
10259+ EVMS_GET_NODE_NAME(node));
10260+ /* forget duplicate */
10261+ break;
10262+ }
10263+ }
10264+ }
10265+ }
10266+}
10267+
10268+/*
10269+ * function: eelv_reassign_soft_deleted_volume_minors
10270+ *
10271+ * This is a support function for evms_export_logical_volumes.
10272+ * This routine reassigns minor numbers to rediscovered "soft"
10273+ * deleted volumes.
10274+ *
10275+ */
10276+static void
10277+eelv_reassign_soft_deleted_volume_minors(struct evms_logical_node
10278+ **discover_list)
10279+{
10280+ struct evms_logical_node *next_node, *node;
10281+ struct evms_logical_volume *lv;
10282+ int i, node_removed;
10283+
10284+ for (node = *discover_list; node; node = next_node) {
10285+ next_node = node->next;
10286+
10287+ node_removed = FALSE;
10288+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10289+ lv = &evms_logical_volumes[i];
10290+ /* only check soft deleted volumes:
10291+ * they have a non-NULL name.
10292+ */
10293+ if (lv->flags & EVMS_VOLUME_SOFT_DELETED) {
10294+ if (!strcmp(EVMS_GET_NODE_NAME(node), lv->name)) {
10295+ /* reassign requested minor */
10296+ evms_cs_remove_logical_node_from_list
10297+ (discover_list, node);
10298+ node_removed = TRUE;
10299+ LOG_DEFAULT("Re");
10300+ /* free the previously used name */
10301+ kfree(lv->name);
10302+ lv->name = NULL;
10303+ /* clear the EVMS_VOLUME_SOFT_DELETED flag */
10304+ lv->flags = 0;
10305+ eelv_assign_volume_minor(node, i);
10306+ break;
10307+ }
10308+ }
10309+ }
10310+ }
10311+}
10312+
10313+/*
10314+ * function: eelv_assign_evms_volume_minors
10315+ *
10316+ * This is a support function for evms_export_logical_volumes.
10317+ * This routine assigns minor numbers to new evms volumes. If
10318+ * the specified minor is already in use, the requested minor
10319+ * is set to 0, and the next available minor is assigned, along
10320+ * with any remaining volumes, at the end of evms_export_logical_volumes.
10321+ *
10322+ */
10323+static void
10324+eelv_assign_evms_volume_minors(struct evms_logical_node **discover_list)
10325+{
10326+ struct evms_logical_node *next_node, *node, *lv_node;
10327+ unsigned int requested_minor, node_removed;
10328+
10329+ for (node = *discover_list; node; node = next_node) {
10330+ next_node = node->next;
10331+
10332+ node_removed = FALSE;
10333+ /* only process evms volumes */
10334+ if (node->flags & EVMS_VOLUME_FLAG) {
10335+ requested_minor = node->volume_info->volume_minor;
10336+ /* is there a requested minor? */
10337+ if (requested_minor) {
10338+ int lv_flags = 0;
10339+
10340+ /* check range of requested minor */
10341+ if (requested_minor >= MAX_EVMS_VOLUMES)
10342+ lv_node = node;
10343+ else {
10344+ struct evms_logical_volume *lv;
10345+ lv = &evms_logical_volumes
10346+ [requested_minor];
10347+ lv_node = lv->node;
10348+ lv_flags = lv->flags;
10349+ }
10350+ if ((!lv_node)
10351+ && (!(lv_flags & EVMS_VOLUME_SOFT_DELETED))) {
10352+ /* assign requested minor */
10353+ evms_cs_remove_logical_node_from_list
10354+ (discover_list, node);
10355+ node_removed = TRUE;
10356+ eelv_assign_volume_minor(node,
10357+ requested_minor);
10358+ } else {
10359+ LOG_WARNING
10360+ ("EVMS volume(%s) requesting invalid/in-use minor(%d), assigning next available!\n",
10361+ node->volume_info->volume_name,
10362+ requested_minor);
10363+ /*
10364+ * requested minor is already
10365+ * in use, defer assignment
10366+ * until later.
10367+ */
10368+ node->volume_info->volume_minor = 0;
10369+ }
10370+ }
10371+ }
10372+ }
10373+}
10374+
10375+/*
10376+ * function: eelv_assign_remaining_evms_volume_minors
10377+ *
10378+ * This is a support function for evms_export_logical_volumes.
10379+ * This routine assigns minor numbers to new evms volumes that
10380+ * have no/conflicting minor assignments. This function will
10381+ * search from high(255) minor values down, for the first available
10382+ * minor. Searching high to low minimizes the possibility of
10383+ * conflicting evms volumes causing "compatibility" minor
10384+ * assignments to shift from expected assignments.
10385+ *
10386+ */
10387+static void
10388+eelv_assign_remaining_evms_volume_minors(struct evms_logical_node
10389+ **discover_list)
10390+{
10391+ struct evms_logical_node *next_node, *node;
10392+ int requested_minor, node_removed;
10393+
10394+ for (node = *discover_list; node; node = next_node) {
10395+ next_node = node->next;
10396+
10397+ node_removed = FALSE;
10398+ /* only process evms volumes */
10399+ /* all remaining evms volumes should now
10400+ * have a minor value of 0, meaning they
10401+ * had no minor assignment, or their minor
10402+ * assignment conflicted with an existing
10403+ * minor assignment.
10404+ */
10405+ if (node->flags & EVMS_VOLUME_FLAG) {
10406+ evms_cs_remove_logical_node_from_list(discover_list,
10407+ node);
10408+ node_removed = TRUE;
10409+ /* find next available minor number */
10410+ for (requested_minor = 255;
10411+ (evms_logical_volumes[requested_minor].node ||
10412+ evms_logical_volumes[requested_minor].name) &&
10413+ requested_minor; requested_minor--) ;
10414+ /* check range of assigned minor */
10415+ if (!requested_minor) {
10416+ LOG_CRITICAL
10417+ ("no more minor numbers available for evms volumes!!!!\n");
10418+ DELETE(node);
10419+ } else
10420+ /* assign requested minor */
10421+ eelv_assign_volume_minor(node, requested_minor);
10422+ }
10423+ }
10424+}
10425+
10426+/*
10427+ * function: eelv_assign_remaining_volume_minors
10428+ *
10429+ * This is a support function for evms_export_logical_volumes.
10430+ * This routine assigns minor numbers to all remaining unassigned
10431+ * volumes. Minor numbers are assigned on an availability
10432+ * basis. The first free minor number is used in the assignment.
10433+ *
10434+ */
10435+static void
10436+eelv_assign_remaining_volume_minors(struct evms_logical_node **discover_list)
10437+{
10438+ struct evms_logical_node *node;
10439+ int minor;
10440+
10441+ while (*discover_list) {
10442+ node = *discover_list;
10443+ evms_cs_remove_logical_node_from_list(discover_list, node);
10444+
10445+ /* find next available minor number */
10446+ for (minor = 1;
10447+ (evms_logical_volumes[minor].node ||
10448+ evms_logical_volumes[minor].name) &&
10449+ minor < MAX_EVMS_VOLUMES; minor++) ;
10450+
10451+ if (minor >= MAX_EVMS_VOLUMES) {
10452+ LOG_CRITICAL
10453+ ("no more minor numbers available for compatibility volumes!!!!\n");
10454+ DELETE(node);
10455+ } else
10456+ /* assign minor */
10457+ eelv_assign_volume_minor(node, minor);
10458+ }
10459+}
10460+
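+/* Editorial sketch (not in the original patch): the first-free minor
+ * scan above expressed as a standalone helper. An entry is free when
+ * it has neither a node nor a (soft-deleted) name; the helper name is
+ * hypothetical.
+ */
+#if 0
+static int
+find_free_minor(void)
+{
+	int minor;
+
+	for (minor = 1; minor < MAX_EVMS_VOLUMES; minor++)
+		if (!evms_logical_volumes[minor].node &&
+		    !evms_logical_volumes[minor].name)
+			return minor;
+	return -1;	/* no free minors available */
+}
+#endif
+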
10461+/*
10462+ * function: eelv_check_for_unreassign_soft_deleted_volume
10463+ *
10464+ * This is a support function for evms_export_logical_volumes.
10465+ * This routine reports any "soft deleted" volumes that were not
10466+ * found after a rediscovery.
10467+ */
10468+static void
10469+eelv_check_for_unreassign_soft_deleted_volume(void)
10470+{
10471+ struct evms_logical_volume *lv;
10472+ int i;
10473+
10474+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10475+ lv = &evms_logical_volumes[i];
10476+ /* only check soft deleted volumes:
10477+ * they have a NULL node ptr &
10478+ * they have a non-NULL name.
10479+ */
10480+ if (lv->flags & EVMS_VOLUME_SOFT_DELETED) {
10481+ if (is_open(i))
10482+ lv->flags |= EVMS_VOLUME_CORRUPT;
10483+ LOG_ERROR
10484+ ("error: rediscovery failed to find %smounted 'soft deleted' volume(%u,%u,%s)...\n",
10485+ ((lv->flags & EVMS_VOLUME_CORRUPT) ? "" : "un"),
10486+ EVMS_MAJOR, i, lv->name);
10487+ if (lv->flags & EVMS_VOLUME_CORRUPT) {
10488+ LOG_ERROR
10489+ (" flagging volume(%u,%u,%s) as CORRUPT!\n",
10490+ EVMS_MAJOR, i, lv->name);
10491+ } else {
10492+ LOG_ERROR
10493+ (" releasing minor(%d) used by volume(%s)!\n",
10494+ i, lv->name);
10495+ /* clear logical volume structure
10496+ * for this volume so it may be
10497+ * reused.
10498+ */
10499+ kfree(lv->name);
10500+ lv->name = NULL;
10501+ lv->flags = 0;
10502+ }
10503+ }
10504+ }
10505+}
10506+
10507+static void
10508+eelv_unquiesce_volumes(void)
10509+{
10510+ int i;
10511+
10512+ /* check each volume array entry */
10513+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10514+ struct evms_logical_volume *volume;
10515+
10516+ volume = &evms_logical_volumes[i];
10517+ /* is this volume "quiesced" ? */
10518+ if (volume->quiesced) {
10519+ int rc = 1;
10520+ if (volume->node) {
10521+ /* "unquiesce" it */
10522+ struct inode inode;
10523+ struct evms_quiesce_vol_pkt qv;
10524+
10525+ qv.command = qv.status = 0;
10526+ qv.do_vfs = 0;
10527+ qv.minor = i;
10528+ rc = evms_quiesce_volume(volume, &inode, NULL,
10529+ &qv);
10530+ }
10531+ /* Wake up any waiters */
10532+ if (rc) {
10533+ /* clear the flag */
10534+ volume->quiesced = 0;
10535+ /* wake up the waiters */
10536+ if (waitqueue_active(&volume->wait_queue))
10537+ wake_up(&volume->wait_queue);
10538+#ifdef VFS_PATCH_PRESENT
10539+ /* unquiesce VFS if quiesced */
10540+ if (volume->vfs_quiesced) {
10541+ /* VFS function call to unlock the filesystem */
10542+ unlockfs(MKDEV(EVMS_MAJOR, i));
10543+ volume->vfs_quiesced = FALSE;
10544+ }
10545+#endif
10546+ }
10547+ }
10548+ }
10549+}
10550+
10551+/*
10552+ * Function: evms_export_logical_volumes
10553+ *
10554+ * This function is called from evms_discover_volumes. It
10555+ * checks for duplicate volumes, assigns minor values to evms
10556+ * volumes, and assigns minor values to the remaining volumes.
10557+ * In addition to assigning minor values to each volume, this
10558+ * function also completes the final steps necessary to allow
10559+ * the volumes to be used by the operating system.
10560+ */
10561+static void
10562+evms_export_logical_volumes(struct evms_logical_node **discover_list)
10563+{
10564+ LOG_EXTRA("exporting EVMS logical volumes...\n");
10565+
10566+ eelv_check_for_duplicity(discover_list);
10567+
10568+ eelv_reassign_soft_deleted_volume_minors(discover_list);
10569+
10570+ eelv_assign_evms_volume_minors(discover_list);
10571+
10572+ eelv_assign_remaining_evms_volume_minors(discover_list);
10573+
10574+ eelv_assign_remaining_volume_minors(discover_list);
10575+
10576+ eelv_check_for_unreassign_soft_deleted_volume();
10577+
10578+ /* "unquiesce" any "quiesced" volumes */
10579+ eelv_unquiesce_volumes();
10580+}
10581+
10582+static int
10583+edv_populate_discover_list(struct evms_list_node *src_list,
10584+ struct evms_logical_node **trg_list,
10585+ struct evms_rediscover_pkt *discover_parms)
10586+{
10587+ int rc = 0, i, move_node, use_all_disks = FALSE;
10588+ struct evms_list_node *src_node;
10589+ struct evms_logical_node *disk_node = NULL;
10590+
10591+ /* if no discover parameters are specified */
10592+ /* copy ALL the disk nodes into the */
10593+ /* discovery list. */
10594+ if ((discover_parms == NULL) ||
10595+ (discover_parms->drive_count == REDISCOVER_ALL_DEVICES))
10596+ use_all_disks = TRUE;
10597+
10598+ /* copy the disk nodes specified in the */
10599+ /* discover_parms over to a discover list */
10600+ src_node = src_list;
10601+ while (src_node) {
10602+ move_node = use_all_disks;
10603+ if (move_node == FALSE)
10604+ /* check the rediscovery array */
10605+ for (i = 0; i < discover_parms->drive_count; i++) {
10606+ disk_node =
10607+ DEV_HANDLE_TO_NODE(discover_parms->
10608+ drive_array[i]);
10609+ if (disk_node == src_node->item) {
10610+ move_node = TRUE;
10611+ break;
10612+ }
10613+ }
10614+ /* check to see if we want this node */
10615+ if (move_node == TRUE)
10616+ evms_cs_add_logical_node_to_list(trg_list,
10617+ (struct
10618+ evms_logical_node *)
10619+ src_node->item);
10620+ /* advance to next struct evms_list_node */
10621+ src_node = src_node->next;
10622+ }
10623+ return (rc);
10624+}
10625+
10626+static int
10627+evms_discover_volumes(struct evms_rediscover_pkt *discover_parms)
10628+{
10629+ int rc = 0;
10630+ struct evms_logical_node *discover_list = NULL;
10631+
10632+ evms_discover_logical_disks(&discover_list);
10633+ if (evms_global_device_list) {
10634+ /* move the appropriate disk nodes, based on */
10635+ /* on the discover parameters, onto the */
10636+ /* discover list for the partition managers */
10637+ /* to process */
10638+ edv_populate_discover_list(evms_global_device_list,
10639+ &discover_list, discover_parms);
10640+ }
10641+ if (discover_list) {
10642+#ifdef LOCAL_DEBUG
10643+ display_discover_list(discover_list, "after dev mgrs");
10644+#endif
10645+ evms_discover_logical_partitions(&discover_list);
10646+ }
10647+ if (discover_list) {
10648+#ifdef LOCAL_DEBUG
10649+ display_discover_list(discover_list, "after seg mgrs");
10650+#endif
10651+ evms_discover_volume_groups(&discover_list);
10652+ }
10653+ if (discover_list) {
10654+#ifdef LOCAL_DEBUG
10655+ display_discover_list(discover_list, "after reg mgrs");
10656+#endif
10657+ evms_discover_evms_features(&discover_list);
10658+ }
10659+ if (discover_list) {
10660+#ifdef LOCAL_DEBUG
10661+ display_discover_list(discover_list, "after features");
10662+#endif
10663+ evms_export_logical_volumes(&discover_list);
10664+ evms_cs_signal_event(EVMS_EVENT_END_OF_DISCOVERY);
10665+ }
10666+ return (rc);
10667+}
10668+
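+/* Discovery pipeline summary: device managers build the disk list,
+ * segment managers discover partitions, region managers assemble
+ * volume groups, EVMS feature plugins stack volume features, and the
+ * surviving nodes are exported as minor-numbered volumes before
+ * EVMS_EVENT_END_OF_DISCOVERY is signalled.
+ */
+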
10669+/* function: evms_notify_reboot
10670+ *
10671+ * this function gets called at shutdown time and is used
10672+ * to remove any evms controlled volumes from memory, thus
10673+ * allowing any plugins needing to flush internal caches
10674+ * to do so.
10675+ */
10676+int
10677+evms_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
10678+{
10679+ int i;
10680+ struct evms_logical_volume *volume;
10681+
10682+ switch (code) {
10683+ case SYS_DOWN:
10684+ case SYS_HALT:
10685+ case SYS_POWER_OFF:
10686+ LOG_DEFAULT("stopping all evms controlled volumes.\n");
10687+
10688+ /* quiesce all volumes */
10689+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10690+ struct evms_quiesce_vol_pkt qv;
10691+ struct inode inode;
10692+
10693+ volume = &evms_logical_volumes[i];
10694+ if (!volume->node)
10695+ continue;
10696+ qv.command = 1; // quiesce
10697+ qv.minor = i; //
10698+ qv.status = 0; // reset status
10699+ qv.do_vfs = 0;
10700+ evms_quiesce_volume(volume, &inode, NULL, &qv);
10701+ }
10702+ /* delete all volumes
10703+ *
10704+		 * to ensure this works under
10705+		 * most circumstances, a "soft"
10706+		 * delete will be done. This will
10707+ * handle the strange case of a
10708+ * volume still being mounted.
10709+ */
10710+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10711+ struct evms_delete_vol_pkt dv;
10712+
10713+ volume = &evms_logical_volumes[i];
10714+ if (!volume->node)
10715+ continue;
10716+ /* only delete quiesced volumes */
10717+ if (!volume->quiesced)
10718+ continue;
10719+ /* delete the volume from memory.
10720+ * do a 'soft' delete if volume
10721+ * is mounted, and 'hard' delete
10722+ * if it is not.
10723+ */
10724+ dv.command = is_open(i);
10725+ dv.minor = i;
10726+ dv.status = 0;
10727+ evms_delete_volume(volume, &dv);
10728+ }
10729+ }
10730+ return NOTIFY_DONE;
10731+}
10732+
10733+static struct notifier_block evms_notifier = {
10734+ .notifier_call = evms_notify_reboot,
10735+ .next = NULL,
10736+ .priority = INT_MAX, /* before any real devices */
10737+};
10738+
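+/* Editorial note: the INT_MAX priority places this notifier ahead of
+ * device drivers on the reboot chain, so volumes are quiesced and
+ * deleted before the underlying devices shut down. The notifier is
+ * registered via register_reboot_notifier() in evms_init_module().
+ */
+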
10739+/*
10740+ * Function: find_root_fs_dev
10741+ * If "root=/dev/evms/???" was specified on the kernel command line, and devfs
10742+ * is not enabled, we need to determine the appropriate minor number for the
10743+ * specified volume for the root fs.
10744+ */
10745+static void
10746+find_root_fs_dev(void)
10747+{
10748+#ifndef MODULE
10749+ char root_name[64] = { 0 };
10750+ char *name;
10751+ int i;
10752+
10753+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,18)
10754+ strncpy(root_name, root_device_name, 63);
10755+#else
10756+ get_root_device_name(root_name);
10757+#endif
10758+
10759+ if (!strncmp(root_name, EVMS_DIR_NAME "/", strlen(EVMS_DIR_NAME) + 1)) {
10760+ name = &root_name[strlen(EVMS_DIR_NAME) + 1];
10761+
10762+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10763+ if (evms_logical_volumes[i].name &&
10764+ !strncmp(name, evms_logical_volumes[i].name,
10765+ strlen(evms_logical_volumes[i].name))) {
10766+ ROOT_DEV = MKDEV(EVMS_MAJOR, i);
10767+ return;
10768+ }
10769+ }
10770+ }
10771+#endif
10772+}
10773+
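+/* Example (editorial; the volume name is hypothetical): booting with
+ * "root=/dev/evms/lvol1" yields a root_name of "evms/lvol1"; the
+ * EVMS_DIR_NAME prefix is stripped and "lvol1" is matched against
+ * evms_logical_volumes[i].name to select ROOT_DEV.
+ */
+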
10774+/*
10775+ * Function: io_notify_cache_ctor
10776+ * this function initializes the entries
10777+ * in our private io_notify pool.
10778+ */
10779+static void
10780+io_notify_cache_ctor(void *foo, kmem_cache_t * cachep, unsigned long flags)
10781+{
10782+ if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
10783+ SLAB_CTOR_CONSTRUCTOR) {
10784+ io_notify_t *io_notify = (io_notify_t *) foo;
10785+ memset(io_notify, 0, sizeof (*io_notify));
10786+ }
10787+}
10788+
10789+/*
10790+ * Function: bh_cache_ctor
10791+ * this function initializes the b_wait field in the buffer heads
10792+ * in our private buffer head pool.
10793+ */
10794+static void
10795+bh_cache_ctor(void *foo, kmem_cache_t * cachep, unsigned long flags)
10796+{
10797+ if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
10798+ SLAB_CTOR_CONSTRUCTOR) {
10799+ struct buffer_head *bh = (struct buffer_head *) foo;
10800+ memset(bh, 0, sizeof (*bh));
10801+ init_waitqueue_head(&bh->b_wait);
10802+ }
10803+}
10804+
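+/* Editorial sketch: how a constructor such as bh_cache_ctor() is
+ * typically wired to a 2.4 slab cache. The EVMS core actually goes
+ * through evms_cs_create_pool() (see evms_init_module() below); the
+ * cache name here is hypothetical.
+ */
+#if 0
+	kmem_cache_t *cache;
+
+	cache = kmem_cache_create("example-bh-cache",
+				  sizeof (struct buffer_head), 0,
+				  SLAB_HWCACHE_ALIGN,
+				  bh_cache_ctor, NULL);
+#endif
+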
10805+/*
10806+ * Function: evms_init_module
10807+ * This function runs once at system initialization.
10808+ */
10809+static int __init
10810+evms_init_module(void)
10811+{
10812+ int rc = 0, i;
10813+ int *evms_blocksizes;
10814+
10815+ LOG_DEFAULT("EVMS v%d.%d.%d initializing .... info level(%d).\n",
10816+ EVMS_MAJOR_VERSION,
10817+ EVMS_MINOR_VERSION,
10818+ EVMS_PATCHLEVEL_VERSION, evms_info_level);
10819+
10820+ /* initialize memory management counters */
10821+ evms_allocs = (atomic_t) ATOMIC_INIT(0);
10822+ evms_logical_nodes = (atomic_t) ATOMIC_INIT(0);
10823+
10824+ /* initialize the io_notify_entry pool */
10825+ if (!rc)
10826+ evms_io_notify_pool = evms_cs_create_pool(sizeof (io_notify_t),
10827+ "EVMS IO Notify",
10828+ io_notify_cache_ctor,
10829+ NULL);
10830+
10831+ /* initialize the "public" buffer_head pool */
10832+ if (!rc)
10833+ evms_bh_pool = evms_cs_create_pool(sizeof (struct buffer_head),
10834+ "EVMS BH",
10835+ bh_cache_ctor, NULL);
10836+
10837+ /* allocate the logical volume array */
10838+ if (!rc)
10839+ evms_logical_volumes =
10840+ kmalloc(sizeof (struct evms_logical_volume) *
10841+ MAX_EVMS_VOLUMES, GFP_KERNEL);
10842+ if (!evms_logical_volumes) {
10843+ rc = -ENOMEM;
10844+ }
10845+
10846+ /* initialize the logical volume array entries */
10847+ if (!rc) {
10848+ memset(evms_logical_volumes, 0,
10849+ sizeof (struct evms_logical_volume) * MAX_EVMS_VOLUMES);
10850+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10851+ struct evms_logical_volume *volume;
10852+
10853+ volume = &evms_logical_volumes[i];
10854+ init_waitqueue_head(&volume->wait_queue);
10855+ volume->requests_in_progress =
10856+ (atomic_t) ATOMIC_INIT(0);
10857+#ifdef CONFIG_SMP
10858+ blk_init_queue(&volume->request_queue,
10859+ evms_do_request_fn);
10860+ blk_queue_make_request(&volume->request_queue,
10861+ evms_make_request_fn);
10862+#endif
10863+ }
10864+ }
10865+
10866+ /* allocate EVMS' blk_size array */
10867+ if (!rc) {
10868+ evms_blocksizes = kmalloc(MAX_EVMS_VOLUMES *
10869+ sizeof (int), GFP_KERNEL);
10870+ if (!evms_blocksizes) {
10871+ rc = -ENOMEM;
10872+ LOG_CRITICAL
10873+ ("can't allocate memory for EVMS blk_size\n");
10874+ } else {
10875+ memset(evms_blocksizes, 0,
10876+ MAX_EVMS_VOLUMES * sizeof (int));
10877+ blk_size[EVMS_MAJOR] = evms_blocksizes;
10878+ }
10879+ }
10880+
10881+ /* allocate EVMS' blksize_size array */
10882+ if (!rc) {
10883+ evms_blocksizes = kmalloc(MAX_EVMS_VOLUMES *
10884+ sizeof (int), GFP_KERNEL);
10885+ if (!evms_blocksizes) {
10886+ rc = -ENOMEM;
10887+ LOG_CRITICAL
10888+ ("can't allocate memory for EVMS blksize_size\n");
10889+ } else {
10890+ memset(evms_blocksizes, 0,
10891+ MAX_EVMS_VOLUMES * sizeof (int));
10892+ blksize_size[EVMS_MAJOR] = evms_blocksizes;
10893+ }
10894+ }
10895+
10896+ /* allocate EVMS' hardsect_size array */
10897+ if (!rc) {
10898+ evms_blocksizes = kmalloc(MAX_EVMS_VOLUMES *
10899+ sizeof (int), GFP_KERNEL);
10900+ if (!evms_blocksizes) {
10901+ rc = -ENOMEM;
10902+ LOG_CRITICAL
10903+ ("can't allocate memory for EVMS hardsect_size\n");
10904+ } else {
10905+ memset(evms_blocksizes, 0,
10906+ MAX_EVMS_VOLUMES * sizeof (int));
10907+ hardsect_size[EVMS_MAJOR] = evms_blocksizes;
10908+ }
10909+ }
10910+
10911+ /* Register the block device */
10912+ if (!rc) {
10913+ rc = devfs_register_blkdev(EVMS_MAJOR, EVMS_DIR_NAME,
10914+ &evms_fops);
10915+ if (rc) {
10916+ LOG_CRITICAL
10917+ ("error calling devfs_register_blkdev() err=%u\n",
10918+ rc);
10919+ rc = -EINVAL;
10920+ }
10921+ }
10922+
10923+ /* Register with devfs */
10924+ if (!rc) {
10925+ evms_dir_devfs_handle = devfs_mk_dir(NULL, EVMS_DIR_NAME, NULL);
10926+ // A NULL return cannot be fatal.
10927+ // Devfs just might not be running
10928+ if (!evms_dir_devfs_handle) {
10929+ LOG_EXTRA
10930+ ("NULL return from devfs_mk_dir() for \"%s\"\n",
10931+ EVMS_DIR_NAME);
10932+ LOG_EXTRA("Is devfs enabled?\n");
10933+ } else {
10934+ evms_blk_devfs_handle =
10935+ devfs_register(evms_dir_devfs_handle, EVMS_DEV_NAME,
10936+ DEVFS_FL_DEFAULT, EVMS_MAJOR, 0,
10937+ S_IFBLK | S_IRUGO | S_IWUGO,
10938+ &evms_fops, NULL);
10939+ if (!evms_blk_devfs_handle) {
10940+ LOG_DETAILS
10941+ ("NULL return from devfs_register() for \"%s\"\n",
10942+ EVMS_DEV_NAME);
10943+ }
10944+ }
10945+ }
10946+
10947+ if (!rc) {
10948+ read_ahead[EVMS_MAJOR] = 4096;
10949+#ifdef CONFIG_SMP
10950+ blk_dev[EVMS_MAJOR].queue = evms_find_queue;
10951+#else
10952+ blk_init_queue(BLK_DEFAULT_QUEUE(EVMS_MAJOR),
10953+ evms_do_request_fn);
10954+ blk_queue_make_request(BLK_DEFAULT_QUEUE(EVMS_MAJOR),
10955+ evms_make_request_fn);
10956+#endif
10957+#ifdef CONFIG_PROC_FS
10958+ evms_cs_get_evms_proc_dir();
10959+ if (evms_proc_dir) {
10960+ create_proc_read_entry("info", 0, evms_proc_dir,
10961+ evms_info_read_proc, NULL);
10962+ create_proc_read_entry("plugins", 0, evms_proc_dir,
10963+ evms_plugins_read_proc, NULL);
10964+ create_proc_read_entry("volumes", 0, evms_proc_dir,
10965+ evms_volumes_read_proc, NULL);
10966+ }
10967+ evms_table_header = register_sysctl_table(dev_dir_table, 1);
10968+#endif
10969+ /* Register for reboot notification */
10970+ register_reboot_notifier(&evms_notifier);
10971+
10972+#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64)
10973+ /* Register evms 32bit ioctl handlers */
10974+ lock_kernel();
10975+ register_ioctl32_conversion(EVMS_GET_INFO_LEVEL,NULL);
10976+ register_ioctl32_conversion(EVMS_SET_INFO_LEVEL,NULL);
10977+ register_ioctl32_conversion(EVMS_REDISCOVER_VOLUMES_32,
10978+ evms_rediscover);
10979+ register_ioctl32_conversion(EVMS_DELETE_VOLUME,NULL);
10980+ register_ioctl32_conversion(EVMS_PLUGIN_IOCTL_32,
10981+ evms_plugin_ioctl);
10982+ register_ioctl32_conversion(EVMS_PROCESS_NOTIFY_EVENT,NULL);
10983+ register_ioctl32_conversion(EVMS_GET_LOGICAL_DISK,NULL);
10984+ register_ioctl32_conversion(EVMS_GET_LOGICAL_DISK_INFO,NULL);
10985+ register_ioctl32_conversion(EVMS_SECTOR_IO_32, evms_sector_io);
10986+ register_ioctl32_conversion(EVMS_GET_MINOR,NULL);
10987+ register_ioctl32_conversion(EVMS_GET_VOLUME_DATA,NULL);
10988+ register_ioctl32_conversion(EVMS_GET_PLUGIN,NULL);
10989+ register_ioctl32_conversion(EVMS_COMPUTE_CSUM_32,
10990+ evms_compute_csum);
10991+ register_ioctl32_conversion(EVMS_GET_BMAP,NULL);
10992+ register_ioctl32_conversion(EVMS_GET_IOCTL_VERSION,NULL);
10993+ register_ioctl32_conversion(EVMS_GET_VERSION,NULL);
10994+ register_ioctl32_conversion(EVMS_UPDATE_DEVICE_INFO,NULL);
10995+ register_ioctl32_conversion(EVMS_CHECK_MOUNT_STATUS,NULL);
10996+ register_ioctl32_conversion(EVMS_GET_VOL_STRIPE_INFO,NULL);
10997+ unlock_kernel();
10998+#endif
10999+
11000+ }
11001+
11002+ return rc;
11003+}
11004+
11005+/*
11006+ * Function: evms_exit_module
11007+ * This function runs once when the EVMS core module is unloaded.
11008+ */
11009+static void __exit
11010+evms_exit_module(void)
11011+{
11012+ LOG_DEFAULT("EVMS v%d.%d.%d unloading ....\n",
11013+ EVMS_MAJOR_VERSION,
11014+ EVMS_MINOR_VERSION, EVMS_PATCHLEVEL_VERSION);
11015+
11016+#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64)
11017+ /* Un-Register evms 32bit ioctl handlers */
11018+ lock_kernel();
11019+ unregister_ioctl32_conversion(EVMS_GET_INFO_LEVEL);
11020+ unregister_ioctl32_conversion(EVMS_SET_INFO_LEVEL);
11021+ unregister_ioctl32_conversion(EVMS_REDISCOVER_VOLUMES_32);
11022+ unregister_ioctl32_conversion(EVMS_DELETE_VOLUME);
11023+ unregister_ioctl32_conversion(EVMS_PLUGIN_IOCTL_32);
11024+ unregister_ioctl32_conversion(EVMS_PROCESS_NOTIFY_EVENT);
11025+ unregister_ioctl32_conversion(EVMS_GET_LOGICAL_DISK);
11026+ unregister_ioctl32_conversion(EVMS_GET_LOGICAL_DISK_INFO);
11027+ unregister_ioctl32_conversion(EVMS_SECTOR_IO_32);
11028+ unregister_ioctl32_conversion(EVMS_GET_MINOR);
11029+ unregister_ioctl32_conversion(EVMS_GET_VOLUME_DATA);
11030+ unregister_ioctl32_conversion(EVMS_GET_PLUGIN);
11031+ unregister_ioctl32_conversion(EVMS_COMPUTE_CSUM_32);
11032+ unregister_ioctl32_conversion(EVMS_GET_BMAP);
11033+ unregister_ioctl32_conversion(EVMS_GET_IOCTL_VERSION);
11034+ unregister_ioctl32_conversion(EVMS_GET_VERSION);
11035+ unregister_ioctl32_conversion(EVMS_UPDATE_DEVICE_INFO);
11036+ unregister_ioctl32_conversion(EVMS_CHECK_MOUNT_STATUS);
11037+ unregister_ioctl32_conversion(EVMS_GET_VOL_STRIPE_INFO);
11038+ unlock_kernel();
11039+#endif
11040+
11041+ /* unregister with devfs
11042+ */
11043+ devfs_unregister(evms_dir_devfs_handle);
11044+ /* clean up the queue for the block device
11045+ */
11046+ blk_cleanup_queue(blk_get_queue(MKDEV(EVMS_MAJOR, 0)));
11047+ /* unregister block device
11048+ */
11049+ devfs_unregister_blkdev(EVMS_MAJOR, EVMS_DIR_NAME);
11050+ /* deallocate device arrays
11051+ */
11052+ kfree(blk_size[EVMS_MAJOR]);
11053+ blk_size[EVMS_MAJOR] = NULL;
11054+ kfree(blksize_size[EVMS_MAJOR]);
11055+ blksize_size[EVMS_MAJOR] = NULL;
11056+ kfree(hardsect_size[EVMS_MAJOR]);
11057+ hardsect_size[EVMS_MAJOR] = NULL;
11058+ read_ahead[EVMS_MAJOR] = 0;
11059+ /* deallocate logical volumes array
11060+ */
11061+ kfree(evms_logical_volumes);
11062+ /* destroy buffer head pool
11063+ */
11064+ evms_cs_destroy_pool(evms_bh_pool);
11065+ /* destroy io notify pool
11066+ */
11067+ evms_cs_destroy_pool(evms_io_notify_pool);
11068+#ifdef CONFIG_PROC_FS
11069+ if (evms_proc_dir) {
11070+ remove_proc_entry("volumes", evms_proc_dir);
11071+ remove_proc_entry("plugins", evms_proc_dir);
11072+ remove_proc_entry("info", evms_proc_dir);
11073+ remove_proc_entry("evms", NULL);
11074+ }
11075+ unregister_sysctl_table(evms_table_header);
11076+#endif
11077+}
11078+
11079+/*
11080+ * Function: evms_init_discover
11081+ * If EVMS is statically built into the kernel, this function will be called
11082+ * to perform an initial volume discovery.
11083+ */
11084+int __init
11085+evms_init_discover(void)
11086+{
11087+ /* go find volumes */
11088+ evms_discover_volumes(NULL);
11089+
11090+ /* Check if the root fs is on EVMS */
11091+ if (MAJOR(ROOT_DEV) == EVMS_MAJOR) {
11092+ find_root_fs_dev();
11093+ }
11094+
11095+ return 0;
11096+}
11097+
11098+/*
11099+ * a placeholder for cluster enablement
11100+ */
11101+void
11102+evms_cluster_init(int nodeid, int clusterid)
11103+{
11104+ /* dummy */
11105+ return;
11106+}
11107+
11108+EXPORT_SYMBOL(evms_cluster_init);
11109+
11110+/*
11111+ * a placeholder for cluster enablement
11112+ */
11113+int
11114+evms_cluster_shutdown(void)
11115+{
11116+ /* dummy */
11117+ return -1;
11118+}
11119+
11120+EXPORT_SYMBOL(evms_cluster_shutdown);
11121+
11122+static int __init
11123+evms_boot_info_level(char *str)
11124+{
11125+	int level = (int) simple_strtoul(str, NULL, 10);
11126+	if (level) {
11127+		evms_info_level = level;
11128+ }
11129+ return 1;
11130+}
11131+
11132+__setup("evms_info_level=", evms_boot_info_level);
11133+module_init(evms_init_module);
11134+module_exit(evms_exit_module);
11135+__initcall(evms_init_discover);
11136+#ifdef MODULE_LICENSE
11137+MODULE_LICENSE("GPL");
11138+#endif
11139+
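The evms_info_level= hook registered above via __setup() receives whatever text follows the matched prefix on the kernel command line. A rough user-space sketch of that prefix-dispatch pattern (the table and names below are illustrative, not the kernel's __setup machinery):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* User-space analogue of __setup(): match a "key=" prefix and hand
     * the remainder of the token to the registered handler. */
    static int info_level;

    static int set_info_level(const char *val)
    {
        int level = (int)strtoul(val, NULL, 10);
        if (level)
            info_level = level;
        return 1;
    }

    struct setup_entry { const char *prefix; int (*fn)(const char *); };

    static const struct setup_entry setup_table[] = {
        { "evms_info_level=", set_info_level },
    };

    static void parse_token(const char *tok)
    {
        size_t i, n;
        for (i = 0; i < sizeof(setup_table) / sizeof(setup_table[0]); i++) {
            n = strlen(setup_table[i].prefix);
            if (!strncmp(tok, setup_table[i].prefix, n)) {
                setup_table[i].fn(tok + n);
                return;
            }
        }
    }

    int main(void)
    {
        parse_token("evms_info_level=5");
        printf("info_level=%d\n", info_level);
        return 0;
    }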
11140+/**********************************************************/
11141+/* END -- INIT/DISCOVERY support functions */
11142+/**********************************************************/
11143diff -Naur linux-2002-09-30/drivers/evms/evms_bbr.c evms-2002-09-30/drivers/evms/evms_bbr.c
11144--- linux-2002-09-30/drivers/evms/evms_bbr.c Wed Dec 31 18:00:00 1969
11145+++ evms-2002-09-30/drivers/evms/evms_bbr.c Wed Sep 25 15:04:22 2002
11146@@ -0,0 +1,1817 @@
11147+/* -*- linux-c -*- */
11148+/*
11149+ * Copyright (c) International Business Machines Corp., 2000
11150+ *
11151+ * This program is free software; you can redistribute it and/or modify
11152+ * it under the terms of the GNU General Public License as published by
11153+ * the Free Software Foundation; either version 2 of the License, or
11154+ * (at your option) any later version.
11155+ *
11156+ * This program is distributed in the hope that it will be useful,
11157+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
11158+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
11159+ * the GNU General Public License for more details.
11160+ *
11161+ * You should have received a copy of the GNU General Public License
11162+ * along with this program; if not, write to the Free Software
11163+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
11164+ */
11165+/* linux/drivers/evms/evms_bbr.c
11166+ *
11167+ * EVMS - Bad Block Relocation (BBR) Feature Plugin
11168+ *
11169+ * The BBR feature remaps I/O write failures to safe replacement locations
11170+ * on disk. Note that most disk drives have BBR built into them; this means
11171+ * that our software BBR is only activated once all of a drive's hardware
11172+ * BBR replacement sectors have been used.
11173+ */
11174+
11175+#define LOG_PREFIX "bbr: "
11176+
11177+#include <linux/config.h>
11178+#include <linux/kernel.h>
11179+#include <linux/module.h>
11180+#include <linux/mempool.h>
11181+#include <asm/uaccess.h>
11182+
11183+#include <linux/evms/evms.h>
11184+#include <linux/evms/evms_bbr_k.h>
11185+
11186+/* API prototypes. */
11187+static int bbr_discover(struct evms_logical_node ** discover_list);
11188+static int bbr_delete(struct evms_logical_node * node);
11189+static void bbr_read(struct evms_logical_node * node, struct buffer_head * bh);
11190+static void bbr_write(struct evms_logical_node * node, struct buffer_head * bh);
11191+static int bbr_ioctl(struct evms_logical_node * bbr_node,
11192+ struct inode * inode,
11193+ struct file * file,
11194+ unsigned int cmd,
11195+ unsigned long arg);
11196+static int bbr_direct_ioctl(struct inode * inode,
11197+ struct file * file,
11198+ unsigned int cmd,
11199+ unsigned long arg);
11200+static int bbr_init_io(struct evms_logical_node * bbr_node,
11201+ int io_flag,
11202+ u64 startLSN,
11203+ u64 nr_sects,
11204+ void * bufptr);
11205+
11206+/* Other function prototypes. */
11207+static int bbr_create_pools(void);
11208+static void bbr_destroy_pools(void);
11209+static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id);
11210+static void bbr_io_handler(void * void_data);
11211+static void bbr_free_private(struct bbr_private * bbr_id);
11212+static inline void bbr_list_add(struct bbr_private * bbr_id);
11213+
11214+/* List of all BBR nodes. */
11215+static struct bbr_private * bbr_instances = NULL;
11216+
11217+/* Data pertaining to the I/O thread. */
11218+static struct evms_thread * bbr_io_thread = NULL;
11219+static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
11220+static struct list_head bbr_io_list = LIST_HEAD_INIT(bbr_io_list);
11221+
11222+/* Global pools for bbr_io_buf's and bbr_remap's. */
11223+kmem_cache_t * bbr_io_buf_slab;
11224+mempool_t * bbr_io_buf_pool;
11225+kmem_cache_t * bbr_remap_slab;
11226+mempool_t * bbr_remap_pool;
11227+
11228+/* Plugin function table and header. */
11229+static struct evms_plugin_fops function_table = {
11230+ .discover = bbr_discover,
11231+ .delete = bbr_delete,
11232+ .read = bbr_read,
11233+ .write = bbr_write,
11234+ .init_io = bbr_init_io,
11235+ .ioctl = bbr_ioctl,
11236+ .direct_ioctl = bbr_direct_ioctl
11237+};
11238+
11239+static struct evms_plugin_header plugin_header = {
11240+ .id = SetPluginID(IBM_OEM_ID,
11241+ EVMS_FEATURE,
11242+ EVMS_BBR_FEATURE_ID),
11243+ .version = {
11244+ .major = EVMS_BBR_VERSION_MAJOR,
11245+ .minor = EVMS_BBR_VERSION_MINOR,
11246+ .patchlevel = EVMS_BBR_VERSION_PATCHLEVEL
11247+ },
11248+ .required_services_version = {
11249+ .major = EVMS_BBR_COMMON_SERVICES_MAJOR,
11250+ .minor = EVMS_BBR_COMMON_SERVICES_MINOR,
11251+ .patchlevel = EVMS_BBR_COMMON_SERVICES_PATCHLEVEL
11252+ },
11253+ .fops = &function_table
11254+};
11255+
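The function_table/plugin_header pair above is the entire contract between the EVMS core and this plugin: the core only ever calls through the fops pointers. A minimal user-space sketch of that ops-table pattern (all names are illustrative; this is not the EVMS API):

    #include <stdio.h>

    struct ops {
        int  (*discover)(void);
        void (*shutdown)(void);
    };

    struct plugin {
        const char *name;
        const struct ops *fops;
    };

    static int my_discover(void) { printf("discover\n"); return 0; }
    static void my_shutdown(void) { printf("shutdown\n"); }

    static const struct ops my_ops = {
        .discover = my_discover,
        .shutdown = my_shutdown,
    };

    static const struct plugin my_plugin = { "demo", &my_ops };

    int main(void)
    {
        /* The core calls through the table and never needs plugin internals. */
        my_plugin.fops->discover();
        my_plugin.fops->shutdown();
        return 0;
    }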
11256+/**
11257+ * le_meta_data_to_cpu
11258+ *
11259+ * Convert bbr meta data from on-disk (LE) format
11260+ * to the native cpu endian format.
11261+ */
11262+void le_meta_data_to_cpu(struct evms_bbr_metadata * md)
11263+{
11264+ md->signature = le32_to_cpup(&md->signature);
11265+ md->crc = le32_to_cpup(&md->crc);
11266+ md->block_size = le32_to_cpup(&md->block_size);
11267+ md->flags = le32_to_cpup(&md->flags);
11268+ md->sequence_number = le64_to_cpup(&md->sequence_number);
11269+ md->start_sect_bbr_table = le64_to_cpup(&md->start_sect_bbr_table);
11270+ md->nr_sects_bbr_table = le64_to_cpup(&md->nr_sects_bbr_table);
11271+ md->start_replacement_sect = le64_to_cpup(&md->start_replacement_sect);
11272+ md->nr_replacement_blks = le64_to_cpup(&md->nr_replacement_blks);
11273+}
11274+
11275+/**
11276+ * le_bbr_table_sector_to_cpu
11277+ *
11278+ * Convert bbr meta data from on-disk (LE) format
11279+ * to the native cpu endian format.
11280+ */
11281+void le_bbr_table_sector_to_cpu(struct evms_bbr_table * p)
11282+{
11283+ int i;
11284+ p->signature = le32_to_cpup(&p->signature);
11285+ p->crc = le32_to_cpup(&p->crc);
11286+ p->sequence_number = le32_to_cpup(&p->sequence_number);
11287+ p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
11288+ for ( i = 0; i < EVMS_BBR_ENTRIES_PER_SECT; i++ ) {
11289+ p->entries[i].bad_sect =
11290+ le64_to_cpup(&p->entries[i].bad_sect);
11291+ p->entries[i].replacement_sect =
11292+ le64_to_cpup(&p->entries[i].replacement_sect);
11293+ }
11294+}
11295+
11296+/**
11297+ * cpu_bbr_table_sector_to_le
11298+ *
11299+ * Convert bbr meta data from cpu endian format to on-disk (LE) format
11300+ */
11301+void cpu_bbr_table_sector_to_le(struct evms_bbr_table * p,
11302+ struct evms_bbr_table * le)
11303+{
11304+ int i;
11305+ le->signature = cpu_to_le32p(&p->signature);
11306+ le->crc = cpu_to_le32p(&p->crc);
11307+ le->sequence_number = cpu_to_le32p(&p->sequence_number);
11308+ le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
11309+ for ( i = 0; i < EVMS_BBR_ENTRIES_PER_SECT; i++ ) {
11310+ le->entries[i].bad_sect =
11311+ cpu_to_le64p(&p->entries[i].bad_sect);
11312+ le->entries[i].replacement_sect =
11313+ cpu_to_le64p(&p->entries[i].replacement_sect);
11314+ }
11315+}
11316+
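le_meta_data_to_cpu() and the two table converters above follow the usual rule for on-disk little-endian metadata: swap every field once at load time, and swap into a scratch copy before writing back. A standalone sketch of a little-endian load that works on any host byte order (the struct is hypothetical; only the convert-on-load idea matters):

    #include <stdint.h>
    #include <stdio.h>

    /* Portable little-endian load, independent of host byte order. */
    static uint32_t le32_load(const void *p)
    {
        const uint8_t *b = p;
        return (uint32_t)b[0] | (uint32_t)b[1] << 8 |
               (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
    }

    struct demo_md {
        uint32_t signature;    /* stored little-endian on disk */
    };

    int main(void)
    {
        unsigned char raw[4] = { 0x0E, 0xBB, 0x0E, 0xBB };  /* as read from disk */
        struct demo_md md;

        md.signature = le32_load(raw);  /* native order from here on */
        printf("signature 0x%08X\n", md.signature);  /* prints 0xBB0EBB0E */
        return 0;
    }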
11317+#ifdef EVMS_BBR_DEBUG
11318+static void print_meta_data(struct evms_bbr_metadata * md)
11319+{
11320+ LOG_DEBUG("BBR Metadata Sector:\n"
11321+ " signature 0x%08X\n"
11322+ " crc 0x%08X\n"
11323+ " block_size %u\n"
11324+ " start_sect_bbr_table "PFU64"\n"
11325+ " nr_sects_bbr_table "PFU64"\n"
11326+ " start_replacement_sect "PFU64"\n"
11327+ " nr_replacement_blks "PFU64"\n",
11328+ md->signature, md->crc, md->block_size,
11329+ md->start_sect_bbr_table, md->nr_sects_bbr_table,
11330+ md->start_replacement_sect, md->nr_replacement_blks);
11331+}
11332+
11333+static void print_bbr_table_sector(struct evms_bbr_table * p)
11334+{
11335+ int i;
11336+ LOG_DEBUG("BBR Table Sector:\n"
11337+ " sig 0x%08X\n"
11338+ " crc 0x%08X\n"
11339+ " sequence %u\n"
11340+ " in_use_cnt %u\n"
11341+ " Table Entries:\n",
11342+ p->signature, p->crc, p->sequence_number, p->in_use_cnt);
11343+ for ( i = 0; i < EVMS_BBR_ENTRIES_PER_SECT; i++ ) {
11344+ LOG_DEBUG(" [%d] bad_sect: "PFU64" replacement_sect: "PFU64"\n",
11345+ i, p->entries[i].bad_sect,
11346+ p->entries[i].replacement_sect);
11347+ }
11348+}
11349+
11350+void print_binary_tree(struct bbr_runtime_remap * node)
11351+{
11352+ if (node) {
11353+ LOG_DEFAULT("["PFU64","PFU64"]\n", node->remap.bad_sect,
11354+ node->remap.replacement_sect);
11355+ print_binary_tree(node->left);
11356+ print_binary_tree(node->right);
11357+ }
11358+}
11359+
11360+static void print_remap_list(struct bbr_private * bbr_id)
11361+{
11362+ if (bbr_id->remap_root) {
11363+ LOG_DEFAULT("%s for %s\n", __FUNCTION__, bbr_id->node->name);
11364+ print_binary_tree(bbr_id->remap_root);
11365+ }
11366+}
11367+#endif
11368+
11369+/**
11370+ * validate_bbr_table_sector
11371+ *
11372+ * Check the specified BBR table sector for a valid signature and CRC.
11373+ */
11374+static int validate_bbr_table_sector(struct evms_bbr_table * p)
11375+{
11376+ int rc = 0;
11377+ int org_crc, final_crc;
11378+
11379+ if ( le32_to_cpup(&p->signature) != EVMS_BBR_TABLE_SIGNATURE ) {
11380+ LOG_ERROR("BBR table signature doesn't match!\n");
11381+ LOG_ERROR("Sector has (0x%08X) expected(0x%08X)\n",
11382+ le32_to_cpup(&p->signature),
11383+ EVMS_BBR_TABLE_SIGNATURE);
11384+ rc = -EINVAL;
11385+ } else {
11386+ if (p->crc) {
11387+ org_crc = le32_to_cpup(&p->crc);
11388+ p->crc = 0;
11389+ final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, p,
11390+ sizeof(*p));
11391+ if ( final_crc != org_crc ) {
11392+ LOG_ERROR("CRC failed!\n");
11393+ LOG_ERROR("Sector has (0x%08X) calculated(0x%08X)\n",
11394+ org_crc, final_crc);
11395+ rc = -EINVAL;
11396+ }
11397+ p->crc = cpu_to_le32p(&org_crc);
11398+ } else {
11399+ LOG_ERROR("BBR table sector has no CRC!\n");
11400+ rc = -EINVAL;
11401+ }
11402+ }
11403+ if (rc)
11404+ BBR_DEBUG_PRINT_TABLE_SECTOR(p);
11405+ le_bbr_table_sector_to_cpu(p);
11406+ return rc;
11407+}
11408+
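validate_bbr_table_sector() relies on a common on-disk checksum convention: the stored CRC was computed with the crc field itself zeroed, so the checker zeroes the field, recomputes over the whole sector, compares, and puts the stored value back. A self-contained illustration of that pattern, using a made-up record and a plain bitwise CRC-32 (not the kernel's evms_cs_calculate_crc):

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* Toy CRC-32 (reflected, poly 0xEDB88320), enough to show the pattern. */
    static uint32_t crc32_buf(const void *buf, size_t len)
    {
        const uint8_t *p = buf;
        uint32_t crc = 0xFFFFFFFFu;
        size_t i;
        int bit;

        for (i = 0; i < len; i++) {
            crc ^= p[i];
            for (bit = 0; bit < 8; bit++)
                crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1));
        }
        return ~crc;
    }

    struct sector {            /* hypothetical on-disk record */
        uint32_t signature;
        uint32_t crc;          /* computed with this field zeroed */
        uint8_t  payload[56];
    };

    static int validate(struct sector *s)
    {
        uint32_t stored = s->crc, computed;

        s->crc = 0;                            /* CRC covers the struct... */
        computed = crc32_buf(s, sizeof(*s));   /* ...with crc zeroed */
        s->crc = stored;                       /* restore for later rewrites */
        return computed == stored ? 0 : -1;
    }

    int main(void)
    {
        struct sector s;

        memset(&s, 0, sizeof(s));
        s.signature = 0xBB0EBB0E;
        s.crc = 0;
        s.crc = crc32_buf(&s, sizeof(s));      /* writer side of the convention */
        printf("valid: %s\n", validate(&s) == 0 ? "yes" : "no");
        return 0;
    }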
11409+/**
11410+ * update_invalid_bbr_table_sector
11411+ *
11412+ * If one copy of a BBR table sector is bad, replace it with the valid copy.
11413+ */
11414+void update_invalid_bbr_table_sector(struct evms_logical_node * node,
11415+ struct evms_bbr_table * valid,
11416+ struct evms_bbr_table * invalid,
11417+ u64 lsn)
11418+{
11419+ int rc;
11420+ struct evms_bbr_table * tmp_bbr_table;
11421+
11422+ /* Correct the invalid bbr table sector */
11423+ memcpy(invalid, valid, sizeof(struct evms_bbr_table));
11424+
11425+ /* Allocate memory for I/O */
11426+ tmp_bbr_table = kmalloc(sizeof(struct evms_bbr_table), GFP_KERNEL);
11427+ if (tmp_bbr_table) {
11428+ memset(tmp_bbr_table, 0, sizeof(struct evms_bbr_table));
11429+ cpu_bbr_table_sector_to_le(valid, tmp_bbr_table);
11430+ LOG_WARNING("Correcting BBR table sector "PFU64"\n", lsn);
11431+ rc = INIT_IO(node, 1, lsn, 1, tmp_bbr_table);
11432+ if (rc) {
11433+ LOG_ERROR("Could not correct BBR table sector "PFU64".\n",
11434+ lsn);
11435+ }
11436+ kfree(tmp_bbr_table);
11437+ }
11438+}
11439+
11440+/**
11441+ * validate_bbr_table
11442+ *
11443+ * Validate the entire range of sectors in the BBR table.
11444+ */
11445+static u32 validate_bbr_table(struct evms_bbr_metadata * md,
11446+ struct evms_bbr_table * p)
11447+{
11448+ u32 i, nr_sects;
11449+
11450+ nr_sects = md->nr_sects_bbr_table;
11451+
11452+ for ( i = 0; i < nr_sects; i++, p++ ) {
11453+ if ( validate_bbr_table_sector(p) )
11454+ break;
11455+ }
11456+
11457+ if ( i != nr_sects ) {
11458+ LOG_SERIOUS("Stopped BBR table validation at sector %u.\n", i);
11459+ nr_sects = i;
11460+ }
11461+ LOG_DEBUG("Validated %u BBR table sectors.\n", nr_sects);
11462+ return nr_sects;
11463+}
11464+
11465+/**
11466+ * validate_bbr_tables
11467+ * @node: BBR node to validate.
11468+ * @MD1: Primary metadata sector.
11469+ * @MD2: Secondary metadata sector.
11470+ * @p1: Primary BBR table.
11471+ * @p2: Secondary BBR table.
11472+ *
11473+ * Validate both copies of the BBR table. If one of them is invalid,
11474+ * try to correct the errors using the valid copy.
11475+ */
11476+static u32 validate_bbr_tables(struct evms_logical_node * node,
11477+ struct evms_bbr_metadata * MD1,
11478+ struct evms_bbr_metadata * MD2,
11479+ struct evms_bbr_table * p1,
11480+ struct evms_bbr_table * p2)
11481+{
11482+ u32 i, rc1, rc2, nr_sects;
11483+
11484+ nr_sects = MD1->nr_sects_bbr_table;
11485+ if ( nr_sects != MD2->nr_sects_bbr_table ) {
11486+ nr_sects = (nr_sects < MD2->nr_sects_bbr_table) ?
11487+ nr_sects : MD2->nr_sects_bbr_table;
11488+		LOG_SERIOUS("Sizes of BBR tables don't match. Using %u\n",
11489+ nr_sects);
11490+ }
11491+
11492+ for ( i = 0; i < nr_sects; i++, p1++, p2++ ) {
11493+ rc1 = validate_bbr_table_sector(p1);
11494+ if (rc1) {
11495+ LOG_WARNING("Invalid BBR table sector at "PFU64".\n",
11496+ MD1->start_sect_bbr_table + i);
11497+ }
11498+ rc2 = validate_bbr_table_sector(p2);
11499+ if (rc2) {
11500+ LOG_WARNING("Invalid BBR table sector at "PFU64".\n",
11501+ MD2->start_sect_bbr_table + i);
11502+ }
11503+
11504+ /* Correct BBR table errors. */
11505+ if (rc1 && rc2) {
11506+ /* Cannot fix. */
11507+ break;
11508+ } else if (rc1) {
11509+ update_invalid_bbr_table_sector(node, p2, p1,
11510+ MD1->start_sect_bbr_table + i);
11511+ continue;
11512+ } else if (rc2) {
11513+ update_invalid_bbr_table_sector(node, p1, p2,
11514+ MD2->start_sect_bbr_table + i);
11515+ continue;
11516+ }
11517+
11518+ if ( p1->sequence_number != p2->sequence_number ) {
11519+ LOG_WARNING("Sequence numbers for BBR table index %u don't match.\n", i);
11520+			LOG_WARNING("MD1 sequence_nr=%u, MD2 sequence_nr=%u\n",
11521+ p1->sequence_number, p2->sequence_number);
11522+ if ( p1->sequence_number < p2->sequence_number ) {
11523+ update_invalid_bbr_table_sector(node, p2, p1,
11524+ MD1->start_sect_bbr_table + i);
11525+ } else {
11526+ update_invalid_bbr_table_sector(node, p1, p2,
11527+ MD2->start_sect_bbr_table + i);
11528+ }
11529+ }
11530+ }
11531+ if ( i != nr_sects ) {
11532+ LOG_SERIOUS("Stopped validation at sector %u\n", i);
11533+ nr_sects = i;
11534+ }
11535+ LOG_DEBUG("Validated %u BBR table sectors.\n", nr_sects);
11536+ return nr_sects;
11537+}
11538+
11539+/**
11540+ * validate_meta_data
11541+ *
11542+ * Check the specified BBR metadata sector for a valid signature and CRC.
11543+ */
11544+static int validate_meta_data(struct evms_bbr_metadata * md)
11545+{
11546+ int org_crc, final_crc;
11547+
11548+ BBR_DEBUG_PRINT_META_DATA(md);
11549+
11550+ if ( le32_to_cpup(&md->signature) != EVMS_BBR_SIGNATURE ) {
11551+ LOG_SERIOUS("BBR signature doesn't match!\n");
11552+ LOG_SERIOUS("Found: 0x%08X Expecting: 0x%08X\n",
11553+ le32_to_cpup(&md->signature), EVMS_BBR_SIGNATURE);
11554+ return -EINVAL;
11555+ }
11556+
11557+ if (md->crc) {
11558+ org_crc = le32_to_cpup(&md->crc);
11559+ md->crc = 0;
11560+ final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, md,
11561+ sizeof(*md));
11562+ if ( final_crc != org_crc ) {
11563+ LOG_ERROR("CRC failed!\n");
11564+ LOG_ERROR("Sector has (0x%08X) calculated(0x%08X)\n",
11565+ org_crc, final_crc);
11566+ return -EINVAL;
11567+ }
11568+ md->crc = cpu_to_le32p(&org_crc);
11569+ } else {
11570+ LOG_WARNING("Metadata sector has no CRC!\n");
11571+ }
11572+
11573+ le_meta_data_to_cpu(md);
11574+ return 0;
11575+}
11576+
11577+/**
11578+ * load_meta_data
11579+ * @node: BBR node to read metadata from.
11580+ * @lsn: Sector to read metadata from.
11581+ * @md: Pointer to return metadata structure.
11582+ * @bbr_table: Pointer to return BBR table.
11583+ *
11584+ * Load one copy of the BBR metadata. If the metadata is valid, load the
11585+ * corresponding copy of the BBR table.
11586+ */
11587+static int load_meta_data(struct evms_logical_node * node,
11588+ u64 lsn,
11589+ struct evms_bbr_metadata ** md,
11590+ struct evms_bbr_table ** bbr_table)
11591+{
11592+ int rc;
11593+
11594+ *md = NULL;
11595+ *bbr_table = NULL;
11596+
11597+ if (!lsn) {
11598+ LOG_WARNING("No sector specified for BBR metadata on %s.\n",
11599+ node->name);
11600+ return -ENODATA;
11601+ }
11602+
11603+ /* Allocate a buffer for the metadata sector. */
11604+ *md = kmalloc(sizeof(struct evms_bbr_metadata), GFP_KERNEL);
11605+ if (!*md) {
11606+ LOG_ERROR("kmalloc error creating metadata buffer for %s.\n",
11607+ node->name);
11608+ return -ENOMEM;
11609+ }
11610+
11611+ /* Read the metadata sector. */
11612+ rc = INIT_IO(node, 0, lsn, 1, *md);
11613+ if (rc) {
11614+ LOG_ERROR("init_io error on %s.\n", node->name);
11615+ kfree(*md);
11616+ *md = NULL;
11617+ return rc;
11618+ }
11619+
11620+ /* Validate the metadata sector. */
11621+ rc = validate_meta_data(*md);
11622+ if (rc) {
11623+ LOG_ERROR("Error validating metadata for %s.\n", node->name);
11624+ kfree(*md);
11625+ *md = NULL;
11626+ return rc;
11627+ }
11628+
11629+ /* Allocate a buffer for the BBR table. */
11630+ *bbr_table = kmalloc((*md)->nr_sects_bbr_table <<
11631+ EVMS_VSECTOR_SIZE_SHIFT, GFP_KERNEL);
11632+ if (!*bbr_table) {
11633+ LOG_ERROR("kmalloc error creating BBR table buffer for %s.\n",
11634+ node->name);
11635+ kfree(*md);
11636+ *md = NULL;
11637+ return -ENOMEM;
11638+ }
11639+
11640+ /* Read the BBR table but don't validate here. */
11641+ rc = INIT_IO(node, 0, (*md)->start_sect_bbr_table,
11642+ (*md)->nr_sects_bbr_table, *bbr_table);
11643+ if (rc) {
11644+ LOG_ERROR("init_io error on %s.\n", node->name);
11645+ kfree(*md);
11646+ *md = NULL;
11647+ kfree(*bbr_table);
11648+ *bbr_table = NULL;
11649+ }
11650+
11651+ return rc;
11652+}
11653+
11654+/**
11655+ * load_feature_data
11656+ * @node: BBR node
11657+ * @ID: Return pointer to BBR private data.
11658+ *
11659+ * Load both copies of the BBR metadata and table. If one is invalid, try
11660+ * to correct it using the valid copy. When a valid copy is found, create
11661+ * a private data structure for the specified node.
11662+ */
11663+static int load_feature_data(struct evms_logical_node * node,
11664+ struct bbr_private ** ID)
11665+{
11666+ struct evms_bbr_metadata * md1 = NULL;
11667+ struct evms_bbr_metadata * md2 = NULL;
11668+ struct evms_bbr_table * table1 = NULL;
11669+ struct evms_bbr_table * table2 = NULL;
11670+ u64 lba_table1 = 0, lba_table2 = 0;
11671+ u32 nr_sects = 0;
11672+ int rc = 0, rc1, rc2;
11673+
11674+ *ID = NULL;
11675+
11676+ /* Load metadata 1 */
11677+ rc1 = load_meta_data(node,
11678+ node->feature_header->feature_data1_start_lsn,
11679+ &md1, &table1);
11680+ /* Load metadata 2 */
11681+ rc2 = load_meta_data(node,
11682+ node->feature_header->feature_data2_start_lsn,
11683+ &md2, &table2);
11684+
11685+ if (rc1 && rc2) {
11686+ /* Both copies are bad? Cannot continue. */
11687+ rc = -ENODATA;
11688+ } else if (rc1 || rc2) {
11689+ /* One copy is bad. Use the good copy. */
11690+ if (rc1) {
11691+ lba_table2 = md2->start_sect_bbr_table;
11692+ kfree(table1);
11693+ kfree(md1);
11694+ table1 = table2;
11695+ table2 = NULL;
11696+ md1 = md2;
11697+ md2 = NULL;
11698+ } else {
11699+ lba_table1 = md1->start_sect_bbr_table;
11700+ }
11701+
11702+ nr_sects = validate_bbr_table(md1, table1);
11703+ if ( nr_sects == 0 ) {
11704+ rc = -ENODATA;
11705+ }
11706+ } else {
11707+ lba_table1 = md1->start_sect_bbr_table;
11708+ lba_table2 = md2->start_sect_bbr_table;
11709+ nr_sects = validate_bbr_tables(node, md1, md2, table1, table2);
11710+ if ( nr_sects == 0 ) {
11711+ rc = -ENODATA;
11712+ }
11713+ }
11714+
11715+ if (!rc && nr_sects) {
11716+ *ID = kmalloc(sizeof(struct bbr_private), GFP_KERNEL);
11717+ if (*ID) {
11718+ memset(*ID, 0, sizeof(struct bbr_private));
11719+ (*ID)->source = node;
11720+ (*ID)->blksize_in_sects = md1->block_size >>
11721+ EVMS_VSECTOR_SIZE_SHIFT;
11722+ (*ID)->remap_root = NULL;
11723+ (*ID)->lba_table1 = lba_table1;
11724+ (*ID)->lba_table2 = lba_table2;
11725+ (*ID)->bbr_table = table1;
11726+ (*ID)->nr_sects_bbr_table = nr_sects;
11727+ if ( nr_sects < md1->nr_sects_bbr_table ) {
11728+ LOG_WARNING("Making BBR node read-only\n");
11729+ (*ID)->flag |= EVMS_VOLUME_READ_ONLY;
11730+ }
11731+ (*ID)->nr_replacement_blks = nr_sects *
11732+ EVMS_BBR_ENTRIES_PER_SECT;
11733+ (*ID)->start_replacement_sect = md1->start_replacement_sect;
11734+ (*ID)->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
11735+ (*ID)->bbr_id_lock = SPIN_LOCK_UNLOCKED;
11736+ if ( !bbr_remap_pool || !bbr_io_buf_pool ) {
11737+ rc = bbr_create_pools();
11738+ }
11739+ if (!rc) {
11740+ atomic_set(&(*ID)->in_use_replacement_blks,
11741+ bbr_table_to_remap_list(*ID));
11742+ }
11743+ } else {
11744+ rc = -ENOMEM;
11745+ }
11746+ }
11747+
11748+ if (!rc) {
11749+ if (!bbr_io_thread) {
11750+ const char * name = "evms_bbr_io";
11751+ bbr_io_thread = evms_cs_register_thread(bbr_io_handler,
11752+ NULL, name);
11753+ if (!bbr_io_thread) {
11754+ rc = -EINVAL;
11755+ }
11756+ }
11757+ }
11758+
11759+ /* If error, free table1. */
11760+ if (rc) {
11761+ if (table1) {
11762+ kfree(table1);
11763+ }
11764+ if (*ID) {
11765+ (*ID)->bbr_table = NULL;
11766+ bbr_free_private(*ID);
11767+ (*ID) = NULL;
11768+ }
11769+ }
11770+
11771+ /* Will never use md1, md2 and table2 again */
11772+ if (md1) {
11773+ kfree(md1);
11774+ }
11775+ if (md2) {
11776+ kfree(md2);
11777+ }
11778+ if (table2) {
11779+ kfree(table2);
11780+ }
11781+
11782+ return rc;
11783+}
11784+
11785+/**
11786+ * bbr_binary_tree_insert
11787+ *
11788+ * Insert a node into the binary tree.
11789+ */
11790+void bbr_binary_tree_insert(struct bbr_runtime_remap ** root,
11791+ struct bbr_runtime_remap * newnode)
11792+{
11793+ struct bbr_runtime_remap ** node = root;
11794+ while (node && *node) {
11795+ if ( newnode->remap.bad_sect > (*node)->remap.bad_sect ) {
11796+ node = &((*node)->right);
11797+ } else {
11798+ node = &((*node)->left);
11799+ }
11800+ }
11801+
11802+ newnode->left = newnode->right = NULL;
11803+ *node = newnode;
11804+}
11805+
11806+/**
11807+ * bbr_binary_search
11808+ *
11809+ * Search for a node that contains bad_sect = lsn.
11810+ */
11811+struct bbr_runtime_remap * bbr_binary_search(struct bbr_runtime_remap * root,
11812+ u64 lsn)
11813+{
11814+ struct bbr_runtime_remap * node = root;
11815+ while (node) {
11816+ if (node->remap.bad_sect == lsn) {
11817+ break;
11818+ }
11819+ if ( lsn > node->remap.bad_sect ) {
11820+ node = node->right;
11821+ } else {
11822+ node = node->left;
11823+ }
11824+ }
11825+ return node;
11826+}
11827+
11828+/**
11829+ * bbr_binary_tree_destroy
11830+ *
11831+ * Destroy the binary tree.
11832+ */
11833+void bbr_binary_tree_destroy(struct bbr_runtime_remap * root,
11834+ struct bbr_private * bbr_id)
11835+{
11836+ struct bbr_runtime_remap ** link = NULL;
11837+ struct bbr_runtime_remap * node = root;
11838+
11839+ while (node) {
11840+ if (node->left) {
11841+ link = &(node->left);
11842+ node = node->left;
11843+ continue;
11844+ }
11845+ if (node->right) {
11846+ link = &(node->right);
11847+ node = node->right;
11848+ continue;
11849+ }
11850+
11851+ mempool_free(node, bbr_remap_pool);
11852+ if (node == root) {
11853+ /* If root is deleted, we're done. */
11854+ break;
11855+ }
11856+
11857+ /* Back to root. */
11858+ node = root;
11859+ *link = NULL;
11860+ }
11861+}
11862+
11863+static void bbr_free_remap(struct bbr_private * bbr_id)
11864+{
11865+ unsigned long flags;
11866+ spin_lock_irqsave(&bbr_id->bbr_id_lock, flags);
11867+ bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id);
11868+ bbr_id->remap_root = NULL;
11869+ spin_unlock_irqrestore(&bbr_id->bbr_id_lock, flags);
11870+}
11871+
11872+/**
11873+ * bbr_insert_remap_entry
11874+ *
11875+ * Create a new remap entry and add it to the binary tree for this node.
11876+ */
11877+static int bbr_insert_remap_entry(struct bbr_private * bbr_id,
11878+ struct evms_bbr_table_entry * new_bbr_entry)
11879+{
11880+ struct bbr_runtime_remap * newnode = NULL;
11881+ unsigned long flags;
11882+ int rc;
11883+
11884+ newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO);
11885+ if (!newnode) {
11886+ rc = -ENOMEM;
11887+ LOG_SERIOUS("Could not allocate from remap pool! (rc=%d)\n", rc);
11888+ return rc;
11889+ }
11890+ newnode->remap.bad_sect = new_bbr_entry->bad_sect;
11891+ newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
11892+ spin_lock_irqsave(&bbr_id->bbr_id_lock, flags);
11893+ bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
11894+ spin_unlock_irqrestore(&bbr_id->bbr_id_lock, flags);
11895+ return 0;
11896+}
11897+
11898+/**
11899+ * bbr_table_to_remap_list
11900+ *
11901+ * The on-disk BBR table is sorted by the replacement sector LBA. In order to
11902+ * improve run-time performance, the in-memory remap list must be sorted by
11903+ * the bad sector LBA. This function is called at discovery time to initialize
11904+ * the remap list. It assumes that at least one copy of the metadata
11905+ * is valid.
11906+ */
11907+static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id)
11908+{
11909+ u32 in_use_blks = 0;
11910+ int i, j;
11911+ struct evms_bbr_table * p;
11912+
11913+
11914+ for ( i = 0, p = bbr_id->bbr_table;
11915+ i < bbr_id->nr_sects_bbr_table;
11916+ i++, p++ ) {
11917+ if (!p->in_use_cnt) {
11918+ break;
11919+ }
11920+ in_use_blks += p->in_use_cnt;
11921+ for ( j = 0; j < p->in_use_cnt; j++ ) {
11922+ bbr_insert_remap_entry(bbr_id, &p->entries[j]);
11923+ }
11924+ }
11925+
11926+ return in_use_blks;
11927+}
11928+
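The comment above captures the key data-structure decision in this plugin: the table on disk is ordered by replacement LBA (so new entries are appended), while lookups on the I/O path are keyed by bad LBA, so discovery re-keys everything into a binary tree. A user-space sketch of the same insert/search, minus locking (types and sector numbers are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    struct remap {             /* mirrors the bad/replacement pair */
        uint64_t bad, repl;
        struct remap *left, *right;
    };

    /* Unbalanced insert keyed on the bad sector, as in bbr_binary_tree_insert(). */
    static void tree_insert(struct remap **root, struct remap *n)
    {
        while (*root)
            root = (n->bad > (*root)->bad) ? &(*root)->right
                                           : &(*root)->left;
        n->left = n->right = NULL;
        *root = n;
    }

    static const struct remap *tree_search(const struct remap *n, uint64_t bad)
    {
        while (n && n->bad != bad)
            n = (bad > n->bad) ? n->right : n->left;
        return n;
    }

    int main(void)
    {
        /* Entries as they might sit on disk, ordered by replacement LBA. */
        static const uint64_t table[][2] = {
            { 9012, 500 }, { 77, 501 }, { 4242, 502 },
        };
        struct remap *root = NULL, nodes[3];
        const struct remap *hit;
        int i;

        for (i = 0; i < 3; i++) {
            nodes[i].bad = table[i][0];
            nodes[i].repl = table[i][1];
            tree_insert(&root, &nodes[i]);
        }
        hit = tree_search(root, 4242);
        if (hit)
            printf("sector 4242 -> %llu\n", (unsigned long long)hit->repl);
        return 0;
    }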
11929+/**
11930+ * bbr_search_remap_entry
11931+ *
11932+ * Search for a remap entry for the specified sector. If found, return a pointer to
11933+ * the table entry. Otherwise, return NULL.
11934+ */
11935+static struct evms_bbr_table_entry * bbr_search_remap_entry(struct bbr_private * bbr_id,
11936+ u64 lsn)
11937+{
11938+ struct bbr_runtime_remap * p;
11939+ unsigned long flags;
11940+
11941+ spin_lock_irqsave(&bbr_id->bbr_id_lock, flags);
11942+ p = bbr_binary_search(bbr_id->remap_root, lsn);
11943+ spin_unlock_irqrestore(&bbr_id->bbr_id_lock, flags);
11944+ if (p) {
11945+ return (&p->remap);
11946+ } else {
11947+ return NULL;
11948+ }
11949+}
11950+
11951+/**
11952+ * bbr_remap
11953+ *
11954+ * If *lsn is in the remap table, return TRUE and modify *lsn;
11955+ * otherwise, return FALSE.
11956+ */
11957+static inline int bbr_remap(struct bbr_private * bbr_id,
11958+ u64 * lsn)
11959+{
11960+ struct evms_bbr_table_entry *e;
11961+
11962+ if ( atomic_read(&bbr_id->in_use_replacement_blks) &&
11963+ ! (bbr_id->flag & BBR_STOP_REMAP) ) {
11964+ e = bbr_search_remap_entry(bbr_id, *lsn);
11965+ if (e) {
11966+ *lsn = e->replacement_sect;
11967+ LOG_EXTRA("%s replacement sector (LSN="PFU64")\n",
11968+ __FUNCTION__, *lsn);
11969+ return TRUE;
11970+ }
11971+ }
11972+ return FALSE;
11973+}
11974+
11975+/**
11976+ * bbr_remap_probe
11977+ *
11978+ * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
11979+ * table, return TRUE; otherwise, return FALSE.
11980+ */
11981+static inline int bbr_remap_probe(struct bbr_private * bbr_id,
11982+ u64 lsn, u64 nr_sects)
11983+{
11984+ u64 tmp, cnt;
11985+
11986+ if ( atomic_read(&bbr_id->in_use_replacement_blks) &&
11987+ ! (bbr_id->flag & BBR_STOP_REMAP) ) {
11988+ for ( cnt = 0, tmp = lsn;
11989+ cnt < nr_sects;
11990+ cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
11991+ if ( bbr_remap(bbr_id,&tmp) ) {
11992+ return TRUE;
11993+ }
11994+ }
11995+ }
11996+ return FALSE;
11997+}
11998+
11999+static void *bbr_slab_pool_alloc(int gfp_mask, void * data)
12000+{
12001+ return kmem_cache_alloc(data, gfp_mask);
12002+}
12003+
12004+static void bbr_slab_pool_free(void *ptr, void * data)
12005+{
12006+ kmem_cache_free(data, ptr);
12007+}
12008+
12009+static int bbr_create_pools(void)
12010+{
12011+ /* Create a memory pool for the remap list. */
12012+ if (!bbr_remap_slab) {
12013+ bbr_remap_slab = kmem_cache_create("BBR_Remap_Slab",
12014+ sizeof(struct bbr_runtime_remap),
12015+ 0, SLAB_HWCACHE_ALIGN,
12016+ NULL, NULL);
12017+ if (!bbr_remap_slab) {
12018+ panic("Unable to create BBR remap cache.");
12019+ }
12020+ }
12021+ if (!bbr_remap_pool) {
12022+ bbr_remap_pool = mempool_create(64, bbr_slab_pool_alloc,
12023+ bbr_slab_pool_free,
12024+ bbr_remap_slab);
12025+ if (!bbr_remap_pool) {
12026+ panic("Unable to create BBR remap pool.");
12027+ }
12028+ }
12029+
12030+ /* Create a memory pool for the BBR I/O anchors. */
12031+ if (!bbr_io_buf_slab) {
12032+ bbr_io_buf_slab = kmem_cache_create("BBR_IO_Buf_Slab",
12033+ sizeof(struct bbr_io_buffer),
12034+ 0, SLAB_HWCACHE_ALIGN,
12035+ NULL, NULL);
12036+ if (!bbr_io_buf_slab) {
12037+ panic("Unable to create BBR I/O buffer cache.");
12038+ }
12039+ }
12040+ if (!bbr_io_buf_pool) {
12041+ bbr_io_buf_pool = mempool_create(256, bbr_slab_pool_alloc,
12042+ bbr_slab_pool_free,
12043+ bbr_io_buf_slab);
12044+ if (!bbr_io_buf_pool) {
12045+ panic("Unable to create BBR I/O buffer pool.");
12046+ }
12047+ }
12048+
12049+ return 0;
12050+}
12051+
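bbr_create_pools() pairs each slab cache with a mempool so that allocations on the I/O path can fall back to a preallocated reserve instead of failing under memory pressure. A user-space sketch of that reserve idea (this is not the kernel mempool API; malloc stands in for the slab cache):

    #include <stdlib.h>
    #include <stdio.h>

    struct obj { struct obj *next; char payload[64]; };

    struct pool { struct obj *free; };

    /* Preallocate a reserve so later allocations can always make progress. */
    static int pool_init(struct pool *p, int reserve)
    {
        int i;

        p->free = NULL;
        for (i = 0; i < reserve; i++) {
            struct obj *o = malloc(sizeof(*o));
            if (!o)
                return -1;
            o->next = p->free;
            p->free = o;
        }
        return 0;
    }

    static struct obj *pool_alloc(struct pool *p)
    {
        struct obj *o = malloc(sizeof(*o));    /* try the allocator first */
        if (!o && p->free) {                   /* fall back to the reserve */
            o = p->free;
            p->free = o->next;
        }
        return o;
    }

    static void pool_free(struct pool *p, struct obj *o)
    {
        o->next = p->free;     /* returned objects refill the reserve */
        p->free = o;
    }

    int main(void)
    {
        struct pool p;
        struct obj *o;

        if (pool_init(&p, 4))
            return 1;
        o = pool_alloc(&p);
        printf("got %p\n", (void *)o);
        pool_free(&p, o);
        return 0;
    }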
12052+static void bbr_destroy_pools(void)
12053+{
12054+ if (bbr_io_buf_pool) {
12055+ mempool_destroy(bbr_io_buf_pool);
12056+ bbr_io_buf_pool = NULL;
12057+ }
12058+ if (bbr_io_buf_slab) {
12059+ kmem_cache_destroy(bbr_io_buf_slab);
12060+ bbr_io_buf_slab = NULL;
12061+ }
12062+ if (bbr_remap_pool) {
12063+ mempool_destroy(bbr_remap_pool);
12064+ bbr_remap_pool = NULL;
12065+ }
12066+ if (bbr_remap_slab) {
12067+ kmem_cache_destroy(bbr_remap_slab);
12068+ bbr_remap_slab = NULL;
12069+ }
12070+}
12071+
12072+/**
12073+ * bbr_discover
12074+ *
12075+ * Search through the discover list looking for objects with BBR metadata.
12076+ * Remove each one from the list and replace it with a new BBR node.
12077+ */
12078+static int bbr_discover(struct evms_logical_node ** discover_list)
12079+{
12080+ struct evms_logical_node * node, * next_node;
12081+ struct evms_logical_node * bbr_node = NULL;
12082+ struct bbr_private * bbr_id;
12083+ int bad_blocks, rc = 0;
12084+
12085+ MOD_INC_USE_COUNT;
12086+
12087+ next_node = *discover_list;
12088+ while (next_node) {
12089+ node = next_node;
12090+ next_node = node->next;
12091+
12092+ /* The node must have a BBR feature-header. */
12093+ if ( ! node->feature_header ||
12094+ node->feature_header->feature_id != plugin_header.id ) {
12095+ continue;
12096+ }
12097+
12098+ rc = load_feature_data(node, &bbr_id);
12099+ if (rc) {
12100+ /* Error loading feature data.
12101+ * This node belongs to us, but metadata is invalid,
12102+ * - remove it from the discovery list
12103+ * - delete it
12104+ * - clear error code then continue.
12105+			 * Will consider creating a read-only BBR node in
12106+ * the future.
12107+ */
12108+ LOG_SERIOUS("Error in node (%s) with "PFU64" sectors.\n",
12109+ node->name, node->total_vsectors);
12110+ evms_cs_remove_logical_node_from_list(discover_list,
12111+ node);
12112+ DELETE(node);
12113+ rc = 0;
12114+ continue;
12115+ }
12116+
12117+ rc = evms_cs_allocate_logical_node(&bbr_node);
12118+ if (rc) {
12119+ LOG_SERIOUS("Could not allocate logical node! rc=%d\n", rc);
12120+ bbr_free_private(bbr_id);
12121+ continue;
12122+ }
12123+
12124+ MOD_INC_USE_COUNT;
12125+ bbr_node->volume_info = node->volume_info;
12126+ bbr_node->flags |= node->flags;
12127+ bbr_node->plugin = &plugin_header;
12128+ strcpy(bbr_node->name,
12129+ node->feature_header->object_name);
12130+ bbr_node->hardsector_size = node->hardsector_size;
12131+ bbr_node->total_vsectors = node->total_vsectors - 2 -
12132+ node->feature_header->feature_data1_size -
12133+ node->feature_header->feature_data2_size;
12134+ bbr_node->block_size = node->block_size;
12135+ bbr_node->private = bbr_id;
12136+ bbr_id->node = bbr_node;
12137+
12138+ /* Free the feature header */
12139+ kfree(node->feature_header);
12140+ node->feature_header = NULL;
12141+ evms_cs_remove_logical_node_from_list(discover_list, node);
12142+
12143+ /* If bad blocks exist, give warning */
12144+ bad_blocks = atomic_read(&bbr_id->in_use_replacement_blks);
12145+ if (bad_blocks) {
12146+ BBR_DEBUG_PRINT_REMAP_LIST(bbr_id);
12147+ LOG_WARNING("%s has %d bad blocks.\n",
12148+ bbr_id->source->name, bad_blocks);
12149+ LOG_WARNING("There are "PFU64" total replacement blocks.\n",
12150+ bbr_id->nr_replacement_blks);
12151+ LOG_WARNING("There are "PFU64" remaining replacement blocks.\n",
12152+ bbr_id->nr_replacement_blks -
12153+ bad_blocks);
12154+ }
12155+
12156+ evms_cs_add_logical_node_to_list(discover_list, bbr_node);
12157+ bbr_list_add(bbr_id);
12158+ }
12159+
12160+ MOD_DEC_USE_COUNT;
12161+ return rc;
12162+}
12163+
12164+static inline void bbr_list_add(struct bbr_private * bbr_id)
12165+{
12166+ bbr_id->next = bbr_instances;
12167+ bbr_instances = bbr_id;
12168+}
12169+
12170+static void bbr_list_remove(struct bbr_private * bbr_id)
12171+{
12172+ struct bbr_private ** p;
12173+
12174+ for ( p = &bbr_instances; *p; p = &(*p)->next ) {
12175+ if ( *p == bbr_id ) {
12176+ *p = (*p)->next;
12177+ break;
12178+ }
12179+ }
12180+}
12181+
12182+static struct bbr_private * bbr_find_private(char * object_name)
12183+{
12184+ struct bbr_private * p;
12185+
12186+ for ( p = bbr_instances; p; p = p->next ) {
12187+ if ( ! strncmp(p->node->name, object_name,
12188+ EVMS_VOLUME_NAME_SIZE) ) {
12189+ return p;
12190+ }
12191+ }
12192+ return NULL;
12193+}
12194+
12195+static void bbr_free_private(struct bbr_private * bbr_id)
12196+{
12197+ if (bbr_id->remap_root) {
12198+ bbr_free_remap(bbr_id);
12199+ }
12200+ if (bbr_id->bbr_table) {
12201+ kfree(bbr_id->bbr_table);
12202+ }
12203+ bbr_list_remove(bbr_id);
12204+ kfree(bbr_id);
12205+}
12206+
12207+/**
12208+ * bbr_delete
12209+ *
12210+ * Delete the specified BBR node and the node it is built on. If the last BBR
12211+ * node is deleted, shut down the I/O thread.
12212+ */
12213+static int bbr_delete(struct evms_logical_node * bbr_node)
12214+{
12215+ struct bbr_private * bbr_id;
12216+ int rc;
12217+
12218+ bbr_id = bbr_node->private;
12219+
12220+ rc = DELETE(bbr_id->source);
12221+ if (!rc) {
12222+ /* Now cleanup and go away */
12223+ bbr_free_private(bbr_id);
12224+ evms_cs_deallocate_logical_node(bbr_node);
12225+ if (!bbr_instances) {
12226+ bbr_destroy_pools();
12227+ if (bbr_io_thread) {
12228+ evms_cs_unregister_thread(bbr_io_thread);
12229+ bbr_io_thread = NULL;
12230+ }
12231+ }
12232+ MOD_DEC_USE_COUNT;
12233+ }
12234+ return rc;
12235+}
12236+
12237+static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id,
12238+ struct buffer_head * bh,
12239+ int rw)
12240+{
12241+ struct bbr_io_buffer * bbr_io_buf;
12242+
12243+ bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO);
12244+ if (bbr_io_buf) {
12245+ memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer));
12246+ INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list);
12247+ bbr_io_buf->bbr_id = bbr_id;
12248+ bbr_io_buf->bh = bh;
12249+ bbr_io_buf->rw = rw;
12250+ } else {
12251+ LOG_WARNING("Could not allocate from BBR I/O buffer pool!\n");
12252+ }
12253+ return bbr_io_buf;
12254+}
12255+
12256+static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf)
12257+{
12258+ mempool_free(bbr_io_buf, bbr_io_buf_pool);
12259+}
12260+
12261+/**
12262+ * bbr_io_remap_error
12263+ * @bbr_id: Private data for the BBR node.
12264+ * @rw: READ or WRITE.
12265+ * @starting_lsn: Starting sector of request to remap.
12266+ * @count: Number of sectors in the request.
12267+ * @buffer: Data buffer for the request.
12268+ *
12269+ * For the requested range, try to write each sector individually. For each
12270+ * sector that fails, find the next available remap location and write the
12271+ * data to that new location. Then update the table and write both copies
12272+ * of the table to disk. Finally, update the in-memory mapping and do any
12273+ * other necessary bookkeeping.
12274+ */
12275+static int bbr_io_remap_error(struct bbr_private * bbr_id,
12276+ int rw,
12277+ u64 starting_lsn,
12278+ u64 count,
12279+ char * buffer )
12280+{
12281+ struct evms_bbr_table * bbr_table;
12282+ unsigned long table_sector_index;
12283+ unsigned long table_sector_offset;
12284+ unsigned long index;
12285+ u64 lsn, new_lsn;
12286+ int rc;
12287+
12288+ if ( rw == READ ) {
12289+ /* Nothing can be done about read errors. */
12290+ return -EIO;
12291+ }
12292+
12293+ /* For each sector in the request. */
12294+ for ( lsn = 0; lsn < count; lsn++, buffer += EVMS_VSECTOR_SIZE ) {
12295+ rc = INIT_IO(bbr_id->source, rw, starting_lsn + lsn, 1, buffer);
12296+ while (rc) {
12297+ if ( bbr_id->flag & BBR_STOP_REMAP ) {
12298+ /* Can't allow new remaps if the
12299+ * engine told us to stop.
12300+ */
12301+ LOG_ERROR("Object %s: Bad sector ("PFU64"), but remapping is turned off.\n",
12302+ bbr_id->node->name, starting_lsn+lsn);
12303+ return -EIO;
12304+ }
12305+
12306+ /* Find the next available relocation sector. */
12307+ new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
12308+ if ( new_lsn >= bbr_id->nr_replacement_blks ) {
12309+ /* No more replacement sectors available. */
12310+ return -EIO;
12311+ }
12312+ new_lsn += bbr_id->start_replacement_sect;
12313+
12314+ /* Write the data to its new location. */
12315+ LOG_WARNING("Object %s: Trying to remap bad sector ("PFU64") to sector ("PFU64")\n",
12316+ bbr_id->node->name, starting_lsn + lsn,
12317+ new_lsn);
12318+ rc = INIT_IO(bbr_id->source, rw, new_lsn, 1, buffer);
12319+ if (rc) {
12320+ /* This replacement sector is bad.
12321+ * Try the next one.
12322+ */
12323+ LOG_ERROR("Object %s: Replacement sector ("PFU64") is bad. Skipping.\n",
12324+ bbr_id->node->name, new_lsn);
12325+ atomic_inc(&bbr_id->in_use_replacement_blks);
12326+ continue;
12327+ }
12328+
12329+ /* Add this new entry to the on-disk table. */
12330+ table_sector_index = new_lsn -
12331+ bbr_id->start_replacement_sect;
12332+ table_sector_offset = table_sector_index /
12333+ EVMS_BBR_ENTRIES_PER_SECT;
12334+ index = table_sector_index % EVMS_BBR_ENTRIES_PER_SECT;
12335+
12336+ bbr_table = &bbr_id->bbr_table[table_sector_offset];
12337+ bbr_table->entries[index].bad_sect = starting_lsn + lsn;
12338+ bbr_table->entries[index].replacement_sect = new_lsn;
12339+ bbr_table->in_use_cnt++;
12340+ bbr_table->sequence_number++;
12341+ bbr_table->crc = 0;
12342+ bbr_table->crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC,
12343+ bbr_table,
12344+ sizeof(struct evms_bbr_table));
12345+
12346+ /* Write the table to disk. */
12347+ cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
12348+ if ( bbr_id->lba_table1 ) {
12349+ rc = INIT_IO(bbr_id->source, WRITE,
12350+ bbr_id->lba_table1 +
12351+ table_sector_offset,
12352+ 1, bbr_table);
12353+ }
12354+ if ( bbr_id->lba_table2 ) {
12355+ rc |= INIT_IO(bbr_id->source, WRITE,
12356+ bbr_id->lba_table2 +
12357+ table_sector_offset,
12358+ 1, bbr_table);
12359+ }
12360+ le_bbr_table_sector_to_cpu(bbr_table);
12361+
12362+ if (rc) {
12363+ /* Error writing one of the tables to disk. */
12364+ LOG_ERROR("Object %s: Error updating BBR tables on disk.\n",
12365+ bbr_id->node->name);
12366+ return rc;
12367+ }
12368+
12369+ /* Insert a new entry in the remapping binary-tree. */
12370+ rc = bbr_insert_remap_entry(bbr_id,
12371+ &bbr_table->entries[index]);
12372+ if (rc) {
12373+ LOG_ERROR("Object %s: Error adding new entry to remap tree.\n",
12374+ bbr_id->node->name);
12375+ return rc;
12376+ }
12377+
12378+ atomic_inc(&bbr_id->in_use_replacement_blks);
12379+ }
12380+ }
12381+
12382+ return 0;
12383+}
12384+
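In bbr_io_remap_error() above, replacement blocks are consumed strictly in order, so a new remap's position in the on-disk table follows from simple division, and only one table sector ever needs rewriting per new remap. A worked example of that index arithmetic, assuming 31 entries per table sector purely for illustration (the real value comes from the metadata layout via EVMS_BBR_ENTRIES_PER_SECT):

    #include <stdio.h>

    #define ENTRIES_PER_SECT 31    /* assumed for this illustration only */

    int main(void)
    {
        unsigned long table_sector_index = 70;  /* the 71st replacement block */
        unsigned long sector = table_sector_index / ENTRIES_PER_SECT;
        unsigned long slot   = table_sector_index % ENTRIES_PER_SECT;

        /* 70 / 31 = 2 and 70 % 31 = 8: the new entry lands in table
         * sector 2, slot 8, so only that sector is rewritten on disk. */
        printf("table sector %lu, slot %lu\n", sector, slot);
        return 0;
    }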
12385+/**
12386+ * bbr_io_process_request
12387+ *
12388+ * For each sector in this request, check if the sector has already
12389+ * been remapped. If so, process all previous sectors in the request,
12390+ * followed by the remapped sector. Then reset the starting lsn and
12391+ * count, and keep going with the rest of the request as if it were
12392+ * a whole new request. If any of the INIT_IO's return an error,
12393+ * call the remapper to relocate the bad sector(s).
12394+ */
12395+static int bbr_io_process_request(struct bbr_io_buffer * bbr_io_buf)
12396+{
12397+ struct bbr_private * bbr_id = bbr_io_buf->bbr_id;
12398+ u64 starting_lsn = bbr_io_buf->bh->b_rsector;
12399+ u64 count = bbr_io_buf->bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
12400+ u64 lsn, remapped_lsn;
12401+ char * buffer = bbr_io_buf->bh->b_data;
12402+ int rc = 0, rw = bbr_io_buf->rw;
12403+
12404+ /* For each sector in this request, check if this sector has already
12405+ * been remapped. If so, process all previous sectors in this request,
12406+ * followed by the remapped sector. Then reset the starting lsn and
12407+ * count and keep going with the rest of the request as if it were
12408+ * a whole new request.
12409+ */
12410+ for ( lsn = 0; lsn < count && !(bbr_id->flag & BBR_STOP_REMAP); lsn++ ) {
12411+ remapped_lsn = starting_lsn + lsn;
12412+ rc = bbr_remap(bbr_id, &remapped_lsn);
12413+ if (!rc) {
12414+ /* This sector is fine. */
12415+ continue;
12416+ }
12417+
12418+ /* Process all sectors in the request up to this one. */
12419+ if ( lsn > 0 ) {
12420+ rc = INIT_IO(bbr_id->source, rw,
12421+ starting_lsn, lsn, buffer);
12422+ if (rc) {
12423+ /* If this I/O failed, then one of the sectors
12424+ * in this request needs to be relocated.
12425+ */
12426+ rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
12427+ lsn, buffer);
12428+ if (rc) {
12429+ return rc;
12430+ }
12431+ }
12432+ buffer += (lsn << EVMS_VSECTOR_SIZE_SHIFT);
12433+ }
12434+
12435+ /* Process the remapped sector. */
12436+ rc = INIT_IO(bbr_id->source, rw, remapped_lsn, 1, buffer);
12437+ if (rc) {
12438+			/* BUGBUG - Need more processing if this caused
12439+			 * an error. If this I/O failed, then the existing
12440+ * remap is now bad, and we need to find a new remap.
12441+ * Can't use bbr_io_remap_error(), because the existing
12442+ * map entry needs to be changed, not added again, and
12443+ * the original table entry also needs to be changed.
12444+ */
12445+ return rc;
12446+ }
12447+
12448+ buffer += EVMS_VSECTOR_SIZE;
12449+ starting_lsn += (lsn + 1);
12450+ count -= (lsn + 1);
12451+ lsn = -1;
12452+ }
12453+
12454+ /* Check for any remaining sectors after the last split. This could
12455+ * potentially be the whole request, but that should be a rare case
12456+ * because requests should only be processed by the thread if we know
12457+ * an error occurred or they contained one or more remapped sectors.
12458+ */
12459+ if ( count ) {
12460+ rc = INIT_IO(bbr_id->source, rw, starting_lsn, count, buffer);
12461+ if (rc) {
12462+ /* If this I/O failed, then one of the sectors in this
12463+ * request needs to be relocated.
12464+ */
12465+ rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
12466+ count, buffer);
12467+ if (rc) {
12468+ return rc;
12469+ }
12470+ }
12471+ }
12472+
12473+ return 0;
12474+}
12475+
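The splitting logic in bbr_io_process_request() is easier to follow on a flat example: scan the request, and at each remapped sector flush the clean prefix as one I/O, issue the remapped sector on its own, then restart the scan on the remainder (the lsn = -1 trick relies on the unsigned increment wrapping back to zero). A user-space sketch with a stubbed remap lookup (all sector numbers invented):

    #include <stdint.h>
    #include <stdio.h>

    /* Stub: pretend only sector 105 is remapped. Illustrative only. */
    static int is_remapped(uint64_t lsn) { return lsn == 105; }

    static void do_io(uint64_t lsn, uint64_t count)
    {
        printf("io: %llu..%llu\n", (unsigned long long)lsn,
               (unsigned long long)(lsn + count - 1));
    }

    int main(void)
    {
        uint64_t start = 100, count = 10, lsn;

        for (lsn = 0; lsn < count; lsn++) {
            if (!is_remapped(start + lsn))
                continue;
            if (lsn > 0)
                do_io(start, lsn);       /* clean prefix as one I/O */
            do_io(9999, 1);              /* remapped sector on its own */
            start += lsn + 1;            /* restart after the split */
            count -= lsn + 1;
            lsn = (uint64_t)-1;          /* loop ++ wraps this back to 0 */
        }
        if (count)
            do_io(start, count);         /* clean tail */
        return 0;
    }

Running this prints io: 100..104, io: 9999..9999, io: 106..109 -- the ten-sector request split into three I/Os around the one remapped sector.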
12476+/**
12477+ * bbr_io_handler
12478+ *
12479+ * This is the handler for the bbr_io_thread. It continuously loops,
12480+ * taking I/O requests off its list and processing them. If nothing
12481+ * is on the list, the thread goes back to sleep until specifically
12482+ * woken up.
12483+ *
12484+ * I/O requests should only be sent to this thread if we know that:
12485+ * a) the request contains at least one remapped sector.
12486+ * or
12487+ * b) the request caused an error on the normal I/O path.
12488+ * This function uses synchronous I/O, so sending a request to this
12489+ * thread that doesn't need special processing will cause severe
12490+ * performance degradation.
12491+ */
12492+static void bbr_io_handler(void * void_data)
12493+{
12494+ struct bbr_io_buffer * bbr_io_buf;
12495+ struct buffer_head * bh;
12496+ unsigned long flags;
12497+ int rc = 0;
12498+
12499+ while (1) {
12500+ /* Process bbr_io_list, one entry at a time. */
12501+ spin_lock_irqsave(&bbr_io_list_lock, flags);
12502+ if (list_empty(&bbr_io_list)) {
12503+ /* No more items on the list. */
12504+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
12505+ break;
12506+ }
12507+ bbr_io_buf = list_entry(bbr_io_list.next,
12508+ struct bbr_io_buffer, bbr_io_list);
12509+ list_del(&bbr_io_buf->bbr_io_list);
12510+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
12511+
12512+ rc = bbr_io_process_request(bbr_io_buf);
12513+
12514+ /* Clean up and complete the original I/O. */
12515+ bh = bbr_io_buf->bh;
12516+ if (bh->b_end_io) {
12517+ free_bbr_io_buf(bbr_io_buf);
12518+ evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL);
12519+ bh->b_end_io(bh, rc ? 0 : 1);
12520+ } else {
12521+ /* A request that originated from bbr_init_io. */
12522+ bbr_io_buf->rc = rc;
12523+ complete(bbr_io_buf->complete);
12524+ }
12525+ }
12526+}
12527+
12528+/**
12529+ * bbr_schedule_io
12530+ *
12531+ * Place the specified bbr_io_buf on the thread's processing list.
12532+ */
12533+static void bbr_schedule_io(struct bbr_io_buffer * bbr_io_buf)
12534+{
12535+ unsigned long flags;
12536+
12537+ spin_lock_irqsave(&bbr_io_list_lock, flags);
12538+ list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list);
12539+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
12540+ evms_cs_wakeup_thread(bbr_io_thread);
12541+}
12542+
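bbr_schedule_io() is the producer half of a classic locked-list worker pattern: append under a spinlock, then wake the thread, which drains the list one entry at a time and sleeps when it is empty. A pthread sketch of the same shape, with a condition variable standing in for evms_cs_wakeup_thread() and a counter standing in for the list:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;
    static int queued, done;

    static void *worker(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        for (;;) {
            while (queued) {             /* drain the list... */
                queued--;
                pthread_mutex_unlock(&lock);
                printf("processing one request\n");
                pthread_mutex_lock(&lock);
            }
            if (done)
                break;
            pthread_cond_wait(&wake, &lock);   /* ...then sleep until woken */
        }
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    static void schedule_io(void)
    {
        pthread_mutex_lock(&lock);
        queued++;                        /* list_add_tail() analogue */
        pthread_mutex_unlock(&lock);
        pthread_cond_signal(&wake);      /* evms_cs_wakeup_thread() analogue */
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);
        schedule_io();
        schedule_io();

        pthread_mutex_lock(&lock);
        done = 1;
        pthread_mutex_unlock(&lock);
        pthread_cond_signal(&wake);

        pthread_join(t, NULL);
        return 0;
    }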
12543+/**
12544+ * bbr_read
12545+ *
12546+ * If there are any remapped sectors on this object, send this request over
12547+ * to the thread for processing. Otherwise send it down the stack normally.
12548+ */
12549+static void bbr_read(struct evms_logical_node * bbr_node,
12550+ struct buffer_head * bh )
12551+{
12552+ struct bbr_private * bbr_id = bbr_node->private;
12553+ struct bbr_io_buffer * bbr_io_buf;
12554+
12555+ if ( bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
12556+ bbr_node->total_vsectors ) {
12557+ /* Request is off the end of the object. */
12558+ bh->b_end_io(bh, 0);
12559+ return;
12560+ }
12561+
12562+ if ( atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
12563+ bbr_id->flag & BBR_STOP_REMAP ||
12564+ ! bbr_remap_probe(bbr_id, bh->b_rsector,
12565+ bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) ) {
12566+ /* No existing remaps, this request doesn't contain any
12567+ * remapped sectors, or the engine told us not to remap.
12568+ */
12569+ R_IO(bbr_id->source, bh);
12570+ return;
12571+ }
12572+
12573+ /* This request has at least one remapped sector. */
12574+ bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ);
12575+ if (!bbr_io_buf) {
12576+ /* Can't get memory to track the I/O. */
12577+ bh->b_end_io(bh, 0);
12578+ return;
12579+ }
12580+
12581+ evms_cs_volume_request_in_progress(bbr_io_buf->bh->b_rdev, +1, NULL);
12582+ bbr_schedule_io(bbr_io_buf);
12583+}
12584+
12585+/**
12586+ * bbr_write_callback
12587+ *
12588+ * This is the callback for normal write requests. Check for an error
12589+ * during the I/O, and send to the thread for processing if necessary.
12590+ */
12591+static void bbr_write_callback(struct buffer_head * bh,
12592+ int uptodate)
12593+{
12594+ struct bbr_io_buffer * bbr_io_buf = bh->b_private;
12595+
12596+ bh->b_end_io = bbr_io_buf->org_end_io;
12597+ bh->b_private = bbr_io_buf->org_private;
12598+ bh->b_rsector = bbr_io_buf->org_rsector;
12599+ bh->b_rdev = bbr_io_buf->org_dev;
12600+
12601+ if (!(bbr_io_buf->bbr_id->flag & BBR_STOP_REMAP) &&
12602+ !uptodate) {
12603+ LOG_ERROR("Object %s: Write failure on sector ("PFU64"). Scheduling for retry.\n",
12604+ bbr_io_buf->bbr_id->node->name, (u64)bbr_io_buf->bh->b_rsector);
12605+ bbr_schedule_io(bbr_io_buf);
12606+ } else {
12607+ free_bbr_io_buf(bbr_io_buf);
12608+ evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL);
12609+ bh->b_end_io(bh, uptodate);
12610+ }
12611+}
12612+
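bbr_write() and bbr_write_callback() use completion interposition: save the buffer head's original end_io and b_private, substitute the plugin's own callback, and have that callback restore the saved values before either retrying or passing the completion through. A reduced sketch of the idiom (struct and names invented for illustration):

    #include <stdio.h>

    struct req {
        void (*end_io)(struct req *, int ok);
        void *private;
    };

    struct hijack {
        void (*org_end_io)(struct req *, int ok);
        void *org_private;
    };

    static void user_end_io(struct req *r, int ok)
    {
        (void)r;
        printf("request finished, ok=%d\n", ok);
    }

    static void my_end_io(struct req *r, int ok)
    {
        struct hijack *h = r->private;

        /* Restore the caller's callback before doing anything else. */
        r->end_io = h->org_end_io;
        r->private = h->org_private;

        if (!ok)
            printf("would schedule a retry here\n");
        else
            r->end_io(r, ok);    /* pass the completion through */
    }

    int main(void)
    {
        struct req r = { user_end_io, NULL };
        struct hijack h = { r.end_io, r.private };

        r.end_io = my_end_io;    /* interpose */
        r.private = &h;

        r.end_io(&r, 1);         /* simulate a successful completion */
        return 0;
    }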
12613+/**
12614+ * bbr_write
12615+ *
12616+ * If there are any remapped sectors on this object, send the request over
12617+ * to the thread for processing. Otherwise, register for callback
12618+ * notification, and send the request down normally.
12619+ */
12620+static void bbr_write(struct evms_logical_node * bbr_node,
12621+ struct buffer_head * bh)
12622+{
12623+ struct bbr_private * bbr_id = bbr_node->private;
12624+ struct bbr_io_buffer * bbr_io_buf;
12625+
12626+ if ( bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
12627+ bbr_node->total_vsectors ||
12628+ bbr_id->flag & EVMS_VOLUME_READ_ONLY ) {
12629+ /* Request is off the end of the object, or this
12630+ * is a read-only object.
12631+ */
12632+ bh->b_end_io(bh, 0);
12633+ return;
12634+ }
12635+
12636+ bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE);
12637+ if (!bbr_io_buf) {
12638+ /* Can't get memory to track the I/O. */
12639+ bh->b_end_io(bh, 0);
12640+ return;
12641+ }
12642+
12643+ evms_cs_volume_request_in_progress(bh->b_rdev, +1, NULL);
12644+
12645+ if ( atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
12646+ bbr_id->flag & BBR_STOP_REMAP ||
12647+ ! bbr_remap_probe(bbr_id, bh->b_rsector,
12648+ bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) ) {
12649+ /* No existing remaps, this request contains no remapped
12650+ * sectors, or the engine said to stop remapping.
12651+ */
12652+ bbr_io_buf->org_end_io = bh->b_end_io;
12653+ bbr_io_buf->org_private = bh->b_private;
12654+ bbr_io_buf->org_rsector = bh->b_rsector;
12655+ bbr_io_buf->org_dev = bh->b_rdev;
12656+ bh->b_end_io = bbr_write_callback;
12657+ bh->b_private = bbr_io_buf;
12658+ W_IO(bbr_id->source, bh);
12659+ } else {
12660+ /* This request contains at least one remapped sector. */
12661+ bbr_schedule_io(bbr_io_buf);
12662+ }
12663+}
12664+
12665+/**
12666+ * bbr_init_io_schedule_io
12667+ * @bbr_id: Private data for the BBR node.
12668+ * @rw: READ or WRITE.
12669+ * @lsn: Starting sector for the request.
12670+ * @count: Number of sectors in the request.
12671+ * @buffer: Data buffer for the request.
12672+ *
12673+ * During init_io, failures must still be handled by the I/O thread. Create
12674+ * a bbr_io_buf, and schedule it to be handled by the thread. Then wait until
12675+ * the request is complete.
12676+ */
12677+static int bbr_init_io_schedule_io(struct bbr_private * bbr_id,
12678+ int rw,
12679+ u64 lsn,
12680+ u64 count,
12681+ void * buffer)
12682+{
12683+ struct bbr_io_buffer * bbr_io_buf;
12684+ struct buffer_head bh;
12685+ struct completion complete;
12686+ int rc = 0;
12687+
12688+ if ( rw != WRITE ) {
12689+ /* Nothing can be done about read failures. */
12690+ return -EIO;
12691+ }
12692+
12693+ LOG_ERROR("Object %s: init_io write failure (sector "PFU64": count "PFU64"). Scheduling for retry.\n",
12694+ bbr_id->node->name, lsn, count);
12695+ bbr_io_buf = allocate_bbr_io_buf(bbr_id, &bh, rw);
12696+ if (!bbr_io_buf) {
12697+ return -ENOMEM;
12698+ }
12699+
12700+ memset(&bh, 0, sizeof(struct buffer_head));
12701+ init_waitqueue_head(&bh.b_wait);
12702+ bh.b_rsector = lsn;
12703+ bh.b_size = count << EVMS_VSECTOR_SIZE_SHIFT;
12704+ bh.b_data = buffer;
12705+ bh.b_end_io = NULL;
12706+
12707+ /* Schedule the I/O and wait for it to finish. */
12708+ bbr_io_buf->complete = &complete;
12709+ init_completion(bbr_io_buf->complete);
12710+ bbr_schedule_io(bbr_io_buf);
12711+ wait_for_completion(bbr_io_buf->complete);
12712+
12713+ rc = bbr_io_buf->rc;
12714+ free_bbr_io_buf(bbr_io_buf);
12715+
12716+ return rc;
12717+}
12718+
12719+/**
12720+ * bbr_init_io
12721+ * @bbr_node: BBR node.
12722+ * @rw: READ or WRITE.
12723+ * @start_lsn: Starting sector for the I/O request.
12724+ * @count: Number of sectors in the I/O request.
12725+ * @buffer: Data buffer for the I/O request.
12726+ *
12727+ * Handle synchronous I/O requests.
12728+ */
12729+static int bbr_init_io(struct evms_logical_node * bbr_node,
12730+ int rw,
12731+ u64 start_lsn,
12732+ u64 count,
12733+ void * buffer )
12734+{
12735+ struct bbr_private * bbr_id = bbr_node->private;
12736+	u64 lsn, io_lsn;
12737+ int rc = 0;
12738+
12739+ if ( start_lsn + count > bbr_node->total_vsectors ) {
12740+ /* Request is off the end of the object. */
12741+ return -EINVAL;
12742+ }
12743+
12744+ if ( rw == WRITE && (bbr_id->flag & EVMS_VOLUME_READ_ONLY) ) {
12745+ /* Can't write to a read-only object. */
12746+ return -EINVAL;
12747+ }
12748+
12749+ if ( bbr_id->flag & BBR_STOP_REMAP ||
12750+ atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
12751+ ! bbr_remap_probe(bbr_id, start_lsn, count) ) {
12752+ /* Normal case (no existing remaps). */
12753+ rc = INIT_IO(bbr_id->source, rw, start_lsn, count, buffer);
12754+ if (rc && ! (bbr_id->flag & BBR_STOP_REMAP) ) {
12755+ /* Init_io error. Send request over to
12756+ * thread for further processing.
12757+ */
12758+ rc = bbr_init_io_schedule_io(bbr_id, rw, start_lsn,
12759+ count, buffer);
12760+ }
12761+ } else {
12762+		/* At least one sector in this request needs to be remapped.
12763+		 * Test and send each one down individually. Remap into a
12764+		 * copy so the loop counter itself is never modified. */
12765+		for ( lsn = start_lsn;
12766+		      lsn < start_lsn + count;
12767+		      lsn++, buffer += EVMS_VSECTOR_SIZE ) {
12768+			io_lsn = lsn; bbr_remap(bbr_id, &io_lsn);
12769+			rc = INIT_IO(bbr_id->source, rw, io_lsn, 1, buffer);
12770+ if (rc) {
12771+ /* Init_io error. Send request
12772+ * to thread for processing.
12773+ */
12774+ rc = bbr_init_io_schedule_io(bbr_id, rw,
12775+							 io_lsn, 1, buffer);
12776+ if (rc) {
12777+ break;
12778+ }
12779+ }
12780+ }
12781+ }
12782+
12783+ return rc;
12784+}
12785+
12786+/**
12787+ * bbr_direct_ioctl_sector_io
12788+ *
12789+ * Process an I/O from the engine on an active BBR object.
12790+ */
12791+static int bbr_direct_ioctl_sector_io(struct bbr_private * bbr_id,
12792+ struct evms_notify_bbr * notify)
12793+{
12794+ char * buffer, * user_buffer;
12795+ u64 lsn;
12796+ int rc = 0;
12797+
12798+ buffer = kmalloc(EVMS_VSECTOR_SIZE, GFP_NOIO);
12799+ if (!buffer) {
12800+ return -ENOMEM;
12801+ }
12802+
12803+ user_buffer = (char*)notify->buffer;
12804+
12805+ for ( lsn = 0;
12806+ lsn < notify->nr_sect;
12807+ lsn++, user_buffer += EVMS_VSECTOR_SIZE ) {
12808+ if ( notify->rw == WRITE ) {
12809+ if ( copy_from_user(buffer, user_buffer,
12810+ EVMS_VSECTOR_SIZE) ) {
12811+ rc = -EFAULT;
12812+ break;
12813+ }
12814+ }
12815+
12816+ rc = bbr_init_io(bbr_id->node, notify->rw,
12817+ notify->start_sect + lsn, 1, buffer);
12818+ if (rc) {
12819+ break;
12820+ }
12821+
12822+ if ( notify->rw == READ ) {
12823+ if ( copy_to_user(user_buffer, buffer,
12824+ EVMS_VSECTOR_SIZE) ) {
12825+ rc = -EFAULT;
12826+ break;
12827+ }
12828+ }
12829+ }
12830+
12831+ kfree(buffer);
12832+ return rc;
12833+}
12834+
12835+/**
12836+ * bbr_direct_ioctl
12837+ * @inode: N/A
12838+ * @file: N/A
12839+ * @cmd: N/A
12840+ * @arg: Pointer to an evms_plugin_ioctl_pkt.
12841+ *
12842+ * BBR-specific ioctls from the engine. Currently handles:
12843+ * BBR_STOP_REMAP_CMD
12844+ * BBR_GET_INFO_CMD
12845+ * BBR_SECTOR_IO_CMD
12846+ */
12847+static int bbr_direct_ioctl(struct inode * inode,
12848+ struct file * file,
12849+ unsigned int cmd,
12850+ unsigned long arg)
12851+{
12852+ int rc = 0;
12853+ struct bbr_private * bbr_id;
12854+ struct evms_plugin_ioctl_pkt pkt, * user_pkt;
12855+ struct evms_notify_bbr notify, * user_notify;
12856+
12857+ MOD_INC_USE_COUNT;
12858+
12859+ user_pkt = (struct evms_plugin_ioctl_pkt *)arg;
12860+ if ( copy_from_user(&pkt, user_pkt, sizeof(pkt)) ) {
12861+ MOD_DEC_USE_COUNT;
12862+ return -EFAULT;
12863+ }
12864+
12865+ if ( pkt.feature_id != plugin_header.id ) {
12866+ MOD_DEC_USE_COUNT;
12867+ return -EINVAL;
12868+ }
12869+
12870+ user_notify = (struct evms_notify_bbr *)pkt.feature_ioctl_data;
12871+ if ( copy_from_user(&notify, user_notify, sizeof(notify)) ) {
12872+ rc = -EFAULT;
12873+ } else {
12874+ bbr_id = bbr_find_private(notify.object_name);
12875+ if (!bbr_id) {
12876+ rc = -ENODEV;
12877+ } else {
12878+
12879+ switch(pkt.feature_command) {
12880+
12881+ case BBR_STOP_REMAP_CMD:
12882+ bbr_id->flag |= BBR_STOP_REMAP;
12883+ /* Fall through. */
12884+
12885+ case BBR_GET_INFO_CMD:
12886+ notify.count = atomic_read(&bbr_id->in_use_replacement_blks);
12887+ if ( copy_to_user(&user_notify->count,
12888+ &notify.count,
12889+ sizeof(user_notify->count))) {
12890+ rc = -EFAULT;
12891+ }
12892+ break;
12893+
12894+ case BBR_SECTOR_IO_CMD:
12895+ rc = bbr_direct_ioctl_sector_io(bbr_id,
12896+ &notify);
12897+ break;
12898+
12899+ default:
12900+ rc = -ENOSYS;
12901+ }
12902+ }
12903+ }
12904+
12905+ pkt.status = rc;
12906+ copy_to_user(user_pkt, &pkt, sizeof(pkt));
12907+ MOD_DEC_USE_COUNT;
12908+ return rc;
12909+}
12910+
12911+/**
12912+ * bbr_ioctl
12913+ * @bbr_node: BBR node.
12914+ * @inode: N/A
12915+ * @file: N/A
12916+ * @cmd: ioctl command to process.
12917+ * @arg: ioctl-specific data pointer.
12918+ *
12919+ * IOCTL handler. Currently BBR handles plugin-specific ioctls, as well as
12920+ * EVMS_GET_BMAP. All others are passed to the child node.
12921+ */
12922+static int bbr_ioctl (struct evms_logical_node * bbr_node,
12923+ struct inode * inode,
12924+ struct file * file,
12925+ unsigned int cmd,
12926+ unsigned long arg)
12927+{
12928+ struct bbr_private * bbr_id = bbr_node->private;
12929+ struct evms_get_bmap_pkt * bmap;
12930+ int rc = 0;
12931+
12932+ switch (cmd) {
12933+ case EVMS_PLUGIN_IOCTL:
12934+ rc = bbr_direct_ioctl(inode, file, cmd, arg);
12935+ break;
12936+
12937+ case EVMS_GET_BMAP:
12938+ bmap = (struct evms_get_bmap_pkt *)arg;
12939+ bbr_remap(bbr_id, &bmap->rsector);
12940+		/* fall through */
12941+
12942+ default:
12943+ rc = IOCTL(bbr_id->source, inode, file, cmd, arg);
12944+ }
12945+ return rc;
12946+}
12947+
12948+static int __init bbr_init(void)
12949+{
12950+ return evms_cs_register_plugin(&plugin_header);
12951+}
12952+
12953+static void __exit bbr_exit(void)
12954+{
12955+ evms_cs_unregister_plugin(&plugin_header);
12956+}
12957+
12958+module_init(bbr_init);
12959+module_exit(bbr_exit);
12960+#ifdef MODULE_LICENSE
12961+MODULE_LICENSE("GPL");
12962+#endif
12963+
12964diff -Naur linux-2002-09-30/drivers/evms/evms_drivelink.c evms-2002-09-30/drivers/evms/evms_drivelink.c
12965--- linux-2002-09-30/drivers/evms/evms_drivelink.c Wed Dec 31 18:00:00 1969
12966+++ evms-2002-09-30/drivers/evms/evms_drivelink.c Fri Sep 13 16:09:55 2002
12967@@ -0,0 +1,1274 @@
12968+/* -*- linux-c -*-
12969+ *
12970+ *
12971+ * Copyright (c) International Business Machines Corp., 2000
12972+ *
12973+ * This program is free software; you can redistribute it and/or modify
12974+ * it under the terms of the GNU General Public License as published by
12975+ * the Free Software Foundation; either version 2 of the License, or
12976+ * (at your option) any later version.
12977+ *
12978+ * This program is distributed in the hope that it will be useful,
12979+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12980+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12981+ * the GNU General Public License for more details.
12982+ *
12983+ * You should have received a copy of the GNU General Public License
12984+ * along with this program; if not, write to the Free Software
12985+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
12986+ *
12987+ *
12988+ */
12989+/*
12990+ * linux/drivers/evms/evms_drivelink.c
12991+
12992+ *
12993+ * EVMS Drive Linking Feature.
12994+ *
12995+ * This feature provides the ability to link multiple storage objects
12996+ * together as a single virtual storage object.
12997+ *
12998+ */
12999+
13000+#include <linux/module.h>
13001+#include <linux/kernel.h>
13002+#include <linux/config.h>
13003+#include <linux/genhd.h>
13004+#include <linux/blk.h>
13005+#include <linux/evms/evms.h>
13006+#include <linux/evms/evms_drivelink.h>
13007+#include <asm/uaccess.h>
13008+
13009+#define LOG_PREFIX "drivelink: "
13010+
13011+/* prototypes for mandatory plugin interface functions */
13012+static int drivelink_discover(struct evms_logical_node **);
13013+static int drivelink_delete(struct evms_logical_node *);
13014+static void drivelink_read(struct evms_logical_node *, struct buffer_head *);
13015+static void drivelink_write(struct evms_logical_node *, struct buffer_head *);
13016+static int drivelink_ioctl(struct evms_logical_node *,
13017+ struct inode *,
13018+ struct file *, unsigned int, unsigned long);
13019+static int drivelink_init_io(struct evms_logical_node *,
13020+ int, u64, u64, void *);
13021+
13022+/* plugin function table definition */
13023+static struct evms_plugin_fops fops = {
13024+ .discover = drivelink_discover,
13025+ .delete = drivelink_delete,
13026+ .read = drivelink_read,
13027+ .write = drivelink_write,
13028+ .init_io = drivelink_init_io,
13029+ .ioctl = drivelink_ioctl
13030+};
13031+
13032+/* plugin header definition */
13033+static struct evms_plugin_header plugin_header = {
13034+ .id = SetPluginID(IBM_OEM_ID,
13035+ EVMS_FEATURE,
13036+ EVMS_DRIVELINK_FEATURE_ID),
13037+ .version = {
13038+ .major = 2,
13039+ .minor = 0,
13040+ .patchlevel = 1
13041+ },
13042+ .required_services_version = {
13043+ .major = 0,
13044+ .minor = 5,
13045+ .patchlevel = 0
13046+ },
13047+ .fops = &fops
13048+};
13049+
13050+/********************************************************/
13051+/* Required Plugin Function Table Entry Point: */
13052+/* Discover function & Support routines */
13053+/********************************************************/
13054+
13055+/**
13056+ * le_feature_data_to_cpu:
13057+ * @md: drivelink metadata
13058+ *
13059+ * convert feature data from on-disk (Little Endian) format
13060+ * to the native cpu endian format.
13061+**/
13062+static void
13063+le_feature_data_to_cpu(struct evms_drivelink_metadata *md)
13064+{
13065+ int i;
13066+
13067+ md->signature = le32_to_cpup(&md->signature);
13068+ md->crc = le32_to_cpup(&md->crc);
13069+ md->version.major = le32_to_cpup(&md->version.major);
13070+ md->version.minor = le32_to_cpup(&md->version.minor);
13071+ md->version.patchlevel = le32_to_cpup(&md->version.patchlevel);
13072+ md->flags = le32_to_cpup(&md->flags);
13073+ md->sequence_number = le64_to_cpup(&md->sequence_number);
13074+ md->child_serial_number = le64_to_cpup(&md->child_serial_number);
13075+ md->parent_serial_number = le64_to_cpup(&md->parent_serial_number);
13076+ md->child_count = le64_to_cpup(&md->child_count);
13077+ for (i = 0; i < EVMS_DRIVELINK_MAX_ENTRIES; i++) {
13078+ struct evms_dl_ordering_table_entry *child_entry;
13079+
13080+ child_entry = &md->ordering_table[i];
13081+ child_entry->child_serial_number =
13082+ le64_to_cpup(&child_entry->child_serial_number);
13083+ child_entry->child_vsize =
13084+ le64_to_cpup(&child_entry->child_vsize);
13085+ }
13086+}
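+
+/* Editorial note: le32_to_cpup()/le64_to_cpup() are no-ops on little-endian
+ * CPUs; on a big-endian CPU the on-disk bytes 01 00 00 00 must be
+ * byte-swapped to yield the value 1. Every multi-byte field above gets the
+ * same treatment so the metadata reads identically on either kind of host.
+ */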
13087+
13088+/**
13089+ * load_feature_data: load drivelink feature data from disk
13090+ * @node: storage object
13091+ * @md: ptr to drivelink metadata
13092+ *
13093+ * loads and verifies redundant copies of drivelink metadata. @md is modified
13094+ * and returned to the caller.
13095+ *
13096+ * Return value: 0 on success
13097+ * Otherwise error code
13098+**/
13099+static int
13100+load_feature_data(struct evms_logical_node *node,
13101+ struct evms_drivelink_metadata **md)
13102+{
13103+ int i, rc = 0, rc_array[2] = { 0, 0 }, size_in_bytes;
13104+ u64 real_metadata_size, feature_data_size;
13105+ u64 starting_sector;
13106+ struct evms_drivelink_metadata *cur_md, *md1, *md2 = NULL;
13107+ char *location_name;
13108+
13109+ /* verify the feature metadata size from the */
13110+ /* feature header agrees with the real size */
13111+ /* of the current metadata structure. */
13112+ real_metadata_size = evms_cs_size_in_vsectors(sizeof (**md));
13113+
13114+ /* allocate a buffer large enough to hold all */
13115+ /* sectors containing the feature's metadata */
13116+ size_in_bytes = real_metadata_size * EVMS_VSECTOR_SIZE;
13117+ md1 = kmalloc(size_in_bytes, GFP_KERNEL);
13118+ if (md1) {
13119+ md2 = kmalloc(size_in_bytes, GFP_KERNEL);
13120+ if (!md2) {
13121+ kfree(md1);
13122+ rc = -ENOMEM;
13123+ }
13124+ } else {
13125+ rc = -ENOMEM;
13126+ }
13127+ if (!rc) {
13128+ for (i = 0; i < 2; i++) {
13129+ if (i == 0) {
13130+ starting_sector =
13131+ node->feature_header->
13132+ feature_data1_start_lsn;
13133+ feature_data_size =
13134+ node->feature_header->feature_data1_size;
13135+ cur_md = md1;
13136+ location_name = evms_primary_string;
13137+ } else {
13138+ starting_sector =
13139+ node->feature_header->
13140+ feature_data2_start_lsn;
13141+ feature_data_size =
13142+ node->feature_header->feature_data2_size;
13143+ cur_md = md2;
13144+ location_name = evms_secondary_string;
13145+ }
13146+ /* check that real metadata size matches the */
13147+ /* feature data size */
13148+ if (real_metadata_size != feature_data_size) {
13149+ LOG_ERROR
13150+ ("%s feature data size("PFU64" bytes) doesn't match expected size("PFU64" bytes).\n",
13151+ location_name,
13152+ feature_data_size <<
13153+ EVMS_VSECTOR_SIZE_SHIFT,
13154+ real_metadata_size <<
13155+ EVMS_VSECTOR_SIZE_SHIFT);
13156+ rc = -EINVAL;
13157+ rc_array[i] = rc;
13158+ continue;
13159+ }
13160+ /* load the node's feature data */
13161+ rc = INIT_IO(node,
13162+ 0,
13163+ starting_sector,
13164+ feature_data_size, cur_md);
13165+ if (rc) {
13166+ LOG_ERROR
13167+ ("error(%d) probing for %s feature data at sector("PFU64") on '%s'.\n",
13168+ rc, location_name, starting_sector,
13169+ node->name);
13170+ rc_array[i] = rc;
13171+ continue;
13172+ }
13173+ /* check for valid metadata signature */
13174+ if (le32_to_cpup(&cur_md->signature) !=
13175+ EVMS_DRIVELINK_SIGNATURE) {
13176+ rc = -ENODATA;
13177+ LOG_SERIOUS
13178+ ("error(%d) invalid signature in %s feature data on '%s'\n",
13179+ rc, location_name, node->name);
13180+ rc_array[i] = rc;
13181+ continue;
13182+ }
13183+ /* validate feature data CRC */
13184+ if (cur_md->crc != EVMS_MAGIC_CRC) {
13185+ int org_crc, final_crc;
13186+ org_crc = le32_to_cpup(&cur_md->crc);
13187+ cur_md->crc = 0;
13188+ final_crc =
13189+ evms_cs_calculate_crc(EVMS_INITIAL_CRC,
13190+ cur_md,
13191+ sizeof (*cur_md));
13192+ if (final_crc != org_crc) {
13193+ LOG_ERROR
13194+ ("CRC mismatch error [stored(%x), computed(%x)] in %s feature data on '%s'.\n",
13195+ org_crc, final_crc, location_name,
13196+ node->name);
13197+ rc = -EINVAL;
13198+ rc_array[i] = rc;
13199+ continue;
13200+ }
13201+ } else {
13202+ LOG_WARNING
13203+ ("CRC disabled in %s feature data on '%s'.\n",
13204+ location_name, node->name);
13205+ }
13206+ /* convert feature data from on-disk
13207+ * format (Little Endian) to native
13208+ * cpu endian format.
13209+ */
13210+ le_feature_data_to_cpu(cur_md);
13211+ /* check for valid structure version */
13212+ rc = evms_cs_check_version(&metadata_ver,
13213+ &cur_md->version);
13214+ if (rc) {
13215+ LOG_SERIOUS
13216+ ("error(%d) obsolete version detected: actual(%d,%d,%d), requires(%d,%d,%d) in %s feature data on '%s'\n",
13217+ rc, cur_md->version.major,
13218+ cur_md->version.minor,
13219+ cur_md->version.patchlevel,
13220+ DRIVELINK_METADATA_MAJOR,
13221+ DRIVELINK_METADATA_MINOR,
13222+ DRIVELINK_METADATA_PATCHLEVEL,
13223+ location_name, node->name);
13224+ rc_array[i] = rc;
13225+ }
13226+ }
13227+ /* getting same return code for both copies? */
13228+ if (rc_array[0] == rc_array[1]) {
13229+ rc = rc_array[0];
13230+ /* if no errors on both copies,
13231+ * check the sequence numbers.
13232+ * use the highest sequence number.
13233+ */
13234+ if (!rc) {
13235+ /* compare sequence numbers */
13236+ if (md1->sequence_number ==
13237+ md2->sequence_number) {
13238+ cur_md = md1;
13239+ } else {
13240+ LOG_WARNING
13241+ ("sequence number mismatches between front("PFU64") and rear("PFU64") feature data copies on node(%s)!\n",
13242+ md2->sequence_number,
13243+ md1->sequence_number, node->name);
13244+ if (md1->sequence_number >
13245+ md2->sequence_number)
13246+ cur_md = md1;
13247+ else
13248+ cur_md = md2;
13249+ LOG_WARNING
13250+ ("using %s feature data copy!\n",
13251+ (cur_md ==
13252+ md1) ? evms_primary_string :
13253+ evms_secondary_string);
13254+ }
13255+ }
13256+ /* getting different return codes for each copy */
13257+ } else if (rc_array[0] == 0) {
13258+			/* use 1st (rear) copy if it's good */
13259+ rc = 0;
13260+ cur_md = md1;
13261+ } else if (rc_array[1] == 0) {
13262+			/* use 2nd (front) copy if it's good */
13263+ rc = 0;
13264+ cur_md = md2;
13265+ } else if ((rc_array[0] == -EINVAL) || (rc_array[1] == -EINVAL)) {
13266+ /* fail if either give a fatal error */
13267+ rc = -EINVAL;
13268+ cur_md = NULL;
13269+ }
13270+
13271+ /* deallocate metadata buffers appropriately */
13272+ if (rc || (cur_md == md1))
13273+ kfree(md2);
13274+ if (rc || (cur_md == md2))
13275+ kfree(md1);
13276+
13277+ /* save validated feature header pointer */
13278+ if (!rc)
13279+ *md = cur_md;
13280+ }
13281+ return (rc);
13282+}
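+
+/* Worked example of the copy selection above (editorial, assumed numbers):
+ * if both copies validate but md1->sequence_number is 7 and
+ * md2->sequence_number is 9, the mismatch is logged and md2 (the copy with
+ * the higher sequence number, i.e. the more recent write) is used. If
+ * exactly one copy fails its size, signature, CRC or version check, the
+ * surviving copy is used and rc is forced back to 0.
+ */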
13283+
13284+/**
13285+ * find_parent_node_for_child_node: finds or creates a parent node for this child node
13286+ * @child_node: input, child node
13287+ * @md: input, on-disk metadata
13288+ * @parent_node: output, parent node
13289+ * @dl_private: output, runtime metadata
13290+ * @discover_list: input/output, list of objects being discovered
13291+ *
13292+ * finds or creates a parent node for the specified child node. If the parent node is
13293+ * created, also create and initialize the parent's private data area.
13294+ *
13295+ * Return value: 0 on success
13296+ * Otherwise error code.
13297+**/
13298+static int
13299+find_parent_node_for_child_node(struct evms_logical_node *child_node,
13300+ struct evms_drivelink_metadata *md,
13301+ struct evms_logical_node **parent_node,
13302+ struct runtime_data **dl_private,
13303+ struct evms_logical_node **discover_list)
13304+{
13305+ int rc = 0, parent_found = FALSE;
13306+ struct evms_logical_node *parent = NULL;
13307+ struct runtime_data *rd = NULL;
13308+
13309+ /* find the parent node for this child */
13310+ for (parent = *discover_list; parent; parent = parent->next) {
13311+ /* only parent nodes will have null feature headers */
13312+ if (!parent->feature_header) {
13313+ rd = (struct runtime_data *) parent->private;
13314+ if (rd->parent_sn == md->parent_serial_number) {
13315+ parent_found = TRUE;
13316+ break;
13317+ }
13318+ }
13319+ }
13320+ /* if no parent node found, create it */
13321+ if (parent_found == FALSE) {
13322+ rc = evms_cs_allocate_logical_node(&parent);
13323+ if (!rc) {
13324+ /* transpose info from child to parent */
13325+ parent->flags |= child_node->flags;
13326+ strcpy(parent->name,
13327+ child_node->feature_header->object_name);
13328+ /* copy evms system data to parent */
13329+ parent->volume_info = child_node->volume_info;
13330+ /* initialize the plugin id field */
13331+ parent->plugin = &plugin_header;
13332+ /* allocate parent's instance data */
13333+ parent->private = kmalloc(sizeof(*rd), GFP_KERNEL);
13334+ if (!parent->private)
13335+ rc = -ENOMEM;
13336+ }
13337+ if (!rc) {
13338+ /* initialize some instance data fields */
13339+ rd = (struct runtime_data *) parent->private;
13340+ rd->block_size = 0;
13341+ rd->parent_sn = md->parent_serial_number;
13342+ rd->child_count = md->child_count;
13343+ /* allocate the child table */
13344+ rd->child_table = kmalloc(sizeof(struct runtime_entry) *
13345+ rd->child_count, GFP_KERNEL);
13346+ if (!rd->child_table)
13347+ rc = -ENOMEM;
13348+ }
13349+ if (!rc) {
13350+ memset(rd->child_table, 0,
13351+ sizeof(struct runtime_entry) * rd->child_count);
13352+ /* add the parent node to the discover list */
13353+ rc = evms_cs_add_logical_node_to_list(discover_list,
13354+ parent);
13355+ MOD_INC_USE_COUNT;
13356+ }
13357+ /* if any errors encountered, try to clean up */
13358+ if (rc) {
13359+ LOG_SERIOUS("find_parent_node: rc(%d) from '%s'\n",
13360+ rc, child_node->name);
13361+ if (parent) {
13362+ DELETE(parent);
13363+ parent = NULL;
13364+ rd = NULL;
13365+ }
13366+ }
13367+ }
13368+
13369+ *dl_private = rd;
13370+ *parent_node = parent;
13371+
13372+ return (rc);
13373+}
13374+
13375+/**
13376+ * compute_child_index: compute the index for a specific child node
13377+ * @node: the child node
13378+ * @md: the drivelink on-disk metadata
13379+ *
13380+ * compute and return a 0-based index value of this child node's position
13381+ * in the parent node's ordering table.
13382+ *
13383+ * Return value: -1 on error
13384+ * otherwise the index of the specified child.
13385+**/
13386+static int
13387+compute_child_index(struct evms_logical_node *node,
13388+ struct evms_drivelink_metadata *md)
13389+{
13390+ int i, position = -1;
13391+
13392+ for (i = 0; i < md->child_count; i++) {
13393+ if (md->ordering_table[i].child_serial_number ==
13394+ md->child_serial_number) {
13395+ position = i;
13396+ break;
13397+ }
13398+ }
13399+ if (position == -1) {
13400+ LOG_SERIOUS("%s: child not found from '%s'\n",
13401+ __FUNCTION__, node->name);
13402+ }
13403+ return (position);
13404+}
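+
+/* Worked example (editorial, assumed serial numbers): with an ordering
+ * table holding child serial numbers { 17, 42, 99 } and a
+ * child_serial_number of 42, the loop returns index 1; a serial number
+ * absent from the table yields -1 and the "child not found" message.
+ */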
13405+
13406+/**
13407+ * process_child_nodes: perform the discovery operation on each child node
13408+ * @discover_list: the list of potential child objects
13409+ *
13410+ * search the discovery list for drivelink child nodes. For each node found,
13411+ * perform the discovery operation on it.
13412+ *
13413+ * Return value: 0 on success
13414+ * otherwise error code
13415+**/
13416+static int
13417+process_child_nodes(struct evms_logical_node **discover_list)
13418+{
13419+ int rc = 0, index = -1;
13420+ struct evms_logical_node *node, *next_node, *parent;
13421+ struct evms_drivelink_metadata *md;
13422+ struct runtime_data *rd;
13423+ struct runtime_entry *child_entry = NULL;
13424+
13425+ for (node = *discover_list; node; node = next_node) {
13426+ next_node = node->next;
13427+ if ((!node->feature_header) ||
13428+ (node->feature_header->feature_id != plugin_header.id)) {
13429+ continue;
13430+ }
13431+
13432+ rc = evms_cs_remove_logical_node_from_list(discover_list, node);
13433+ if (rc)
13434+ BUG();
13435+ /* we need to load the feature data to */
13436+ /* find the parent's serial number this */
13437+ /* child node belongs to. */
13438+ md = NULL;
13439+ rc = load_feature_data(node, &md);
13440+ if (!rc) {
13441+ /* find the parent node for this child */
13442+ parent = NULL;
13443+ rc = find_parent_node_for_child_node(node, md,
13444+ &parent, &rd,
13445+ discover_list);
13446+ }
13447+ if (!rc) {
13448+ /* determine position of child in drive link object */
13449+ index = compute_child_index(node, md);
13450+ if (index == -1)
13451+ rc = index;
13452+ }
13453+ if (!rc) {
13454+ /* check for multiple child index requests */
13455+ child_entry =
13456+ (struct runtime_entry *) &rd->child_table[index];
13457+ /* check to see if this child index is
13458+ * already in use.
13459+ */
13460+ if (child_entry->child_node) {
13461+ LOG_SERIOUS
13462+ ("attempt to put '%s' in child index(%d). Already occupied by '%s'.\n",
13463+ node->name, index,
13464+ child_entry->child_node->name);
13465+ rc = -1;
13466+ }
13467+ }
13468+ if (!rc) {
13469+ /* fill in child info in parent */
13470+
13471+ /* check the sector size for this node */
13472+ if (node->hardsector_size > parent->hardsector_size)
13473+ parent->hardsector_size = node->hardsector_size;
13474+ /* check the block size for this node */
13475+ if (node->block_size > parent->block_size)
13476+ parent->block_size = node->block_size;
13477+ /* set the child node */
13478+ child_entry->child_node = node;
13479+ /* set the metadata for this node */
13480+ child_entry->child_metadata = md;
13481+ }
13482+
13483+ /* on error, clean up accordingly */
13484+ if (rc) {
13485+ if (md)
13486+ kfree(md);
13487+ LOG_SERIOUS("%s: rc(%d) from '%s'\n",
13488+ __FUNCTION__, rc, node->name);
13489+ LOG_SERIOUS("deleting child node '%s'.\n", node->name);
13490+ rc = DELETE(node);
13491+ if (rc) {
13492+ LOG_SERIOUS
13493+ ("error(%d) attempting to delete '%s'.\n",
13494+ rc, node->name);
13495+ }
13496+ }
13497+ }
13498+
13499+ /* errors are handled internal to this function */
13500+ /* by deleting the failed node. This will get */
13501+ /* picked up by finalize_parent_nodes as a */
13502+ /* missing child node */
13503+ return (0);
13504+}
13505+
13506+#define TEST_CHILD_PRESENCE 0
13507+#define TEST_CHILD_COUNT 1
13508+#define TEST_CHILD_PARENTS_SERIAL_NUM 2
13509+#define TEST_CHILD_POSITION 3
13510+#define TEST_CHILD_METADATA 4
13511+
13512+/**
13513+ * test_parent_node: verify that a parent is complete
13514+ * @node: specified parent node
13515+ *
13516+ * verify that the parent node has all of its child nodes accounted for.
13517+ *
13518+ * Return value: 0 on success
13519+ * otherwise error code
13520+**/
13521+static int
13522+test_parent_node(struct evms_logical_node *node)
13523+{
13524+ int i, rc = 0;
13525+ struct runtime_data *rd;
13526+ struct runtime_entry *child_entry;
13527+
13528+ rd = (struct runtime_data *) node->private;
13529+ for (i = 0; i < rd->child_count; i++) {
13530+ child_entry = (struct runtime_entry *) &rd->child_table[i];
13531+
13532+		/* ensure each child entry is filled */
13533+ if (!child_entry->child_node) {
13534+ node->flags |=
13535+ EVMS_VOLUME_SET_READ_ONLY | EVMS_VOLUME_PARTIAL;
13536+ LOG_ERROR("%s: missing child(%d).\n", __FUNCTION__, i);
13537+ } else
13538+		/* ensure child count is the same */
13539+ /* in each child's metadata */
13540+ if (child_entry->child_metadata->child_count != rd->child_count) {
13541+ rc = -EVMS_FEATURE_FATAL_ERROR;
13542+ LOG_ERROR("%s: child count wrong for node '%s'\n",
13543+ __FUNCTION__, node->name);
13544+ } else
13545+		/* ensure parent serial number is */
13546+ /* the same in each child's metadata */
13547+ if (child_entry->child_metadata->parent_serial_number !=
13548+ rd->parent_sn) {
13549+ rc = -EVMS_FEATURE_FATAL_ERROR;
13550+ LOG_ERROR
13551+		    ("%s: incorrect [is("PFU64"), should be("PFU64")] parent serial number for node '%s'\n",
13552+ __FUNCTION__,
13553+ child_entry->child_metadata->parent_serial_number,
13554+ rd->parent_sn, node->name);
13555+ } else
13556+		/* ensure each is in the correct entry */
13557+ if (child_entry->child_metadata->ordering_table[i].
13558+ child_serial_number !=
13559+ child_entry->child_metadata->child_serial_number) {
13560+ rc = -EVMS_FEATURE_FATAL_ERROR;
13561+ LOG_ERROR
13562+ ("%s: child reports different index for node '%s'\n",
13563+ __FUNCTION__, node->name);
13564+ } else {
13565+ struct runtime_entry *other_child_entry;
13566+ int j, rc2;
13567+ /* compare the children's metadata */
13568+
13569+ /* look for another present child to
13570+ * compare against.
13571+ */
13572+ other_child_entry = NULL;
13573+ for (j = 0; j < rd->child_count; j++) {
13574+ /* skip comparing to ourselves */
13575+ if (j == i) {
13576+ continue;
13577+ }
13578+				/* is this child present? */
13579+ if (rd->child_table[j].child_node) {
13580+ /* yes, use it */
13581+ other_child_entry = &rd->child_table[j];
13582+ break;
13583+ }
13584+ }
13585+ /* if we can't find another valid
13586+ * child node's metadata to compare
13587+ * against, just skip this test.
13588+ */
13589+ if (!other_child_entry) {
13590+ continue;
13591+ }
13592+ rc2 =
13593+ memcmp(other_child_entry->child_metadata->
13594+ ordering_table,
13595+ child_entry->child_metadata->ordering_table,
13596+ sizeof (child_entry->child_metadata->
13597+ ordering_table));
13598+ if (rc2) {
13599+ rc = -EVMS_FEATURE_FATAL_ERROR;
13600+ LOG_ERROR
13601+ ("%s: mismatching child metadata for nodes '%s' and '%s'\n",
13602+ __FUNCTION__,
13603+				 other_child_entry->child_node->name,
13604+ child_entry->child_node->name);
13605+ }
13606+ }
13607+ /* stop if fatal error encountered */
13608+ if (rc == -EVMS_FEATURE_FATAL_ERROR) {
13609+ break;
13610+ }
13611+ }
13612+ return (rc);
13613+}
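+
+/* Editorial note: a missing child is deliberately non-fatal above; the
+ * parent is marked EVMS_VOLUME_SET_READ_ONLY | EVMS_VOLUME_PARTIAL and
+ * discovery continues. Inconsistent metadata between present children, by
+ * contrast, returns -EVMS_FEATURE_FATAL_ERROR, which causes
+ * finalize_parent_nodes() to delete the whole parent object.
+ */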
13614+
13615+/**
13616+ * perform_final_adjustments: do final tweaks to parent node
13617+ * @node: parent node
13618+ *
13619+ * This function does the following:
13620+ * sets the vsize (in vsectors) field in each child node
13621+ * sets the voffset (in vsectors) field in each child node
13622+ * frees each child node's metadata
13623+ * sets the parent's total size field
13624+**/
13625+static void
13626+perform_final_adjustments(struct evms_logical_node *node)
13627+{
13628+ int i;
13629+ struct runtime_data *rd;
13630+ struct runtime_entry *child_entry = NULL;
13631+ struct evms_drivelink_metadata *ref_data = NULL;
13632+
13633+ rd = (struct runtime_data *) node->private;
13634+ /* find a valid copy of the ordering table.
13635+ * since all the ordering tables are the same
13636+ * we can just pick one to use for all the
13637+ * child computations.
13638+ */
13639+ for (i = 0; i < rd->child_count; i++) {
13640+ child_entry = (struct runtime_entry *) &rd->child_table[i];
13641+ if (child_entry->child_node) {
13642+ ref_data = child_entry->child_metadata;
13643+ break;
13644+ }
13645+ }
13646+ /* if we got this far, there should
13647+ * always be at least one valid child.
13648+ */
13649+ if (!ref_data)
13650+ BUG();
13651+ /* compute the parent's usable size,
13652+ * and construct the table used to
13653+ * remap parent I/Os to child I/Os */
13654+ for (i = 0; i < rd->child_count; i++) {
13655+ child_entry = (struct runtime_entry *) &rd->child_table[i];
13656+ /* set the LBA count for this child node */
13657+ child_entry->vsize = ref_data->ordering_table[i].child_vsize;
13658+ /* set the start LBA value for this child node */
13659+ child_entry->voffset = node->total_vsectors;
13660+ /* keep a running total of size in sectors */
13661+ node->total_vsectors += child_entry->vsize;
13662+ /* free the metadata for this child node */
13663+ if (ref_data != child_entry->child_metadata) {
13664+ kfree(child_entry->child_metadata);
13665+ }
13666+ child_entry->child_metadata = NULL;
13667+ /* free the feature header for this child node */
13668+ if (child_entry->child_node) {
13669+ kfree(child_entry->child_node->feature_header);
13670+ child_entry->child_node->feature_header = NULL;
13671+ }
13672+ }
13673+ /* free the reference data */
13674+ kfree(ref_data);
13675+}
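+
+/* Worked example of the remap table built above (editorial, assumed sizes):
+ * for three children with child_vsize values 100, 200 and 50 vsectors, the
+ * loop assigns voffset 0, 100 and 300 respectively and leaves the parent's
+ * total_vsectors at 350, so each child owns the half-open LBA range
+ * [voffset, voffset + vsize) of the parent object.
+ */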
13676+
13677+/**
13678+ * finalize_parent_nodes: verify and prepare parent nodes
13679+ * @discover_list: list of potential drivelink parent objects
13680+ *
13681+ * verify the completeness of each parent node. If not complete, purge the in-memory
13682+ * structs for this object and all its children. If complete, perform final tweaks
13683+ * to make this node usable.
13684+ *
13685+ * Return value: 0 on success
13686+ * otherwise error code
13687+**/
13688+static int
13689+finalize_parent_nodes(struct evms_logical_node **discover_list)
13690+{
13691+ int rc = 0, rc2;
13692+ struct evms_logical_node *node, *next_node;
13693+
13694+ for (node = *discover_list; node; node = next_node) {
13695+ next_node = node->next;
13696+ /* only check parent nodes */
13697+ if (!node->feature_header) {
13698+			/* validate the children of this parent */
13699+ rc = test_parent_node(node);
13700+ if (!rc) {
13701+ /* compute parent size and
13702+ * child remap table.
13703+ */
13704+ perform_final_adjustments(node);
13705+ } else {
13706+ /* fatal error encountered.
13707+ * cleanup from this node and
13708+ * delete it from memory.
13709+ */
13710+ evms_cs_remove_logical_node_from_list
13711+ (discover_list, node);
13712+ rc2 = DELETE(node);
13713+ if (rc2) {
13714+ LOG_SERIOUS
13715+ ("error(%d) attempting to delete '%s'.\n",
13716+ rc2, node->name);
13717+ }
13718+ }
13719+ }
13720+ }
13721+ return (rc);
13722+}
13723+
13724+/**
13725+ * drivelink_discover: discover drivelinked storage objects
13726+ * @discover_list: the list of objects to inspect
13727+ *
13728+ * perform the drivelink discovery process on the objects in the discovery list
13729+ *
13730+ * Return value: 0 on success
13731+ * otherwise error code
13732+**/
13733+static int
13734+drivelink_discover(struct evms_logical_node **discover_list)
13735+{
13736+ int rc = 0;
13737+
13738+ MOD_INC_USE_COUNT;
13739+ rc = process_child_nodes(discover_list);
13740+ if (!rc)
13741+ rc = finalize_parent_nodes(discover_list);
13742+
13743+ MOD_DEC_USE_COUNT;
13744+ return (rc);
13745+}
13746+
13747+/********************************************************/
13748+/* Required Plugin Function Table Entry Point: */
13749+/* Delete function */
13750+/********************************************************/
13751+
13752+/**
13753+ * drivelink_delete: purges a drivelink object and its children from memory
13754+ * @node: the drivelink object to delete
13755+ *
13756+ * purge the drivelink object, its private data, and all its children from memory.
13757+ *
13758+ * Return value: 0 on success
13759+ * otherwise error code
13760+**/
13761+static int
13762+drivelink_delete(struct evms_logical_node *node)
13763+{
13764+ int i, rc = 0;
13765+ struct runtime_data *rd;
13766+ struct runtime_entry *child_entry;
13767+
13768+ LOG_DETAILS("deleting '%s'.\n", node->name);
13769+
13770+ rd = (struct runtime_data *) node->private;
13771+ if (rd) {
13772+ for (i = 0; i < rd->child_count; i++) {
13773+ child_entry = &rd->child_table[i];
13774+ /* delete the child node */
13775+ if (child_entry->child_node) {
13776+ rc = DELETE(child_entry->child_node);
13777+ if (rc)
13778+ break;
13779+ child_entry->child_node = NULL;
13780+ }
13781+ /* delete the child's metadata */
13782+ if (child_entry->child_metadata) {
13783+ kfree(child_entry->child_metadata);
13784+ child_entry->child_metadata = NULL;
13785+ }
13786+ }
13787+ if (!rc) {
13788+ /* delete the child table */
13789+ if (rd->child_table) {
13790+ kfree(rd->child_table);
13791+ rd->child_table = NULL;
13792+ }
13793+ /* delete the instance data */
13794+ kfree(rd);
13795+ node->private = NULL;
13796+ }
13797+ }
13798+ if (!rc) {
13799+ evms_cs_deallocate_logical_node(node);
13800+ MOD_DEC_USE_COUNT;
13801+ }
13802+
13803+ return (rc);
13804+}
13805+
13806+/**
13807+ * which_child: find the child node targeted by an IO to this drivelink object
13808+ * @parent: parent drivelink object
13809+ * @rsector: relative sector on the parent object
13810+ * @max_io_sects: largest IO size on the child, starting from rsector position
13811+ *
13812+ * This function finds the child node a parent rsector maps to.
13813+ * It then adjusts the rsector value to be child relative and
13814+ * optionally computes the max # of sectors that can be accessed
13815+ * from this starting point on the child.
13816+ *
13817+ * Return value:
13818+ * The child node, the child relative rsector and max io size are
13819+ * returned to the caller. On error, the returned child node will
13820+ * be NULL.
13821+**/
13822+static struct evms_logical_node *
13823+which_child(struct evms_logical_node *parent,
13824+ u64 * rsector, u64 * max_io_sects)
13825+{
13826+ int i;
13827+ struct evms_logical_node *child = NULL;
13828+ struct runtime_data *rd;
13829+ struct runtime_entry *child_entry = NULL;
13830+
13831+ rd = (struct runtime_data *) parent->private;
13832+ for (i = 0; i < rd->child_count; i++) {
13833+ child_entry = (struct runtime_entry *) &rd->child_table[i];
13834+
13835+ if (*rsector >= child_entry->vsize) {
13836+ *rsector -= child_entry->vsize;
13837+ } else {
13838+ /* get the child node */
13839+ child = child_entry->child_node;
13840+ /* compute the sector count if requested */
13841+ if (max_io_sects)
13842+ /* this is only used for INIT I/O
13843+ * to return the largest sector
13844+ * count size for this child based
13845+ * on first sector in the I/O.
13846+ */
13847+ *max_io_sects = child_entry->vsize - *rsector;
13848+ break;
13849+ }
13850+ }
13851+ return (child);
13852+}
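+
+/* Worked example (editorial, reusing the assumed 100/200/50 layout): a
+ * parent *rsector of 150 walks past child 0 (150 - 100 = 50) and resolves
+ * to child 1 with child-relative rsector 50 and, when requested,
+ * max_io_sects = 200 - 50 = 150.
+ */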
13853+
13854+/**
13855+ * drivelink_io_error: log an IO error for drivelink
13856+ * @node: drivelink object
13857+ * @bh: buffer head targeting this object
13858+ *
13859+ * this function was primarily created because the function
13860+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
13861+ * to be set on inline functions. Since this was an error path
13862+ * and not mainline, I decided to add a trace statement to help
13863+ * report on the failing condition.
13864+**/
13865+static void
13866+drivelink_io_error(struct evms_logical_node *node, int io_flag, struct buffer_head *bh)
13867+{
13868+ LOG_SERIOUS("%s error on '%s' remapping rsector("PFU64").\n",
13869+ (io_flag) ? "WRITE" : "READ",
13870+ node->name, (u64) bh->b_rsector);
13871+
13872+ bh->b_end_io(bh, 0);
13873+}
13874+
13875+/********************************************************/
13876+/* Required Plugin Function Table Entry Point: */
13877+/* Read function & Support routines */
13878+/********************************************************/
13879+
13880+/**
13881+ * drivelink_read: handles IO read operations to drivelink objects
13882+ * @node: drivelink object
13883+ * @bh: buffer head targeting this object
13884+ *
13885+ * handles IO read operations to drivelink objects. internally remaps
13886+ * drivelink relative requests to child relative requests and then routes
13887+ * them to the child for further processing.
13888+**/
13889+static void
13890+drivelink_read(struct evms_logical_node *node, struct buffer_head *bh)
13891+{
13892+ struct evms_logical_node *child;
13893+ u64 io_size, rsector;
13894+
13895+ rsector = bh->b_rsector;
13896+ child = which_child(node, &rsector, &io_size);
13897+ if (child && ((bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <= io_size)) {
13898+ bh->b_rsector = rsector;
13899+ R_IO(child, bh);
13900+ } else {
13901+ drivelink_io_error(node, READ, bh);
13902+ }
13903+}
13904+
13905+/********************************************************/
13906+/* Required Plugin Function Table Entry Point: */
13907+/* Write function & Support routines */
13908+/********************************************************/
13909+
13910+/**
13911+ * drivelink_write: handles IO write operations to drivelink objects
13912+ * @node: drivelink object
13913+ * @bh: buffer head targeting this object
13914+ *
13915+ * handles IO write operations to drivelink objects. internally remaps
13916+ * drivelink relative requests to child relative requests and then routes
13917+ * them to the child for further processing.
13918+**/
13919+static void
13920+drivelink_write(struct evms_logical_node *node, struct buffer_head *bh)
13921+{
13922+ struct evms_logical_node *child;
13923+ u64 io_size, rsector;
13924+
13925+ rsector = bh->b_rsector;
13926+ child = which_child(node, &rsector, &io_size);
13927+ if (child && ((bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <= io_size)) {
13928+ bh->b_rsector = rsector;
13929+ W_IO(child, bh);
13930+ } else {
13931+ drivelink_io_error(node, WRITE, bh);
13932+ }
13933+}
13934+
13935+/********************************************************/
13936+/* Required Plugin Function Table Entry Point: */
13937+/* Init I/O function */
13938+/********************************************************/
13939+
13940+/**
13941+ * drivelink_init_io: performs synchronous IO to drivelink objects
13942+ * @node: drivelink object
13943+ * @io_flag: read/write flag
13944+ * @sect_nr: starting sector, object relative (512 byte units)
13945+ * @num_sects: count of sectors
13946+ * @buf_addr: buffer address to read from/write to
13947+ *
13948+ * This function must determine which child or children a
13949+ * specified I/O request must be passed to, as well as if,
13950+ * when, and how a request must be broken up.
13951+ *
13952+ * Return value: 0 on success
13953+ * otherwise error code
13954+**/
13955+static int
13956+drivelink_init_io(struct evms_logical_node *node, int io_flag,
13957+ u64 sect_nr,
13958+ u64 num_sects,
13959+ void *buf_addr)
13960+{
13961+ int rc = 0;
13962+
13963+ if (!node)
13964+ rc = -EINVAL;
13965+ else {
13966+ u64 starting_sector, remaining_sectors;
13967+ void *io_buf;
13968+ struct runtime_data *rd;
13969+
13970+ if ((sect_nr + num_sects) > node->total_vsectors) {
13971+ LOG_SERIOUS
13972+ ("attempted out of bound("PFU64") %s on '%s' at sector("PFU64"), count("PFU64").\n",
13973+ node->total_vsectors, (io_flag) ? "WRITE" : "READ",
13974+ node->name, sect_nr, num_sects);
13975+ rc = -EINVAL;
13976+ } else {
13977+ rd = (struct runtime_data *) node->private;
13978+ /* make working copies of input parameters */
13979+ starting_sector = sect_nr;
13980+ remaining_sectors = num_sects;
13981+ io_buf = buf_addr;
13982+ /* loop until all I/O is performed */
13983+ while (remaining_sectors) {
13984+ u64 io_start, io_size;
13985+ struct evms_logical_node *child;
13986+
13987+ /* compute the child relative io_start
13988+ * and max io_size.
13989+ */
13990+ io_start = starting_sector;
13991+ child = which_child(node, &io_start, &io_size);
13992+ /* adjust io_size based on
13993+ * original remaining sectors
13994+ * in this io.
13995+ */
13996+ if (io_size > remaining_sectors)
13997+ io_size = remaining_sectors;
13998+ if (child) {
13999+ rc = INIT_IO(child,
14000+ io_flag,
14001+ io_start, io_size, io_buf);
14002+ } else {
14003+ /* if partial volume, return 0's
14004+ * for missing children.
14005+ */
14006+ if (io_flag == READ) {
14007+ memset(io_buf, 0,
14008+ io_size <<
14009+ EVMS_VSECTOR_SIZE_SHIFT);
14010+ }
14011+ }
14012+ if (!rc) {
14013+ /* adjust working copies */
14014+ starting_sector += io_size;
14015+ remaining_sectors -= io_size;
14016+ io_buf += io_size <<
14017+ EVMS_VSECTOR_SIZE_SHIFT;
14018+ } else
14019+ break;
14020+ }
14021+ }
14022+ }
14023+
14024+ return (rc);
14025+}
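+
+/* Worked example (editorial, same assumed 100/200/50 layout): an INIT_IO
+ * with sect_nr = 90 and num_sects = 20 is split by the loop above into two
+ * child I/Os: 10 sectors at offset 90 on child 0, then 10 sectors at
+ * offset 0 on child 1, with io_buf advanced 10 << EVMS_VSECTOR_SIZE_SHIFT
+ * bytes in between. A READ over a missing child is satisfied with zeroes.
+ */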
14026+
14027+/********************************************************/
14028+/* Required Plugin Function Table Entry Point: */
14029+/* IOCTL function & Support routines */
14030+/********************************************************/
14031+
14032+/**
14033+ * drivelink_ioctl_cmd_plugin_ioctl: drivelink support for the 'plugin ioctl' command
14034+ * @node: drivelink object
14035+ * @inode: VFS supplied parameter
14036+ * @file: VFS supplied parameter
14037+ * @cmd: the specific ioctl command
14038+ * @arg: the specific ioctl arguments
14039+ *
14040+ * this function handles 'plugin ioctl' commands. Currently there are no
14041+ * commands specific to this plugin; however, it must broadcast some commands
14042+ * so lower layers can receive them.
14043+ *
14044+ * Return value: 0 on success
14045+ * otherwise error code
14046+**/
14047+static int
14048+drivelink_ioctl_cmd_plugin_ioctl(struct evms_logical_node *node,
14049+ struct inode *inode, struct file *file,
14050+ unsigned long cmd, unsigned long arg)
14051+{
14052+ int i, rc = 0;
14053+ struct runtime_data *rd;
14054+ struct evms_plugin_ioctl_pkt tmp, *user_parms;
14055+
14056+ user_parms = (struct evms_plugin_ioctl_pkt *) arg;
14057+ /* copy user's parameters to kernel space */
14058+ if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
14059+ rc = -EFAULT;
14060+
14061+ if (!rc) {
14062+ rd = (struct runtime_data *) node->private;
14063+		/* is this cmd targeted at this feature? */
14064+ if (tmp.feature_id == node->plugin->id) {
14065+ switch (tmp.feature_command) {
14066+ default:
14067+ break;
14068+ }
14069+ } else { /* broadcast this cmd to all children */
14070+ for (i = 0; i < rd->child_count; i++) {
14071+ struct evms_logical_node *child_node;
14072+
14073+ child_node = rd->child_table[i].child_node;
14074+ if (child_node) {
14075+ rc = IOCTL(child_node, inode, file,
14076+ cmd, arg);
14077+ if (rc)
14078+ break;
14079+ }
14080+ }
14081+ }
14082+ /* copy info to userspace */
14083+ if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
14084+ rc = -EFAULT;
14085+ }
14086+ return (rc);
14087+}
14088+
14089+/**
14090+ * drivelink_ioctl_cmd_broadcast: broadcast ioctls to this object's children
14091+ * @node: drivelink object
14092+ * @inode: VFS supplied parameter
14093+ * @file: VFS supplied parameter
14094+ * @cmd: the specific ioctl command
14095+ * @arg: the specific ioctl arguments
14096+ *
14097+ * broadcast the specified ioctl command and arguments to all of this object's
14098+ * children. OR (logical operation) the return values from all the children
14099+ * and return the OR'd value to the caller.
14100+ *
14101+ * Return value: 0 on success
14102+ * otherwise error code
14103+**/
14104+static int
14105+drivelink_ioctl_cmd_broadcast(struct evms_logical_node *node,
14106+ struct inode *inode, struct file *file,
14107+ unsigned long cmd, unsigned long arg)
14108+{
14109+ int i, rc = 0;
14110+ struct runtime_data *rd;
14111+
14112+ rd = (struct runtime_data *) node->private;
14113+ /* broadcast this cmd to all children */
14114+ for (i = 0; i < rd->child_count; i++) {
14115+ struct evms_logical_node *child_node;
14116+
14117+ child_node = rd->child_table[i].child_node;
14118+ if (child_node) {
14119+ rc |= IOCTL(child_node, inode, file, cmd, arg);
14120+ }
14121+ }
14122+ return (rc);
14123+}
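+
+/* Editorial note on the design: because the child return codes are OR'd
+ * together, one failing child makes the whole broadcast report failure,
+ * yet the command is still delivered to every present child instead of
+ * stopping at the first error (compare the early-break loop in
+ * drivelink_ioctl_cmd_plugin_ioctl above).
+ */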
14124+
14125+/**
14126+ * drivelink_ioctl: main ioctl entry point and handler
14127+ * @node: drivelink object
14128+ * @inode: VFS supplied parameter
14129+ * @file: VFS supplied parameter
14130+ * @cmd: a specific ioctl command
14131+ * @arg: a specific ioctl argument
14132+ *
14133+ * handles some ioctl commands internally and routes other ioctl commands to
14134+ * the appropriate entry points.
14135+ *
14136+ * Returns: 0 on success
14137+ * otherwise error code
14138+ **/
14139+static int
14140+drivelink_ioctl(struct evms_logical_node *node,
14141+ struct inode *inode,
14142+ struct file *file, unsigned int cmd, unsigned long arg)
14143+{
14144+ int rc = 0;
14145+ struct runtime_data *rd = NULL;
14146+ struct hd_geometry hdgeo;
14147+
14148+ if ((!node) || (!inode))
14149+ rc = -EINVAL;
14150+
14151+ if (!rc) {
14152+ rd = (struct runtime_data *) node->private;
14153+ switch (cmd) {
14154+ case HDIO_GETGEO:
14155+ hdgeo.heads = 255;
14156+ hdgeo.sectors = 63;
14157+ hdgeo.cylinders =
14158+ ((unsigned int) node->total_vsectors) /
14159+ hdgeo.heads / hdgeo.sectors;
14160+ hdgeo.start = 0;
14161+ if (copy_to_user((int *) arg, &hdgeo, sizeof (hdgeo)))
14162+ rc = -EFAULT;
14163+ break;
14164+ case EVMS_QUIESCE_VOLUME:
14165+ case EVMS_GET_DISK_LIST:
14166+ case EVMS_CHECK_MEDIA_CHANGE:
14167+ case EVMS_REVALIDATE_DISK:
14168+ case EVMS_OPEN_VOLUME:
14169+ case EVMS_CLOSE_VOLUME:
14170+ case EVMS_CHECK_DEVICE_STATUS:
14171+ rc = drivelink_ioctl_cmd_broadcast(node, inode, file,
14172+ cmd, arg);
14173+ break;
14174+ case EVMS_PLUGIN_IOCTL:
14175+ rc = drivelink_ioctl_cmd_plugin_ioctl(node, inode, file,
14176+ cmd, arg);
14177+ break;
14178+ case EVMS_GET_BMAP:
14179+ {
14180+ struct evms_get_bmap_pkt *bmap;
14181+ u64 io_start, io_size;
14182+ struct evms_logical_node *child;
14183+
14184+ bmap = (struct evms_get_bmap_pkt *) arg;
14185+ io_start = bmap->rsector;
14186+ child = which_child(node, &io_start, &io_size);
14187+ if (child) {
14188+ if (node->block_size !=
14189+ child->block_size) {
14190+ bmap->status = -EPERM;
14191+ } else {
14192+ bmap->rsector = io_start;
14193+ rc = IOCTL(child,
14194+ inode,
14195+ file, cmd, arg);
14196+ }
14197+ }
14198+ }
14199+ break;
14200+ default:
14201+ rc = -EINVAL;
14202+ break;
14203+ }
14204+ }
14205+ return (rc);
14206+}
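+
+/* Worked example for the HDIO_GETGEO case above (editorial, assumed size):
+ * an object of 1,000,000 total_vsectors reports the fixed 255-head,
+ * 63-sector geometry, so cylinders = 1000000 / 255 / 63 = 62 with integer
+ * division; start is always 0 because the object is presented whole.
+ */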
14207+
14208+/********************************************************/
14209+/* Required Module Entry Point: */
14210+/* drivelink_init */
14211+/********************************************************/
14212+
14213+/**
14214+ * drivelink_init: register this module for use within the EVMS framework
14215+ *
14216+ * Return value: 0 on success
14217+ * otherwise error code.
14218+**/
14219+int __init
14220+drivelink_init(void)
14221+{
14222+ return evms_cs_register_plugin(&plugin_header);
14223+}
14224+
14225+/**
14226+ * drivelink_exit: unregister this module from use within the EVMS framework
14227+ *
14228+ * Return value: 0 on success
14229+ * otherwise error code.
14230+**/
14231+void __exit
14232+drivelink_exit(void)
14233+{
14234+ evms_cs_unregister_plugin(&plugin_header);
14235+}
14236+
14237+module_init(drivelink_init);
14238+module_exit(drivelink_exit);
14239+#ifdef MODULE_LICENSE
14240+MODULE_LICENSE("GPL");
14241+#endif
14242diff -Naur linux-2002-09-30/drivers/evms/evms_ecr.c evms-2002-09-30/drivers/evms/evms_ecr.c
14243--- linux-2002-09-30/drivers/evms/evms_ecr.c Wed Dec 31 18:00:00 1969
14244+++ evms-2002-09-30/drivers/evms/evms_ecr.c Fri Aug 16 16:19:56 2002
14245@@ -0,0 +1,213 @@
14246+/* -*- linux-c -*- */
14247+/*
14248+ *
14249+ * Copyright (c) International Business Machines Corp., 2000
14250+ *
14251+ * This program is free software; you can redistribute it and/or modify
14252+ * it under the terms of the GNU General Public License as published by
14253+ * the Free Software Foundation; either version 2 of the License, or
14254+ * (at your option) any later version.
14255+ *
14256+ * This program is distributed in the hope that it will be useful,
14257+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14258+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14259+ * the GNU General Public License for more details.
14260+ *
14261+ * You should have received a copy of the GNU General Public License
14262+ * along with this program; if not, write to the Free Software
14263+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
14264+ */
14265+
14266+/* linux/drivers/evms/evms_ecr.c
14267+ *
14268+ * EVMS - Cluster enablement (ECR) module
14269+ *
14270+ */
14271+
14272+
14273+#include <linux/kernel.h>
14274+#include <linux/module.h>
14275+#include <linux/init.h>
14276+#include <linux/types.h>
14277+#include <linux/evms/evms.h>
14278+#include <linux/evms/evms_ecr.h>
14279+
14280+#define LOG_PREFIX "ecr: "
14281+
14282+
14283+/*
14284+ * ecr_group_join
14285+ */
14286+ecr_group_t ecr_group_join(char *group_name, ecr_table_t *f_table,
14287+ ecr_cred_t * cred, size_t size, ecr_instance_t *instance)
14288+{
14289+ /* dummy */
14290+ return ECR_FAIL;
14291+}
14292+
14293+
14294+
14295+
14296+/*
14297+ * ecr_group_leave
14298+ */
14299+void ecr_group_leave(ecr_group_t group)
14300+{
14301+ /* dummy */
14302+ return;
14303+}
14304+
14305+
14306+
14307+/*
14308+ * ecr_group_send
14309+ */
14310+int ecr_group_send(ecr_group_t group, ecr_nodeid_t node, void *message,
14311+ size_t size, ecr_instance_t *instance,
14312+ void callback(int ret, ecr_instance_t *instance))
14313+{
14314+ /* dummy */
14315+ return ECR_FAIL;
14316+}
14317+
14318+
14319+
14320+/*
14321+ * ecr_group_send_wait
14322+ */
14323+int ecr_group_send_wait(ecr_group_t group, ecr_nodeid_t node, void *message,
14324+ size_t size, int *ret)
14325+{
14326+ /* dummy */
14327+ *ret = ECR_FAIL;
14328+ return ECR_FAIL;
14329+}
14330+
14331+
14332+
14333+/*
14334+ * ecr_group_broadcast
14335+ */
14336+int ecr_group_broadcast(ecr_group_t group, void *message, size_t size,
14337+ ecr_instance_t *instance,
14338+ void callback(u_char ret, ecr_instance_t *instance))
14339+{
14340+ /* dummy */
14341+ return ECR_FAIL;
14342+}
14343+
14344+
14345+
14346+/*
14347+ * ecr_group_broadcast_wait
14348+ */
14349+int ecr_group_broadcast_wait(ecr_group_t group, void *message, size_t size,
14350+ u_char *ret)
14351+{
14352+ /* dummy */
14353+ *ret = ECR_FAIL;
14354+ return ECR_FAIL;
14355+}
14356+
14357+
14358+
14359+/*
14360+ * ecr_group_atomic_execute
14361+ */
14362+int ecr_group_atomic_execute(ecr_group_t group, void *message, size_t size,
14363+ ecr_instance_t *instance,
14364+ void callback(ecr_instance_t *instance))
14365+{
14366+ /* dummy */
14367+ return ECR_FAIL;
14368+}
14369+
14370+
14371+
14372+/*
14373+ * ecr_group_atomic_execute_wait
14374+ */
14375+int ecr_group_atomic_execute_wait(ecr_group_t group, void *message, size_t size)
14376+{
14377+ /* dummy */
14378+ return ECR_FAIL;
14379+}
14380+
14381+
14382+
14383+/*
14384+ * ecr_group_success_response
14385+ */
14386+void ecr_group_success_response(ecr_message_t *handle)
14387+{
14388+ /* dummy */
14389+ return;
14390+}
14391+
14392+
14393+
14394+
14395+/*
14396+ * ecr_group_failure_response
14397+ */
14398+void ecr_group_failure_response(ecr_message_t *handle, int ret)
14399+{
14400+ /* dummy */
14401+ return;
14402+}
14403+
14404+
14405+
14406+/*
14407+ * ecr_lock_create
14408+ */
14409+ecr_lock_t ecr_lock_create(char *lockname)
14410+{
14411+ /* dummy */
14412+ return ECR_FAIL;
14413+}
14414+
14415+/*
14416+ * ecr_lock
14417+ */
14418+int ecr_lock(ecr_lock_t lock, u64 start, u64 length,
14419+ ecr_lock_mode_t mode, u_char flag)
14420+{
14421+ /* dummy */
14422+ return ECR_FAIL;
14423+}
14424+
14425+
14426+
14427+/*
14428+ * ecr_unlock
14429+ */
14430+int ecr_unlock(ecr_lock_t lock, u64 start, u64 length)
14431+{
14432+ /* dummy */
14433+ return ECR_FAIL;
14434+}
14435+
14436+
14437+/********************************************************/
14438+/* Required Module Entry Point: */
14439+/* ecr_init() */
14440+/********************************************************/
14441+
14442+static int __init ecr_init(void)
14443+{
14444+ /* dummy */
14445+ return 0;
14446+}
14447+
14448+static void __exit ecr_exit(void)
14449+{
14450+ return;
14451+}
14452+
14453+module_init(ecr_init);
14454+module_exit(ecr_exit);
14455+#ifdef MODULE_LICENSE
14456+MODULE_LICENSE("GPL");
14457+#endif
14458+
14459diff -Naur linux-2002-09-30/drivers/evms/evms_passthru.c evms-2002-09-30/drivers/evms/evms_passthru.c
14460--- linux-2002-09-30/drivers/evms/evms_passthru.c Wed Dec 31 18:00:00 1969
14461+++ evms-2002-09-30/drivers/evms/evms_passthru.c Fri Sep 13 16:09:55 2002
14462@@ -0,0 +1,298 @@
14463+/* -*- linux-c -*- */
14464+
14465+/*
14466+ *
14467+ *
14468+ * Copyright (c) International Business Machines Corp., 2000
14469+ *
14470+ * This program is free software; you can redistribute it and/or modify
14471+ * it under the terms of the GNU General Public License as published by
14472+ * the Free Software Foundation; either version 2 of the License, or
14473+ * (at your option) any later version.
14474+ *
14475+ * This program is distributed in the hope that it will be useful,
14476+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14477+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14478+ * the GNU General Public License for more details.
14479+ *
14480+ * You should have received a copy of the GNU General Public License
14481+ * along with this program; if not, write to the Free Software
14482+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
14483+ *
14484+ *
14485+ */
14486+/*
14487+ * linux/drivers/evms/evms_passthru.c
14488+ *
14489+ * EVMS System Data Manager
14490+ *
14491+ *
14492+ */
14493+
14494+#include <linux/module.h>
14495+#include <linux/kernel.h>
14496+#include <linux/config.h>
14497+#include <linux/genhd.h>
14498+#include <linux/string.h>
14499+#include <linux/blk.h>
14500+#include <linux/init.h>
14501+#include <linux/slab.h>
14502+#include <linux/evms/evms.h>
14503+#include <asm/system.h>
14504+
14505+#define EVMS_PASSTHRU_ID 0
14506+#define LOG_PREFIX "passthru: "
14507+
14508+static int passthru_mgr_discover(struct evms_logical_node **);
14509+static int passthru_mgr_delete(struct evms_logical_node *);
14510+static void passthru_mgr_read(struct evms_logical_node *, struct buffer_head *);
14511+static void passthru_mgr_write(struct evms_logical_node *, struct buffer_head *);
14512+static int passthru_mgr_ioctl(struct evms_logical_node *,
14513+ struct inode *,
14514+ struct file *, unsigned int, unsigned long);
14515+static int passthru_mgr_init_io(struct evms_logical_node *,
14516+ int, u64, u64, void *);
14517+
14518+static struct evms_plugin_fops fops = {
14519+ .discover = passthru_mgr_discover,
14520+ .delete = passthru_mgr_delete,
14521+ .read = passthru_mgr_read,
14522+ .write = passthru_mgr_write,
14523+ .init_io = passthru_mgr_init_io,
14524+ .ioctl = passthru_mgr_ioctl
14525+};
14526+
14527+static struct evms_plugin_header plugin_header = {
14528+ .id = SetPluginID(IBM_OEM_ID,
14529+ EVMS_FEATURE,
14530+ EVMS_PASSTHRU_ID),
14531+ .version = {
14532+ .major = 1,
14533+ .minor = 1,
14534+ .patchlevel = 1
14535+ },
14536+ .required_services_version = {
14537+ .major = 0,
14538+ .minor = 5,
14539+ .patchlevel = 0
14540+ },
14541+ .fops = &fops
14542+};
14543+
14544+/*******************************/
14545+/* discovery support functions */
14546+/*******************************/
14547+
14548+static int
14549+process_passthru_data(struct evms_logical_node **pp)
14550+{
14551+ int rc, size_in_sectors;
14552+ struct evms_logical_node *node, *new_node;
14553+
14554+ node = *pp;
14555+
14556+ size_in_sectors =
14557+ evms_cs_size_in_vsectors(sizeof (struct evms_feature_header));
14558+
14559+ /* allocate "parent" node */
14560+ rc = evms_cs_allocate_logical_node(&new_node);
14561+ if (!rc) {
14562+ /* initialize "parent" node */
14563+ new_node->private = node;
14564+ new_node->flags = node->flags;
14565+ new_node->plugin = &plugin_header;
14566+ new_node->system_id = node->system_id;
14567+ new_node->block_size = node->block_size;
14568+ new_node->hardsector_size = node->hardsector_size;
14569+ new_node->total_vsectors = node->total_vsectors;
14570+ new_node->total_vsectors -=
14571+ (size_in_sectors << 1) +
14572+ node->feature_header->alignment_padding;
14573+ new_node->volume_info = node->volume_info;
14574+ strcpy(new_node->name, node->name);
14575+ if (strlen(node->feature_header->object_name))
14576+ strcat(new_node->name,
14577+ node->feature_header->object_name);
14578+ else
14579+ strcat(new_node->name, "_Passthru");
14580+
14581+ /* return "parent" node to caller */
14582+ *pp = new_node;
14583+
14584+ MOD_INC_USE_COUNT;
14585+
14586+ LOG_DETAILS("feature header found on '%s', created '%s'.\n",
14587+ node->name, new_node->name);
14588+ /* we're done with the passthru feature headers
14589+ * so lets delete them now.
14590+ */
14591+ kfree(node->feature_header);
14592+ node->feature_header = NULL;
14593+ } else {
14594+ /* on any fatal error, delete the node */
14595+ int rc2 = DELETE(node);
14596+ if (rc2) {
14597+ LOG_DEFAULT
14598+ ("error(%d) attempting to delete node(%p,%s).\n",
14599+ rc2, node, node->name);
14600+ }
14601+ }
14602+ return (rc);
14603+}
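+
+/* Worked example (editorial, assumed values): if sizeof(struct
+ * evms_feature_header) rounds up to 1 vsector and alignment_padding is 62,
+ * a 10,000-vsector child yields a passthru node of
+ * 10000 - (1 << 1) - 62 = 9936 usable vsectors; the trimmed space holds
+ * the two redundant feature-header copies plus the padding.
+ */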
14604+
14605+/********** Required Plugin Functions **********/
14606+
14607+/*
14608+ * Function: passthru_mgr_discover
14609+ *
14610+ */
14611+static int
14612+passthru_mgr_discover(struct evms_logical_node **discover_list)
14613+{
14614+ int rc = 0;
14615+ struct evms_logical_node *node, *tmp_list_head;
14616+
14617+ MOD_INC_USE_COUNT;
14618+ tmp_list_head = *discover_list;
14619+ *discover_list = NULL;
14620+
14621+ while (tmp_list_head) {
14622+ node = tmp_list_head;
14623+ rc = evms_cs_remove_logical_node_from_list(&tmp_list_head,
14624+ node);
14625+ if (!rc)
14626+ rc = process_passthru_data(&node);
14627+ if (!rc)
14628+ if (node)
14629+ rc = evms_cs_add_logical_node_to_list
14630+ (discover_list, node);
14631+ }
14632+ MOD_DEC_USE_COUNT;
14633+ return (rc);
14634+}
14635+
14636+/*
14637+ * Function: passthru_mgr_delete
14638+ *
14639+ */
14640+static int
14641+passthru_mgr_delete(struct evms_logical_node *node)
14642+{
14643+ int rc;
14644+ struct evms_logical_node *p;
14645+
14646+ LOG_DETAILS("deleting '%s'.\n", node->name);
14647+
14648+ p = node->private;
14649+ rc = DELETE(p);
14650+ if (!rc) {
14651+ evms_cs_deallocate_logical_node(node);
14652+ MOD_DEC_USE_COUNT;
14653+ }
14654+ return (rc);
14655+}
14656+
14657+/*
14658+ * function: passthru_io_error
14659+ *
14660+ * this function was primarily created because the function
14661+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
14662+ * to be set on inline functions. Since this was an error path
14663+ * and not mainline, I decided to add a trace statement to help
14664+ * report on the failing condition.
14665+ *
14666+ */
14667+static void
14668+passthru_io_error(struct evms_logical_node *node, int io_flag, struct buffer_head *bh)
14669+{
14670+ LOG_SERIOUS
14671+ ("attempt to %s beyond boundary("PFU64") on (%s), rsector("PFU64").\n",
14672+ (io_flag) ? "WRITE" : "READ", node->total_vsectors - 1,
14673+ node->name, (u64) bh->b_rsector);
14674+
14675+ bh->b_end_io(bh, 0);
14676+}
14677+
14678+/*
14679+ * Function: passthru_mgr_read
14680+ */
14681+static void
14682+passthru_mgr_read(struct evms_logical_node *node, struct buffer_head *bh)
14683+{
14684+ if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
14685+ node->total_vsectors) {
14686+ R_IO(((struct evms_logical_node *) (node->private)), bh);
14687+ } else
14688+ passthru_io_error(node, READ, bh);
14689+}
14690+
14691+/*
14692+ * Function: passthru_mgr_write
14693+ *
14694+ */
14695+static void
14696+passthru_mgr_write(struct evms_logical_node *node, struct buffer_head *bh)
14697+{
14698+ if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
14699+ node->total_vsectors) {
14700+ W_IO(((struct evms_logical_node *) (node->private)), bh);
14701+ } else
14702+ passthru_io_error(node, WRITE, bh);
14703+}
14704+
14705+/*
14706+ * Function: passthru_mgr_ioctl
14707+ *
14708+ */
14709+static int
14710+passthru_mgr_ioctl(struct evms_logical_node *node,
14711+ struct inode *inode,
14712+ struct file *file, unsigned int cmd, unsigned long arg)
14713+{
14714+ int rc;
14715+
14716+ if ((!node) || (!inode))
14717+ rc = -EINVAL;
14718+ else
14719+ rc = IOCTL(((struct evms_logical_node *) (node->private)),
14720+ inode, file, cmd, arg);
14721+ return (rc);
14722+}
14723+
14724+static int
14725+passthru_mgr_init_io(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */
14726+ u64 sect_nr, /* disk LBA */
14727+ u64 num_sects, /* # of sectors */
14728+ void *buf_addr)
14729+{ /* buffer address */
14730+ int rc;
14731+ if ((sect_nr + num_sects) <= node->total_vsectors) {
14732+ rc = INIT_IO(((struct evms_logical_node *) (node->
14733+ private)),
14734+ io_flag, sect_nr, num_sects, buf_addr);
14735+ } else
14736+ rc = -EINVAL;
14737+ return (rc);
14738+}
14739+
14740+/*
14741+ * Function: passthru_init
14742+ *
14743+ */
14744+int __init
14745+evms_passthru_manager_init(void)
14746+{
14747+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
14748+}
14749+
14750+void __exit
14751+evms_passthru_manager_exit(void)
14752+{
14753+ evms_cs_unregister_plugin(&plugin_header);
14754+}
14755+
14756+module_init(evms_passthru_manager_init);
14757+module_exit(evms_passthru_manager_exit);
14758+#ifdef MODULE_LICENSE
14759+MODULE_LICENSE("GPL");
14760+#endif
14761diff -Naur linux-2002-09-30/drivers/evms/gpt_part.c evms-2002-09-30/drivers/evms/gpt_part.c
14762--- linux-2002-09-30/drivers/evms/gpt_part.c Wed Dec 31 18:00:00 1969
14763+++ evms-2002-09-30/drivers/evms/gpt_part.c Fri Sep 13 16:09:55 2002
14764@@ -0,0 +1,1018 @@
14765+/* -*- linux-c -*- */
14766+/*
14767+ *
14768+ *
14769+ * Copyright (c) International Business Machines Corp., 2000
14770+ *
14771+ * This program is free software; you can redistribute it and/or modify
14772+ * it under the terms of the GNU General Public License as published by
14773+ * the Free Software Foundation; either version 2 of the License, or
14774+ * (at your option) any later version.
14775+ *
14776+ * This program is distributed in the hope that it will be useful,
14777+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14778+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14779+ * the GNU General Public License for more details.
14780+ *
14781+ * You should have received a copy of the GNU General Public License
14782+ * along with this program; if not, write to the Free Software
14783+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
14784+ *
14785+ *
14786+ */
14787+
14788+/* linux/drivers/evms/gpt_part.c
14789+ *
14790+ * EVMS - EFI GPT segment manager plugin
14791+ *
14792+ * This plugin provides support for the GUID Partition Table format specified
14793+ * by the Extensible Firmware Interface documentation ... version 1.02
14794+ */
14795+
14796+#include <linux/config.h>
14797+#include <linux/module.h>
14798+#include <linux/kernel.h>
14800+#include <linux/string.h>
14801+#include <linux/blk.h>
14802+#include <asm/uaccess.h>
14803+#include <linux/evms/evms.h>
14804+
14805+/* prefix used in logging messages */
14806+#define LOG_PREFIX "gpt_part: "
14807+
14808+/**
14809+ * struct gpt_private - Private data structure for this plugin
14810+ * @source_disk:	object this IO will get remapped to
14811+ * @start_sect:	source object relative starting address in 512-byte units
14812+ * @nr_sects:	partition size in 512-byte units
14813+ * @type:	partition type or filesystem format indicator
14814+ *
14815+ * private copy of just the fields we require to remap IO requests
14816+ * to the underlying object.
14817+ **/
14818+struct gpt_private {
14819+ struct evms_logical_node *source_disk;
14820+ u64 start_sect;
14821+ u64 nr_sects;
14822+ unsigned char type;
14823+};
14824+
14825+#define GPT_DISKMAGIC 0x5452415020494645ULL	// "EFI PART"
14826+#define GPT_PNAME_SIZE 36 // max unicode partition name size
14827+
14828+/**
14829+ * struct guid - GUID structure
14830+ * @time_low: timestamp - low order 32 bits
14831+ * @time_mid: timestamp - mid 16 bits
14832+ * @time_high: timestamp - high 16 bits
14833+ * @clock_seq_high: clock - high order 8 bits
14834+ * @clock_seq_low: clock - low order 8 bits
14835+ * @node:	spatial reference - unique id (e.g. the MAC address of a NIC)
14836+ *
14837+ * GUID structure
14838+ **/
14839+struct guid {
14840+ u32 time_low;
14841+ u16 time_mid;
14842+ u16 time_high;
14843+ u8 clock_seq_high;
14844+ u8 clock_seq_low;
14845+ u8 node[6];
14846+};
14847+
14848+/**
14849+ * struct gpt_partition - GPT partition record definition
14850+ * @type: partition type
14851+ * @part_id: partition record id
14852+ * @start: address of 1st block of partition
14853+ * @end: address of last block of partition
14854+ * @attributes: bit field reserved by EFI spec
14855+ * @name: unicode name of partition
14856+ *
14857+ * GPT partition record definition
14858+ **/
14859+struct gpt_partition {
14860+ struct guid type;
14861+ struct guid part_id;
14862+ u64 start;
14863+ u64 end;
14864+ u64 attributes;
14865+ u16 name[GPT_PNAME_SIZE];
14866+};
14867+
14868+/**
14869+ * struct gpt_header - GPT header
14870+ * @signature: EFI compatible header signature
14871+ * @version: spec revision number
14872+ * @size: size (bytes) of gpt header
14873+ * @crc: crc of gpt header
14874+ * @reserve: reserved by spec ... must be zero
14875+ * @my_lba: lba of gpt header
14876+ * @alternate_lba: lba of 2nd copy of gpt header
14877+ * @start_useable: lba of 1st block of useable area on disk
14878+ * @end_useable: lba of last block of useable area on disk
14879+ * @disk_id: GUID - identifies this disk
14880+ * @ptable_lba: lba of partition table
14881+ * @ptable_count: number of entries in the partition table
14882+ * @ptable_entry_size: size of partition table entry
14883+ * @ptable_crc: crc of partition table
14884+ *
14885+ * GPT header
14886+ **/
14887+struct gpt_header {
14888+ u64 signature;
14889+ u32 version;
14890+ u32 size;
14891+ u32 crc;
14892+ u32 reserve;
14893+ u64 my_lba;
14894+ u64 alternate_lba;
14895+ u64 start_useable;
14896+ u64 end_useable;
14897+ struct guid disk_id;
14898+ u64 ptable_lba;
14899+ u32 ptable_count;
14900+ u32 ptable_entry_size;
14901+ u32 ptable_crc;
14902+};
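
/*
 * Illustrative sketch (not from the original source): the three ptable_*
 * fields above are what get_gpt_partition_table() uses to size its read.
 * The helper below assumes 512-byte vsectors; the plugin itself delegates
 * this rounding to evms_cs_size_in_vsectors().
 */
static inline u64
gpt_ptable_vsectors(struct gpt_header *gh)
{
	/* total table size in bytes, e.g. 128 entries * 128 bytes = 16 KB */
	u64 bytes = (u64) gh->ptable_count * gh->ptable_entry_size;

	/* round up to whole 512-byte vsectors */
	return (bytes + 511) >> 9;
}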
14903+
14904+struct guid EFI_SYSTEM_PARTITION = {
14905+ 0xC12A7328,
14906+ 0xF81F,
14907+ 0x11D2,
14908+ 0xBA,
14909+ 0x4B,
14910+ {0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B}
14911+};
14912+
14913+struct guid BASIC_DATA_PARTITION = {
14914+ 0xEBD0A0A2,
14915+ 0xB9E5,
14916+ 0x4433,
14917+ 0x87,
14918+ 0xC0,
14919+ {0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7}
14920+};
14921+
14922+struct guid LEGACY_MBR_PARTITION = {
14923+ 0x024DEE41,
14924+ 0x33E7,
14925+ 0x11D3,
14926+ 0x9D,
14927+ 0x69,
14928+ {0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F}
14929+};
14930+
14931+struct guid GPT_SWAP_PARTITION = {
14932+ 0x0657FD6D,
14933+ 0xA4AB,
14934+ 0x43C4,
14935+ 0x84,
14936+ 0xE5,
14937+ {0x09, 0x33, 0xC8, 0x4B, 0x4F, 0x4F}
14938+};
14939+
14940+struct guid UNUSED_GPT_PARTITION = {
14941+ 0, 0, 0, 0, 0,
14942+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
14943+};
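
/*
 * How these initializers relate to the usual GUID text form: EFI stores
 * time_low, time_mid, and time_high little-endian on disk, while the
 * clock_seq and node fields are plain bytes. The static tables above are
 * kept in CPU byte order (the on-disk values are byte-swapped at compare
 * time, see matching_guids() below), so EFI_SYSTEM_PARTITION corresponds
 * to the canonical string:
 *
 *	C12A7328-F81F-11D2-BA4B-00A0C93EC93B
 *	\______/ \__/ \__/ \__/ \__________/
 *	time_low  mid high  seq  node[0..5]
 */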
14944+
14945+static int exported_nodes; /* total # of exported segments
14946+ * produced during this discovery.
14947+ */
14948+
14949+/* Prototypes */
14950+static int partition_discover(struct evms_logical_node **);
14951+static int partition_delete(struct evms_logical_node *);
14952+static void partition_read(struct evms_logical_node *, struct buffer_head *);
14953+static void partition_write(struct evms_logical_node *, struct buffer_head *);
14954+static int partition_ioctl(struct evms_logical_node *,
14955+ struct inode *,
14956+ struct file *, unsigned int, unsigned long);
14957+static int partition_init_io(struct evms_logical_node *,
14958+ int, u64, u64, void *);
14959+
14960+static struct evms_plugin_fops fops = {
14961+ .discover = partition_discover,
14962+ .delete = partition_delete,
14963+ .read = partition_read,
14964+ .write = partition_write,
14965+ .init_io = partition_init_io,
14966+ .ioctl = partition_ioctl
14967+};
14968+
14969+#define EVMS_GPT_PARTITION_MANAGER_ID 3
14970+
14971+static struct evms_plugin_header plugin_header = {
14972+ .id = SetPluginID(IBM_OEM_ID,
14973+ EVMS_SEGMENT_MANAGER,
14974+ EVMS_GPT_PARTITION_MANAGER_ID),
14975+ .version = {
14976+ .major = 1,
14977+ .minor = 1,
14978+ .patchlevel = 1
14979+ },
14980+ .required_services_version = {
14981+ .major = 0,
14982+ .minor = 5,
14983+ .patchlevel = 0
14984+ },
14985+ .fops = &fops
14986+};
14987+
14988+/***************************************************/
14989+/* List Support - Typedefs, Variables, & Functions */
14990+/***************************************************/
14991+
14992+/* Typedefs */
14993+
14994+struct segment_list_node {
14995+ struct evms_logical_node *segment;
14996+ struct segment_list_node *next;
14997+};
14998+
14999+struct disk_list_node {
15000+ struct evms_logical_node *disk;
15001+ struct segment_list_node *segment_list;
15002+ struct disk_list_node *next;
15003+};
15004+
15005+/* Variables */
15006+
15007+static struct disk_list_node *my_disk_list;
15008+
15009+/* Functions */
15010+
15011+/*
15012+ * Function: Convert a GPT header from disk format to the arch specific
15013+ * format.
15014+ */
15015+static void
15016+disk_gpt_header_to_cpu(struct gpt_header *gh)
15017+{
15018+ gh->signature = le64_to_cpu(gh->signature);
15019+ gh->version = le32_to_cpu(gh->version);
15020+ gh->size = le32_to_cpu(gh->size);
15021+ gh->crc = le32_to_cpu(gh->crc);
15022+ gh->reserve = le32_to_cpu(gh->reserve);
15023+ gh->my_lba = le64_to_cpu(gh->my_lba);
15024+ gh->alternate_lba = le64_to_cpu(gh->alternate_lba);
15025+ gh->start_useable = le64_to_cpu(gh->start_useable);
15026+ gh->end_useable = le64_to_cpu(gh->end_useable);
15027+ gh->disk_id.time_low = le32_to_cpu(gh->disk_id.time_low);
15028+ gh->disk_id.time_mid = le16_to_cpu(gh->disk_id.time_mid);
15029+ gh->disk_id.time_high = le16_to_cpu(gh->disk_id.time_high);
15030+ gh->ptable_lba = le64_to_cpu(gh->ptable_lba);
15031+ gh->ptable_count = le32_to_cpu(gh->ptable_count);
15032+ gh->ptable_entry_size = le32_to_cpu(gh->ptable_entry_size);
15033+ gh->ptable_crc = le32_to_cpu(gh->ptable_crc);
15034+}
15035+
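/*
 * matching_guids() compares a GUID as read from disk (g1, with
 * little-endian multi-byte fields) against one of the static CPU-order
 * GUIDs above (g2), which is why only g1's fields are byte-swapped.
 * The node[] bytes are not compared; the timestamp and clock fields are
 * sufficient to distinguish the partition type GUIDs defined above.
 */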
15036+static int
15037+matching_guids(struct guid *g1, struct guid *g2)
15038+{
15039+ if ((le32_to_cpu(g1->time_low) == g2->time_low) &&
15040+ (le16_to_cpu(g1->time_mid) == g2->time_mid) &&
15041+ (le16_to_cpu(g1->time_high) == g2->time_high) &&
15042+ (g1->clock_seq_high == g2->clock_seq_high) &&
15043+ (g1->clock_seq_low == g2->clock_seq_low)) {
15044+ return 1;
15045+ }
15046+ return 0;
15047+}
15048+static inline int
15049+isa_basic_data_gpt_partition_record(struct gpt_partition *p)
15050+{
15051+ return (matching_guids(&p->type, &BASIC_DATA_PARTITION));
15052+}
15053+static inline int
15054+isa_legacy_mbr_gpt_partition_record(struct gpt_partition *p)
15055+{
15056+ return (matching_guids(&p->type, &LEGACY_MBR_PARTITION));
15057+}
15058+static inline int
15059+isa_esp_gpt_partition_record(struct gpt_partition *p)
15060+{
15061+ return (matching_guids(&p->type, &EFI_SYSTEM_PARTITION));
15062+}
15063+static inline int
15064+isa_gpt_swap_partition_record(struct gpt_partition *p)
15065+{
15066+ return (matching_guids(&p->type, &GPT_SWAP_PARTITION));
15067+}
15068+static inline int
15069+isa_unused_gpt_partition_record(struct gpt_partition *p)
15070+{
15071+ return (matching_guids(&p->type, &UNUSED_GPT_PARTITION));
15072+}
15073+
15074+static struct disk_list_node **
15075+lookup_disk(struct evms_logical_node *disk)
15076+{
15077+ struct disk_list_node **ldln;
15078+
15079+ ldln = &my_disk_list;
15080+ while (*ldln) {
15081+ if ((*ldln)->disk == disk)
15082+ break;
15083+ ldln = &(*ldln)->next;
15084+ }
15085+ return (ldln);
15086+}
15087+
15088+static struct segment_list_node **
15089+lookup_segment(struct disk_list_node *disk, struct evms_logical_node *segment)
15090+{
15091+ struct segment_list_node **lsln;
15092+
15093+ lsln = &disk->segment_list;
15094+ while (*lsln) {
15095+ if ((*lsln)->segment == segment)
15096+ break;
15097+ lsln = &(*lsln)->next;
15098+ }
15099+ return (lsln);
15100+}
15101+
15102+static struct evms_logical_node *
15103+find_segment_on_disk(struct evms_logical_node *disk,
15104+ u64 start_sect, u64 nr_sects)
15105+{
15106+ struct evms_logical_node *rc = NULL;
15107+ struct disk_list_node **ldln;
15108+ struct segment_list_node **lsln;
15109+ struct gpt_private *gpt_prv;
15110+
15111+ ldln = lookup_disk(disk);
15112+ if (*ldln) {
15113+ /* disk found in list */
15114+ /* attempt to find segment */
15115+
15116+ lsln = &(*ldln)->segment_list;
15117+ while (*lsln) {
15118+ gpt_prv = (*lsln)->segment->private;
15119+ if (gpt_prv->start_sect == start_sect)
15120+ if (gpt_prv->nr_sects == nr_sects)
15121+ break;
15122+ lsln = &(*lsln)->next;
15123+ }
15124+ if (*lsln)
15125+ rc = (*lsln)->segment;
15126+ }
15127+ return (rc);
15128+}
15129+
15130+/* function description: add_segment_to_disk
15131+ *
15132+ * this function attempts to add a segment to the segment
15133+ * list of a disk. if the specified disk is not found, it
15134+ * will be added to the global disk list.
15135+ *
15136+ * return values:
15137+ *   0       the segment was added to the disk's segment
15138+ *           list (and the disk to the global list, if it
15139+ *           was not already there).
15140+ *   -1      an equivalent segment was already present in
15141+ *           the disk's segment list; the caller's copy is
15142+ *           a duplicate and can be thrown away.
15143+ *   -ENOMEM a disk or segment list node could not be
15144+ *           allocated.
15145+ */
15145+static int
15146+add_segment_to_disk(struct evms_logical_node *disk,
15147+ struct evms_logical_node *segment)
15148+{
15149+ int rc = 0;
15150+ struct disk_list_node **ldln, *new_disk;
15151+ struct segment_list_node **lsln, *new_segment;
15152+
15153+ ldln = lookup_disk(disk);
15154+ if (*ldln == NULL) {
15155+ /* disk not in list, add disk */
15156+ new_disk = kmalloc(sizeof (*new_disk), GFP_KERNEL);
15157+ if (new_disk) {
15158+ memset(new_disk, 0, sizeof (*new_disk));
15159+ new_disk->disk = disk;
15160+ *ldln = new_disk;
15161+ } else {
15162+ rc = -ENOMEM;
15163+ }
15164+ }
15165+ if (!rc) {
15166+ /* attempt to add segment */
15167+ lsln = lookup_segment(*ldln, segment);
15168+ if (*lsln == NULL) {
15169+ /* segment not in list, add segment */
15170+ new_segment =
15171+ kmalloc(sizeof (*new_segment), GFP_KERNEL);
15172+ if (new_segment) {
15173+ memset(new_segment, 0, sizeof (*new_segment));
15174+ new_segment->segment = segment;
15175+ *lsln = new_segment;
15176+ } else {
15177+ rc = -ENOMEM;
15178+ }
15179+ } else
15180+ rc = -1;
15181+ }
15182+ return (rc);
15183+}
15184+
15185+static int
15186+remove_segment_from_disk(struct evms_logical_node *disk,
15187+ struct evms_logical_node *segment,
15188+ struct evms_logical_node **empty_disk)
15189+{
15190+ int rc = 0;
15191+ struct disk_list_node **ldln, *tmp_disk_node;
15192+ struct segment_list_node **lsln, *tmp_segment_node;
15193+
15194+ *empty_disk = NULL;
15195+ ldln = lookup_disk(disk);
15196+ if (*ldln == NULL) {
15197+ rc = -1;
15198+ } else {
15199+ /* disk found in list */
15200+ /* attempt to add segment */
15201+ lsln = lookup_segment(*ldln, segment);
15202+ if (*lsln == NULL) {
15203+ rc = -2;
15204+ } else {
15205+ tmp_segment_node = *lsln;
15206+ /* remove segment from list */
15207+ *lsln = (*lsln)->next;
15208+ /* free the segment list node */
15209+ kfree(tmp_segment_node);
15210+
15211+ if ((*ldln)->segment_list == NULL) {
15212+ tmp_disk_node = *ldln;
15213+ *empty_disk = tmp_disk_node->disk;
15214+ /* remove disk from list */
15215+ *ldln = (*ldln)->next;
15216+ /* free the disk list node */
15217+ kfree(tmp_disk_node);
15218+ }
15219+ }
15220+ }
15221+ return (rc);
15222+}
15223+
15224+/*
15225+ * Function: add_segment
15226+ */
15227+static int
15228+process_segment(struct evms_logical_node **discover_list,
15229+ struct evms_logical_node *node,
15230+ u64 start_sect,
15231+ u64 nr_sects,
15232+ int type, int part_num, int evms_top_segment)
15233+{
15234+ struct gpt_private *gpt_prv = NULL;
15235+ struct evms_logical_node *segment;
15236+ int rc = 0;
15237+
15238+ segment = find_segment_on_disk(node, start_sect, nr_sects);
15239+ if (segment) {
15240+ LOG_DETAILS("exporting segment '%s'.\n", segment->name);
15241+ } else {
15242+ gpt_prv = kmalloc(sizeof (*gpt_prv), GFP_KERNEL);
15243+ if (gpt_prv) {
15244+ gpt_prv->source_disk = node;
15245+ gpt_prv->start_sect = start_sect;
15246+ gpt_prv->nr_sects = nr_sects;
15247+ gpt_prv->type = type;
15248+ rc = evms_cs_allocate_logical_node(&segment);
15249+ } else {
15250+ rc = -ENOMEM;
15251+ }
15252+ if (!rc) {
15253+ segment->plugin = &plugin_header;
15254+ segment->system_id = (unsigned int) type;
15255+ segment->total_vsectors = nr_sects;
15256+ segment->block_size = node->block_size;
15257+ segment->hardsector_size = node->hardsector_size;
15258+ segment->private = gpt_prv;
15259+ segment->flags = node->flags;
15260+ if (evms_top_segment)
15261+ segment->iflags |= EVMS_TOP_SEGMENT;
15262+ strcpy(segment->name, node->name);
15263+ if (GetPluginType(node->plugin->id) ==
15264+ EVMS_SEGMENT_MANAGER) {
15265+ strcat(segment->name, ".");
15266+ }
15267+ sprintf(segment->name + strlen(segment->name), "%d",
15268+ part_num);
15269+ LOG_DETAILS("creating segment '%s'.\n", segment->name);
15270+ rc = add_segment_to_disk(node, segment);
15271+ if (rc) {
15272+ LOG_ERROR
15273+ ("%s: error(%d) adding segment '%s'!\n",
15274+ __FUNCTION__, rc, segment->name);
15275+ rc = 0;
15276+ } else {
15277+ MOD_INC_USE_COUNT;
15278+ }
15279+ }
15280+ if (rc) {
15281+ if (gpt_prv)
15282+ kfree(gpt_prv);
15283+ if (segment)
15284+ evms_cs_deallocate_logical_node(segment);
15285+ }
15286+ }
15287+ if (!rc) {
15288+ evms_cs_add_logical_node_to_list(discover_list, segment);
15289+ exported_nodes++;
15290+ }
15291+ return rc;
15292+}
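
/*
 * Segment naming in process_segment() follows the parent node: a segment
 * created directly on a disk is named <disk><n> (e.g. "hda" -> "hda1"),
 * while a segment carved out of another segment manager's object gets a
 * dot separator (e.g. "hda1" -> "hda1.1"), since a "." is appended
 * whenever the parent node was itself produced by a segment manager.
 */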
15293+
15294+void
15295+print_mem(void *buffer, int length)
15296+{
15297+ int i, done;
15298+ unsigned char *bufptr;
15299+
15300+ bufptr = (unsigned char *) buffer;
15301+ i = done = 0;
15302+ while (!done) {
15303+ if ((i % 16) == 0)
15304+ printk(KERN_INFO "\n0x%p->", buffer + i);
15305+ printk(KERN_INFO "%02x ", bufptr[i]);
15306+ if (++i >= length)
15307+ done++;
15308+ }
15309+ printk(KERN_INFO "\n");
15310+}
15311+
15312+/*
15313+ * Function: get GPT Partition Table - reads partition table
15314+ * into memory and performs crc check.
15315+ *
15316+ */
15317+static struct gpt_partition *
15318+get_gpt_partition_table(struct evms_logical_node *node, struct gpt_header *gh)
15319+{
15320+ int rc;
15321+ struct gpt_partition *pt;
15322+ u32 sector_count, calculated_crc;
15323+
15324+ sector_count =
15325+ evms_cs_size_in_vsectors(gh->ptable_count * gh->ptable_entry_size);
15326+
15327+ pt = kmalloc(sector_count * EVMS_VSECTOR_SIZE, GFP_KERNEL);
15328+ if (pt) {
15329+
15330+ rc = INIT_IO(node, 0, gh->ptable_lba, sector_count, pt);
15331+ if (!rc) {
15332+
15333+ calculated_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC,
15334+ pt,
15335+ gh->
15336+ ptable_count *
15337+ gh->
15338+ ptable_entry_size);
15339+
15340+ if (~calculated_crc != gh->ptable_crc) {
15341+ rc = -ENODATA;
15342+ }
15343+
15344+ }
15345+ } else {
15346+ rc = -ENOMEM;
15347+ }
15348+
15349+ if (rc) {
15350+ if (pt)
15351+ kfree(pt);
15352+ pt = NULL;
15353+ }
15354+
15355+ return (pt);
15356+}
15357+
15358+/*
15359+ * Function: Validate GPT Header - runs basic checks to
15360+ * sanity check a gpt header.
15361+ *
15362+ */
15363+static int
15364+isa_valid_gpt_header(struct evms_logical_node *node, u64 lsn,
15365+ struct gpt_header *gh)
15366+{
15367+ u32 crc;
15368+ u32 calculated_crc;
15369+ u64 sector_count;
15370+
15371+ /* signature */
15372+ if (le64_to_cpu(gh->signature) != GPT_DISKMAGIC)
15373+ return 0;
15374+
15375+ /* crc */
15376+ crc = le32_to_cpu(gh->crc);
15377+ gh->crc = 0;
15378+ calculated_crc =
15379+ ~(evms_cs_calculate_crc(EVMS_INITIAL_CRC, gh, le32_to_cpu(gh->size)));
15380+ gh->crc = cpu_to_le32(crc);
15381+
15382+ if (calculated_crc != crc)
15383+ return 0;
15384+
15385+ /* spec says lba reported by header must match actual location on disk */
15386+ if (lsn != le64_to_cpu(gh->my_lba))
15387+ return 0;
15388+
15389+ /* sanity check partition table info found in header */
15390+ if (gh->ptable_count == 0 || gh->ptable_entry_size == 0)
15391+ return 0;
15392+
15393+	sector_count =
15394+	    evms_cs_size_in_vsectors((u64) le32_to_cpu(gh->ptable_count) *
15395+				     le32_to_cpu(gh->ptable_entry_size));
15396+
15397+ if ((le64_to_cpu(gh->ptable_lba) + sector_count - 1) >=
15398+ node->total_vsectors - 1)
15399+ return 0;
15400+
15401+ return 1;
15402+}
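
/*
 * The header CRC check above uses the usual "zero the CRC field,
 * recompute, restore" idiom: the stored CRC was computed with that field
 * zeroed. A minimal sketch of the same idiom, assuming a hypothetical
 * crc32(buf, len) routine (EVMS actually post-inverts the result of
 * evms_cs_calculate_crc(), hence the ~ in the code above):
 */
static int
crc_field_matches(struct gpt_header *gh)
{
	u32 stored = le32_to_cpu(gh->crc);
	u32 computed;

	gh->crc = 0;			/* field must be zero while hashing */
	computed = crc32(gh, le32_to_cpu(gh->size));
	gh->crc = cpu_to_le32(stored);	/* put the on-disk value back */

	return computed == stored;
}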
15403+
15404+/*
15405+ * Function: get GPT Partition Table Header
15406+ *
15407+ */
15408+static struct gpt_header *
15409+get_gpt_header(struct evms_logical_node *node, u64 lsn)
15410+{
15411+ int rc;
15412+ struct gpt_header *gh = NULL;
15413+
15414+ gh = kmalloc(EVMS_VSECTOR_SIZE, GFP_KERNEL);
15415+ if (gh) {
15416+ rc = INIT_IO(node, 0, lsn, 1, gh);
15417+ if (!rc) {
15418+ if (isa_valid_gpt_header(node, lsn, gh)) {
15419+ disk_gpt_header_to_cpu(gh);
15420+ } else {
15421+ rc = -ENODATA;
15422+ }
15423+
15424+ }
15425+ if (rc) {
15426+ kfree(gh);
15427+ gh = NULL;
15428+ }
15429+ }
15430+
15431+ return (gh);
15432+}
15433+
15434+/*
15435+ * Function: Get GPT Information
15436+ *
15437+ */
15438+static int
15439+get_gpt_info(struct evms_logical_node *node,
15440+ struct gpt_header **gh, struct gpt_partition **ptable)
15441+{
15442+ struct gpt_header *gh1 = NULL, *gh2 = NULL;
15443+
15444+ *gh = NULL;
15445+ *ptable = NULL;
15446+
15447+ gh1 = get_gpt_header(node, 1); // offset past protective mbr
15448+
15449+ if (gh1) {
15450+ *gh = gh1;
15451+ gh2 = get_gpt_header(node, gh1->alternate_lba);
15452+ if (gh2)
15453+ kfree(gh2);
15454+ else
15455+ LOG_WARNING
15456+ ("alternate guid partition table header is invalid, using primary copy.\n");
15457+ } else {
15458+ gh2 = get_gpt_header(node, node->total_vsectors - 1);
15459+ if (gh2) {
15460+ *gh = gh2;
15461+ LOG_WARNING
15462+			    ("primary guid partition table header is invalid, using alternate copy.\n");
15463+ } else {
15464+ LOG_DETAILS("no gpt header discovered on node %s\n",
15465+ node->name);
15466+ return 0;
15467+ }
15468+ }
15469+
15470+ *ptable = get_gpt_partition_table(node, *gh);
15471+ if (!*ptable) {
15472+ kfree(*gh);
15473+ *gh = NULL;
15474+ return 0;
15475+ }
15476+
15477+ return 1;
15478+}
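
/*
 * GPT keeps two header copies: the primary at LBA 1 (just past the
 * protective MBR) and an alternate in the disk's last sector.
 * get_gpt_info() prefers the primary and merely warns when the alternate
 * is bad; only if the primary is unreadable does it fall back to the
 * alternate, matching the redundancy scheme of the EFI specification.
 */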
15479+
15480+/*
15481+ * Function: Probe for GPT segments on logical node
15482+ *
15483+ */
15484+static int
15485+probe_for_segments(struct evms_logical_node **discover_list,
15486+ struct evms_logical_node *node)
15487+{
15488+ int rc;
15489+ int nextminor = 1;
15490+ int evms_top_segment;
15491+ u32 i;
15492+	u64 pstart, pend;
15493+ struct gpt_header *gh = NULL;
15494+ struct gpt_partition *ptable = NULL;
15495+ struct gpt_partition *part = NULL;
15496+
15497+ /* no need to inspect our own nodes */
15498+ if (node->plugin->id == plugin_header.id)
15499+ return 0;
15500+
15501+ /* nor nodes marked as EVMS_TOP_SEGMENT */
15502+ if (node->iflags & EVMS_TOP_SEGMENT)
15503+ return 0;
15504+
15505+ /* look for guid partition table & header */
15506+ if (!get_gpt_info(node, &gh, &ptable)) {
15507+ if (gh)
15508+ kfree(gh);
15509+ if (ptable)
15510+ kfree(ptable);
15511+ return 0;
15512+ }
15513+
15514+ /* walk the guid partition table, producing segment storage objects */
15515+ for (i = 0, part = ptable; i < gh->ptable_count; i++, part++) {
15516+
15517+ if (!isa_unused_gpt_partition_record(part)) {
15518+
15519+ pstart = le64_to_cpu(part->start);
15520+ pend = le64_to_cpu(part->end);
15521+
15522+ LOG_DETAILS
15523+			    ("gpt partition start="PFU64" size="PFU64"\n",
15524+ pstart, (pend - pstart + 1));
15525+
15526+ /* stop other seg mgrs from recursive discovery on a gpt system partition */
15527+ if (isa_esp_gpt_partition_record(part))
15528+ evms_top_segment = 1;
15529+ else
15530+ evms_top_segment = 0;
15531+
15532+ rc = process_segment(discover_list,
15533+ node,
15534+ pstart,
15535+ (pend - pstart + 1),
15536+ 0, nextminor, evms_top_segment);
15537+
15538+ if (!rc) {
15539+ ++nextminor;
15540+ }
15541+ }
15542+
15543+ }
15544+
15545+ /* remove node we just consumed */
15546+ evms_cs_remove_logical_node_from_list(discover_list, node);
15547+
15548+ kfree(ptable);
15549+ kfree(gh);
15550+ return 1;
15551+}
15552+
15553+/*
15554+ * Function: partition_discover
15555+ *
15556+ */
15557+static int
15558+partition_discover(struct evms_logical_node **discover_list)
15559+{
15560+ int rc = 0;
15561+ struct evms_logical_node *node, *next_node;
15562+
15563+ MOD_INC_USE_COUNT;
15564+ LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
15565+
15566+ /* initialize global variable */
15567+ exported_nodes = 0;
15568+
15569+ /* examine each node on the discover list */
15570+ next_node = *discover_list;
15571+ while (next_node) {
15572+ node = next_node;
15573+ next_node = node->next;
15574+ probe_for_segments(discover_list, node);
15575+ }
15576+
15577+ LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
15578+ __FUNCTION__, exported_nodes, rc);
15579+ if (exported_nodes)
15580+ rc = exported_nodes;
15581+ MOD_DEC_USE_COUNT;
15582+ return (rc);
15583+}
15584+
15585+/*
15586+ * Function: partition_delete
15587+ *
15588+ */
15589+static int
15590+partition_delete(struct evms_logical_node *segment)
15591+{
15592+ int rc = 0;
15593+ struct gpt_private *gpt_prv;
15594+ struct evms_logical_node *empty_disk = NULL;
15595+
15596+	if (!segment) {
15597+		rc = -ENODEV;
15598+	} else {
15599+		LOG_DETAILS("deleting segment '%s'.\n", segment->name);
15600+
15601+		gpt_prv = segment->private;
15602+ if (gpt_prv) {
15603+ /* remove the segment from the
15604+ * disk's segment list
15605+ */
15606+ rc = remove_segment_from_disk(gpt_prv->source_disk,
15607+ segment, &empty_disk);
15608+ /* free the local instance data */
15609+ kfree(gpt_prv);
15610+ }
15611+ /* free the segment node */
15612+ evms_cs_deallocate_logical_node(segment);
15613+ MOD_DEC_USE_COUNT;
15614+ /* if the last segment on the disk was
15615+ * deleted, delete the disk node too
15616+ */
15617+ if (empty_disk)
15618+ DELETE(empty_disk);
15619+ }
15620+ return (rc);
15621+}
15622+
15623+/*
15624+ * function: partition_io_error
15625+ *
15626+ * this function was primarily created because the function
15627+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
15628+ * to be set on inline functions. Since this was an error path
15629+ * and not mainline, I decided to add a trace statement to help
15630+ * report on the failing condition.
15631+ *
15632+ */
15633+static void
15634+partition_io_error(struct evms_logical_node *node, int io_flag,
15635+ struct buffer_head *bh)
15636+{
15637+ LOG_SERIOUS
15638+ ("attempt to %s beyond partition boundary("PFU64") on (%s), rsector(%ld).\n",
15639+ (io_flag) ? "WRITE" : "READ", node->total_vsectors - 1, node->name,
15640+ bh->b_rsector);
15641+
15642+ bh->b_end_io(bh, 0);
15643+}
15644+
15645+/*
15646+ * Function: partition_read
15647+ *
15648+ */
15649+static void
15650+partition_read(struct evms_logical_node *partition, struct buffer_head *bh)
15651+{
15652+ struct gpt_private *gpt_prv = partition->private;
15653+
15654+ if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
15655+ partition->total_vsectors) {
15656+ bh->b_rsector += gpt_prv->start_sect;
15657+ R_IO(gpt_prv->source_disk, bh);
15658+ } else
15659+ partition_io_error(partition, READ, bh);
15660+}
15661+
15662+/*
15663+ * Function: partition_write
15664+ *
15665+ */
15666+static void
15667+partition_write(struct evms_logical_node *partition, struct buffer_head *bh)
15668+{
15669+ struct gpt_private *gpt_prv = partition->private;
15670+
15671+ if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
15672+ partition->total_vsectors) {
15673+ bh->b_rsector += gpt_prv->start_sect;
15674+ W_IO(gpt_prv->source_disk, bh);
15675+ } else
15676+ partition_io_error(partition, WRITE, bh);
15677+}
15678+
15679+/*
15680+ * Function: partition_init_io
15681+ *
15682+ */
15683+static int
15684+partition_init_io(struct evms_logical_node *partition, int io_flag, /* 0=read, 1=write */
15685+ u64 sect_nr, /* disk LBA */
15686+ u64 num_sects, /* # of sectors */
15687+ void *buf_addr)
15688+{ /* buffer address */
15689+ int rc;
15690+ struct gpt_private *gpt_prv = partition->private;
15691+
15692+ if ((sect_nr + num_sects) <= partition->total_vsectors) {
15693+ rc = INIT_IO(gpt_prv->source_disk, io_flag,
15694+ sect_nr + gpt_prv->start_sect, num_sects,
15695+ buf_addr);
15696+ } else {
15697+ LOG_SERIOUS
15698+ ("init_io: attempt to %s beyond partition(%s) boundary("PFU64") at sector("PFU64") for count("PFU64").\n",
15699+ (io_flag) ? "WRITE" : "READ", partition->name,
15700+ (gpt_prv->nr_sects - 1), sect_nr, num_sects);
15701+ rc = -EINVAL;
15702+ }
15703+
15704+ return (rc);
15705+}
15706+
15707+/*
15708+ * Function: partition_ioctl
15709+ *
15710+ */
15711+static int
15712+partition_ioctl(struct evms_logical_node *partition,
15713+ struct inode *inode,
15714+ struct file *file, unsigned int cmd, unsigned long arg)
15715+{
15716+ struct gpt_private *gpt_prv;
15717+ struct hd_geometry hd_geo;
15718+ int rc;
15719+
15720+ rc = 0;
15721+ gpt_prv = partition->private;
15722+ if (!inode)
15723+ return -EINVAL;
15724+ switch (cmd) {
15725+ case HDIO_GETGEO:
15726+ {
15727+ rc = IOCTL(gpt_prv->source_disk, inode, file, cmd, arg);
15728+ if (rc)
15729+ break;
15730+ if (copy_from_user
15731+ (&hd_geo, (void *) arg,
15732+ sizeof (struct hd_geometry)))
15733+ rc = -EFAULT;
15734+ if (rc)
15735+ break;
15736+ hd_geo.start = gpt_prv->start_sect;
15737+ if (copy_to_user
15738+ ((void *) arg, &hd_geo,
15739+ sizeof (struct hd_geometry)))
15740+ rc = -EFAULT;
15741+ }
15742+ break;
15743+ case EVMS_GET_BMAP:
15744+ {
15745+ struct evms_get_bmap_pkt *bmap =
15746+ (struct evms_get_bmap_pkt *) arg;
15747+ bmap->rsector += gpt_prv->start_sect;
15748+ /* intentionally fall thru to
15749+ * default ioctl down to device
15750+ * manager.
15751+ */
15752+ }
15753+ default:
15754+ rc = IOCTL(gpt_prv->source_disk, inode, file, cmd, arg);
15755+ }
15756+ return rc;
15757+}
15758+
15759+/*
15760+ * Function: gpt_module_init
15761+ *
15762+ */
15763+static int __init
15764+gpt_module_init(void)
15765+{
15766+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
15767+}
15768+
15769+/*
15770+ * Function: gpt module exit
15771+ */
15772+static void __exit
15773+gpt_module_exit(void)
15774+{
15775+ evms_cs_unregister_plugin(&plugin_header);
15776+}
15777+
15778+module_init(gpt_module_init);
15779+module_exit(gpt_module_exit);
15780+#ifdef MODULE_LICENSE
15781+MODULE_LICENSE("GPL");
15782+#endif
15783diff -Naur linux-2002-09-30/drivers/evms/ldev_mgr.c evms-2002-09-30/drivers/evms/ldev_mgr.c
15784--- linux-2002-09-30/drivers/evms/ldev_mgr.c Wed Dec 31 18:00:00 1969
15785+++ evms-2002-09-30/drivers/evms/ldev_mgr.c Fri Sep 13 16:45:06 2002
15786@@ -0,0 +1,1500 @@
15787+/* -*- linux-c -*- */
15788+/*
15789+ *
15790+ * Copyright (c) International Business Machines Corp., 2000
15791+ *
15792+ * This program is free software; you can redistribute it and/or modify
15793+ * it under the terms of the GNU General Public License as published by
15794+ * the Free Software Foundation; either version 2 of the License, or
15795+ * (at your option) any later version.
15796+ *
15797+ * This program is distributed in the hope that it will be useful,
15798+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15799+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15800+ * the GNU General Public License for more details.
15801+ *
15802+ * You should have received a copy of the GNU General Public License
15803+ * along with this program; if not, write to the Free Software
15804+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
15805+ */
15806+
15807+/* linux/drivers/evms/ldev_mgr.c
15808+ *
15809+ * EVMS - Local Device (Hard Drive) Manager
15810+ *
15811+ * This plugin walks the gendisk list and creates logical disk structures for each
15812+ * local ide or scsi device.
15813+ *
15814+ */
15815+
15816+#include <linux/config.h>
15817+#include <linux/module.h>
15818+#include <linux/errno.h>
15819+#include <linux/kernel.h>
15820+#include <linux/fs.h>
15821+#include <linux/slab.h>
15822+#include <asm/uaccess.h>
15823+#include <linux/blk.h> /* must be included by all block drivers */
15824+#include <linux/genhd.h>
15825+#include <linux/ide.h>
15826+#include <linux/version.h>
15827+#include "../scsi/scsi.h"
15828+#include "../scsi/sd.h"
15829+#include <linux/init.h>
15830+#include <linux/evms/evms.h>
15831+#include <linux/evms/ldev_mgr.h>
15832+
15833+#define LOG_PREFIX "ldev_mgr: "
15834+
15835+#define EVMS_LOCAL_DEVICE_MANAGER_ID 1
15836+
15837+/**
15838+ * struct ldev_private - private data used by this plugin
15839+ * @major: major device number
15840+ * @minor: minor device number
15841+ * @bdev: block_device record for this device
15842+ * @gd: gendisk entry for this device
15843+ * @media_changed: media changed status field
15844+ *
15845+ * private data maintained for each device by this plugin
15846+ **/
15847+struct ldev_private {
15848+ int major, minor;
15849+ struct block_device *bdev;
15850+ struct gendisk *gd;
15851+ int media_changed;
15852+};
15853+
15854+/* prototypes for mandatory plugin interface functions */
15855+static int discover_disks(struct evms_logical_node **);
15856+static int ldev_mgr_delete(struct evms_logical_node *);
15857+static void ldev_mgr_read(struct evms_logical_node *, struct buffer_head *);
15858+static void ldev_mgr_write(struct evms_logical_node *, struct buffer_head *);
15859+static int ldev_mgr_ioctl(struct evms_logical_node *,
15860+ struct inode *,
15861+ struct file *, unsigned int, unsigned long);
15862+static int ldev_init_io(struct evms_logical_node *,
15863+ int, u64, u64, void *);
15864+static int ldev_mgr_direct_ioctl(struct inode *,
15865+ struct file *, unsigned int, unsigned long);
15866+
15867+/* plugin function table definition */
15868+static struct evms_plugin_fops fops = {
15869+ .discover = discover_disks,
15870+ .delete = ldev_mgr_delete,
15871+ .read = ldev_mgr_read,
15872+ .write = ldev_mgr_write,
15873+ .init_io = ldev_init_io,
15874+ .ioctl = ldev_mgr_ioctl,
15875+ .direct_ioctl = ldev_mgr_direct_ioctl
15876+};
15877+
15878+/* plugin header definition */
15879+static struct evms_plugin_header plugin_header = {
15880+ .id = SetPluginID(IBM_OEM_ID,
15881+ EVMS_DEVICE_MANAGER,
15882+ EVMS_LOCAL_DEVICE_MANAGER_ID),
15883+ .version = {
15884+ .major = 1,
15885+ .minor = 1,
15886+ .patchlevel = 1
15887+ },
15888+ .required_services_version = {
15889+ .major = 0,
15890+ .minor = 5,
15891+ .patchlevel = 0
15892+ },
15893+ .fops = &fops
15894+};
15895+
15896+#define TYPE_NONE 0
15897+#define TYPE_GENERIC 1
15898+#define TYPE_IDE 2
15899+#define TYPE_SCSI 3
15900+
15901+#define INDEX_ALPHA 0
15902+#define INDEX_NUMERIC 1
15903+
15904+/********************************************************/
15905+/* Required Plugin Function Table Entry Point: */
15906+/* Discover function & Support routines */
15907+/********************************************************/
15908+
15909+#define MAX_NAME_BASE_SIZE 10
15910+#define MAX_NAME_MODIFIER_SIZE 4
15911+/**
15912+ * struct blk_device_info - block device info
15913+ * @devnode_name_base:	base name (e.g. hd or sd) for device
15914+ * @null1:	guaranteed end-of-string NULL
15915+ * @devnode_name_modifier:	name suffix (e.g. ag for sdag) for device
15916+ * @null2:	guaranteed end-of-string NULL
15917+ * @devnode_name_index:	numeric device index (e.g. 1 for hda1)
15918+ * @devnode_name_type:	indicates numeric or alpha modifier
15919+ * @device_type:	device type: IDE, SCSI, or GENERIC
15920+ *
15921+ * generic block device naming descriptor structure
15922+ **/
15923+struct blk_device_info {
15924+ char devnode_name_base[MAX_NAME_BASE_SIZE];
15925+ char null1;
15926+ char devnode_name_modifier[MAX_NAME_MODIFIER_SIZE];
15927+ char null2;
15928+ int devnode_name_index;
15929+ int devnode_name_type;
15930+ int device_type;
15931+};
15932+
15933+static struct blk_device_info *blk_dev_info = NULL;
15934+
15935+#define BLK_DEV_INFO(a,b,c,d,e) \
15936+ strncpy(blk_dev_info[a].devnode_name_base, b, MAX_NAME_BASE_SIZE); \
15937+ blk_dev_info[a].null1 = 0; \
15938+ strncpy(blk_dev_info[a].devnode_name_modifier, c, MAX_NAME_MODIFIER_SIZE); \
15939+ blk_dev_info[a].null2 = 0; \
15940+ blk_dev_info[a].devnode_name_index = 0; \
15941+ blk_dev_info[a].device_type = d; \
15942+ blk_dev_info[a].devnode_name_type = e;
15943+
15944+static void
15945+init_blk_dev_info(struct blk_device_info *blk_dev_info)
15946+{
15947+ BLK_DEV_INFO(IDE0_MAJOR, "hd", "a", TYPE_IDE, INDEX_ALPHA);
15948+ BLK_DEV_INFO(IDE1_MAJOR, "hd", "c", TYPE_IDE, INDEX_ALPHA);
15949+ BLK_DEV_INFO(IDE2_MAJOR, "hd", "e", TYPE_IDE, INDEX_ALPHA);
15950+ BLK_DEV_INFO(IDE3_MAJOR, "hd", "g", TYPE_IDE, INDEX_ALPHA);
15951+ BLK_DEV_INFO(IDE4_MAJOR, "hd", "i", TYPE_IDE, INDEX_ALPHA);
15952+ BLK_DEV_INFO(IDE5_MAJOR, "hd", "k", TYPE_IDE, INDEX_ALPHA);
15953+ BLK_DEV_INFO(IDE6_MAJOR, "hd", "m", TYPE_IDE, INDEX_ALPHA);
15954+ BLK_DEV_INFO(IDE7_MAJOR, "hd", "o", TYPE_IDE, INDEX_ALPHA);
15955+ BLK_DEV_INFO(IDE8_MAJOR, "hd", "q", TYPE_IDE, INDEX_ALPHA);
15956+ BLK_DEV_INFO(IDE9_MAJOR, "hd", "s", TYPE_IDE, INDEX_ALPHA);
15957+
15958+ BLK_DEV_INFO(SCSI_DISK0_MAJOR, "sd", "a", TYPE_SCSI, INDEX_ALPHA);
15959+ BLK_DEV_INFO(SCSI_DISK1_MAJOR, "sd", "q", TYPE_SCSI, INDEX_ALPHA);
15960+ BLK_DEV_INFO(SCSI_DISK2_MAJOR, "sd", "ag", TYPE_SCSI, INDEX_ALPHA);
15961+ BLK_DEV_INFO(SCSI_DISK3_MAJOR, "sd", "aw", TYPE_SCSI, INDEX_ALPHA);
15962+ BLK_DEV_INFO(SCSI_DISK4_MAJOR, "sd", "bm", TYPE_SCSI, INDEX_ALPHA);
15963+ BLK_DEV_INFO(SCSI_DISK5_MAJOR, "sd", "cc", TYPE_SCSI, INDEX_ALPHA);
15964+ BLK_DEV_INFO(SCSI_DISK6_MAJOR, "sd", "cs", TYPE_SCSI, INDEX_ALPHA);
15965+ BLK_DEV_INFO(SCSI_DISK7_MAJOR, "sd", "di", TYPE_SCSI, INDEX_ALPHA);
15966+
15967+ BLK_DEV_INFO(XT_DISK_MAJOR, "xd", "a", TYPE_GENERIC, INDEX_ALPHA);
15968+
15969+ BLK_DEV_INFO(CYCLADES_MAJOR, "double", "0", TYPE_GENERIC,
15970+ INDEX_NUMERIC);
15971+
15972+ BLK_DEV_INFO(MFM_ACORN_MAJOR, "mfm", "a", TYPE_GENERIC, INDEX_ALPHA);
15973+
15974+ BLK_DEV_INFO(ACSI_MAJOR, "ad", "a", TYPE_GENERIC, INDEX_ALPHA);
15975+
15976+ BLK_DEV_INFO(PS2ESDI_MAJOR, "ed", "a", TYPE_GENERIC, INDEX_ALPHA);
15977+
15978+ BLK_DEV_INFO(40, "ez", "a", TYPE_GENERIC, INDEX_ALPHA);
15979+ BLK_DEV_INFO(43, "nb", "0", TYPE_GENERIC, INDEX_NUMERIC);
15980+ BLK_DEV_INFO(44, "ftl", "a", TYPE_GENERIC, INDEX_ALPHA);
15981+ BLK_DEV_INFO(45, "pd", "a", TYPE_GENERIC, INDEX_ALPHA);
15982+ BLK_DEV_INFO(47, "pf", "0", TYPE_GENERIC, INDEX_NUMERIC);
15983+
15984+ BLK_DEV_INFO(DAC960_MAJOR + 0, "rd/c0d", "0", TYPE_GENERIC,
15985+ INDEX_NUMERIC);
15986+ BLK_DEV_INFO(DAC960_MAJOR + 1, "rd/c1d", "0", TYPE_GENERIC,
15987+ INDEX_NUMERIC);
15988+ BLK_DEV_INFO(DAC960_MAJOR + 2, "rd/c2d", "0", TYPE_GENERIC,
15989+ INDEX_NUMERIC);
15990+ BLK_DEV_INFO(DAC960_MAJOR + 3, "rd/c3d", "0", TYPE_GENERIC,
15991+ INDEX_NUMERIC);
15992+ BLK_DEV_INFO(DAC960_MAJOR + 4, "rd/c4d", "0", TYPE_GENERIC,
15993+ INDEX_NUMERIC);
15994+ BLK_DEV_INFO(DAC960_MAJOR + 5, "rd/c5d", "0", TYPE_GENERIC,
15995+ INDEX_NUMERIC);
15996+ BLK_DEV_INFO(DAC960_MAJOR + 6, "rd/c6d", "0", TYPE_GENERIC,
15997+ INDEX_NUMERIC);
15998+ BLK_DEV_INFO(DAC960_MAJOR + 7, "rd/c7d", "0", TYPE_GENERIC,
15999+ INDEX_NUMERIC);
16000+
16001+ BLK_DEV_INFO(COMPAQ_SMART2_MAJOR, "ida/c0d", "0", TYPE_GENERIC,
16002+ INDEX_NUMERIC);
16003+ BLK_DEV_INFO(COMPAQ_SMART2_MAJOR1, "ida/c1d", "0", TYPE_GENERIC,
16004+ INDEX_NUMERIC);
16005+ BLK_DEV_INFO(COMPAQ_SMART2_MAJOR2, "ida/c2d", "0", TYPE_GENERIC,
16006+ INDEX_NUMERIC);
16007+ BLK_DEV_INFO(COMPAQ_SMART2_MAJOR3, "ida/c3d", "0", TYPE_GENERIC,
16008+ INDEX_NUMERIC);
16009+ BLK_DEV_INFO(COMPAQ_SMART2_MAJOR4, "ida/c4d", "0", TYPE_GENERIC,
16010+ INDEX_NUMERIC);
16011+ BLK_DEV_INFO(COMPAQ_SMART2_MAJOR5, "ida/c5d", "0", TYPE_GENERIC,
16012+ INDEX_NUMERIC);
16013+ BLK_DEV_INFO(COMPAQ_SMART2_MAJOR6, "ida/c6d", "0", TYPE_GENERIC,
16014+ INDEX_NUMERIC);
16015+ BLK_DEV_INFO(COMPAQ_SMART2_MAJOR7, "ida/c7d", "0", TYPE_GENERIC,
16016+ INDEX_NUMERIC);
16017+
16018+ BLK_DEV_INFO(I2O_MAJOR + 0, "i2o/hd", "a", TYPE_GENERIC, INDEX_ALPHA);
16019+ BLK_DEV_INFO(I2O_MAJOR + 1, "i2o/hd", "q", TYPE_GENERIC, INDEX_ALPHA);
16020+ BLK_DEV_INFO(I2O_MAJOR + 2, "i2o/hd", "ag", TYPE_GENERIC, INDEX_ALPHA);
16021+ BLK_DEV_INFO(I2O_MAJOR + 3, "i2o/hd", "aw", TYPE_GENERIC, INDEX_ALPHA);
16022+ BLK_DEV_INFO(I2O_MAJOR + 4, "i2o/hd", "bm", TYPE_GENERIC, INDEX_ALPHA);
16023+ BLK_DEV_INFO(I2O_MAJOR + 5, "i2o/hd", "cc", TYPE_GENERIC, INDEX_ALPHA);
16024+ BLK_DEV_INFO(I2O_MAJOR + 6, "i2o/hd", "cs", TYPE_GENERIC, INDEX_ALPHA);
16025+ BLK_DEV_INFO(I2O_MAJOR + 7, "i2o/hd", "di", TYPE_GENERIC, INDEX_ALPHA);
16026+
16027+ BLK_DEV_INFO(92, "ppdd", "0", TYPE_GENERIC, INDEX_NUMERIC);
16028+ BLK_DEV_INFO(93, "nftl", "a", TYPE_GENERIC, INDEX_ALPHA);
16029+
16030+ BLK_DEV_INFO(DASD_MAJOR, "dasd", "a", TYPE_GENERIC, INDEX_ALPHA);
16031+ BLK_DEV_INFO(MDISK_MAJOR, "mdisk", "a", TYPE_GENERIC, INDEX_ALPHA);
16032+
16033+ BLK_DEV_INFO(96, "msd", "0", TYPE_GENERIC, INDEX_NUMERIC);
16034+ BLK_DEV_INFO(97, "pktcdvd", "0", TYPE_GENERIC, INDEX_NUMERIC);
16035+
16036+ BLK_DEV_INFO(UBD_MAJOR, "ubd", "0", TYPE_GENERIC, INDEX_NUMERIC);
16037+
16038+ BLK_DEV_INFO(JSFD_MAJOR, "jsfd", "", TYPE_GENERIC, INDEX_NUMERIC);
16039+
16040+ BLK_DEV_INFO(101, "amiraid/ar", "0", TYPE_GENERIC, INDEX_NUMERIC);
16041+
16042+ BLK_DEV_INFO(104, "cciss/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16043+ BLK_DEV_INFO(105, "cciss/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16044+ BLK_DEV_INFO(106, "cciss/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16045+ BLK_DEV_INFO(107, "cciss/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16046+ BLK_DEV_INFO(108, "cciss/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16047+	BLK_DEV_INFO(109, "cciss/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16048+ BLK_DEV_INFO(110, "cciss/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16049+ BLK_DEV_INFO(111, "cciss/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16050+
16051+ BLK_DEV_INFO(RAW_MAJOR, "raw", "0", TYPE_GENERIC, INDEX_NUMERIC);
16052+
16053+ BLK_DEV_INFO(VXVM_MAJOR, "vx/dsk", "0", TYPE_GENERIC, INDEX_NUMERIC);
16054+ BLK_DEV_INFO(VXDMP_MAJOR, "vx/dmp", "0", TYPE_GENERIC, INDEX_NUMERIC);
16055+ BLK_DEV_INFO(LOOP_MAJOR, "loop", "0", TYPE_GENERIC, INDEX_NUMERIC);
16056+}
16057+
16058+static int
16059+is_in_device_list(struct gendisk *gd, int major, int minor)
16060+{
16061+ int found, done, rc;
16062+ struct evms_logical_node *device = NULL;
16063+ struct ldev_private *ldev_prv;
16064+
16065+ done = found = FALSE;
16066+ while (done == FALSE) {
16067+ rc = evms_cs_find_next_device(device, &device);
16068+ if (rc || !device)
16069+ done = TRUE;
16070+ else {
16071+ ldev_prv = device->private;
16072+ if (ldev_prv->gd == gd)
16073+ if (ldev_prv->major == major)
16074+ if (ldev_prv->minor == minor)
16075+ done = found = TRUE;
16076+ }
16077+ }
16078+ return (found);
16079+}
16080+
16081+static void
16082+build_devnode_name(char *name_buf, int major)
16083+{
16084+ char buf[11], *modifier, *buf_ptr;
16085+ int int_mod, done;
16086+ struct blk_device_info *bdi;
16087+
16088+ bdi = &blk_dev_info[major];
16089+
16090+ /* convert the base name modifier to an integer */
16091+ modifier = bdi->devnode_name_modifier;
16092+ int_mod = 0;
16093+ while (*modifier) {
16094+ if (bdi->devnode_name_type == INDEX_ALPHA) {
16095+ int_mod *= 26;
16096+ int_mod += *modifier - 'a';
16097+ } else {
16098+ int_mod *= 10;
16099+ int_mod += *modifier - '0';
16100+ }
16101+ modifier++;
16102+ if (*modifier) {
16103+ int_mod++;
16104+ }
16105+ }
16106+ /* add in device_index_value */
16107+ int_mod += bdi->devnode_name_index;
16108+ bdi->devnode_name_index++;
16109+
16110+ /* convert integer modifier back to ALPHA/NUMERIC chars */
16111+ memset(buf, 0, sizeof (buf));
16112+ /* fill the buffer from the rear to front with the
16113+ * ascii version of the modifier, leaving space for
16114+ * NULL terminator at the end.
16115+ */
16116+ buf_ptr = &buf[sizeof (buf) - 2];
16117+ done = FALSE;
16118+ do {
16119+ if (bdi->devnode_name_type == INDEX_ALPHA) {
16120+ *buf_ptr = (int_mod % 26) + 'a';
16121+ int_mod /= 26;
16122+ } else {
16123+ *buf_ptr = (int_mod % 10) + '0';
16124+ int_mod /= 10;
16125+ }
16126+ if (int_mod) {
16127+ int_mod--;
16128+ } else {
16129+ done = TRUE;
16130+ }
16131+ buf_ptr--;
16132+ } while (!done);
16133+
16134+ /* find beginning of modifier in buffer */
16135+ modifier = buf;
16136+ while (!*modifier)
16137+ modifier++;
16138+
16139+ /* build the final device devnode name */
16140+ sprintf(name_buf, "%s%s", bdi->devnode_name_base, modifier);
16141+}
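
/*
 * The modifier encoding handled above is bijective base-26, like
 * spreadsheet column names: "a" is 0, "z" is 25, "aa" is 26, with the
 * decode loop adding 1 for each remaining character. Worked example for
 * the SCSI table entries:
 *
 *	"a"  -> 0    (sda,  first disk on SCSI_DISK0_MAJOR)
 *	"q"  -> 16   (sdq,  first disk on SCSI_DISK1_MAJOR)
 *	"ag" -> 32   (sdag, first disk on SCSI_DISK2_MAJOR)
 *
 * decode(modifier) + devnode_name_index is then re-encoded by the
 * do/while loop, so the second disk on SCSI_DISK2_MAJOR becomes "sdah".
 */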
16142+
16143+static int
16144+ldev_mgr_lock_device(struct ldev_private *ldev_prv)
16145+{
16146+ int rc;
16147+ struct block_device *bdev;
16148+
16149+ bdev = bdget(MKDEV(ldev_prv->major, ldev_prv->minor));
16150+ if (!bdev)
16151+ return -ENOMEM;
16152+ rc = blkdev_get(bdev, FMODE_READ | FMODE_WRITE, 0, BDEV_RAW);
16153+ if (rc)
16154+ return rc;
16155+ ldev_prv->bdev = bdev;
16156+ return 0;
16157+}
16158+
16159+static void
16160+ldev_mgr_unlock_device(struct ldev_private *ldev_prv)
16161+{
16162+ struct block_device *bdev = ldev_prv->bdev;
16163+ ldev_prv->bdev = NULL;
16164+ if (!bdev) {
16165+ LOG_ERROR("error: NULL bdev field detected!\n");
16166+ BUG();
16167+ }
16168+ blkdev_put(bdev, BDEV_RAW);
16169+}
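
/*
 * "Locking" a device here simply means holding an open reference on its
 * block_device: bdget() pins the bdev and blkdev_get() opens it in raw
 * mode, which keeps the underlying driver and media claimed while EVMS
 * is using the disk. ldev_mgr_unlock_device() releases that reference
 * with blkdev_put() when the corresponding node is deleted.
 */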
16170+
16171+#define DEVICE_KNOWN 1234
16172+#define DEVICE_UNINITIALIZED 1235
16173+#define DEVICE_MEDIA_NOT_PRESENT 1236
16174+static int
16175+create_logical_disk(struct evms_logical_node **disk_list,
16176+ struct gendisk *gd, int device_index)
16177+{
16178+ int rc = 0, major, minor;
16179+ struct evms_logical_node *new_disk = NULL;
16180+ struct ldev_private *ldev_prv = NULL;
16181+ char device_name[EVMS_VOLUME_NAME_SIZE + 1];
16182+
16183+ major = gd->major;
16184+ minor = device_index << gd->minor_shift;
16185+
16186+ /* skip uninitialized devices */
16187+ if (!blk_size[major])
16188+ rc = DEVICE_UNINITIALIZED;
16189+ else if (!blk_size[major][minor])
16190+ rc = DEVICE_UNINITIALIZED;
16191+ if (!rc) {
16192+ /* construct the devnode name for this device */
16193+ build_devnode_name(device_name, major);
16194+
16195+ /* skip devices we already know about */
16196+ if (is_in_device_list(gd, major, minor) == TRUE)
16197+ rc = DEVICE_KNOWN;
16198+ }
16199+ /* allocate the new node */
16200+ if (!rc) {
16201+ rc = evms_cs_allocate_logical_node(&new_disk);
16202+ }
16203+ /* allocate new nodes's instance data */
16204+ if (!rc) {
16205+ ldev_prv = kmalloc(sizeof(struct ldev_private), GFP_KERNEL);
16206+ if (!ldev_prv)
16207+ rc = -ENOMEM;
16208+ }
16209+ /* initialize the new node */
16210+ if (!rc) {
16211+ memset(ldev_prv, 0, sizeof(struct ldev_private));
16212+ new_disk->plugin = &plugin_header;
16213+
16214+ /* initialize the instance data */
16215+ new_disk->private = ldev_prv;
16216+ ldev_prv->gd = gd;
16217+ ldev_prv->major = major;
16218+ ldev_prv->minor = minor;
16219+ rc = ldev_mgr_lock_device(ldev_prv);
16220+ if (rc) {
16221+ LOG_ERROR("error(%d): unable to lock device(%d,%d)!\n",
16222+ rc, major, minor);
16223+ }
16224+ }
16225+ if (!rc) {
16226+ /* determine hardsector size */
16227+ new_disk->hardsector_size = 512;
16228+ if (hardsect_size[major]) {
16229+ new_disk->hardsector_size = hardsect_size[major][minor];
16230+ }
16231+ /* save the block size */
16232+ new_disk->block_size = 1024;
16233+ if (blksize_size[major]) {
16234+ new_disk->block_size = blksize_size[major][minor];
16235+ }
16236+ /* obtain the device size in sectors
16237+ *
16238+ * try 64bit size first, if that fails
16239+ * fall back on the 32bit size.
16240+ */
16241+ /* try 64bit size */
16242+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,18)
16243+ rc = evms_cs_kernel_ioctl(new_disk, BLKGETSIZE64,
16244+ (ulong) & new_disk->total_vsectors);
16245+ if (!rc) {
16246+ /* convert bytes to 512 byte sectors */
16247+ new_disk->total_vsectors >>= EVMS_VSECTOR_SIZE_SHIFT;
16248+ } else
16249+#endif
16250+ {
16251+ /* try 32bit size */
16252+ ulong dev_size = 0;
16253+ rc = evms_cs_kernel_ioctl(new_disk, BLKGETSIZE,
16254+ (ulong) & dev_size);
16255+ new_disk->total_vsectors = dev_size;
16256+ }
16257+ if (!rc && !new_disk->total_vsectors) {
16258+ rc = -ENOSPC;
16259+ }
16260+ }
16261+ if (!rc) {
16262+ /* remember removable devices */
16263+ if (gd->flags)
16264+ if (gd->flags[device_index] & GENHD_FL_REMOVABLE)
16265+ new_disk->flags |= EVMS_DEVICE_REMOVABLE;
16266+
16267+ /* save the devnode name for this device */
16268+ strcpy(new_disk->name, device_name);
16269+
16270+ /* register this device with evms */
16271+ evms_cs_register_device(new_disk);
16272+ MOD_INC_USE_COUNT;
16273+
16274+ /* append this record the linked list */
16275+ evms_cs_add_logical_node_to_list(disk_list, new_disk);
16276+ LOG_DETAILS
16277+ ("added logical disk(%s) for physical disk(%u,%u,%s), size("PFU64") in 512 byte units\n",
16278+ new_disk->name, major, minor, new_disk->name,
16279+ new_disk->total_vsectors);
16280+
16281+ }
16282+ /* reset the "benign" error codes for the caller */
16283+ switch (rc) {
16284+ case DEVICE_UNINITIALIZED:
16285+ case DEVICE_KNOWN:
16286+ case DEVICE_MEDIA_NOT_PRESENT:
16287+ rc = 0;
16288+ case 0:
16289+ break;
16290+ default:
16291+ LOG_ERROR
16292+ ("error(%d): creating logical disk for device(%d,%d).\n",
16293+ rc, major, minor);
16294+ if (new_disk) {
16295+ evms_cs_deallocate_logical_node(new_disk);
16296+ }
16297+ if (ldev_prv) {
16298+ kfree(ldev_prv);
16299+ }
16300+ break;
16301+ }
16302+ return (rc);
16303+}
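
/*
 * Size probing in create_logical_disk(): BLKGETSIZE64 (available from
 * 2.4.18 on) reports the device size in bytes, which is then shifted
 * down to 512-byte vsectors, while the older BLKGETSIZE fallback already
 * reports 512-byte sectors in an unsigned long - hence only the 64-bit
 * path performs the conversion.
 */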
16304+
16305+static int
16306+create_logical_generic_disks(struct evms_logical_node **disk_list,
16307+ struct gendisk *gd)
16308+{
16309+ int rc, i;
16310+
16311+ /* This is a generic device */
16312+
16313+ rc = 0;
16314+ LOG_DEBUG("major name = %s\n", gd->major_name);
16315+ LOG_DEBUG("number of real devices = %i\n", gd->nr_real);
16316+ for (i = 0; i < gd->nr_real; i++) {
16317+ LOG_DEBUG("device %d:\n", i);
16318+ rc = create_logical_disk(disk_list, gd, i);
16319+ if (rc)
16320+ break;
16321+ }
16322+ return (rc);
16323+}
16324+
16325+static int
16326+create_logical_ide_disks(struct evms_logical_node **disk_list,
16327+ struct gendisk *gd)
16328+{
16329+ int rc = 0, i;
16330+ ide_hwif_t *ide_hwif;
16331+ ide_drive_t *drive;
16332+
16333+ /* This is an IDE device */
16334+ LOG_DEBUG("found IDE major : %i - searching for disks\n", gd->major);
16335+
16336+ ide_hwif = gd->real_devices; /* IDE internal data */
16337+ for (i = 0; i < MAX_DRIVES; i++) {
16338+ drive = &(ide_hwif->drives[i]);
16339+ if (drive->present && (drive->media == ide_disk)) {
16340+ /* force the name index value on ide drives */
16341+ blk_dev_info[gd->major].devnode_name_index = i;
16342+ rc = create_logical_disk(disk_list, gd, i);
16343+ }
16344+ if (rc)
16345+ break;
16346+ }
16347+ return (rc);
16348+}
16349+
16350+static int
16351+create_logical_scsi_disks(struct evms_logical_node **disk_list,
16352+ struct gendisk *gd)
16353+{
16354+ int rc = 0, i;
16355+ Scsi_Disk *SDisks;
16356+ Scsi_Device *SDev;
16357+
16358+ /* This is an SCSI device */
16359+ LOG_DEBUG("found SCSI major : %i - searching for disks\n", gd->major);
16360+ LOG_DEBUG("scsi: major name = %s\n", gd->major_name);
16361+ LOG_DEBUG("scsi: number of real devices = %i\n", gd->nr_real);
16362+ SDisks = gd->real_devices; /* SCSI internal data */
16363+ for (i = 0; i < gd->nr_real; i++) {
16364+ SDev = SDisks[i].device;
16365+ LOG_DEBUG
16366+ ("scsi: Channel = %i, Id = %i, Lun = %i, Capacity = %i\n",
16367+ SDev->channel, SDev->id, SDev->lun, SDisks[i].capacity);
16368+ rc = create_logical_disk(disk_list, gd, i);
16369+ if (rc)
16370+ break;
16371+ }
16372+ return (rc);
16373+}
16374+
16375+static int
16376+create_logical_disks(struct gendisk *gd, void *p_disk_list)
16377+{
16378+ int rc = 0;
16379+ struct evms_logical_node **disk_list = p_disk_list;
16380+
16381+ /* create logical disks from all IDE & SCSI devices */
16382+ switch (blk_dev_info[gd->major].device_type) {
16383+ case TYPE_IDE:
16384+ rc = create_logical_ide_disks(disk_list, gd);
16385+ break;
16386+ case TYPE_SCSI:
16387+ rc = create_logical_scsi_disks(disk_list, gd);
16388+ break;
16389+ case TYPE_GENERIC:
16390+ rc = create_logical_generic_disks(disk_list, gd);
16391+ break;
16392+ default:
16393+ LOG_DEBUG("unrecognized device major : %i\n", gd->major);
16394+ break;
16395+ }
16396+
16397+ return (rc);
16398+}
16399+
16400+static int
16401+discover_disks(struct evms_logical_node **disk_list)
16402+{
16403+ int rc = 0;
16404+
16405+ MOD_INC_USE_COUNT;
16406+ LOG_ENTRY_EXIT("%s Entry\n", __FUNCTION__);
16407+
16408+ if (blk_dev_info == NULL) {
16409+ /* allocate space for device info array */
16410+ blk_dev_info = kmalloc(sizeof (struct blk_device_info)
16411+ * (MAX_BLKDEV + 1), GFP_KERNEL);
16412+ if (blk_dev_info) {
16413+ /* initialize device info array */
16414+ memset(blk_dev_info, 0,
16415+ sizeof (struct blk_device_info) * (MAX_BLKDEV + 1));
16416+ init_blk_dev_info(blk_dev_info);
16417+ } else {
16418+ rc = -ENOMEM;
16419+ }
16420+ }
16421+ if (!rc)
16422+ /* create logical disks from the raw devices */
16423+ rc = walk_gendisk(create_logical_disks, disk_list);
16424+
16425+ /* free blk_dev_info table and null the ptr to it */
16426+ kfree(blk_dev_info);
16427+ blk_dev_info = NULL;
16428+
16429+ LOG_ENTRY_EXIT("%s Exit\n", __FUNCTION__);
16430+ MOD_DEC_USE_COUNT;
16431+ return (rc);
16432+}
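
/*
 * discover_disks() is driven by walk_gendisk(), which calls
 * create_logical_disks() once per registered gendisk and passes the
 * disk_list pointer through as the opaque cookie - the usual kernel
 * iterator-with-callback pattern. The blk_dev_info naming table is only
 * needed for the duration of the walk, so it is freed again on exit.
 */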
16433+
16434+/********************************************************/
16435+/* Required Plugin Function Table Entry Point: */
16436+/* Delete function */
16437+/********************************************************/
16438+
16439+static int
16440+ldev_mgr_delete(struct evms_logical_node *disk)
16441+{
16442+ struct ldev_private *ldev_prv;
16443+
16444+ /* reset any evms volume related info from
16445+ * the device node, because we can't predict
16446+ * how this node will be used in the future.
16447+ */
16448+
16449+	/* remove the feature header if it's been used
16450+	 */
16451+ if (disk->feature_header) {
16452+ kfree(disk->feature_header);
16453+ disk->feature_header = NULL;
16454+ }
16455+ /* remove the volume_info structure and flag
16456+ * if this has been used directly by an evms
16457+ * feature.
16458+ */
16459+ evms_cs_deallocate_volume_info(disk);
16460+ /* reset the flags field to the appropriate state
16461+ */
16462+ disk->flags &= ~EVMS_VOLUME_FLAG;
16463+
16464+ /* disk nodes only get deleted when:
16465+ * 1) there are no references to the disk node
16466+ * in memory.
16467+ * 2) the device is removable
16468+ * 3) the device reported a media change
16469+ *
16470+ * All three of these conditions must be true
16471+ * before the disk node can be deleted.
16472+ * evms_check_for_device_changes should set
16473+ * and ensure these conditions before issuing
16474+ * deletes.
16475+ *
16476+ * Newly installed removable media will be
16477+	 * picked up in this module's discover code.
16478+	 *
16479+	 * OR disk nodes will be deleted if the
16480+	 * devices they represent go away, for example
16481+	 * in the case of a hot-unplugged device or a
16482+	 * required driver having been unloaded.
16483+	 */
16484+ if (disk->flags & (EVMS_MEDIA_CHANGED | EVMS_DEVICE_UNAVAILABLE)) {
16485+ LOG_DETAILS("deleting '%s'.\n", disk->name);
16486+
16487+ evms_cs_unregister_device(disk);
16488+ MOD_DEC_USE_COUNT;
16489+		ldev_prv = disk->private;
16490+		if (ldev_prv) {
16491+			ldev_mgr_unlock_device(ldev_prv);
16492+			kfree(ldev_prv);
16493+		}
16494+ evms_cs_deallocate_logical_node(disk);
16495+ }
16496+ return 0;
16497+}
16498+
16499+/********************************************************/
16500+/*  Support routine for the read/write paths:           */
16501+/*          IO error reporting                          */
16502+/********************************************************/
16503+
16504+/*
16505+ * function: ldev_mgr_io_error
16506+ *
16507+ * this function was primarily created because the function
16508+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
16509+ * to be set on inline functions. Since this was an error path
16510+ * and not mainline, I decided to add a trace statement to help
16511+ * report on the failing condition.
16512+ *
16513+ */
16514+static void
16515+ldev_mgr_io_error(struct evms_logical_node *disk, int io_flag, struct buffer_head *bh, int rc)
16516+{
16517+ if (rc == -EOVERFLOW) {
16518+ LOG_SERIOUS
16519+ ("attempt to %s beyond boundary("PFU64") on (%s), rsector(%ld).\n",
16520+ (io_flag) ? "WRITE" : "READ", disk->total_vsectors - 1,
16521+ disk->name, bh->b_rsector);
16522+ } else if (rc == -ENXIO) {
16523+ LOG_SERIOUS("attempt to access a non-existent device(%s).\n",
16524+ disk->name);
16525+ }
16526+ bh->b_end_io(bh, 0);
16527+}
16528+
16529+/********************************************************/
16530+/* Required Plugin Function Table Entry Point: */
16531+/* Read function */
16532+/********************************************************/
16533+
16534+static void
16535+ldev_mgr_read(struct evms_logical_node *disk, struct buffer_head *bh)
16536+{
16537+ int rc = 0;
16538+ request_queue_t *q;
16539+ struct ldev_private *ldev_prv;
16540+
16541+ ldev_prv = disk->private;
16542+ if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <=
16543+ disk->total_vsectors) {
16544+ bh->b_rdev = MKDEV(ldev_prv->major, ldev_prv->minor);
16545+ q = blk_get_queue(bh->b_rdev);
16546+ if (q) {
16547+ disk->flags &= ~EVMS_DEVICE_UNAVAILABLE;
16548+ q->make_request_fn(q, READ, bh);
16549+ return;
16550+ } else {
16551+ rc = -ENXIO;
16552+ disk->flags |= EVMS_DEVICE_UNAVAILABLE;
16553+ }
16554+ } else {
16555+ rc = -EOVERFLOW;
16556+ }
16557+ if (rc) {
16558+ ldev_mgr_io_error(disk, READ, bh, rc);
16559+ }
16560+}
16561+
16562+/********************************************************/
16563+/* Required Plugin Function Table Entry Point: */
16564+/* Write function */
16565+/********************************************************/
16566+
16567+static void
16568+ldev_mgr_write(struct evms_logical_node *disk, struct buffer_head *bh)
16569+{
16570+ int rc = 0;
16571+ request_queue_t *q;
16572+ struct ldev_private *ldev_prv;
16573+
16574+ ldev_prv = disk->private;
16575+ if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <=
16576+ disk->total_vsectors) {
16577+ bh->b_rdev = MKDEV(ldev_prv->major, ldev_prv->minor);
16578+ q = blk_get_queue(bh->b_rdev);
16579+ if (q) {
16580+ disk->flags &= ~EVMS_DEVICE_UNAVAILABLE;
16581+ q->make_request_fn(q, WRITE, bh);
16582+ return;
16583+ } else {
16584+ rc = -ENXIO;
16585+ disk->flags |= EVMS_DEVICE_UNAVAILABLE;
16586+ }
16587+ } else {
16588+ rc = -EOVERFLOW;
16589+ }
16590+ if (rc) {
16591+ ldev_mgr_io_error(disk, WRITE, bh, rc);
16592+ }
16593+}
16594+
16595+/********************************************************/
16596+/* Required Plugin Function Table Entry Point: */
16597+/* Init_io function & Support routines */
16598+/********************************************************/
16599+
16600+/*
16601+ * function: allocate_bh
16602+ *
16603+ * This function obtains a buffer head from the private
16604+ * buffer head pool (pre-allocated at EVMS initial
16605+ * discovery time).
16606+ *
16607+ * NOTE: All accesses to the buffer head pool are protected
16608+ * by a private spinlock.
16609+ *
16610+ */
16611+static inline struct buffer_head *
16612+allocate_bh(void)
16613+{
16614+ struct buffer_head *bh =
16615+ evms_cs_allocate_from_pool(evms_bh_pool, FALSE);
16616+ if (bh) {
16617+ init_waitqueue_head(&bh->b_wait);
16618+ }
16619+ return (bh);
16620+}
16621+
16622+/*
16623+ * function: deallocate_bh
16624+ *
16625+ * This function returns a buffer head to the private
16626+ * buffer head pool (pre-allocated at EVMS initial
16627+ * discovery time).
16628+ *
16629+ * NOTE: All accesses to the buffer head pool are protected
16630+ * by a private spinlock.
16631+ *
16632+ */
16633+static inline void
16634+deallocate_bh(struct buffer_head *bh)
16635+{
16636+ evms_cs_deallocate_to_pool(evms_bh_pool, bh);
16637+}
16638+
16639+/* this is the buffer head control block structure definition */
16640+typedef struct bh_cb_s {
16641+ int rc;
16642+ atomic_t blks_allocated;
16643+ wait_queue_head_t cb_wait;
16644+} bh_cb_t;
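+/* Typical bh_cb_t lifecycle (a summary of how ldev_init_io and
+ * end_bh_cb_io_sync below actually use it, not additional API):
+ *
+ *	bh_cb_t bh_cb;
+ *	memset(&bh_cb, 0, sizeof(bh_cb_t));
+ *	init_waitqueue_head(&bh_cb.cb_wait);
+ *	// each submitted bh sets b_private = &bh_cb and increments
+ *	// blks_allocated; the completion handler decrements the count
+ *	// and wakes cb_wait when it reaches zero.
+ */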
16645+
16646+/*
16647+ * function: __wait_on_bh_cb
16648+ *
16649+ * This is the worker function for wait_on_bh_cb.
16650+ * It waits for a set of private buffer heads
16651+ * associated with the specified buffer head control block
16652+ * to return from I/O completion. On completion of the
16653+ * last buffer head, the calling function is awakened
16654+ * and continues running.
16655+ *
16658+ */
16659+static void
16660+__wait_on_bh_cb(bh_cb_t * bh_cb)
16661+{
16662+ struct task_struct *tsk = current;
16663+ DECLARE_WAITQUEUE(wait, tsk);
16664+
16665+ add_wait_queue(&bh_cb->cb_wait, &wait);
16666+ do {
16667+ run_task_queue(&tq_disk);
16668+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
16669+ if (!atomic_read(&bh_cb->blks_allocated))
16670+ break;
16671+ schedule();
16672+ } while (atomic_read(&bh_cb->blks_allocated));
16673+#ifdef O1_SCHEDULER
16674+ set_task_state(tsk, TASK_RUNNING);
16675+#else
16676+ tsk->state = TASK_RUNNING;
16677+#endif
16678+ remove_wait_queue(&bh_cb->cb_wait, &wait);
16679+}
16680+
16681+/*
16682+ * function: wait_on_bh_cb
16683+ *
16684+ * This function waits for a set of private buffer heads
16685+ * associated with the specified buffer head control block
16686+ * to return from I/O completion. On completion of the
16687+ * last buffer head, the calling function is awakened
16688+ * and continues running.
16689+ *
16690+ */
16691+static void
16692+wait_on_bh_cb(bh_cb_t * bh_cb)
16693+{
16694+ if (atomic_read(&bh_cb->blks_allocated))
16695+ __wait_on_bh_cb(bh_cb);
16696+ else
16697+ /* if we ended up with no buffer heads on
16698+		 * this pass, let's wait until a few buffer
16699+ * heads have been freed and try again. This
16700+ * should provide a reasonable delay.
16701+ */
16702+ schedule();
16703+}
16704+
16705+/*
16706+ * function: end_bh_cb_io_sync
16707+ *
16708+ * This is the I/O completion function that is called for
16709+ * each private buffer head obtained from the buffer head
16710+ * pool. Control returns through this routine so we can track
16711+ * all outstanding requests, know when to awaken the caller,
16712+ * and regain control after all I/Os have been performed.
16713+ *
16714+ */
16715+static void
16716+end_bh_cb_io_sync(struct buffer_head *bh, int uptodate)
16717+{
16718+ bh_cb_t *bh_cb = (bh_cb_t *) bh->b_private;
16719+
16720+ /* record that errors occurred */
16721+ if (!uptodate) {
16722+ bh_cb->rc = -EIO;
16723+ }
16724+ mark_buffer_uptodate(bh, uptodate);
16725+ unlock_buffer(bh);
16726+
16727+ deallocate_bh(bh);
16728+ atomic_dec(&bh_cb->blks_allocated);
16729+ if (!atomic_read(&bh_cb->blks_allocated))
16730+ if (waitqueue_active(&bh_cb->cb_wait))
16731+ wake_up(&bh_cb->cb_wait);
16732+}
16733+
16734+/*
16735+ * function: ldev_partial_sector_init_io
16736+ *
16737+ * This function is a support function for ldev_init_io.
16738+ * It handles the case of performing I/O to only part
16739+ * of a non-standard-sized hardsector. This function is not
16740+ * designed to be called directly, but only via ldev_init_io.
16741+ *
16742+ */
16743+static int
16744+ldev_partial_sector_init_io(struct evms_logical_node *node,
16745+ int io_flag,
16746+ bh_cb_t * bh_cb,
16747+ u64 next_lsn,
16748+ u64 sector_lsn,
16749+ u64 io_size,
16750+ void *bufptr, unsigned char **sector_buf)
16751+{
16752+ int rc = 0;
16753+ struct ldev_private *ldev_prv = node->private;
16754+ kdev_t dev = MKDEV(ldev_prv->major, ldev_prv->minor);
16755+ struct buffer_head *bh;
16756+
16757+ if (*sector_buf == NULL) {
16758+ /* allocate buffer for incoming sector */
16759+ *sector_buf = kmalloc(node->hardsector_size, GFP_KERNEL);
16760+ if (!*sector_buf)
16761+ return -ENOMEM;
16762+ }
16763+ /* allocate a buffer head from the pool */
16764+ while ((bh = allocate_bh()) == NULL)
16765+ /* yielding the cpu is playing it
16766+ * safe. it might be wiser to just
16767+ * spin. requires more thought.
16768+ */
16769+ schedule();
16770+
16771+ /* set up the buffer head for this sector */
16772+ bh->b_end_io = end_bh_cb_io_sync;
16773+ bh->b_size = node->hardsector_size;
16774+ bh->b_rdev = dev;
16775+ bh->b_rsector = next_lsn - sector_lsn;
16776+ bh->b_data = *sector_buf;
16777+ bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
16778+ bh->b_state = 0;
16779+ set_bit(BH_Dirty, &bh->b_state);
16780+ set_bit(BH_Lock, &bh->b_state);
16781+ set_bit(BH_Req, &bh->b_state);
16782+ set_bit(BH_Mapped, &bh->b_state);
16783+ bh->b_private = (void *) bh_cb;
16784+ atomic_inc(&bh_cb->blks_allocated);
16785+
16786+ /* drive the buffer head down */
16787+ /* to the device */
16788+ generic_make_request(READ, bh);
16789+
16790+ /* wait for all bh's I/O's to end */
16791+ wait_on_bh_cb(bh_cb);
16792+
16793+ /* copy data to/from user */
16794+ if (io_flag != WRITE)
16795+ /* READ */
16796+ memcpy(bufptr,
16797+ *sector_buf + (sector_lsn << EVMS_VSECTOR_SIZE_SHIFT),
16798+ io_size << EVMS_VSECTOR_SIZE_SHIFT);
16799+ else {
16800+ /* WRITE */
16801+ memcpy(*sector_buf + (sector_lsn << EVMS_VSECTOR_SIZE_SHIFT),
16802+ bufptr, io_size << EVMS_VSECTOR_SIZE_SHIFT);
16803+
16804+ /* allocate a buffer head from the pool */
16805+ while ((bh = allocate_bh()) == NULL)
16806+ /* yielding the cpu is playing it
16807+ * safe. it might be wiser to just
16808+ * spin. requires more thought.
16809+ */
16810+ schedule();
16811+
16812+ /* set up the buffer head for this sector */
16813+ bh->b_end_io = end_bh_cb_io_sync;
16814+ bh->b_size = node->hardsector_size;
16815+ bh->b_rdev = dev;
16816+ bh->b_rsector = next_lsn - sector_lsn;
16817+ bh->b_data = *sector_buf;
16818+ bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
16819+ bh->b_state = 0;
16820+ set_bit(BH_Dirty, &bh->b_state);
16821+ set_bit(BH_Lock, &bh->b_state);
16822+ set_bit(BH_Req, &bh->b_state);
16823+ set_bit(BH_Mapped, &bh->b_state);
16824+ bh->b_private = (void *) bh_cb;
16825+ atomic_inc(&bh_cb->blks_allocated);
16826+
16827+ /* drive the buffer head down */
16828+ /* to the device */
16829+ generic_make_request(WRITE, bh);
16830+
16831+ /* wait for all bh's I/O's to end */
16832+ wait_on_bh_cb(bh_cb);
16833+ }
16834+ return (rc);
16835+}
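+/* Worked example (hypothetical numbers): with a 2048-byte hardsector
+ * (4 LSNs per hardsector), a WRITE of 2 LSNs starting at LSN 5 yields
+ * sector_lsn = 1 and io_size = 2. The routine reads the whole
+ * hardsector covering LSNs 4-7 into *sector_buf, merges the caller's
+ * 2 LSNs at byte offset 512, and writes the full hardsector back.
+ */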
16836+
16837+/*
16838+ * function: ldev_init_io
16839+ *
16840+ * This function provides support for synchronous I/O
16841+ * operations to the underlying devices. These I/O
16842+ * operations are NOT buffered in any way including the
16843+ * operating system's buffer cache.
16844+ *
16845+ * This function can work with any hardsector size that
16846+ * is a power of 2.
16847+ *
16848+ * node : logical node of the target logical disk
16849+ * io_flag       : 0 = read, 1 = write, 2 = read-ahead
16850+ * starting_lsn : the 0-based (disk relative) logical
16851+ * : (512 byte) sector number (lsn)
16852+ * num_lsns : the total number of lsns in this I/O
16853+ * bufptr : address of the memory to read/write the data
16854+ *
16855+ */
16856+static int
16857+ldev_init_io(struct evms_logical_node *node,
16858+ int io_flag,
16859+ u64 starting_lsn, u64 num_lsns, void *bufptr)
16860+{
16861+ int rc = 0, lsns_per_hardsector, lsns_per_blocksize;
16862+ unchar *sector_buf = NULL, *cur_bufptr;
16863+ u64 next_lsn, remaining_lsns, sector_lsn;
16864+ struct ldev_private *ldev_prv = node->private;
16865+ kdev_t dev = MKDEV(ldev_prv->major, ldev_prv->minor);
16866+ bh_cb_t bh_cb;
16867+
16868+ LOG_EVERYTHING
16869+ ("%s Entry: Disk(%u,%u), ioflag(%u), start_lsn("PFU64"), num_lsns("PFU64"), bufptr(0x%p)\n",
16870+ __FUNCTION__, ldev_prv->major, ldev_prv->minor, io_flag,
16871+ starting_lsn, num_lsns, bufptr);
16872+
16873+ /* check for valid device */
16874+	if (!blk_size[ldev_prv->major] ||
+	    !blk_size[ldev_prv->major][ldev_prv->minor]) {
16875+ node->flags |= EVMS_DEVICE_UNAVAILABLE;
16876+ return (-ENXIO);
16877+ }
16878+ /* check for 0 length request */
16879+ if (num_lsns == 0) {
16880+ LOG_ERROR("%s: error requesting 0 sectors.\n", __FUNCTION__);
16881+ return (-EINVAL);
16882+ }
16883+ /* check for out of bound request */
16884+ if ((starting_lsn + num_lsns) > node->total_vsectors) {
16885+ LOG_ERROR
16886+ ("%s: attempted %s beyond logical disk boundary("PFU64" LSNs), requesting LSN("PFU64"), total LSNs("PFU64").\n",
16887+ __FUNCTION__, (io_flag == WRITE) ? "WRITE" : "READ",
16888+ node->total_vsectors, starting_lsn, num_lsns);
16889+ return (-EINVAL);
16890+ }
16891+ /* check for invalid io_flag value */
16892+ switch (io_flag) {
16893+ case READ: /* read... */
16894+ case WRITE: /* write... */
16895+ case READA: /* reada... */
16896+ break;
16897+ default:
16898+ return (-EINVAL);
16899+ }
16900+
16901+ /* compute some per device info once up-front */
16902+ lsns_per_hardsector = node->hardsector_size / EVMS_VSECTOR_SIZE;
16903+ lsns_per_blocksize = node->block_size / EVMS_VSECTOR_SIZE;
16904+
16905+ /* initialize the buffer head control block */
16906+ memset(&bh_cb, 0, sizeof (bh_cb_t));
16907+ init_waitqueue_head(&bh_cb.cb_wait);
16908+ bh_cb.blks_allocated = (atomic_t)ATOMIC_INIT(0);
16909+
16910+ /* only update the local copy of variables */
16911+ cur_bufptr = bufptr;
16912+ next_lsn = starting_lsn;
16913+ remaining_lsns = num_lsns;
16914+
16915+ /* check for a mid-sector starting offset
16916+ *
16917+ * if found, perform I/O on part of that
16918+ * sector
16919+ */
16920+ sector_lsn = next_lsn & (lsns_per_hardsector - 1);
16921+ if (sector_lsn) {
16922+ u64 io_size;
16923+
16924+ /* determine bytes in IO to this sector */
16925+ io_size = lsns_per_hardsector - sector_lsn;
16926+ if (io_size > remaining_lsns)
16927+ io_size = remaining_lsns;
16928+
16929+ /* perform the partial sector io */
16930+ rc = ldev_partial_sector_init_io(node, io_flag, &bh_cb,
16931+ next_lsn,
16932+ sector_lsn, io_size,
16933+ cur_bufptr, &sector_buf);
16934+
16935+ if (!rc) {
16936+ /* update progress in local variables */
16937+ cur_bufptr += io_size << EVMS_VSECTOR_SIZE_SHIFT;
16938+ next_lsn += io_size;
16939+ remaining_lsns -= io_size;
16940+ }
16941+ }
16942+
16943+ /* continue if no errors found */
16944+ if (!rc) {
16945+ /* perform I/O on all the complete sectors
16946+ * in this request.
16947+ *
16948+ * loop until there are no more complete sectors
16949+ * to process.
16950+ */
16951+ while (remaining_lsns >= lsns_per_hardsector) {
16952+ /* this inner loop attempts to drive as many
16953+ * bytes (in sector size multiples) down to
16954+ * the device as possible using the available
16955+ * buffer heads in the pool.
16956+ */
16957+ while (remaining_lsns >= lsns_per_hardsector) {
16958+ struct buffer_head *bh;
16959+
16960+ /* allocate a buffer head from the pool */
16961+ bh = allocate_bh();
16962+ if (bh == NULL)
16963+ break;
16964+
16965+ /* set up the buffer head for this I/O */
16966+ bh->b_end_io = end_bh_cb_io_sync;
16967+ bh->b_size =
16968+ (remaining_lsns >= lsns_per_blocksize) ?
16969+ node->block_size : node->hardsector_size;
16970+ bh->b_data = cur_bufptr;
16971+ bh->b_rdev = dev;
16972+ bh->b_rsector = next_lsn;
16973+ bh->b_page = virt_to_page(cur_bufptr); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
16974+ bh->b_state = 0;
16975+ set_bit(BH_Dirty, &bh->b_state);
16976+ set_bit(BH_Lock, &bh->b_state);
16977+ set_bit(BH_Req, &bh->b_state);
16978+ set_bit(BH_Mapped, &bh->b_state);
16979+ bh->b_private = (void *) &bh_cb;
16980+ atomic_inc(&bh_cb.blks_allocated);
16981+
16982+ /* drive the buffer head down */
16983+ /* to the device */
16984+ generic_make_request(io_flag, bh);
16985+
16986+ /* update progress in local variables */
16987+ cur_bufptr += bh->b_size;
16988+ next_lsn +=
16989+ bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
16990+ remaining_lsns -=
16991+ bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
16992+ }
16993+ /* wait for all bh's I/O's to end */
16994+ wait_on_bh_cb(&bh_cb);
16995+ }
16996+ }
16997+
16998+ /* continue if no errors found */
16999+ if (!rc)
17000+ /* check for a mid-sector ending offset
17001+ *
17002+ * if found, perform I/O on part of that
17003+ * sector
17004+ */
17005+ if (remaining_lsns)
17006+ /* perform the partial sector io */
17007+ rc = ldev_partial_sector_init_io(node, io_flag, &bh_cb,
17008+ next_lsn,
17009+ 0, remaining_lsns,
17010+ cur_bufptr,
17011+ &sector_buf);
17012+
17013+ /* free the sector buffer if it was allocated */
17014+ if (sector_buf)
17015+ kfree(sector_buf);
17016+
17017+ /* coalesce return codes */
17018+ rc |= bh_cb.rc;
17019+
17020+ LOG_EVERYTHING("%s Exit: rc(%u)\n", __FUNCTION__, rc);
17021+
17022+ return (rc);
17023+}
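+/* Illustrative usage sketch (not part of this driver): a plugin doing
+ * a synchronous, uncached read of the first metadata sector from a
+ * disk node might look like the following; "node" and "buf" are
+ * hypothetical.
+ *
+ *	u8 buf[EVMS_VSECTOR_SIZE];
+ *	int rc = ldev_init_io(node, READ, 0, 1, buf);
+ *	if (rc)
+ *		LOG_ERROR("metadata read failed, rc(%d).\n", rc);
+ */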
17024+
17025+static int
17026+ldev_mgr_direct_ioctl(struct inode *inode,
17027+ struct file *file, unsigned int cmd, unsigned long arg)
17028+{
17029+ int rc = 0;
17030+ struct ldev_private *ldev_prv;
17031+ struct evms_plugin_ioctl_pkt tmp, *user_parms;
17032+ struct ldev_plugin_ioctl pi_data;
17033+ struct evms_logical_node *disk;
17034+
17035+ MOD_INC_USE_COUNT;
17036+
17037+ user_parms = (struct evms_plugin_ioctl_pkt *) arg;
17038+ /* copy user's parameters to kernel space */
17039+ if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
17040+ rc = -EFAULT;
17041+
17042+ if (!rc) {
17043+		/* validate it's meant for us */
17044+ if (tmp.feature_id != plugin_header.id) {
17045+ rc = -EINVAL;
17046+ }
17047+ }
17048+
17049+ if (!rc) {
17050+ /* copy feature ioctl data to kernel space */
17051+ if (copy_from_user(&pi_data, tmp.feature_ioctl_data,
17052+ sizeof (pi_data))) {
17053+ rc = -EFAULT;
17054+ }
17055+ }
17056+
17057+ if (!rc) {
17058+ /* find the disk node specified by the disk_handle */
17059+ int done = FALSE;
17060+ disk = NULL;
17061+ while (!done) {
17062+ rc = evms_cs_find_next_device(disk,
17063+ &disk);
17064+ if (rc) {
17065+ break;
17066+ }
17067+ if (!disk) {
17068+ rc = -ENODATA;
17069+ break;
17070+ }
17071+ if (disk ==
17072+ DEV_HANDLE_TO_NODE(pi_data.disk_handle)) {
17073+ done = TRUE;
17074+ }
17075+ }
17076+ }
17077+
17078+ if (!rc) {
17079+ /* perform feature command */
17080+ ldev_prv = (struct ldev_private *) disk->private;
17081+ switch (tmp.feature_command) {
17082+ kdev_t save_dev;
17083+ case LDEV_MGR_BROADCAST_IOCTL_CMD:
17084+ save_dev = inode->i_rdev;
17085+ inode->i_rdev =
17086+ MKDEV(ldev_prv->major, ldev_prv->minor);
17087+ rc = ldev_prv->bdev->bd_op->ioctl(inode, file,
17088+ pi_data.cmd,
17089+ pi_data.arg);
17090+ inode->i_rdev = save_dev;
17091+ break;
17092+ default:
17093+ rc = -EINVAL;
17094+ break;
17095+ }
17096+ }
17097+
17098+ /* return status value */
17099+ tmp.status = rc;
17100+	if (copy_to_user((struct evms_plugin_ioctl_pkt *) arg, &tmp, sizeof (tmp)))
+		rc = -EFAULT;
17101+ MOD_DEC_USE_COUNT;
17102+ return rc;
17103+}
17104+
17105+/********************************************************/
17106+/* Required Plugin Function Table Entry Point: */
17107+/* IOCTL function & Support routines */
17108+/********************************************************/
17109+
17110+static int
17111+ldev_mgr_ioctl(struct evms_logical_node *disk,
17112+ struct inode *inode,
17113+ struct file *file, unsigned int cmd, unsigned long arg)
17114+{
17115+ int rc = 0;
17116+	struct ldev_private *ldev_prv;
17117+	kdev_t save_dev;
17118+	struct block_device *save_bdev;
17119+
17120+	if (!inode || !disk)
17121+		return -EINVAL;
+	ldev_prv = disk->private;
17122+
17123+ save_dev = inode->i_rdev;
17124+ inode->i_rdev = MKDEV(ldev_prv->major, ldev_prv->minor);
17125+ save_bdev = inode->i_bdev;
17126+ inode->i_bdev = ldev_prv->bdev;
17127+ /* check device availability */
17128+ if (!blk_get_queue(MKDEV(ldev_prv->major, ldev_prv->minor))) {
17129+ disk->flags |= EVMS_DEVICE_UNAVAILABLE;
17130+ }
17131+ switch (cmd) {
17132+ case EVMS_QUIESCE_VOLUME:
17133+ case EVMS_PLUGIN_IOCTL:
17134+ break;
17135+ case EVMS_GET_BMAP:
17136+ {
17137+ struct evms_get_bmap_pkt *bmap =
17138+ (struct evms_get_bmap_pkt *) arg;
17139+ bmap->dev = MKDEV(ldev_prv->major, ldev_prv->minor);
17140+ bmap->status = 0;
17141+ }
17142+ break;
17143+ case EVMS_OPEN_VOLUME:
17144+ if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17145+ rc = -ENXIO;
17146+ } else {
17147+ rc = ldev_prv->bdev->bd_op->open(inode, file);
17148+ }
17149+ break;
17150+ case EVMS_CLOSE_VOLUME:
17151+ if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17152+ rc = -ENXIO;
17153+ } else {
17154+ rc = ldev_prv->bdev->bd_op->release(inode, file);
17155+ }
17156+ break;
17157+ case EVMS_CHECK_MEDIA_CHANGE:
17158+ if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17159+ rc = -ENXIO;
17160+ } else {
17161+			/* once we detect that the media-changed
17162+			 * flag is 'set', don't send any more ioctls
17163+			 * down to the device until the
17164+			 * media change has been 'reset' by a
17165+			 * revalidate-disk ioctl. While it is still
17166+			 * 'set', just return 1 without actually
17167+			 * performing another ioctl call to the
17168+			 * device.
17169+ */
17170+ if (ldev_prv->media_changed == TRUE) {
17171+ rc = 1;
17172+ break;
17173+ }
17174+ rc = ldev_prv->bdev->bd_op->
17175+ check_media_change(MKDEV
17176+ (ldev_prv->major,
17177+ ldev_prv->minor));
17178+ if (rc == 1) {
17179+ ldev_prv->media_changed = TRUE;
17180+ disk->flags |= EVMS_MEDIA_CHANGED;
17181+ }
17182+ }
17183+ break;
17184+ case EVMS_REVALIDATE_DISK:
17185+ if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17186+ rc = -ENXIO;
17187+ } else {
17188+			/* don't actually send this ioctl down
17189+			 * to the device until we know that a
17190+			 * previous check-media-change ioctl
17191+			 * has occurred.
17192+ *
17193+ * when we do actually send the ioctl
17194+ * down, reset the local media_changed
17195+ * flag.
17196+ */
17197+ if (ldev_prv->media_changed == FALSE)
17198+ break;
17199+ rc = ldev_prv->bdev->bd_op->
17200+ revalidate(MKDEV
17201+ (ldev_prv->major, ldev_prv->minor));
17202+ ldev_prv->media_changed = FALSE;
17203+ }
17204+ break;
17205+ case EVMS_GET_DISK_LIST:
17206+ rc = evms_cs_add_item_to_list((struct evms_list_node **) arg,
17207+ disk);
17208+ if (rc > 0)
17209+ rc = 0;
17210+ break;
17211+ case EVMS_CHECK_DEVICE_STATUS:
17212+ if (arg) {
17213+ int *status = (int *) arg;
17214+ *status |= disk->flags;
17215+ }
17216+ break;
17217+ case EVMS_UPDATE_DEVICE_INFO:
17218+ /* determine hardsector size */
17219+ disk->hardsector_size = 512;
17220+ if (hardsect_size[ldev_prv->major]) {
17221+ disk->hardsector_size = hardsect_size[ldev_prv->major][ldev_prv->minor];
17222+ }
17223+ /* save the block size */
17224+ disk->block_size = 1024;
17225+ if (blksize_size[ldev_prv->major]) {
17226+ disk->block_size = blksize_size[ldev_prv->major][ldev_prv->minor];
17227+ }
17228+ /* device size in sectors
17229+ *
17230+ * try 64bit size first, if that fails
17231+ * fall back on the 32bit size.
17232+ */
17233+ /* try 64bit size */
17234+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,18)
17235+ rc = evms_cs_kernel_ioctl(disk, BLKGETSIZE64,
17236+ (ulong) & disk->total_vsectors);
17237+ if (!rc) {
17238+ /* convert bytes to 512 byte sectors */
17239+ disk->total_vsectors >>= EVMS_VSECTOR_SIZE_SHIFT;
17240+ } else
17241+#endif
17242+ {
17243+ /* try 32bit size */
17244+ ulong dev_size = 0;
17245+ rc = evms_cs_kernel_ioctl(disk, BLKGETSIZE,
17246+ (ulong) & dev_size);
17247+ disk->total_vsectors = dev_size;
17248+ }
17249+ break;
17250+ default:
17251+ if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17252+ rc = -ENXIO;
17253+ } else {
17254+ rc = ldev_prv->bdev->bd_op->ioctl(inode, file, cmd,
17255+ arg);
17256+ }
17257+ break;
17258+ }
17259+ inode->i_bdev = save_bdev;
17260+ inode->i_rdev = save_dev;
17261+
17262+ return (rc);
17263+}
17264+
17265+/********************************************************/
17266+/* Required Module Entry Point: */
17267+/* ldev_mgr_init */
17268+/********************************************************/
17269+
17270+static int __init
17271+ldev_mgr_init(void)
17272+{
17273+ return evms_cs_register_plugin(&plugin_header);
17274+}
17275+
17276+static void __exit
17277+ldev_mgr_exit(void)
17278+{
17279+ evms_cs_unregister_plugin(&plugin_header);
17280+}
17281+
17282+module_init(ldev_mgr_init);
17283+module_exit(ldev_mgr_exit);
17284+#ifdef MODULE_LICENSE
17285+MODULE_LICENSE("GPL");
17286+#endif
17287diff -Naur linux-2002-09-30/drivers/evms/lvm_vge.c evms-2002-09-30/drivers/evms/lvm_vge.c
17288--- linux-2002-09-30/drivers/evms/lvm_vge.c Wed Dec 31 18:00:00 1969
17289+++ evms-2002-09-30/drivers/evms/lvm_vge.c Fri Sep 13 16:45:06 2002
17290@@ -0,0 +1,3734 @@
17291+/* -*- linux-c -*- */
17292+/*
17293+ * Copyright (c) International Business Machines Corp., 2000
17294+ *
17295+ * This program is free software; you can redistribute it and/or modify
17296+ * it under the terms of the GNU General Public License as published by
17297+ * the Free Software Foundation; either version 2 of the License, or
17298+ * (at your option) any later version.
17299+ *
17300+ * This program is distributed in the hope that it will be useful,
17301+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17302+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
17303+ * the GNU General Public License for more details.
17304+ *
17305+ * You should have received a copy of the GNU General Public License
17306+ * along with this program; if not, write to the Free Software
17307+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17308+ */
17309+/*
17310+ * linux/drivers/evms/lvm_vge.c
17311+ *
17312+ * EVMS Linux LVM Region Manager
17313+ */
17314+
17315+#define LOG_PREFIX "lvm: "
17316+
17317+#include <linux/kernel.h>
17318+#include <linux/module.h>
17319+#include <linux/vmalloc.h>
17320+#include <linux/version.h>
17321+#include <asm/uaccess.h>
17322+
17323+#include <linux/evms/evms.h>
17324+#include <linux/evms/evms_lvm.h>
17325+
17326+/* Plugin API prototypes. */
17327+static int lvm_discover(struct evms_logical_node ** evms_node_list);
17328+static int lvm_discover_end(struct evms_logical_node ** evms_node_list);
17329+static int lvm_delete_node(struct evms_logical_node * logical_node);
17330+static void lvm_read(struct evms_logical_node * node, struct buffer_head * bh);
17331+static void lvm_write(struct evms_logical_node * node, struct buffer_head * bh);
17332+static int lvm_init_io(struct evms_logical_node * node,
17333+ int io_flag,
17334+ u64 sect_nr,
17335+ u64 num_sects,
17336+ void * buf_addr);
17337+static int lvm_ioctl(struct evms_logical_node * logical_node,
17338+ struct inode * inode,
17339+ struct file * file,
17340+ unsigned int cmd,
17341+ unsigned long arg);
17342+static int lvm_direct_ioctl(struct inode * inode,
17343+ struct file * file,
17344+ unsigned int cmd,
17345+ unsigned long args);
17346+
17347+static struct snapshot_map_entry * allocate_snapshot_map_entry(u64 org_sector,
17348+ u64 snap_sector);
17349+
17350+/* LVM Plugin function table and header. */
17351+static struct evms_plugin_fops lvm_fops = {
17352+ .discover = lvm_discover,
17353+ .end_discover = lvm_discover_end,
17354+ .delete = lvm_delete_node,
17355+ .read = lvm_read,
17356+ .write = lvm_write,
17357+ .init_io = lvm_init_io,
17358+ .ioctl = lvm_ioctl,
17359+ .direct_ioctl = lvm_direct_ioctl
17360+};
17361+
17362+static struct evms_plugin_header lvm_plugin_header = {
17363+ .id = SetPluginID(IBM_OEM_ID,
17364+ EVMS_REGION_MANAGER,
17365+ 0x01),
17366+ .version = {
17367+ .major = EVMS_LVM_VERSION_MAJOR,
17368+ .minor = EVMS_LVM_VERSION_MINOR,
17369+ .patchlevel = EVMS_LVM_VERSION_PATCH
17370+ },
17371+ .required_services_version = {
17372+ .major = 0,
17373+ .minor = 5,
17374+ .patchlevel = 0
17375+ },
17376+ .fops = &lvm_fops
17377+};
17378+
17379+static struct lvm_volume_group * lvm_group_list = NULL;
17380+static struct proc_dir_entry * lvm_proc = NULL;
17381+
17382+
17383+/********** Miscellaneous Functions **********/
17384+
17385+
17386+/**
17387+ * remap sector
17388+ * @node:
17389+ * @org_sector: Logical sector to remap.
17390+ * @size: Size (in sectors) of request to remap.
17391+ * @new_sector: Remapped sector.
17392+ * @new_size: New size (in sectors).
17393+ * @pe_start_sector: Starting sector of PE - needed for snapshotting.
17394+ * @pv_entry: New node for which new_sector is relative.
17395+ *
17396+ * Common function to remap LV lba to PV lba in appropriate PE. This
17397+ * function needs to deal with requests that span PEs and/or stripes. If
17398+ * this occurs, the request will simply be chopped off at the boundary of
17399+ * the first PE/stripe. It is up to the calling function to loop
17400+ * accordingly to finish the full remapping. This function is only partially
17401+ * 64-bit enabled: the striping section cannot currently eliminate mod
17402+ * operations on 64-bit values, so it truncates the sector to 32 bits.
17403+ **/
17404+static int remap_sector(struct evms_logical_node * node,
17405+ u64 org_sector,
17406+ u64 size,
17407+ u64 * new_sector,
17408+ u64 * new_size,
17409+ u64 * pe_start_sector,
17410+ struct lvm_physical_volume ** pv_entry)
17411+{
17412+ struct lvm_logical_volume * volume = node->private;
17413+ struct le_table_entry * le_entry;
17414+ u32 le, offset_in_le;
17415+
17416+ *new_size = size;
17417+
17418+ if ( volume->stripes > 1 ) {
17419+ /* Volume is striped. Reset the size if the request crosses
17420+ * a stripe boundary. Striping in LVM is not 64-bit enabled.
17421+ */
17422+ u32 column, columns, sectors_per_column;
17423+ u32 sector_in_column, stripe_in_column, le_in_column;
17424+ u32 offset_in_stripe, stripe_in_le;
17425+ u32 org_sector32 = org_sector;
17426+
17427+ sectors_per_column = volume->stripes * volume->pe_size;
17428+ column = org_sector32 / sectors_per_column;
17429+ sector_in_column = org_sector32 % sectors_per_column;
17430+ stripe_in_column = sector_in_column / volume->stripe_size;
17431+ le_in_column = stripe_in_column % volume->stripes;
17432+ columns = volume->num_le / volume->stripes;
17433+ le = column + (columns * le_in_column);
17434+
17435+ offset_in_stripe = org_sector32 % volume->stripe_size;
17436+ stripe_in_le = stripe_in_column / volume->stripes;
17437+ offset_in_le = offset_in_stripe +
17438+ stripe_in_le * volume->stripe_size;
17439+
17440+ if ( offset_in_stripe + size > volume->stripe_size ) {
17441+ *new_size = volume->stripe_size - offset_in_stripe;
17442+ }
17443+ } else {
17444+ /* Linear volume. Just find LE and offset. Reset the size if
17445+ * the request crosses an LE boundary. This path is 64-bit safe.
17446+ */
17447+ le = org_sector >> volume->pe_size_shift;
17448+ offset_in_le = org_sector & (volume->pe_size - 1);
17449+
17450+ if ( offset_in_le + size > volume->pe_size ) {
17451+ *new_size = volume->pe_size - offset_in_le;
17452+ }
17453+ }
17454+
17455+ le_entry = &volume->le_map[le];
17456+ *pe_start_sector = le_entry->pe_sector_offset;
17457+ *new_sector = le_entry->pe_sector_offset + offset_in_le;
17458+ *pv_entry = le_entry->owning_pv;
17459+
17460+ return 0;
17461+}
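+/* Worked example of the striped path (hypothetical geometry): with
+ * stripes = 2, stripe_size = 64, pe_size = 1024 and num_le = 4,
+ * org_sector 100 gives sectors_per_column = 2048, column = 0,
+ * stripe_in_column = 1, le_in_column = 1, columns = 2, so le = 2 and
+ * offset_in_le = 36. A request at sector 100 longer than 28 sectors is
+ * clamped to *new_size = 28, the distance to the next stripe boundary.
+ */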
17462+
17463+/**
17464+ * add_group_to_list
17465+ *
17466+ * Add a volume group to the end of the LVM global group list.
17467+ **/
17468+static int add_group_to_list(struct lvm_volume_group * group)
17469+{
17470+ struct lvm_volume_group ** p_group;
17471+
17472+ for ( p_group = &lvm_group_list;
17473+ *p_group; p_group = &(*p_group)->next_group ) {
17474+ ;
17475+ }
17476+
17477+ *p_group = group;
17478+ group->next_group = NULL;
17479+ return 0;
17480+}
17481+
17482+/**
17483+ * remove_group_from_list
17484+ *
17485+ * Remove an LVM volume group from the global LVM list.
17486+ **/
17487+static int remove_group_from_list(struct lvm_volume_group * group)
17488+{
17489+ struct lvm_volume_group ** p_group;
17490+
17491+ for ( p_group = &lvm_group_list;
17492+ *p_group; p_group = &(*p_group)->next_group ) {
17493+ if ( *p_group == group ) {
17494+ *p_group = (*p_group)->next_group;
17495+ group->next_group = NULL;
17496+ break;
17497+ }
17498+ }
17499+
17500+ return 0;
17501+}
17502+
17503+/**
17504+ * find_group_by_uuid
17505+ *
17506+ * Use the vg_uuid to find the desired volume group.
17507+ **/
17508+static int find_group_by_uuid(u8 * vg_uuid,
17509+ struct lvm_volume_group ** group)
17510+{
17511+ struct lvm_volume_group * gp;
17512+
17513+ for ( gp = lvm_group_list; gp; gp = gp->next_group ) {
17514+ if ( ! memcmp(vg_uuid, gp->vg_uuid, UUID_LEN) ) {
17515+ *group = gp;
17516+ return 0;
17517+ }
17518+ }
17519+ *group = NULL;
17520+ return -EINVAL;
17521+}
17522+
17523+/**
17524+ * find_pv_by_number
17525+ *
17526+ * Search the PV list of the specified volume group, looking for the
17527+ * specified PV number. If found, return a pointer to that PV.
17528+ **/
17529+static struct lvm_physical_volume *
17530+find_pv_by_number(u32 pv_number,
17531+ struct lvm_volume_group * group)
17532+{
17533+ struct lvm_physical_volume * pv_entry;
17534+
17535+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
17536+ if ( pv_entry->pv_number == pv_number ) {
17537+ return pv_entry;
17538+ }
17539+ }
17540+ return NULL;
17541+}
17542+
17543+/**
17544+ * translate_lv_name
17545+ * @lvm_lv_name: Input LVM-style name.
17546+ * @evms_node_name: Output EVMS-style name.
17547+ *
17548+ * In LVM, volumes have names based on their dev-node, which follow the
17549+ * pattern /dev/group_name/volume_name. In EVMS, the same volume needs
17550+ * to appear as /dev/evms/lvm/group_name/volume_name. Thus, the name from
17551+ * the lv_disk_t needs to be translated before copying to the associated
17552+ * node. evms_node_name must point to a NAME_LEN sized buffer.
17553+ **/
17554+static int translate_lv_name(char * lvm_lv_name, char * evms_node_name)
17555+{
17556+ char * ptr;
17557+
17558+ memset(evms_node_name, 0, NAME_LEN);
17559+
17560+ /* Make sure the string starts with /dev/, and skip over it. */
17561+ ptr = strstr(lvm_lv_name, DEV_DIRECTORY);
17562+ if ( ptr != lvm_lv_name ) {
17563+ LOG_SERIOUS("Invalid LV name: %s\n", lvm_lv_name);
17564+ return -EINVAL;
17565+ }
17566+ ptr = &ptr[strlen(DEV_DIRECTORY)];
17567+
17568+ /* ptr now points to "group_name/volume_name".
17569+ * Use this to create the name for the EVMS node.
17570+ */
17571+ strcpy(evms_node_name, LVM_DEV_DIRECTORY);
17572+ strncat(evms_node_name, ptr, NAME_LEN - strlen(evms_node_name) - 1);
17573+
17574+ return 0;
17575+}
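+/* Example (assuming DEV_DIRECTORY is "/dev/" and LVM_DEV_DIRECTORY is
+ * "lvm/"): an LVM name of "/dev/vg0/lvol1" becomes the EVMS node name
+ * "lvm/vg0/lvol1", which appears as /dev/evms/lvm/vg0/lvol1.
+ */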
17576+
17577+/**
17578+ * check_pv_for_lv
17579+ *
17580+ * Run through all LE maps of all LVs in this group, and make sure the
17581+ * specified PV is not being pointed to by any LEs.
17582+ **/
17583+static int check_pv_for_lv(struct lvm_physical_volume * pv_entry,
17584+ struct lvm_volume_group * group)
17585+{
17586+ struct lvm_logical_volume * volume;
17587+ int i, j;
17588+
17589+ for ( i = 1; i <= MAX_LV; i++ ) {
17590+ if ( (volume = group->volume_list[i]) ) {
17591+ for ( j = 0; j < volume->num_le; j++ ) {
17592+ if ( volume->le_map[j].owning_pv == pv_entry ) {
17593+ return -EINVAL;
17594+ }
17595+ }
17596+ }
17597+ }
17598+ return 0;
17599+}
17600+
17601+
17602+/********** Metadata I/O Functions **********/
17603+
17604+
17605+/**
17606+ * endian_convert_pv
17607+ *
17608+ * Endian-neutral conversion for PV structures.
17609+ **/
17610+static inline void endian_convert_pv(struct pv_disk * pv)
17611+{
17612+ pv->version = le16_to_cpup(&pv->version);
17613+ pv->pv_on_disk.base = le32_to_cpup(&pv->pv_on_disk.base);
17614+ pv->pv_on_disk.size = le32_to_cpup(&pv->pv_on_disk.size);
17615+ pv->vg_on_disk.base = le32_to_cpup(&pv->vg_on_disk.base);
17616+ pv->vg_on_disk.size = le32_to_cpup(&pv->vg_on_disk.size);
17617+ pv->pv_uuidlist_on_disk.base =
17618+ le32_to_cpup(&pv->pv_uuidlist_on_disk.base);
17619+ pv->pv_uuidlist_on_disk.size =
17620+ le32_to_cpup(&pv->pv_uuidlist_on_disk.size);
17621+ pv->lv_on_disk.base = le32_to_cpup(&pv->lv_on_disk.base);
17622+ pv->lv_on_disk.size = le32_to_cpup(&pv->lv_on_disk.size);
17623+ pv->pe_on_disk.base = le32_to_cpup(&pv->pe_on_disk.base);
17624+ pv->pe_on_disk.size = le32_to_cpup(&pv->pe_on_disk.size);
17625+ pv->pv_major = le32_to_cpup(&pv->pv_major);
17626+ pv->pv_number = le32_to_cpup(&pv->pv_number);
17627+ pv->pv_status = le32_to_cpup(&pv->pv_status);
17628+ pv->pv_allocatable = le32_to_cpup(&pv->pv_allocatable);
17629+ pv->pv_size = le32_to_cpup(&pv->pv_size);
17630+ pv->lv_cur = le32_to_cpup(&pv->lv_cur);
17631+ pv->pe_size = le32_to_cpup(&pv->pe_size);
17632+ pv->pe_total = le32_to_cpup(&pv->pe_total);
17633+ pv->pe_allocated = le32_to_cpup(&pv->pe_allocated);
17634+ pv->pe_start = le32_to_cpup(&pv->pe_start);
17635+}
17636+
17637+/**
17638+ * read_pv
17639+ *
17640+ * Read in the PV structure from the specified node. If it contains a
17641+ * valid PV signature, allocate a new struct pv_disk and copy the data.
17642+ **/
17643+static int read_pv(struct evms_logical_node * node, struct pv_disk ** pv)
17644+{
17645+ struct pv_disk * pv_buffer;
17646+ int rc = -ENOMEM;
17647+
17648+ *pv = NULL;
17649+
17650+ /* Buffer for reading the PV metadata. */
17651+ pv_buffer = kmalloc(LVM_PV_DISK_SIZE, GFP_NOIO);
17652+ if (!pv_buffer) {
17653+ LOG_CRITICAL("Error allocating PV metadata buffer for %s\n",
17654+ node->name);
17655+ goto out;
17656+ }
17657+
17658+ /* Read the first two sectors. */
17659+ rc = INIT_IO(node, 0, evms_cs_size_in_vsectors(LVM_PV_DISK_BASE),
17660+ evms_cs_size_in_vsectors(LVM_PV_DISK_SIZE), pv_buffer);
17661+ if (rc) {
17662+ LOG_SERIOUS("Error reading PV metadata from %s\n", node->name);
17663+ goto out_kfree;
17664+ }
17665+
17666+ /* Endian-neutral conversion of PV metadata. */
17667+ endian_convert_pv(pv_buffer);
17668+
17669+ /* Check for an LVM signature and make sure the sizes match.
17670+ * Versions 1 and 2 are both valid now. Thanks LVM! :)
17671+ */
17672+ if ( !(pv_buffer->id[0] == 'H' &&
17673+ pv_buffer->id[1] == 'M' &&
17674+ (pv_buffer->version == 1 || pv_buffer->version == 2) &&
17675+ pv_buffer->pv_size == node->total_vsectors) ) {
17676+ LOG_EXTRA("%s is not an LVM PV\n", node->name);
17677+ rc = -EINVAL;
17678+ goto out_kfree;
17679+ }
17680+
17681+ /* This is a valid PV. Allocate a new pv_disk. */
17682+ *pv = kmalloc(sizeof(struct pv_disk), GFP_NOIO);
17683+ if (!*pv) {
17684+ LOG_CRITICAL("Error allocating new PV for %s\n", node->name);
17685+ rc = -ENOMEM;
17686+ goto out_kfree;
17687+ }
17688+
17689+ /* Copy the metadata. */
17690+ memcpy(*pv, pv_buffer, sizeof(struct pv_disk));
17691+
17692+out_kfree:
17693+ kfree(pv_buffer);
17694+out:
17695+ return rc;
17696+}
17697+
17698+/**
17699+ * endian_convert_vg
17700+ *
17701+ * Endian-neutral conversion for VG structures
17702+ **/
17703+static inline void endian_convert_vg(struct vg_disk * vg)
17704+{
17705+ vg->vg_number = le32_to_cpup(&vg->vg_number);
17706+ vg->vg_access = le32_to_cpup(&vg->vg_access);
17707+ vg->vg_status = le32_to_cpup(&vg->vg_status);
17708+ vg->lv_max = le32_to_cpup(&vg->lv_max);
17709+ vg->lv_cur = le32_to_cpup(&vg->lv_cur);
17710+ vg->lv_open = le32_to_cpup(&vg->lv_open);
17711+ vg->pv_max = le32_to_cpup(&vg->pv_max);
17712+ vg->pv_cur = le32_to_cpup(&vg->pv_cur);
17713+ vg->pv_act = le32_to_cpup(&vg->pv_act);
17714+ vg->dummy = le32_to_cpup(&vg->dummy);
17715+ vg->vgda = le32_to_cpup(&vg->vgda);
17716+ vg->pe_size = le32_to_cpup(&vg->pe_size);
17717+ vg->pe_total = le32_to_cpup(&vg->pe_total);
17718+ vg->pe_allocated = le32_to_cpup(&vg->pe_allocated);
17719+ vg->pvg_total = le32_to_cpup(&vg->pvg_total);
17720+}
17721+
17722+/**
17723+ * read_vg
17724+ *
17725+ * Read in the VG structure from the specified node. Allocate a new
17726+ * struct vg_disk and copy the data.
17727+ **/
17728+static int read_vg(struct evms_logical_node * node,
17729+ struct pv_disk * pv,
17730+ struct vg_disk ** vg)
17731+{
17732+ struct vg_disk * vg_buffer;
17733+ unsigned long vg_sectors;
17734+ int rc = -ENOMEM;
17735+
17736+ /* Allocate a buffer to read the VG metadata. */
17737+ vg_sectors = evms_cs_size_in_vsectors(pv->vg_on_disk.size);
17738+ vg_buffer = kmalloc(vg_sectors << EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO);
17739+ if (!vg_buffer) {
17740+ LOG_CRITICAL("Error allocating VG metadata buffer for %s\n",
17741+ node->name);
17742+ goto out;
17743+ }
17744+
17745+ /* Read the VG metadata. */
17746+ rc = INIT_IO(node, 0, evms_cs_size_in_vsectors(pv->vg_on_disk.base),
17747+ vg_sectors, vg_buffer);
17748+ if (rc) {
17749+ LOG_SERIOUS("Error reading VG metadata from %s\n", node->name);
17750+ goto out_kfree;
17751+ }
17752+
17753+ /* Endian-neutral conversion of VG metadata. */
17754+ endian_convert_vg(vg_buffer);
17755+
17756+ /* Allocate a new struct vg_disk. */
17757+ *vg = kmalloc(sizeof(struct vg_disk), GFP_NOIO);
17758+ if (!*vg) {
17759+ LOG_CRITICAL("Error allocating new VG for %s\n", node->name);
17760+ rc = -ENOMEM;
17761+ goto out_kfree;
17762+ }
17763+
17764+ /* Copy the metadata. */
17765+ memcpy(*vg, vg_buffer, sizeof(struct vg_disk));
17766+
17767+out_kfree:
17768+ kfree(vg_buffer);
17769+out:
17770+ return rc;
17771+}
17772+
17773+/**
17774+ * read_uuid_list
17775+ **/
17776+static int read_uuid_list(struct evms_logical_node * node,
17777+ struct pv_disk * pv,
17778+ struct lvm_volume_group * group)
17779+{
17780+ u64 start_sector;
17781+ unsigned long total_sectors;
17782+ unsigned char * uuid_buffer;
17783+ unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
17784+ unsigned long uuid_list_size;
17785+ int i, rc = 0;
17786+
17787+ if (group->uuid_list) {
17788+ LOG_EXTRA("Already read PV UUIDs for group %s\n",
17789+ group->vg_name);
17790+ goto out;
17791+ }
17792+
17793+ start_sector = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.base);
17794+ total_sectors = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.size);
17795+ uuid_list_size = round_up(total_sectors * EVMS_VSECTOR_SIZE,
17796+ buffer_size);
17797+
17798+ /* Allocate a buffer to perform the I/Os. */
17799+ uuid_buffer = kmalloc(buffer_size, GFP_NOIO);
17800+ if (!uuid_buffer) {
17801+ LOG_CRITICAL("Error allocating buffer for UUID list in group %s\n",
17802+ group->vg_name);
17803+ rc = -ENOMEM;
17804+ goto out;
17805+ }
17806+
17807+ /* Allocate memory for the UUID array for this group. */
17808+ group->uuid_list = vmalloc(uuid_list_size);
17809+ if (!group->uuid_list) {
17810+ LOG_CRITICAL("Error allocating UUID list for group %s\n",
17811+ group->vg_name);
17812+ rc = -ENOMEM;
17813+ goto out_kfree;
17814+ }
17815+ memset(group->uuid_list, 0, uuid_list_size);
17816+
17817+ for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
17818+ rc = INIT_IO(node, 0, start_sector + i,
17819+ IO_BUFFER_SECTORS, uuid_buffer);
17820+ if (rc) {
17821+ LOG_SERIOUS("Error reading PV UUID list from %s\n",
17822+ node->name);
17823+ goto out_vfree;
17824+ }
17825+ /* Copy the I/O buffer into the UUID array. */
17826+ memcpy(&(group->uuid_list[i * EVMS_VSECTOR_SIZE]),
17827+ uuid_buffer, buffer_size);
17828+ }
17829+
17830+ /* Clear out the unused portion at the end of the uuid_list. */
17831+ memset(&(group->uuid_list[pv->pv_uuidlist_on_disk.size]), 0,
17832+ uuid_list_size - pv->pv_uuidlist_on_disk.size);
17833+
17834+out_kfree:
17835+ kfree(uuid_buffer);
17836+out:
17837+ return rc;
17838+
17839+out_vfree:
17840+ vfree(group->uuid_list);
17841+ group->uuid_list = NULL;
17842+ goto out_kfree;
17843+}
17844+
17845+/**
17846+ * endian_convert_lv
17847+ *
17848+ * Endian-neutral conversion for LV structures
17849+ **/
17850+static inline void endian_convert_lv(struct lv_disk * lv)
17851+{
17852+ lv->lv_access = le32_to_cpup(&lv->lv_access);
17853+ lv->lv_status = le32_to_cpup(&lv->lv_status);
17854+ lv->lv_open = le32_to_cpup(&lv->lv_open);
17855+ lv->lv_dev = le32_to_cpup(&lv->lv_dev);
17856+ lv->lv_number = le32_to_cpup(&lv->lv_number);
17857+ lv->lv_mirror_copies = le32_to_cpup(&lv->lv_mirror_copies);
17858+ lv->lv_recovery = le32_to_cpup(&lv->lv_recovery);
17859+ lv->lv_schedule = le32_to_cpup(&lv->lv_schedule);
17860+ lv->lv_size = le32_to_cpup(&lv->lv_size);
17861+ lv->lv_snapshot_minor = le32_to_cpup(&lv->lv_snapshot_minor);
17862+ lv->lv_chunk_size = le16_to_cpup(&lv->lv_chunk_size);
17863+ lv->dummy = le16_to_cpup(&lv->dummy);
17864+ lv->lv_allocated_le = le32_to_cpup(&lv->lv_allocated_le);
17865+ lv->lv_stripes = le32_to_cpup(&lv->lv_stripes);
17866+ lv->lv_stripesize = le32_to_cpup(&lv->lv_stripesize);
17867+ lv->lv_badblock = le32_to_cpup(&lv->lv_badblock);
17868+ lv->lv_allocation = le32_to_cpup(&lv->lv_allocation);
17869+ lv->lv_io_timeout = le32_to_cpup(&lv->lv_io_timeout);
17870+ lv->lv_read_ahead = le32_to_cpup(&lv->lv_read_ahead);
17871+}
17872+
17873+static inline void endian_convert_lvs(struct lvm_volume_group * group)
17874+{
17875+ int i;
17876+ for ( i = 0; i < group->vg->lv_max; i++ ) {
17877+ endian_convert_lv(&(group->lv_array[i]));
17878+ }
17879+}
17880+
17881+/**
17882+ * read_lv
17883+ *
17884+ * Read in the LV structures for the specified group. Do the read from
17885+ * the first PV in the group. If that one fails, keep trying on the
17886+ * remaining PVs until one works. This function will allocate a buffer
17887+ * for the group to read in the structures.
17888+ **/
17889+static int read_lv(struct lvm_volume_group * group)
17890+{
17891+ struct lvm_physical_volume * pv_entry = group->pv_list;
17892+ unsigned char * lv_buffer = NULL;
17893+ u64 start_sector;
17894+ unsigned long total_sectors, lv_array_size = 0;
17895+ unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
17896+ int i, rc = 1;
17897+
17898+ if (group->lv_array) {
17899+ return 0;
17900+ }
17901+
17902+ if (!pv_entry) {
17903+ LOG_ERROR("Group %s has no PVs. Cannot read LV structures.\n",
17904+ group->vg_name);
17905+ return -EINVAL;
17906+ }
17907+
17908+ /* Allocate a buffer to do the actual I/Os. */
17909+ lv_buffer = kmalloc(buffer_size, GFP_NOIO);
17910+ if (!lv_buffer) {
17911+ LOG_CRITICAL("Error allocating buffer for LV structs for Group %s\n",
17912+ group->vg_name);
17913+ return -ENOMEM;
17914+ }
17915+
17916+ /* Read in the LV structures 4k at a time. If one PV returns errors,
17917+ * start over with the next PV in the group.
17918+ */
17919+ while (rc && pv_entry) {
17920+ start_sector = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.base);
17921+ total_sectors = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.size);
17922+ lv_array_size = round_up(total_sectors * EVMS_VSECTOR_SIZE,
17923+ buffer_size);
17924+
17925+ /* Allocate the buffer for this group to
17926+ * hold the entire LV array.
17927+ */
17928+ if (group->lv_array) {
17929+ vfree(group->lv_array);
17930+ group->lv_array = NULL;
17931+ }
17932+ group->lv_array = vmalloc(lv_array_size);
17933+ if (!group->lv_array) {
17934+ LOG_CRITICAL("Error allocating lv_array buffer for Group %s\n",
17935+ group->vg_name);
17936+ rc = -ENOMEM;
17937+ goto out_kfree;
17938+ }
17939+ memset(group->lv_array, 0, lv_array_size);
17940+
17941+ for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
17942+ rc = INIT_IO(pv_entry->logical_node, 0,
17943+ start_sector + i, IO_BUFFER_SECTORS,
17944+ lv_buffer);
17945+ if (rc) {
17946+ LOG_SERIOUS("Error reading LV metadata from %s in Group %s\n",
17947+ pv_entry->logical_node->name,
17948+ group->vg_name);
17949+
17950+ /* Try the next PV if the current one
17951+ * caused any errors.
17952+ */
17953+ pv_entry = pv_entry->next;
17954+ break;
17955+ }
17956+ /* Copy the I/O buffer into the lv_array. */
17957+ memcpy(&(((char *)(group->lv_array))[i * EVMS_VSECTOR_SIZE]),
17958+ lv_buffer, buffer_size);
17959+ }
17960+ }
17961+
17962+ if (rc) {
17963+ LOG_SERIOUS("Unable to read LV metadata from any PV in Group %s\n",
17964+ group->vg_name);
17965+ goto out_vfree;
17966+ }
17967+
17968+ /* Clear out the unused portion at the end of the lv_array. */
17969+ memset(&(((char *)(group->lv_array))[pv_entry->pv->lv_on_disk.size]),
17970+ 0, lv_array_size - pv_entry->pv->lv_on_disk.size);
17971+
17972+ /* Endian-neutral conversion of the LV metadata. */
17973+ endian_convert_lvs(group);
17974+
17975+out_kfree:
17976+ kfree(lv_buffer);
17977+ return rc;
17978+
17979+out_vfree:
17980+ vfree(group->lv_array);
17981+ group->lv_array = NULL;
17982+ goto out_kfree;
17983+}
17984+
17985+/**
17986+ * endian_convert_pe_map
17987+ *
17988+ * Endian-neutral conversion for PE structures
17989+ **/
17990+static inline void endian_convert_pe_map(struct lvm_physical_volume * pv_entry)
17991+{
17992+ int i;
17993+ for ( i = 0; i < pv_entry->pv->pe_total; i++ ) {
17994+ pv_entry->pe_map[i].lv_num =
17995+ le16_to_cpup(&pv_entry->pe_map[i].lv_num);
17996+ pv_entry->pe_map[i].le_num =
17997+ le16_to_cpup(&pv_entry->pe_map[i].le_num);
17998+ }
17999+}
18000+
18001+/**
18002+ * read_pe_map
18003+ *
18004+ * Read in the PE map for the specified PV. This function will allocate a
18005+ * buffer to read in the data.
18006+ **/
18007+static int read_pe_map(struct lvm_physical_volume * pv_entry)
18008+{
18009+ struct evms_logical_node * node = pv_entry->logical_node;
18010+ struct pv_disk * pv = pv_entry->pv;
18011+ unsigned char * pe_buffer;
18012+ u64 start_sector;
18013+ unsigned long total_sectors, pe_map_size;
18014+ unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
18015+ int i, rc = -ENOMEM;
18016+
18017+ if (pv_entry->pe_map) {
18018+ return 0;
18019+ }
18020+
18021+ start_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base);
18022+ total_sectors = evms_cs_size_in_vsectors(pv->pe_total *
18023+ sizeof(struct pe_disk));
18024+ pe_map_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
18025+
18026+ /* Allocate a buffer for performing the I/O. */
18027+ pe_buffer = kmalloc(buffer_size, GFP_NOIO);
18028+ if (!pe_buffer) {
18029+ LOG_CRITICAL("Error allocating buffer for PE maps for %s\n",
18030+ node->name);
18031+ goto out;
18032+ }
18033+
18034+ /* Allocate a buffer to hold the PE map for this PV. */
18035+ pv_entry->pe_map = vmalloc(pe_map_size);
18036+ if (!pv_entry->pe_map) {
18037+ LOG_CRITICAL("Error allocating PE map for %s\n", node->name);
18038+ goto out_kfree;
18039+ }
18040+ memset(pv_entry->pe_map, 0, pe_map_size);
18041+
18042+ for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
18043+ rc = INIT_IO(node, 0, start_sector + i,
18044+ IO_BUFFER_SECTORS, pe_buffer);
18045+ if (rc) {
18046+ LOG_SERIOUS("Error reading PE maps from %s.\n",
18047+ node->name);
18048+ goto out_vfree;
18049+ }
18050+ /* Copy the data to the actual PE map. */
18051+ memcpy(&(((char *)(pv_entry->pe_map))[i * EVMS_VSECTOR_SIZE]),
18052+ pe_buffer, buffer_size);
18053+ }
18054+
18055+ /* Clear out the unused portion at the end of the PE map. */
18056+ memset(&(((char *)(pv_entry->pe_map))[total_sectors * EVMS_VSECTOR_SIZE]),
18057+ 0, pe_map_size - total_sectors * EVMS_VSECTOR_SIZE);
18058+
18059+ /* Endian-neutral conversion of the PE metadata. */
18060+ endian_convert_pe_map(pv_entry);
18061+
18062+out_kfree:
18063+ kfree(pe_buffer);
18064+out:
18065+ return rc;
18066+
18067+out_vfree:
18068+ vfree(pv_entry->pe_map);
18069+ pv_entry->pe_map = NULL;
18070+ goto out_kfree;
18071+}
18072+
18073+
18074+/********** Snapshot Manipulation Functions **********/
18075+
18076+
18077+/**
18078+ * snapshot_check_quiesce_original
18079+ *
18080+ * For this snapshot LV, check that both it and its original are quiesced.
18081+ **/
18082+static int
18083+snapshot_check_quiesce_original(struct lvm_logical_volume * snap_volume)
18084+{
18085+ struct lvm_logical_volume * org_volume = snap_volume->snapshot_org;
18086+
18087+ if ( ! (snap_volume->lv_access & EVMS_LV_QUIESCED) ) {
18088+ return -EINVAL;
18089+ }
18090+
18091+ if ( org_volume && !(org_volume->lv_access & EVMS_LV_QUIESCED) ) {
18092+ return -EINVAL;
18093+ }
18094+
18095+ return 0;
18096+}
18097+
18098+/**
18099+ * snapshot_check_quiesce_all
18100+ *
18101+ * Go through the list of all snapshots for an original volume, and make
18102+ * sure everyone is in a quiesced state.
18103+ **/
18104+static int snapshot_check_quiesce_all(struct lvm_logical_volume * org_volume)
18105+{
18106+ struct lvm_logical_volume * snap;
18107+
18108+ if ( ! (org_volume->lv_access & EVMS_LV_QUIESCED) ) {
18109+ return -EINVAL;
18110+ }
18111+
18112+ for ( snap = org_volume->snapshot_next;
18113+ snap; snap = snap->snapshot_next ) {
18114+ if ( ! (snap->lv_access & EVMS_LV_QUIESCED) ) {
18115+ return -EINVAL;
18116+ }
18117+ }
18118+
18119+ return 0;
18120+}
18121+
18122+/**
18123+ * invalidate_snapshot_volume
18124+ *
18125+ * In the event a snapshot volume becomes full or corrupted, its metadata
18126+ * must be altered in order to prevent it from being used again. Write some
18127+ * invalid data into the first entry of the COW table. If this volume is
18128+ * not fully deleted by the user/engine, this invalid COW entry will be
18129+ * detected by build_snapshot_maps(), and will cause the volume to be
18130+ * deleted before being exported to EVMS during discover. This is obviously
18131+ * a hack, but it is the same hack currently used by LVM. We're just trying
18132+ * to be compatible. :)
18133+ **/
18134+static int invalidate_snapshot_volume(struct lvm_logical_volume * snap_volume)
18135+{
18136+ struct evms_logical_node tmp_node;
18137+
18138+ tmp_node.private = snap_volume;
18139+ tmp_node.total_vsectors = snap_volume->lv_size;
18140+
18141+ if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
18142+ LOG_WARNING("Volume %s is not a snapshot. Cannot invalidate\n",
18143+ snap_volume->name);
18144+ return -EINVAL;
18145+ }
18146+
18147+ LOG_WARNING("Invalidating full/corrupt snapshot %s\n",
18148+ snap_volume->name);
18149+ LOG_WARNING("Run the EVMS administration tools to remove this snapshot.\n");
18150+
18151+ if (snap_volume->cow_table) {
18152+ snap_volume->cow_table[0].pv_org_rsector =
18153+ cpu_to_le64(((u64)1));
18154+ if ( lvm_init_io(&tmp_node, 4, 0, 1, snap_volume->cow_table) ) {
18155+ LOG_SERIOUS("Unable to invalidate snapshot %s\n",
18156+ snap_volume->name);
18157+ }
18158+ } else {
18159+ LOG_SERIOUS("Unable to invalidate snapshot %s\n",
18160+ snap_volume->name);
18161+ }
18162+
18163+ snap_volume->lv_status &= ~LV_ACTIVE;
18164+ return 0;
18165+}
18166+
18167+/**
18168+ * remove_snapshot_from_chain
18169+ *
18170+ * Remove a snapshot volume from its original's chain of snapshots. This
18171+ * does not delete the snapshot volume. At runtime, we cannot delete
18172+ * volumes at the region-manager level, because EVMS may have this volume
18173+ * exported, and there is no way to notify EVMS of the deletion. It will
18174+ * eventually need to be deleted in the engine, which will then tell the
18175+ * EVMS kernel services to delete the volume in the kernel.
18176+ **/
18177+static int remove_snapshot_from_chain(struct lvm_logical_volume * snap_volume)
18178+{
18179+ struct lvm_logical_volume * org_volume = snap_volume->snapshot_org;
18180+ struct lvm_logical_volume ** p_volume;
18181+
18182+ if (org_volume) {
18183+ for ( p_volume = &org_volume->snapshot_next;
18184+ *p_volume;
18185+ p_volume = &(*p_volume)->snapshot_next ) {
18186+ if ( *p_volume == snap_volume ) {
18187+ *p_volume = snap_volume->snapshot_next;
18188+ break;
18189+ }
18190+ }
18191+ }
18192+
18193+ snap_volume->snapshot_org = NULL;
18194+ snap_volume->snapshot_next = NULL;
18195+ return 0;
18196+}
18197+
18198+/**
18199+ * snapshot_hash
18200+ *
18201+ * The snapshot hash tables are NEVER going to have 4 billion entries, so
18202+ * we can safely cast the org_sector to 32 bits and just mod it by the
18203+ * hash table size.
18204+ **/
18205+static u32 snapshot_hash(u64 org_sector,
18206+ struct lvm_logical_volume * snap_volume)
18207+{
18208+ return (((u32)org_sector) % snap_volume->hash_table_size);
18209+}
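+/* Example: with a (hypothetical) hash_table_size of 128, org_sector
+ * 0x100000005 truncates to the u32 0x00000005 and hashes to bucket 5.
+ */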
18210+
18211+/**
18212+ * snapshot_search_hash_chain
18213+ *
18214+ * Search the hash chain that is anchored at the specified head pointer.
18215+ * If the sector number is found, the result pointer is set to that entry
18216+ * in the chain, and a 1 is returned. If the sector is not found, the
18217+ * result pointer is set to the previous entry and 0 is returned. If the
18218+ * result pointer is NULL, this means either the list is empty, or the
18219+ * specified sector should become the first list item.
18220+ **/
18221+static int snapshot_search_hash_chain(u64 org_sector,
18222+ struct snapshot_map_entry * head,
18223+ struct snapshot_map_entry ** result)
18224+{
18225+ struct snapshot_map_entry * curr = head;
18226+ struct snapshot_map_entry * prev = head;
18227+ while ( curr && curr->org_sector < org_sector ) {
18228+ prev = curr;
18229+ curr = curr->next;
18230+ }
18231+ if (!curr) {
18232+ /* Either an empty chain or went off the end of the chain. */
18233+ *result = prev;
18234+ return 0;
18235+ } else if ( curr->org_sector != org_sector ) {
18236+ *result = curr->prev;
18237+ return 0;
18238+ } else {
18239+ /* Found the desired sector. */
18240+ *result = curr;
18241+ return 1;
18242+ }
18243+}
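+/* Example: for a chain whose org_sectors are [16, 48, 80], searching
+ * for 48 returns 1 with *result at the 48 entry; searching for 50
+ * returns 0 with *result at the 48 entry (the insertion point); and
+ * searching for 8 returns 0 with *result == NULL, meaning 8 belongs at
+ * the head of the chain.
+ */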
18244+
18245+/**
18246+ * insert_snapshot_map_entry
18247+ *
18248+ * Insert a new entry into a snapshot hash chain, immediately following the
18249+ * specified entry. This function should not be used to add an entry into
18250+ * an empty list, or as the first entry in an existing list. For that case,
18251+ * use insert_snapshot_map_entry_at_head().
18252+ **/
18253+static int insert_snapshot_map_entry(struct snapshot_map_entry * entry,
18254+ struct snapshot_map_entry * base)
18255+{
18256+ entry->next = base->next;
18257+ entry->prev = base;
18258+ base->next = entry;
18259+ if (entry->next) {
18260+ entry->next->prev = entry;
18261+ }
18262+ return 0;
18263+}
18264+
18265+/**
18266+ * insert_snapshot_map_entry_at_head
18267+ *
18268+ * Insert a new entry into a snapshot chain as the first entry.
18269+ **/
18270+static int insert_snapshot_map_entry_at_head(struct snapshot_map_entry * entry,
18271+ struct snapshot_map_entry ** head)
18272+{
18273+ entry->next = *head;
18274+ entry->prev = NULL;
18275+ *head = entry;
18276+ if (entry->next) {
18277+ entry->next->prev = entry;
18278+ }
18279+ return 0;
18280+}
18281+
18282+/**
18283+ * add_cow_entry_to_snapshot_map
18284+ *
18285+ * Convert a cow table entry (from the on-disk data) into an appropriate
18286+ * entry for the snapshot map. Insert this new entry into the appropriate
18287+ * map for the specified volume.
18288+ *
18289+ * The cow_entry passed into this function must have already been
18290+ * endian-converted from disk-order to cpu-order.
18291+ **/
18292+static int add_cow_entry_to_snapshot_map(struct lv_COW_table_disk * cow_entry,
18293+ struct lvm_logical_volume * volume)
18294+{
18295+ struct snapshot_map_entry * new_entry, * target_entry;
18296+ struct snapshot_map_entry ** hash_table, * chain_head;
18297+ u32 hash_value;
18298+
18299+ if ( cow_entry->pv_org_number == 0 ) {
18300+ return -EINVAL;
18301+ }
18302+
18303+ new_entry = allocate_snapshot_map_entry(cow_entry->pv_org_rsector,
18304+ cow_entry->pv_snap_rsector);
18305+ if (!new_entry) {
18306+ return -ENOMEM;
18307+ }
18308+
18309+ new_entry->snap_pv = find_pv_by_number(cow_entry->pv_snap_number,
18310+ volume->group);
18311+ if (!new_entry->snap_pv) {
18312+ kfree(new_entry);
18313+ return -EINVAL;
18314+ }
18315+
18316+ hash_value = snapshot_hash(new_entry->org_sector, volume);
18317+ hash_table = volume->snapshot_map[cow_entry->pv_org_number];
18318+ chain_head = hash_table[hash_value];
18319+ if ( snapshot_search_hash_chain(new_entry->org_sector,
18320+ chain_head, &target_entry) ) {
18321+ /* In general, we should not find this entry in the snapshot
18322+		 * map already. It could happen on a re-discovery, but the
18323+		 * build_snapshot_maps function should weed out those cases.
18324+ * In either event, we can simply ignore duplicates.
18325+ */
18326+ LOG_WARNING("Detected a duplicate snapshot map entry\n");
18327+ LOG_WARNING("Snap PV "PFU64":"PFU64", Org PV "PFU64":"PFU64"\n",
18328+ cow_entry->pv_snap_number,
18329+ cow_entry->pv_snap_rsector,
18330+ cow_entry->pv_org_number,
18331+ cow_entry->pv_org_rsector);
18332+ kfree(new_entry);
18333+ } else {
18334+ if (target_entry) {
18335+ insert_snapshot_map_entry(new_entry, target_entry);
18336+ } else {
18337+ insert_snapshot_map_entry_at_head(new_entry,
18338+ &hash_table[hash_value]);
18339+ }
18340+ }
18341+
18342+ return 0;
18343+}
18344+
18345+/**
18346+ * snapshot_remap_sector
18347+ *
18348+ * Perform a sector remap on a snapshot volume. This should be called from
18349+ * the I/O read path, after the LE-to-PE translation has already been
18350+ * performed. First, determine the base sector of the chunk containing the
18351+ * specified sector, and save the remainder. Then, perform a search through
18352+ * the snapshot map for the specified volume. If a match is found, change
18353+ * the PV and sector numbers to the new values. If no match is found, leave
18354+ * the values alone, meaning the read should proceed down the original
18355+ * volume.
18356+ **/
18357+static void
18358+snapshot_remap_sector(struct lvm_logical_volume * snap_volume,
18359+ u64 pe_start_sector,
18360+ u64 * sector,
18361+ struct lvm_physical_volume ** pv_entry)
18362+{
18363+ struct snapshot_map_entry ** hash_table;
18364+ struct snapshot_map_entry * chain_head, * result;
18365+ u32 hash_value;
18366+ u64 chunk_sector, remainder;
18367+
18368+ if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
18369+ return;
18370+ }
18371+
18372+ chunk_sector = ((*sector - pe_start_sector) &
18373+ ((u64)(~(snap_volume->chunk_size - 1)))) +
18374+ pe_start_sector;
18375+ remainder = *sector - chunk_sector;
18376+ hash_value = snapshot_hash(chunk_sector, snap_volume);
18377+ hash_table = snap_volume->snapshot_map[(*pv_entry)->pv_number];
18378+ chain_head = hash_table[hash_value];
18379+
18380+ if ( snapshot_search_hash_chain(chunk_sector, chain_head, &result) ) {
18381+ *pv_entry = result->snap_pv;
18382+ *sector = result->snap_sector + remainder;
18383+ }
18384+}
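+
+/* Worked example of the chunk-alignment arithmetic above, with
+ * illustrative values and a chunk_size of 64 sectors:
+ *
+ *   pe_start_sector = 1000, *sector = 1130
+ *   1130 - 1000       = 130
+ *   130 & ~(64 - 1)   = 128          (round down to a chunk boundary)
+ *   chunk_sector      = 128 + 1000 = 1128
+ *   remainder         = 1130 - 1128 = 2
+ *
+ * If chunk 1128 is in the map, the read is redirected to
+ * result->snap_sector + 2 on the snapshot PV; otherwise the values are
+ * left alone and the read falls through to the original volume. Note
+ * that this arithmetic relies on chunk_size being a power of two.
+ */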
18385+
18386+/**
18387+ * snapshot_read_write_chunk
18388+ *
18389+ * This function takes care of reading one chunk of data from the
18390+ * original, and writing it to the snapshot. Since the original now has
18391+ * a fixed-size buffer for this data, we may have to loop to get the
18392+ * whole chunk copied.
18393+ **/
18394+static int snapshot_read_write_chunk(struct lvm_logical_volume * org_volume,
18395+ struct lvm_physical_volume * org_pv,
18396+ u64 chunk_sector,
18397+ struct lvm_logical_volume * snap_volume,
18398+ struct lvm_physical_volume ** snap_pv,
18399+ u64 * snap_sector)
18400+{
18401+ u32 io_size = snap_volume->chunk_size;
18402+ u64 snap_pe_start_sector, size;
18403+ int i, iterations = 1;
18404+
18405+ if ( org_volume->chunk_size < snap_volume->chunk_size ) {
18406+ iterations = snap_volume->chunk_size / org_volume->chunk_size;
18407+ io_size = org_volume->chunk_size;
18408+ }
18409+
18410+ remap_sector(snap_volume->volume_node, snap_volume->next_free_chunk, 1,
18411+ snap_sector, &size, &snap_pe_start_sector, snap_pv);
18412+
18413+ /* Check for an incomplete volume. */
18414+ if (!*snap_sector || !*snap_pv) {
18415+ invalidate_snapshot_volume(snap_volume);
18416+ return -1;
18417+ }
18418+
18419+ for ( i = 0; i < iterations; i++ ) {
18420+
18421+ /* Read the chunk from the original volume. This is a physical
18422+ * read, not logical. Thus, stripe boundary considerations are
18423+ * unnecessary. Also, chunks are always aligned with PEs, so PE
18424+ * boundary considerations are unnecessary.
18425+ */
18426+ if ( INIT_IO(org_pv->logical_node, 0,
18427+ chunk_sector + i * io_size, io_size,
18428+ org_volume->chunk_data_buffer) ) {
18429+ return 1;
18430+ }
18431+
18432+ /* Write this chunk to the snapshot volume. This does duplicate
18433+ * the local init_io code, but we need to have the remapped
18434+ * sector later on, so this is slightly more efficient. Snapshot
18435+ * volumes cannot be striped, so there is no need to consider
18436+		 * stripe-boundary conditions. And just like the read
18437+		 * above, chunks are always aligned with PEs, so we
18438+ * don't have to consider PE-boundary conditions.
18439+ */
18440+ if ( INIT_IO((*snap_pv)->logical_node, 1,
18441+ *snap_sector + i * io_size, io_size,
18442+ org_volume->chunk_data_buffer) ) {
18443+ /* An error writing the chunk to the snapshot is the
18444+ * same situation as the snapshot being full.
18445+ */
18446+ invalidate_snapshot_volume(snap_volume);
18447+ return -1;
18448+ }
18449+ }
18450+
18451+ return 0;
18452+}
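+
+/* Example of the copy splitting above, with illustrative sizes: if the
+ * snapshot uses 64-sector chunks but the original's chunk_data_buffer
+ * holds only 16 sectors, the copy becomes iterations = 64 / 16 = 4
+ * read/write passes of io_size = 16 sectors each, at offsets +0, +16,
+ * +32 and +48 from chunk_sector. When the buffer is at least one
+ * snapshot chunk in size, a single pass suffices.
+ */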
18453+
18454+/**
18455+ * snapshot_copy_data
18456+ *
18457+ * On a write to a snapshotted volume, check all snapshots to see if the
18458+ * specified chunk has already been remapped. If it has not, read the
18459+ * original data from the volume, write the data to the next available
18460+ * chunk on the snapshot, update the COW table, write the COW table to
18461+ * the snapshot, and insert a new entry into the snapshot map.
18462+ *
18463+ * This function now copies data to a single snapshot; looping over the
18464+ * original's full snapshot chain is left up to lvm_write.
18465+ **/
18466+static int snapshot_copy_data(struct lvm_logical_volume * org_volume,
18467+ struct lvm_logical_volume * snap_volume,
18468+ u64 pe_start_sector,
18469+ u64 org_sector,
18470+ struct lvm_physical_volume * org_pv)
18471+{
18472+ struct lvm_physical_volume * snap_pv;
18473+ struct snapshot_map_entry ** hash_table, * chain_head;
18474+ struct snapshot_map_entry * target_entry, * new_map_entry;
18475+ u64 chunk_sector, snap_sector;
18476+ u32 hash_value;
18477+ int rc = 0;
18478+
18479+ /* Lock out this snapshot while we are remapping. */
18480+ down(&snap_volume->snap_semaphore);
18481+
18482+ /* Make sure the snapshot has not been deactivated. */
18483+ if ( ! (snap_volume->lv_status & LV_ACTIVE) ) {
18484+ goto out;
18485+ }
18486+
18487+ /* Search the hash table to see if this sector has already been
18488+ * remapped on this snapshot.
18489+ */
18490+ chunk_sector = ((org_sector - pe_start_sector) &
18491+ ((u64)(~(snap_volume->chunk_size - 1)))) +
18492+ pe_start_sector;
18493+ hash_value = snapshot_hash(chunk_sector, snap_volume);
18494+ hash_table = snap_volume->snapshot_map[org_pv->pv_number];
18495+ chain_head = hash_table[hash_value];
18496+
18497+ if ( snapshot_search_hash_chain(chunk_sector,
18498+ chain_head, &target_entry) ) {
18499+ /* Chunk is already remapped. */
18500+ goto out;
18501+ }
18502+
18503+ /* Is there room on the snapshot to remap this chunk? */
18504+ if ( snap_volume->next_free_chunk >= snap_volume->lv_size ) {
18505+ /* At this point, the snapshot is full. Any further
18506+ * writes to the original will cause the snapshot to
18507+ * become "corrupt" because they can't be remapped.
18508+ * Take this snapshot permanently offline.
18509+ */
18510+ goto out_invalidate;
18511+ }
18512+
18513+ rc = snapshot_read_write_chunk(org_volume, org_pv, chunk_sector,
18514+ snap_volume, &snap_pv, &snap_sector);
18515+ if (rc) {
18516+ rc = (rc > 0) ? -EIO : 0;
18517+ goto out;
18518+ }
18519+
18520+ /* Fill in the appropriate COW table entry and write that
18521+ * metadata sector back to the snapshot volume. Since we are
18522+ * only writing one sector, there are no boundary conditions.
18523+ * Must endian-convert each entry as it is added.
18524+ */
18525+ snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_number =
18526+ cpu_to_le64((u64)(org_pv->pv_number));
18527+ snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_rsector =
18528+ cpu_to_le64p(&chunk_sector);
18529+ snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_number =
18530+ cpu_to_le64((u64)(snap_pv->pv_number));
18531+ snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_rsector =
18532+ cpu_to_le64p(&snap_sector);
18533+
18534+ if ( lvm_init_io(snap_volume->volume_node, 4,
18535+ snap_volume->current_cow_sector,
18536+ 1, snap_volume->cow_table) ) {
18537+ /* The data was written to the snapshot, but
18538+ * writing the metadata failed.
18539+ */
18540+ goto out_invalidate;
18541+ }
18542+
18543+ snap_volume->next_cow_entry++;
18544+ if ( snap_volume->next_cow_entry >=
18545+ (EVMS_VSECTOR_SIZE / sizeof (struct lv_COW_table_disk)) ) {
18546+ snap_volume->next_cow_entry = 0;
18547+ snap_volume->current_cow_sector++;
18548+ memset(snap_volume->cow_table, 0, EVMS_VSECTOR_SIZE);
18549+ if ( lvm_init_io(snap_volume->volume_node, 4,
18550+ snap_volume->current_cow_sector,
18551+ 1, snap_volume->cow_table) ) {
18552+ /* Can't clear out the next sector of metadata. */
18553+ goto out_invalidate;
18554+ }
18555+ }
18556+ snap_volume->next_free_chunk += snap_volume->chunk_size;
18557+
18558+ /* Create a new snapshot map entry and add it in the appropriate
18559+ * place in the map.
18560+ */
18561+ new_map_entry = allocate_snapshot_map_entry(chunk_sector, snap_sector);
18562+ if (!new_map_entry) {
18563+ rc = -ENOMEM;
18564+ goto out_invalidate;
18565+ }
18566+ new_map_entry->snap_pv = snap_pv;
18567+ if (target_entry) {
18568+ insert_snapshot_map_entry(new_map_entry, target_entry);
18569+ } else {
18570+ insert_snapshot_map_entry_at_head(new_map_entry,
18571+ &(hash_table[hash_value]));
18572+ }
18573+
18574+out:
18575+ up(&snap_volume->snap_semaphore);
18576+ return rc;
18577+
18578+out_invalidate:
18579+ invalidate_snapshot_volume(snap_volume);
18580+ goto out;
18581+}
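+
+/* Sizing note for the COW-table handling above: each metadata sector
+ * holds EVMS_VSECTOR_SIZE / sizeof(struct lv_COW_table_disk) entries.
+ * Assuming 512-byte vsectors and four u64 fields per entry, that is
+ * 512 / 32 = 16 remaps per sector; after the 16th entry,
+ * next_cow_entry wraps to 0, the table buffer is zeroed, and
+ * current_cow_sector advances by one.
+ */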
18582+
18583+/**
18584+ * get_snapshot_stats
18585+ **/
18586+static int get_snapshot_stats(struct lvm_snapshot_stat_ioctl * snap_stats)
18587+{
18588+ struct lvm_logical_volume * volume;
18589+ struct lvm_volume_group * group;
18590+
18591+ /* Make sure the parameters are in range. */
18592+ if ( snap_stats->lv_number < 1 || snap_stats->lv_number > MAX_LV ) {
18593+ return 1;
18594+ }
18595+
18596+ /* Make sure the specified group and volume exist, and that
18597+ * this is a snapshot volume.
18598+ */
18599+ find_group_by_uuid(snap_stats->vg_uuid, &group);
18600+ if ( ! group ||
18601+ ! (volume = group->volume_list[snap_stats->lv_number]) ||
18602+ ! (volume->lv_access & LV_SNAPSHOT) ) {
18603+ return 1;
18604+ }
18605+
18606+	/* Return the starting LBA of the next available chunk, and the LV status. */
18607+ snap_stats->next_free_chunk = volume->next_free_chunk;
18608+ snap_stats->lv_status = volume->lv_status;
18609+
18610+ return 0;
18611+}
18612+
18613+
18614+/********** Memory Allocation/Deallocation Functions **********/
18615+
18616+
18617+/**
18618+ * deallocate_physical_volume
18619+ *
18620+ * Free the memory used by this physical volume. Do not delete the EVMS
18621+ * node in this function, since this could be called during an error
18622+ * path when we want to save the logical node.
18623+ **/
18624+static int deallocate_physical_volume(struct lvm_physical_volume * pv_entry)
18625+{
18626+ if (pv_entry->pv) {
18627+ kfree(pv_entry->pv);
18628+ pv_entry->pv = NULL;
18629+ }
18630+
18631+ if (pv_entry->pe_map) {
18632+ vfree(pv_entry->pe_map);
18633+ pv_entry->pe_map = NULL;
18634+ }
18635+
18636+ kfree(pv_entry);
18637+ return 0;
18638+}
18639+
18640+/**
18641+ * allocate_physical_volume
18642+ *
18643+ * Create a new struct lvm_physical_volume for the specified node.
18644+ * Initialize the new PV with the evms node and lvm pv information.
18645+ **/
18646+static struct lvm_physical_volume *
18647+allocate_physical_volume(struct evms_logical_node * node, struct pv_disk * pv)
18648+{
18649+ struct lvm_physical_volume * new_pv;
18650+
18651+ new_pv = kmalloc(sizeof(struct lvm_physical_volume), GFP_NOIO);
18652+ if (!new_pv) {
18653+ LOG_CRITICAL("Error allocating physical volume for %s.\n",
18654+ node->name);
18655+ kfree(pv);
18656+ goto out;
18657+ }
18658+
18659+ /* Initialize the PV. */
18660+ memset(new_pv, 0, sizeof(struct lvm_physical_volume));
18661+ new_pv->logical_node = node;
18662+ new_pv->pv = pv;
18663+ new_pv->pv_number = pv->pv_number;
18664+
18665+out:
18666+ return new_pv;
18667+}
18668+
18669+/**
18670+ * allocate_snapshot_map_entry
18671+ *
18672+ * Allocate memory for a new entry in the snapshot map and fill in the
18673+ * sector values. The PV pointer is not filled in here, but can easily
18674+ * be found by using the find_pv_by_number function.
18675+ **/
18676+static struct snapshot_map_entry * allocate_snapshot_map_entry(u64 org_sector,
18677+ u64 snap_sector)
18678+{
18679+ struct snapshot_map_entry * new_entry;
18680+
18681+ new_entry = kmalloc(sizeof(struct snapshot_map_entry), GFP_NOIO);
18682+ if (!new_entry) {
18683+ goto out;
18684+ }
18685+ memset(new_entry, 0, sizeof(struct snapshot_map_entry));
18686+ new_entry->org_sector = org_sector;
18687+ new_entry->snap_sector = snap_sector;
18688+out:
18689+ return new_entry;
18690+}
18691+
18692+/**
18693+ * deallocate_snapshot_map
18694+ *
18695+ * This function will delete one hash table, which is part of the whole
18696+ * snapshot remapping structure. Each hash table is an array of pointers
18697+ * to linked lists of struct snapshot_map_entry's.
18698+ **/
18699+static int deallocate_snapshot_map(struct snapshot_map_entry ** table,
18700+ u32 table_size)
18701+{
18702+ struct snapshot_map_entry * entry, * next;
18703+ int i;
18704+
18705+ if (table) {
18706+ for ( i = 0; i < table_size; i++ ) {
18707+ for ( entry = table[i]; entry; entry = next ) {
18708+ next = entry->next;
18709+ kfree(entry);
18710+ }
18711+ }
18712+ vfree(table);
18713+ }
18714+ return 0;
18715+}
18716+
18717+/**
18718+ * deallocate_logical_volume
18719+ *
18720+ * Delete the in-memory representation of a single LVM logical volume,
18721+ * including its PE map and any snapshot data. Do not alter the parent
18722+ * volume group, except to remove this volume from its volume list.
18723+ **/
18724+static int deallocate_logical_volume(struct lvm_logical_volume * volume)
18725+{
18726+ struct lvm_volume_group * group = volume->group;
18727+ struct lvm_logical_volume * org_volume, * snap_volume;
18728+ int i;
18729+
18730+ if ( volume->lv_access & LV_SNAPSHOT ) {
18731+ /* This volume is a snapshot. Remove it from the linked
18732+ * list of volumes that are snapshotting the original.
18733+ * First, the original volume must be quiesced.
18734+ */
18735+ org_volume = volume->snapshot_org;
18736+
18737+ if ( snapshot_check_quiesce_original(volume) ) {
18738+ return -EINVAL;
18739+ }
18740+
18741+ remove_snapshot_from_chain(volume);
18742+
18743+ /* If the snapshot that was just removed was the last/only
18744+ * volume snapshotting the original, then mark the original
18745+ * as no longer being snapshotted.
18746+ */
18747+ if ( org_volume && !org_volume->snapshot_next ) {
18748+ org_volume->lv_access &= ~LV_SNAPSHOT_ORG;
18749+ }
18750+ } else if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
18751+ /* If this volume is a snapshot original, all of its snapshots
18752+		 * must also be deleted. However, those deletions need to be
18753+ * taken care of by the engine. So just check that they have
18754+ * all been quiesced before removing the original.
18755+ */
18756+ if ( snapshot_check_quiesce_all(volume) ) {
18757+ return -EINVAL;
18758+ }
18759+
18760+ /* In case there are any snapshots remaining, we must clear out
18761+ * their pointers to this original to prevent errors when those
18762+ * snapshots are accessed or deleted.
18763+ */
18764+ for ( snap_volume = volume->snapshot_next;
18765+ snap_volume; snap_volume = snap_volume->snapshot_next ) {
18766+ snap_volume->snapshot_org = NULL;
18767+ }
18768+ }
18769+
18770+ if (volume->name) {
18771+ LOG_DEBUG("Deleting volume %s\n", volume->name);
18772+ }
18773+
18774+ /* Free all the memory. This includes the LE-to-PE map, any snapshot
18775+ * hash tables, the COW table, and chunk data buffer.
18776+ */
18777+ if (volume->le_map) {
18778+ vfree(volume->le_map);
18779+ volume->le_map = NULL;
18780+ }
18781+ if (volume->snapshot_map) {
18782+ for ( i = 1; i <= group->pv_count; i++ ) {
18783+ deallocate_snapshot_map(volume->snapshot_map[i],
18784+ volume->hash_table_size);
18785+ }
18786+ kfree(volume->snapshot_map);
18787+ volume->snapshot_map = NULL;
18788+ }
18789+ if (volume->cow_table) {
18790+ kfree(volume->cow_table);
18791+ volume->cow_table = NULL;
18792+ }
18793+ if (volume->chunk_data_buffer) {
18794+ kfree(volume->chunk_data_buffer);
18795+ volume->chunk_data_buffer = NULL;
18796+ }
18797+
18798+ /* Remove this volume from the group's list. */
18799+ if ( group && group->volume_list[volume->lv_number] == volume ) {
18800+ group->volume_list[volume->lv_number] = NULL;
18801+ group->volume_count--;
18802+ }
18803+
18804+ kfree(volume);
18805+ return 0;
18806+}
18807+
18808+/**
18809+ * allocate_logical_volume
18810+ *
18811+ * Allocate space for a new LVM logical volume, including space for the
18812+ * LE-to-PE map and any necessary snapshot data.
18813+ **/
18814+static struct lvm_logical_volume *
18815+allocate_logical_volume(struct lv_disk * lv, struct lvm_volume_group * group)
18816+{
18817+ struct lvm_logical_volume * new_volume;
18818+ u32 table_entries_per_chunk, table_chunks;
18819+ int i;
18820+
18821+ /* Allocate space for the new logical volume. */
18822+ new_volume = kmalloc(sizeof(struct lvm_logical_volume), GFP_NOIO);
18823+ if (!new_volume) {
18824+ LOG_CRITICAL("Error allocating new logical volume %s\n",
18825+ lv->lv_name);
18826+ goto out;
18827+ }
18828+ memset(new_volume, 0, sizeof(struct lvm_logical_volume));
18829+
18830+ /* Allocate space for the LE to PE mapping table. */
18831+ new_volume->le_map = vmalloc(lv->lv_allocated_le *
18832+ sizeof(struct le_table_entry));
18833+ if (!new_volume->le_map) {
18834+ LOG_CRITICAL("Error creating LE map for logical volume %s\n",
18835+ lv->lv_name);
18836+ goto error;
18837+ }
18838+ memset(new_volume->le_map, 0,
18839+ lv->lv_allocated_le * sizeof(struct le_table_entry));
18840+
18841+ /* Initialize the rest of the new volume.
18842+ * Need the +1 on lv_number to match the PE Map entries on the PV.
18843+ */
18844+ new_volume->lv_number = lv->lv_number + 1;
18845+ new_volume->lv_size = lv->lv_size;
18846+ new_volume->lv_access = lv->lv_access | EVMS_LV_NEW | EVMS_LV_QUIESCED;
18847+ new_volume->lv_status = lv->lv_status | LV_ACTIVE;
18848+ new_volume->lv_minor = MINOR(lv->lv_dev);
18849+ new_volume->stripes = lv->lv_stripes;
18850+ new_volume->stripe_size = lv->lv_stripesize;
18851+ new_volume->stripe_size_shift = evms_cs_log2(lv->lv_stripesize);
18852+ new_volume->pe_size = group->vg->pe_size;
18853+ new_volume->pe_size_shift = evms_cs_log2(group->vg->pe_size);
18854+ new_volume->num_le = lv->lv_allocated_le;
18855+ new_volume->group = group;
18856+ /* Different naming scheme for EVMS nodes. */
18857+ if ( translate_lv_name(lv->lv_name, new_volume->name) ) {
18858+ goto error;
18859+ }
18860+
18861+ if ( new_volume->lv_access & LV_SNAPSHOT ) {
18862+ /* This volume is a snapshot, initialize the remaining data,
18863+ * and allocate space for the remapping structures, and one
18864+ * sector's worth of COW tables.
18865+ */
18866+ new_volume->chunk_size = lv->lv_chunk_size;
18867+ new_volume->num_chunks = lv->lv_size / lv->lv_chunk_size;
18868+ new_volume->snap_org_minor = lv->lv_snapshot_minor;
18869+ new_volume->next_cow_entry = 0;
18870+ new_volume->current_cow_sector = 0;
18871+ table_entries_per_chunk = (new_volume->chunk_size <<
18872+ EVMS_VSECTOR_SIZE_SHIFT) /
18873+ sizeof(struct lv_COW_table_disk);
18874+ table_chunks = (new_volume->num_chunks +
18875+ table_entries_per_chunk - 1) /
18876+ table_entries_per_chunk;
18877+ new_volume->next_free_chunk = table_chunks *
18878+ new_volume->chunk_size;
18879+ new_volume->hash_table_size = (lv->lv_size / lv->lv_chunk_size /
18880+ MAX_HASH_CHAIN_ENTRIES) + 1;
18881+
18882+ new_volume->cow_table = kmalloc(EVMS_VSECTOR_SIZE, GFP_NOIO);
18883+ if (!new_volume->cow_table) {
18884+ LOG_CRITICAL("Error allocating COW table for logical volume %s\n",
18885+ lv->lv_name);
18886+ goto error;
18887+ }
18888+ memset(new_volume->cow_table, 0, EVMS_VSECTOR_SIZE);
18889+
18890+ new_volume->snapshot_map = kmalloc((group->pv_count + 1) *
18891+ sizeof(struct snapshot_map_entry **),
18892+ GFP_NOIO);
18893+ if (!new_volume->snapshot_map) {
18894+ LOG_CRITICAL("Error allocating snapshot map for logical volume %s\n",
18895+ lv->lv_name);
18896+ goto error;
18897+ }
18898+
18899+ new_volume->snapshot_map[0] = NULL;
18900+ for ( i = 1; i <= group->pv_count; i++ ) {
18901+ new_volume->snapshot_map[i] =
18902+ vmalloc(new_volume->hash_table_size *
18903+ sizeof(struct snapshot_map_entry *));
18904+ if (!new_volume->snapshot_map[i]) {
18905+ LOG_CRITICAL("Error allocating snapshot sub-map for logical volume %s\n",
18906+ lv->lv_name);
18907+ goto error;
18908+ }
18909+ memset(new_volume->snapshot_map[i], 0,
18910+ new_volume->hash_table_size *
18911+ sizeof(struct snapshot_map_entry *));
18912+ }
18913+ init_MUTEX(&new_volume->snap_semaphore);
18914+ } else if ( new_volume->lv_access & LV_SNAPSHOT_ORG ) {
18915+ /* This volume is a snapshot original, allocate space to use for
18916+ * copying snapshot chunks. This will now be a fixed size
18917+ * instead of being based on the chunk size of the snapshots.
18918+ */
18919+ new_volume->chunk_size = CHUNK_DATA_BUFFER_SIZE;
18920+ new_volume->chunk_data_buffer =
18921+ kmalloc(new_volume->chunk_size <<
18922+ EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO);
18923+ if (!new_volume->chunk_data_buffer) {
18924+ LOG_SERIOUS("Error allocating snapshot chunk buffer for logical volume %s\n",
18925+ lv->lv_name);
18926+ goto error;
18927+ }
18928+ memset(new_volume->chunk_data_buffer, 0,
18929+ new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT);
18930+ }
18931+
18932+out:
18933+ return new_volume;
18934+error:
18935+ deallocate_logical_volume(new_volume);
18936+ new_volume = NULL;
18937+ goto out;
18938+}
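+
+/* Worked example of the snapshot sizing above, with illustrative
+ * numbers: a 204800-sector snapshot LV with 64-sector chunks has
+ * num_chunks = 204800 / 64 = 3200. Assuming 512-byte vsectors,
+ * table_entries_per_chunk = (64 * 512) / 32 = 1024 COW entries per
+ * chunk, so table_chunks = (3200 + 1023) / 1024 = 4 chunks are
+ * reserved for the on-disk COW table, and user data starts at
+ * next_free_chunk = 4 * 64 = 256. If MAX_HASH_CHAIN_ENTRIES were 10,
+ * hash_table_size would come to 3200 / 10 + 1 = 321 buckets per PV.
+ */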
18939+
18940+/**
18941+ * deallocate_volume_group
18942+ *
18943+ * Delete the entire in-memory representation of an LVM volume group,
18944+ * including all PVs and logical volumes. If this group is on LVM's
18945+ * volume group list, remove it.
18946+ **/
18947+static int deallocate_volume_group(struct lvm_volume_group * group)
18948+{
18949+ struct lvm_physical_volume * pv_entry, * next_pv;
18950+ int i;
18951+
18952+ LOG_DEBUG("Deleting volume group %s\n", group->vg_name);
18953+
18954+ /* Remove the group from the global list. */
18955+ remove_group_from_list(group);
18956+
18957+ /* Delete the LV metadata array. */
18958+ if (group->lv_array) {
18959+ vfree(group->lv_array);
18960+ group->lv_array = NULL;
18961+ }
18962+
18963+ /* Delete the PV UUID list. */
18964+ if (group->uuid_list) {
18965+ vfree(group->uuid_list);
18966+ group->uuid_list = NULL;
18967+ }
18968+
18969+ /* Delete all logical volumes. */
18970+ for ( i = 1; i <= MAX_LV; i++ ) {
18971+ if (group->volume_list[i]) {
18972+ deallocate_logical_volume(group->volume_list[i]);
18973+ group->volume_list[i] = NULL;
18974+ }
18975+ }
18976+
18977+ /* Delete all PVs from the group's list. */
18978+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = next_pv ) {
18979+ next_pv = pv_entry->next;
18980+ if (pv_entry->logical_node) {
18981+ /* Send a delete command down to the segment manager. */
18982+ LOG_DEBUG("Deleting PV %s from group %s\n",
18983+ pv_entry->logical_node->name, group->vg_name);
18984+ DELETE(pv_entry->logical_node);
18985+ pv_entry->logical_node = NULL;
18986+ }
18987+ deallocate_physical_volume(pv_entry);
18988+ }
18989+
18990+ /* Delete the VG metadata. */
18991+ if (group->vg) {
18992+ kfree(group->vg);
18993+ group->vg = NULL;
18994+ }
18995+
18996+ kfree(group);
18997+ return 0;
18998+}
18999+
19000+/**
19001+ * allocate_volume_group
19002+ *
19003+ * Allocate space for a new LVM volume group and all of its sub-fields.
19004+ * Initialize the appropriate fields.
19005+ * The vg parameter must already point to an allocated/initialized struct vg_disk.
19006+ **/
19007+static struct lvm_volume_group * allocate_volume_group(struct vg_disk * vg,
19008+ u8 * vg_name)
19009+{
19010+ struct lvm_volume_group * new_group;
19011+
19012+ /* The volume group itself. */
19013+ new_group = kmalloc(sizeof(struct lvm_volume_group), GFP_NOIO);
19014+ if (!new_group) {
19015+ kfree(vg);
19016+ goto out;
19017+ }
19018+
19019+ /* Initialize the new group. */
19020+ memset(new_group, 0, sizeof(struct lvm_volume_group));
19021+ memcpy(new_group->vg_uuid, vg->vg_uuid, UUID_LEN);
19022+ strncpy(new_group->vg_name, vg_name, NAME_LEN - 1);
19023+ new_group->vg = vg;
19024+ /* Default sector and block sizes. */
19025+ new_group->hard_sect_size = 512;
19026+ new_group->block_size = 1024;
19027+ new_group->flags = EVMS_VG_DIRTY;
19028+
19029+ LOG_DETAILS("Discovered volume group %s\n", new_group->vg_name);
19030+
19031+out:
19032+ return new_group;
19033+}
19034+
19035+/**
19036+ * remove_pv_from_group
19037+ *
19038+ * In the engine, when a PV is removed from a group (on a vgreduce), that
19039+ * same PV must be removed from that group in the kernel. Otherwise, when
19040+ * the rediscover occurs, that PV will still appear in the group, and
19041+ * will cause segfaults when we try to read metadata from it.
19042+ **/
19043+static int remove_pv_from_group(int pv_number, unsigned char * vg_uuid)
19044+{
19045+ struct lvm_volume_group * group;
19046+ struct lvm_physical_volume * pv_entry;
19047+ struct lvm_physical_volume ** p_pv_entry;
19048+
19049+ /* Make sure the numbers are in range. */
19050+ if ( pv_number < 0 || pv_number > MAX_PV ) {
19051+ return 0;
19052+ }
19053+
19054+ /* Make sure the group exists. */
19055+ find_group_by_uuid(vg_uuid, &group);
19056+ if (!group) {
19057+ return 0;
19058+ }
19059+
19060+ /* Make sure the PV is in this group. */
19061+ pv_entry = find_pv_by_number(pv_number, group);
19062+ if (!pv_entry) {
19063+ LOG_WARNING("Did not find PV %d in group %s\n",
19064+ pv_number, group->vg_name);
19065+ return 0;
19066+ }
19067+
19068+ /* Make sure the PV is not in use by any volumes. */
19069+ if ( check_pv_for_lv(pv_entry, group) ) {
19070+ LOG_SERIOUS("PV %d in group %s still contains LVs\n",
19071+ pv_number, group->vg_name);
19072+ return -EINVAL;
19073+ }
19074+
19075+ /* Take this PV out of the group's list. */
19076+ for ( p_pv_entry = &group->pv_list;
19077+ *p_pv_entry; p_pv_entry = &(*p_pv_entry)->next ) {
19078+ if ( *p_pv_entry == pv_entry ) {
19079+ *p_pv_entry = (*p_pv_entry)->next;
19080+ pv_entry->next = NULL;
19081+ break;
19082+ }
19083+ }
19084+
19085+ group->pv_count--;
19086+
19087+ /* There is no way that this PV was the last in this group, so the
19088+ * group never needs to be deleted at this point. The only way this
19089+ * group will exist in the kernel is if there are volumes exported from
19090+ * it. If this was the last PV, then those volumes must be on that PV,
19091+ * and it wouldn't be allowed to be removed from the group (above).
19092+ */
19093+
19094+ /* Free up the memory for this PV. Just drop the node. */
19095+ deallocate_physical_volume(pv_entry);
19096+
19097+ LOG_DEBUG("PV %d removed from group %s\n", pv_number, group->vg_name);
19098+ return 0;
19099+}
19100+
19101+
19102+/********** Consistency Checking Functions **********/
19103+
19104+
19105+/**
19106+ * clear_le_entries_for_missing_pv
19107+ *
19108+ * In the event that a PV turns up missing during a rediscover, we
19109+ * need to erase any LE map entries that might point to it.
19110+ **/
19111+static void
19112+clear_le_entries_for_missing_pv(struct lvm_volume_group * group,
19113+ struct lvm_physical_volume * pv_entry)
19114+{
19115+ struct lvm_logical_volume * volume;
19116+ int i, j;
19117+
19118+ for ( i = 1; i <= MAX_LV; i++ ) {
19119+ if (group->volume_list[i]) {
19120+ volume = group->volume_list[i];
19121+ for ( j = 0; j < volume->num_le; j++ ) {
19122+ if ( volume->le_map[j].owning_pv == pv_entry ) {
19123+ volume->le_map[j].owning_pv = NULL;
19124+ volume->le_map[j].pe_sector_offset = 0;
19125+ }
19126+ }
19127+ }
19128+ }
19129+}
19130+
19131+/**
19132+ * check_volume_groups
19133+ *
19134+ * This function performs some simple consistency checks on all dirty
19135+ * volume groups. Any groups that have no PVs are deleted. If any metadata
19136+ * structures (PV or VG) are missing, they are read in from disk.
19137+ **/
19138+static int check_volume_groups(void)
19139+{
19140+ struct lvm_volume_group * group, * next_group;
19141+ struct lvm_physical_volume * pv_entry, * next_pv;
19142+ int rc = 0;
19143+
19144+ for ( group = lvm_group_list; group; group = next_group ) {
19145+ next_group = group->next_group;
19146+
19147+ LOG_DEBUG("Checking Group %s\n", group->vg_name);
19148+
19149+		/* If a group has no PVs, warn about it; if it also has
19150+		 * no volumes, it can be safely deleted.
19151+ */
19152+ if (!group->pv_count) {
19153+ LOG_WARNING("No PVs found for Group %s.\n",
19154+ group->vg_name);
19155+ if (!group->volume_count) {
19156+ deallocate_volume_group(group);
19157+ }
19158+ continue;
19159+ }
19160+
19161+ /* Make sure all metadata for the PVs is present. On a
19162+ * rediscover, it may be missing, because we delete it at the
19163+ * end of discovery. If any is missing, read it in from disk.
19164+ * This is only necessary in the kernel. It can't happen in
19165+ * the engine.
19166+ */
19167+ for ( pv_entry = group->pv_list;
19168+ pv_entry; pv_entry = next_pv ) {
19169+ next_pv = pv_entry->next;
19170+ if (!pv_entry->pv) {
19171+ LOG_DEBUG("Re-reading PV metadata for %s\n",
19172+ pv_entry->logical_node->name);
19173+ rc = read_pv(pv_entry->logical_node,
19174+ &pv_entry->pv);
19175+ if (rc) {
19176+ /* What happens if we can't re-read the
19177+ * PV metadata? This PV must be removed
19178+ * from the group. Need to also clear
19179+ * all LE entries in all LVs that are
19180+ * pointing to this PV before it can be
19181+ * removed from the list.
19182+ */
19183+ LOG_SERIOUS("PV metadata is missing or cannot be read from %s\n",
19184+ pv_entry->logical_node->name);
19185+ clear_le_entries_for_missing_pv(group,
19186+ pv_entry);
19187+ remove_pv_from_group(pv_entry->pv_number,
19188+ group->vg_uuid);
19189+ continue;
19190+ }
19191+ pv_entry->pv_number = pv_entry->pv->pv_number;
19192+
19193+				/* Check for a "stale" PV. This case should
19194+ * already be covered, as long as the Engine is
19195+ * calling the PV_REMOVE ioctl when it does a
19196+ * vgreduce or a pvremove. If this is the last
19197+ * PV in the group, the group will be deleted.
19198+ */
19199+ if (!pv_entry->pv_number) {
19200+ remove_pv_from_group(0, group->vg_uuid);
19201+ continue;
19202+ }
19203+ }
19204+
19205+ if (!pv_entry->pe_map) {
19206+ LOG_DEBUG("Re-reading PE maps for %s\n",
19207+ pv_entry->logical_node->name);
19208+ rc = read_pe_map(pv_entry);
19209+ if (rc) {
19210+ LOG_WARNING("Error reading PE maps for %s\n",
19211+ pv_entry->logical_node->name);
19212+ LOG_WARNING("Any volumes residing on %s will be incomplete!\n",
19213+ pv_entry->logical_node->name);
19214+ }
19215+ }
19216+ }
19217+
19218+ /* Make sure the metadata for the VG is present. If it's
19219+ * missing, read it in from the first PV in the VG.
19220+ */
19221+ if (!group->vg && group->pv_count) {
19222+ LOG_DEBUG("Re-reading VG metadata for Group %s\n",
19223+ group->vg_name);
19224+ pv_entry = group->pv_list;
19225+ rc = read_vg(pv_entry->logical_node,
19226+ pv_entry->pv, &group->vg);
19227+ if (rc) {
19228+ /* What happens if we can't re-read the
19229+ * VG metadata? It's definitely bad
19230+ * news. Should we delete the VG?
19231+ */
19232+ continue;
19233+ }
19234+ }
19235+
19236+ /* Display a warning if the number of PVs found for the group
19237+ * doesn't match the number of PVs recorded for the VG.
19238+ */
19239+ if ( group->vg && group->pv_count != group->vg->pv_cur ) {
19240+ LOG_WARNING("Group %s is incomplete.\n",
19241+ group->vg_name);
19242+ LOG_WARNING(" Only %d of %d PVs found.\n",
19243+ group->pv_count, group->vg->pv_cur);
19244+ LOG_WARNING(" Volumes in this group may be incomplete.\n");
19245+ }
19246+ }
19247+
19248+ return 0;
19249+}
19250+
19251+/**
19252+ * check_le_maps
19253+ *
19254+ * Make sure all volumes in this group have valid LE-to-PE maps. Any
19255+ * volume that doesn't is marked as incomplete. This is safe for
19256+ * re-discovery because only new volumes could have corrupted LE maps.
19257+ **/
19258+static int check_le_maps(struct lvm_volume_group * group)
19259+{
19260+ struct lvm_logical_volume * volume;
19261+ int i, j, count;
19262+
19263+ for ( i = 1; i <= MAX_LV; i++ ) {
19264+ volume = group->volume_list[i];
19265+ if (!volume) {
19266+ continue;
19267+ }
19268+
19269+ if (!volume->le_map) {
19270+ /* No point in keeping the volume around if it has
19271+ * no LE map at all.
19272+ */
19273+ LOG_SERIOUS("Volume %s has no LE map.\n", volume->name);
19274+ deallocate_logical_volume(volume);
19275+ continue;
19276+ }
19277+
19278+ /* If any entries in the LE map are missing, mark this volume
19279+ * as incomplete.
19280+ */
19281+ for ( j = 0, count = 0; j < volume->num_le; j++ ) {
19282+ if ( !volume->le_map[j].owning_pv ||
19283+ !volume->le_map[j].pe_sector_offset) {
19284+ count++;
19285+ }
19286+ }
19287+ if (count) {
19288+ LOG_SERIOUS("Volume %s has incomplete LE map.\n",
19289+ volume->name);
19290+ LOG_SERIOUS(" Missing %d out of %d LEs.\n",
19291+ count, volume->num_le);
19292+ volume->lv_access |= EVMS_LV_INCOMPLETE;
19293+ }
19294+ }
19295+ return 0;
19296+}
19297+
19298+/**
19299+ * check_snapshot_map
19300+ *
19301+ * For snapshot volumes, make sure the snapshot map is intact, and that
19302+ * any existing entries in the map are in the correct order, with no
19303+ * duplicate entries.
19304+ **/
19305+static int check_snapshot_map(struct lvm_logical_volume * snap_volume)
19306+{
19307+ struct snapshot_map_entry ** table, * curr;
19308+ int i, j;
19309+
19310+ if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
19311+ return 0;
19312+ }
19313+ if (!snap_volume->snapshot_map) {
19314+ snap_volume->lv_access |= EVMS_LV_INVALID;
19315+ return -EINVAL;
19316+ }
19317+
19318+ for ( i = 1; i <= snap_volume->group->pv_count; i++ ) {
19319+ if (!snap_volume->snapshot_map[i]) {
19320+ snap_volume->lv_access |= EVMS_LV_INVALID;
19321+ return -EINVAL;
19322+ }
19323+ table = snap_volume->snapshot_map[i];
19324+ for ( j = 0; j < snap_volume->hash_table_size; j++ ) {
19325+ for ( curr = table[j]; curr; curr = curr->next ) {
19326+ if ( curr->next &&
19327+ curr->org_sector >=
19328+ curr->next->org_sector) {
19329+ snap_volume->lv_access |=
19330+ EVMS_LV_INVALID;
19331+ return -EINVAL;
19332+ }
19333+ }
19334+ }
19335+ }
19336+ return 0;
19337+}
19338+
19339+/**
19340+ * check_logical_volumes
19341+ *
19342+ * Perform a consistency check on all of the logical volumes that have been
19343+ * discovered. Any volume that has any inconsistencies will be marked as
19344+ * incomplete or invalid, depending on the severity of the problem. At the
19345+ * end, all invalid volumes are deleted; new, incomplete volumes are
19346+ * also deleted unless this is the final discovery pass.
19347+ **/
19348+static int check_logical_volumes(int final_discovery)
19349+{
19350+ struct lvm_volume_group * group;
19351+ struct lvm_logical_volume * volume, * snap, * next;
19352+ int count, i, j;
19353+
19354+ /* Check every valid, dirty volume group. */
19355+ for ( group = lvm_group_list; group; group = group->next_group ) {
19356+ if ( ! (group->flags & EVMS_VG_DIRTY) ) {
19357+ continue;
19358+ }
19359+ /* Check every valid volume in this group. */
19360+ for ( i = 1; i <= MAX_LV; i++ ) {
19361+ volume = group->volume_list[i];
19362+ if (!volume) {
19363+ continue;
19364+ }
19365+
19366+ LOG_DEBUG("Checking logical volume %s\n", volume->name);
19367+
19368+ if (!volume->group) {
19369+ volume->group = group;
19370+ }
19371+
19372+ /* All LE-map entries must have valid values. The I/O
19373+ * paths now detect missing LE entries.
19374+ */
19375+ if (volume->le_map) {
19376+ for ( j = 0, count = 0;
19377+ j < volume->num_le; j++ ) {
19378+ if ( !volume->le_map[j].owning_pv ||
19379+ !volume->le_map[j].pe_sector_offset ) {
19380+ count++;
19381+ }
19382+ }
19383+ if (count) {
19384+ LOG_SERIOUS("Volume %s has incomplete LE map.\n",
19385+ volume->name);
19386+ LOG_SERIOUS(" Missing %d out of %d LEs.\n",
19387+ count, volume->num_le);
19388+ volume->lv_access |= EVMS_LV_INCOMPLETE;
19389+ } else {
19390+ /* In case this volume was previously
19391+ * marked incomplete.
19392+ */
19393+ volume->lv_access &=
19394+ ~EVMS_LV_INCOMPLETE;
19395+ }
19396+ } else {
19397+ /* This should only ever happen due to
19398+ * memory corruption.
19399+ */
19400+ LOG_SERIOUS("Volume %s has no LE map.\n",
19401+ volume->name);
19402+ volume->lv_access |= EVMS_LV_INVALID;
19403+ }
19404+
19405+ if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
19406+ /* For a snapshot original, check all snapshots
19407+ * in the chain, to make sure they point back to
19408+ * the original. Also, make sure there is memory
19409+ * for the chunk buffer.
19410+ */
19411+ for ( snap = volume->snapshot_next, count = 0;
19412+ snap;
19413+ snap = snap->snapshot_next, count++ ) {
19414+ if ( snap->snapshot_org != volume ) {
19415+ LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n",
19416+ volume->name);
19417+ snap->snapshot_org = NULL;
19418+ snap->lv_access |=
19419+ EVMS_LV_INVALID;
19420+ }
19421+ }
19422+ if (!count) {
19423+ LOG_WARNING("No snapshots found for volume %s\n",
19424+ volume->name);
19425+ if (final_discovery) {
19426+ volume->lv_access &=
19427+ ~LV_SNAPSHOT_ORG;
19428+ }
19429+ } else if (!volume->chunk_data_buffer) {
19430+ volume->lv_access |= EVMS_LV_INVALID;
19431+ }
19432+ } else if ( volume->lv_access & LV_SNAPSHOT ) {
19433+ /* For a snapshot volume, make sure it points
19434+ * back to its original. Also make sure there is
19435+ * memory for the cow table, and that any
19436+ * existing snapshot entries in the snapshot map
19437+ * are correctly ordered.
19438+ */
19439+ /* Is there a COW table? */
19440+ if (!volume->cow_table) {
19441+ LOG_SERIOUS("Snapshot volume %s has no COW table\n",
19442+ volume->name);
19443+ volume->lv_access |= EVMS_LV_INVALID;
19444+ }
19445+ /* Is the snapshot map in order? */
19446+ if ( check_snapshot_map(volume) ) {
19447+ LOG_SERIOUS("Snapshot volume %s has snapshot map inconsistency\n",
19448+ volume->name);
19449+ volume->lv_access |= EVMS_LV_INVALID;
19450+ }
19451+ /* Is there an original volume? This is only
19452+ * a real problem during final discovery.
19453+ */
19454+ if (!volume->snapshot_org) {
19455+ LOG_SERIOUS("Snapshot volume %s not pointing at an original\n",
19456+ volume->name);
19457+ if (final_discovery) {
19458+ volume->lv_access |=
19459+ EVMS_LV_INVALID;
19460+ }
19461+ }
19462+ /* Is the original the correct one? */
19463+ else if ( volume->snap_org_minor !=
19464+ volume->snapshot_org->lv_minor ) {
19465+ LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n",
19466+ volume->name);
19467+ volume->lv_access |= EVMS_LV_INVALID;
19468+ }
19469+ }
19470+ /* Delete any invalid volumes from use. Delete
19471+ * incomplete volumes as well if this is not final
19472+ * discovery. If a snapshot original is bad, delete all
19473+ * of its snapshots.
19474+ */
19475+ if ( volume->lv_access & EVMS_LV_INVALID ||
19476+ (!final_discovery &&
19477+ (volume->lv_access & EVMS_LV_INCOMPLETE) &&
19478+ (volume->lv_access & EVMS_LV_NEW)) ) {
19479+ if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
19480+ for ( snap = volume->snapshot_next;
19481+ snap; snap = next ) {
19482+ next = snap->snapshot_next;
19483+ snap->snapshot_next = NULL;
19484+ snap->snapshot_org = NULL;
19485+ invalidate_snapshot_volume(snap);
19486+ deallocate_logical_volume(snap);
19487+ }
19488+ volume->snapshot_next = NULL;
19489+ } else if ( volume->lv_access & LV_SNAPSHOT ) {
19490+ invalidate_snapshot_volume(volume);
19491+ }
19492+ deallocate_logical_volume(volume);
19493+ }
19494+ }
19495+ }
19496+
19497+ return 0;
19498+}
19499+
19500+
19501+/********** Volume Group Discovery Functions **********/
19502+
19503+
19504+/**
19505+ * find_group_for_pv
19506+ *
19507+ * This is a discover-time function. It reads the VG metadata info for the
19508+ * specified node, and locates the appropriate group that owns that
19509+ * node. If that group does not already exist, it is created and
19510+ * initialized.
19511+ **/
19512+static int find_group_for_pv(struct evms_logical_node * node,
19513+ struct pv_disk * pv,
19514+ struct lvm_volume_group ** group)
19515+{
19516+ struct vg_disk * vg;
19517+ int rc;
19518+
19519+ *group = NULL;
19520+
19521+ /* Check for an unassigned PV. */
19522+ if ( pv->vg_name[0] == 0 ) {
19523+ return 0;
19524+ }
19525+
19526+ /* Read the VG on-disk info for this PV. If this succeeds, it
19527+ * allocates a new VG metadata structure.
19528+ */
19529+ rc = read_vg(node, pv, &vg);
19530+ if (rc) {
19531+ return rc;
19532+ }
19533+
19534+ /* Use the UUID from the VG metadata to determine if this group
19535+ * has already been discovered and constructed.
19536+ */
19537+ find_group_by_uuid(vg->vg_uuid, group);
19538+
19539+ if (!*group) {
19540+ /* Create a new group entry and add to the global list. */
19541+ *group = allocate_volume_group(vg, pv->vg_name);
19542+ if (!*group) {
19543+ return -ENOMEM;
19544+ }
19545+ add_group_to_list(*group);
19546+ } else if (!(*group)->vg) {
19547+ /* On a rediscover, the VG metadata for an existing group might
19548+ * be missing. Fill it in if necessary. This check is also not
19549+ * necessary in the engine, since the metadata is never deleted.
19550+ */
19551+/* Should we re-copy vg_name? (vg_uuid cannot be allowed to change).
19552+ * Or should vg_name changes be done through direct ioctl only?
19553+ */
19554+ (*group)->vg = vg;
19555+ } else {
19556+ kfree(vg);
19557+ }
19558+
19559+ /* Read in the UUID list for this group, if it isn't present. */
19560+ rc = read_uuid_list(node, pv, *group);
19561+ if (rc) {
19562+ LOG_WARNING("Error reading UUID list for group %s.\n",
19563+ (*group)->vg_name);
19564+ LOG_WARNING("May not be able to verify PV UUIDs for group %s\n",
19565+ (*group)->vg_name);
19566+ }
19567+
19568+ /* In the kernel, any time we even see a PV for a group, that group
19569+ * must be marked dirty so its volumes will be re-exported.
19570+ */
19571+ (*group)->flags |= EVMS_VG_DIRTY;
19572+
19573+ return 0;
19574+}
19575+
19576+/**
19577+ * check_for_duplicate_pv
19578+ *
19579+ * Search the list of PVs in the specified volume group. If the
19580+ * specified node already exists in the list, we can discard it.
19581+ **/
19582+static int check_for_duplicate_pv(struct evms_logical_node * node,
19583+ struct pv_disk * pv,
19584+ struct lvm_volume_group * group)
19585+{
19586+ struct lvm_physical_volume * pv_entry;
19587+
19588+ /* For re-discovery, we need to search all existing PVs in this VG to
19589+ * make sure we didn't get a duplicate from the plugin below us. The
19590+ * plugins below us should be re-exporting the same node on
19591+ * re-discovery, instead of creating a new node to represent the same
19592+ * objects, so just check the memory location.
19593+ */
19594+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
19595+ if ( pv_entry->logical_node == node ) {
19596+
19597+ /* We found a duplicate. Just ignore the duplicate. */
19598+ LOG_DEBUG("PV %s is already in Group %s.\n",
19599+ node->name, group->vg_name);
19600+
19601+ /* Even if the node was a duplicate, we may need to
19602+ * fill in the pv entry for this partition, since we
19603+ * always delete those at the end of discovery.
19604+ */
19605+ if (!pv_entry->pv) {
19606+ pv_entry->pv = pv;
19607+ pv_entry->pv_number = pv->pv_number;
19608+ } else {
19609+ kfree(pv);
19610+ }
19611+
19612+ return 1;
19613+ }
19614+ }
19615+
19616+ /* No duplicate was found. */
19617+ return 0;
19618+}
19619+
19620+/**
19621+ * verify_pv_uuid
19622+ *
19623+ * Verify that the specified PV belongs in the specified group by
19624+ * searching for the PV's UUID in the group's list.
19625+ **/
19626+static int verify_pv_uuid(struct lvm_physical_volume * pv_entry,
19627+ struct lvm_volume_group * group)
19628+{
19629+ int i;
19630+
19631+ /* Obviously the UUID list must be present in order to search. */
19632+ if (!group->uuid_list) {
19633+ LOG_WARNING("UUID list is missing from group %s.\n",
19634+ group->vg_name);
19635+ LOG_WARNING("Cannot verify UUID for PV %s\n",
19636+ pv_entry->logical_node->name);
19637+ return 0;
19638+ }
19639+
19640+ /* Start with the UUID entry for this PV's number. */
19641+ if ( ! memcmp(pv_entry->pv->pv_uuid,
19642+ &(group->uuid_list[(pv_entry->pv_number - 1) * NAME_LEN]),
19643+ UUID_LEN) ) {
19644+ return 0;
19645+ }
19646+
19647+ /* If it wasn't found there, then search the entire group's list. */
19648+ for ( i = 0; i < group->vg->pv_cur; i++ ) {
19649+ if ( ! memcmp(pv_entry->pv->pv_uuid,
19650+ &(group->uuid_list[i * NAME_LEN]), UUID_LEN) ) {
19651+ /* Found the UUID. */
19652+ LOG_WARNING("Detected UUID mismatch for PV %s!\n",
19653+ pv_entry->logical_node->name);
19654+ LOG_WARNING("PV %s is recorded as being at index %d,\n",
19655+ pv_entry->logical_node->name,
19656+ pv_entry->pv_number);
19657+ LOG_WARNING(" but Group %s has it recorded at index %d.\n",
19658+ group->vg_name, i + 1);
19659+ LOG_WARNING("Run the EVMS Engine to correct the problem.\n");
19660+ LOG_WARNING("If you have any snapshot regions in group %s\n",
19661+ group->vg_name);
19662+ LOG_WARNING(" it is recommended that you delete them immediately!\n");
19663+ return 0;
19664+ }
19665+ }
19666+
19667+ LOG_SERIOUS("Could not find UUID for PV %s in group %s\n",
19668+ pv_entry->logical_node->name, group->vg_name);
19669+ return -EINVAL;
19670+}
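+
+/* Layout assumed by the lookups above: uuid_list is a packed array of
+ * NAME_LEN-byte slots, one per PV, of which only the first UUID_LEN
+ * bytes are significant; slot k (0-based) belongs to PV number k + 1:
+ *
+ *   [ PV1 uuid | pad ][ PV2 uuid | pad ] ... [ PVn uuid | pad ]
+ *    <--- NAME_LEN --->
+ *
+ * Hence the fast path compares against
+ * &uuid_list[(pv_number - 1) * NAME_LEN], and only on a miss falls
+ * back to a linear scan of all pv_cur slots.
+ */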
19671+
19672+/**
19673+ * add_pv_to_group
19674+ *
19675+ * Adds the physical volume to the appropriate volume group. The PV
19676+ * passed into this function MUST be part of a valid VG.
19677+ **/
19678+static int add_pv_to_group(struct lvm_physical_volume * pv_entry,
19679+ struct lvm_volume_group * group)
19680+{
19681+ int rc;
19682+
19683+ /* Make sure this PV's UUID is listed in the group. */
19684+ rc = verify_pv_uuid(pv_entry, group);
19685+ if (rc) {
19686+ LOG_SERIOUS("PV %s does not belong in group %s!\n",
19687+ pv_entry->logical_node->name, group->vg_name);
19688+ return rc;
19689+ }
19690+
19691+ /* Add this PV to the beginning of its group's list. */
19692+ pv_entry->next = group->pv_list;
19693+ group->pv_list = pv_entry;
19694+ group->pv_count++;
19695+
19696+ /* Update the group's block and hardsector sizes as appropriate. */
19697+ group->block_size = max(pv_entry->logical_node->block_size,
19698+ group->block_size);
19699+ group->hard_sect_size = max(pv_entry->logical_node->hardsector_size,
19700+ group->hard_sect_size);
19701+
19702+ /* Check for the Partial or Removable flag on the PV. */
19703+ if ( pv_entry->logical_node->flags & EVMS_VOLUME_PARTIAL ) {
19704+ group->flags |= EVMS_VG_PARTIAL_PVS;
19705+ }
19706+ if ( pv_entry->logical_node->flags & EVMS_DEVICE_REMOVABLE ) {
19707+ group->flags |= EVMS_VG_REMOVABLE_PVS;
19708+ }
19709+
19710+ LOG_DETAILS("PV %s added to Group %s\n",
19711+ pv_entry->logical_node->name, group->vg_name);
19712+
19713+ return 0;
19714+}
19715+
19716+/**
19717+ * discover_volume_groups
19718+ *
19719+ * Examine the list of logical nodes. Any node that contains a valid PV
19720+ * structure is consumed and added to the appropriate volume group. PVs
19721+ * which do not belong to any group have their metadata freed. Everything
19722+ * else is left on the discovery list.
19723+ **/
19724+static int discover_volume_groups(struct evms_logical_node ** evms_node_list)
19725+{
19726+ struct evms_logical_node * node, * next_node;
19727+ struct pv_disk * pv;
19728+ struct lvm_volume_group * group;
19729+ struct lvm_physical_volume * pv_entry;
19730+ int rc;
19731+
19732+ LOG_EXTRA("Searching for PVs in the node list.\n");
19733+
19734+ /* Run through the discovery list. */
19735+ for ( node = *evms_node_list; node; node = next_node ) {
19736+ /* Save the next node. We may remove this one from the list. */
19737+ next_node = node->next;
19738+
19739+ /* Read the PV metadata. This will also create a new struct pv_disk
19740+ * if it finds the correct LVM signatures.
19741+ */
19742+ rc = read_pv(node, &pv);
19743+ if (rc) {
19744+ /* This node is not an LVM PV, or an error occurred.
19745+ * Just leave the node on the discovery list.
19746+ */
19747+ continue;
19748+ }
19749+
19750+ rc = find_group_for_pv(node, pv, &group);
19751+ if (rc) {
19752+ /* Error getting the group for this PV. */
19753+ kfree(pv);
19754+ continue;
19755+ }
19756+
19757+ if (!group) {
19758+ /* This node is an unassigned PV. */
19759+ LOG_DETAILS("PV %s is unassigned.\n", node->name);
19760+ kfree(pv);
19761+ continue;
19762+ }
19763+
19764+ rc = check_for_duplicate_pv(node, pv, group);
19765+ if (rc) {
19766+ /* This node is already in the group. This check is also
19767+ * only in the kernel because the engine has no notion
19768+ * of rediscover, and thus can never get a duplicate.
19769+ */
19770+ evms_cs_remove_logical_node_from_list(evms_node_list,
19771+ node);
19772+ continue;
19773+ }
19774+
19775+ /* Allocate a PV entry for this node. */
19776+ pv_entry = allocate_physical_volume(node, pv);
19777+ if (!pv_entry) {
19778+ continue;
19779+ }
19780+
19781+ /* Add this PV to the appropriate volume group. */
19782+ rc = add_pv_to_group(pv_entry, group);
19783+ if (rc) {
19784+ deallocate_physical_volume(pv_entry);
19785+ continue;
19786+ }
19787+
19788+ rc = read_pe_map(pv_entry);
19789+ if (rc) {
19790+ LOG_WARNING("Error reading PE maps for node %s\n",
19791+ node->name);
19792+ LOG_WARNING("Any volumes residing on this node will be incomplete!\n");
19793+ }
19794+
19795+ evms_cs_remove_logical_node_from_list(evms_node_list, node);
19796+ }
19797+
19798+ LOG_EXTRA("Group discovery complete.\n");
19799+ return 0;
19800+}
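+
+/* Summary of the per-node outcomes in the loop above:
+ *
+ *   read_pv fails             - not an LVM PV; node stays on the list.
+ *   unassigned PV/group error - metadata freed; node stays on the list.
+ *   duplicate PV              - node consumed (removed from the list).
+ *   PV added to its group     - node consumed (removed from the list).
+ *
+ * Only nodes left on the list remain visible to the other plugins that
+ * run after this one.
+ */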
19801+
19802+
19803+/********** Logical Volume Discovery Functions **********/
19804+
19805+
19806+/**
19807+ * build_le_maps
19808+ *
19809+ * After all logical volumes have been discovered, the mappings from
19810+ * logical extents to physical extents must be constructed. Each PV
19811+ * contains a map on-disk of its PEs. Each PE map entry contains the
19812+ * logical volume number and the logical extent number on that volume.
19813+ * Our internal map is the reverse of this map for each volume, listing
19814+ * the PV node and sector offset for every logical extent on the volume.
19815+ **/
19816+static int build_le_maps(struct lvm_volume_group * group)
19817+{
19818+ struct lvm_logical_volume ** volume_list = group->volume_list;
19819+ struct lvm_physical_volume * pv_entry;
19820+ struct evms_logical_node * node;
19821+ struct pv_disk * pv;
19822+ struct pe_disk * pe_map;
19823+ u64 offset;
19824+ u32 lv_number, le_number, first_pe_sector;
19825+ int i;
19826+
19827+ LOG_DEBUG("Building LE maps for new volumes in group %s.\n",
19828+ group->vg_name);
19829+
19830+ /* For every PV in this VG. */
19831+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
19832+ node = pv_entry->logical_node;
19833+ pv = pv_entry->pv;
19834+ pe_map = pv_entry->pe_map;
19835+
19836+ /* Version 1 metadata uses pe_on_disk.base + .size to find start
19837+ * of first PE. Version 2 uses pe_start.
19838+ */
19839+ if (pv->version == 1) {
19840+ first_pe_sector =
19841+ evms_cs_size_in_vsectors(pv->pe_on_disk.base +
19842+ pv->pe_on_disk.size);
19843+ } else {
19844+ first_pe_sector = pv->pe_start;
19845+ if (!first_pe_sector) {
19846+ first_pe_sector =
19847+ evms_cs_size_in_vsectors(pv->pe_on_disk.base +
19848+ pv->pe_on_disk.size);
19849+ }
19850+ }
19851+
19852+ /* For every entry in the PE map, calculate the PE's sector offset
19853+		 * and update the correct LV's LE map. An LV number of 0 marks an unused PE.
19854+ * For re-discovery, only compute entries for new volumes. If a PV
19855+ * is read-only, all LVs on that PV will also be read-only.
19856+ */
19857+ for ( i = 0; i < pv->pe_total; i++ ) {
19858+ lv_number = pe_map[i].lv_num;
19859+ if ( lv_number &&
19860+ volume_list[lv_number] &&
19861+ volume_list[lv_number]->lv_access &
19862+ (EVMS_LV_NEW | EVMS_LV_INCOMPLETE) ) {
19863+ le_number = pe_map[i].le_num;
19864+ offset = i * pv->pe_size + first_pe_sector;
19865+ volume_list[lv_number]->le_map[le_number].owning_pv =
19866+ pv_entry;
19867+ volume_list[lv_number]->le_map[le_number].pe_sector_offset =
19868+ offset;
19869+ if ( node->flags & EVMS_VOLUME_SET_READ_ONLY ) {
19870+ volume_list[lv_number]->lv_access &=
19871+ ~LV_WRITE;
19872+ }
19873+ }
19874+ }
19875+ }
19876+
19877+ return 0;
19878+}
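+
+/* Worked example of the reverse mapping above, with illustrative
+ * numbers: suppose PE index i = 5 on a PV whose first_pe_sector is 384
+ * and whose pe_size is 8192 sectors, and pe_map[5] records lv_num = 2,
+ * le_num = 7. Then volume 2 gains
+ *
+ *   le_map[7].owning_pv        = this PV
+ *   le_map[7].pe_sector_offset = 5 * 8192 + 384 = 41344
+ *
+ * i.e. logical extent 7 of LV 2 resolves to sector 41344 of that PV.
+ */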
19879+
19880+/**
19881+ * build_snapshot_maps
19882+ *
19883+ * For every volume in this group that is a snapshot, read all of the
19884+ * existing entries in the COW table, and build up the snapshot mapping
19885+ * structures accordingly.
19886+ *
19887+ * For reference, the COW tables attached to the snapshot volumes will
19888+ * always be in disk-order (little-endian), so that it can always be
19889+ * immediately written to disk. Therefore, endian conversions are necessary
19890+ * any time the COW table is accessed. This function will make a local
19891+ * copy of each COW table sector, and convert the local copy before
19892+ * building the snapshot maps.
19893+ **/
19894+static int build_snapshot_maps(struct lvm_volume_group * group)
19895+{
19896+ struct lvm_logical_volume * volume;
19897+ struct evms_logical_node tmp_node;
19898+ struct lv_COW_table_disk cow_table[EVMS_VSECTOR_SIZE /
19899+ sizeof(struct lv_COW_table_disk)];
19900+ unsigned long max_entries = EVMS_VSECTOR_SIZE /
19901+ sizeof(struct lv_COW_table_disk);
19902+ int i, j;
19903+
19904+ /* Check every volume in the group to see if it is a snapshot. Also
19905+ * check to make sure it is a new volume in the case of re-discovery.
19906+ */
19907+ for ( i = 1; i <= MAX_LV; i++ ) {
19908+
19909+ /* The volume must exist, must be new, and must be a snapshot.
19910+ */
19911+ volume = group->volume_list[i];
19912+ if ( !volume ||
19913+ !(volume->lv_access & EVMS_LV_NEW) ||
19914+ !(volume->lv_access & LV_SNAPSHOT)) {
19915+ continue;
19916+ }
19917+
19918+ /* Set up a temporary EVMS node. */
19919+ tmp_node.private = volume;
19920+
19921+ LOG_DEBUG("Building snapshot map for volume %s\n",
19922+ volume->name);
19923+
19924+ while (1) {
19925+ /* Read in one sector's worth of COW tables. */
19926+ if ( lvm_init_io(&tmp_node, 0,
19927+ volume->current_cow_sector,
19928+ 1, volume->cow_table) ) {
19929+ goto error;
19930+ }
19931+
19932+ /* Endian-conversion of this COW table
19933+ * to a local table.
19934+ */
19935+ for ( j = 0; j < max_entries; j++ ) {
19936+ cow_table[j].pv_org_number =
19937+ le64_to_cpu(volume->cow_table[j].pv_org_number);
19938+ cow_table[j].pv_org_rsector =
19939+ le64_to_cpu(volume->cow_table[j].pv_org_rsector);
19940+ cow_table[j].pv_snap_number =
19941+ le64_to_cpu(volume->cow_table[j].pv_snap_number);
19942+ cow_table[j].pv_snap_rsector =
19943+ le64_to_cpu(volume->cow_table[j].pv_snap_rsector);
19944+ }
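+			/* On little-endian hosts le64_to_cpu() is a no-op and
+			 * the local copy is taken as-is; on big-endian hosts
+			 * each field is byte-swapped into host order.
+			 */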
19945+
19946+ /* Translate every valid COW table entry into
19947+ * a snapshot map entry.
19948+ */
19949+ for ( volume->next_cow_entry = 0;
19950+ volume->next_cow_entry < max_entries &&
19951+ cow_table[volume->next_cow_entry].pv_org_number;
19952+ volume->next_cow_entry++ ) {
19953+ /* org_rsector must be a valid sector number,
19954+				 * i.e. it can't be within a PV's metadata. This
19955+ * is how we detect invalidated snapshots.
19956+ */
19957+ if ( cow_table[volume->next_cow_entry].pv_org_rsector < 10 ||
19958+ cow_table[volume->next_cow_entry].pv_org_number > group->pv_count ||
19959+ add_cow_entry_to_snapshot_map(&(cow_table[volume->next_cow_entry]), volume) ) {
19960+ /* This volume either has an invalid COW entry,
19961+ * or had an error adding that COW entry to the
19962+ * snapshot map. This snapshot is done.
19963+ */
19964+ goto error;
19965+ }
19966+ volume->next_free_chunk += volume->chunk_size;
19967+ }
19968+
19969+ /* Move on to the next sector if necessary. */
19970+ if ( volume->next_cow_entry == max_entries ) {
19971+ volume->current_cow_sector++;
19972+ } else {
19973+ break;
19974+ }
19975+ }
19976+ }
19977+
19978+out:
19979+ return 0;
19980+error:
19981+ invalidate_snapshot_volume(volume);
19982+ deallocate_logical_volume(volume);
19983+ goto out;
19984+}
19985+
19986+/**
19987+ * link_snapshot_volumes
19988+ *
19989+ * This function examines the list of logical volumes in this group and
19990+ * sets up the necessary pointers to link snapshots and their originals.
19991+ * A singly-linked list is created starting with the original volume. Also,
19992+ * all snapshot volumes point directly back to their original. This
19993+ * function should not be run until all volumes have been discovered.
19994+ * In the case of re-discovery, all of these links/lists get rebuilt as if
19995+ * they were not already there. Currently this should not pose a problem.
19996+ **/
19997+static int link_snapshot_volumes(struct lvm_volume_group * group)
19998+{
19999+ struct lvm_logical_volume * org_volume, * snap_volume;
20000+ u32 org_minor, buffer_size = 0;
20001+ int i, j;
20002+
20003+ for ( i = 1; i <= MAX_LV; i++ ) {
20004+
20005+ /* Only process snapshot-originals. */
20006+ org_volume = group->volume_list[i];
20007+ if ( !org_volume || !(org_volume->lv_access & LV_SNAPSHOT_ORG) ) {
20008+ continue;
20009+ }
20010+
20011+ /* For snapshot-originals, look for all other volumes that
20012+ * claim to be snapshotting it. For each one that is found,
20013+ * insert it at the start of the original's list of snapshots.
20014+		 * We must start with a NULL snapshot_next; otherwise we
20015+		 * could wind up with a circular list.
20016+ */
20017+ org_minor = org_volume->lv_minor;
20018+ org_volume->snapshot_next = NULL;
20019+
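+		/* Head insertion leaves the snapshots in reverse discovery
+		 * order: finding snap_a, then snap_b, then snap_c (hypothetical
+		 * names) yields org -> snap_c -> snap_b -> snap_a.
+		 */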
20020+ for ( j = 1; j <= MAX_LV; j++ ) {
20021+ snap_volume = group->volume_list[j];
20022+ if ( snap_volume &&
20023+ snap_volume->lv_access & LV_SNAPSHOT &&
20024+ (snap_volume->snap_org_minor == org_minor) ) {
20025+ snap_volume->snapshot_org = org_volume;
20026+ snap_volume->snapshot_next =
20027+ org_volume->snapshot_next;
20028+ org_volume->snapshot_next = snap_volume;
20029+ if ( snap_volume->chunk_size > buffer_size ) {
20030+ buffer_size = snap_volume->chunk_size;
20031+ }
20032+ LOG_DEBUG("Linking snapshot (%s) to original (%s)\n",
20033+ snap_volume->name, org_volume->name);
20034+ }
20035+ }
20036+
20037+ /* If no snapshots were found for a volume that claims to be
20038+ * under snapshot, mark the group dirty. If this is final
20039+ * discovery, the original will have the snapshot flag turned
20040+ * off in check_logical_volumes().
20041+ */
20042+ if (!org_volume->snapshot_next) {
20043+ LOG_WARNING("No snapshots found for original (%s)\n",
20044+ org_volume->name);
20045+ group->flags |= EVMS_VG_DIRTY;
20046+ }
20047+ }
20048+ return 0;
20049+}
20050+
20051+/**
20052+ * discover_volumes_in_group
20053+ **/
20054+static int discover_volumes_in_group(struct lvm_volume_group * group)
20055+{
20056+ struct lv_disk * lv_array = group->lv_array;
20057+ struct lvm_logical_volume * new_volume;
20058+ int i;
20059+
20060+ /* Search through the LV structs for valid LV entries. */
20061+ for ( i = 0; i < group->vg->lv_max; i++ ) {
20062+
20063+ /* Only discover valid, active volumes. */
20064+ if ( !lv_array[i].lv_name[0] ||
20065+ lv_array[i].lv_number >= MAX_LV ) {
20066+ continue;
20067+ }
20068+
20069+ /* Make sure this volume isn't already in the list. */
20070+ if (group->volume_list[lv_array[i].lv_number + 1]) {
20071+ continue;
20072+ }
20073+
20074+ /* Create a new logical volume and place it in the appropriate
20075+ * spot in this VG's volume list.
20076+ */
20077+ new_volume = allocate_logical_volume(&(lv_array[i]), group);
20078+ if (!new_volume) {
20079+ /* This volume will be missing, but other
20080+ * volumes in this group can still be built.
20081+ */
20082+ LOG_CRITICAL("Error allocating LV %s in Group %s\n",
20083+ lv_array[i].lv_name, group->vg_name);
20084+ continue;
20085+ }
20086+
20087+ group->volume_list[new_volume->lv_number] = new_volume;
20088+ group->volume_count++;
20089+ group->flags |= EVMS_VG_DIRTY;
20090+
20091+ LOG_DEBUG("Discovered volume %s in group %s.\n",
20092+ new_volume->name, group->vg_name);
20093+ }
20094+
20095+ return 0;
20096+}
20097+
20098+/**
20099+ * discover_logical_volumes
20100+ *
20101+ * After all PVs have been claimed and added to the appropriate VG list,
20102+ * the volumes for each VG must be constructed. For each group, read all
20103+ * the LV structs off the first PV in the list. Search this list of
20104+ * structs for valid LVs. For each valid LV, create a new volume and add
20105+ * it to the group.
20106+ **/
20107+static int discover_logical_volumes(int final_discovery)
20108+{
20109+ struct lvm_volume_group *group;
20110+ int rc;
20111+
20112+ /* Look for volumes in each valid VG entry. We even need to check ones
20113+	 * that aren't dirty, since we could have deleted an incomplete volume
20114+	 * on the previous pass and need to rediscover it, in case this is
20115+	 * final discovery and we now want to export it.
20116+ */
20117+ for ( group = lvm_group_list; group; group = group->next_group ) {
20118+
20119+ if ( ! group->vg ||
20120+ (! final_discovery &&
20121+ ! (group->flags & EVMS_VG_DIRTY)) ) {
20122+ continue;
20123+ }
20124+
20125+ LOG_DEBUG("Searching for volumes in group %s\n",
20126+ group->vg_name);
20127+
20128+ /* Read in the LV array from disk if necessary. */
20129+ rc = read_lv(group);
20130+ if (rc) {
20131+ LOG_WARNING("Unable to read LV metadata for group %s\n",
20132+ group->vg_name);
20133+ LOG_WARNING("No regions can be discovered for group %s\n",
20134+ group->vg_name);
20135+ continue;
20136+ }
20137+
20138+ /* Assemble each volume in the group. */
20139+ discover_volumes_in_group(group);
20140+
20141+ /* Build the LE map for each LV discovered in this group. This
20142+		 * must be done after all LVs in the group are discovered.
20143+ */
20144+ build_le_maps(group);
20145+ check_le_maps(group);
20146+
20147+ /* Set up all of the initial snapshot maps. Only the kernel
20148+ * keeps track of the snapshot maps.
20149+ */
20150+ build_snapshot_maps(group);
20151+
20152+ /* Set up the pointers to link snapshot volumes
20153+ * with their originals.
20154+ */
20155+ link_snapshot_volumes(group);
20156+ }
20157+
20158+ return 0;
20159+}
20160+
20161+/**
20162+ * export_volumes
20163+ *
20164+ * The last thing the plugin must do is take each newly constructed volume
20165+ * and place it on the evms logical node list. A return code of zero from
20166+ * this function means nothing new was added to the list; a positive
20167+ * return code means that many new items were added to the list.
20168+ **/
20169+static int export_volumes(struct evms_logical_node ** evms_node_list,
20170+ int final_discover)
20171+{
20172+ struct lvm_volume_group * group;
20173+ struct evms_logical_node * new_node;
20174+ struct lvm_logical_volume * volume;
20175+ int i, count = 0;
20176+
20177+ LOG_EXTRA("Exporting volumes\n");
20178+
20179+ /* For every valid, dirty volume group. */
20180+ for ( group = lvm_group_list; group; group = group->next_group ) {
20181+ if ( ! (group->flags & EVMS_VG_DIRTY) ) {
20182+ continue;
20183+ }
20184+
20185+ /* Export every valid volume in the group. For re-discovery,
20186+ * we re-export the same logical node.
20187+ */
20188+ for ( i = 1; i <= MAX_LV; i++ ) {
20189+ volume = group->volume_list[i];
20190+ if (!volume) {
20191+ continue;
20192+ }
20193+
20194+ /* For new volumes, create a new EVMS node and
20195+ * initialize the appropriate fields.
20196+ */
20197+ if ( volume->lv_access & EVMS_LV_NEW ) {
20198+ if ( evms_cs_allocate_logical_node(&new_node) ) {
20199+ continue;
20200+ }
20201+ MOD_INC_USE_COUNT;
20202+
20203+ volume->volume_node = new_node;
20204+ volume->lv_access &= (~EVMS_LV_QUIESCED &
20205+ ~EVMS_LV_NEW);
20206+ new_node->hardsector_size =
20207+ group->hard_sect_size;
20208+ new_node->block_size = group->block_size;
20209+ new_node->plugin = &lvm_plugin_header;
20210+ new_node->private = volume;
20211+ memcpy(new_node->name, volume->name, NAME_LEN);
20212+
20213+ /* Snapshot volumes should report the
20214+ * size of their original.
20215+ */
20216+ new_node->total_vsectors =
20217+ (volume->lv_access & LV_SNAPSHOT) ?
20218+ volume->snapshot_org->lv_size :
20219+ volume->lv_size;
20220+
20221+ /* Is the volume read-only? */
20222+ if ( ! (volume->lv_access & LV_WRITE) ) {
20223+ new_node->flags |=
20224+ EVMS_VOLUME_READ_ONLY;
20225+ LOG_DEBUG("LVM volume %s is read-only\n",
20226+ volume->name);
20227+ }
20228+
20229+ /* Is the volume incomplete? */
20230+ if ( volume->lv_access & EVMS_LV_INCOMPLETE ) {
20231+ new_node->flags |=
20232+ (EVMS_VOLUME_READ_ONLY |
20233+ EVMS_VOLUME_PARTIAL);
20234+ LOG_DEBUG("LVM volume %s is incomplete\n",
20235+ volume->name);
20236+ }
20237+
20238+ /* Does the volume group contain any partial or
20239+ * removable PVs?
20240+ */
20241+ if ( group->flags & EVMS_VG_PARTIAL_PVS ) {
20242+ new_node->flags |= EVMS_VOLUME_PARTIAL;
20243+ }
20244+ if ( group->flags & EVMS_VG_REMOVABLE_PVS ) {
20245+ new_node->flags |=
20246+ EVMS_DEVICE_REMOVABLE;
20247+ }
20248+ }
20249+
20250+ /* Export the node, only if it hasn't been exported
20251+ * during this full EVMS discover.
20252+ */
20253+ if ( ! (volume->lv_access & EVMS_LV_EXPORTED) ) {
20254+ if ( ! evms_cs_add_logical_node_to_list(evms_node_list,
20255+ volume->volume_node) ) {
20256+ LOG_DETAILS("Exporting LVM volume %s\n",
20257+ volume->name);
20258+ volume->lv_access |= EVMS_LV_EXPORTED;
20259+ count++;
20260+ }
20261+ }
20262+
20263+ if (final_discover) {
20264+ volume->lv_access &= ~EVMS_LV_EXPORTED;
20265+ }
20266+ }
20267+
20268+ /* The group is clean now. */
20269+ group->flags &= ~EVMS_VG_DIRTY;
20270+ }
20271+
20272+ return count;
20273+}
20274+
20275+/**
20276+ * lvm_cleanup
20277+ *
20278+ * This function runs through the entire lvm data structure, removing
20279+ * all items that are not needed at runtime. Currently, this is just the
20280+ * struct vg_disk structure and the struct pv_disk structure for each PV.
20281+ * Also, any groups that don't contain any volumes are deleted. All of the
20282+ * other volume_group, logical_volume and evms_logical_node structures will
20283+ * be kept around at run-time.
20284+ **/
20285+static int lvm_cleanup(void)
20286+{
20287+ struct lvm_volume_group * group, * next_group;
20288+ struct lvm_physical_volume * pv_entry;
20289+
20290+ for ( group = lvm_group_list; group; group = next_group ) {
20291+ next_group = group->next_group;
20292+
20293+ /* Delete groups with no volumes. */
20294+ if (!group->volume_count) {
20295+ LOG_WARNING("Group %s contains no logical volumes. Deleting.\n",
20296+ group->vg_name);
20297+ remove_group_from_list(group);
20298+ deallocate_volume_group(group);
20299+ /* Need to go back to the start of the list,
20300+ * just to be safe. :)
20301+ */
20302+ next_group = lvm_group_list;
20303+ continue;
20304+ }
20305+
20306+ /* Delete data structures that aren't used at runtime. */
20307+ if (group->vg) {
20308+ kfree(group->vg);
20309+ group->vg = NULL;
20310+ }
20311+
20312+ for ( pv_entry = group->pv_list;
20313+ pv_entry; pv_entry = pv_entry->next) {
20314+ if (pv_entry->pv) {
20315+ kfree(pv_entry->pv);
20316+ pv_entry->pv = NULL;
20317+ }
20318+ if (pv_entry->pe_map) {
20319+ vfree(pv_entry->pe_map);
20320+ pv_entry->pe_map = NULL;
20321+ }
20322+ }
20323+ if (group->lv_array) {
20324+ vfree(group->lv_array);
20325+ group->lv_array = NULL;
20326+ }
20327+ if (group->uuid_list) {
20328+ vfree(group->uuid_list);
20329+ group->uuid_list = NULL;
20330+ }
20331+ }
20332+ return 0;
20333+}
20334+
20335+/**
20336+ * lvm_get_bmap
20337+ *
20338+ * Support for the BMAP ioctl used by LILO to translate filesystem blocks
20339+ * to disk blocks to map kernel images for boot time.
20340+ **/
20341+static int lvm_get_bmap(struct evms_logical_node * node,
20342+ struct evms_get_bmap_pkt * bmap,
20343+ struct evms_logical_node ** pv_node)
20344+{
20345+ struct lvm_logical_volume * volume = node->private;
20346+ struct lvm_physical_volume * pv_entry;
20347+ u64 pe_start_sector, new_sector = 0, new_size = 0;
20348+ int rc = 0;
20349+
20350+ /* No kernel images allowed on snapshot LVs. */
20351+ if ( volume->lv_access & LV_SNAPSHOT ) {
20352+ return -EINVAL;
20353+ }
20354+
20355+ /* Range check. */
20356+ if ( bmap->rsector >= volume->lv_size ) {
20357+ return -EINVAL;
20358+ }
20359+
20360+ rc = remap_sector(node, bmap->rsector, 1, &new_sector,
20361+ &new_size, &pe_start_sector, &pv_entry);
20362+
20363+ if (rc || !pv_entry || !new_sector) {
20364+ return -EINVAL;
20365+ }
20366+
20367+ bmap->rsector = new_sector;
20368+ *pv_node = pv_entry->logical_node;
20369+
20370+ return 0;
20371+}
20372+
20373+/**
20374+ * lvm_global_proc_read
20375+ *
20376+ * A callback function for the lvm-global proc-fs entry. This will print
20377+ * general info about all LVM VGs, PVs, and LVs.
20378+ **/
20379+static int lvm_global_proc_read(char * page, char ** start, off_t off,
20380+ int count, int * eof, void * data)
20381+{
20382+ struct lvm_volume_group * group;
20383+ struct lvm_physical_volume * pv_entry;
20384+ struct lvm_logical_volume * volume, * snap;
20385+ int vgs = 0, lvs = 0, pvs = 0;
20386+ int i, sz = 0;
20387+
20388+ PROCPRINT("Enterprise Volume Management System: LVM Plugin\n");
20389+ PROCPRINT("Plugin ID: %x.%x.%x\n",
20390+ GetPluginOEM(lvm_plugin_header.id),
20391+ GetPluginType(lvm_plugin_header.id),
20392+ GetPluginID(lvm_plugin_header.id));
20393+ PROCPRINT("Plugin Version: %d.%d.%d\n",
20394+ lvm_plugin_header.version.major,
20395+ lvm_plugin_header.version.minor,
20396+ lvm_plugin_header.version.patchlevel);
20397+ PROCPRINT("Required EVMS Services Version: %d.%d.%d\n",
20398+ lvm_plugin_header.required_services_version.major,
20399+ lvm_plugin_header.required_services_version.minor,
20400+ lvm_plugin_header.required_services_version.patchlevel);
20401+
20402+ /* Count all existing items. */
20403+ for ( group = lvm_group_list; group; group = group->next_group ) {
20404+ lvs += group->volume_count;
20405+ pvs += group->pv_count;
20406+ vgs++;
20407+ }
20408+
20409+ PROCPRINT("\n");
20410+ PROCPRINT("Total: %d VGs %d PVs %d LVs\n", vgs, pvs, lvs);
20411+
20412+ /* Print out specifics about each VG. */
20413+ for ( group = lvm_group_list; group; group = group->next_group ) {
20414+ PROCPRINT("\n");
20415+ PROCPRINT("VG: %s [%d PV, %d LV]\n",
20416+ group->vg_name, group->pv_count, group->volume_count);
20417+ PROCPRINT("PVs:\n");
20418+ for ( pv_entry = group->pv_list;
20419+ pv_entry; pv_entry = pv_entry->next ) {
20420+ if (pv_entry->logical_node) {
20421+ PROCPRINT("\t%s\t%10Ld KB\n",
20422+ pv_entry->logical_node->name,
20423+ (long long)pv_entry->logical_node->total_vsectors / 2);
20424+ }
20425+ }
20426+ PROCPRINT("LVs:\n");
20427+ for ( i = 1; i <= MAX_LV; i++ ) {
20428+ if (group->volume_list[i]) {
20429+ volume = group->volume_list[i];
20430+ PROCPRINT("\t%s\t%10Ld KB / %5d LEs",
20431+ volume->name,
20432+ (long long)volume->lv_size / 2,
20433+ volume->num_le);
20434+ if ( volume->lv_access & LV_SNAPSHOT ) {
20435+ PROCPRINT("\tSnapshot of : ");
20436+ if (volume->snapshot_org) {
20437+ PROCPRINT("%s : ",
20438+ volume->snapshot_org->name);
20439+ } else {
20440+ PROCPRINT("(unknown) : ");
20441+ }
20442+ PROCPRINT("%ld%% full : ",
20443+ (long)(volume->next_free_chunk) *
20444+ 100 / (long)(volume->lv_size));
20445+ if ( volume->lv_status & LV_ACTIVE ) {
20446+ PROCPRINT("active");
20447+ } else {
20448+ PROCPRINT("disabled");
20449+ }
20450+ } else if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
20451+ PROCPRINT("\tSnapshotted by : ");
20452+ for ( snap = volume->snapshot_next;
20453+ snap;
20454+ snap = snap->snapshot_next ) {
20455+ PROCPRINT("%s ", snap->name);
20456+ }
20457+ }
20458+ PROCPRINT("\n");
20459+ }
20460+ }
20461+ }
20462+
20463+out:
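+	/* Standard single-page proc_read bookkeeping: return only the
+	 * caller's window [off, off + count) of the text formatted above.
+	 */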
20464+ *start = page + off;
20465+ sz -= off;
20466+ if (sz < 0)
20467+ sz = 0;
20468+ return sz > count ? count : sz;
20469+}
20470+
20471+
20472+/********** Required EVMS Plugin Functions **********/
20473+
20474+
20475+/**
20476+ * lvm_discover
20477+ *
20478+ * This is the entry point into the LVM discovery process. It is a three
20479+ * phase process. First, the list of nodes are examined for PVs, and the
20480+ * appropriate volume groups are created. Then each volume group is
20481+ * examined to find all available logical volumes. Finally, each LVM
20482+ * logical volume has a new EVMS node created for it, and added to the
20483+ * list of nodes.
20484+ **/
20485+static int lvm_discover(struct evms_logical_node ** evms_node_list)
20486+{
20487+ int rc;
20488+
20489+ MOD_INC_USE_COUNT;
20490+ LOG_EXTRA("Beginning discovery.\n");
20491+
20492+ discover_volume_groups(evms_node_list);
20493+
20494+ check_volume_groups();
20495+
20496+ discover_logical_volumes(FALSE);
20497+
20498+ check_logical_volumes(FALSE);
20499+
20500+ rc = export_volumes(evms_node_list, FALSE);
20501+
20502+ LOG_EXTRA("Discovery complete.\n");
20503+ MOD_DEC_USE_COUNT;
20504+ return rc;
20505+}
20506+
20507+/**
20508+ * lvm_discover_end
20509+ *
20510+ * The discovery process at the region-manager level is now iterative,
20511+ * much like the EVMS feature level. This allows the ability to stack
20512+ * LVM on top of MD, or vice-versa. To accomplish this correctly, and
20513+ * also to accomplish partial volume discovery, a second discover
20514+ * entry point is needed, so EVMS can tell the region managers that
20515+ * discovery is over, and to finish up any discovery that is not yet
20516+ * complete. When this function is called, it should be assumed that
20517+ * the node list has had nothing new added to it since the last call
20518+ * of the regular discover function. Therefore, when this function is
20519+ * called, we do not need to try to discover any additional volume
20520+ * groups. We will, however, look for logical volumes once more. This
20521+ * gives us the ability to export (read-only) volumes that have
20522+ * partially corrupted LE maps due to missing PVs in their VG.
20523+ **/
20524+static int lvm_discover_end(struct evms_logical_node ** evms_node_list)
20525+{
20526+ int rc;
20527+
20528+ MOD_INC_USE_COUNT;
20529+ LOG_EXTRA("Beginning final discovery\n");
20530+
20531+ discover_volume_groups(evms_node_list);
20532+
20533+ check_volume_groups();
20534+
20535+ discover_logical_volumes(TRUE);
20536+
20537+ check_logical_volumes(TRUE);
20538+
20539+ rc = export_volumes(evms_node_list, TRUE);
20540+
20541+ lvm_cleanup();
20542+
20543+ LOG_EXTRA("Final discovery complete.\n");
20544+ MOD_DEC_USE_COUNT;
20545+ return rc;
20546+}
20547+
20548+/**
20549+ * lvm_delete_node
20550+ *
20551+ * This function deletes the in-memory representation of an LVM logical volume.
20552+ **/
20553+static int lvm_delete_node(struct evms_logical_node * logical_node)
20554+{
20555+ struct lvm_logical_volume * volume = logical_node->private;
20556+ struct lvm_volume_group * group = volume->group;
20557+
20558+ LOG_DEBUG("Deleting LVM node %s\n", logical_node->name);
20559+
20560+ if ( deallocate_logical_volume(volume) ) {
20561+ return -EINVAL;
20562+ }
20563+
20564+ /* If we just removed the last volume from this group, the entire group
20565+ * must also be deleted.
20566+ */
20567+ if ( group && group->volume_count == 0 ) {
20568+ remove_group_from_list(group);
20569+ deallocate_volume_group(group);
20570+ }
20571+
20572+ /* Free the logical node. */
20573+ evms_cs_deallocate_logical_node(logical_node);
20574+ MOD_DEC_USE_COUNT;
20575+ return 0;
20576+}
20577+
20578+/**
20579+ * lvm_read
20580+ **/
20581+static void lvm_read(struct evms_logical_node * node,
20582+ struct buffer_head * bh)
20583+{
20584+ struct lvm_logical_volume * volume = node->private;
20585+ struct lvm_physical_volume * pv_entry;
20586+ u64 size = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
20587+ u64 new_sector, new_size, pe_start_sector;
20588+
20589+ /* If this volume is a snapshot, lock the volume, and do
20590+ * the LE-PE translation on its original volume.
20591+ */
20592+ if ( volume->lv_access & LV_SNAPSHOT ) {
20593+ down(&volume->snap_semaphore);
20594+ if (!volume->snapshot_org) {
20595+ goto out_error;
20596+ }
20597+ node = volume->snapshot_org->volume_node;
20598+ }
20599+
20600+ /* Make sure the volume is active and readable. */
20601+ if ( !(volume->lv_access & LV_READ &&
20602+ volume->lv_status & LV_ACTIVE) ) {
20603+ goto out_error;
20604+ }
20605+
20606+ /* Check if I/O goes past end of logical volume. Must use the
20607+ * node, not the volume, so snapshots will work correctly.
20608+ */
20609+ if ( bh->b_rsector + size > node->total_vsectors ) {
20610+ goto out_error;
20611+ }
20612+
20613+ /* Logical-to-Physical remapping. Check for incomplete volumes.
20614+ * Check intermediate boundary conditions as well.
20615+ */
20616+ if ( remap_sector(node, bh->b_rsector, size, &new_sector,
20617+ &new_size, &pe_start_sector, &pv_entry) ||
20618+ !pe_start_sector || !pv_entry ||
20619+ size != new_size ) {
20620+ goto out_error;
20621+ }
20622+
20623+ /* For snapshot volumes, check if this sector's chunk has been
20624+ * remapped. If it has, new_sector and pv_entry will be changed
20625+ * accordingly. If not, they remain the same.
20626+ */
20627+ if ( volume->lv_access & LV_SNAPSHOT ) {
20628+ snapshot_remap_sector(volume, pe_start_sector,
20629+ &new_sector, &pv_entry);
20630+ }
20631+
20632+ bh->b_rsector = new_sector;
20633+ R_IO(pv_entry->logical_node, bh);
20634+
20635+out:
20636+ /* Unlock the snapshot. */
20637+ if ( volume->lv_access & LV_SNAPSHOT ) {
20638+ up(&volume->snap_semaphore);
20639+ }
20640+ return;
20641+
20642+out_error:
20643+ bh->b_end_io(bh, 0);
20644+ goto out;
20645+}
20646+
20647+/**
20648+ * lvm_write
20649+ **/
20650+static void lvm_write(struct evms_logical_node * node,
20651+ struct buffer_head * bh)
20652+{
20653+ struct lvm_logical_volume * volume = node->private;
20654+ struct lvm_logical_volume * snap_volume;
20655+ struct lvm_physical_volume * pv_entry;
20656+ u64 size = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
20657+ u64 new_sector, new_size, pe_start_sector;
20658+
20659+ /* Make sure the volume is active and writable. */
20660+ if ( !(volume->lv_access & LV_WRITE &&
20661+ volume->lv_status & LV_ACTIVE) ) {
20662+ goto out_error;
20663+ }
20664+
20665+ /* Check if I/O goes past end of logical volume. */
20666+ if ( bh->b_rsector + size > node->total_vsectors ) {
20667+ goto out_error;
20668+ }
20669+
20670+ /* Logical-to-Physical remapping. Check for incomplete volumes.
20671+ * Check intermediate boundary conditions as well.
20672+ */
20673+ if ( remap_sector(node, bh->b_rsector, size, &new_sector,
20674+ &new_size, &pe_start_sector, &pv_entry) ||
20675+ !pe_start_sector || !pv_entry ||
20676+ size != new_size ) {
20677+ goto out_error;
20678+ }
20679+
20680+ /* Copy-on-write for snapshotting. */
20681+ if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
20682+ /* Originals can be snapshotted multiple times. */
20683+ for ( snap_volume = volume->snapshot_next;
20684+ snap_volume; snap_volume = snap_volume->snapshot_next ) {
20685+ if ( snapshot_copy_data(volume, snap_volume,
20686+ pe_start_sector, new_sector,
20687+ pv_entry) ) {
20688+ goto out_error;
20689+ }
20690+ }
20691+ }
20692+
20693+ bh->b_rsector = new_sector;
20694+ W_IO(pv_entry->logical_node, bh);
20695+out:
20696+ return;
20697+out_error:
20698+ bh->b_end_io(bh, 0);
20699+ goto out;
20700+}
20701+
20702+/**
20703+ * lvm_init_io
20704+ *
20705+ * Init_io on a snapshot volume treats it like a regular volume.
20706+ **/
20707+static int lvm_init_io(struct evms_logical_node * node,
20708+ int io_flag,
20709+ u64 sect_nr,
20710+ u64 num_sects,
20711+ void * buf_addr)
20712+{
20713+ struct lvm_logical_volume * volume = node->private;
20714+ struct lvm_physical_volume * pv_entry;
20715+ u64 pe_start_sector, new_sector, new_size;
20716+ int rc = 0;
20717+
20718+ /* Only allow internal writes to snapshots (io_flag==4). Disallow
20719+ * writes to snapshot originals.
20720+ */
20721+ if ( io_flag == WRITE &&
20722+ volume->lv_access & (LV_SNAPSHOT | LV_SNAPSHOT_ORG) ) {
20723+ return -EINVAL;
20724+ }
20725+
20726+ /* The node for a snapshot reports the size of the original. If a
20727+	 * request comes in within that range, just return.
20728+ */
20729+ else if ( volume->lv_access & LV_SNAPSHOT &&
20730+ sect_nr >= volume->lv_size &&
20731+ sect_nr < node->total_vsectors ) {
20732+ if ( io_flag == READ ) {
20733+ memset(buf_addr, 0,
20734+ num_sects << EVMS_VSECTOR_SIZE_SHIFT);
20735+ }
20736+ return 0;
20737+ }
20738+
20739+ /* Regular range check. */
20740+ else if ( sect_nr + num_sects > volume->lv_size ) {
20741+ return -EINVAL;
20742+ }
20743+
20744+ if ( io_flag == 4 ) {
20745+ io_flag = WRITE;
20746+ }
20747+
20748+ /* Init IO needs to deal with the possibility of a request that spans
20749+ * PEs or stripes. This is possible because there is no limit on
20750+ * num_sects. To handle this, we loop through remap_sector and
20751+ * INIT_IO until num_sects reaches zero.
20752+ */
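+	/* Worked example (hypothetical numbers): with 1024-sector PEs, a
+	 * 2048-sector request starting 512 sectors before a PE boundary is
+	 * issued as three pieces of 512, 1024, and 512 sectors, since
+	 * remap_sector() clamps new_size to the contiguous run each time.
+	 */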
20753+ while (num_sects) {
20754+ if ( remap_sector(node, sect_nr, num_sects, &new_sector,
20755+ &new_size, &pe_start_sector, &pv_entry) ) {
20756+ return -EIO;
20757+ }
20758+
20759+ /* If the volume is incomplete, clear the buffer (on a read). */
20760+ if (!pe_start_sector || !pv_entry) {
20761+ if ( io_flag == READ ) {
20762+ memset(buf_addr, 0,
20763+ new_size << EVMS_VSECTOR_SIZE_SHIFT);
20764+ }
20765+ } else {
20766+ rc = INIT_IO(pv_entry->logical_node, io_flag,
20767+ new_sector, new_size, buf_addr);
20768+ }
20769+ num_sects -= new_size;
20770+ sect_nr += new_size;
20771+ buf_addr = (void *)(((unsigned long) buf_addr) +
20772+ (unsigned long)(new_size << EVMS_VSECTOR_SIZE_SHIFT));
20773+ }
20774+
20775+ return rc;
20776+}
20777+
20778+/**
20779+ * lvm_ioctl
20780+ **/
20781+static int lvm_ioctl(struct evms_logical_node * logical_node,
20782+ struct inode * inode,
20783+ struct file * file,
20784+ unsigned int cmd,
20785+ unsigned long arg)
20786+{
20787+ struct lvm_logical_volume * volume = logical_node->private;
20788+ int rc = 0;
20789+
20790+ LOG_ENTRY_EXIT("Ioctl %d\n", cmd);
20791+
20792+ switch (cmd) {
20793+
20794+ case HDIO_GETGEO:
20795+ {
20796+ /* Fixed geometry for all LVM volumes. */
20797+ unsigned char heads = 64;
20798+ unsigned char sectors = 32;
20799+ short cylinders;
20800+ long start = 0;
20801+ struct hd_geometry * hd = (struct hd_geometry *)arg;
+		/* Divide in 64 bits before narrowing; assigning
+		 * total_vsectors straight into the short would truncate
+		 * on any volume larger than 32 MB.
+		 */
+		cylinders = logical_node->total_vsectors /
+			    (heads * sectors);
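+		/* With the fixed 64x32 geometry, e.g., a 2097152-sector
+		 * (1 GB) volume reports 1024 cylinders.
+		 */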
20804+
20805+ if (!hd) {
20806+ return -EINVAL;
20807+ }
20808+
20809+ if ( copy_to_user((char *)(&hd->heads),
20810+ &heads, sizeof(heads)) ||
20811+ copy_to_user((char *)(&hd->sectors),
20812+ &sectors, sizeof(sectors)) ||
20813+ copy_to_user((short *)(&hd->cylinders),
20814+ &cylinders, sizeof(cylinders)) ||
20815+ copy_to_user((long *)(&hd->start),
20816+ &start, sizeof(start)) ) {
20817+ return -EFAULT;
20818+ }
20819+ }
20820+ break;
20821+
20822+ case EVMS_QUIESCE_VOLUME:
20823+ {
20824+ struct evms_quiesce_vol_pkt * tmp =
20825+ (struct evms_quiesce_vol_pkt *)arg;
20826+ if (tmp->command) {
20827+ volume->lv_access |= EVMS_LV_QUIESCED;
20828+ } else {
20829+ volume->lv_access &= ~EVMS_LV_QUIESCED;
20830+ }
20831+ }
20832+ break;
20833+
20834+ case EVMS_GET_BMAP:
20835+ {
20836+ struct evms_get_bmap_pkt * bmap =
20837+ (struct evms_get_bmap_pkt *)arg;
20838+ struct evms_logical_node * pv_node;
20839+
20840+ rc = lvm_get_bmap(logical_node, bmap, &pv_node);
20841+ if (!rc) {
20842+ rc = IOCTL(pv_node, inode, file, cmd,
20843+ (unsigned long) bmap);
20844+ }
20845+ }
20846+ break;
20847+
20848+ case EVMS_GET_DISK_LIST:
20849+ case EVMS_CHECK_MEDIA_CHANGE:
20850+ case EVMS_REVALIDATE_DISK:
20851+ case EVMS_OPEN_VOLUME:
20852+ case EVMS_CLOSE_VOLUME:
20853+ case EVMS_CHECK_DEVICE_STATUS:
20854+ {
20855+			/* These six ioctls all need to
20856+			 * be broadcast to all PVs.
20857+ */
20858+ struct lvm_volume_group * group = volume->group;
20859+ struct lvm_physical_volume * pv_entry;
20860+ for ( pv_entry = group->pv_list;
20861+ pv_entry; pv_entry = pv_entry->next ) {
20862+ rc |= IOCTL(pv_entry->logical_node, inode,
20863+ file, cmd, arg);
20864+ }
20865+ }
20866+ break;
20867+
20868+ default:
20869+		/* Currently LVM does not send any other ioctls down to the
20870+ * PVs. Which PV would they go to? What would we do with
20871+ * the return codes?
20872+ */
20873+ rc = -EINVAL;
20874+ }
20875+
20876+ return rc;
20877+}
20878+
20879+/**
20880+ * lvm_direct_ioctl
20881+ *
20882+ * This function provides a method for user-space to communicate directly
20883+ * with a plugin in the kernel.
20884+ **/
20885+static int lvm_direct_ioctl(struct inode * inode,
20886+ struct file * file,
20887+ unsigned int cmd,
20888+ unsigned long args)
20889+{
20890+ struct evms_plugin_ioctl_pkt pkt, * user_pkt;
20891+ struct lvm_pv_remove_ioctl pv_remove, * user_pv_remove;
20892+ struct lvm_snapshot_stat_ioctl snap_stats, * user_snap_stats;
20893+ int rc = 0;
20894+
20895+ MOD_INC_USE_COUNT;
20896+
20897+ user_pkt = (struct evms_plugin_ioctl_pkt *)args;
20898+
20899+ /* Copy user's parameters to kernel space. */
20900+ if ( copy_from_user(&pkt, user_pkt, sizeof(pkt)) ) {
20901+ MOD_DEC_USE_COUNT;
20902+ return -EFAULT;
20903+ }
20904+
20905+ /* Make sure this is supposed to be our ioctl. */
20906+ if ( pkt.feature_id != lvm_plugin_header.id ) {
20907+ MOD_DEC_USE_COUNT;
20908+ return -EINVAL;
20909+ }
20910+
20911+ switch (pkt.feature_command) {
20912+
20913+ case EVMS_LVM_PV_REMOVE_IOCTL:
20914+ user_pv_remove =
20915+ (struct lvm_pv_remove_ioctl *)pkt.feature_ioctl_data;
20916+ if ( copy_from_user(&pv_remove, user_pv_remove,
20917+ sizeof(pv_remove)) ) {
20918+			rc = -EFAULT;
20919+ break;
20920+ }
20921+ rc = remove_pv_from_group(pv_remove.pv_number,
20922+ pv_remove.vg_uuid);
20923+ break;
20924+
20925+ case EVMS_LVM_SNAPSHOT_STAT_IOCTL:
20926+ user_snap_stats =
20927+ (struct lvm_snapshot_stat_ioctl *)pkt.feature_ioctl_data;
20928+ if ( copy_from_user(&snap_stats, user_snap_stats,
20929+ sizeof(snap_stats)) ) {
20930+			rc = -EFAULT;
20931+ break;
20932+ }
20933+ rc = get_snapshot_stats(&snap_stats);
20934+ if ( copy_to_user(user_snap_stats, &snap_stats,
20935+ sizeof(snap_stats)) ) {
20936+			rc = -EFAULT;
20937+ break;
20938+ }
20939+ break;
20940+
20941+ default:
20942+ rc = -EINVAL;
20943+ break;
20944+ }
20945+
20946+ pkt.status = rc;
20947+ copy_to_user(user_pkt, &pkt, sizeof(pkt));
20948+ MOD_DEC_USE_COUNT;
20949+ return rc;
20950+}
20951+
20952+/**
20953+ * lvm_vge_init
20954+ **/
20955+int __init lvm_vge_init(void)
20956+{
20957+ struct proc_dir_entry *pde;
20958+
20959+ lvm_group_list = NULL;
20960+ lvm_proc = NULL;
20961+
20962+ /* Register the global proc-fs entries. */
20963+ pde = evms_cs_get_evms_proc_dir();
20964+ if (pde) {
20965+ lvm_proc = create_proc_entry(LVM_PROC_NAME, S_IFDIR, pde);
20966+ if (lvm_proc) {
20967+ create_proc_read_entry(LVM_PROC_GLOBAL_NAME, S_IFREG,
20968+ lvm_proc, lvm_global_proc_read,
20969+ NULL);
20970+ }
20971+ }
20972+
20973+ /* Register this plugin with EVMS. */
20974+ return evms_cs_register_plugin(&lvm_plugin_header);
20975+}
20976+
20977+/**
20978+ * lvm_vge_exit
20979+ **/
20980+void __exit lvm_vge_exit(void)
20981+{
20982+ struct lvm_volume_group * group, * next_group;
20983+ struct proc_dir_entry * pde;
20984+ int i;
20985+
20986+ /* If LVM is called for module_exit, that means the reference
20987+ * count must be zero, which means there should be no volumes,
20988+ * and thus no volume groups. But, check anyway and delete
20989+ * any volumes and groups that are still hanging around.
20990+ */
20991+ if (lvm_group_list) {
20992+ LOG_SERIOUS("Called for module_exit, but group list is not empty!\n");
20993+ }
20994+
20995+ for ( group = lvm_group_list; group; group = next_group ) {
20996+ next_group = group->next_group;
20997+
20998+ LOG_SERIOUS("In module_exit: deleting all volumes from group %s.\n",
20999+ group->vg_name);
21000+
21001+ for ( i = 1; i <= MAX_LV; i++ ) {
21002+ if (group->volume_list[i]) {
21003+ lvm_delete_node(group->volume_list[i]->volume_node);
21004+ }
21005+ }
21006+ }
21007+
21008+ /* Unregister the proc-fs entries. */
21009+ pde = evms_cs_get_evms_proc_dir();
21010+	if (pde && lvm_proc) {
21011+		remove_proc_entry(LVM_PROC_GLOBAL_NAME, lvm_proc);
21012+		remove_proc_entry(LVM_PROC_NAME, pde);
21013+	}
21014+
21015+ /* Unregister this plugin from EVMS. */
21016+ evms_cs_unregister_plugin(&lvm_plugin_header);
21017+}
21018+
21019+module_init(lvm_vge_init);
21020+module_exit(lvm_vge_exit);
21021+#ifdef MODULE_LICENSE
21022+MODULE_LICENSE("GPL");
21023+#endif
21024+
21025diff -Naur linux-2002-09-30/drivers/evms/md_core.c evms-2002-09-30/drivers/evms/md_core.c
21026--- linux-2002-09-30/drivers/evms/md_core.c Wed Dec 31 18:00:00 1969
21027+++ evms-2002-09-30/drivers/evms/md_core.c Sun Sep 29 23:25:48 2002
21028@@ -0,0 +1,3633 @@
21029+/*
21030+ * Copyright (c) International Business Machines Corp., 2000
21031+ *
21032+ * This program is free software; you can redistribute it and/or modify
21033+ * it under the terms of the GNU General Public License as published by
21034+ * the Free Software Foundation; either version 2 of the License, or
21035+ * (at your option) any later version.
21036+ *
21037+ * This program is distributed in the hope that it will be useful,
21038+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
21039+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
21040+ * the GNU General Public License for more details.
21041+ *
21042+ * You should have received a copy of the GNU General Public License
21043+ * along with this program; if not, write to the Free Software
21044+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21045+ *
21046+ *
21047+ * linux/drivers/evms/md_core.c
21048+ *
21049+ * EVMS Linux MD Region Manager
21050+ *
21051+ */
21052+
21053+
21054+#include <linux/module.h>
21055+#include <linux/kmod.h>
21056+#include <linux/kernel.h>
21057+#include <linux/config.h>
21058+#include <linux/genhd.h>
21059+#include <linux/string.h>
21060+#include <linux/blk.h>
21061+#include <linux/init.h>
21062+#include <linux/slab.h>
21063+#include <linux/vmalloc.h>
21064+#include <linux/evms/evms.h>
21065+#include <linux/evms/evms_md.h>
21066+#include <linux/sysctl.h>
21067+#include <asm/system.h>
21068+#include <asm/uaccess.h>
21069+
21070+#define LOG_PREFIX "md core: "
21071+
21072+/*
21073+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
21074+ * is 100 KB/sec, so the extra system load does not show up that much.
21075+ * Increase it if you want to have more _guaranteed_ speed. Note that
21076+ * the RAID driver will use the maximum available bandwidth if the IO
21077+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
21078+ * speed limit - in case reconstruction slows down your system despite
21079+ * idle IO detection.
21080+ *
21081+ * You can change them via /proc/sys/dev/evms/md/speed_limit_min and _max.
21082+ */
21083+
21084+static MD_LIST_HEAD(all_raid_disks);
21085+static MD_LIST_HEAD(pending_raid_disks);
21086+
21087+static int sysctl_speed_limit_min = 100;
21088+static int sysctl_speed_limit_max = 100000;
21089+
21090+
21091+static mdk_personality_t *pers[MAX_PERSONALITY];
21092+
21093+static int md_blocksizes[MAX_MD_DEVS];
21094+static int md_hardsect_sizes[MAX_MD_DEVS];
21095+int evms_md_size[MAX_MD_DEVS];
21096+static struct evms_thread *evms_md_recovery_thread = NULL;
21097+
21098+/*
21099+ * Allows iterating over all existing md arrays.
21100+ */
21101+static LIST_HEAD(all_mddevs);
21102+static LIST_HEAD(incomplete_mddevs);
21103+static LIST_HEAD(running_mddevs);
21104+
21105+/*
21106+ * The mapping between kdev and mddev is not necessarily a simple
21107+ * one! Eg. HSM uses several sub-devices to implement Logical
21108+ * Volumes. All these sub-devices map to the same mddev.
21109+ */
21110+struct dev_mapping evms_mddev_map[MAX_MD_DEVS];
21111+
21112+
21113+/* Support functions for discovery */
21114+static mdk_rdev_t * evms_md_find_rdev_all (struct evms_logical_node *node);
21115+static mddev_t * evms_md_find_mddev_all (struct evms_logical_node *node);
21116+static int evms_md_import_device (struct evms_logical_node **discover_list,
21117+ struct evms_logical_node *node);
21118+static void evms_md_autostart_arrays(struct evms_logical_node **discover_list);
21119+static void evms_md_run_devices (struct evms_logical_node **discover_list);
21120+static int evms_md_run_array (struct evms_logical_node ** discover_list,
21121+ mddev_t *mddev);
21122+static void evms_md_run_incomplete_array (struct evms_logical_node ** discover_list,
21123+ mddev_t *mddev);
21124+static int evms_md_create_logical_node(struct evms_logical_node **discover_list,
21125+ mddev_t *mddev, uint flags);
21126+static int evms_md_read_disk_sb (mdk_rdev_t * rdev);
21127+static int evms_md_analyze_sbs (mddev_t * mddev);
21128+static mddev_t * alloc_mddev (kdev_t dev);
21129+static void free_mddev(mddev_t * mddev);
21130+static void evms_md_create_recovery_thread(void);
21131+static void evms_md_destroy_recovery_thread(void);
21132+static int do_md_run (mddev_t * mddev);
21133+static int do_md_stop (mddev_t * mddev, int ro);
21134+
21135+static void evms_md_export_rdev (mdk_rdev_t * rdev, int delete_node);
21136+static void kick_rdev_from_array (mdk_rdev_t * rdev);
21137+static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev);
21138+static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb);
21139+
21140+/* Plugin API prototypes */
21141+static int md_discover( struct evms_logical_node ** discover_list );
21142+static int md_end_discover( struct evms_logical_node ** discover_list );
21143+static int md_delete( struct evms_logical_node * node);
21144+static void md_read( struct evms_logical_node * node,
21145+ struct buffer_head * bh);
21146+static void md_write( struct evms_logical_node * node,
21147+ struct buffer_head * bh);
21148+static int md_sync_io( struct evms_logical_node *node,
21149+ int rw,
21150+ u64 sect_nr,
21151+ u64 num_sects,
21152+ void *data);
21153+static int md_ioctl( struct evms_logical_node *node,
21154+ struct inode *inode,
21155+ struct file *file,
21156+ unsigned int cmd,
21157+ unsigned long arg);
21158+static int md_ioctl_cmd_broadcast(
21159+ struct evms_logical_node *node,
21160+ struct inode *inode,
21161+ struct file *file,
21162+ unsigned long cmd,
21163+ unsigned long arg);
21164+
21165+static int md_direct_ioctl(
21166+ struct inode *inode,
21167+ struct file *file,
21168+ unsigned int cmd,
21169+ unsigned long arg);
21170+
21171+/* global MD data structures */
21172+static struct evms_plugin_fops md_fops = {
21173+ .discover = md_discover,
21174+ .end_discover = md_end_discover,
21175+ .delete = md_delete,
21176+ .read = md_read,
21177+ .write = md_write,
21178+ .init_io = md_sync_io,
21179+ .ioctl = md_ioctl,
21180+ .direct_ioctl = md_direct_ioctl
21181+};
21182+
21183+static struct evms_plugin_header md_plugin_header = {
21184+ .id = SetPluginID(IBM_OEM_ID,
21185+ EVMS_REGION_MANAGER,
21186+ EVMS_MD_ID),
21187+ .version = {
21188+ .major = EVMS_MD_MAJOR_VERSION,
21189+ .minor = EVMS_MD_MINOR_VERSION,
21190+ .patchlevel = EVMS_MD_PATCHLEVEL_VERSION
21191+ },
21192+ .required_services_version = {
21193+ .major = EVMS_MD_COMMON_SERVICES_MAJOR,
21194+ .minor = EVMS_MD_COMMON_SERVICES_MINOR,
21195+ .patchlevel = EVMS_MD_COMMON_SERVICES_PATCHLEVEL
21196+ },
21197+ .fops = &md_fops
21198+};
21199+
21200+/* global variables */
21201+static int exported_nodes; /* total # of exported devices
21202+ * produced during this discovery.
21203+ */
21204+static struct evms_logical_node **cur_discover_list = NULL;
21205+
21206+/**********************************************************/
21207+/* SYSCTL - EVMS/RAID folder */
21208+/**********************************************************/
21209+
21210+#ifdef CONFIG_PROC_FS
21211+static struct ctl_table_header *md_table_header;
21212+
21213+static ctl_table md_table[] = {
21214+ {DEV_EVMS_MD_SPEED_LIMIT_MIN, "speed_limit_min",
21215+ &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
21216+ {DEV_EVMS_MD_SPEED_LIMIT_MAX, "speed_limit_max",
21217+ &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
21218+ {0}
21219+};
21220+
21221+static ctl_table md_dir_table[] = {
21222+ {DEV_EVMS_MD, "md", NULL, 0, 0555, md_table},
21223+ {0}
21224+};
21225+
21226+static ctl_table evms_dir_table[] = {
21227+ {DEV_EVMS, "evms", NULL, 0, 0555, md_dir_table},
21228+ {0}
21229+};
21230+
21231+static ctl_table dev_dir_table[] = {
21232+ {CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table},
21233+ {0}
21234+};
21235+#endif
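+/* With the tables above registered, the tunables appear as
+ * /proc/sys/dev/evms/md/speed_limit_min and speed_limit_max.
+ */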
21236+/********** Required EVMS Plugin Functions **********/
21237+
21238+/*
21239+ * Function: md_discover
21240+ * We should only export complete MD device nodes
21241+ */
21242+static int md_discover( struct evms_logical_node ** discover_list )
21243+{
21244+ MOD_INC_USE_COUNT;
21245+ LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
21246+
21247+ /* initialize global variable */
21248+ exported_nodes = 0;
21249+ cur_discover_list = discover_list;
21250+ evms_md_autostart_arrays(discover_list);
21251+
21252+ LOG_ENTRY_EXIT("%s: EXIT (exported nodes: %d)\n", __FUNCTION__,exported_nodes);
21253+ cur_discover_list = NULL;
21254+ MOD_DEC_USE_COUNT;
21255+ return(exported_nodes);
21256+}
21257+
21258+static mddev_t * evms_md_find_incomplete_array(int level)
21259+{
21260+ mddev_t *mddev;
21261+ struct list_head *tmp,*tmp2;
21262+ mdk_rdev_t *rdev;
21263+
21264+ ITERATE_INCOMPLETE_MDDEV(mddev,tmp) {
21265+ ITERATE_RDEV(mddev, rdev, tmp2) {
21266+ if (rdev->sb && rdev->sb->level == level)
21267+ return mddev;
21268+ }
21269+ }
21270+ return NULL;
21271+}
21272+
21273+/*
21274+ * Function: md_end_discover
21275+ */
21276+static int md_end_discover( struct evms_logical_node ** discover_list )
21277+{
21278+ int rc = 0;
21279+ struct list_head *tmp;
21280+ mdk_rdev_t *rdev;
21281+ mddev_t *mddev;
21282+ struct evms_logical_node *node;
21283+ int done = FALSE;
21284+
21285+ MOD_INC_USE_COUNT;
21286+ LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
21287+ rc = md_discover(discover_list);
21288+
21289+ do {
21290+ done = TRUE;
21291+ if ( (mddev = evms_md_find_incomplete_array(5)) != NULL) {
21292+ evms_md_run_incomplete_array(discover_list, mddev);
21293+ done = FALSE;
21294+ continue;
21295+ }
21296+ if ( (mddev = evms_md_find_incomplete_array(1)) != NULL) {
21297+ evms_md_run_incomplete_array(discover_list, mddev);
21298+ done = FALSE;
21299+ continue;
21300+ }
21301+ if ( (mddev = evms_md_find_incomplete_array(0)) != NULL) {
21302+ evms_md_run_incomplete_array(discover_list, mddev);
21303+ done = FALSE;
21304+ continue;
21305+ }
21306+ if ( (mddev = evms_md_find_incomplete_array(-1)) != NULL) {
21307+ evms_md_run_incomplete_array(discover_list, mddev);
21308+ done = FALSE;
21309+ continue;
21310+ }
21311+
21312+ } while (!done);
21313+
21314+
21315+ /*
21316+ * At this point, delete all mddevs which did not start.
21317+ */
21318+ ITERATE_MDDEV(mddev,tmp) {
21319+ if (mddev->pers == NULL) {
21320+ LOG_WARNING("%s: deleting md%d\n", __FUNCTION__, mdidx(mddev));
21321+ free_mddev(mddev);
21322+ }
21323+ }
21324+
21325+
21326+ /*
21327+	 * At this point, delete all rdevs which do not belong to any of the discovered MD arrays.
21328+ */
21329+ ITERATE_RDEV_ALL(rdev, tmp) {
21330+ if (!rdev->mddev) {
21331+ node = rdev->node;
21332+ if (node) {
21333+ if (node->plugin->id == md_plugin_header.id)
21334+ evms_md_export_rdev(rdev, FALSE);
21335+ else
21336+ evms_md_export_rdev(rdev, TRUE);
21337+ }
21338+ }
21339+ }
21340+
21341+ LOG_ENTRY_EXIT("%s: EXIT\n", __FUNCTION__);
21342+ MOD_DEC_USE_COUNT;
21343+ return rc;
21344+}
21345+
21346+
21347+/*
21348+ * Function: md_delete_node
21349+ */
21350+static int md_delete( struct evms_logical_node * node)
21351+{
21352+ struct evms_md *evms_md;
21353+ mddev_t *mddev;
21354+
21355+ evms_md = node->private;
21356+ mddev = evms_md->mddev;
21357+ LOG_DEFAULT("md_delete() [%s]\n", evms_md_partition_name(node));
21358+
21359+ if (mddev)
21360+ do_md_stop(mddev,0);
21361+ if (evms_md) {
21362+ if (evms_md->instance_plugin_hdr.fops)
21363+ kfree(evms_md->instance_plugin_hdr.fops);
21364+ kfree(evms_md);
21365+ }
21366+
21367+ evms_cs_deallocate_logical_node(node);
21368+ return 0;
21369+}
21370+
21371+
21372+/*
21373+ * Function: md_read
21374+ */
21375+static void md_read( struct evms_logical_node * node,
21376+ struct buffer_head * bh)
21377+{
21378+ struct evms_md *evms_md;
21379+ mddev_t *mddev;
21380+
21381+ evms_md = node->private;
21382+ mddev = evms_md->mddev;
21383+ if (evms_md_check_boundary(node, bh)) return;
21384+ if (mddev && mddev->pers)
21385+ mddev->pers->read(node, bh);
21386+}
21387+
21388+
21389+/*
21390+ * Function: md_write
21391+ */
21392+static void md_write( struct evms_logical_node * node,
21393+ struct buffer_head * bh)
21394+{
21395+ struct evms_md *evms_md;
21396+ mddev_t *mddev;
21397+
21398+ evms_md = node->private;
21399+ mddev = evms_md->mddev;
21400+ if (evms_md_check_boundary(node, bh)) return;
21401+	if (mddev && mddev->ro) {
21402+ LOG_ERROR("%s: read-only is set for [%s]\n", __FUNCTION__, node->name);
21403+ bh->b_end_io(bh, 0);
21404+ return;
21405+ }
21406+ if (mddev && mddev->pers)
21407+ mddev->pers->write(node, bh);
21408+}
21409+
21410+/*
21411+ * Function: md_sync_io
21412+ */
21413+static int md_sync_io(
21414+ struct evms_logical_node *node,
21415+ int rw,
21416+ u64 sect_nr,
21417+ u64 num_sects,
21418+ void *buf_addr)
21419+{
21420+ struct evms_md *evms_md;
21421+ mddev_t *mddev;
21422+ int rc = 0;
21423+
21424+ evms_md = node->private;
21425+ mddev = evms_md->mddev;
21426+
21427+ if (sect_nr + num_sects > node->total_vsectors) {
21428+ LOG_ERROR("%s: attempt to %s beyond MD device(%s) boundary("PFU64") with sect_nr("PFU64") and num_sects("PFU64")\n",
21429+ __FUNCTION__,
21430+ rw ? "WRITE" : "READ",
21431+ node->name,
21432+ node->total_vsectors,
21433+ sect_nr,num_sects);
21434+ rc = -EINVAL;
21435+ }
21436+
21437+	if (mddev && mddev->ro && (rw != READ)) {
21438+ LOG_ERROR("%s: read-only is set for [%s]\n", __FUNCTION__, node->name);
21439+ return -EINVAL;
21440+ }
21441+
21442+ if (!rc && mddev && mddev->pers) {
21443+ /*
21444+ * Check if the personality can handle synchronous I/O,
21445+ * otherwise use the generic function.
21446+ */
21447+ if (mddev->pers->sync_io)
21448+ rc = mddev->pers->sync_io(mddev, rw, sect_nr, num_sects, buf_addr);
21449+ else
21450+ rc = evms_md_sync_io(node, rw, sect_nr, num_sects, buf_addr);
21451+ } else
21452+ rc = -EINVAL;
21453+ return rc;
21454+}
21455+
21456+/**
21457+ * md_end_sync_request - End IO handler for synchronous I/O functions
21458+ **/
21459+static void md_end_sync_request(struct buffer_head *bh, int uptodate)
21460+{
21461+ struct evms_md_sync_cb * cb = (struct evms_md_sync_cb *) bh->b_private;
21462+
21463+ if (!uptodate)
21464+ cb->rc |= -EIO;
21465+ /* we are done with the bh */
21466+ evms_cs_deallocate_to_pool(evms_bh_pool, bh);
21467+
21468+ if (atomic_dec_and_test(&cb->io_count)) {
21469+ if (waitqueue_active(&cb->wait))
21470+ wake_up(&cb->wait);
21471+ }
21472+}
21473+
21474+/**
21475+ * md_sync_request_submit_bh - submit a page-size bh
21476+ * @node - target MD node
21477+ * @bh - pointer to the buffer head
21478+ * @sector - the sector number
21479+ * @data - pointer to buffer
21480+ * @rw - READ/WRITE
21481+ * @cb - MD synchronous I/O control block
21482+ **/
21483+static inline void md_sync_request_submit_bh(
21484+ struct evms_logical_node *node,
21485+ struct buffer_head *bh,
21486+ unsigned long sector,
21487+ char *data,
21488+ int rw,
21489+ struct evms_md_sync_cb *cb)
21490+{
21491+
21492+ bh->b_this_page = (struct buffer_head *)1;
21493+ bh->b_rsector = sector;
21494+ bh->b_size = PAGE_SIZE;
21495+ bh->b_state = 0;
21496+ set_bit(BH_Dirty, &bh->b_state);
21497+ set_bit(BH_Lock, &bh->b_state);
21498+ set_bit(BH_Req, &bh->b_state);
21499+ set_bit(BH_Mapped, &bh->b_state);
21500+ atomic_set(&bh->b_count, 1);
21501+ bh->b_data = data;
21502+ bh->b_page = virt_to_page(data);
21503+ bh->b_list = BUF_LOCKED;
21504+ bh->b_end_io = md_end_sync_request;
21505+ bh->b_private = cb;
21506+ atomic_inc(&cb->io_count);
21507+ if (rw == READ)
21508+ R_IO(node,bh);
21509+ else
21510+ W_IO(node,bh);
21511+}
21512+
21513+/**
21514+ * evms_md_allocate_bh
21515+ *
21516+ * Note that this function will not return unless we got a free bh
21517+ * Note that this function will not return until we get a free bh
21518+static inline struct buffer_head *evms_md_allocate_bh(void)
21519+{
21520+ struct buffer_head *bh;
21521+
21522+ while ((bh = evms_cs_allocate_from_pool(evms_bh_pool, FALSE)) == NULL)
21523+		schedule(); /* just yield until someone frees a bh */
21524+ init_waitqueue_head(&bh->b_wait);
21525+ bh->b_count = (atomic_t)ATOMIC_INIT(0);
21526+ return(bh);
21527+}
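+/* Note: the busy-wait above makes forward progress because every bh issued
+ * by the sync-I/O path is handed back to evms_bh_pool by
+ * md_end_sync_request() as soon as its I/O completes.
+ */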
21528+
21529+/**
21530+ * evms_md_partial_sync_io -
21531+ * This function handles synchronous I/O when the sector is not page aligned
21532+ * @node - evms node for the MD array
21533+ * @rw - READ/WRITE
21534+ * @sector - the starting sector
21535+ * @nsects - in: total sectors for the request; out: sectors completed
21536+ * @data - data buffer
21537+ * @data - data buffer
21538+ **/
21539+int evms_md_partial_sync_io(
21540+ struct evms_logical_node *node,
21541+ int rw,
21542+ u64 sector,
21543+ u32 *nsects,
21544+ void *data)
21545+{
21546+ int rc;
21547+ u32 offset, size;
21548+ struct buffer_head *bh;
21549+ struct evms_md_sync_cb cb;
21550+ char *page;
21551+
21552+ size = (u32)(*nsects << EVMS_VSECTOR_SIZE_SHIFT);
21553+
21554+ /* calculate byte offset */
21555+ offset = (u32)((sector & (EVMS_MD_SECTS_PER_PAGE-1)) << EVMS_VSECTOR_SIZE_SHIFT);
21556+ if (!offset && (*nsects >= EVMS_MD_SECTS_PER_PAGE)) {
21557+ *nsects = 0;
21558+ return 0; /* Nothing to do */
21559+ }
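+	/* Worked example (assuming 4 KB pages, i.e. EVMS_MD_SECTS_PER_PAGE
+	 * is 8): a request at sector 13 gives a byte offset of
+	 * (13 & 7) * 512 = 2560 into the bounce page, leaving room for at
+	 * most 3 sectors before the page boundary.
+	 */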
21560+
21561+ page = NULL;
21562+ rc = 0;
21563+
21564+ page = kmalloc(PAGE_SIZE, GFP_KERNEL);
21565+ if (!page) {
21566+ LOG_ERROR("%s: no memory!\n", __FUNCTION__);
21567+ rc = -ENOMEM;
21568+ }
21569+
+	if (!rc) {
+		/* Allocate the bh only once the bounce page is in hand;
+		 * grabbing it unconditionally would leak a pool bh on the
+		 * -ENOMEM path above.
+		 */
+		bh = evms_md_allocate_bh();
21573+ memset(&cb, 0, sizeof(cb));
21574+ init_waitqueue_head(&cb.wait);
21575+ cb.io_count = (atomic_t)ATOMIC_INIT(0);
21576+ md_sync_request_submit_bh(
21577+ node, bh,
21578+ (unsigned long)(sector & EVMS_MD_SECTS_PER_PAGE_MASK),
21579+ page, READ, &cb);
21580+ wait_disk_event(cb.wait, !atomic_read(&cb.io_count));
21581+ rc |= cb.rc;
21582+ }
21583+
21584+ if (!rc) {
21585+ size = (size <= (PAGE_SIZE - offset)) ? size : (PAGE_SIZE - offset);
21586+
21587+ switch (rw) {
21588+ case READ:
21589+ /* copy data and return */
21590+ memcpy(data, page+offset, size);
21591+ break;
21592+ case WRITE:
21593+ /* copy data and then write */
21594+ memcpy(page+offset, data, size);
21595+
21596+ bh = evms_md_allocate_bh();
21597+
21598+ md_sync_request_submit_bh(
21599+ node, bh,
21600+ (unsigned long)(sector & EVMS_MD_SECTS_PER_PAGE_MASK),
21601+ page, WRITE, &cb);
21602+ wait_disk_event(cb.wait, !atomic_read(&cb.io_count));
21603+ rc |= cb.rc;
21604+ break;
21605+ default:
21606+ rc = -EINVAL;
21607+ }
21608+ }
21609+
21610+ if (page)
21611+ kfree(page);
21612+
21613+ if (!rc)
21614+ *nsects = (u64)(size >> EVMS_VSECTOR_SIZE_SHIFT);
21615+ else
21616+ *nsects = 0;
21617+ return rc;
21618+}
21619+
21620+/**
21621+ * evms_md_sync_io - This function handles synchronous I/O
21622+ **/
21623+int evms_md_sync_io(
21624+ struct evms_logical_node *node,
21625+ int rw,
21626+ u64 sector,
21627+ u64 total_nr_sects,
21628+ void *data )
21629+{
21630+ int rc = 0;
21631+ u64 total_nr_pages, size;
21632+ u32 nsects;
21633+ struct buffer_head *bh;
21634+ struct evms_md_sync_cb cb;
21635+
21636+ if (sector % EVMS_MD_SECTS_PER_PAGE) {
21637+ nsects = total_nr_sects;
21638+ rc = evms_md_partial_sync_io(node, rw, sector, &nsects, data);
21639+ if (!rc) {
21640+ total_nr_sects -= nsects;
21641+ sector += nsects;
21642+ data += (nsects << EVMS_VSECTOR_SIZE_SHIFT);
21643+ if (total_nr_sects == 0)
21644+ return rc;
21645+ } else {
21646+ return rc;
21647+ }
21648+ }
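+	/* Worked example (assuming 8 sectors per page): a 21-sector request
+	 * at sector 5 is handled as 3 unaligned head sectors (above), two
+	 * full 8-sector pages (loop below), and a 2-sector tail via the
+	 * final evms_md_partial_sync_io() call.
+	 */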
21649+
21650+ total_nr_pages = total_nr_sects / EVMS_MD_SECTS_PER_PAGE;
21651+ size = total_nr_sects << EVMS_VSECTOR_SIZE_SHIFT;
21652+
21653+ memset(&cb, 0, sizeof(cb));
21654+ init_waitqueue_head(&cb.wait);
21655+ cb.io_count = (atomic_t)ATOMIC_INIT(0);
21656+
21657+ while (!rc && total_nr_pages) {
21658+
21659+ bh = evms_md_allocate_bh();
21660+
21661+ md_sync_request_submit_bh(node, bh,(unsigned long)sector, data, rw, &cb);
21662+
21663+ sector += EVMS_MD_SECTS_PER_PAGE;
21664+ size -= PAGE_SIZE;
21665+ total_nr_pages--;
21666+ data += PAGE_SIZE;
21667+ }
21668+ if (!rc) {
21669+ wait_disk_event(cb.wait, !atomic_read(&cb.io_count));
21670+ rc |= cb.rc;
21671+ }
21672+
21673+ if (!rc && size) {
21674+ nsects = size >> EVMS_VSECTOR_SIZE_SHIFT;
21675+ rc = evms_md_partial_sync_io(node, rw, sector, &nsects, data);
21676+ }
21677+
21678+ return(rc);
21679+}
21680+
21681+/*
21682+ * Function: md_ioctl
21683+ */
21684+static int md_ioctl(
21685+ struct evms_logical_node * node,
21686+ struct inode * inode,
21687+ struct file * file,
21688+ unsigned int cmd,
21689+ unsigned long arg)
21690+{
21691+ struct evms_md * evms_md = node->private;
21692+ mddev_t *mddev;
21693+ int rc = 0;
21694+
21695+	if (!inode || !evms_md)
21696+ rc = -EINVAL;
21697+
21698+ if (!rc) {
21699+ switch (cmd) {
21700+ /*
21701+		 * We have a problem here: there is no easy way to give a CHS
21702+		 * virtual geometry. We currently pretend that we have 2 heads and
21703+		 * 4 sectors (with a BIG number of cylinders...). This drives
21704+ * dosfs just mad... ;-)
21705+ */
21706+
21707+ case HDIO_GETGEO:
21708+ {
21709+ struct hd_geometry hdgeo;
21710+ hdgeo.heads = 2;
21711+ hdgeo.sectors = 4;
21712+ hdgeo.cylinders = ((unsigned int)node->total_vsectors) /
21713+ hdgeo.heads / hdgeo.sectors;
21714+ hdgeo.start = 0;
21715+ if (copy_to_user((int *)arg,
21716+ &hdgeo,
21717+ sizeof(hdgeo)))
21718+ rc = -EFAULT;
21719+ }
21720+ break;
21721+ case EVMS_QUIESCE_VOLUME:
21722+ case EVMS_GET_DISK_LIST:
21723+ case EVMS_CHECK_MEDIA_CHANGE:
21724+ case EVMS_REVALIDATE_DISK:
21725+ case EVMS_OPEN_VOLUME:
21726+ case EVMS_CLOSE_VOLUME:
21727+ case EVMS_CHECK_DEVICE_STATUS:
21728+ rc = md_ioctl_cmd_broadcast(
21729+ node, inode, file, cmd, arg);
21730+ break;
21731+ case EVMS_PLUGIN_IOCTL:
21732+ rc = md_direct_ioctl(
21733+ inode, file, cmd, arg);
21734+ break;
21735+ default:
21736+ mddev = evms_md->mddev;
21737+ if (mddev == NULL) {
21738+ rc = -ENODEV;
21739+ } else if (mddev->pers->evms_ioctl == NULL) {
21740+ rc = -ENOSYS;
21741+ } else {
21742+ rc = mddev->pers->evms_ioctl(mddev, inode, file, cmd, arg);
21743+ }
21744+ }
21745+ }
21746+ return(rc);
21747+}
21748+
21749+static int md_ioctl_cmd_broadcast(
21750+ struct evms_logical_node *node,
21751+ struct inode *inode,
21752+ struct file *file,
21753+ unsigned long cmd,
21754+ unsigned long arg)
21755+{
21756+ int rc = 0;
21757+ struct evms_md *evms_md;
21758+ mddev_t *mddev;
21759+ struct list_head *tmp;
21760+ mdk_rdev_t *rdev;
21761+
21762+ evms_md = node->private;
21763+ mddev = evms_md->mddev;
21764+
21765+ /* broadcast this cmd to all children */
21766+ ITERATE_RDEV(mddev,rdev,tmp) {
21767+ if (!rdev->mddev) {
21768+ MD_BUG();
21769+ continue;
21770+ }
21771+ if (!rdev->virtual_spare) {
21772+ rc |= IOCTL(rdev->node, inode, file, cmd, arg);
21773+ }
21774+ }
21775+ return (rc);
21776+}
21777+
21778+
21779+static int evms_md_add_virtual_spare (mddev_t *mddev, kdev_t dev)
21780+{
21781+ mdk_rdev_t *rdev;
21782+ mdp_disk_t *disk = NULL;
21783+ int i;
21784+
21785+ if (evms_md_find_rdev(mddev,dev))
21786+ return -EEXIST;
21787+
21788+ LOG_ENTRY_EXIT("%s ENTRY\n", __FUNCTION__);
21789+ if ((rdev = kmalloc(sizeof(*rdev),GFP_KERNEL)) == NULL)
21790+ return -ENOMEM;
21791+
21792+ memset(rdev, 0, sizeof(*rdev));
21793+
21794+ for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
21795+ disk = mddev->sb->disks + i;
21796+ if (!disk->major && !disk->minor)
21797+ break;
21798+ if (disk_removed(disk))
21799+ break;
21800+ }
21801+ if (i == MD_SB_DISKS) {
21802+ LOG_WARNING("%s : [md%d]can not hot-add to full array!\n", __FUNCTION__, mdidx(mddev));
21803+ kfree(rdev);
21804+ return -EBUSY;
21805+ }
21806+
21807+ if (disk_removed(disk)) {
21808+ /*
21809+ * reuse slot
21810+ */
21811+ if (disk->number != i) {
21812+ MD_BUG();
21813+ kfree(rdev);
21814+ return -EINVAL;
21815+ }
21816+ } else {
21817+ disk->number = i;
21818+ }
21819+
21820+ disk->raid_disk = disk->number;
21821+ disk->major = MAJOR(dev);
21822+ disk->minor = MINOR(dev);
21823+
21824+ mark_disk_spare(disk);
21825+
21826+ rdev->mddev = mddev;
21827+ rdev->dev = dev;
21828+ rdev->desc_nr = disk->number;
21829+ rdev->virtual_spare = 1;
21830+
21831+ /* bind rdev to mddev array */
21832+ list_add(&rdev->all, &all_raid_disks);
21833+ list_add(&rdev->same_set, &mddev->disks);
21834+ MD_INIT_LIST_HEAD(&rdev->pending);
21835+
21836+ mddev->sb->nr_disks++;
21837+ mddev->sb->spare_disks++;
21838+ mddev->sb->working_disks++;
21839+ mddev->nb_dev++;
21840+
21841+ mddev->sb_dirty = 1;
21842+
21843+ evms_md_update_sb(mddev);
21844+
21845+ return 0;
21846+}
21847+
21848+static int evms_md_remove_disk(mddev_t *mddev, kdev_t dev)
21849+{
21850+ mdk_rdev_t *rdev = NULL;
21851+ mdp_disk_t *disk;
21852+ int rc = 0;
21853+
21854+ disk = evms_md_find_disk(mddev,dev);
21855+ if (!disk)
21856+ return -ENODEV;
21857+
21858+ rdev = evms_md_find_rdev(mddev,dev);
21859+
21860+ if (rdev && !rdev->faulty) {
21861+ /*
21862+ * The disk is active in the array,
21863+ * must ask the personality to do it
21864+ */
21865+ if (mddev->pers && mddev->pers->diskop) {
21866+ /* Assume spare, try to remove it first. */
21867+ rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_SPARE);
21868+ if (rc)
21869+ rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
21870+ } else
21871+ rc = -ENOSYS;
21872+ }
21873+
21874+ if (!rc) {
21875+ remove_descriptor(disk,mddev->sb);
21876+ if (rdev)
21877+ kick_rdev_from_array(rdev);
21878+ mddev->sb_dirty = 1;
21879+ evms_md_update_sb(mddev);
21880+
21881+ }
21882+ return rc;
21883+}
21884+
21885+
21886+/*
21887+ * Function: md_direct_ioctl
21888+ *
21889+ * This function provides a method for user-space to communicate directly
21890+ * with a plugin in the kernel.
21891+ */
21892+static int md_direct_ioctl(
21893+ struct inode * inode,
21894+ struct file * file,
21895+ unsigned int cmd,
21896+ unsigned long args )
21897+{
21898+ struct evms_plugin_ioctl_pkt argument;
21899+ kdev_t md_kdev;
21900+ mddev_t *mddev = NULL;
21901+ struct evms_md_ioctl ioctl_arg;
21902+ struct evms_md_kdev device;
21903+ struct evms_md_array_info array_info, *usr_array_info;
21904+ int rc = 0;
21905+
21906+ MOD_INC_USE_COUNT;
21907+
21908+ // Copy user's parameters to kernel space
21909+ if ( copy_from_user(&argument, (struct evms_plugin_ioctl_pkt*)args, sizeof(argument)) ) {
21910+ MOD_DEC_USE_COUNT;
21911+ return -EFAULT;
21912+ }
21913+
21914+ // Make sure this is supposed to be our ioctl.
21915+ if ( argument.feature_id != md_plugin_header.id ) {
21916+ MOD_DEC_USE_COUNT;
21917+ return -EINVAL;
21918+ }
21919+
21920+ // Copy user's md ioctl parameters to kernel space
21921+ if ( copy_from_user(&ioctl_arg,
21922+ (struct evms_md_ioctl*)argument.feature_ioctl_data,
21923+ sizeof(ioctl_arg)) )
21924+ rc = -EFAULT;
21925+ else {
21926+ if (ioctl_arg.mddev_idx < MAX_MD_DEVS) {
21927+ md_kdev = MKDEV(MD_MAJOR, ioctl_arg.mddev_idx);
21928+ mddev = kdev_to_mddev(md_kdev);
21929+ if (mddev == NULL)
21930+ rc = -ENODEV;
21931+ } else
21932+ rc = -ENODEV;
21933+ }
21934+
21935+ if (!rc) {
21936+ switch(argument.feature_command) {
21937+ case EVMS_MD_PERS_IOCTL_CMD:
21938+ if (mddev->pers->md_pers_ioctl == NULL) {
21939+ MOD_DEC_USE_COUNT;
21940+ return -ENOSYS;
21941+ }
21942+ rc = mddev->pers->md_pers_ioctl(mddev,
21943+ ioctl_arg.cmd,
21944+ ioctl_arg.arg);
21945+ copy_to_user((struct evms_md_ioctl*)argument.feature_ioctl_data,
21946+ &ioctl_arg,
21947+ sizeof(ioctl_arg));
21948+ break;
21949+
21950+ case EVMS_MD_ADD:
21951+ if ( copy_from_user(&device,
21952+ (struct evms_md_kdev *)ioctl_arg.arg,
21953+ sizeof(device)) )
21954+ rc = -EFAULT;
21955+ else
21956+ rc = evms_md_add_virtual_spare(mddev,MKDEV(device.major, device.minor));
21957+ break;
21958+
21959+ case EVMS_MD_REMOVE:
21960+ if ( copy_from_user(&device,
21961+ (struct evms_md_kdev *)ioctl_arg.arg,
21962+ sizeof(device)) )
21963+ rc = -EFAULT;
21964+ else
21965+ rc = evms_md_remove_disk(mddev,MKDEV(device.major, device.minor));
21966+ break;
21967+
21968+ case EVMS_MD_ACTIVATE:
21969+ rc = -ENOSYS;
21970+ break;
21971+
21972+ case EVMS_MD_DEACTIVATE:
21973+ rc = -ENOSYS;
21974+ break;
21975+
21976+ case EVMS_MD_GET_ARRAY_INFO:
21977+
21978+ usr_array_info = (struct evms_md_array_info *)ioctl_arg.arg;
21979+ if ( copy_from_user(&array_info, usr_array_info,
21980+ sizeof(array_info)) )
21981+ rc = -EFAULT;
21982+ else {
21983+ array_info.state = 0;
21984+ if (mddev->curr_resync)
21985+ array_info.state |= EVMS_MD_ARRAY_SYNCING;
21986+ copy_to_user(&usr_array_info->state, &array_info.state,
21987+ sizeof(usr_array_info->state));
21988+ if (copy_to_user(array_info.sb, mddev->sb,
21989+ sizeof(mdp_super_t)))
21990+ rc = -EFAULT;
21991+ }
21992+ break;
21993+ default:
21994+ rc = -ENOSYS;
21995+ break;
21996+ }
21997+ }
21998+
21999+ argument.status = rc;
22000+ copy_to_user((struct evms_plugin_ioctl_pkt*)args, &argument, sizeof(argument));
22001+ MOD_DEC_USE_COUNT;
22002+ return rc;
22003+}
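+
+/*
+ * Minimal user-space sketch of driving this path (illustrative only:
+ * the opened device and the plugin id value are assumptions, while the
+ * struct fields and command names come from the code above):
+ *
+ *	struct evms_md_array_info array_info;
+ *	struct evms_md_ioctl md_arg;
+ *	struct evms_plugin_ioctl_pkt pkt;
+ *
+ *	array_info.sb = malloc(MD_SB_BYTES);
+ *	md_arg.mddev_idx = 0;			// md0
+ *	md_arg.arg = &array_info;
+ *	pkt.feature_id = <MD plugin id>;	// must match md_plugin_header.id
+ *	pkt.feature_command = EVMS_MD_GET_ARRAY_INFO;
+ *	pkt.feature_ioctl_data = &md_arg;
+ *	ioctl(fd, EVMS_PLUGIN_IOCTL, &pkt);	// pkt.status carries the rc
+ */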
22004+
22005+
22006+
22007+
22008+void evms_md_add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
22009+{
22010+ unsigned int minor = MINOR(dev);
22011+
22012+ if (MAJOR(dev) != MD_MAJOR) {
22013+ MD_BUG();
22014+ return;
22015+ }
22016+ if (evms_mddev_map[minor].mddev != NULL) {
22017+ MD_BUG();
22018+ return;
22019+ }
22020+ evms_mddev_map[minor].mddev = mddev;
22021+ evms_mddev_map[minor].data = data;
22022+}
22023+
22024+void evms_md_del_mddev_mapping (mddev_t * mddev, kdev_t dev)
22025+{
22026+ unsigned int minor = MINOR(dev);
22027+
22028+ if (MAJOR(dev) != MD_MAJOR) {
22029+ MD_BUG();
22030+ return;
22031+ }
22032+ if (evms_mddev_map[minor].mddev != mddev) {
22033+ MD_BUG();
22034+ return;
22035+ }
22036+ evms_mddev_map[minor].mddev = NULL;
22037+ evms_mddev_map[minor].data = NULL;
22038+}
22039+
22040+static mddev_t * alloc_mddev (kdev_t dev)
22041+{
22042+ mddev_t *mddev;
22043+
22044+ if (MAJOR(dev) != MD_MAJOR) {
22045+ MD_BUG();
22046+ return 0;
22047+ }
22048+ mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
22049+ if (!mddev)
22050+ return NULL;
22051+
22052+ memset(mddev, 0, sizeof(*mddev));
22053+
22054+ mddev->__minor = MINOR(dev);
22055+ init_MUTEX(&mddev->reconfig_sem);
22056+ init_MUTEX(&mddev->recovery_sem);
22057+ init_MUTEX(&mddev->resync_sem);
22058+ INIT_LIST_HEAD(&mddev->disks);
22059+ INIT_LIST_HEAD(&mddev->all_mddevs);
22060+ INIT_LIST_HEAD(&mddev->incomplete_mddevs);
22061+ INIT_LIST_HEAD(&mddev->running_mddevs);
22062+ mddev->active = (atomic_t)ATOMIC_INIT(0);
22063+ mddev->recovery_active = (atomic_t)ATOMIC_INIT(0);
22064+
22065+ /*
22066+ * The 'base' mddev is the one with data NULL.
22067+ * personalities can create additional mddevs
22068+ * if necessary.
22069+ */
22070+ evms_md_add_mddev_mapping(mddev, dev, 0);
22071+ list_add(&mddev->all_mddevs, &all_mddevs);
22072+
22073+ MOD_INC_USE_COUNT;
22074+ evms_md_create_recovery_thread();
22075+
22076+ return mddev;
22077+}
22078+
22079+mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr)
22080+{
22081+ mdk_rdev_t * rdev;
22082+ struct list_head *tmp;
22083+
22084+ ITERATE_RDEV(mddev,rdev,tmp) {
22085+ if (rdev->desc_nr == nr)
22086+ return rdev;
22087+ }
22088+ return NULL;
22089+}
22090+
22091+
22092+mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev)
22093+{
22094+ struct list_head *tmp;
22095+ mdk_rdev_t *rdev;
22096+
22097+ ITERATE_RDEV(mddev,rdev,tmp) {
22098+ if (rdev->dev == dev)
22099+ return rdev;
22100+ }
22101+ return NULL;
22102+}
22103+
22104+mdk_rdev_t * evms_md_find_rdev_from_node(mddev_t * mddev, struct evms_logical_node * node)
22105+{
22106+ struct list_head *tmp;
22107+ mdk_rdev_t *rdev;
22108+
22109+ ITERATE_RDEV(mddev,rdev,tmp) {
22110+ if (rdev->node == node)
22111+ return rdev;
22112+ }
22113+ return NULL;
22114+}
22115+
22116+static MD_LIST_HEAD(device_names);
22117+
22118+static char * org_partition_name (kdev_t dev)
22119+{
22120+ struct gendisk *hd;
22121+ static char nomem [] = "<nomem>";
22122+ dev_name_t *dname;
22123+ struct list_head *tmp = device_names.next;
22124+
22125+ while (tmp != &device_names) {
22126+ dname = list_entry(tmp, dev_name_t, list);
22127+ if (dname->dev == dev)
22128+ return dname->name;
22129+ tmp = tmp->next;
22130+ }
22131+
22132+ dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
22133+
22134+ if (!dname)
22135+ return nomem;
22136+ /*
22137+ * ok, add this new device name to the list
22138+ */
22139+ hd = get_gendisk (dev);
22140+ dname->name = NULL;
22141+ if (hd)
22142+ dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
22143+ if (!dname->name) {
22144+ sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
22145+ dname->name = dname->namebuf;
22146+ }
22147+
22148+ dname->dev = dev;
22149+ MD_INIT_LIST_HEAD(&dname->list);
22150+ list_add(&dname->list, &device_names);
22151+
22152+ return dname->name;
22153+}
22154+
22155+
22156+#define EVMS_MD_NULL_PARTITION_NAME "<EVMS_NODE_NO_NAME>"
22157+char * evms_md_partition_name (struct evms_logical_node *node)
22158+{
22159+ if (node && node->name)
22160+ return node->name;
22161+ else
22162+ return EVMS_MD_NULL_PARTITION_NAME;
22163+}
22164+
22165+static char * get_partition_name (mdk_rdev_t *rdev)
22166+{
22167+ if (rdev->node)
22168+ return evms_md_partition_name(rdev->node);
22169+ else
22170+ return org_partition_name(rdev->dev);
22171+}
22172+
22173+/*
22174+ * Function: evms_md_calc_dev_sboffset
22175+ * return the LSN for md super block.
22176+ */
22177+static u64 evms_md_calc_dev_sboffset (struct evms_logical_node *node,mddev_t *mddev, int persistent)
22178+{
22179+ u64 size = 0;
22180+
22181+ size = node->total_vsectors;
22182+ if (persistent) {
22183+ size = MD_NEW_SIZE_SECTORS(size);
22184+ }
22185+ return size; /* size in sectors */
22186+}
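+
+/*
+ * For reference (definition as in the stock 2.4 <linux/raid/md_p.h>,
+ * assumed unchanged by this patch):
+ *
+ *	#define MD_NEW_SIZE_SECTORS(x) \
+ *		((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
+ *
+ * i.e. round the device size down to a 64KB boundary, then step back one
+ * reserved 64KB block; the superblock lives in that last reserved block.
+ */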
22187+
22188+/*
22189+ * Function: evms_md_calc_dev_size
22190+ * return data size (in blocks) for an "extended" device.
22191+ */
22192+static unsigned long evms_md_calc_dev_size (struct evms_logical_node *node,
22193+ mddev_t *mddev,
22194+ int persistent)
22195+{
22196+ unsigned long size;
22197+ u64 size_in_sectors;
22198+
22199+ size_in_sectors = evms_md_calc_dev_sboffset(node, mddev, persistent);
22200+ size = size_in_sectors >> 1;
22201+ if (!mddev->sb) {
22202+ MD_BUG();
22203+ return size;
22204+ }
22205+ if (mddev->sb->chunk_size)
22206+ size &= ~(mddev->sb->chunk_size/1024 - 1);
22207+ return size;
22208+}
22209+
22210+static unsigned int zoned_raid_size (mddev_t *mddev)
22211+{
22212+ unsigned int mask;
22213+ mdk_rdev_t * rdev;
22214+ struct list_head *tmp;
22215+
22216+ if (!mddev->sb) {
22217+ MD_BUG();
22218+ return -EINVAL;
22219+ }
22220+ /*
22221+ * do size and offset calculations.
22222+ */
22223+ mask = ~(mddev->sb->chunk_size/1024 - 1);
22224+
22225+ ITERATE_RDEV(mddev,rdev,tmp) {
22226+ rdev->size &= mask;
22227+ evms_md_size[mdidx(mddev)] += rdev->size;
22228+ }
22229+ return 0;
22230+}
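+
+/*
+ * Example of the mask arithmetic above (illustrative): with a 64KB
+ * chunk_size, mask = ~(65536/1024 - 1) = ~63, so "rdev->size &= mask"
+ * rounds each member's size (counted in 1KB blocks) down to a 64KB
+ * multiple before it is added to evms_md_size[].
+ */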
22231+
22232+/*
22233+ * We check whether all devices are numbered from 0 to nb_dev-1. The
22234+ * order is guaranteed even after device name changes.
22235+ *
22236+ * Some personalities (raid0, linear) use this. Personalities that
22237+ * provide data have to be able to deal with loss of individual
22238+ * disks, so they do their checking themselves.
22239+ */
22240+int evms_md_check_ordering (mddev_t *mddev)
22241+{
22242+ int i, c;
22243+ mdk_rdev_t *rdev;
22244+ struct list_head *tmp;
22245+
22246+ /*
22247+ * First, all devices must be fully functional
22248+ */
22249+ ITERATE_RDEV(mddev,rdev,tmp) {
22250+ if (rdev->faulty) {
22251+ LOG_ERROR("evms_md_check_ordering() md%d's device %s faulty, aborting.\n",
22252+ mdidx(mddev), get_partition_name(rdev));
22253+ goto abort;
22254+ }
22255+ }
22256+
22257+ c = 0;
22258+ ITERATE_RDEV(mddev,rdev,tmp) {
22259+ c++;
22260+ }
22261+ if (c != mddev->nb_dev) {
22262+ MD_BUG();
22263+ goto abort;
22264+ }
22265+ if (mddev->nb_dev != mddev->sb->raid_disks) {
22266+ LOG_ERROR("%s: [md%d] array needs %d disks, has %d, aborting.\n",
22267+ __FUNCTION__, mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
22268+ goto abort;
22269+ }
22270+ /*
22271+ * Now the numbering check
22272+ */
22273+ for (i = 0; i < mddev->nb_dev; i++) {
22274+ c = 0;
22275+ ITERATE_RDEV(mddev,rdev,tmp) {
22276+ if (rdev->desc_nr == i)
22277+ c++;
22278+ }
22279+ if (!c) {
22280+ LOG_ERROR("md%d, missing disk #%d, aborting.\n",mdidx(mddev), i);
22281+ goto abort;
22282+ }
22283+ if (c > 1) {
22284+ LOG_ERROR("md%d, too many disks #%d, aborting.\n",mdidx(mddev), i);
22285+ goto abort;
22286+ }
22287+ }
22288+ return 0;
22289+abort:
22290+ return 1;
22291+}
22292+
22293+static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
22294+{
22295+ if (disk_active(disk)) {
22296+ sb->working_disks--;
22297+ } else {
22298+ if (disk_spare(disk)) {
22299+ sb->spare_disks--;
22300+ sb->working_disks--;
22301+ } else {
22302+ sb->failed_disks--;
22303+ }
22304+ }
22305+ sb->nr_disks--;
22306+ disk->major = disk->minor = 0;
22307+ mark_disk_removed(disk);
22308+}
22309+
22310+#define BAD_MINOR \
22311+"%s: invalid raid minor (%x)\n"
22312+
22313+#define NO_SB \
22314+"disabled device %s, could not read superblock.\n"
22315+
22316+#define BAD_CSUM \
22317+"invalid superblock checksum on %s\n"
22318+
22319+
22320+static int alloc_array_sb (mddev_t * mddev)
22321+{
22322+ if (mddev->sb) {
22323+ MD_BUG();
22324+ return 0;
22325+ }
22326+
22327+ mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
22328+ if (!mddev->sb) {
22329+ LOG_ERROR("%s: Out of memory!\n", __FUNCTION__);
22330+ return -ENOMEM;
22331+ }
22332+ md_clear_page(mddev->sb);
22333+ return 0;
22334+}
22335+
22336+static int alloc_disk_sb (mdk_rdev_t * rdev)
22337+{
22338+ if (rdev->sb)
22339+ MD_BUG();
22340+
22341+ rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
22342+ if (!rdev->sb) {
22343+ LOG_ERROR("%s: Out of memory!\n", __FUNCTION__);
22344+ return -EINVAL;
22345+ }
22346+ md_clear_page(rdev->sb);
22347+
22348+ return 0;
22349+}
22350+
22351+/*
22352+ * Function: free_disk_sb
22353+ *
22354+ */
22355+static void free_disk_sb (mdk_rdev_t * rdev)
22356+{
22357+ if (rdev->sb) {
22358+ free_page((unsigned long) rdev->sb);
22359+ rdev->sb = NULL;
22360+ rdev->sb_offset = 0;
22361+ rdev->size = 0;
22362+ } else {
22363+ if (!rdev->virtual_spare && !rdev->faulty)
22364+ MD_BUG();
22365+ }
22366+}
22367+
22368+/*
22369+ * Function: evms_md_read_disk_sb
22370+ * Read the MD superblock.
22371+ */
22372+static int evms_md_read_disk_sb (mdk_rdev_t * rdev)
22373+{
22374+ int rc = 0;
22375+ struct evms_logical_node *node = rdev->node;
22376+ u64 sb_offset_in_sectors;
22377+
22378+ if (!rdev->sb) {
22379+ MD_BUG();
22380+ return -EINVAL;
22381+ }
22382+ if (node->total_vsectors <= MD_RESERVED_SECTORS) {
22383+ LOG_DETAILS("%s is too small, total_vsectors("PFU64")\n",
22384+ evms_md_partition_name(node), node->total_vsectors);
22385+ return -EINVAL;
22386+ }
22387+
22388+ /*
22389+ * Calculate the position of the superblock,
22390+ * it's at the end of the disk
22391+ */
22392+ sb_offset_in_sectors = evms_md_calc_dev_sboffset(node, rdev->mddev, 1);
22393+ rdev->sb_offset = (unsigned long)(sb_offset_in_sectors >> 1);
22394+ LOG_DEBUG("(read) %s's sb offset("PFU64") total_vsectors("PFU64")\n",
22395+ evms_md_partition_name(node), sb_offset_in_sectors, node->total_vsectors);
22396+
22397+ /*
22398+ * Read superblock
22399+ */
22400+ rc = INIT_IO(node, 0, sb_offset_in_sectors, MD_SB_SECTORS, rdev->sb);
22401+
22402+ return rc;
22403+}
22404+
22405+static unsigned int calc_sb_csum (mdp_super_t * sb)
22406+{
22407+ unsigned int disk_csum, csum;
22408+
22409+ disk_csum = sb->sb_csum;
22410+ sb->sb_csum = 0;
22411+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
22412+ sb->sb_csum = disk_csum;
22413+ return csum;
22414+}
22415+
22416+
22417+
22418+/*
22419+ * Check one RAID superblock for generic plausibility
22420+ */
22421+
22422+static int check_disk_sb (mdk_rdev_t * rdev)
22423+{
22424+ mdp_super_t *sb;
22425+ int ret = -EINVAL;
22426+
22427+ sb = rdev->sb;
22428+ if (!sb) {
22429+ MD_BUG();
22430+ goto abort;
22431+ }
22432+
22433+ if (sb->md_magic != MD_SB_MAGIC) {
22434+ goto abort;
22435+ }
22436+
22437+ if (sb->md_minor >= MAX_MD_DEVS) {
22438+ LOG_ERROR(BAD_MINOR, get_partition_name(rdev), sb->md_minor);
22439+ goto abort;
22440+ }
22441+ if (calc_sb_csum(sb) != sb->sb_csum) {
22442+ LOG_ERROR(BAD_CSUM, get_partition_name(rdev));
22443+ goto abort;
22444+ }
22445+
22446+ switch (sb->level) {
22447+ case -1:
22448+ case 0:
22449+ case 1:
22450+ case 5:
22451+ break;
22452+ default:
22453+ LOG_ERROR("%s: EVMS MD does not support MD level %d\n", __FUNCTION__, sb->level);
22454+ goto abort;
22455+ }
22456+ ret = 0;
22457+abort:
22458+ return ret;
22459+}
22460+
22461+static kdev_t dev_unit(kdev_t dev)
22462+{
22463+ unsigned int mask;
22464+ struct gendisk *hd = get_gendisk(dev);
22465+
22466+ if (!hd)
22467+ return 0;
22468+ mask = ~((1 << hd->minor_shift) - 1);
22469+
22470+ return MKDEV(MAJOR(dev), MINOR(dev) & mask);
22471+}
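+
+/*
+ * e.g. for IDE disks (minor_shift == 6), /dev/hda1 (3,1) and /dev/hda2
+ * (3,2) both collapse to unit (3,0); match_dev_unit() below uses this
+ * to spot two array members sharing one physical disk.
+ */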
22472+
22473+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
22474+{
22475+ struct list_head *tmp;
22476+ mdk_rdev_t *rdev;
22477+
22478+ ITERATE_RDEV(mddev,rdev,tmp)
22479+ if (dev_unit(rdev->dev) == dev_unit(dev))
22480+ return rdev;
22481+
22482+ return NULL;
22483+}
22484+
22485+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
22486+{
22487+ struct list_head *tmp;
22488+ mdk_rdev_t *rdev;
22489+
22490+ ITERATE_RDEV(mddev1,rdev,tmp)
22491+ if (match_dev_unit(mddev2, rdev->dev))
22492+ return 1;
22493+
22494+ return 0;
22495+}
22496+
22497+
22498+static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
22499+{
22500+ mdk_rdev_t *same_pdev;
22501+
22502+ if (rdev->mddev) {
22503+ MD_BUG();
22504+ return;
22505+ }
22506+
22507+ same_pdev = match_dev_unit(mddev, rdev->dev);
22508+ if (same_pdev)
22509+ LOG_WARNING("[md%d] WARNING: %s appears to be on the same physical disk as %s. True\n"
22510+ " protection against single-disk failure might be compromised.\n",
22511+ mdidx(mddev), get_partition_name(rdev),get_partition_name(same_pdev));
22512+
22513+ list_add(&rdev->same_set, &mddev->disks);
22514+ rdev->mddev = mddev;
22515+ mddev->nb_dev++;
22516+ if (rdev->sb && disk_active(&rdev->sb->this_disk))
22517+ mddev->nr_raid_disks++;
22518+ LOG_DETAILS("bind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev);
22519+}
22520+
22521+static void unbind_rdev_from_array (mdk_rdev_t * rdev)
22522+{
22523+ if (!rdev->mddev) {
22524+ MD_BUG();
22525+ return;
22526+ }
22527+ list_del(&rdev->same_set);
22528+ MD_INIT_LIST_HEAD(&rdev->same_set);
22529+ rdev->mddev->nb_dev--;
22530+ if (rdev->sb && disk_active(&rdev->sb->this_disk))
22531+ rdev->mddev->nr_raid_disks--;
22532+ LOG_DETAILS("unbind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev);
22533+ rdev->mddev = NULL;
22534+}
22535+
22536+
22537+/*
22538+ * Function: evms_md_export_rdev
22539+ * EVMS MD version of export_rdev()
22540+ * Discard this MD "extended" device
22541+ */
22542+static void evms_md_export_rdev (mdk_rdev_t * rdev, int delete_node)
22543+{
22544+ LOG_DETAILS("%s: (%s)\n", __FUNCTION__ , get_partition_name(rdev));
22545+ if (rdev->mddev)
22546+ MD_BUG();
22547+ free_disk_sb(rdev);
22548+ list_del(&rdev->all);
22549+ MD_INIT_LIST_HEAD(&rdev->all);
22550+ if (rdev->pending.next != &rdev->pending) {
22551+ LOG_WARNING("%s: (%s was pending)\n",__FUNCTION__ ,get_partition_name(rdev));
22552+ list_del(&rdev->pending);
22553+ MD_INIT_LIST_HEAD(&rdev->pending);
22554+ }
22555+ if (rdev->node && delete_node) {
22556+ if (cur_discover_list) {
22557+ LOG_DETAILS("%s: remove (%s) from discover list.\n", __FUNCTION__,
22558+ get_partition_name(rdev));
22559+ evms_cs_remove_logical_node_from_list(cur_discover_list, rdev->node);
22560+ }
22561+ LOG_DETAILS("%s: deleting node %s\n", __FUNCTION__, get_partition_name(rdev));
22562+ DELETE(rdev->node);
22563+ rdev->node = NULL;
22564+ }
22565+ rdev->dev = 0;
22566+ rdev->faulty = 0;
22567+ kfree(rdev);
22568+}
22569+
22570+
22571+static void kick_rdev_from_array (mdk_rdev_t * rdev)
22572+{
22573+ LOG_DEFAULT("%s: (%s)\n", __FUNCTION__,get_partition_name(rdev));
22574+ unbind_rdev_from_array(rdev);
22575+ evms_md_export_rdev(rdev, TRUE);
22576+}
22577+
22578+static void export_array (mddev_t *mddev)
22579+{
22580+ struct list_head *tmp;
22581+ mdk_rdev_t *rdev;
22582+ mdp_super_t *sb = mddev->sb;
22583+
22584+ LOG_DEFAULT("%s: [md%d]\n",__FUNCTION__ ,mdidx(mddev));
22585+ if (mddev->sb) {
22586+ mddev->sb = NULL;
22587+ free_page((unsigned long) sb);
22588+ }
22589+
22590+ LOG_DEBUG("%s: removing all extended devices belong to md%d\n",__FUNCTION__,mdidx(mddev));
22591+ ITERATE_RDEV(mddev,rdev,tmp) {
22592+ if (!rdev->mddev) {
22593+ MD_BUG();
22594+ continue;
22595+ }
22596+ kick_rdev_from_array(rdev);
22597+ }
22598+ if (mddev->nb_dev)
22599+ MD_BUG();
22600+}
22601+
22602+static void free_mddev (mddev_t *mddev)
22603+{
22604+ struct evms_logical_node *node;
22605+ struct evms_md *evms_md;
22606+
22607+ if (!mddev) {
22608+ MD_BUG();
22609+ return;
22610+ }
22611+
22612+ node = mddev->node;
22613+
22614+ export_array(mddev);
22615+ evms_md_size[mdidx(mddev)] = 0;
22616+
22617+
22618+ /*
22619+ * Make sure nobody else is using this mddev
22620+ * (careful, we rely on the global kernel lock here)
22621+ */
22622+ while (atomic_read(&mddev->resync_sem.count) != 1)
22623+ schedule();
22624+ while (atomic_read(&mddev->recovery_sem.count) != 1)
22625+ schedule();
22626+
22627+ evms_md_del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
22628+ list_del(&mddev->all_mddevs);
22629+ INIT_LIST_HEAD(&mddev->all_mddevs);
22630+ if (!list_empty(&mddev->running_mddevs)) {
22631+ list_del(&mddev->running_mddevs);
22632+ INIT_LIST_HEAD(&mddev->running_mddevs);
22633+ }
22634+ if (!list_empty(&mddev->incomplete_mddevs)) {
22635+ list_del(&mddev->incomplete_mddevs);
22636+ INIT_LIST_HEAD(&mddev->incomplete_mddevs);
22637+ }
22638+
22639+ kfree(mddev);
22640+ if (node) {
22641+ evms_md = node->private;
22642+ evms_md->mddev = NULL;
22643+ }
22644+ MOD_DEC_USE_COUNT;
22645+ evms_md_destroy_recovery_thread();
22646+}
22647+
22648+
22649+static void print_desc(mdp_disk_t *desc)
22650+{
22651+ printk(" DISK<N:%d,R:%d,S:%d>\n", desc->number,
22652+ desc->raid_disk,desc->state);
22653+}
22654+
22655+static void print_sb(mdp_super_t *sb)
22656+{
22657+ int i;
22658+
22659+ printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
22660+ sb->major_version, sb->minor_version, sb->patch_version,
22661+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
22662+ sb->ctime);
22663+ printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
22664+ sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
22665+ sb->layout, sb->chunk_size);
22666+ printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%x\n",
22667+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
22668+ sb->failed_disks, sb->spare_disks,
22669+ sb->sb_csum, sb->events_lo);
22670+
22671+ for (i = 0; i < MD_SB_DISKS; i++) {
22672+ mdp_disk_t *desc;
22673+
22674+ desc = sb->disks + i;
22675+ if (desc->number || desc->major || desc->minor || desc->raid_disk || (desc->state && (desc->state != 4))) {
22676+ printk(" D %2d: ", i);
22677+ print_desc(desc);
22678+ }
22679+ }
22680+ printk(" THIS: ");
22681+ print_desc(&sb->this_disk);
22682+
22683+}
22684+
22685+static void print_rdev(mdk_rdev_t *rdev)
22686+{
22687+ printk("rdev %s: SZ:%08ld F:%d DN:%d ",
22688+ get_partition_name(rdev),
22689+ rdev->size, rdev->faulty, rdev->desc_nr);
22690+ if (rdev->sb) {
22691+ printk("rdev superblock:\n");
22692+ print_sb(rdev->sb);
22693+ } else
22694+ printk("no rdev superblock!\n");
22695+}
22696+
22697+void evms_md_print_devices (void)
22698+{
22699+ struct list_head *tmp, *tmp2;
22700+ mdk_rdev_t *rdev;
22701+ mddev_t *mddev;
22702+
22703+ printk("\n");
22704+ printk(": **********************************\n");
22705+ printk(": * <COMPLETE RAID STATE PRINTOUT> *\n");
22706+ printk(": **********************************\n");
22707+ ITERATE_MDDEV(mddev,tmp) {
22708+ printk("md%d: ", mdidx(mddev));
22709+
22710+ ITERATE_RDEV(mddev,rdev,tmp2)
22711+ printk("<%s>", get_partition_name(rdev));
22712+
22713+ if (mddev->sb) {
22714+ printk(" array superblock:\n");
22715+ print_sb(mddev->sb);
22716+ } else
22717+ printk(" no array superblock.\n");
22718+
22719+ ITERATE_RDEV(mddev,rdev,tmp2)
22720+ print_rdev(rdev);
22721+ }
22722+ printk(": **********************************\n");
22723+ printk("\n");
22724+}
22725+
22726+static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
22727+{
22728+ int ret;
22729+ mdp_super_t *tmp1, *tmp2;
22730+
22731+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
22732+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
22733+
22734+ if (!tmp1 || !tmp2) {
22735+ ret = 0;
22736+ printk(KERN_INFO "md.c: sb_equal(): out of memory!\n");
22737+ goto abort;
22738+ }
22739+
22740+ *tmp1 = *sb1;
22741+ *tmp2 = *sb2;
22742+
22743+ /*
22744+ * nr_disks is not constant
22745+ */
22746+ tmp1->nr_disks = 0;
22747+ tmp2->nr_disks = 0;
22748+
22749+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
22750+ ret = 0;
22751+ else
22752+ ret = 1;
22753+
22754+abort:
22755+ if (tmp1)
22756+ kfree(tmp1);
22757+ if (tmp2)
22758+ kfree(tmp2);
22759+
22760+ return ret;
22761+}
22762+
22763+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
22764+{
22765+ if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
22766+ (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
22767+ (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
22768+ (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
22769+
22770+ return 1;
22771+
22772+ return 0;
22773+}
22774+
22775+/*
22776+ * Function: evms_md_find_rdev_all
22777+ * EVMS MD version of find_rdev_all()
22778+ * Search entire all_raid_disks for "node"
22779+ * Return the MD "extended" device if found.
22780+ */
22781+static mdk_rdev_t * evms_md_find_rdev_all (struct evms_logical_node *node)
22782+{
22783+ struct list_head *tmp;
22784+ mdk_rdev_t *rdev;
22785+
22786+ tmp = all_raid_disks.next;
22787+ while (tmp != &all_raid_disks) {
22788+ rdev = list_entry(tmp, mdk_rdev_t, all);
22789+ if (rdev->node == node)
22790+ return rdev;
22791+ tmp = tmp->next;
22792+ }
22793+ return NULL;
22794+}
22795+
22796+/*
22797+ * Function: evms_md_find_mddev_all
22798+ */
22799+static mddev_t * evms_md_find_mddev_all (struct evms_logical_node *node)
22800+{
22801+ struct list_head *tmp;
22802+ mddev_t *mddev;
22803+
22804+ ITERATE_MDDEV(mddev,tmp) {
22805+ if (mddev->node == node)
22806+ return mddev;
22807+ }
22808+ return NULL;
22809+}
22810+
22811+
22812+/*
22813+ * Function: evms_md_write_disk_sb
22814+ * EVMS MD version of write_disk_sb
22815+ */
22816+static int evms_md_write_disk_sb(mdk_rdev_t * rdev)
22817+{
22818+ unsigned long size;
22819+ u64 sb_offset_in_sectors;
22820+
22821+ if (!rdev->sb) {
22822+ MD_BUG();
22823+ return 1;
22824+ }
22825+ if (rdev->faulty) {
22826+ MD_BUG();
22827+ return 1;
22828+ }
22829+ if (rdev->sb->md_magic != MD_SB_MAGIC) {
22830+ MD_BUG();
22831+ return 1;
22832+ }
22833+
22834+ sb_offset_in_sectors = evms_md_calc_dev_sboffset(rdev->node, rdev->mddev, 1);
22835+ if (rdev->sb_offset != (sb_offset_in_sectors >> 1)) {
22836+ LOG_WARNING("%s's sb offset has changed from blocks(%ld) to blocks(%ld), skipping\n",
22837+ get_partition_name(rdev),
22838+ rdev->sb_offset,
22839+ (unsigned long)(sb_offset_in_sectors >> 1));
22840+ goto skip;
22841+ }
22842+ /*
22843+ * If the disk went offline meanwhile and it's just a spare, then
22844+ * its size has changed to zero silently, and the MD code does
22845+ * not yet know that it's faulty.
22846+ */
22847+ size = evms_md_calc_dev_size(rdev->node, rdev->mddev, 1);
22848+ if (size != rdev->size) {
22849+ LOG_WARNING("%s's size has changed from %ld to %ld since import, skipping\n",
22850+ get_partition_name(rdev), rdev->size, size);
22851+ goto skip;
22852+ }
22853+
22854+ LOG_DETAILS("(write) %s's sb offset: "PFU64"\n",get_partition_name(rdev), sb_offset_in_sectors);
22855+
22856+ INIT_IO(rdev->node,WRITE,sb_offset_in_sectors,MD_SB_SECTORS,rdev->sb);
22857+
22858+skip:
22859+ return 0;
22860+}
22861+
22862+static int evms_md_sync_sbs(mddev_t * mddev)
22863+{
22864+ mdk_rdev_t *rdev;
22865+ struct list_head *tmp;
22866+ mdp_disk_t * disk;
22867+
22868+ ITERATE_RDEV(mddev,rdev,tmp) {
22869+ if (rdev->virtual_spare || rdev->faulty)
22870+ continue;
22871+
22872+ /* copy everything from the master */
22873+ memcpy(rdev->sb, mddev->sb, sizeof(mdp_super_t));
22874+
22875+ /* this_disk is unique, copy it from the master */
22876+// rdev->sb->this_disk = mddev->sb->disks[rdev->desc_nr];
22877+ // use the SB disk array, since if the update occurred on a normal
22878+ // shutdown the rdevs may be out of date.
22879+ disk = evms_md_find_disk(mddev, rdev->dev);
22880+ if (disk) {
22881+ rdev->sb->this_disk = *disk;
22882+ }
22883+
22884+ rdev->sb->sb_csum = calc_sb_csum(rdev->sb);
22885+ }
22886+ return 0;
22887+}
22888+
22889+static int evms_md_update_sb_sync(mddev_t * mddev, int clean)
22890+{
22891+ mdk_rdev_t *rdev;
22892+ struct list_head *tmp;
22893+ int rc = 0;
22894+ int found = FALSE;
22895+
22896+ ITERATE_RDEV(mddev,rdev,tmp) {
22897+
22898+ if (rdev->virtual_spare || rdev->faulty)
22899+ continue;
22900+
22901+ if ((rc = evms_md_read_disk_sb(rdev))) {
22902+ LOG_ERROR("%s: error reading superblock on %s!\n",
22903+ __FUNCTION__, evms_md_partition_name(rdev->node));
22904+ break;
22905+ }
22906+
22907+ if ((rc = check_disk_sb(rdev))) {
22908+ LOG_ERROR("%s: %s has invalid sb!\n",
22909+ __FUNCTION__, evms_md_partition_name(rdev->node));
22910+ break;
22911+ }
22912+
22913+ rdev->desc_nr = rdev->sb->this_disk.number;
22914+ rdev->dev = MKDEV(rdev->sb->this_disk.major, rdev->sb->this_disk.minor);
22915+
22916+ /* copy master superblock from the first good rdev */
22917+ if (!found) {
22918+ found = TRUE;
22919+ memcpy(mddev->sb, rdev->sb, sizeof(mdp_super_t));
22920+ if (clean)
22921+ mddev->sb->state |= 1 << MD_SB_CLEAN;
22922+ else
22923+ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
22924+ }
22925+ }
22926+ if (!rc && found) {
22927+ evms_md_update_sb(mddev);
22928+ } else {
22929+ LOG_SERIOUS("%s: BUG! BUG! superblocks will not be updated!\n", __FUNCTION__);
22930+ }
22931+ return rc;
22932+
22933+}
22934+
22935+int evms_md_update_sb(mddev_t * mddev)
22936+{
22937+ int err, count = 100;
22938+ struct list_head *tmp;
22939+ mdk_rdev_t *rdev;
22940+
22941+
22942+repeat:
22943+ mddev->sb->utime = CURRENT_TIME;
22944+ if ((++mddev->sb->events_lo)==0)
22945+ ++mddev->sb->events_hi;
22946+
22947+ if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
22948+ /*
22949+ * oops, this 64-bit counter should never wrap.
22950+ * Either we are in around ~1 trillion A.C., assuming
22951+ * 1 reboot per second, or we have a bug:
22952+ */
22953+ MD_BUG();
22954+ mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
22955+ }
22956+ evms_md_sync_sbs(mddev);
22957+
22958+ /*
22959+ * do not write anything to disk if using
22960+ * nonpersistent superblocks
22961+ */
22962+ if (mddev->sb->not_persistent)
22963+ return 0;
22964+
22965+ LOG_DETAILS("%s: updating [md%d] superblock\n",__FUNCTION__ ,mdidx(mddev));
22966+
22967+ err = 0;
22968+ ITERATE_RDEV(mddev,rdev,tmp) {
22969+ if (!rdev->virtual_spare && !rdev->faulty) {
22970+ LOG_DETAILS(" %s [events: %x]",
22971+ get_partition_name(rdev),
22972+ rdev->sb->events_lo);
22973+ err += evms_md_write_disk_sb(rdev);
22974+ } else {
22975+ if (rdev->faulty)
22976+ LOG_DETAILS(" skipping faulty %s\n", get_partition_name(rdev));
22977+ if (rdev->virtual_spare)
22978+ LOG_DETAILS(" skipping virtual spare.\n");
22979+ }
22980+ }
22981+ if (err) {
22982+ if (--count) {
22983+ LOG_WARNING("errors occurred during superblock update, repeating\n");
22984+ goto repeat;
22985+ }
22986+ LOG_ERROR("excessive errors occurred during superblock update, exiting\n");
22987+ }
22988+ return 0;
22989+}
22990+
22991+/*
22992+ * Function: evms_md_import_device
22993+ * Ensure that the node is not yet imported.
22994+ * Read and validate the MD super block on this device
22995+ * Add to the global MD "extended" devices list (all_raid_disks)
22996+ *
22997+ */
22998+static int evms_md_import_device (struct evms_logical_node **discover_list,
22999+ struct evms_logical_node *node)
23000+{
23001+ int err;
23002+ mdk_rdev_t *rdev;
23003+
23004+ LOG_ENTRY_EXIT("%s: discovering %s\n",__FUNCTION__,evms_md_partition_name(node));
23005+
23006+ if (evms_md_find_rdev_all(node)) {
23007+ LOG_DEBUG("%s exists\n", evms_md_partition_name(node));
23008+ return -EEXIST;
23009+ }
23010+
23011+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
23012+ if (!rdev) {
23013+ LOG_ERROR("could not alloc mem for %s!\n", evms_md_partition_name(node));
23014+ return -ENOMEM;
23015+ }
23016+ memset(rdev, 0, sizeof(*rdev));
23017+
23018+ if ((err = alloc_disk_sb(rdev)))
23019+ goto abort_free;
23020+
23021+ rdev->node = node; /* set this for evms_md_read_disk_sb() */
23022+
23023+ rdev->desc_nr = -1;
23024+ rdev->faulty = 0;
23025+
23026+ if (!node->total_vsectors) {
23027+ LOG_ERROR("%s has zero size!\n", evms_md_partition_name(node));
23028+ err = -EINVAL;
23029+ goto abort_free;
23030+ }
23031+
23032+ if ((err = evms_md_read_disk_sb(rdev))) {
23033+ LOG_EXTRA("could not read %s's sb, not importing!\n",evms_md_partition_name(node));
23034+ goto abort_free;
23035+ }
23036+ if ((err = check_disk_sb(rdev))) {
23037+ LOG_EXTRA("%s has invalid sb, not importing!\n",evms_md_partition_name(node));
23038+ goto abort_free;
23039+ }
23040+ rdev->desc_nr = rdev->sb->this_disk.number;
23041+ rdev->dev = MKDEV(rdev->sb->this_disk.major, rdev->sb->this_disk.minor);
23042+ LOG_DETAILS("FOUND %s desc_nr(%d)\n", get_partition_name(rdev), rdev->desc_nr);
23043+ list_add(&rdev->all, &all_raid_disks);
23044+ MD_INIT_LIST_HEAD(&rdev->pending);
23045+
23046+ if (rdev->faulty && rdev->sb)
23047+ free_disk_sb(rdev);
23048+
23049+ return 0;
23050+
23051+abort_free:
23052+ if (rdev->sb) {
23053+ free_disk_sb(rdev);
23054+ }
23055+ kfree(rdev);
23056+ return err;
23057+}
23058+
23059+
23060+
23061+/*
23062+ * Function: evms_md_analyze_sbs
23063+ * EVMS MD version of analyze_sbs()
23064+ */
23065+static int evms_md_analyze_sbs (mddev_t * mddev)
23066+{
23067+ int out_of_date = 0, i;
23068+ struct list_head *tmp, *tmp2;
23069+ mdk_rdev_t *rdev, *rdev2, *freshest;
23070+ mdp_super_t *sb;
23071+
23072+ LOG_ENTRY_EXIT("Analyzing all superblocks...\n");
23073+ /*
23074+ * Verify the RAID superblock on each real device
23075+ */
23076+ ITERATE_RDEV(mddev,rdev,tmp) {
23077+ if (rdev->faulty) {
23078+ MD_BUG();
23079+ goto abort;
23080+ }
23081+ if (!rdev->sb) {
23082+ MD_BUG();
23083+ goto abort;
23084+ }
23085+ if (check_disk_sb(rdev))
23086+ goto abort;
23087+ }
23088+
23089+ /*
23090+ * The superblock constant part has to be the same
23091+ * for all disks in the array.
23092+ */
23093+ sb = NULL;
23094+
23095+ ITERATE_RDEV(mddev,rdev,tmp) {
23096+ if (!sb) {
23097+ sb = rdev->sb;
23098+ continue;
23099+ }
23100+ if (!sb_equal(sb, rdev->sb)) {
23101+ LOG_WARNING("kick out %s\n",get_partition_name(rdev));
23102+ kick_rdev_from_array(rdev);
23103+ continue;
23104+ }
23105+ }
23106+
23107+ /*
23108+ * OK, we have all disks and the array is ready to run. Let's
23109+ * find the freshest superblock, that one will be the superblock
23110+ * that represents the whole array.
23111+ */
23112+ if (!mddev->sb)
23113+ if (alloc_array_sb(mddev))
23114+ goto abort;
23115+ sb = mddev->sb;
23116+ freshest = NULL;
23117+
23118+ ITERATE_RDEV(mddev,rdev,tmp) {
23119+ __u64 ev1, ev2;
23120+ /*
23121+ * if the checksum is invalid, use the superblock
23122+ * only as a last resort. (decrease its age by
23123+ * one event)
23124+ */
23125+ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
23126+ if (rdev->sb->events_lo || rdev->sb->events_hi)
23127+ if ((rdev->sb->events_lo--)==0)
23128+ rdev->sb->events_hi--;
23129+ }
23130+ LOG_DETAILS("%s's event counter: %x\n",get_partition_name(rdev), rdev->sb->events_lo);
23131+
23132+ if (!freshest) {
23133+ freshest = rdev;
23134+ continue;
23135+ }
23136+ /*
23137+ * Find the newest superblock version
23138+ */
23139+ ev1 = md_event(rdev->sb);
23140+ ev2 = md_event(freshest->sb);
23141+ if (ev1 != ev2) {
23142+ out_of_date = 1;
23143+ if (ev1 > ev2)
23144+ freshest = rdev;
23145+ }
23146+ }
23147+ if (out_of_date) {
23148+ LOG_WARNING("OUT OF DATE, freshest: %s\n",get_partition_name(freshest));
23149+ }
23150+ memcpy (sb, freshest->sb, sizeof(*sb));
23151+
23152+ /*
23153+ * at this point we have picked the 'best' superblock
23154+ * from all available superblocks.
23155+ * now we validate this superblock and kick out possibly
23156+ * failed disks.
23157+ */
23158+ ITERATE_RDEV(mddev,rdev,tmp) {
23159+ /*
23160+ * Kick all non-fresh devices
23161+ */
23162+ __u64 ev1, ev2;
23163+ ev1 = md_event(rdev->sb);
23164+ ev2 = md_event(sb);
23165+ if (ev1 < ev2) {
23166+ if (ev1) {
23167+ LOG_WARNING("kicking non-fresh %s from array!\n",get_partition_name(rdev));
23168+ kick_rdev_from_array(rdev);
23169+ continue;
23170+ } else {
23171+ LOG_DETAILS("%s is a new spare.\n",get_partition_name(rdev));
23172+ }
23173+ }
23174+ }
23175+
23176+ /*
23177+ * Remove unavailable and faulty devices ...
23178+ *
23179+ * note that if an array becomes completely unrunnable due to
23180+ * missing devices, we do not write the superblock back, so the
23181+ * administrator has a chance to fix things up. The removal thus
23182+ * only happens if it's nonfatal to the contents of the array.
23183+ */
23184+ for (i = 0; i < MD_SB_DISKS; i++) {
23185+ int found;
23186+ mdp_disk_t *desc;
23187+
23188+ desc = sb->disks + i;
23189+
23190+ /*
23191+ * We kick faulty devices/descriptors immediately.
23192+ *
23193+ * Note: multipath devices are a special case. Since we
23194+ * were able to read the superblock on the path, we don't
23195+ * care if it was previously marked as faulty, it's up now
23196+ * so enable it.
23197+ */
23198+ if (disk_faulty(desc) && mddev->sb->level != -4) {
23199+ found = 0;
23200+ ITERATE_RDEV(mddev,rdev,tmp) {
23201+ if (rdev->desc_nr != desc->number)
23202+ continue;
23203+ LOG_WARNING("[md%d] kicking faulty %s!\n",mdidx(mddev),get_partition_name(rdev));
23204+ kick_rdev_from_array(rdev);
23205+ found = 1;
23206+ break;
23207+ }
23208+ if (!found) {
23209+ LOG_WARNING("%s: [md%d] found former faulty device [number=%d]\n",
23210+ __FUNCTION__ ,mdidx(mddev), desc->number);
23211+ }
23212+ /*
23213+ * Don't call remove_descriptor(),
23214+ * let the administrator remove it from user-land */
23215+ /* remove_descriptor(desc, sb); */
23216+ continue;
23217+ } else if (disk_faulty(desc)) {
23218+ /*
23219+ * multipath entry marked as faulty, unfaulty it
23220+ */
23221+ kdev_t dev;
23222+
23223+ dev = MKDEV(desc->major, desc->minor);
23224+
23225+ rdev = evms_md_find_rdev(mddev, dev);
23226+ if (rdev)
23227+ mark_disk_spare(desc);
23228+ else {
23229+ LOG_WARNING("%s: [md%d] (MULTIPATH) found former faulty device [number=%d]\n",
23230+ __FUNCTION__ ,mdidx(mddev), desc->number);
23231+ /*
23232+ * Don't call remove_descriptor(),
23233+ * let the administrator remove it from user-land */
23234+ /* remove_descriptor(desc, sb); */
23235+ }
23236+ }
23237+
23238+ /*
23239+ * Is this device present in the rdev ring?
23240+ */
23241+ found = 0;
23242+ ITERATE_RDEV(mddev,rdev,tmp) {
23243+ /*
23244+ * Multi-path IO special-case: since we have no
23245+ * this_disk descriptor at auto-detect time,
23246+ * we cannot check rdev->number.
23247+ * We can check the device though.
23248+ */
23249+ if ((sb->level == -4) && (rdev->dev ==
23250+ MKDEV(desc->major,desc->minor))) {
23251+ found = 1;
23252+ break;
23253+ }
23254+ if (rdev->desc_nr == desc->number) {
23255+ found = 1;
23256+ break;
23257+ }
23258+ }
23259+ if (found)
23260+ continue;
23261+
23262+ LOG_WARNING(" [md%d]: former device [number=%d] is unavailable!\n",
23263+ mdidx(mddev), desc->number);
23264+ remove_descriptor(desc, sb);
23265+ }
23266+
23267+ /*
23268+ * Kick all rdevs that are not in the
23269+ * descriptor array:
23270+ */
23271+ ITERATE_RDEV(mddev,rdev,tmp) {
23272+ if (rdev->desc_nr == -1)
23273+ kick_rdev_from_array(rdev);
23274+ }
23275+
23276+ /*
23277+ * Do a final reality check.
23278+ */
23279+ if (mddev->sb->level != -4) {
23280+ ITERATE_RDEV(mddev,rdev,tmp) {
23281+ if (rdev->desc_nr == -1) {
23282+ MD_BUG();
23283+ goto abort;
23284+ }
23285+ /*
23286+ * is the desc_nr unique?
23287+ */
23288+ ITERATE_RDEV(mddev,rdev2,tmp2) {
23289+ if ((rdev2 != rdev) &&
23290+ (rdev2->desc_nr == rdev->desc_nr)) {
23291+ MD_BUG();
23292+ goto abort;
23293+ }
23294+ }
23295+ }
23296+ }
23297+
23298+#define OLD_VERSION KERN_ALERT \
23299+"md%d: unsupported raid array version %d.%d.%d\n"
23300+
23301+#define NOT_CLEAN_IGNORE KERN_ERR \
23302+"md%d: raid array is not clean -- starting background reconstruction\n"
23303+
23304+ /*
23305+ * Check if we can support this RAID array
23306+ */
23307+ if (sb->major_version != MD_MAJOR_VERSION ||
23308+ sb->minor_version > MD_MINOR_VERSION) {
23309+
23310+ LOG_ERROR("[md%d] unsupported raid array version %d.%d.%d\n",
23311+ mdidx(mddev),
23312+ sb->major_version,
23313+ sb->minor_version,
23314+ sb->patch_version);
23315+ goto abort;
23316+ }
23317+
23318+ if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
23319+ (sb->level == 4) || (sb->level == 5)))
23320+ LOG_WARNING("[md%d, level=%d] raid array is not clean -- starting background reconstruction\n",
23321+ mdidx(mddev), sb->level);
23322+
23323+ LOG_ENTRY_EXIT("analysis of all superblocks is OK!\n");
23324+ return 0;
23325+abort:
23326+ LOG_WARNING("ABORT analyze_sbs()!!!\n");
23327+ return 1;
23328+}
23329+
23330+
23331+static int device_size_calculation (mddev_t * mddev)
23332+{
23333+ int data_disks = 0, persistent;
23334+ //unsigned int readahead;
23335+ mdp_super_t *sb = mddev->sb;
23336+ struct list_head *tmp;
23337+ mdk_rdev_t *rdev;
23338+
23339+ /*
23340+ * Do device size calculation. Bail out if too small.
23341+ * (we have to do this after having validated chunk_size,
23342+ * because device size has to be modulo chunk_size)
23343+ */
23344+ persistent = !mddev->sb->not_persistent;
23345+ ITERATE_RDEV(mddev,rdev,tmp) {
23346+ if (rdev->faulty)
23347+ continue;
23348+ if (rdev->size) {
23349+ LOG_DEFAULT("%s: already calculated %s\n", __FUNCTION__, get_partition_name(rdev));
23350+ continue;
23351+ }
23352+ rdev->size = evms_md_calc_dev_size(rdev->node, mddev, persistent);
23353+ if (rdev->size < sb->chunk_size / 1024) {
23354+ LOG_WARNING("Dev %s smaller than chunk_size: %ldk < %dk\n",
23355+ get_partition_name(rdev), rdev->size, sb->chunk_size / 1024);
23356+ return -EINVAL;
23357+ }
23358+ }
23359+
23360+ switch (sb->level) {
23361+ case -4:
23362+ data_disks = 1;
23363+ break;
23364+ case -3:
23365+ data_disks = 1;
23366+ break;
23367+ case -2:
23368+ data_disks = 1;
23369+ break;
23370+ case -1:
23371+ zoned_raid_size(mddev);
23372+ data_disks = 1;
23373+ break;
23374+ case 0:
23375+ zoned_raid_size(mddev);
23376+ data_disks = sb->raid_disks;
23377+ break;
23378+ case 1:
23379+ data_disks = 1;
23380+ break;
23381+ case 4:
23382+ case 5:
23383+ data_disks = sb->raid_disks-1;
23384+ break;
23385+ default:
23386+ LOG_ERROR("[md%d] unkown level %d\n", mdidx(mddev), sb->level);
23387+ goto abort;
23388+ }
23389+ if (!evms_md_size[mdidx(mddev)])
23390+ evms_md_size[mdidx(mddev)] = sb->size * data_disks;
23391+
23392+ return 0;
23393+abort:
23394+ return 1;
23395+}
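+
+/*
+ * Example (illustrative): a RAID5 array of four equal members has
+ * data_disks = 3, so evms_md_size[] becomes sb->size * 3; one member's
+ * worth of capacity is consumed by parity.
+ */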
23396+
23397+
23398+#define TOO_BIG_CHUNKSIZE KERN_ERR \
23399+"too big chunk_size: %d > %d\n"
23400+
23401+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
23402+"too small chunk_size: %d < %ld\n"
23403+
23404+#define BAD_CHUNKSIZE KERN_ERR \
23405+"no chunksize specified, see 'man raidtab'\n"
23406+
23407+static int do_md_run (mddev_t * mddev)
23408+{
23409+ int pnum, err;
23410+ int chunk_size;
23411+ struct list_head *tmp;
23412+ mdk_rdev_t *rdev;
23413+
23414+
23415+ if (!mddev->nb_dev) {
23416+ MD_BUG();
23417+ return -EINVAL;
23418+ }
23419+
23420+ if (mddev->pers)
23421+ return -EBUSY;
23422+
23423+ /*
23424+ * Resize disks to align partitions size on a given
23425+ * chunk size.
23426+ */
23427+ evms_md_size[mdidx(mddev)] = 0;
23428+
23429+ /*
23430+ * Analyze all RAID superblock(s)
23431+ */
23432+ if (evms_md_analyze_sbs(mddev)) {
23433+ MD_BUG();
23434+ return -EINVAL;
23435+ }
23436+
23437+ mddev->chunk_size = chunk_size = mddev->sb->chunk_size;
23438+ pnum = level_to_pers(mddev->sb->level);
23439+
23440+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
23441+ if (!chunk_size) {
23442+ /*
23443+ * 'default chunksize' in the old md code used to
23444+ * be PAGE_SIZE, baaad.
23445+ * we abort here to be on the safe side. We don't
23446+ * want to continue the bad practice.
23447+ */
23448+ printk(BAD_CHUNKSIZE);
23449+ return -EINVAL;
23450+ }
23451+ if (chunk_size > MAX_CHUNK_SIZE) {
23452+ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
23453+ return -EINVAL;
23454+ }
23455+ /*
23456+ * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
23457+ */
23458+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
23459+ MD_BUG();
23460+ return -EINVAL;
23461+ }
23462+ if (chunk_size < PAGE_SIZE) {
23463+ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
23464+ return -EINVAL;
23465+ }
23466+ } else
23467+ if (chunk_size)
23468+ printk(KERN_INFO "RAID level %d does not need chunksize! Continuing anyway.\n", mddev->sb->level);
23469+
23470+ if (pnum >= MAX_PERSONALITY) {
23471+ MD_BUG();
23472+ return -EINVAL;
23473+ }
23474+ if (!pers[pnum])
23475+ {
23476+#ifdef CONFIG_KMOD
23477+ char module_name[80];
23478+ sprintf (module_name, "md-personality-%d", pnum);
23479+ request_module (module_name);
23480+ if (!pers[pnum])
23481+#endif
23482+ {
23483+ printk(KERN_ERR "personality %d is not loaded!\n",
23484+ pnum);
23485+ return -EINVAL;
23486+ }
23487+ }
23488+ if (device_size_calculation(mddev))
23489+ return -EINVAL;
23490+
23491+ /*
23492+ * Drop all container device buffers, from now on
23493+ * the only valid external interface is through the md
23494+ * device.
23495+ * Also find largest hardsector size
23496+ */
23497+ md_hardsect_sizes[mdidx(mddev)] = 512;
23498+ ITERATE_RDEV(mddev,rdev,tmp) {
23499+ if (rdev->faulty)
23500+ continue;
23501+ invalidate_device(rdev->dev, 1);
23502+/* if (get_hardsect_size(rdev->dev)
23503+ > md_hardsect_sizes[mdidx(mddev)])
23504+ md_hardsect_sizes[mdidx(mddev)] =
23505+ get_hardsect_size(rdev->dev); */
23506+ if (rdev->node->hardsector_size > md_hardsect_sizes[mdidx(mddev)]) {
23507+ md_hardsect_sizes[mdidx(mddev)] = rdev->node->hardsector_size;
23508+ }
23509+
23510+ }
23511+ md_blocksizes[mdidx(mddev)] = 1024;
23512+ if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
23513+ md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
23514+
23515+ mddev->pers = pers[pnum];
23516+
23517+ err = mddev->pers->run(mddev);
23518+ if (err) {
23519+ LOG_WARNING("%s: pers->run() failed.\n", __FUNCTION__);
23520+ mddev->pers = NULL;
23521+ return -EINVAL;
23522+ }
23523+ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
23524+
23525+ evms_md_update_sb(mddev);
23526+
23527+ if (incomplete_mddev(mddev)) {
23528+ LOG_DEFAULT("%s: [md%d] was incomplete!\n", __FUNCTION__, mdidx(mddev));
23529+ list_del(&mddev->incomplete_mddevs);
23530+ INIT_LIST_HEAD(&mddev->incomplete_mddevs);
23531+ }
23532+
23533+ list_add(&mddev->running_mddevs, &running_mddevs);
23534+
23535+ return (0);
23536+}
23537+
23538+#undef TOO_BIG_CHUNKSIZE
23539+#undef BAD_CHUNKSIZE
23540+
23541+
23542+#define OUT(x) do { err = (x); goto out; } while (0)
23543+
23544+
23545+#define STILL_MOUNTED KERN_WARNING \
23546+"md%d still mounted.\n"
23547+#define STILL_IN_USE \
23548+"md%d still in use.\n"
23549+
23550+static int do_md_stop (mddev_t * mddev, int ro)
23551+{
23552+ int err = 0, resync_interrupted = 0, clean = 0;
23553+ kdev_t dev = mddev_to_kdev(mddev);
23554+
23555+ if (atomic_read(&mddev->active)>1) {
23556+ printk(STILL_IN_USE, mdidx(mddev));
23557+ OUT(-EBUSY);
23558+ }
23559+
23560+ if (mddev->pers) {
23561+ /*
23562+ * It is safe to call stop here, it only frees private
23563+ * data. Also, it tells us if a device is unstoppable
23564+ * (eg. resyncing is in progress)
23565+ */
23566+ if (mddev->pers->stop_resync)
23567+ if (mddev->pers->stop_resync(mddev))
23568+ resync_interrupted = 1;
23569+
23570+ if (mddev->recovery_running)
23571+ evms_cs_interrupt_thread(evms_md_recovery_thread);
23572+
23573+ /*
23574+ * This synchronizes with signal delivery to the
23575+ * resync or reconstruction thread. It also nicely
23576+ * hangs the process if some reconstruction has not
23577+ * finished.
23578+ */
23579+ down(&mddev->recovery_sem);
23580+ up(&mddev->recovery_sem);
23581+
23582+ invalidate_device(dev, 1);
23583+
23584+ if (ro) {
23585+ if (mddev->ro)
23586+ OUT(-ENXIO);
23587+ mddev->ro = 1;
23588+ mddev->node->plugin = &md_plugin_header;
23589+ } else {
23590+ if (mddev->ro)
23591+ set_device_ro(dev, 0);
23592+ if (mddev->pers->stop(mddev)) {
23593+ if (mddev->ro)
23594+ set_device_ro(dev, 1);
23595+ OUT(-EBUSY);
23596+ }
23597+ if (mddev->ro)
23598+ mddev->ro = 0;
23599+ }
23600+ if (mddev->sb) {
23601+ /*
23602+ * mark it clean only if there was no resync
23603+ * interrupted.
23604+ */
23605+ if (!mddev->recovery_running && !resync_interrupted) {
23606+ LOG_DEBUG("%s: marking sb clean...\n", __FUNCTION__);
23607+ clean = 1;
23608+ }
23609+ evms_md_update_sb_sync(mddev, clean);
23610+ }
23611+ if (ro)
23612+ set_device_ro(dev, 1);
23613+ }
23614+
23615+ /*
23616+ * Free resources if final stop
23617+ */
23618+ if (!ro) {
23619+ printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
23620+ free_mddev(mddev);
23621+
23622+ } else
23623+ printk (KERN_INFO
23624+ "md%d switched to read-only mode.\n", mdidx(mddev));
23625+out:
23626+ return err;
23627+}
23628+
23629+
23630+static int evms_md_run_array (struct evms_logical_node ** discover_list, mddev_t *mddev)
23631+{
23632+ mdk_rdev_t *rdev;
23633+ struct list_head *tmp;
23634+ int err = 0;
23635+ uint flags = 0;
23636+
23637+ if (mddev->disks.prev == &mddev->disks) {
23638+ MD_BUG();
23639+ return -EINVAL;
23640+ }
23641+
23642+ LOG_DETAILS("%s: trying to run array md%d\n", __FUNCTION__,mdidx(mddev) );
23643+
23644+ ITERATE_RDEV(mddev,rdev,tmp) {
23645+ LOG_DETAILS(" <%s>\n", get_partition_name(rdev));
23646+ }
23647+
23648+ err = do_md_run (mddev);
23649+ if (!err) {
23650+ /*
23651+ * remove all nodes consumed by this md device from the discover list
23652+ */
23653+ ITERATE_RDEV(mddev,rdev,tmp) {
23654+ LOG_DETAILS(" removing %s from discover list.\n", get_partition_name(rdev));
23655+ evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
23656+ flags |= rdev->node->flags;
23657+ }
23658+ err = evms_md_create_logical_node(discover_list,mddev,flags);
23659+ if (!err) {
23660+ exported_nodes++;
23661+ }
23662+ } else {
23663+ LOG_WARNING("%s: could not start [md%d] containing: \n",__FUNCTION__,mdidx(mddev));
23664+ ITERATE_RDEV(mddev,rdev,tmp) {
23665+ LOG_WARNING(" (%s, desc_nr=%d)\n", get_partition_name(rdev), rdev->desc_nr);
23666+ }
23667+ LOG_WARNING("%s: will try restart [md%d] again later.\n",__FUNCTION__,mdidx(mddev));
23668+
23669+ mddev->sb_dirty = 0;
23670+ }
23671+ return err;
23672+}
23673+
23674+static void evms_md_run_incomplete_array (struct evms_logical_node ** discover_list, mddev_t *mddev)
23675+{
23676+ mdk_rdev_t *rdev;
23677+
23678+ LOG_DEFAULT("%s [md%d]\n",
23679+ __FUNCTION__, mdidx(mddev));
23680+ if (evms_md_run_array(discover_list,mddev) == 0) {
23681+ /*
23682+ * We succeeded running this MD device.
23683+ * Now read MD superblock on this newly created MD node.
23684+ */
23685+ if (mddev->node &&
23686+ (evms_md_import_device(discover_list,mddev->node) == 0)) {
23687+ /*
23688+ * Yes, there is a superblock on this MD node.
23689+ * We probably have a MD stacking case here.
23690+ */
23691+ rdev = evms_md_find_rdev_all(mddev->node);
23692+ if (rdev) {
23693+ list_add(&rdev->pending, &pending_raid_disks);
23694+ evms_md_run_devices(discover_list);
23695+ } else {
23696+ LOG_WARNING("%s: imported %s but no rdev was found!\n",
23697+ __FUNCTION__,
23698+ evms_md_partition_name(mddev->node));
23699+ }
23700+ }
23701+ }
23702+ if (incomplete_mddev(mddev)) {
23703+ list_del(&mddev->incomplete_mddevs);
23704+ INIT_LIST_HEAD(&mddev->incomplete_mddevs);
23705+ }
23706+}
23707+
23708+/*
23709+ * let's try to run arrays based on all disks that have arrived
23710+ * until now. (those are in the ->pending list)
23711+ *
23712+ * the method: pick the first pending disk, collect all disks with
23713+ * the same UUID, remove all from the pending list and put them into
23714+ * the 'same_array' list. Then order this list based on superblock
23715+ * update time (freshest comes first), kick out 'old' disks and
23716+ * compare superblocks. If everything's fine then run it.
23717+ *
23718+ * If "unit" is allocated, then bump its reference count
23719+ */
23720+static void evms_md_run_devices (struct evms_logical_node **discover_list)
23721+{
23722+ struct list_head candidates;
23723+ struct list_head *tmp;
23724+ mdk_rdev_t *rdev0, *rdev;
23725+ mddev_t *mddev;
23726+ kdev_t md_kdev;
23727+
23728+
23729+ LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
23730+ while (pending_raid_disks.next != &pending_raid_disks) {
23731+ rdev0 = list_entry(pending_raid_disks.next,
23732+ mdk_rdev_t, pending);
23733+ MD_INIT_LIST_HEAD(&candidates);
23734+ ITERATE_RDEV_PENDING(rdev,tmp) {
23735+ if (uuid_equal(rdev0, rdev)) {
23736+ if (!sb_equal(rdev0->sb, rdev->sb)) {
23737+ LOG_DETAILS("%s has same UUID as %s, but superblocks differ ...\n",\
23738+ get_partition_name(rdev),get_partition_name(rdev0));
23739+ continue;
23740+ }
23741+ list_del(&rdev->pending);
23742+ list_add(&rdev->pending, &candidates);
23743+ }
23744+ }
23745+
23746+ /*
23747+ * now we have a set of devices, with all of them having
23748+ * mostly sane superblocks. It's time to allocate the
23749+ * mddev.
23750+ */
23751+ md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
23752+ mddev = kdev_to_mddev(md_kdev);
23753+ if (mddev && (!incomplete_mddev(mddev))) {
23754+ LOG_DETAILS("md%d already running, cannot run %s\n",
23755+ mdidx(mddev), get_partition_name(rdev0));
23756+
23757+ ITERATE_RDEV(mddev,rdev,tmp) {
23758+ /*
23759+ * This is EVMS re-discovery!
23760+ * Remove all nodes consumed by this md device from the discover list
23761+ */
23762+ evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
23763+ }
23764+
23765+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
23766+ if (evms_md_find_mddev_all(rdev->node))
23767+ /*
23768+ * We have found an MD superblock on top of a running MD array.
23769+ * Delete rdev but keep the MD array.
23770+ */
23771+ evms_md_export_rdev(rdev, FALSE);
23772+ else
23773+ evms_md_export_rdev(rdev, TRUE);
23774+ }
23775+ continue;
23776+ }
23777+
23778+ if (!mddev) {
23779+ mddev = alloc_mddev(md_kdev);
23780+ if (mddev == NULL) {
23781+ LOG_ERROR("cannot allocate memory for md drive.\n");
23782+ break;
23783+ }
23784+ LOG_DETAILS("created md%d\n", mdidx(mddev));
23785+ } else {
23786+ LOG_DETAILS("%s: found INCOMPLETE md%d\n", __FUNCTION__, mdidx(mddev));
23787+ }
23788+
23789+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
23790+ bind_rdev_to_array(rdev, mddev);
23791+ list_del(&rdev->pending);
23792+ MD_INIT_LIST_HEAD(&rdev->pending);
23793+ }
23794+
23795+ if ((mddev->nr_raid_disks >= rdev0->sb->raid_disks) ||
23796+ (mddev->nb_dev == rdev0->sb->nr_disks)) {
23797+ evms_md_run_array(discover_list,mddev);
23798+ } else {
23799+ LOG_DETAILS("THIS md%d IS INCOMPLETE, found %d devices, need %d\n",
23800+ mdidx(mddev), mddev->nr_raid_disks, rdev0->sb->raid_disks);
23801+ list_add(&mddev->incomplete_mddevs, &incomplete_mddevs);
23802+ ITERATE_RDEV(mddev,rdev,tmp) {
23803+ evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
23804+ }
23805+ }
23806+ }
23807+ LOG_ENTRY_EXIT("%s: EXIT\n", __FUNCTION__);
23808+}
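
The grouping step above reduces to a short selection loop: take the first pending disk, keep every other pending disk whose UUID matches and whose superblock agrees. A minimal userspace sketch of that rule follows; toy_rdev, the single-int UUID and the sb_ok flag are illustrative stand-ins, not the kernel's mdk_rdev_t or its uuid_equal()/sb_equal() helpers.

#include <stdio.h>

struct toy_rdev {
	int uuid;	/* stands in for the 4-word md superblock UUID */
	int sb_ok;	/* stands in for sb_equal() agreeing with rdev0 */
};

int main(void)
{
	struct toy_rdev pending[] = {
		{ .uuid = 7, .sb_ok = 1 },
		{ .uuid = 9, .sb_ok = 1 },	/* different array */
		{ .uuid = 7, .sb_ok = 0 },	/* same UUID, stale superblock */
		{ .uuid = 7, .sb_ok = 1 },
	};
	int i, n = sizeof(pending) / sizeof(pending[0]);
	struct toy_rdev *rdev0 = &pending[0];

	/* collect the candidate set for rdev0's array */
	for (i = 0; i < n; i++) {
		if (pending[i].uuid != rdev0->uuid)
			continue;	/* belongs to some other array */
		if (!pending[i].sb_ok) {
			printf("disk %d: same UUID but superblocks differ, skipped\n", i);
			continue;
		}
		printf("disk %d: candidate for rdev0's array\n", i);
	}
	return 0;
}
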
23809+
23810+void evms_md_recover_arrays(void)
23811+{
23812+ if (!evms_md_recovery_thread) {
23813+ MD_BUG();
23814+ return;
23815+ }
23816+ evms_cs_wakeup_thread(evms_md_recovery_thread);
23817+}
23818+
23819+int evms_md_error_dev(
23820+ mddev_t *mddev,
23821+ kdev_t dev)
23822+{
23823+ mdk_rdev_t * rdev;
23824+
23825+ rdev = evms_md_find_rdev(mddev, dev);
23826+ if (rdev) {
23827+ return evms_md_error(mddev,rdev->node);
23828+ } else {
23829+ LOG_ERROR("%s: could not find %s in md%d\n",
23830+ __FUNCTION__, org_partition_name(dev), mdidx(mddev));
23831+ return 0;
23832+ }
23833+}
23834+
23835+int evms_md_error(
23836+ mddev_t *mddev,
23837+ struct evms_logical_node *node)
23838+{
23839+ mdk_rdev_t * rrdev;
23840+
23841+ /* check for NULL first */
23842+ if (!mddev) {
23843+ MD_BUG();
23844+ return 0;
23845+ }
23846+ LOG_ERROR("evms_md_error dev:(md%d), node:(%s), (caller: %p,%p,%p,%p).\n",
23847+ mdidx(mddev), node->name,
23848+ __builtin_return_address(0),__builtin_return_address(1),
23849+ __builtin_return_address(2),__builtin_return_address(3));
23850+
23851+ rrdev = evms_md_find_rdev_from_node(mddev, node);
23852+ if (!rrdev || rrdev->faulty)
23853+ return 0;
23854+ if (!mddev->pers->error_handler
23855+ || mddev->pers->error_handler(mddev,node) <= 0) {
23856+ free_disk_sb(rrdev);
23857+ rrdev->faulty = 1;
23858+ } else
23859+ return 1;
23860+ /*
23861+ * if recovery was running, stop it now.
23862+ */
23863+ if (mddev->pers->stop_resync)
23864+ mddev->pers->stop_resync(mddev);
23865+ if (mddev->recovery_running)
23866+ evms_cs_interrupt_thread(evms_md_recovery_thread);
23867+ evms_md_recover_arrays();
23868+
23869+ return 0;
23870+}
23871+
23872+int evms_register_md_personality (int pnum, mdk_personality_t *p)
23873+{
23874+ if (pnum >= MAX_PERSONALITY) {
23875+ MD_BUG();
23876+ return -EINVAL;
23877+ }
23878+
23879+ if (pers[pnum]) {
23880+ MD_BUG();
23881+ return -EBUSY;
23882+ }
23883+
23884+ pers[pnum] = p;
23885+ LOG_DETAILS("%s personality registered as nr %d\n",p->name, pnum);
23886+ return 0;
23887+}
23888+
23889+int evms_unregister_md_personality (int pnum)
23890+{
23891+ if (pnum >= MAX_PERSONALITY) {
23892+ MD_BUG();
23893+ return -EINVAL;
23894+ }
23895+
23896+ printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
23897+ pers[pnum] = NULL;
23898+ return 0;
23899+}
23900+
23901+mdp_disk_t *evms_md_get_spare(mddev_t *mddev)
23902+{
23903+ mdp_super_t *sb = mddev->sb;
23904+ mdp_disk_t *disk;
23905+ mdk_rdev_t *rdev;
23906+ int i, j;
23907+
23908+ for (i = 0, j = 0; j < mddev->nb_dev; i++) {
23909+ rdev = evms_md_find_rdev_nr(mddev, i);
23910+ if (rdev == NULL)
23911+ continue;
23912+ j++;
23913+ if (rdev->faulty)
23914+ continue;
23915+ if (!rdev->sb) {
23916+ if (!rdev->virtual_spare)
23917+ MD_BUG();
23918+ continue;
23919+ }
23920+ disk = &sb->disks[rdev->desc_nr];
23921+ if (disk_faulty(disk)) {
23922+ MD_BUG();
23923+ continue;
23924+ }
23925+ if (disk_active(disk))
23926+ continue;
23927+ return disk;
23928+ }
23929+ return NULL;
23930+}
23931+
23932+static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev)
23933+{
23934+ mdp_super_t *sb = mddev->sb;
23935+ mdp_disk_t *disk;
23936+ int i;
23937+
23938+ for (i=0; i < MD_SB_DISKS; i++) {
23939+ disk = &sb->disks[i];
23940+ if ((disk->major == MAJOR(dev)) && (disk->minor == MINOR(dev)))
23941+ return disk;
23942+ }
23943+ return NULL;
23944+}
23945+
23946+static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
23947+void evms_md_sync_acct(
23948+ kdev_t dev,
23949+ unsigned long nr_sectors)
23950+{
23951+ unsigned int major = MAJOR(dev);
23952+ unsigned int index;
23953+
23954+ index = disk_index(dev);
23955+ if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
23956+ return;
23957+
23958+ sync_io[major][index] += nr_sectors;
23959+}
23960+
23961+static int is_mddev_idle(mddev_t *mddev)
23962+{
23963+ mdk_rdev_t * rdev;
23964+ struct list_head *tmp;
23965+ int idle;
23966+ unsigned long curr_events;
23967+
23968+ idle = 1;
23969+ ITERATE_RDEV(mddev,rdev,tmp) {
23970+ int major = MAJOR(rdev->dev);
23971+ int idx = disk_index(rdev->dev);
23972+
23973+ if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
23974+ continue;
23975+
23976+ curr_events = kstat.dk_drive_rblk[major][idx] +
23977+ kstat.dk_drive_wblk[major][idx] ;
23978+ curr_events -= sync_io[major][idx];
23979+ if ((curr_events - rdev->last_events) > 32) {
23980+ rdev->last_events = curr_events;
23981+ idle = 0;
23982+ }
23983+ }
23984+ return idle;
23985+}
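
is_mddev_idle() above treats a member disk as busy when its kstat I/O counters, minus the resync traffic accounted via evms_md_sync_acct(), advanced by more than 32 events since the previous poll. A standalone sketch of that delta heuristic, with made-up counter values:

#include <stdio.h>

struct toy_disk {
	unsigned long total_events;	/* cumulative r+w blocks, kstat-style */
	unsigned long sync_events;	/* traffic generated by the resync itself */
	unsigned long last_events;	/* baseline from the previous poll */
};

static int toy_is_idle(struct toy_disk *d, int n)
{
	int i, idle = 1;

	for (i = 0; i < n; i++) {
		unsigned long curr = d[i].total_events - d[i].sync_events;

		if (curr - d[i].last_events > 32) {
			d[i].last_events = curr;	/* new baseline */
			idle = 0;			/* foreign I/O seen */
		}
	}
	return idle;
}

int main(void)
{
	struct toy_disk disks[2] = {
		{ .total_events = 1000, .sync_events = 900, .last_events = 0 },
		{ .total_events =  500, .sync_events = 500, .last_events = 0 },
	};

	/* first poll initializes the baselines (and reports busy) ... */
	printf("first poll:  idle=%d\n", toy_is_idle(disks, 2));
	/* ... no foreign I/O since then, so the array now looks idle */
	printf("second poll: idle=%d\n", toy_is_idle(disks, 2));
	return 0;
}
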
23986+
23987+MD_DECLARE_WAIT_QUEUE_HEAD(evms_resync_wait);
23988+
23989+void evms_md_done_sync(mddev_t *mddev, int blocks, int ok)
23990+{
23991+ /* another "blocks" (512byte) blocks have been synced */
23992+ atomic_sub(blocks, &mddev->recovery_active);
23993+ wake_up(&mddev->recovery_wait);
23994+ if (!ok) {
23995+ // stop recovery, signal do_sync ....
23996+ }
23997+}
23998+
23999+#define SYNC_MARKS 10
24000+#define SYNC_MARK_STEP (3*HZ)
24001+int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
24002+{
24003+ mddev_t *mddev2;
24004+ unsigned int max_sectors, currspeed,
24005+ j, window, err, serialize;
24006+ unsigned long mark[SYNC_MARKS];
24007+ unsigned long mark_cnt[SYNC_MARKS];
24008+ int last_mark,m;
24009+ struct list_head *tmp;
24010+ unsigned long last_check;
24011+
24012+
24013+ err = down_interruptible(&mddev->resync_sem);
24014+ if (err)
24015+ goto out_nolock;
24016+
24017+recheck:
24018+ serialize = 0;
24019+ ITERATE_MDDEV(mddev2,tmp) {
24020+ if (mddev2 == mddev)
24021+ continue;
24022+ if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
24023+ LOG_DEFAULT("delaying resync of md%d until md%d "
24024+ "has finished resync (they share one or more physical units)\n",
24025+ mdidx(mddev), mdidx(mddev2));
24026+ serialize = 1;
24027+ break;
24028+ }
24029+ }
24030+ if (serialize) {
24031+ interruptible_sleep_on(&evms_resync_wait);
24032+ if (md_signal_pending(current)) {
24033+ md_flush_signals();
24034+ err = -EINTR;
24035+ goto out;
24036+ }
24037+ goto recheck;
24038+ }
24039+
24040+ mddev->curr_resync = 1;
24041+
24042+ max_sectors = mddev->sb->size<<1;
24043+
24044+ LOG_DEFAULT("syncing RAID array md%d\n", mdidx(mddev));
24045+ LOG_DEFAULT("minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
24046+ sysctl_speed_limit_min);
24047+ LOG_DEFAULT("using maximum available idle IO bandwith "
24048+ "(but not more than %d KB/sec) for reconstruction.\n",
24049+ sysctl_speed_limit_max);
24050+
24051+ /*
24052+ * Resync has low priority.
24053+ */
24054+#ifdef O1_SCHEDULER
24055+ set_user_nice(current,19);
24056+#else
24057+ current->nice = 19;
24058+#endif
24059+
24060+ is_mddev_idle(mddev); /* this also initializes IO event counters */
24061+ for (m = 0; m < SYNC_MARKS; m++) {
24062+ mark[m] = jiffies;
24063+ mark_cnt[m] = 0;
24064+ }
24065+ last_mark = 0;
24066+ mddev->resync_mark = mark[last_mark];
24067+ mddev->resync_mark_cnt = mark_cnt[last_mark];
24068+
24069+ /*
24070+ * Tune reconstruction:
24071+ */
24072+ window = MD_READAHEAD*(PAGE_SIZE/512);
24073+ LOG_DEFAULT("using %dk window, over a total of %d blocks.\n",
24074+ window/2,max_sectors/2);
24075+
24076+ atomic_set(&mddev->recovery_active, 0);
24077+ init_waitqueue_head(&mddev->recovery_wait);
24078+ last_check = 0;
24079+ for (j = 0; j < max_sectors;) {
24080+ int sectors;
24081+
24082+ sectors = mddev->pers->sync_request(mddev, j);
24083+
24084+ if (sectors < 0) {
24085+ err = sectors;
24086+ goto out;
24087+ }
24088+ atomic_add(sectors, &mddev->recovery_active);
24089+ j += sectors;
24090+ mddev->curr_resync = j;
24091+
24092+ if (last_check + window > j)
24093+ continue;
24094+
24095+ last_check = j;
24096+
24097+ run_task_queue(&tq_disk);
24098+
24099+ repeat:
24100+ if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
24101+ /* step marks */
24102+ int next = (last_mark+1) % SYNC_MARKS;
24103+
24104+ mddev->resync_mark = mark[next];
24105+ mddev->resync_mark_cnt = mark_cnt[next];
24106+ mark[next] = jiffies;
24107+ mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
24108+ last_mark = next;
24109+ }
24110+
24111+
24112+ if (md_signal_pending(current)) {
24113+ /*
24114+ * got a signal, exit.
24115+ */
24116+ mddev->curr_resync = 0;
24117+ LOG_DEFAULT("evms_md_do_sync() got signal ... exiting\n");
24118+ md_flush_signals();
24119+ err = -EINTR;
24120+ goto out;
24121+ }
24122+
24123+ /*
24124+		 * this loop exits only when we are slower than the
24125+		 * 'hard' speed limit, or when the system has been
24126+		 * IO-idle for a jiffy.
24127+ * the system might be non-idle CPU-wise, but we only care
24128+ * about not overloading the IO subsystem. (things like an
24129+ * e2fsck being done on the RAID array should execute fast)
24130+ */
24131+ if (md_need_resched(current))
24132+ schedule();
24133+
24134+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
24135+
24136+ if (currspeed > sysctl_speed_limit_min) {
24137+#ifdef O1_SCHEDULER
24138+ set_user_nice(current,19);
24139+#else
24140+ current->nice = 19;
24141+#endif
24142+
24143+ if ((currspeed > sysctl_speed_limit_max) ||
24144+ !is_mddev_idle(mddev)) {
24145+#ifdef O1_SCHEDULER
24146+ set_current_state(TASK_INTERRUPTIBLE);
24147+#else
24148+ current->state = TASK_INTERRUPTIBLE;
24149+#endif
24150+ md_schedule_timeout(HZ/4);
24151+ goto repeat;
24152+ }
24153+ } else
24154+#ifdef O1_SCHEDULER
24155+ set_user_nice(current,-20);
24156+#else
24157+ current->nice = -20;
24158+#endif
24159+ }
24160+ LOG_DEFAULT("md%d: sync done.\n",mdidx(mddev));
24161+ err = 0;
24162+ /*
24163+ * this also signals 'finished resyncing' to md_stop
24164+ */
24165+out:
24166+ wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
24167+ up(&mddev->resync_sem);
24168+out_nolock:
24169+ mddev->curr_resync = 0;
24170+ wake_up(&evms_resync_wait);
24171+ return err;
24172+}
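
The throttling above measures speed against a ring of SYNC_MARKS (timestamp, sectors-done) pairs that steps every SYNC_MARK_STEP, so currspeed reflects roughly the last 30 seconds rather than the whole run. A compressed userspace sketch of the same sliding window, with fake tick values standing in for jiffies:

#include <stdio.h>

#define MARKS		10
#define MARK_STEP	300	/* stands in for 3*HZ jiffies */

int main(void)
{
	unsigned long mark[MARKS] = { 0 }, mark_cnt[MARKS] = { 0 };
	unsigned long now, done = 0, speed = 0;
	int last_mark = 0;

	/* simulate 6000 ticks of resync at 2 sectors per tick */
	for (now = 1; now <= 6000; now++) {
		done += 2;
		if (now >= mark[last_mark] + MARK_STEP) {
			int next = (last_mark + 1) % MARKS;

			/* the slot about to be overwritten is the oldest
			 * mark; measure against it, as resync_mark does */
			speed = (done - mark_cnt[next]) /
				((now - mark[next]) / 100 + 1);
			mark[next] = now;
			mark_cnt[next] = done;
			last_mark = next;
		}
		if (now % 3000 == 0)
			printf("tick %lu: ~%lu sectors/100 ticks\n", now, speed);
	}
	return 0;
}
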
24173+
24174+
24175+
24176+/*
24177+ * This is a kernel thread which syncs a spare disk with the active array
24178+ *
24179+ * the amount of foolproofing might seem to be a tad excessive, but an
24180+ * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
24181+ * of my root partition with the first 0.5 gigs of my /home partition ... so
24182+ * i'm a bit nervous ;)
24183+ */
24184+void evms_md_do_recovery(void *data)
24185+{
24186+ int err;
24187+ mddev_t *mddev;
24188+ mdp_super_t *sb;
24189+ mdp_disk_t *spare;
24190+ struct list_head *tmp;
24191+
24192+ LOG_DEFAULT("recovery thread got woken up ...\n");
24193+restart:
24194+ ITERATE_MDDEV(mddev,tmp) {
24195+
24196+ sb = mddev->sb;
24197+ if (!sb)
24198+ continue;
24199+ if (mddev->recovery_running)
24200+ continue;
24201+ if (sb->active_disks == sb->raid_disks)
24202+ continue;
24203+ if (!sb->spare_disks) {
24204+ LOG_ERROR(" [md%d] no spare disk to reconstruct array! "
24205+ "-- continuing in degraded mode\n", mdidx(mddev));
24206+ continue;
24207+ }
24208+
24209+ spare = NULL;
24210+
24211+ if (!spare) {
24212+ /*
24213+ * now here we get the spare and resync it.
24214+ */
24215+ spare = evms_md_get_spare(mddev);
24216+ }
24217+ if (!spare)
24218+ continue;
24219+
24220+ LOG_DEFAULT(" [md%d] resyncing spare disk %s to replace failed disk\n",
24221+ mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor)));
24222+ if (!mddev->pers->diskop)
24223+ continue;
24224+
24225+ if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
24226+ continue;
24227+
24228+ down(&mddev->recovery_sem);
24229+ mddev->recovery_running = 1;
24230+ err = evms_md_do_sync(mddev, spare);
24231+ if (err == -EIO) {
24232+ LOG_DEFAULT("[md%d] spare disk %s failed, skipping to next spare.\n",
24233+ mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor)));
24234+ if (!disk_faulty(spare)) {
24235+ mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
24236+ mark_disk_faulty(spare);
24237+ mark_disk_nonsync(spare);
24238+ mark_disk_inactive(spare);
24239+ sb->spare_disks--;
24240+ sb->working_disks--;
24241+ sb->failed_disks++;
24242+ }
24243+ } else
24244+ if (disk_faulty(spare))
24245+ mddev->pers->diskop(mddev, &spare,
24246+ DISKOP_SPARE_INACTIVE);
24247+ if (err == -EINTR || err == -ENOMEM) {
24248+ /*
24249+ * Recovery got interrupted, or ran out of mem ...
24250+ * signal back that we have finished using the array.
24251+ */
24252+ mddev->pers->diskop(mddev, &spare,
24253+ DISKOP_SPARE_INACTIVE);
24254+ up(&mddev->recovery_sem);
24255+ mddev->recovery_running = 0;
24256+ continue;
24257+ } else {
24258+ mddev->recovery_running = 0;
24259+ up(&mddev->recovery_sem);
24260+ }
24261+ if (!disk_faulty(spare)) {
24262+ /*
24263+ * the SPARE_ACTIVE diskop possibly changes the
24264+ * pointer too
24265+ */
24266+ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
24267+ mark_disk_sync(spare);
24268+ mark_disk_active(spare);
24269+ sb->active_disks++;
24270+ sb->spare_disks--;
24271+ }
24272+ mddev->sb_dirty = 1;
24273+ evms_md_update_sb(mddev);
24274+ goto restart;
24275+ }
24276+ LOG_DEFAULT("recovery thread finished ...\n");
24277+
24278+}
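
The error paths above adjust a handful of mdp_super_t counters; the invariant is that a disk leaves the spare count exactly once, into either active_disks or failed_disks. A toy illustration of that bookkeeping, with made-up starting counts:

#include <stdio.h>

struct toy_sb {
	int active_disks, working_disks, spare_disks, failed_disks;
};

static void spare_ok(struct toy_sb *sb)	/* DISKOP_SPARE_ACTIVE path above */
{
	sb->active_disks++;
	sb->spare_disks--;
}

static void spare_failed(struct toy_sb *sb)	/* the -EIO path above */
{
	sb->spare_disks--;
	sb->working_disks--;
	sb->failed_disks++;
}

int main(void)
{
	struct toy_sb sb = { .active_disks = 1, .working_disks = 3,
			     .spare_disks = 2, .failed_disks = 1 };

	spare_failed(&sb);	/* first spare dies during the resync */
	spare_ok(&sb);		/* second spare completes and goes active */
	printf("active=%d working=%d spare=%d failed=%d\n",
	       sb.active_disks, sb.working_disks,
	       sb.spare_disks, sb.failed_disks);
	return 0;
}
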
24279+
24280+static void evms_md_create_recovery_thread(void)
24281+{
24282+ static char * name = "evms_mdrecoveryd";
24283+
24284+ if (!evms_md_recovery_thread) {
24285+ /* Create MD recovery thread */
24286+ evms_md_recovery_thread = evms_cs_register_thread(evms_md_do_recovery, NULL, name);
24287+ if (!evms_md_recovery_thread)
24288+			LOG_SERIOUS("%s: evms_cs_register_thread failed\n", __FUNCTION__);
24289+ }
24290+}
24291+
24292+static void evms_md_destroy_recovery_thread(void)
24293+{
24294+ if (evms_md_recovery_thread && !MOD_IN_USE) {
24295+ /* Destroy MD recovery thread */
24296+ evms_cs_unregister_thread(evms_md_recovery_thread);
24297+ evms_md_recovery_thread = NULL;
24298+ }
24299+}
24300+
24301+/**
24302+ * evms_md_create_logical_node
24303+ **/
24304+static int evms_md_create_logical_node(
24305+ struct evms_logical_node **discover_list,
24306+ mddev_t *mddev,
24307+ uint flags)
24308+{
24309+ int rc;
24310+ struct evms_md *evms_md = NULL;
24311+ struct evms_logical_node *newnode = NULL;
24312+ struct evms_plugin_header *hdr = NULL;
24313+ struct evms_plugin_fops *fops = NULL;
24314+
24315+ rc = evms_cs_allocate_logical_node(&newnode);
24316+ if (!rc) {
24317+ evms_md = kmalloc(sizeof(*evms_md), GFP_KERNEL);
24318+ if (!evms_md) {
24319+ rc = -ENOMEM;
24320+ } else {
24321+
24322+ memset(evms_md,0,sizeof(*evms_md));
24323+ evms_md->mddev = mddev;
24324+
24325+ fops = kmalloc(sizeof(*fops), GFP_KERNEL);
24326+ if (fops) {
24327+ /* copy MD plugin header
24328+ * copy function table
24329+ * replace read and write function pointers.
24330+ */
24331+ evms_md->instance_plugin_hdr = md_plugin_header;
24332+ memcpy(fops, &md_fops, sizeof(*fops));
24333+ fops->read = mddev->pers->read;
24334+ fops->write = mddev->pers->write;
24335+ evms_md->instance_plugin_hdr.fops = fops;
24336+ hdr = &evms_md->instance_plugin_hdr;
24337+ } else {
24338+ LOG_WARNING("%s: No memory to copy function table\n",__FUNCTION__);
24339+ rc = 0; /* clear rc and continue */
24340+ hdr = &md_plugin_header;
24341+ }
24342+ }
24343+ }
24344+
24345+ if (!rc && hdr) {
24346+ memset(newnode,0,sizeof(*newnode));
24347+ newnode->plugin = hdr;
24348+ newnode->total_vsectors = (u64)evms_md_size[mdidx(mddev)] * 2;
24349+ newnode->block_size = md_blocksizes[mdidx(mddev)];
24350+ newnode->hardsector_size = md_hardsect_sizes[mdidx(mddev)];
24351+ sprintf(newnode->name,"md/md%d",mdidx(mddev));
24352+ newnode->private = evms_md;
24353+ newnode->flags = flags;
24354+
24355+ rc = evms_cs_add_logical_node_to_list(discover_list, newnode);
24356+ if (rc) {
24357+ LOG_ERROR("%s: could not add md node %s\n", __FUNCTION__, newnode->name);
24358+ } else {
24359+ LOG_DEBUG("%s: added [%s] to discover list (total_vsectors="PFU64")\n",
24360+ __FUNCTION__, newnode->name, newnode->total_vsectors);
24361+ }
24362+ }
24363+
24364+ if (!rc) {
24365+ mddev->node = newnode;
24366+ } else {
24367+ if (evms_md) {
24368+ if (fops)
24369+ kfree(fops);
24370+ kfree(evms_md);
24371+ }
24372+ if (newnode)
24373+ evms_cs_deallocate_logical_node(newnode);
24374+ }
24375+ return rc;
24376+}
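
The fops handling above is a per-instance override: the plugin-wide function table is copied, and only read/write are repointed at the personality's routines so per-I/O calls skip a dispatch layer, while ioctl and the rest stay shared. A self-contained sketch of that pattern with toy types:

#include <stdio.h>
#include <string.h>

struct toy_fops {
	void (*read)(const char *who);
	void (*write)(const char *who);
	void (*ioctl)(const char *who);
};

static void generic_read(const char *w)  { printf("%s: generic read\n", w); }
static void generic_write(const char *w) { printf("%s: generic write\n", w); }
static void generic_ioctl(const char *w) { printf("%s: generic ioctl\n", w); }
static void raid1_read(const char *w)    { printf("%s: raid1 read\n", w); }
static void raid1_write(const char *w)   { printf("%s: raid1 write\n", w); }

static const struct toy_fops plugin_fops = {
	generic_read, generic_write, generic_ioctl
};

int main(void)
{
	struct toy_fops instance;

	memcpy(&instance, &plugin_fops, sizeof(instance));
	instance.read  = raid1_read;	/* like fops->read  = mddev->pers->read  */
	instance.write = raid1_write;	/* like fops->write = mddev->pers->write */

	instance.read("md0");	/* personality routine */
	instance.ioctl("md0");	/* everything else stays shared */
	return 0;
}
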
24377+
24378+
24379+/*
24380+ * Function: evms_md_autostart_arrays
24381+ * Discover MD "extended" devices
24382+ * Add MD "extended" devices to pending list for further processing
24383+ */
24384+static void evms_md_autostart_arrays (struct evms_logical_node **discover_list)
24385+{
24386+ struct evms_logical_node *node, *next_node;
24387+ mdk_rdev_t *rdev;
24388+ int rc=0;
24389+
24390+ LOG_ENTRY_EXIT(":autostart_arrays() ENTRY\n");
24391+
24392+ /* examine each node on the discover list */
24393+ next_node = *discover_list;
24394+ while(next_node) {
24395+ node = next_node;
24396+ next_node = node->next;
24397+
24398+ rc = evms_md_import_device(discover_list, node);
24399+ if (rc && (rc != -EEXIST)) {
24400+			LOG_EXTRA("autostart_arrays(): %s is not an MD device\n",evms_md_partition_name(node));
24401+ continue;
24402+ }
24403+
24404+ /*
24405+ * Sanity checks:
24406+ */
24407+ rdev = evms_md_find_rdev_all(node);
24408+ if (!rdev) {
24409+ LOG_ERROR("find_rdev_all() failed\n");
24410+ continue;
24411+ }
24412+ if (rdev->faulty) {
24413+ MD_BUG();
24414+ continue;
24415+ }
24416+
24417+ if (!rc) {
24418+ list_add(&rdev->pending, &pending_raid_disks);
24419+ } else if (rc == -EEXIST) {
24420+ struct evms_logical_node *md_node;
24421+ /*
24422+ * Must be in a re-discovery process here.
24423+ * Find the EVMS MD node that this rdev is a member of
24424+ */
24425+ if (rdev->mddev) {
24426+ md_node = rdev->mddev->node;
24427+ if (md_node) {
24428+ rc = evms_cs_add_logical_node_to_list(discover_list,md_node);
24429+ switch (rc) {
24430+ case 0:
24431+ exported_nodes++;
24432+ LOG_DETAILS("Added MD node (%s) to discover list\n",
24433+ md_node->name);
24434+ break;
24435+ case 1: /* already on the list */
24436+ case 2: /* already on the list */
24437+ break;
24438+ default:
24439+ LOG_WARNING("could not add md node (%s), rc=%d\n",
24440+ md_node->name, rc);
24441+ }
24442+ } else {
24443+ LOG_ERROR("This MD device [md%d] does not have an EVMS logical node.\n",
24444+ rdev->mddev->__minor);
24445+ }
24446+ } else {
24447+ LOG_ERROR("This device [%s] does not belong to any array!\n",
24448+ get_partition_name(rdev));
24449+ evms_md_export_rdev(rdev, TRUE);
24450+ }
24451+ evms_cs_remove_logical_node_from_list(discover_list,node);
24452+ }
24453+ }
24454+
24455+ evms_md_run_devices(discover_list);
24456+ LOG_DETAILS("EVMD MD:autostart_arrays() EXIT (exported_nodes=%d)\n",exported_nodes);
24457+}
24458+
24459+#ifdef CONFIG_PROC_FS
24460+static int status_resync(char * page, off_t * offset, int count, mddev_t * mddev)
24461+{
24462+ int sz = 0;
24463+ off_t off = *offset;
24464+ unsigned long max_blocks, resync, res, dt, db, rt;
24465+
24466+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
24467+ max_blocks = mddev->sb->size;
24468+
24469+ /*
24470+ * Should not happen.
24471+ */
24472+ if (!max_blocks) {
24473+ MD_BUG();
24474+ return 0;
24475+ }
24476+ res = (resync/1024)*1000/(max_blocks/1024 + 1);
24477+ {
24478+ int i, x = res/50, y = 20-x;
24479+ PROCPRINT("[");
24480+ for (i = 0; i < x; i++)
24481+ PROCPRINT("=");
24482+ sz += sprintf(page + sz, ">");
24483+ for (i = 0; i < y; i++)
24484+ PROCPRINT(".");
24485+ PROCPRINT("] ");
24486+ }
24487+ if (!mddev->recovery_running)
24488+ /*
24489+ * true resync
24490+ */
24491+ PROCPRINT(" resync =%3lu.%lu%% (%lu/%lu)",
24492+ res/10, res % 10, resync, max_blocks);
24493+ else
24494+ /*
24495+ * recovery ...
24496+ */
24497+ PROCPRINT(" recovery =%3lu.%lu%% (%lu/%lu)",
24498+ res/10, res % 10, resync, max_blocks);
24499+
24500+ /*
24501+ * We do not want to overflow, so the order of operands and
24502+ * the * 100 / 100 trick are important. We do a +1 to be
24503+ * safe against division by zero. We only estimate anyway.
24504+ *
24505+ * dt: time from mark until now
24506+ * db: blocks written from mark until now
24507+ * rt: remaining time
24508+ */
24509+ dt = ((jiffies - mddev->resync_mark) / HZ);
24510+ if (!dt) dt++;
24511+ db = resync - (mddev->resync_mark_cnt/2);
24512+ rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
24513+
24514+ PROCPRINT(" finish=%lu.%lumin", rt / 60, (rt % 60)/6);
24515+
24516+ PROCPRINT(" speed=%ldK/sec", db/dt);
24517+
24518+out:
24519+ *offset = off;
24520+ return sz;
24521+}
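
Plugging sample numbers into the estimate above makes the operand ordering visible: dividing by (db/100+1) first keeps the intermediate product inside an unsigned long on 32-bit, at the cost of about 1% rounding. The values below are made up:

#include <stdio.h>

int main(void)
{
	unsigned long max_blocks = 4000000;	/* 1K blocks in the array */
	unsigned long resync	 = 1000000;	/* blocks done so far */
	unsigned long dt	 = 50;		/* seconds since the mark */
	unsigned long db	 = 100000;	/* blocks done since the mark */
	unsigned long rt;

	/* rt = (dt * ((max_blocks-resync) / (db/100+1)))/100, as above */
	rt = (dt * ((max_blocks - resync) / (db / 100 + 1))) / 100;

	printf("speed  = %luK/sec\n", db / dt);			 /* 2000K/sec */
	printf("finish = %lu.%lumin\n", rt / 60, (rt % 60) / 6); /* ~24.9min */
	return 0;
}
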
24522+
24523+static int evms_md_status_read_proc(char *page, char **start, off_t off,
24524+ int count, int *eof, void *data)
24525+{
24526+ int sz = 0, j, size;
24527+ struct list_head *tmp, *tmp2;
24528+ mdk_rdev_t *rdev;
24529+ mddev_t *mddev;
24530+
24531+ PROCPRINT("Enterprise Volume Management System: MD Status\n");
24532+ PROCPRINT("Personalities : ");
24533+ for (j = 0; j < MAX_PERSONALITY; j++)
24534+ if (pers[j])
24535+ PROCPRINT("[%s] ", pers[j]->name);
24536+
24537+ PROCPRINT("\n");
24538+
24539+
24540+ ITERATE_MDDEV(mddev,tmp) {
24541+ PROCPRINT("md%d : %sactive", mdidx(mddev),
24542+ mddev->pers ? "" : "in");
24543+ if (mddev->pers) {
24544+ if (mddev->ro)
24545+ PROCPRINT(" (read-only)");
24546+ PROCPRINT(" %s", mddev->pers->name);
24547+ }
24548+
24549+ size = 0;
24550+ ITERATE_RDEV(mddev,rdev,tmp2) {
24551+ PROCPRINT(" %s[%d]",
24552+ rdev->node->name, rdev->desc_nr);
24553+ if (rdev->faulty) {
24554+ PROCPRINT("(F)");
24555+ continue;
24556+ }
24557+ size += rdev->size;
24558+ }
24559+
24560+ if (mddev->nb_dev) {
24561+ if (mddev->pers)
24562+ PROCPRINT("\n "PFU64" blocks",
24563+ mddev->node->total_vsectors >> 1);
24564+ else
24565+ PROCPRINT("\n %d blocks", size);
24566+ }
24567+
24568+ if (!mddev->pers) {
24569+ PROCPRINT("\n");
24570+ continue;
24571+ }
24572+
24573+ sz += mddev->pers->status (page+sz, mddev);
24574+
24575+ PROCPRINT("\n ");
24576+ if (mddev->curr_resync) {
24577+ sz += status_resync (page+sz, &off, count, mddev);
24578+ } else {
24579+ if (atomic_read(&mddev->resync_sem.count) != 1)
24580+ PROCPRINT(" resync=DELAYED");
24581+ }
24582+
24583+ PROCPRINT("\n");
24584+ }
24585+ *eof = 1;
24586+out:
24587+ *start = page + off;
24588+ sz -= off;
24589+ if (sz < 0)
24590+ sz = 0;
24591+ return sz > count ? count : sz;
24592+}
24593+#endif
24594+
24595+/* Function: md_core_init
24596+ */
24597+int __init md_core_init(void)
24598+{
24599+#ifdef CONFIG_PROC_FS
24600+ struct proc_dir_entry *evms_proc_dir;
24601+#endif
24602+
24603+#ifdef CONFIG_PROC_FS
24604+ evms_proc_dir = evms_cs_get_evms_proc_dir();
24605+ if (evms_proc_dir) {
24606+ create_proc_read_entry("mdstat", 0, evms_proc_dir, evms_md_status_read_proc, NULL);
24607+ }
24608+ md_table_header = register_sysctl_table(dev_dir_table, 1);
24609+#endif
24610+
24611+ return evms_cs_register_plugin(&md_plugin_header);
24612+}
24613+
24614+static void __exit md_core_exit(void)
24615+{
24616+#ifdef CONFIG_PROC_FS
24617+ struct proc_dir_entry *evms_proc_dir;
24618+
24619+ evms_proc_dir = evms_cs_get_evms_proc_dir();
24620+ if (evms_proc_dir) {
24621+ remove_proc_entry("mdstat", evms_proc_dir);
24622+ }
24623+ unregister_sysctl_table(md_table_header);
24624+#endif
24625+ evms_cs_unregister_plugin(&md_plugin_header);
24626+}
24627+
24628+module_init(md_core_init);
24629+module_exit(md_core_exit);
24630+#ifdef MODULE_LICENSE
24631+MODULE_LICENSE("GPL");
24632+#endif
24633+
24634+/*
24635+ * In order for this EVMS plugin to coexist with the original MD
24636+ * module, the symbols exported by this plugin are prefixed with "evms_"
24637+ */
24638+
24639+MD_EXPORT_SYMBOL(evms_md_size);
24640+MD_EXPORT_SYMBOL(evms_register_md_personality);
24641+MD_EXPORT_SYMBOL(evms_unregister_md_personality);
24642+ /* Export the following function for use with rdev->node in evms_md_k.h */
24643+MD_EXPORT_SYMBOL(evms_md_partition_name);
24644+ /* Export the following function for use with disks[] in md_p.h */
24645+MD_EXPORT_SYMBOL(evms_md_error);
24646+MD_EXPORT_SYMBOL(evms_md_error_dev);
24647+MD_EXPORT_SYMBOL(evms_md_update_sb);
24648+MD_EXPORT_SYMBOL(evms_md_find_rdev_nr);
24649+MD_EXPORT_SYMBOL(evms_md_find_rdev);
24650+MD_EXPORT_SYMBOL(evms_md_find_rdev_from_node);
24651+MD_EXPORT_SYMBOL(evms_md_print_devices);
24652+MD_EXPORT_SYMBOL(evms_mddev_map);
24653+MD_EXPORT_SYMBOL(evms_md_check_ordering);
24654+MD_EXPORT_SYMBOL(evms_md_partial_sync_io);
24655+MD_EXPORT_SYMBOL(evms_md_sync_io);
24656+MD_EXPORT_SYMBOL(evms_md_do_sync);
24657+MD_EXPORT_SYMBOL(evms_md_sync_acct);
24658+MD_EXPORT_SYMBOL(evms_md_done_sync);
24659+MD_EXPORT_SYMBOL(evms_md_recover_arrays);
24660+MD_EXPORT_SYMBOL(evms_md_get_spare);
24661+
24662diff -Naur linux-2002-09-30/drivers/evms/md_linear.c evms-2002-09-30/drivers/evms/md_linear.c
24663--- linux-2002-09-30/drivers/evms/md_linear.c Wed Dec 31 18:00:00 1969
24664+++ evms-2002-09-30/drivers/evms/md_linear.c Thu Aug 15 13:50:12 2002
24665@@ -0,0 +1,285 @@
24666+/*
24667+ linear.c : Multiple Devices driver for Linux
24668+ Copyright (C) 1994-96 Marc ZYNGIER
24669+ <zyngier@ufr-info-p7.ibp.fr> or
24670+ <maz@gloups.fdn.fr>
24671+
24672+ Linear mode management functions.
24673+
24674+ This program is free software; you can redistribute it and/or modify
24675+ it under the terms of the GNU General Public License as published by
24676+ the Free Software Foundation; either version 2, or (at your option)
24677+ any later version.
24678+
24679+ You should have received a copy of the GNU General Public License
24680+ (for example /usr/src/linux/COPYING); if not, write to the Free
24681+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24682+*/
24683+
24684+#include <linux/module.h>
24685+#include <linux/evms/evms_md.h>
24686+#include <linux/evms/evms_linear.h>
24687+#include <linux/slab.h>
24688+
24689+
24690+#define MAJOR_NR MD_MAJOR
24691+#define MD_DRIVER
24692+#define MD_PERSONALITY
24693+
24694+#define LOG_PREFIX "md linear: "
24695+static int linear_run (mddev_t *mddev)
24696+{
24697+ linear_conf_t *conf;
24698+ struct linear_hash *table;
24699+ mdk_rdev_t *rdev;
24700+ int size, i, j, nb_zone;
24701+ unsigned int curr_offset;
24702+
24703+ MOD_INC_USE_COUNT;
24704+
24705+ conf = kmalloc (sizeof (*conf), GFP_KERNEL);
24706+ if (!conf)
24707+ goto out;
24708+ mddev->private = conf;
24709+
24710+ if (evms_md_check_ordering(mddev)) {
24711+ printk("linear: disks are not ordered, aborting!\n");
24712+ goto out;
24713+ }
24714+
24715+ /*
24716+ * Find the smallest device.
24717+ */
24718+
24719+ conf->smallest = NULL;
24720+ curr_offset = 0;
24721+ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
24722+ dev_info_t *disk = conf->disks + j;
24723+ disk->node = rdev->node;
24724+ disk->dev = rdev->dev;
24725+ disk->size = rdev->size;
24726+ disk->offset = curr_offset;
24727+
24728+ curr_offset += disk->size;
24729+
24730+ if (!conf->smallest || (disk->size < conf->smallest->size))
24731+ conf->smallest = disk;
24732+ }
24733+
24734+ nb_zone = conf->nr_zones = evms_md_size[mdidx(mddev)] / conf->smallest->size +
24735+ ((evms_md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
24736+
24737+ conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
24738+ GFP_KERNEL);
24739+ if (!conf->hash_table)
24740+ goto out;
24741+
24742+ /*
24743+ * Here we generate the linear hash table
24744+ */
24745+ table = conf->hash_table;
24746+ i = 0;
24747+ size = 0;
24748+ for (j = 0; j < mddev->nb_dev; j++) {
24749+ dev_info_t *disk = conf->disks + j;
24750+
24751+ if (size < 0) {
24752+ table[-1].dev1 = disk;
24753+ }
24754+ size += disk->size;
24755+
24756+ while (size>0) {
24757+ table->dev0 = disk;
24758+ table->dev1 = NULL;
24759+ size -= conf->smallest->size;
24760+ table++;
24761+ }
24762+ }
24763+ if (table-conf->hash_table != nb_zone)
24764+ BUG();
24765+ LOG_DETAILS("%s: nr_zones=%d, smallest=%lu\n",
24766+ __FUNCTION__, conf->nr_zones, conf->smallest->size);
24767+ return 0;
24768+
24769+out:
24770+ if (conf)
24771+ kfree(conf);
24772+ MOD_DEC_USE_COUNT;
24773+ return 1;
24774+}
24775+
24776+static int linear_stop (mddev_t *mddev)
24777+{
24778+ linear_conf_t *conf = mddev_to_conf(mddev);
24779+
24780+ kfree(conf->hash_table);
24781+ kfree(conf);
24782+
24783+ MOD_DEC_USE_COUNT;
24784+
24785+ return 0;
24786+}
24787+
24788+/*
24789+ * Function: linear_map
24790+ */
24791+static int linear_map(
24792+ mddev_t *mddev,
24793+ struct evms_logical_node **node,
24794+ struct buffer_head *bh)
24795+{
24796+ linear_conf_t *conf = mddev_to_conf(mddev);
24797+ struct linear_hash *hash;
24798+ dev_info_t *tmp_dev;
24799+ unsigned long block;
24800+
24801+ block = (bh->b_rsector >> 1);
24802+ hash = conf->hash_table + (block / conf->smallest->size);
24803+ if (block >= (hash->dev0->size + hash->dev0->offset)) {
24804+ if (!hash->dev1) {
24805+ LOG_ERROR("%s: hash->dev1==NULL for block %ld\n", __FUNCTION__, block);
24806+ return -ENXIO;
24807+ }
24808+ tmp_dev = hash->dev1;
24809+ } else
24810+ tmp_dev = hash->dev0;
24811+
24812+ if ( (block + (bh->b_size >> 10)) > (tmp_dev->size + tmp_dev->offset)
24813+ || block < tmp_dev->offset) {
24814+ LOG_ERROR("%s: Block %ld out of bounds on node %s size %ld offset %ld\n",
24815+ __FUNCTION__,
24816+ block,
24817+ tmp_dev->node->name,
24818+ tmp_dev->size,
24819+ tmp_dev->offset);
24820+ return -ENXIO;
24821+ }
24822+ bh->b_rsector -= (tmp_dev->offset << 1);
24823+ *node = tmp_dev->node;
24824+ return 0;
24825+}
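
linear_map() above is a constant-time lookup: the hash table has one slot per zone of conf->smallest->size 1K blocks, each slot naming the device that zone starts in (dev0) and, where a zone straddles a device boundary, the device it spills into (dev1). A userspace sketch with two hypothetical devices:

#include <stdio.h>

struct toy_dev {
	const char *name;
	unsigned long size, offset;	/* both in 1K blocks */
};

int main(void)
{
	/* two concatenated devices: 100 blocks, then 300 blocks */
	struct toy_dev a = { "sda1", 100, 0 }, b = { "sdb1", 300, 100 };
	unsigned long smallest = 100;	/* zone granularity */

	/* hash[i] covers blocks [i*smallest, (i+1)*smallest); here every
	 * zone lies inside a single device, so dev1 stays NULL */
	struct { struct toy_dev *dev0, *dev1; } hash[4] = {
		{ &a, NULL }, { &b, NULL }, { &b, NULL }, { &b, NULL },
	};

	unsigned long block = 250;	/* request at 1K-block 250 */
	struct toy_dev *t = hash[block / smallest].dev0;

	if (block >= t->size + t->offset)	/* spills past dev0's end */
		t = hash[block / smallest].dev1;

	printf("block %lu -> %s, local block %lu\n",
	       block, t->name, block - t->offset);	/* sdb1, 150 */
	return 0;
}
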
24826+
24827+static void linear_read(
24828+ struct evms_logical_node *md_node,
24829+ struct buffer_head *bh)
24830+{
24831+ mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
24832+ struct evms_logical_node *node;
24833+
24834+ if (evms_md_check_boundary(md_node, bh)) return;
24835+
24836+ if (!linear_map(mddev, &node, bh)) {
24837+ R_IO(node, bh);
24838+ } else {
24839+ bh->b_end_io(bh, 0);
24840+ }
24841+}
24842+
24843+static void linear_write(
24844+ struct evms_logical_node *md_node,
24845+ struct buffer_head *bh)
24846+{
24847+ mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
24848+ struct evms_logical_node *node;
24849+
24850+ if (evms_md_check_boundary(md_node, bh)) return;
24851+
24852+ if (!linear_map(mddev, &node, bh)) {
24853+ W_IO(node, bh);
24854+ } else {
24855+ bh->b_end_io(bh, 0);
24856+ }
24857+}
24858+
24859+static int linear_status (char *page, mddev_t *mddev)
24860+{
24861+ int sz = 0;
24862+
24863+#undef MD_DEBUG
24864+#ifdef MD_DEBUG
24865+ int j;
24866+ linear_conf_t *conf = mddev_to_conf(mddev);
24867+
24868+ sz += sprintf(page+sz, " ");
24869+ for (j = 0; j < conf->nr_zones; j++)
24870+ {
24871+ sz += sprintf(page+sz, "[%s",
24872+ partition_name(conf->hash_table[j].dev0->dev));
24873+
24874+ if (conf->hash_table[j].dev1)
24875+ sz += sprintf(page+sz, "/%s] ",
24876+ partition_name(conf->hash_table[j].dev1->dev));
24877+ else
24878+ sz += sprintf(page+sz, "] ");
24879+ }
24880+ sz += sprintf(page+sz, "\n");
24881+#endif
24882+ sz += sprintf(page+sz, " %dk rounding", mddev->chunk_size/1024);
24883+ return sz;
24884+}
24885+
24886+static int linear_evms_ioctl (
24887+ mddev_t * mddev,
24888+ struct inode * inode,
24889+ struct file * file,
24890+ unsigned int cmd,
24891+ unsigned long arg)
24892+{
24893+ int rc = 0;
24894+ struct evms_logical_node *node;
24895+
24896+ switch (cmd) {
24897+ case EVMS_GET_BMAP:
24898+ {
24899+ struct evms_get_bmap_pkt *bmap = (struct evms_get_bmap_pkt *)arg;
24900+ struct buffer_head *bh =
24901+ evms_cs_allocate_from_pool(evms_bh_pool, FALSE);
24902+ if (bh) {
24903+ bh->b_rsector = (unsigned long)bmap->rsector;
24904+ bh->b_size = node->block_size;
24905+ rc = linear_map(mddev, &node, bh);
24906+ if (!rc) {
24907+ bmap->rsector = (u64)bh->b_rsector;
24908+ if (node)
24909+ rc = IOCTL(node, inode, file, cmd, arg);
24910+ else
24911+ rc = -ENODEV;
24912+ }
24913+ evms_cs_deallocate_to_pool(evms_bh_pool, bh);
24914+ } else
24915+ rc = -ENOMEM;
24916+ break;
24917+ }
24918+
24919+ default:
24920+ rc = -EINVAL;
24921+ }
24922+ return rc;
24923+}
24924+
24925+static mdk_personality_t linear_personality = {
24926+ .name = "evms_linear",
24927+ .read = linear_read,
24928+ .write = linear_write,
24929+ .run = linear_run,
24930+ .stop = linear_stop,
24931+ .status = linear_status,
24932+ .evms_ioctl = linear_evms_ioctl
24933+};
24934+
24935+static int md__init linear_init (void)
24936+{
24937+ return evms_register_md_personality (LINEAR, &linear_personality);
24938+}
24939+
24940+static void linear_exit (void)
24941+{
24942+ evms_unregister_md_personality (LINEAR);
24943+}
24944+
24945+
24946+module_init(linear_init);
24947+module_exit(linear_exit);
24948+#ifdef MODULE_LICENSE
24949+MODULE_LICENSE("GPL");
24950+#endif
24951diff -Naur linux-2002-09-30/drivers/evms/md_raid0.c evms-2002-09-30/drivers/evms/md_raid0.c
24952--- linux-2002-09-30/drivers/evms/md_raid0.c Wed Dec 31 18:00:00 1969
24953+++ evms-2002-09-30/drivers/evms/md_raid0.c Thu Aug 15 13:50:12 2002
24954@@ -0,0 +1,448 @@
24955+/*
24956+ raid0.c : Multiple Devices driver for Linux
24957+ Copyright (C) 1994-96 Marc ZYNGIER
24958+ <zyngier@ufr-info-p7.ibp.fr> or
24959+ <maz@gloups.fdn.fr>
24960+ Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
24961+
24962+
24963+ RAID-0 management functions.
24964+
24965+ This program is free software; you can redistribute it and/or modify
24966+ it under the terms of the GNU General Public License as published by
24967+ the Free Software Foundation; either version 2, or (at your option)
24968+ any later version.
24969+
24970+ You should have received a copy of the GNU General Public License
24971+ (for example /usr/src/linux/COPYING); if not, write to the Free
24972+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24973+*/
24974+
24975+#include <linux/module.h>
24976+#include <linux/evms/evms_raid0.h>
24977+
24978+#define MAJOR_NR MD_MAJOR
24979+#define MD_DRIVER
24980+#define MD_PERSONALITY
24981+
24982+#define LOG_PREFIX "md raid0: "
24983+
24984+static int create_strip_zones (mddev_t *mddev)
24985+{
24986+ int i, c, j, j1, j2;
24987+ unsigned long current_offset, curr_zone_offset, rdev_size_in_sects;
24988+ raid0_conf_t *conf = mddev_to_conf(mddev);
24989+ mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
24990+
24991+ /*
24992+ * The number of 'same size groups'
24993+ */
24994+ conf->nr_strip_zones = 0;
24995+
24996+ ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
24997+ LOG_DEBUG(" looking at %s\n", evms_md_partition_name(rdev1->node));
24998+ c = 0;
24999+ ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
25000+ LOG_DEBUG(" comparing %s(%ld sectors) with %s(%ld sectors)\n",
25001+ evms_md_partition_name(rdev1->node), rdev1->size << 1,
25002+ evms_md_partition_name(rdev2->node), rdev2->size << 1);
25003+ if (rdev2 == rdev1) {
25004+ LOG_DEBUG(" END\n");
25005+ break;
25006+ }
25007+ if (rdev2->size == rdev1->size)
25008+ {
25009+ /*
25010+			 * Not unique, don't count it as a new
25011+ * group
25012+ */
25013+ LOG_DEBUG(" EQUAL\n");
25014+ c = 1;
25015+ break;
25016+ }
25017+ LOG_DEBUG(" NOT EQUAL\n");
25018+ }
25019+ if (!c) {
25020+ LOG_DEBUG(" ==> UNIQUE\n");
25021+ conf->nr_strip_zones++;
25022+ LOG_DEBUG(" %d zones\n",conf->nr_strip_zones);
25023+ }
25024+ }
25025+ LOG_DEBUG(" FINAL %d zones\n",conf->nr_strip_zones);
25026+
25027+ conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
25028+ conf->nr_strip_zones);
25029+ if (!conf->strip_zone)
25030+ return 1;
25031+
25032+
25033+ conf->smallest = NULL;
25034+ current_offset = 0;
25035+ curr_zone_offset = 0;
25036+
25037+ for (i = 0; i < conf->nr_strip_zones; i++)
25038+ {
25039+ struct strip_zone *zone = conf->strip_zone + i;
25040+
25041+ LOG_DEBUG(" zone %d\n", i);
25042+ zone->dev_offset = current_offset;
25043+ smallest = NULL;
25044+ c = 0;
25045+
25046+ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
25047+
25048+ LOG_DEBUG(" checking %s ...",evms_md_partition_name(rdev->node));
25049+ rdev_size_in_sects = rdev->size << 1;
25050+ if (rdev_size_in_sects > current_offset)
25051+ {
25052+ LOG_DEBUG(" contained as device %d\n", c);
25053+ zone->node[c] = rdev->node;
25054+ c++;
25055+ if (!smallest || (rdev_size_in_sects < (smallest->size <<1) )) {
25056+ smallest = rdev;
25057+ LOG_DEBUG(" (%ld) is smallest!.\n", rdev_size_in_sects);
25058+ }
25059+ } else
25060+ LOG_DEBUG(" nope.\n");
25061+ }
25062+
25063+ zone->nb_dev = c;
25064+ zone->size_in_sects = ((smallest->size <<1) - current_offset) * c;
25065+ LOG_DEBUG(" zone->nb_dev: %d, size: %ld\n",
25066+ zone->nb_dev,zone->size_in_sects);
25067+
25068+ if (!conf->smallest || (zone->size_in_sects < conf->smallest->size_in_sects))
25069+ conf->smallest = zone;
25070+
25071+ zone->zone_offset = curr_zone_offset;
25072+ curr_zone_offset += zone->size_in_sects;
25073+
25074+ current_offset = smallest->size << 1;
25075+ LOG_DEBUG(" current zone offset: %ld\n",current_offset);
25076+ }
25077+ LOG_DEBUG(" done.\n");
25078+ return 0;
25079+}
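
The zone construction above is easiest to see with numbers. For member sizes of 100, 100 and 200 sectors, zone 0 stripes the first 100 sectors of all three devices (300 sectors wide) and zone 1 is the remaining 100-sector tail of the large device alone. The standalone loop below reproduces the same arithmetic, minus the unique-size precount:

#include <stdio.h>

int main(void)
{
	unsigned long size[] = { 100, 100, 200 };	/* per-device sectors */
	unsigned long current_offset = 0, zone_offset = 0, smallest, zsize;
	int i, c, zone, n = 3;

	for (zone = 0; ; zone++) {
		smallest = 0;
		c = 0;
		for (i = 0; i < n; i++) {
			if (size[i] <= current_offset)
				continue;	/* device fully consumed */
			c++;
			if (!smallest || size[i] < smallest)
				smallest = size[i];
		}
		if (!c)
			break;
		zsize = (smallest - current_offset) * c;
		printf("zone %d: %d devs, dev_offset=%lu size=%lu zone_offset=%lu\n",
		       zone, c, current_offset, zsize, zone_offset);
		zone_offset += zsize;
		current_offset = smallest;
	}
	printf("total: %lu sectors\n", zone_offset);	/* 400 = 100+100+200 */
	return 0;
}
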
25080+
25081+static int raid0_run (mddev_t *mddev)
25082+{
25083+ unsigned long cur=0, i=0, size, zone0_size, nb_zone;
25084+ unsigned long mddev_size_in_sects = evms_md_size[mdidx(mddev)] << 1;
25085+ raid0_conf_t *conf;
25086+
25087+ MOD_INC_USE_COUNT;
25088+
25089+ conf = vmalloc(sizeof (raid0_conf_t));
25090+ if (!conf)
25091+ goto out;
25092+ mddev->private = (void *)conf;
25093+
25094+ if (evms_md_check_ordering(mddev)) {
25095+ LOG_ERROR("disks are not ordered, aborting!\n");
25096+ goto out_free_conf;
25097+ }
25098+
25099+ if (create_strip_zones (mddev))
25100+ goto out_free_conf;
25101+
25102+ LOG_DETAILS("evms_md_size is %ld sectors.\n", mddev_size_in_sects);
25103+ LOG_DETAILS("conf->smallest->size_in_sects is %ld sectors.\n", conf->smallest->size_in_sects);
25104+ nb_zone = mddev_size_in_sects / conf->smallest->size_in_sects +
25105+ (mddev_size_in_sects % conf->smallest->size_in_sects ? 1 : 0);
25106+ LOG_DETAILS("nb_zone is %ld.\n", nb_zone);
25107+ conf->nr_zones = nb_zone;
25108+
25109+ LOG_DEBUG("Allocating %ld bytes for hash.\n", nb_zone*sizeof(struct raid0_hash));
25110+
25111+ conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
25112+ if (!conf->hash_table)
25113+ goto out_free_zone_conf;
25114+ size = conf->strip_zone[cur].size_in_sects;
25115+
25116+ i = 0;
25117+ while (cur < conf->nr_strip_zones) {
25118+ conf->hash_table[i].zone0 = conf->strip_zone + cur;
25119+
25120+ /*
25121+ * If we completely fill the slot
25122+ */
25123+ if (size >= conf->smallest->size_in_sects) {
25124+ conf->hash_table[i++].zone1 = NULL;
25125+ size -= conf->smallest->size_in_sects;
25126+
25127+ if (!size) {
25128+ if (++cur == conf->nr_strip_zones)
25129+ continue;
25130+ size = conf->strip_zone[cur].size_in_sects;
25131+ }
25132+ continue;
25133+ }
25134+ if (++cur == conf->nr_strip_zones) {
25135+ /*
25136+ * Last dev, set unit1 as NULL
25137+ */
25138+ conf->hash_table[i].zone1=NULL;
25139+ continue;
25140+ }
25141+
25142+ /*
25143+ * Here we use a 2nd dev to fill the slot
25144+ */
25145+ zone0_size = size;
25146+ size = conf->strip_zone[cur].size_in_sects;
25147+ conf->hash_table[i++].zone1 = conf->strip_zone + cur;
25148+ size -= (conf->smallest->size_in_sects - zone0_size);
25149+ }
25150+ return 0;
25151+
25152+out_free_zone_conf:
25153+ vfree(conf->strip_zone);
25154+ conf->strip_zone = NULL;
25155+
25156+out_free_conf:
25157+ vfree(conf);
25158+ mddev->private = NULL;
25159+out:
25160+ MOD_DEC_USE_COUNT;
25161+ return 1;
25162+}
25163+
25164+static int raid0_stop (mddev_t *mddev)
25165+{
25166+ raid0_conf_t *conf = mddev_to_conf(mddev);
25167+
25168+ vfree (conf->hash_table);
25169+ conf->hash_table = NULL;
25170+ vfree (conf->strip_zone);
25171+ conf->strip_zone = NULL;
25172+ vfree (conf);
25173+ mddev->private = NULL;
25174+
25175+ MOD_DEC_USE_COUNT;
25176+ return 0;
25177+}
25178+
25179+
25180+/*
25181+ * Function: raid0_map
25182+ *
25183+ * Return 0 for success, else error
25184+ *
25185+ */
25186+
25187+static inline int raid0_map(
25188+ mddev_t *mddev,
25189+ unsigned long lsn,
25190+ unsigned long size,
25191+ struct evms_logical_node **node,
25192+ unsigned long *new_lsn,
25193+ unsigned long *new_size)
25194+{
25195+ unsigned int sect_in_chunk, chunksize_bits, chunk_size_in_sects;
25196+ raid0_conf_t *conf = mddev_to_conf(mddev);
25197+ struct raid0_hash *hash;
25198+ struct strip_zone *zone;
25199+ unsigned long chunk;
25200+
25201+ chunk_size_in_sects = mddev->chunk_size >> EVMS_VSECTOR_SIZE_SHIFT;
25202+ chunksize_bits = ffz(~chunk_size_in_sects);
25203+ hash = conf->hash_table + (lsn / conf->smallest->size_in_sects);
25204+
25205+ /* Sanity check */
25206+ if (!hash)
25207+ goto bad_hash;
25208+
25209+ if (!hash->zone0)
25210+ goto bad_zone0;
25211+
25212+ if (lsn >= (hash->zone0->size_in_sects + hash->zone0->zone_offset)) {
25213+ if (!hash->zone1)
25214+ goto bad_zone1;
25215+ zone = hash->zone1;
25216+ } else
25217+ zone = hash->zone0;
25218+
25219+ sect_in_chunk = lsn & (chunk_size_in_sects - 1);
25220+ chunk = (lsn - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
25221+ *node = zone->node[(lsn >> chunksize_bits) % zone->nb_dev];
25222+
25223+ *new_lsn = ((chunk << chunksize_bits) + zone->dev_offset) + sect_in_chunk;
25224+
25225+ *new_size = (size <= chunk_size_in_sects - sect_in_chunk) ?
25226+ size : chunk_size_in_sects - sect_in_chunk;
25227+
25228+ return 0;
25229+
25230+bad_hash:
25231+ LOG_ERROR("%s: bug: hash==NULL for lsn %lu\n", __FUNCTION__, lsn);
25232+ goto outerr;
25233+bad_zone0:
25234+ LOG_ERROR("%s: bug: hash->zone0==NULL for lsn %lu\n", __FUNCTION__, lsn);
25235+ goto outerr;
25236+bad_zone1:
25237+ LOG_ERROR("%s: bug: hash->zone1==NULL for lsn %lu\n", __FUNCTION__, lsn);
25238+outerr:
25239+ return -EINVAL;
25240+}
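
Since chunk sizes are powers of two, the mapping above is all masks and shifts. A worked example for a single zone of two devices and 16-sector chunks: logical sector 100 lands in global chunk 6 (the 7th), which the alternation places on device 0 as its chunk 3, sector 52. The caller then relies on *new_size being clamped to the chunk end; requests that would cross a chunk boundary are rejected in this version, as the BUGBUG comment below notes.

#include <stdio.h>

int main(void)
{
	unsigned long chunk_sects = 16;		/* sectors per chunk */
	unsigned int bits = 4;			/* log2(chunk_sects), cf. ffz(~x) */
	unsigned long nb_dev = 2, dev_offset = 0, zone_offset = 0;
	unsigned long lsn = 100;		/* logical sector in */
	unsigned long sect_in_chunk, chunk, devnum, new_lsn;

	sect_in_chunk = lsn & (chunk_sects - 1);		/* 4 */
	chunk = (lsn - zone_offset) / (nb_dev << bits);		/* 3 */
	devnum = (lsn >> bits) % nb_dev;			/* 0 */
	new_lsn = (chunk << bits) + dev_offset + sect_in_chunk;	/* 52 */

	printf("lsn %lu -> device %lu, sector %lu (chunk %lu, +%lu)\n",
	       lsn, devnum, new_lsn, chunk, sect_in_chunk);
	return 0;
}
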
25241+
25242+void raid0_error(int rw, struct evms_logical_node *node, struct buffer_head *bh)
25243+{
25244+ LOG_ERROR(" %s FAILED on node(%s) rsector(%lu) size(%d)\n",
25245+ (rw == READ) ? "READ" : "WRITE",
25246+ node->name,
25247+ bh->b_rsector,
25248+ bh->b_size);
25249+
25250+ bh->b_end_io(bh, 0);
25251+}
25252+
25253+static inline void raid0_rw (
25254+ struct evms_logical_node *md_node,
25255+ struct buffer_head *bh,
25256+ int rw)
25257+{
25258+ mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
25259+ struct evms_logical_node *node;
25260+ unsigned long new_lsn, size_in_sects, new_size;
25261+
25262+ if (evms_md_check_boundary(md_node, bh)) return;
25263+ size_in_sects = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
25264+ if (!raid0_map(mddev, bh->b_rsector, size_in_sects, &node, &new_lsn, &new_size)) {
25265+ if (new_size == size_in_sects) {
25266+ /*
25267+ * This is the normal case:
25268+ * the request is entirely within the stripe boundary
25269+ */
25270+ bh->b_rsector = new_lsn;
25271+ if (rw == READ) {
25272+ R_IO(node, bh);
25273+ } else {
25274+ W_IO(node, bh);
25275+ }
25276+ return;
25277+ } else {
25278+ /*
25279+ * BUGBUG!
25280+ * Need more processing here (ie. break up the request)
25281+ */
25282+ LOG_ERROR("This version of EVMS RAID0 does not support I/O requests that are:\n");
25283+ LOG_ERROR(" - larger than the stripe size\n");
25284+ LOG_ERROR(" - cross the stripe boundary\n");
25285+ }
25286+ }
25287+ raid0_error(rw, node, bh);
25288+}
25289+
25290+static void raid0_read(
25291+ struct evms_logical_node *md_node,
25292+ struct buffer_head *bh)
25293+{
25294+ raid0_rw(md_node, bh, READ);
25295+}
25296+
25297+static void raid0_write(
25298+ struct evms_logical_node *md_node,
25299+ struct buffer_head *bh)
25300+{
25301+ raid0_rw(md_node, bh, WRITE);
25302+}
25303+
25304+static int raid0_status (char *page, mddev_t *mddev)
25305+{
25306+ int sz = 0;
25307+#undef MD_DEBUG
25308+#ifdef MD_DEBUG
25309+ int j, k;
25310+ raid0_conf_t *conf = mddev_to_conf(mddev);
25311+
25312+ sz += sprintf(page + sz, " ");
25313+ for (j = 0; j < conf->nr_zones; j++) {
25314+ sz += sprintf(page + sz, "[z%d",
25315+ conf->hash_table[j].zone0 - conf->strip_zone);
25316+ if (conf->hash_table[j].zone1)
25317+ sz += sprintf(page+sz, "/z%d] ",
25318+ conf->hash_table[j].zone1 - conf->strip_zone);
25319+ else
25320+ sz += sprintf(page+sz, "] ");
25321+ }
25322+
25323+ sz += sprintf(page + sz, "\n");
25324+
25325+ for (j = 0; j < conf->nr_strip_zones; j++) {
25326+ sz += sprintf(page + sz, " z%d=[", j);
25327+ for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
25328+ sz += sprintf (page+sz, "%s/", conf->strip_zone[j].node[k]->name);
25329+ sz--;
25330+ sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
25331+ conf->strip_zone[j].zone_offset,
25332+ conf->strip_zone[j].dev_offset,
25333+ conf->strip_zone[j].size_in_sects);
25334+ }
25335+#endif
25336+ sz += sprintf(page + sz, " %dk chunks", mddev->chunk_size/1024);
25337+ return sz;
25338+}
25339+
25340+static int raid0_evms_ioctl (
25341+ mddev_t * mddev,
25342+ struct inode * inode,
25343+ struct file * file,
25344+ unsigned int cmd,
25345+ unsigned long arg)
25346+{
25347+ int rc = 0;
25348+ struct evms_logical_node *node;
25349+
25350+ switch (cmd) {
25351+ case EVMS_GET_BMAP:
25352+ {
25353+ struct evms_get_bmap_pkt *bmap = (struct evms_get_bmap_pkt *)arg;
25354+ unsigned long new_lsn, new_size;
25355+ unsigned long size = mddev->node->block_size >> EVMS_VSECTOR_SIZE_SHIFT;
25356+ rc = raid0_map(mddev,
25357+ (unsigned long)bmap->rsector,
25358+ size,
25359+ &node,
25360+ &new_lsn,
25361+ &new_size);
25362+ if (!rc) {
25363+ if (node) {
25364+ bmap->rsector = (u64)new_lsn;
25365+ rc = IOCTL(node, inode, file, cmd, arg);
25366+ } else
25367+ rc = -ENODEV;
25368+ }
25369+ break;
25370+ }
25371+
25372+ default:
25373+ rc = -EINVAL;
25374+ }
25375+ return rc;
25376+}
25377+
25378+static mdk_personality_t raid0_personality = {
25379+ .name = "evms_raid0",
25380+ .read = raid0_read,
25381+ .write = raid0_write,
25382+ .run = raid0_run,
25383+ .stop = raid0_stop,
25384+ .status = raid0_status,
25385+ .evms_ioctl = raid0_evms_ioctl
25386+};
25387+
25388+static int md__init raid0_init (void)
25389+{
25390+ return evms_register_md_personality (RAID0, &raid0_personality);
25391+}
25392+
25393+static void raid0_exit (void)
25394+{
25395+ evms_unregister_md_personality (RAID0);
25396+}
25397+
25398+module_init(raid0_init);
25399+module_exit(raid0_exit);
25400+#ifdef MODULE_LICENSE
25401+MODULE_LICENSE("GPL");
25402+#endif
25403diff -Naur linux-2002-09-30/drivers/evms/md_raid1.c evms-2002-09-30/drivers/evms/md_raid1.c
25404--- linux-2002-09-30/drivers/evms/md_raid1.c Wed Dec 31 18:00:00 1969
25405+++ evms-2002-09-30/drivers/evms/md_raid1.c Mon Sep 30 00:02:48 2002
25406@@ -0,0 +1,1935 @@
25407+/*
25408+ * md_raid1.c : Multiple Devices driver for Linux
25409+ *
25410+ * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
25411+ *
25412+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
25413+ *
25414+ * RAID-1 management functions.
25415+ *
25416+ * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
25417+ *
25418