1--- linux/init/main.c.orig Fri Nov 23 11:17:45 2001
2+++ linux/init/main.c Fri Nov 23 11:18:53 2001
3@@ -19,6 +19,7 @@
4 #include <linux/utsname.h>
5 #include <linux/ioport.h>
6 #include <linux/init.h>
7+#include <linux/raid/md.h>
8 #include <linux/smp_lock.h>
9 #include <linux/blk.h>
10 #include <linux/hdreg.h>
11@@ -1012,9 +1013,12 @@
12 #ifdef CONFIG_FTAPE
13 { "ftape=", ftape_setup},
14 #endif
15-#ifdef CONFIG_MD_BOOT
16+#if defined(CONFIG_MD_BOOT) || defined(CONFIG_AUTODETECT_RAID)
17 { "md=", md_setup},
18 #endif
19+#if CONFIG_BLK_DEV_MD
20+ { "raid=", raid_setup},
21+#endif
22 #ifdef CONFIG_ADBMOUSE
23 { "adb_buttons=", adb_mouse_setup },
24 #endif
25@@ -1592,6 +1596,9 @@
26 while (pid != wait(&i));
27 if (MAJOR(real_root_dev) != RAMDISK_MAJOR
28 || MINOR(real_root_dev) != 0) {
29+#ifdef CONFIG_BLK_DEV_MD
30+ autodetect_raid();
31+#endif
32 error = change_root(real_root_dev,"/initrd");
33 if (error)
34 printk(KERN_ERR "Change root to /initrd: "
35--- linux/include/linux/raid/hsm.h.orig Fri Nov 23 11:18:15 2001
36+++ linux/include/linux/raid/hsm.h Fri Nov 23 11:18:15 2001
37@@ -0,0 +1,65 @@
38+#ifndef _HSM_H
39+#define _HSM_H
40+
41+#include <linux/raid/md.h>
42+
43+#if __alpha__
44+#error fix cpu_addr on Alpha first
45+#endif
46+
47+#include <linux/raid/hsm_p.h>
48+
49+#define index_pv(lv,index) ((lv)->vg->pv_array+(index)->data.phys_nr)
50+#define index_dev(lv,index) index_pv((lv),(index))->dev
51+#define index_block(lv,index) (index)->data.phys_block
52+#define index_child(index) ((lv_lptr_t *)((index)->cpu_addr))
53+
54+#define ptr_to_cpuaddr(ptr) ((__u32) (ptr))
55+
56+
57+typedef struct pv_bg_desc_s {
58+ unsigned int free_blocks;
59+ pv_block_group_t *bg;
60+} pv_bg_desc_t;
61+
62+typedef struct pv_s pv_t;
63+typedef struct vg_s vg_t;
64+typedef struct lv_s lv_t;
65+
66+struct pv_s
67+{
68+ int phys_nr;
69+ kdev_t dev;
70+ pv_sb_t *pv_sb;
71+ pv_bg_desc_t *bg_array;
72+};
73+
74+struct lv_s
75+{
76+ int log_id;
77+ vg_t *vg;
78+
79+ unsigned int max_indices;
80+ unsigned int free_indices;
81+ lv_lptr_t root_index;
82+
83+ kdev_t dev;
84+};
85+
86+struct vg_s
87+{
88+ int nr_pv;
89+ pv_t pv_array [MD_SB_DISKS];
90+
91+ int nr_lv;
92+ lv_t lv_array [HSM_MAX_LVS_PER_VG];
93+
94+ vg_sb_t *vg_sb;
95+ mddev_t *mddev;
96+};
97+
98+#define kdev_to_lv(dev) ((lv_t *) mddev_map[MINOR(dev)].data)
99+#define mddev_to_vg(mddev) ((vg_t *) mddev->private)
100+
101+#endif
102+
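The Alpha #error in hsm.h exists because ptr_to_cpuaddr() squeezes a kernel pointer into the 32-bit cpu_addr field of an index entry, which only round-trips when pointers are 32 bits wide. A tiny userspace sketch (not part of the patch; the extra uintptr_t cast is only there to keep modern compilers quiet) illustrates the point:

/*
 * Illustrative only: shows why storing a pointer in a __u32 works on 32-bit
 * platforms and is lossy on 64-bit ones, hence the "#error ... on Alpha".
 */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t __u32;

#define ptr_to_cpuaddr(ptr) ((__u32) (uintptr_t) (ptr))

int main(void)
{
	int object = 42;
	__u32 cpu_addr = ptr_to_cpuaddr(&object);
	void *back = (void *) (uintptr_t) cpu_addr;

	printf("sizeof(void *) = %zu\n", sizeof(void *));
	printf("round-trip %s\n",
	       back == (void *) &object ? "ok (32-bit pointers)"
					: "LOSSY (64-bit pointers)");
	return 0;
}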
103--- linux/include/linux/raid/hsm_p.h.orig Fri Nov 23 11:18:15 2001
104+++ linux/include/linux/raid/hsm_p.h Fri Nov 23 11:18:15 2001
105@@ -0,0 +1,237 @@
106+#ifndef _HSM_P_H
107+#define _HSM_P_H
108+
109+#define HSM_BLOCKSIZE 4096
110+#define HSM_BLOCKSIZE_WORDS (HSM_BLOCKSIZE/4)
111+#define PACKED __attribute__ ((packed))
112+
113+/*
114+ * Identifies a block in physical space
115+ */
116+typedef struct phys_idx_s {
117+ __u16 phys_nr;
118+ __u32 phys_block;
119+
120+} PACKED phys_idx_t;
121+
122+/*
123+ * Identifies a block in logical space
124+ */
125+typedef struct log_idx_s {
126+ __u16 log_id;
127+ __u32 log_index;
128+
129+} PACKED log_idx_t;
130+
131+/*
132+ * Describes one PV
133+ */
134+#define HSM_PV_SB_MAGIC 0xf091ae9fU
135+
136+#define HSM_PV_SB_GENERIC_WORDS 32
137+#define HSM_PV_SB_RESERVED_WORDS \
138+ (HSM_BLOCKSIZE_WORDS - HSM_PV_SB_GENERIC_WORDS)
139+
140+/*
141+ * On-disk PV identification data, on block 0 in any PV.
142+ */
143+typedef struct pv_sb_s
144+{
145+ __u32 pv_magic; /* 0 */
146+
147+ __u32 pv_uuid0; /* 1 */
148+ __u32 pv_uuid1; /* 2 */
149+ __u32 pv_uuid2; /* 3 */
150+ __u32 pv_uuid3; /* 4 */
151+
152+ __u32 pv_major; /* 5 */
153+ __u32 pv_minor; /* 6 */
154+ __u32 pv_patch; /* 7 */
155+
156+ __u32 pv_ctime; /* 8 Creation time */
157+
158+ __u32 pv_total_size; /* 9 size of this PV, in blocks */
159+ __u32 pv_first_free; /* 10 first free block */
160+ __u32 pv_first_used; /* 11 first used block */
161+ __u32 pv_blocks_left; /* 12 unallocated blocks */
162+ __u32 pv_bg_size; /* 13 size of a block group, in blocks */
163+ __u32 pv_block_size; /* 14 size of blocks, in bytes */
164+ __u32 pv_pptr_size; /* 15 size of block descriptor, in bytes */
165+ __u32 pv_block_groups; /* 16 number of block groups */
166+
167+ __u32 __reserved1[HSM_PV_SB_GENERIC_WORDS - 17];
168+
169+ /*
170+ * Reserved
171+ */
172+ __u32 __reserved2[HSM_PV_SB_RESERVED_WORDS];
173+
174+} PACKED pv_sb_t;
175+
176+/*
177+ * this is pretty much arbitrary, but has to be less than ~64
178+ */
179+#define HSM_MAX_LVS_PER_VG 32
180+
181+#define HSM_VG_SB_GENERIC_WORDS 32
182+
183+#define LV_DESCRIPTOR_WORDS 8
184+#define HSM_VG_SB_RESERVED_WORDS (HSM_BLOCKSIZE_WORDS - \
185+ LV_DESCRIPTOR_WORDS*HSM_MAX_LVS_PER_VG - HSM_VG_SB_GENERIC_WORDS)
186+
187+#if (HSM_PV_SB_RESERVED_WORDS < 0)
188+#error you messed this one up dude ...
189+#endif
190+
191+typedef struct lv_descriptor_s
192+{
193+ __u32 lv_id; /* 0 */
194+ phys_idx_t lv_root_idx; /* 1 */
195+ __u16 __reserved; /* 2 */
196+ __u32 lv_max_indices; /* 3 */
197+ __u32 lv_free_indices; /* 4 */
198+ __u32 md_id; /* 5 */
199+
200+ __u32 reserved[LV_DESCRIPTOR_WORDS - 6];
201+
202+} PACKED lv_descriptor_t;
203+
204+#define HSM_VG_SB_MAGIC 0x98320d7aU
205+/*
206+ * On-disk VG identification data, in block 1 on all PVs
207+ */
208+typedef struct vg_sb_s
209+{
210+ __u32 vg_magic; /* 0 */
211+ __u32 nr_lvs; /* 1 */
212+
213+ __u32 __reserved1[HSM_VG_SB_GENERIC_WORDS - 2];
214+
215+ lv_descriptor_t lv_array [HSM_MAX_LVS_PER_VG];
216+ /*
217+ * Reserved
218+ */
219+ __u32 __reserved2[HSM_VG_SB_RESERVED_WORDS];
220+
221+} PACKED vg_sb_t;
222+
223+/*
224+ * Describes one LV
225+ */
226+
227+#define HSM_LV_SB_MAGIC 0xe182bd8aU
228+
229+/* do we need lv_sb_t? */
230+
231+typedef struct lv_sb_s
232+{
233+ /*
234+ * On-disk LV identifier
235+ */
236+ __u32 lv_magic; /* 0 LV identifier */
237+ __u32 lv_uuid0; /* 1 */
238+ __u32 lv_uuid1; /* 2 */
239+ __u32 lv_uuid2; /* 3 */
240+ __u32 lv_uuid3; /* 4 */
241+
242+ __u32 lv_major; /* 5 PV identifier */
243+ __u32 lv_minor; /* 6 PV identifier */
244+ __u32 lv_patch; /* 7 PV identifier */
245+
246+ __u32 ctime; /* 8 Creation time */
247+ __u32 size; /* 9 size of this LV, in blocks */
248+ phys_idx_t start; /* 10 position of root index block */
249+ log_idx_t first_free; /* 11-12 first free index */
250+
251+ /*
252+ * Reserved
253+ */
254+ __u32 reserved[HSM_BLOCKSIZE_WORDS-13];
255+
256+} PACKED lv_sb_t;
257+
258+/*
259+ * Pointer pointing from the physical space, points to
260+ * the LV owning this block. It also contains various
261+ * statistics about the physical block.
262+ */
263+typedef struct pv_pptr_s
264+{
265+ union {
266+ /* case 1 */
267+ struct {
268+ log_idx_t owner;
269+ log_idx_t predicted;
270+ __u32 last_referenced;
271+ } used;
272+ /* case 2 */
273+ struct {
274+ __u16 log_id;
275+ __u16 __unused1;
276+ __u32 next_free;
277+ __u32 __unused2;
278+ __u32 __unused3;
279+ } free;
280+ } u;
281+} PACKED pv_pptr_t;
282+
283+static __inline__ int pv_pptr_free (const pv_pptr_t * pptr)
284+{
285+ return !pptr->u.free.log_id;
286+}
287+
288+
289+#define DATA_BLOCKS_PER_BG ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1))
290+
291+#define TOTAL_BLOCKS_PER_BG (DATA_BLOCKS_PER_BG+1)
292+/*
293+ * A table of pointers filling up a single block, managing
294+ * the next DATA_BLOCKS_PER_BG physical blocks. Such block
295+ * groups form the physical space of blocks.
296+ */
297+typedef struct pv_block_group_s
298+{
299+ __u8 used_bitmap[(DATA_BLOCKS_PER_BG+7)/8];
300+
301+ pv_pptr_t blocks[DATA_BLOCKS_PER_BG];
302+
303+} PACKED pv_block_group_t;
304+
305+/*
306+ * Pointer from the logical space, points to
307+ * the (PV,block) containing this logical block
308+ */
309+typedef struct lv_lptr_s
310+{
311+ phys_idx_t data;
312+ __u16 __reserved;
313+ __u32 cpu_addr;
314+ __u32 __reserved2;
315+
316+} PACKED lv_lptr_t;
317+
318+static __inline__ int index_free (const lv_lptr_t * index)
319+{
320+ return !index->data.phys_block;
321+}
322+
323+static __inline__ int index_present (const lv_lptr_t * index)
324+{
325+ return index->cpu_addr;
326+}
327+
328+
329+#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t))
330+/*
331+ * A table of pointers filling up a single block, managing
332+ * HSM_LPTRS_PER_BLOCK logical blocks. Such block groups form
333+ * the logical space of blocks.
334+ */
335+typedef struct lv_index_block_s
336+{
337+ lv_lptr_t blocks[HSM_LPTRS_PER_BLOCK];
338+
339+} PACKED lv_index_block_t;
340+
341+#endif
342+
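A standalone check (not part of the patch) that re-declares the packed on-disk records above and verifies the block-group arithmetic: pv_pptr_t is 16 bytes, so DATA_BLOCKS_PER_BG = (4096*8)/(8*16+1) = 254, the 32-byte used_bitmap plus 254 pointers fills one 4096-byte block exactly, and HSM_LPTRS_PER_BLOCK = 4096/16 = 256.

#include <stdint.h>
#include <stdio.h>

#define HSM_BLOCKSIZE 4096
#define PACKED __attribute__ ((packed))

typedef struct { uint16_t phys_nr; uint32_t phys_block; } PACKED phys_idx_t;
typedef struct { uint16_t log_id;  uint32_t log_index;  } PACKED log_idx_t;

typedef struct {
	union {
		struct { log_idx_t owner; log_idx_t predicted; uint32_t last_referenced; } used;
		struct { uint16_t log_id, __unused1; uint32_t next_free, __unused2, __unused3; } free;
	} u;
} PACKED pv_pptr_t;

typedef struct { phys_idx_t data; uint16_t __reserved; uint32_t cpu_addr, __reserved2; } PACKED lv_lptr_t;

#define DATA_BLOCKS_PER_BG  ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1))
#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t))

int main(void)
{
	size_t bitmap = (DATA_BLOCKS_PER_BG + 7) / 8;
	size_t group  = bitmap + DATA_BLOCKS_PER_BG * sizeof(pv_pptr_t);

	printf("sizeof(pv_pptr_t)   = %zu\n", sizeof(pv_pptr_t));               /* 16 */
	printf("DATA_BLOCKS_PER_BG  = %zu\n", (size_t) DATA_BLOCKS_PER_BG);     /* 254 */
	printf("block group bytes   = %zu\n", group);                           /* 4096, fills a block */
	printf("HSM_LPTRS_PER_BLOCK = %zu\n", (size_t) HSM_LPTRS_PER_BLOCK);    /* 256 */
	return 0;
}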
343--- linux/include/linux/raid/linear.h.orig Fri Nov 23 11:18:15 2001
344+++ linux/include/linux/raid/linear.h Fri Nov 23 11:18:15 2001
345@@ -0,0 +1,32 @@
346+#ifndef _LINEAR_H
347+#define _LINEAR_H
348+
349+#include <linux/raid/md.h>
350+
351+struct dev_info {
352+ kdev_t dev;
353+ int size;
354+ unsigned int offset;
355+};
356+
357+typedef struct dev_info dev_info_t;
358+
359+struct linear_hash
360+{
361+ dev_info_t *dev0, *dev1;
362+};
363+
364+struct linear_private_data
365+{
366+ struct linear_hash *hash_table;
367+ dev_info_t disks[MD_SB_DISKS];
368+ dev_info_t *smallest;
369+ int nr_zones;
370+};
371+
372+
373+typedef struct linear_private_data linear_conf_t;
374+
375+#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
376+
377+#endif
378--- linux/include/linux/raid/md.h.orig Fri Nov 23 11:18:15 2001
379+++ linux/include/linux/raid/md.h Fri Nov 23 11:18:15 2001
380@@ -0,0 +1,96 @@
381+/*
382+ md.h : Multiple Devices driver for Linux
383+ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
384+ Copyright (C) 1994-96 Marc ZYNGIER
385+ <zyngier@ufr-info-p7.ibp.fr> or
386+ <maz@gloups.fdn.fr>
387+
388+ This program is free software; you can redistribute it and/or modify
389+ it under the terms of the GNU General Public License as published by
390+ the Free Software Foundation; either version 2, or (at your option)
391+ any later version.
392+
393+ You should have received a copy of the GNU General Public License
394+ (for example /usr/src/linux/COPYING); if not, write to the Free
395+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
396+*/
397+
398+#ifndef _MD_H
399+#define _MD_H
400+
401+#include <linux/mm.h>
402+#include <linux/fs.h>
403+#include <linux/blkdev.h>
404+#include <asm/semaphore.h>
405+#include <linux/major.h>
406+#include <linux/ioctl.h>
407+#include <linux/types.h>
408+#include <asm/bitops.h>
409+#include <linux/module.h>
410+#include <linux/mm.h>
411+#include <linux/hdreg.h>
412+#include <linux/sysctl.h>
413+#include <linux/fs.h>
414+#include <linux/proc_fs.h>
415+#include <linux/smp_lock.h>
416+#include <linux/delay.h>
417+#include <net/checksum.h>
418+#include <linux/random.h>
419+#include <linux/locks.h>
420+#include <asm/io.h>
421+
422+#include <linux/raid/md_compatible.h>
423+/*
424+ * 'md_p.h' holds the 'physical' layout of RAID devices
425+ * 'md_u.h' holds the user <=> kernel API
426+ *
427+ * 'md_k.h' holds kernel internal definitions
428+ */
429+
430+#include <linux/raid/md_p.h>
431+#include <linux/raid/md_u.h>
432+#include <linux/raid/md_k.h>
433+
434+/*
435+ * Different major versions are not compatible.
436+ * Different minor versions are only downward compatible.
437+ * Different patchlevel versions are downward and upward compatible.
438+ */
439+#define MD_MAJOR_VERSION 0
440+#define MD_MINOR_VERSION 90
441+#define MD_PATCHLEVEL_VERSION 0
442+
443+extern int md_size[MAX_MD_DEVS];
444+extern struct hd_struct md_hd_struct[MAX_MD_DEVS];
445+
446+extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
447+extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev);
448+extern char * partition_name (kdev_t dev);
449+extern int register_md_personality (int p_num, mdk_personality_t *p);
450+extern int unregister_md_personality (int p_num);
451+extern mdk_thread_t * md_register_thread (void (*run) (void *data),
452+ void *data, const char *name);
453+extern void md_unregister_thread (mdk_thread_t *thread);
454+extern void md_wakeup_thread(mdk_thread_t *thread);
455+extern void md_interrupt_thread (mdk_thread_t *thread);
456+extern int md_update_sb (mddev_t *mddev);
457+extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
458+extern void md_recover_arrays (void);
459+extern int md_check_ordering (mddev_t *mddev);
460+extern void autodetect_raid(void);
461+extern struct gendisk * find_gendisk (kdev_t dev);
462+extern int md_notify_reboot(struct notifier_block *this,
463+ unsigned long code, void *x);
464+#if CONFIG_BLK_DEV_MD
465+extern void raid_setup(char *str,int *ints) md__init;
466+#endif
467+#ifdef CONFIG_MD_BOOT
468+extern void md_setup(char *str,int *ints) md__init;
469+#endif
470+
471+extern void md_print_devices (void);
472+
473+#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
474+
475+#endif /* _MD_H */
476+
477--- linux/include/linux/raid/md_compatible.h.orig Fri Nov 23 11:18:16 2001
478+++ linux/include/linux/raid/md_compatible.h Fri Nov 23 11:18:16 2001
479@@ -0,0 +1,387 @@
480+
481+/*
482+ md.h : Multiple Devices driver compatibility layer for Linux 2.0/2.2
483+ Copyright (C) 1998 Ingo Molnar
484+
485+ This program is free software; you can redistribute it and/or modify
486+ it under the terms of the GNU General Public License as published by
487+ the Free Software Foundation; either version 2, or (at your option)
488+ any later version.
489+
490+ You should have received a copy of the GNU General Public License
491+ (for example /usr/src/linux/COPYING); if not, write to the Free
492+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
493+*/
494+
495+#include <linux/version.h>
496+
497+#ifndef _MD_COMPATIBLE_H
498+#define _MD_COMPATIBLE_H
499+
500+#define LinuxVersionCode(v, p, s) (((v)<<16)+((p)<<8)+(s))
501+
502+#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
503+
504+/* 000 */
505+#define md__get_free_pages(x,y) __get_free_pages(x,y,GFP_KERNEL)
506+
507+#ifdef __i386__
508+/* 001 */
509+extern __inline__ int md_cpu_has_mmx(void)
510+{
511+ return x86_capability & 0x00800000;
512+}
513+#endif
514+
515+/* 002 */
516+#define md_clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
517+
518+/* 003 */
519+/*
520+ * someone please suggest a sane compatibility layer for modules
521+ */
522+#define MD_EXPORT_SYMBOL(x)
523+
524+/* 004 */
525+static inline unsigned long
526+md_copy_from_user(void *to, const void *from, unsigned long n)
527+{
528+ int err;
529+
530+ err = verify_area(VERIFY_READ,from,n);
531+ if (!err)
532+ memcpy_fromfs(to, from, n);
533+ return err;
534+}
535+
536+/* 005 */
537+extern inline unsigned long
538+md_copy_to_user(void *to, const void *from, unsigned long n)
539+{
540+ int err;
541+
542+ err = verify_area(VERIFY_WRITE,to,n);
543+ if (!err)
544+ memcpy_tofs(to, from, n);
545+ return err;
546+}
547+
548+/* 006 */
549+#define md_put_user(x,ptr) \
550+({ \
551+ int __err; \
552+ \
553+ __err = verify_area(VERIFY_WRITE,ptr,sizeof(*ptr)); \
554+ if (!__err) \
555+ put_user(x,ptr); \
556+ __err; \
557+})
558+
559+/* 007 */
560+extern inline int md_capable_admin(void)
561+{
562+ return suser();
563+}
564+
565+/* 008 */
566+#define MD_FILE_TO_INODE(file) ((file)->f_inode)
567+
568+/* 009 */
569+extern inline void md_flush_signals (void)
570+{
571+ current->signal = 0;
572+}
573+
574+/* 010 */
575+#define __S(nr) (1<<((nr)-1))
576+extern inline void md_init_signals (void)
577+{
578+ current->exit_signal = SIGCHLD;
579+ current->blocked = ~(__S(SIGKILL));
580+}
581+#undef __S
582+
583+/* 011 */
584+extern inline unsigned long md_signal_pending (struct task_struct * tsk)
585+{
586+ return (tsk->signal & ~tsk->blocked);
587+}
588+
589+/* 012 */
590+#define md_set_global_readahead(x) read_ahead[MD_MAJOR] = MD_READAHEAD
591+
592+/* 013 */
593+#define md_mdelay(n) (\
594+ {unsigned long msec=(n); while (msec--) udelay(1000);})
595+
596+/* 014 */
597+#define MD_SYS_DOWN 0
598+#define MD_SYS_HALT 0
599+#define MD_SYS_POWER_OFF 0
600+
601+/* 015 */
602+#define md_register_reboot_notifier(x)
603+
604+/* 016 */
605+extern __inline__ unsigned long
606+md_test_and_set_bit(int nr, void * addr)
607+{
608+ unsigned long flags;
609+ unsigned long oldbit;
610+
611+ save_flags(flags);
612+ cli();
613+ oldbit = test_bit(nr,addr);
614+ set_bit(nr,addr);
615+ restore_flags(flags);
616+ return oldbit;
617+}
618+
619+/* 017 */
620+extern __inline__ unsigned long
621+md_test_and_clear_bit(int nr, void * addr)
622+{
623+ unsigned long flags;
624+ unsigned long oldbit;
625+
626+ save_flags(flags);
627+ cli();
628+ oldbit = test_bit(nr,addr);
629+ clear_bit(nr,addr);
630+ restore_flags(flags);
631+ return oldbit;
632+}
633+
634+/* 018 */
635+#define md_atomic_read(x) (*(volatile int *)(x))
636+#define md_atomic_set(x,y) (*(volatile int *)(x) = (y))
637+
638+/* 019 */
639+extern __inline__ void md_lock_kernel (void)
640+{
641+#if __SMP__
642+ lock_kernel();
643+ syscall_count++;
644+#endif
645+}
646+
647+extern __inline__ void md_unlock_kernel (void)
648+{
649+#if __SMP__
650+ syscall_count--;
651+ unlock_kernel();
652+#endif
653+}
654+/* 020 */
655+
656+#define md__init
657+#define md__initdata
658+#define md__initfunc(__arginit) __arginit
659+
660+/* 021 */
661+
662+/* 022 */
663+
664+struct md_list_head {
665+ struct md_list_head *next, *prev;
666+};
667+
668+#define MD_LIST_HEAD(name) \
669+ struct md_list_head name = { &name, &name }
670+
671+#define MD_INIT_LIST_HEAD(ptr) do { \
672+ (ptr)->next = (ptr); (ptr)->prev = (ptr); \
673+} while (0)
674+
675+static __inline__ void md__list_add(struct md_list_head * new,
676+ struct md_list_head * prev,
677+ struct md_list_head * next)
678+{
679+ next->prev = new;
680+ new->next = next;
681+ new->prev = prev;
682+ prev->next = new;
683+}
684+
685+static __inline__ void md_list_add(struct md_list_head *new,
686+ struct md_list_head *head)
687+{
688+ md__list_add(new, head, head->next);
689+}
690+
691+static __inline__ void md__list_del(struct md_list_head * prev,
692+ struct md_list_head * next)
693+{
694+ next->prev = prev;
695+ prev->next = next;
696+}
697+
698+static __inline__ void md_list_del(struct md_list_head *entry)
699+{
700+ md__list_del(entry->prev, entry->next);
701+}
702+
703+static __inline__ int md_list_empty(struct md_list_head *head)
704+{
705+ return head->next == head;
706+}
707+
708+#define md_list_entry(ptr, type, member) \
709+ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
710+
711+/* 023 */
712+
713+static __inline__ signed long md_schedule_timeout(signed long timeout)
714+{
715+ current->timeout = jiffies + timeout;
716+ schedule();
717+ return 0;
718+}
719+
720+/* 024 */
721+#define md_need_resched(tsk) (need_resched)
722+
723+/* 025 */
724+typedef struct { int gcc_is_buggy; } md_spinlock_t;
725+#define MD_SPIN_LOCK_UNLOCKED (md_spinlock_t) { 0 }
726+
727+#define md_spin_lock_irq cli
728+#define md_spin_unlock_irq sti
729+#define md_spin_unlock_irqrestore(x,flags) restore_flags(flags)
730+#define md_spin_lock_irqsave(x,flags) do { save_flags(flags); cli(); } while (0)
731+
732+/* END */
733+
734+#else
735+
736+#include <linux/reboot.h>
737+#include <linux/vmalloc.h>
738+
739+/* 000 */
740+#define md__get_free_pages(x,y) __get_free_pages(x,y)
741+
742+#ifdef __i386__
743+/* 001 */
744+extern __inline__ int md_cpu_has_mmx(void)
745+{
746+ return boot_cpu_data.x86_capability & X86_FEATURE_MMX;
747+}
748+#endif
749+
750+/* 002 */
751+#define md_clear_page(page) clear_page(page)
752+
753+/* 003 */
754+#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x)
755+
756+/* 004 */
757+#define md_copy_to_user(x,y,z) copy_to_user(x,y,z)
758+
759+/* 005 */
760+#define md_copy_from_user(x,y,z) copy_from_user(x,y,z)
761+
762+/* 006 */
763+#define md_put_user put_user
764+
765+/* 007 */
766+extern inline int md_capable_admin(void)
767+{
768+ return capable(CAP_SYS_ADMIN);
769+}
770+
771+/* 008 */
772+#define MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode)
773+
774+/* 009 */
775+extern inline void md_flush_signals (void)
776+{
777+ spin_lock(&current->sigmask_lock);
778+ flush_signals(current);
779+ spin_unlock(&current->sigmask_lock);
780+}
781+
782+/* 010 */
783+extern inline void md_init_signals (void)
784+{
785+ current->exit_signal = SIGCHLD;
786+ siginitsetinv(&current->blocked, sigmask(SIGKILL));
787+}
788+
789+/* 011 */
790+#define md_signal_pending signal_pending
791+
792+/* 012 */
793+extern inline void md_set_global_readahead(int * table)
794+{
795+ max_readahead[MD_MAJOR] = table;
796+}
797+
798+/* 013 */
799+#define md_mdelay(x) mdelay(x)
800+
801+/* 014 */
802+#define MD_SYS_DOWN SYS_DOWN
803+#define MD_SYS_HALT SYS_HALT
804+#define MD_SYS_POWER_OFF SYS_POWER_OFF
805+
806+/* 015 */
807+#define md_register_reboot_notifier register_reboot_notifier
808+
809+/* 016 */
810+#define md_test_and_set_bit test_and_set_bit
811+
812+/* 017 */
813+#define md_test_and_clear_bit test_and_clear_bit
814+
815+/* 018 */
816+#define md_atomic_read atomic_read
817+#define md_atomic_set atomic_set
818+
819+/* 019 */
820+#define md_lock_kernel lock_kernel
821+#define md_unlock_kernel unlock_kernel
822+
823+/* 020 */
824+
825+#include <linux/init.h>
826+
827+#define md__init __init
828+#define md__initdata __initdata
829+#define md__initfunc(__arginit) __initfunc(__arginit)
830+
831+/* 021 */
832+
833+
834+/* 022 */
835+
836+#define md_list_head list_head
837+#define MD_LIST_HEAD(name) LIST_HEAD(name)
838+#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr)
839+#define md_list_add list_add
840+#define md_list_del list_del
841+#define md_list_empty list_empty
842+
843+#define md_list_entry(ptr, type, member) list_entry(ptr, type, member)
844+
845+/* 023 */
846+
847+#define md_schedule_timeout schedule_timeout
848+
849+/* 024 */
850+#define md_need_resched(tsk) ((tsk)->need_resched)
851+
852+/* 025 */
853+#define md_spinlock_t spinlock_t
854+#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED
855+
856+#define md_spin_lock_irq spin_lock_irq
857+#define md_spin_unlock_irq spin_unlock_irq
858+#define md_spin_unlock_irqrestore spin_unlock_irqrestore
859+#define md_spin_lock_irqsave spin_lock_irqsave
860+
861+/* END */
862+
863+#endif
864+
865+#endif /* _MD_COMPATIBLE_H */
866+
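For pre-2.1 kernels the header open-codes a doubly linked ring (md_list_head) that the #else branch simply aliases to the 2.2 list_head API. A small userspace sketch (not part of the patch) of those fallback operations, showing the insert-at-head and unlink behaviour:

#include <stdio.h>

struct md_list_head {
	struct md_list_head *next, *prev;
};

#define MD_LIST_HEAD(name) struct md_list_head name = { &name, &name }

#define md_list_entry(ptr, type, member) \
	((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))

static void md_list_add(struct md_list_head *new, struct md_list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

static void md_list_del(struct md_list_head *entry)
{
	entry->next->prev = entry->prev;
	entry->prev->next = entry->next;
}

struct item { int value; struct md_list_head node; };

int main(void)
{
	MD_LIST_HEAD(head);
	struct item a = { 1 }, b = { 2 }, c = { 3 };
	struct md_list_head *p;

	md_list_add(&a.node, &head);
	md_list_add(&b.node, &head);
	md_list_add(&c.node, &head);	/* head -> c -> b -> a: adds insert at the head */
	md_list_del(&b.node);		/* drop the middle element */

	for (p = head.next; p != &head; p = p->next)
		printf("%d\n", md_list_entry(p, struct item, node)->value);	/* 3 then 1 */
	return 0;
}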
867--- linux/include/linux/raid/md_k.h.orig Fri Nov 23 11:18:16 2001
868+++ linux/include/linux/raid/md_k.h Fri Nov 23 11:18:16 2001
869@@ -0,0 +1,338 @@
870+/*
871+ md_k.h : kernel internal structure of the Linux MD driver
872+ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
873+
874+ This program is free software; you can redistribute it and/or modify
875+ it under the terms of the GNU General Public License as published by
876+ the Free Software Foundation; either version 2, or (at your option)
877+ any later version.
878+
879+ You should have received a copy of the GNU General Public License
880+ (for example /usr/src/linux/COPYING); if not, write to the Free
881+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
882+*/
883+
884+#ifndef _MD_K_H
885+#define _MD_K_H
886+
887+#define MD_RESERVED 0UL
888+#define LINEAR 1UL
889+#define STRIPED 2UL
890+#define RAID0 STRIPED
891+#define RAID1 3UL
892+#define RAID5 4UL
893+#define TRANSLUCENT 5UL
894+#define HSM 6UL
895+#define MAX_PERSONALITY 7UL
896+
897+extern inline int pers_to_level (int pers)
898+{
899+ switch (pers) {
900+ case HSM: return -3;
901+ case TRANSLUCENT: return -2;
902+ case LINEAR: return -1;
903+ case RAID0: return 0;
904+ case RAID1: return 1;
905+ case RAID5: return 5;
906+ }
907+ panic("pers_to_level()");
908+}
909+
910+extern inline int level_to_pers (int level)
911+{
912+ switch (level) {
913+ case -3: return HSM;
914+ case -2: return TRANSLUCENT;
915+ case -1: return LINEAR;
916+ case 0: return RAID0;
917+ case 1: return RAID1;
918+ case 4:
919+ case 5: return RAID5;
920+ }
921+ return MD_RESERVED;
922+}
923+
924+typedef struct mddev_s mddev_t;
925+typedef struct mdk_rdev_s mdk_rdev_t;
926+
927+#if (MINORBITS != 8)
928+#error MD doesnt handle bigger kdev yet
929+#endif
930+
931+#define MAX_REAL 12 /* Max number of disks per md dev */
932+#define MAX_MD_DEVS (1<<MINORBITS) /* Max number of md dev */
933+
934+/*
935+ * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
936+ * the personality. (eg. HSM uses this to identify individual LVs)
937+ */
938+typedef struct dev_mapping_s {
939+ mddev_t *mddev;
940+ void *data;
941+} dev_mapping_t;
942+
943+extern dev_mapping_t mddev_map [MAX_MD_DEVS];
944+
945+extern inline mddev_t * kdev_to_mddev (kdev_t dev)
946+{
947+ return mddev_map[MINOR(dev)].mddev;
948+}
949+
950+/*
951+ * options passed in raidrun:
952+ */
953+
954+#define MAX_CHUNK_SIZE (4096*1024)
955+
956+/*
957+ * default readahead
958+ */
959+#define MD_READAHEAD (256 * 512)
960+
961+extern inline int disk_faulty(mdp_disk_t * d)
962+{
963+ return d->state & (1 << MD_DISK_FAULTY);
964+}
965+
966+extern inline int disk_active(mdp_disk_t * d)
967+{
968+ return d->state & (1 << MD_DISK_ACTIVE);
969+}
970+
971+extern inline int disk_sync(mdp_disk_t * d)
972+{
973+ return d->state & (1 << MD_DISK_SYNC);
974+}
975+
976+extern inline int disk_spare(mdp_disk_t * d)
977+{
978+ return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
979+}
980+
981+extern inline int disk_removed(mdp_disk_t * d)
982+{
983+ return d->state & (1 << MD_DISK_REMOVED);
984+}
985+
986+extern inline void mark_disk_faulty(mdp_disk_t * d)
987+{
988+ d->state |= (1 << MD_DISK_FAULTY);
989+}
990+
991+extern inline void mark_disk_active(mdp_disk_t * d)
992+{
993+ d->state |= (1 << MD_DISK_ACTIVE);
994+}
995+
996+extern inline void mark_disk_sync(mdp_disk_t * d)
997+{
998+ d->state |= (1 << MD_DISK_SYNC);
999+}
1000+
1001+extern inline void mark_disk_spare(mdp_disk_t * d)
1002+{
1003+ d->state = 0;
1004+}
1005+
1006+extern inline void mark_disk_removed(mdp_disk_t * d)
1007+{
1008+ d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
1009+}
1010+
1011+extern inline void mark_disk_inactive(mdp_disk_t * d)
1012+{
1013+ d->state &= ~(1 << MD_DISK_ACTIVE);
1014+}
1015+
1016+extern inline void mark_disk_nonsync(mdp_disk_t * d)
1017+{
1018+ d->state &= ~(1 << MD_DISK_SYNC);
1019+}
1020+
1021+/*
1022+ * MD's 'extended' device
1023+ */
1024+struct mdk_rdev_s
1025+{
1026+ struct md_list_head same_set; /* RAID devices within the same set */
1027+ struct md_list_head all; /* all RAID devices */
1028+ struct md_list_head pending; /* undetected RAID devices */
1029+
1030+ kdev_t dev; /* Device number */
1031+ kdev_t old_dev; /* "" when it was last imported */
1032+ int size; /* Device size (in blocks) */
1033+ mddev_t *mddev; /* RAID array if running */
1034+ unsigned long last_events; /* IO event timestamp */
1035+
1036+ struct inode *inode; /* Lock inode */
1037+ struct file filp; /* Lock file */
1038+
1039+ mdp_super_t *sb;
1040+ int sb_offset;
1041+
1042+ int faulty; /* if faulty do not issue IO requests */
1043+ int desc_nr; /* descriptor index in the superblock */
1044+};
1045+
1046+
1047+/*
1048+ * disk operations in a working array:
1049+ */
1050+#define DISKOP_SPARE_INACTIVE 0
1051+#define DISKOP_SPARE_WRITE 1
1052+#define DISKOP_SPARE_ACTIVE 2
1053+#define DISKOP_HOT_REMOVE_DISK 3
1054+#define DISKOP_HOT_ADD_DISK 4
1055+
1056+typedef struct mdk_personality_s mdk_personality_t;
1057+
1058+struct mddev_s
1059+{
1060+ void *private;
1061+ mdk_personality_t *pers;
1062+ int __minor;
1063+ mdp_super_t *sb;
1064+ int nb_dev;
1065+ struct md_list_head disks;
1066+ int sb_dirty;
1067+ mdu_param_t param;
1068+ int ro;
1069+ unsigned int curr_resync;
1070+ unsigned long resync_start;
1071+ char *name;
1072+ int recovery_running;
1073+ struct semaphore reconfig_sem;
1074+ struct semaphore recovery_sem;
1075+ struct semaphore resync_sem;
1076+ struct md_list_head all_mddevs;
1077+};
1078+
1079+struct mdk_personality_s
1080+{
1081+ char *name;
1082+ int (*map)(mddev_t *mddev, kdev_t dev, kdev_t *rdev,
1083+ unsigned long *rsector, unsigned long size);
1084+ int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh);
1085+ void (*end_request)(struct buffer_head * bh, int uptodate);
1086+ int (*run)(mddev_t *mddev);
1087+ int (*stop)(mddev_t *mddev);
1088+ int (*status)(char *page, mddev_t *mddev);
1089+ int (*ioctl)(struct inode *inode, struct file *file,
1090+ unsigned int cmd, unsigned long arg);
1091+ int max_invalid_dev;
1092+ int (*error_handler)(mddev_t *mddev, kdev_t dev);
1093+
1094+/*
1095+ * Some personalities (RAID-1, RAID-5) can have disks hot-added and
1096+ * hot-removed. Hot removal is different from failure. (failure marks
1097+ * a disk inactive, but the disk is still part of the array) The interface
1098+ * to such operations is the 'pers->diskop()' function, can be NULL.
1099+ *
1100+ * the diskop function can change the pointer pointing to the incoming
1101+ * descriptor, but must do so very carefully. (currently only
1102+ * SPARE_ACTIVE expects such a change)
1103+ */
1104+ int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
1105+
1106+ int (*stop_resync)(mddev_t *mddev);
1107+ int (*restart_resync)(mddev_t *mddev);
1108+};
1109+
1110+
1111+/*
1112+ * Currently we index md_array directly, based on the minor
1113+ * number. This will have to change to dynamic allocation
1114+ * once we start supporting partitioning of md devices.
1115+ */
1116+extern inline int mdidx (mddev_t * mddev)
1117+{
1118+ return mddev->__minor;
1119+}
1120+
1121+extern inline kdev_t mddev_to_kdev(mddev_t * mddev)
1122+{
1123+ return MKDEV(MD_MAJOR, mdidx(mddev));
1124+}
1125+
1126+extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev);
1127+extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
1128+
1129+/*
1130+ * iterates through some rdev ringlist. It's safe to remove the
1131+ * current 'rdev'. Don't touch 'tmp' though.
1132+ */
1133+#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \
1134+ \
1135+ for (tmp = head.next; \
1136+ rdev = md_list_entry(tmp, mdk_rdev_t, field), \
1137+ tmp = tmp->next, tmp->prev != &head \
1138+ ; )
1139+/*
1140+ * iterates through the 'same array disks' ringlist
1141+ */
1142+#define ITERATE_RDEV(mddev,rdev,tmp) \
1143+ ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
1144+
1145+/*
1146+ * Same as above, but assumes that the device has rdev->desc_nr numbered
1147+ * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order.
1148+ */
1149+#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \
1150+ for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
1151+
1152+
1153+/*
1154+ * Iterates through all 'RAID managed disks'
1155+ */
1156+#define ITERATE_RDEV_ALL(rdev,tmp) \
1157+ ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
1158+
1159+/*
1160+ * Iterates through 'pending RAID disks'
1161+ */
1162+#define ITERATE_RDEV_PENDING(rdev,tmp) \
1163+ ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
1164+
1165+/*
1166+ * iterates through all used mddevs in the system.
1167+ */
1168+#define ITERATE_MDDEV(mddev,tmp) \
1169+ \
1170+ for (tmp = all_mddevs.next; \
1171+ mddev = md_list_entry(tmp, mddev_t, all_mddevs), \
1172+ tmp = tmp->next, tmp->prev != &all_mddevs \
1173+ ; )
1174+
1175+extern inline int lock_mddev (mddev_t * mddev)
1176+{
1177+ return down_interruptible(&mddev->reconfig_sem);
1178+}
1179+
1180+extern inline void unlock_mddev (mddev_t * mddev)
1181+{
1182+ up(&mddev->reconfig_sem);
1183+}
1184+
1185+#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
1186+ x = y; y = __tmp; } while (0)
1187+
1188+typedef struct mdk_thread_s {
1189+ void (*run) (void *data);
1190+ void *data;
1191+ struct wait_queue *wqueue;
1192+ unsigned long flags;
1193+ struct semaphore *sem;
1194+ struct task_struct *tsk;
1195+ const char *name;
1196+} mdk_thread_t;
1197+
1198+#define THREAD_WAKEUP 0
1199+
1200+typedef struct dev_name_s {
1201+ struct md_list_head list;
1202+ kdev_t dev;
1203+ char name [MAX_DISKNAME_LEN];
1204+} dev_name_t;
1205+
1206+#endif /* _MD_K_H */
1207+
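The comment above ITERATE_RDEV_GENERIC() says it is safe to unlink the current 'rdev' but not to touch 'tmp'; that works because 'tmp' is advanced to the next node before the loop body runs. A self-contained userspace sketch (not part of the patch; mdk_rdev_t is stubbed down to the two fields the demo needs) demonstrating it:

#include <stdio.h>

struct md_list_head { struct md_list_head *next, *prev; };

#define md_list_entry(ptr, type, member) \
	((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))

typedef struct mdk_rdev_s {		/* stub: just enough of mdk_rdev_t for the demo */
	int faulty;
	struct md_list_head same_set;
} mdk_rdev_t;

/* same shape as the md_k.h macro: tmp already points past rdev when the body runs */
#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp)		\
	for (tmp = (head).next;					\
	     rdev = md_list_entry(tmp, mdk_rdev_t, field),	\
	     tmp = tmp->next, tmp->prev != &(head)		\
	     ; )

static void md_list_add(struct md_list_head *new, struct md_list_head *head)
{
	new->next = head->next; new->prev = head;
	head->next->prev = new; head->next = new;
}

static void md_list_del(struct md_list_head *entry)
{
	entry->next->prev = entry->prev;
	entry->prev->next = entry->next;
}

int main(void)
{
	struct md_list_head head = { &head, &head };
	mdk_rdev_t r[3] = { { 0 }, { 1 }, { 0 } };	/* the middle device is "faulty" */
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;
	int i;

	for (i = 2; i >= 0; i--)			/* build head -> r[0] -> r[1] -> r[2] */
		md_list_add(&r[i].same_set, &head);

	ITERATE_RDEV_GENERIC(head, same_set, rdev, tmp) {
		if (rdev->faulty)
			md_list_del(&rdev->same_set);	/* unlinking 'rdev' here is safe */
		else
			printf("kept rdev %d\n", (int)(rdev - r));
	}
	return 0;					/* prints: kept rdev 0, kept rdev 2 */
}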
1208--- linux/include/linux/raid/md_p.h.orig Fri Nov 23 11:18:16 2001
1209+++ linux/include/linux/raid/md_p.h Fri Nov 23 11:18:16 2001
1210@@ -0,0 +1,161 @@
1211+/*
1212+ md_p.h : physical layout of Linux RAID devices
1213+ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
1214+
1215+ This program is free software; you can redistribute it and/or modify
1216+ it under the terms of the GNU General Public License as published by
1217+ the Free Software Foundation; either version 2, or (at your option)
1218+ any later version.
1219+
1220+ You should have received a copy of the GNU General Public License
1221+ (for example /usr/src/linux/COPYING); if not, write to the Free
1222+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1223+*/
1224+
1225+#ifndef _MD_P_H
1226+#define _MD_P_H
1227+
1228+/*
1229+ * RAID superblock.
1230+ *
1231+ * The RAID superblock maintains some statistics on each RAID configuration.
1232+ * Each real device in the RAID set contains it near the end of the device.
1233+ * Some of the ideas are copied from the ext2fs implementation.
1234+ *
1235+ * We currently use 4096 bytes as follows:
1236+ *
1237+ * word offset function
1238+ *
1239+ * 0 - 31 Constant generic RAID device information.
1240+ * 32 - 63 Generic state information.
1241+ * 64 - 127 Personality specific information.
1242+ * 128 - 511 12 32-words descriptors of the disks in the raid set.
1243+ * 512 - 911 Reserved.
1244+ * 912 - 1023 Disk specific descriptor.
1245+ */
1246+
1247+/*
1248+ * If x is the real device size in bytes, we return an apparent size of:
1249+ *
1250+ * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
1251+ *
1252+ * and place the 4kB superblock at offset y.
1253+ */
1254+#define MD_RESERVED_BYTES (64 * 1024)
1255+#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
1256+#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
1257+
1258+#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
1259+#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
1260+
1261+#define MD_SB_BYTES 4096
1262+#define MD_SB_WORDS (MD_SB_BYTES / 4)
1263+#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
1264+#define MD_SB_SECTORS (MD_SB_BYTES / 512)
1265+
1266+/*
1267+ * The following are counted in 32-bit words
1268+ */
1269+#define MD_SB_GENERIC_OFFSET 0
1270+#define MD_SB_PERSONALITY_OFFSET 64
1271+#define MD_SB_DISKS_OFFSET 128
1272+#define MD_SB_DESCRIPTOR_OFFSET 992
1273+
1274+#define MD_SB_GENERIC_CONSTANT_WORDS 32
1275+#define MD_SB_GENERIC_STATE_WORDS 32
1276+#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
1277+#define MD_SB_PERSONALITY_WORDS 64
1278+#define MD_SB_DISKS_WORDS 384
1279+#define MD_SB_DESCRIPTOR_WORDS 32
1280+#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
1281+#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
1282+#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
1283+
1284+/*
1285+ * Device "operational" state bits
1286+ */
1287+#define MD_DISK_FAULTY 0 /* disk is faulty / operational */
1288+#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */
1289+#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
1290+#define MD_DISK_REMOVED 3 /* disk has been removed from the raid set */
1291+
1292+typedef struct mdp_device_descriptor_s {
1293+ __u32 number; /* 0 Device number in the entire set */
1294+ __u32 major; /* 1 Device major number */
1295+ __u32 minor; /* 2 Device minor number */
1296+ __u32 raid_disk; /* 3 The role of the device in the raid set */
1297+ __u32 state; /* 4 Operational state */
1298+ __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
1299+} mdp_disk_t;
1300+
1301+#define MD_SB_MAGIC 0xa92b4efc
1302+
1303+/*
1304+ * Superblock state bits
1305+ */
1306+#define MD_SB_CLEAN 0
1307+#define MD_SB_ERRORS 1
1308+
1309+typedef struct mdp_superblock_s {
1310+ /*
1311+ * Constant generic information
1312+ */
1313+ __u32 md_magic; /* 0 MD identifier */
1314+ __u32 major_version; /* 1 major version to which the set conforms */
1315+ __u32 minor_version; /* 2 minor version ... */
1316+ __u32 patch_version; /* 3 patchlevel version ... */
1317+ __u32 gvalid_words; /* 4 Number of used words in this section */
1318+ __u32 set_uuid0; /* 5 Raid set identifier */
1319+ __u32 ctime; /* 6 Creation time */
1320+ __u32 level; /* 7 Raid personality */
1321+ __u32 size; /* 8 Apparent size of each individual disk */
1322+ __u32 nr_disks; /* 9 total disks in the raid set */
1323+ __u32 raid_disks; /* 10 disks in a fully functional raid set */
1324+ __u32 md_minor; /* 11 preferred MD minor device number */
1325+ __u32 not_persistent; /* 12 does it have a persistent superblock */
1326+ __u32 set_uuid1; /* 13 Raid set identifier #2 */
1327+ __u32 set_uuid2; /* 14 Raid set identifier #3 */
1328+	__u32 set_uuid3;	/* 15 Raid set identifier #4 */
1329+ __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
1330+
1331+ /*
1332+ * Generic state information
1333+ */
1334+ __u32 utime; /* 0 Superblock update time */
1335+ __u32 state; /* 1 State bits (clean, ...) */
1336+ __u32 active_disks; /* 2 Number of currently active disks */
1337+ __u32 working_disks; /* 3 Number of working disks */
1338+ __u32 failed_disks; /* 4 Number of failed disks */
1339+ __u32 spare_disks; /* 5 Number of spare disks */
1340+ __u32 sb_csum; /* 6 checksum of the whole superblock */
1341+ __u64 events; /* 7 number of superblock updates (64-bit!) */
1342+ __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9];
1343+
1344+ /*
1345+ * Personality information
1346+ */
1347+ __u32 layout; /* 0 the array's physical layout */
1348+ __u32 chunk_size; /* 1 chunk size in bytes */
1349+ __u32 root_pv; /* 2 LV root PV */
1350+ __u32 root_block; /* 3 LV root block */
1351+ __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
1352+
1353+ /*
1354+ * Disks information
1355+ */
1356+ mdp_disk_t disks[MD_SB_DISKS];
1357+
1358+ /*
1359+ * Reserved
1360+ */
1361+ __u32 reserved[MD_SB_RESERVED_WORDS];
1362+
1363+ /*
1364+ * Active descriptor
1365+ */
1366+ mdp_disk_t this_disk;
1367+
1368+} mdp_super_t;
1369+
1370+#endif /* _MD_P_H */
1371+
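The comment above defines the apparent component size as y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES and places the 4 kB superblock at offset y. A worked example (not part of the patch; the device sizes are made up) in 512-byte sector units:

#include <stdio.h>

#define MD_RESERVED_BYTES	(64 * 1024)
#define MD_RESERVED_SECTORS	(MD_RESERVED_BYTES / 512)	/* 128 */

#define MD_NEW_SIZE_SECTORS(x)	((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)

int main(void)
{
	/* hypothetical component devices, sizes in 512-byte sectors */
	unsigned long sizes[] = { 2000000, 2000100, 4194304 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		unsigned long apparent = MD_NEW_SIZE_SECTORS(sizes[i]);

		/* the 4 kB (8-sector) superblock starts right at the apparent size */
		printf("device of %8lu sectors: apparent size %8lu, superblock at sector %8lu\n",
		       sizes[i], apparent, apparent);
	}
	return 0;
}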
1372--- linux/include/linux/raid/md_u.h.orig Fri Nov 23 11:18:16 2001
1373+++ linux/include/linux/raid/md_u.h Fri Nov 23 11:18:16 2001
1374@@ -0,0 +1,115 @@
1375+/*
1376+ md_u.h : user <=> kernel API between Linux raidtools and RAID drivers
1377+ Copyright (C) 1998 Ingo Molnar
1378+
1379+ This program is free software; you can redistribute it and/or modify
1380+ it under the terms of the GNU General Public License as published by
1381+ the Free Software Foundation; either version 2, or (at your option)
1382+ any later version.
1383+
1384+ You should have received a copy of the GNU General Public License
1385+ (for example /usr/src/linux/COPYING); if not, write to the Free
1386+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1387+*/
1388+
1389+#ifndef _MD_U_H
1390+#define _MD_U_H
1391+
1392+/* ioctls */
1393+
1394+/* status */
1395+#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t)
1396+#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t)
1397+#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t)
1398+#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13)
1399+
1400+/* configuration */
1401+#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20)
1402+#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t)
1403+#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22)
1404+#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t)
1405+#define SET_DISK_INFO _IO (MD_MAJOR, 0x24)
1406+#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25)
1407+#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26)
1408+#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27)
1409+#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28)
1410+#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29)
1411+
1412+/* usage */
1413+#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t)
1414+#define START_ARRAY _IO (MD_MAJOR, 0x31)
1415+#define STOP_ARRAY _IO (MD_MAJOR, 0x32)
1416+#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33)
1417+#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34)
1418+
1419+typedef struct mdu_version_s {
1420+ int major;
1421+ int minor;
1422+ int patchlevel;
1423+} mdu_version_t;
1424+
1425+typedef struct mdu_array_info_s {
1426+ /*
1427+ * Generic constant information
1428+ */
1429+ int major_version;
1430+ int minor_version;
1431+ int patch_version;
1432+ int ctime;
1433+ int level;
1434+ int size;
1435+ int nr_disks;
1436+ int raid_disks;
1437+ int md_minor;
1438+ int not_persistent;
1439+
1440+ /*
1441+ * Generic state information
1442+ */
1443+ int utime; /* 0 Superblock update time */
1444+ int state; /* 1 State bits (clean, ...) */
1445+ int active_disks; /* 2 Number of currently active disks */
1446+ int working_disks; /* 3 Number of working disks */
1447+ int failed_disks; /* 4 Number of failed disks */
1448+ int spare_disks; /* 5 Number of spare disks */
1449+
1450+ /*
1451+ * Personality information
1452+ */
1453+ int layout; /* 0 the array's physical layout */
1454+ int chunk_size; /* 1 chunk size in bytes */
1455+
1456+} mdu_array_info_t;
1457+
1458+typedef struct mdu_disk_info_s {
1459+ /*
1460+ * configuration/status of one particular disk
1461+ */
1462+ int number;
1463+ int major;
1464+ int minor;
1465+ int raid_disk;
1466+ int state;
1467+
1468+} mdu_disk_info_t;
1469+
1470+typedef struct mdu_start_info_s {
1471+ /*
1472+ * configuration/status of one particular disk
1473+ */
1474+ int major;
1475+ int minor;
1476+ int raid_disk;
1477+ int state;
1478+
1479+} mdu_start_info_t;
1480+
1481+typedef struct mdu_param_s
1482+{
1483+ int personality; /* 1,2,3,4 */
1484+ int chunk_size; /* in bytes */
1485+ int max_fault; /* unused for now */
1486+} mdu_param_t;
1487+
1488+#endif /* _MD_U_H */
1489+
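A userspace sketch of the status ioctls declared above. It assumes the header is installed where <linux/raid/md_u.h> resolves (with MD_MAJOR coming from <linux/major.h>) and that /dev/md0 is a running array; both are assumptions for illustration, not part of the patch.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/major.h>		/* MD_MAJOR, used by the _IO() macros */
#include <linux/raid/md_u.h>

int main(void)
{
	mdu_version_t ver;
	mdu_array_info_t info;
	int fd = open("/dev/md0", O_RDONLY);	/* hypothetical array device */

	if (fd < 0) {
		perror("/dev/md0");
		return 1;
	}
	if (ioctl(fd, RAID_VERSION, &ver) == 0)
		printf("md driver %d.%d.%d\n", ver.major, ver.minor, ver.patchlevel);
	if (ioctl(fd, GET_ARRAY_INFO, &info) == 0)
		printf("level %d, %d raid disks, %d active, %d working, %d failed, %d spare\n",
		       info.level, info.raid_disks, info.active_disks,
		       info.working_disks, info.failed_disks, info.spare_disks);
	close(fd);
	return 0;
}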
1490--- linux/include/linux/raid/raid0.h.orig Fri Nov 23 11:18:16 2001
1491+++ linux/include/linux/raid/raid0.h Fri Nov 23 11:18:16 2001
1492@@ -0,0 +1,33 @@
1493+#ifndef _RAID0_H
1494+#define _RAID0_H
1495+
1496+#include <linux/raid/md.h>
1497+
1498+struct strip_zone
1499+{
1500+ int zone_offset; /* Zone offset in md_dev */
1501+ int dev_offset; /* Zone offset in real dev */
1502+ int size; /* Zone size */
1503+ int nb_dev; /* # of devices attached to the zone */
1504+ mdk_rdev_t *dev[MAX_REAL]; /* Devices attached to the zone */
1505+};
1506+
1507+struct raid0_hash
1508+{
1509+ struct strip_zone *zone0, *zone1;
1510+};
1511+
1512+struct raid0_private_data
1513+{
1514+ struct raid0_hash *hash_table; /* Dynamically allocated */
1515+ struct strip_zone *strip_zone; /* This one too */
1516+ int nr_strip_zones;
1517+ struct strip_zone *smallest;
1518+ int nr_zones;
1519+};
1520+
1521+typedef struct raid0_private_data raid0_conf_t;
1522+
1523+#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
1524+
1525+#endif
1526--- linux/include/linux/raid/raid1.h.orig Fri Nov 23 11:18:16 2001
1527+++ linux/include/linux/raid/raid1.h Fri Nov 23 11:18:16 2001
1528@@ -0,0 +1,64 @@
1529+#ifndef _RAID1_H
1530+#define _RAID1_H
1531+
1532+#include <linux/raid/md.h>
1533+
1534+struct mirror_info {
1535+ int number;
1536+ int raid_disk;
1537+ kdev_t dev;
1538+ int next;
1539+ int sect_limit;
1540+
1541+ /*
1542+ * State bits:
1543+ */
1544+ int operational;
1545+ int write_only;
1546+ int spare;
1547+
1548+ int used_slot;
1549+};
1550+
1551+struct raid1_private_data {
1552+ mddev_t *mddev;
1553+ struct mirror_info mirrors[MD_SB_DISKS];
1554+ int nr_disks;
1555+ int raid_disks;
1556+ int working_disks;
1557+ int last_used;
1558+ unsigned long next_sect;
1559+ int sect_count;
1560+ mdk_thread_t *thread, *resync_thread;
1561+ int resync_mirrors;
1562+ struct mirror_info *spare;
1563+};
1564+
1565+typedef struct raid1_private_data raid1_conf_t;
1566+
1567+/*
1568+ * this is the only point in the RAID code where we violate
1569+ * C type safety. mddev->private is an 'opaque' pointer.
1570+ */
1571+#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private)
1572+
1573+/*
1574+ * this is our 'private' 'collective' RAID1 buffer head.
1575+ * it contains information about what kind of IO operations were started
1576+ * for this RAID1 operation, and about their status:
1577+ */
1578+
1579+struct raid1_bh {
1580+ atomic_t remaining; /* 'have we finished' count,
1581+ * used from IRQ handlers
1582+ */
1583+ int cmd;
1584+ unsigned long state;
1585+ mddev_t *mddev;
1586+ struct buffer_head *master_bh;
1587+ struct buffer_head *mirror_bh [MD_SB_DISKS];
1588+ struct buffer_head bh_req;
1589+ struct buffer_head *next_retry;
1590+};
1591+
1592+#endif
1593--- linux/include/linux/raid/raid5.h.orig Fri Nov 23 11:18:16 2001
1594+++ linux/include/linux/raid/raid5.h Fri Nov 23 11:18:16 2001
1595@@ -0,0 +1,113 @@
1596+#ifndef _RAID5_H
1597+#define _RAID5_H
1598+
1599+#include <linux/raid/md.h>
1600+#include <linux/raid/xor.h>
1601+
1602+struct disk_info {
1603+ kdev_t dev;
1604+ int operational;
1605+ int number;
1606+ int raid_disk;
1607+ int write_only;
1608+ int spare;
1609+ int used_slot;
1610+};
1611+
1612+struct stripe_head {
1613+ md_spinlock_t stripe_lock;
1614+ struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
1615+ struct stripe_head *free_next; /* pool of free sh's */
1616+ struct buffer_head *buffer_pool; /* pool of free buffers */
1617+ struct buffer_head *bh_pool; /* pool of free bh's */
1618+ struct raid5_private_data *raid_conf;
1619+ struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
1620+ struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
1621+ struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
1622+ struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
1623+ int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
1624+ int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
1625+ unsigned long sector; /* sector of this row */
1626+ int size; /* buffers size */
1627+ int pd_idx; /* parity disk index */
1628+ atomic_t nr_pending; /* nr of pending cmds */
1629+ unsigned long state; /* state flags */
1630+ int cmd; /* stripe cmd */
1631+ int count; /* nr of waiters */
1632+ int write_method; /* reconstruct-write / read-modify-write */
1633+ int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
1634+ struct wait_queue *wait; /* processes waiting for this stripe */
1635+};
1636+
1637+/*
1638+ * Phase
1639+ */
1640+#define PHASE_BEGIN 0
1641+#define PHASE_READ_OLD 1
1642+#define PHASE_WRITE 2
1643+#define PHASE_READ 3
1644+#define PHASE_COMPLETE 4
1645+
1646+/*
1647+ * Write method
1648+ */
1649+#define METHOD_NONE 0
1650+#define RECONSTRUCT_WRITE 1
1651+#define READ_MODIFY_WRITE 2
1652+
1653+/*
1654+ * Stripe state
1655+ */
1656+#define STRIPE_LOCKED 0
1657+#define STRIPE_ERROR 1
1658+
1659+/*
1660+ * Stripe commands
1661+ */
1662+#define STRIPE_NONE 0
1663+#define STRIPE_WRITE 1
1664+#define STRIPE_READ 2
1665+
1666+struct raid5_private_data {
1667+ struct stripe_head **stripe_hashtbl;
1668+ mddev_t *mddev;
1669+ mdk_thread_t *thread, *resync_thread;
1670+ struct disk_info disks[MD_SB_DISKS];
1671+ struct disk_info *spare;
1672+ int buffer_size;
1673+ int chunk_size, level, algorithm;
1674+ int raid_disks, working_disks, failed_disks;
1675+ int sector_count;
1676+ unsigned long next_sector;
1677+ atomic_t nr_handle;
1678+ struct stripe_head *next_free_stripe;
1679+ int nr_stripes;
1680+ int resync_parity;
1681+ int max_nr_stripes;
1682+ int clock;
1683+ int nr_hashed_stripes;
1684+ int nr_locked_stripes;
1685+ int nr_pending_stripes;
1686+ int nr_cached_stripes;
1687+
1688+ /*
1689+ * Free stripes pool
1690+ */
1691+ int nr_free_sh;
1692+ struct stripe_head *free_sh_list;
1693+ struct wait_queue *wait_for_stripe;
1694+};
1695+
1696+typedef struct raid5_private_data raid5_conf_t;
1697+
1698+#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
1699+
1700+/*
1701+ * Our supported algorithms
1702+ */
1703+#define ALGORITHM_LEFT_ASYMMETRIC 0
1704+#define ALGORITHM_RIGHT_ASYMMETRIC 1
1705+#define ALGORITHM_LEFT_SYMMETRIC 2
1706+#define ALGORITHM_RIGHT_SYMMETRIC 3
1707+
1708+#endif
1709--- linux/include/linux/raid/translucent.h.orig Fri Nov 23 11:18:16 2001
1710+++ linux/include/linux/raid/translucent.h Fri Nov 23 11:18:16 2001
1711@@ -0,0 +1,23 @@
1712+#ifndef _TRANSLUCENT_H
1713+#define _TRANSLUCENT_H
1714+
1715+#include <linux/raid/md.h>
1716+
1717+typedef struct dev_info dev_info_t;
1718+
1719+struct dev_info {
1720+ kdev_t dev;
1721+ int size;
1722+};
1723+
1724+struct translucent_private_data
1725+{
1726+ dev_info_t disks[MD_SB_DISKS];
1727+};
1728+
1729+
1730+typedef struct translucent_private_data translucent_conf_t;
1731+
1732+#define mddev_to_conf(mddev) ((translucent_conf_t *) mddev->private)
1733+
1734+#endif
1735--- linux/include/linux/raid/xor.h.orig Fri Nov 23 11:18:16 2001
1736+++ linux/include/linux/raid/xor.h Fri Nov 23 11:18:16 2001
1737@@ -0,0 +1,12 @@
1738+#ifndef _XOR_H
1739+#define _XOR_H
1740+
1741+#include <linux/raid/md.h>
1742+
1743+#define MAX_XOR_BLOCKS 5
1744+
1745+extern void calibrate_xor_block(void);
1746+extern void (*xor_block)(unsigned int count,
1747+ struct buffer_head **bh_ptr);
1748+
1749+#endif
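xor_block() above is a function pointer so that calibrate_xor_block() can pick the fastest routine at boot; what every implementation computes is the running XOR of up to MAX_XOR_BLOCKS buffers into the first one. A minimal userspace sketch of that operation (not part of the patch; plain byte arrays stand in for buffer_heads), showing both parity generation and reconstruction of a missing block:

#include <stdio.h>

#define BLOCK 16	/* tiny blocks for the demo; real buffers are bh->b_size bytes */

static void xor_block(unsigned int count, unsigned char *buf[])
{
	unsigned int i, j;

	for (i = 1; i < count; i++)		/* buf[0] ^= buf[1] ^ ... ^ buf[count-1] */
		for (j = 0; j < BLOCK; j++)
			buf[0][j] ^= buf[i][j];
}

int main(void)
{
	unsigned char d0[BLOCK] = "data block 0", d1[BLOCK] = "data block 1";
	unsigned char parity[BLOCK] = { 0 }, rebuilt[BLOCK] = { 0 };
	unsigned char *gen[] = { parity, d0, d1 };
	unsigned char *rec[] = { rebuilt, parity, d1 };

	xor_block(3, gen);			/* parity  = d0 ^ d1 */
	xor_block(3, rec);			/* rebuilt = parity ^ d1 = d0 */
	printf("reconstructed: %s\n", (char *) rebuilt);	/* prints "data block 0" */
	return 0;
}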
1750--- linux/include/linux/fs.h.orig Fri Nov 23 11:17:45 2001
1751+++ linux/include/linux/fs.h Fri Nov 23 11:18:16 2001
1752@@ -191,6 +191,7 @@
1753 #define BH_Req 3 /* 0 if the buffer has been invalidated */
1754 #define BH_Protected 6 /* 1 if the buffer is protected */
1755 #define BH_Wait_IO 7 /* 1 if we should throttle on this buffer */
1756+#define BH_LowPrio 8 /* 1 if the buffer is lowprio */
1757
1758 /*
1759 * Try to keep the most commonly used fields in single cache lines (16
1760@@ -784,6 +785,7 @@
1761 extern void refile_buffer(struct buffer_head * buf);
1762 extern void set_writetime(struct buffer_head * buf, int flag);
1763 extern int try_to_free_buffers(struct page *, int);
1764+extern void cache_drop_behind(struct buffer_head *bh);
1765
1766 extern int nr_buffers;
1767 extern long buffermem;
1768@@ -814,6 +816,25 @@
1769 }
1770 }
1771
1772+extern inline void mark_buffer_highprio(struct buffer_head * bh)
1773+{
1774+ clear_bit(BH_LowPrio, &bh->b_state);
1775+}
1776+
1777+extern inline void mark_buffer_lowprio(struct buffer_head * bh)
1778+{
1779+ /*
1780+ * dirty buffers cannot be marked lowprio.
1781+ */
1782+ if (!buffer_dirty(bh))
1783+ set_bit(BH_LowPrio, &bh->b_state);
1784+}
1785+
1786+static inline int buffer_lowprio(struct buffer_head * bh)
1787+{
1788+ return test_bit(BH_LowPrio, &bh->b_state);
1789+}
1790+
1791 extern inline void mark_buffer_dirty(struct buffer_head * bh, int flag)
1792 {
1793 if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1794@@ -821,6 +842,23 @@
1795 if (bh->b_list != BUF_DIRTY)
1796 refile_buffer(bh);
1797 }
1798+ /*
1799+ * if a buffer gets marked dirty then it has to lose
1800+	 * its lowprio state.
1801+ */
1802+ mark_buffer_highprio(bh);
1803+}
1804+
1805+extern inline void mark_buffer_dirty_lowprio(struct buffer_head * bh)
1806+{
1807+ if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1808+ if (bh->b_list != BUF_DIRTY)
1809+ refile_buffer(bh);
1810+ /*
1811+ * Mark it lowprio only if it was not dirty before!
1812+ */
1813+ set_bit(BH_LowPrio, &bh->b_state);
1814+ }
1815 }
1816
1817 extern int check_disk_change(kdev_t dev);
1818@@ -899,6 +937,7 @@
1819 extern struct buffer_head * find_buffer(kdev_t dev, int block, int size);
1820 extern void ll_rw_block(int, int, struct buffer_head * bh[]);
1821 extern int is_read_only(kdev_t);
1822+extern int is_device_idle(kdev_t);
1823 extern void __brelse(struct buffer_head *);
1824 extern inline void brelse(struct buffer_head *buf)
1825 {
1826@@ -914,8 +953,12 @@
1827 extern void set_blocksize(kdev_t dev, int size);
1828 extern unsigned int get_hardblocksize(kdev_t dev);
1829 extern struct buffer_head * bread(kdev_t dev, int block, int size);
1830+extern struct buffer_head * buffer_ready (kdev_t dev, int block, int size);
1831+extern void bread_ahead (kdev_t dev, int block, int size);
1832 extern struct buffer_head * breada(kdev_t dev,int block, int size,
1833 unsigned int pos, unsigned int filesize);
1834+extern struct buffer_head * breada_blocks(kdev_t dev,int block,
1835+ int size, int blocks);
1836
1837 extern int brw_page(int, struct page *, kdev_t, int [], int, int);
1838
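The BH_LowPrio helpers added to fs.h above encode three rules: a dirty buffer is never low-priority, mark_buffer_dirty() strips the flag, and mark_buffer_dirty_lowprio() only tags buffers that were clean beforehand. A userspace model of just that state logic (not part of the patch; BH_Dirty is bit 1 as in the 2.2 buffer_head flags):

#include <stdio.h>

#define BH_Dirty	1
#define BH_LowPrio	8

struct buffer_head { unsigned long b_state; };

static int  test_bit(int nr, unsigned long *s)  { return (*s >> nr) & 1; }
static void set_bit(int nr, unsigned long *s)   { *s |= 1UL << nr; }
static void clear_bit(int nr, unsigned long *s) { *s &= ~(1UL << nr); }
static int  test_and_set_bit(int nr, unsigned long *s)
{
	int old = test_bit(nr, s);
	set_bit(nr, s);
	return old;
}

static int buffer_dirty(struct buffer_head *bh)   { return test_bit(BH_Dirty, &bh->b_state); }
static int buffer_lowprio(struct buffer_head *bh) { return test_bit(BH_LowPrio, &bh->b_state); }

static void mark_buffer_lowprio(struct buffer_head *bh)
{
	if (!buffer_dirty(bh))				/* dirty buffers cannot become lowprio */
		set_bit(BH_LowPrio, &bh->b_state);
}

static void mark_buffer_dirty(struct buffer_head *bh)
{
	test_and_set_bit(BH_Dirty, &bh->b_state);
	clear_bit(BH_LowPrio, &bh->b_state);		/* dirtying drops the lowprio state */
}

static void mark_buffer_dirty_lowprio(struct buffer_head *bh)
{
	if (!test_and_set_bit(BH_Dirty, &bh->b_state))
		set_bit(BH_LowPrio, &bh->b_state);	/* only if it was not dirty before */
}

int main(void)
{
	struct buffer_head bh = { 0 };

	mark_buffer_dirty(&bh);
	mark_buffer_lowprio(&bh);	/* refused: already dirty */
	printf("dirty=%d lowprio=%d\n", buffer_dirty(&bh), buffer_lowprio(&bh));	/* 1 0 */

	bh.b_state = 0;
	mark_buffer_dirty_lowprio(&bh);	/* clean before, so it becomes dirty + lowprio */
	printf("dirty=%d lowprio=%d\n", buffer_dirty(&bh), buffer_lowprio(&bh));	/* 1 1 */
	return 0;
}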
1839--- linux/include/linux/blkdev.h.orig Mon Dec 11 01:49:44 2000
1840+++ linux/include/linux/blkdev.h Fri Nov 23 11:18:16 2001
1841@@ -91,8 +91,9 @@
1842 extern void make_request(int major,int rw, struct buffer_head * bh);
1843
1844 /* md needs this function to remap requests */
1845-extern int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size);
1846-extern int md_make_request (int minor, int rw, struct buffer_head * bh);
1847+extern int md_map (kdev_t dev, kdev_t *rdev,
1848+ unsigned long *rsector, unsigned long size);
1849+extern int md_make_request (struct buffer_head * bh, int rw);
1850 extern int md_error (kdev_t mddev, kdev_t rdev);
1851
1852 extern int * blk_size[MAX_BLKDEV];
1853--- linux/include/linux/md.h.orig Fri May 8 09:17:13 1998
1854+++ linux/include/linux/md.h Fri Nov 23 11:18:18 2001
1855@@ -1,300 +0,0 @@
1856-/*
1857- md.h : Multiple Devices driver for Linux
1858- Copyright (C) 1994-96 Marc ZYNGIER
1859- <zyngier@ufr-info-p7.ibp.fr> or
1860- <maz@gloups.fdn.fr>
1861-
1862- This program is free software; you can redistribute it and/or modify
1863- it under the terms of the GNU General Public License as published by
1864- the Free Software Foundation; either version 2, or (at your option)
1865- any later version.
1866-
1867- You should have received a copy of the GNU General Public License
1868- (for example /usr/src/linux/COPYING); if not, write to the Free
1869- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1870-*/
1871-
1872-#ifndef _MD_H
1873-#define _MD_H
1874-
1875-#include <linux/major.h>
1876-#include <linux/ioctl.h>
1877-#include <linux/types.h>
1878-
1879-/*
1880- * Different major versions are not compatible.
1881- * Different minor versions are only downward compatible.
1882- * Different patchlevel versions are downward and upward compatible.
1883- */
1884-#define MD_MAJOR_VERSION 0
1885-#define MD_MINOR_VERSION 36
1886-#define MD_PATCHLEVEL_VERSION 6
1887-
1888-#define MD_DEFAULT_DISK_READAHEAD (256 * 1024)
1889-
1890-/* ioctls */
1891-#define REGISTER_DEV _IO (MD_MAJOR, 1)
1892-#define START_MD _IO (MD_MAJOR, 2)
1893-#define STOP_MD _IO (MD_MAJOR, 3)
1894-#define REGISTER_DEV_NEW _IO (MD_MAJOR, 4)
1895-
1896-/*
1897- personalities :
1898- Byte 0 : Chunk size factor
1899- Byte 1 : Fault tolerance count for each physical device
1900- ( 0 means no fault tolerance,
1901- 0xFF means always tolerate faults), not used by now.
1902- Byte 2 : Personality
1903- Byte 3 : Reserved.
1904- */
1905-
1906-#define FAULT_SHIFT 8
1907-#define PERSONALITY_SHIFT 16
1908-
1909-#define FACTOR_MASK 0x000000FFUL
1910-#define FAULT_MASK 0x0000FF00UL
1911-#define PERSONALITY_MASK 0x00FF0000UL
1912-
1913-#define MD_RESERVED 0 /* Not used by now */
1914-#define LINEAR (1UL << PERSONALITY_SHIFT)
1915-#define STRIPED (2UL << PERSONALITY_SHIFT)
1916-#define RAID0 STRIPED
1917-#define RAID1 (3UL << PERSONALITY_SHIFT)
1918-#define RAID5 (4UL << PERSONALITY_SHIFT)
1919-#define MAX_PERSONALITY 5
1920-
1921-/*
1922- * MD superblock.
1923- *
1924- * The MD superblock maintains some statistics on each MD configuration.
1925- * Each real device in the MD set contains it near the end of the device.
1926- * Some of the ideas are copied from the ext2fs implementation.
1927- *
1928- * We currently use 4096 bytes as follows:
1929- *
1930- * word offset function
1931- *
1932- * 0 - 31 Constant generic MD device information.
1933- * 32 - 63 Generic state information.
1934- * 64 - 127 Personality specific information.
1935- * 128 - 511 12 32-words descriptors of the disks in the raid set.
1936- * 512 - 911 Reserved.
1937- * 912 - 1023 Disk specific descriptor.
1938- */
1939-
1940-/*
1941- * If x is the real device size in bytes, we return an apparent size of:
1942- *
1943- * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
1944- *
1945- * and place the 4kB superblock at offset y.
1946- */
1947-#define MD_RESERVED_BYTES (64 * 1024)
1948-#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
1949-#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
1950-
1951-#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
1952-#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
1953-
1954-#define MD_SB_BYTES 4096
1955-#define MD_SB_WORDS (MD_SB_BYTES / 4)
1956-#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
1957-#define MD_SB_SECTORS (MD_SB_BYTES / 512)
1958-
1959-/*
1960- * The following are counted in 32-bit words
1961- */
1962-#define MD_SB_GENERIC_OFFSET 0
1963-#define MD_SB_PERSONALITY_OFFSET 64
1964-#define MD_SB_DISKS_OFFSET 128
1965-#define MD_SB_DESCRIPTOR_OFFSET 992
1966-
1967-#define MD_SB_GENERIC_CONSTANT_WORDS 32
1968-#define MD_SB_GENERIC_STATE_WORDS 32
1969-#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
1970-#define MD_SB_PERSONALITY_WORDS 64
1971-#define MD_SB_DISKS_WORDS 384
1972-#define MD_SB_DESCRIPTOR_WORDS 32
1973-#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
1974-#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
1975-#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
1976-
1977-/*
1978- * Device "operational" state bits
1979- */
1980-#define MD_FAULTY_DEVICE 0 /* Device is faulty / operational */
1981-#define MD_ACTIVE_DEVICE 1 /* Device is a part or the raid set / spare disk */
1982-#define MD_SYNC_DEVICE 2 /* Device is in sync with the raid set */
1983-
1984-typedef struct md_device_descriptor_s {
1985- __u32 number; /* 0 Device number in the entire set */
1986- __u32 major; /* 1 Device major number */
1987- __u32 minor; /* 2 Device minor number */
1988- __u32 raid_disk; /* 3 The role of the device in the raid set */
1989- __u32 state; /* 4 Operational state */
1990- __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
1991-} md_descriptor_t;
1992-
1993-#define MD_SB_MAGIC 0xa92b4efc
1994-
1995-/*
1996- * Superblock state bits
1997- */
1998-#define MD_SB_CLEAN 0
1999-#define MD_SB_ERRORS 1
2000-
2001-typedef struct md_superblock_s {
2002-
2003- /*
2004- * Constant generic information
2005- */
2006- __u32 md_magic; /* 0 MD identifier */
2007- __u32 major_version; /* 1 major version to which the set conforms */
2008- __u32 minor_version; /* 2 minor version to which the set conforms */
2009- __u32 patch_version; /* 3 patchlevel version to which the set conforms */
2010- __u32 gvalid_words; /* 4 Number of non-reserved words in this section */
2011- __u32 set_magic; /* 5 Raid set identifier */
2012- __u32 ctime; /* 6 Creation time */
2013- __u32 level; /* 7 Raid personality (mirroring, raid5, ...) */
2014- __u32 size; /* 8 Apparent size of each individual disk, in kB */
2015- __u32 nr_disks; /* 9 Number of total disks in the raid set */
2016- __u32 raid_disks; /* 10 Number of disks in a fully functional raid set */
2017- __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 11];
2018-
2019- /*
2020- * Generic state information
2021- */
2022- __u32 utime; /* 0 Superblock update time */
2023- __u32 state; /* 1 State bits (clean, ...) */
2024- __u32 active_disks; /* 2 Number of currently active disks (some non-faulty disks might not be in sync) */
2025- __u32 working_disks; /* 3 Number of working disks */
2026- __u32 failed_disks; /* 4 Number of failed disks */
2027- __u32 spare_disks; /* 5 Number of spare disks */
2028- __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 6];
2029-
2030- /*
2031- * Personality information
2032- */
2033- __u32 parity_algorithm;
2034- __u32 chunk_size;
2035- __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 2];
2036-
2037- /*
2038- * Disks information
2039- */
2040- md_descriptor_t disks[MD_SB_DISKS];
2041-
2042- /*
2043- * Reserved
2044- */
2045- __u32 reserved[MD_SB_RESERVED_WORDS];
2046-
2047- /*
2048- * Active descriptor
2049- */
2050- md_descriptor_t descriptor;
2051-} md_superblock_t;
2052-
2053-#ifdef __KERNEL__
2054-
2055-#include <linux/mm.h>
2056-#include <linux/fs.h>
2057-#include <linux/blkdev.h>
2058-#include <asm/semaphore.h>
2059-
2060-/*
2061- * Kernel-based reconstruction is mostly working, but still requires
2062- * some additional work.
2063- */
2064-#define SUPPORT_RECONSTRUCTION 0
2065-
2066-#define MAX_REAL 8 /* Max number of physical dev per md dev */
2067-#define MAX_MD_DEV 4 /* Max number of md dev */
2068-
2069-#define FACTOR(a) ((a)->repartition & FACTOR_MASK)
2070-#define MAX_FAULT(a) (((a)->repartition & FAULT_MASK)>>8)
2071-#define PERSONALITY(a) ((a)->repartition & PERSONALITY_MASK)
2072-
2073-#define FACTOR_SHIFT(a) (PAGE_SHIFT + (a) - 10)
2074-
2075-struct real_dev
2076-{
2077- kdev_t dev; /* Device number */
2078- int size; /* Device size (in blocks) */
2079- int offset; /* Real device offset (in blocks) in md dev
2080- (only used in linear mode) */
2081- struct inode *inode; /* Lock inode */
2082- md_superblock_t *sb;
2083- u32 sb_offset;
2084-};
2085-
2086-struct md_dev;
2087-
2088-#define SPARE_INACTIVE 0
2089-#define SPARE_WRITE 1
2090-#define SPARE_ACTIVE 2
2091-
2092-struct md_personality
2093-{
2094- char *name;
2095- int (*map)(struct md_dev *mddev, kdev_t *rdev,
2096- unsigned long *rsector, unsigned long size);
2097- int (*make_request)(struct md_dev *mddev, int rw, struct buffer_head * bh);
2098- void (*end_request)(struct buffer_head * bh, int uptodate);
2099- int (*run)(int minor, struct md_dev *mddev);
2100- int (*stop)(int minor, struct md_dev *mddev);
2101- int (*status)(char *page, int minor, struct md_dev *mddev);
2102- int (*ioctl)(struct inode *inode, struct file *file,
2103- unsigned int cmd, unsigned long arg);
2104- int max_invalid_dev;
2105- int (*error_handler)(struct md_dev *mddev, kdev_t dev);
2106-
2107-/*
2108- * Some personalities (RAID-1, RAID-5) can get disks hot-added and
2109- * hot-removed. Hot removal is different from failure. (failure marks
2110- * a disk inactive, but the disk is still part of the array)
2111- */
2112- int (*hot_add_disk) (struct md_dev *mddev, kdev_t dev);
2113- int (*hot_remove_disk) (struct md_dev *mddev, kdev_t dev);
2114- int (*mark_spare) (struct md_dev *mddev, md_descriptor_t *descriptor, int state);
2115-};
2116-
2117-struct md_dev
2118-{
2119- struct real_dev devices[MAX_REAL];
2120- struct md_personality *pers;
2121- md_superblock_t *sb;
2122- int sb_dirty;
2123- int repartition;
2124- int busy;
2125- int nb_dev;
2126- void *private;
2127-};
2128-
2129-struct md_thread {
2130- void (*run) (void *data);
2131- void *data;
2132- struct wait_queue *wqueue;
2133- unsigned long flags;
2134- struct semaphore *sem;
2135- struct task_struct *tsk;
2136-};
2137-
2138-#define THREAD_WAKEUP 0
2139-
2140-extern struct md_dev md_dev[MAX_MD_DEV];
2141-extern int md_size[MAX_MD_DEV];
2142-extern int md_maxreadahead[MAX_MD_DEV];
2143-
2144-extern char *partition_name (kdev_t dev);
2145-
2146-extern int register_md_personality (int p_num, struct md_personality *p);
2147-extern int unregister_md_personality (int p_num);
2148-extern struct md_thread *md_register_thread (void (*run) (void *data), void *data);
2149-extern void md_unregister_thread (struct md_thread *thread);
2150-extern void md_wakeup_thread(struct md_thread *thread);
2151-extern int md_update_sb (int minor);
2152-extern int md_do_sync(struct md_dev *mddev);
2153-
2154-#endif __KERNEL__
2155-#endif _MD_H
2156--- linux/include/linux/raid0.h.orig Tue Oct 29 14:20:24 1996
2157+++ linux/include/linux/raid0.h Fri Nov 23 11:18:18 2001
2158@@ -1,27 +0,0 @@
2159-#ifndef _RAID0_H
2160-#define _RAID0_H
2161-
2162-struct strip_zone
2163-{
2164- int zone_offset; /* Zone offset in md_dev */
2165- int dev_offset; /* Zone offset in real dev */
2166- int size; /* Zone size */
2167- int nb_dev; /* Number of devices attached to the zone */
2168- struct real_dev *dev[MAX_REAL]; /* Devices attached to the zone */
2169-};
2170-
2171-struct raid0_hash
2172-{
2173- struct strip_zone *zone0, *zone1;
2174-};
2175-
2176-struct raid0_data
2177-{
2178- struct raid0_hash *hash_table; /* Dynamically allocated */
2179- struct strip_zone *strip_zone; /* This one too */
2180- int nr_strip_zones;
2181- struct strip_zone *smallest;
2182- int nr_zones;
2183-};
2184-
2185-#endif
2186--- linux/include/linux/raid1.h.orig Fri May 8 09:17:13 1998
2187+++ linux/include/linux/raid1.h Fri Nov 23 11:18:18 2001
2188@@ -1,49 +0,0 @@
2189-#ifndef _RAID1_H
2190-#define _RAID1_H
2191-
2192-#include <linux/md.h>
2193-
2194-struct mirror_info {
2195- int number;
2196- int raid_disk;
2197- kdev_t dev;
2198- int next;
2199- int sect_limit;
2200-
2201- /*
2202- * State bits:
2203- */
2204- int operational;
2205- int write_only;
2206- int spare;
2207-};
2208-
2209-struct raid1_data {
2210- struct md_dev *mddev;
2211- struct mirror_info mirrors[MD_SB_DISKS]; /* RAID1 devices, 2 to MD_SB_DISKS */
2212- int raid_disks;
2213- int working_disks; /* Number of working disks */
2214- int last_used;
2215- unsigned long next_sect;
2216- int sect_count;
2217- int resync_running;
2218-};
2219-
2220-/*
2221- * this is our 'private' 'collective' RAID1 buffer head.
2222- * it contains information about what kind of IO operations were started
2223- * for this RAID5 operation, and about their status:
2224- */
2225-
2226-struct raid1_bh {
2227- unsigned int remaining;
2228- int cmd;
2229- unsigned long state;
2230- struct md_dev *mddev;
2231- struct buffer_head *master_bh;
2232- struct buffer_head *mirror_bh [MD_SB_DISKS];
2233- struct buffer_head bh_req;
2234- struct buffer_head *next_retry;
2235-};
2236-
2237-#endif
2238--- linux/include/linux/raid5.h.orig Fri May 8 09:17:13 1998
2239+++ linux/include/linux/raid5.h Fri Nov 23 11:18:18 2001
2240@@ -1,110 +0,0 @@
2241-#ifndef _RAID5_H
2242-#define _RAID5_H
2243-
2244-#ifdef __KERNEL__
2245-#include <linux/md.h>
2246-#include <asm/atomic.h>
2247-
2248-struct disk_info {
2249- kdev_t dev;
2250- int operational;
2251- int number;
2252- int raid_disk;
2253- int write_only;
2254- int spare;
2255-};
2256-
2257-struct stripe_head {
2258- struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
2259- struct stripe_head *free_next; /* pool of free sh's */
2260- struct buffer_head *buffer_pool; /* pool of free buffers */
2261- struct buffer_head *bh_pool; /* pool of free bh's */
2262- struct raid5_data *raid_conf;
2263- struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
2264- struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
2265- struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
2266- struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
2267- int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
2268- int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
2269- unsigned long sector; /* sector of this row */
2270- int size; /* buffers size */
2271- int pd_idx; /* parity disk index */
2272- int nr_pending; /* nr of pending cmds */
2273- unsigned long state; /* state flags */
2274- int cmd; /* stripe cmd */
2275- int count; /* nr of waiters */
2276- int write_method; /* reconstruct-write / read-modify-write */
2277- int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
2278- struct wait_queue *wait; /* processes waiting for this stripe */
2279-};
2280-
2281-/*
2282- * Phase
2283- */
2284-#define PHASE_BEGIN 0
2285-#define PHASE_READ_OLD 1
2286-#define PHASE_WRITE 2
2287-#define PHASE_READ 3
2288-#define PHASE_COMPLETE 4
2289-
2290-/*
2291- * Write method
2292- */
2293-#define METHOD_NONE 0
2294-#define RECONSTRUCT_WRITE 1
2295-#define READ_MODIFY_WRITE 2
2296-
2297-/*
2298- * Stripe state
2299- */
2300-#define STRIPE_LOCKED 0
2301-#define STRIPE_ERROR 1
2302-
2303-/*
2304- * Stripe commands
2305- */
2306-#define STRIPE_NONE 0
2307-#define STRIPE_WRITE 1
2308-#define STRIPE_READ 2
2309-
2310-struct raid5_data {
2311- struct stripe_head **stripe_hashtbl;
2312- struct md_dev *mddev;
2313- struct md_thread *thread, *resync_thread;
2314- struct disk_info disks[MD_SB_DISKS];
2315- struct disk_info *spare;
2316- int buffer_size;
2317- int chunk_size, level, algorithm;
2318- int raid_disks, working_disks, failed_disks;
2319- int sector_count;
2320- unsigned long next_sector;
2321- atomic_t nr_handle;
2322- struct stripe_head *next_free_stripe;
2323- int nr_stripes;
2324- int resync_parity;
2325- int max_nr_stripes;
2326- int clock;
2327- int nr_hashed_stripes;
2328- int nr_locked_stripes;
2329- int nr_pending_stripes;
2330- int nr_cached_stripes;
2331-
2332- /*
2333- * Free stripes pool
2334- */
2335- int nr_free_sh;
2336- struct stripe_head *free_sh_list;
2337- struct wait_queue *wait_for_stripe;
2338-};
2339-
2340-#endif
2341-
2342-/*
2343- * Our supported algorithms
2344- */
2345-#define ALGORITHM_LEFT_ASYMMETRIC 0
2346-#define ALGORITHM_RIGHT_ASYMMETRIC 1
2347-#define ALGORITHM_LEFT_SYMMETRIC 2
2348-#define ALGORITHM_RIGHT_SYMMETRIC 3
2349-
2350-#endif
2351--- linux/include/linux/sysctl.h.orig Fri Nov 23 11:17:44 2001
2352+++ linux/include/linux/sysctl.h Fri Nov 23 11:18:18 2001
2353@@ -435,6 +435,7 @@
2354 enum {
2355 DEV_CDROM=1,
2356 DEV_HWMON=2,
2357+ DEV_MD=3,
2358 DEV_MAC_HID=5
2359 };
2360
2361@@ -446,6 +447,11 @@
2362 DEV_CDROM_DEBUG=4,
2363 DEV_CDROM_LOCK=5,
2364 DEV_CDROM_CHECK_MEDIA=6
2365+};
2366+
2367+/* /proc/sys/dev/md */
2368+enum {
2369+ DEV_MD_SPEED_LIMIT=1
2370 };
2371
2372 /* /proc/sys/dev/mac_hid */
2373--- linux/include/asm-i386/md.h.orig Fri May 8 09:17:13 1998
2374+++ linux/include/asm-i386/md.h Fri Nov 23 11:18:18 2001
2375@@ -1,13 +0,0 @@
2376-/* $Id$
2377- * md.h: High speed xor_block operation for RAID4/5
2378- *
2379- */
2380-
2381-#ifndef __ASM_MD_H
2382-#define __ASM_MD_H
2383-
2384-/* #define HAVE_ARCH_XORBLOCK */
2385-
2386-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2387-
2388-#endif /* __ASM_MD_H */
2389--- linux/include/asm-alpha/md.h.orig Fri May 8 09:17:13 1998
2390+++ linux/include/asm-alpha/md.h Fri Nov 23 11:18:18 2001
2391@@ -1,13 +0,0 @@
2392-/* $Id$
2393- * md.h: High speed xor_block operation for RAID4/5
2394- *
2395- */
2396-
2397-#ifndef __ASM_MD_H
2398-#define __ASM_MD_H
2399-
2400-/* #define HAVE_ARCH_XORBLOCK */
2401-
2402-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2403-
2404-#endif /* __ASM_MD_H */
2405--- linux/include/asm-m68k/md.h.orig Fri May 8 09:15:22 1998
2406+++ linux/include/asm-m68k/md.h Fri Nov 23 11:18:18 2001
2407@@ -1,13 +0,0 @@
2408-/* $Id$
2409- * md.h: High speed xor_block operation for RAID4/5
2410- *
2411- */
2412-
2413-#ifndef __ASM_MD_H
2414-#define __ASM_MD_H
2415-
2416-/* #define HAVE_ARCH_XORBLOCK */
2417-
2418-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2419-
2420-#endif /* __ASM_MD_H */
2421--- linux/include/asm-sparc/md.h.orig Tue Jan 13 00:15:54 1998
2422+++ linux/include/asm-sparc/md.h Fri Nov 23 11:18:18 2001
2423@@ -1,13 +0,0 @@
2424-/* $Id$
2425- * md.h: High speed xor_block operation for RAID4/5
2426- *
2427- */
2428-
2429-#ifndef __ASM_MD_H
2430-#define __ASM_MD_H
2431-
2432-/* #define HAVE_ARCH_XORBLOCK */
2433-
2434-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2435-
2436-#endif /* __ASM_MD_H */
2437--- linux/include/asm-ppc/md.h.orig Wed Oct 27 02:53:42 1999
2438+++ linux/include/asm-ppc/md.h Fri Nov 23 11:18:18 2001
2439@@ -1,13 +0,0 @@
2440-/* $Id$
2441- * md.h: High speed xor_block operation for RAID4/5
2442- *
2443- */
2444-
2445-#ifndef __ASM_MD_H
2446-#define __ASM_MD_H
2447-
2448-/* #define HAVE_ARCH_XORBLOCK */
2449-
2450-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2451-
2452-#endif /* __ASM_MD_H */
2453--- linux/include/asm-sparc64/md.h.orig Tue Jan 13 00:15:58 1998
2454+++ linux/include/asm-sparc64/md.h Fri Nov 23 11:18:18 2001
2455@@ -1,91 +0,0 @@
2456-/* $Id$
2457- * md.h: High speed xor_block operation for RAID4/5
2458- * utilizing the UltraSparc Visual Instruction Set.
2459- *
2460- * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
2461- */
2462-
2463-#ifndef __ASM_MD_H
2464-#define __ASM_MD_H
2465-
2466-#include <asm/head.h>
2467-#include <asm/asi.h>
2468-
2469-#define HAVE_ARCH_XORBLOCK
2470-
2471-#define MD_XORBLOCK_ALIGNMENT 64
2472-
2473-/* void __xor_block (char *dest, char *src, long len)
2474- * {
2475- * while (len--) *dest++ ^= *src++;
2476- * }
2477- *
2478- * Requirements:
2479- * !(((long)dest | (long)src) & (MD_XORBLOCK_ALIGNMENT - 1)) &&
2480- * !(len & 127) && len >= 256
2481- */
2482-
2483-static inline void __xor_block (char *dest, char *src, long len)
2484-{
2485- __asm__ __volatile__ ("
2486- wr %%g0, %3, %%fprs
2487- wr %%g0, %4, %%asi
2488- membar #LoadStore|#StoreLoad|#StoreStore
2489- sub %2, 128, %2
2490- ldda [%0] %4, %%f0
2491- ldda [%1] %4, %%f16
2492-1: ldda [%0 + 64] %%asi, %%f32
2493- fxor %%f0, %%f16, %%f16
2494- fxor %%f2, %%f18, %%f18
2495- fxor %%f4, %%f20, %%f20
2496- fxor %%f6, %%f22, %%f22
2497- fxor %%f8, %%f24, %%f24
2498- fxor %%f10, %%f26, %%f26
2499- fxor %%f12, %%f28, %%f28
2500- fxor %%f14, %%f30, %%f30
2501- stda %%f16, [%0] %4
2502- ldda [%1 + 64] %%asi, %%f48
2503- ldda [%0 + 128] %%asi, %%f0
2504- fxor %%f32, %%f48, %%f48
2505- fxor %%f34, %%f50, %%f50
2506- add %0, 128, %0
2507- fxor %%f36, %%f52, %%f52
2508- add %1, 128, %1
2509- fxor %%f38, %%f54, %%f54
2510- subcc %2, 128, %2
2511- fxor %%f40, %%f56, %%f56
2512- fxor %%f42, %%f58, %%f58
2513- fxor %%f44, %%f60, %%f60
2514- fxor %%f46, %%f62, %%f62
2515- stda %%f48, [%0 - 64] %%asi
2516- bne,pt %%xcc, 1b
2517- ldda [%1] %4, %%f16
2518- ldda [%0 + 64] %%asi, %%f32
2519- fxor %%f0, %%f16, %%f16
2520- fxor %%f2, %%f18, %%f18
2521- fxor %%f4, %%f20, %%f20
2522- fxor %%f6, %%f22, %%f22
2523- fxor %%f8, %%f24, %%f24
2524- fxor %%f10, %%f26, %%f26
2525- fxor %%f12, %%f28, %%f28
2526- fxor %%f14, %%f30, %%f30
2527- stda %%f16, [%0] %4
2528- ldda [%1 + 64] %%asi, %%f48
2529- membar #Sync
2530- fxor %%f32, %%f48, %%f48
2531- fxor %%f34, %%f50, %%f50
2532- fxor %%f36, %%f52, %%f52
2533- fxor %%f38, %%f54, %%f54
2534- fxor %%f40, %%f56, %%f56
2535- fxor %%f42, %%f58, %%f58
2536- fxor %%f44, %%f60, %%f60
2537- fxor %%f46, %%f62, %%f62
2538- stda %%f48, [%0 + 64] %%asi
2539- membar #Sync|#StoreStore|#StoreLoad
2540- wr %%g0, 0, %%fprs
2541- " : :
2542- "r" (dest), "r" (src), "r" (len), "i" (FPRS_FEF), "i" (ASI_BLK_P) :
2543- "cc", "memory");
2544-}
2545-
2546-#endif /* __ASM_MD_H */
2547--- linux/drivers/block/Config.in.orig Fri Nov 23 11:17:44 2001
2548+++ linux/drivers/block/Config.in Fri Nov 23 11:18:18 2001
2549@@ -103,10 +103,13 @@
2550 fi
2551 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
2552 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
2553+ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
2554 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
2555 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
2556 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
2557 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
2558+ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
2559+ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
2560 fi
2561 if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
2562 bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
2563--- linux/drivers/block/Makefile.orig Mon Dec 11 01:49:41 2000
2564+++ linux/drivers/block/Makefile Fri Nov 23 11:18:18 2001
2565@@ -294,10 +294,28 @@
2566 endif
2567
2568 ifeq ($(CONFIG_MD_RAID5),y)
2569+LX_OBJS += xor.o
2570 L_OBJS += raid5.o
2571 else
2572 ifeq ($(CONFIG_MD_RAID5),m)
2573+ LX_OBJS += xor.o
2574 M_OBJS += raid5.o
2575+ endif
2576+endif
2577+
2578+ifeq ($(CONFIG_MD_TRANSLUCENT),y)
2579+L_OBJS += translucent.o
2580+else
2581+ ifeq ($(CONFIG_MD_TRANSLUCENT),m)
2582+ M_OBJS += translucent.o
2583+ endif
2584+endif
2585+
2586+ifeq ($(CONFIG_MD_HSM),y)
2587+L_OBJS += hsm.o
2588+else
2589+ ifeq ($(CONFIG_MD_HSM),m)
2590+ M_OBJS += hsm.o
2591 endif
2592 endif
2593
2594--- linux/drivers/block/genhd.c.orig Fri Nov 23 11:17:44 2001
2595+++ linux/drivers/block/genhd.c Fri Nov 23 11:18:18 2001
2596@@ -28,6 +28,7 @@
2597 #include <linux/string.h>
2598 #include <linux/blk.h>
2599 #include <linux/init.h>
2600+#include <linux/raid/md.h>
2601
2602 #ifdef CONFIG_ARCH_S390
2603 #include <asm/dasd.h>
2604@@ -1786,6 +1787,9 @@
2605 else
2606 #endif
2607 rd_load();
2608+#endif
2609+#ifdef CONFIG_BLK_DEV_MD
2610+ autodetect_raid();
2611 #endif
2612 #ifdef CONFIG_MD_BOOT
2613 md_setup_drive();
2614--- linux/drivers/block/hsm.c.orig Fri Nov 23 11:18:18 2001
2615+++ linux/drivers/block/hsm.c Fri Nov 23 11:18:18 2001
2616@@ -0,0 +1,840 @@
2617+/*
2618+ hsm.c : HSM RAID driver for Linux
2619+ Copyright (C) 1998 Ingo Molnar
2620+
2621+ HSM mode management functions.
2622+
2623+ This program is free software; you can redistribute it and/or modify
2624+ it under the terms of the GNU General Public License as published by
2625+ the Free Software Foundation; either version 2, or (at your option)
2626+ any later version.
2627+
2628+ You should have received a copy of the GNU General Public License
2629+ (for example /usr/src/linux/COPYING); if not, write to the Free
2630+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
2631+*/
2632+
2633+#include <linux/module.h>
2634+
2635+#include <linux/raid/md.h>
2636+#include <linux/malloc.h>
2637+
2638+#include <linux/raid/hsm.h>
2639+#include <linux/blk.h>
2640+
2641+#define MAJOR_NR MD_MAJOR
2642+#define MD_DRIVER
2643+#define MD_PERSONALITY
2644+
2645+
2646+#define DEBUG_HSM 1
2647+
2648+#if DEBUG_HSM
2649+#define dprintk(x,y...) printk(x,##y)
2650+#else
2651+#define dprintk(x,y...) do { } while (0)
2652+#endif
2653+
2654+void print_bh(struct buffer_head *bh)
2655+{
2656+ dprintk("bh %p: %lx %lx %x %x %lx %p %lx %p %x %p %x %lx\n", bh,
2657+ bh->b_blocknr, bh->b_size, bh->b_dev, bh->b_rdev,
2658+ bh->b_rsector, bh->b_this_page, bh->b_state,
2659+ bh->b_next_free, bh->b_count, bh->b_data,
2660+ bh->b_list, bh->b_flushtime
2661+ );
2662+}
2663+
2664+static int check_bg (pv_t *pv, pv_block_group_t * bg)
2665+{
2666+ int i, free = 0;
2667+
2668+ dprintk("checking bg ...\n");
2669+
2670+ for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
2671+ if (pv_pptr_free(bg->blocks + i)) {
2672+ free++;
2673+ if (test_bit(i, bg->used_bitmap)) {
2674+ printk("hm, bit %d set?\n", i);
2675+ }
2676+ } else {
2677+ if (!test_bit(i, bg->used_bitmap)) {
2678+ printk("hm, bit %d not set?\n", i);
2679+ }
2680+ }
2681+ }
2682+ dprintk("%d free blocks in bg ...\n", free);
2683+ return free;
2684+}
2685+
2686+static void get_bg (pv_t *pv, pv_bg_desc_t *desc, int nr)
2687+{
2688+ unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
2689+ struct buffer_head *bh;
2690+
2691+ dprintk("... getting BG at %u ...\n", bg_pos);
2692+
2693+ bh = bread (pv->dev, bg_pos, HSM_BLOCKSIZE);
2694+ if (!bh) {
2695+ MD_BUG();
2696+ return;
2697+ }
2698+ desc->bg = (pv_block_group_t *) bh->b_data;
2699+ desc->free_blocks = check_bg(pv, desc->bg);
2700+}
2701+
2702+static int find_free_block (lv_t *lv, pv_t *pv, pv_bg_desc_t *desc, int nr,
2703+ unsigned int lblock, lv_lptr_t * index)
2704+{
2705+ int i;
2706+
2707+ for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
2708+ pv_pptr_t * bptr = desc->bg->blocks + i;
2709+ if (pv_pptr_free(bptr)) {
2710+ unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
2711+
2712+ if (test_bit(i, desc->bg->used_bitmap)) {
2713+ MD_BUG();
2714+ continue;
2715+ }
2716+ bptr->u.used.owner.log_id = lv->log_id;
2717+ bptr->u.used.owner.log_index = lblock;
2718+ index->data.phys_nr = pv->phys_nr;
2719+ index->data.phys_block = bg_pos + i + 1;
2720+ set_bit(i, desc->bg->used_bitmap);
2721+ desc->free_blocks--;
2722+ dprintk(".....free blocks left in bg %p: %d\n",
2723+ desc->bg, desc->free_blocks);
2724+ return 0;
2725+ }
2726+ }
2727+ return -ENOSPC;
2728+}
2729+
2730+static int __get_free_block (lv_t *lv, pv_t *pv,
2731+ unsigned int lblock, lv_lptr_t * index)
2732+{
2733+ int i;
2734+
2735+ dprintk("trying to get free block for lblock %d ...\n", lblock);
2736+
2737+ for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
2738+ pv_bg_desc_t *desc = pv->bg_array + i;
2739+
2740+ dprintk("looking at desc #%d (%p)...\n", i, desc->bg);
2741+ if (!desc->bg)
2742+ get_bg(pv, desc, i);
2743+
2744+ if (desc->bg && desc->free_blocks)
2745+ return find_free_block(lv, pv, desc, i,
2746+ lblock, index);
2747+ }
2748+ dprintk("hsm: pv %s full!\n", partition_name(pv->dev));
2749+ return -ENOSPC;
2750+}
2751+
2752+static int get_free_block (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
2753+{
2754+ int err;
2755+
2756+ if (!lv->free_indices)
2757+ return -ENOSPC;
2758+
2759+ /* fix me */
2760+ err = __get_free_block(lv, lv->vg->pv_array + 0, lblock, index);
2761+
2762+ if (err || !index->data.phys_block) {
2763+ MD_BUG();
2764+ return -ENOSPC;
2765+ }
2766+
2767+ lv->free_indices--;
2768+
2769+ return 0;
2770+}
2771+
2772+/*
2773+ * fix me: wordsize assumptions ...
2774+ */
2775+#define INDEX_BITS 8
2776+#define INDEX_DEPTH (32/INDEX_BITS)
2777+#define INDEX_MASK ((1<<INDEX_BITS) - 1)
2778+
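/*
 * Editor's note, not part of the patch: with INDEX_BITS = 8 the 32-bit
 * logical block number is split into INDEX_DEPTH = 4 index levels of 8 bits
 * each, which is exactly what the loops in alloc_fixed_index()/find_index()
 * below do with idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK. For example,
 * lblock 0x0a0b0c0d walks the tree with indices 0x0a (level 3), 0x0b
 * (level 2), 0x0c (level 1) and 0x0d (level 0, the data block slot).
 */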
2779+static void print_index_list (lv_t *lv, lv_lptr_t *index)
2780+{
2781+ lv_lptr_t *tmp;
2782+ int i;
2783+
2784+ dprintk("... block <%u,%u,%x> [.", index->data.phys_nr,
2785+ index->data.phys_block, index->cpu_addr);
2786+
2787+ tmp = index_child(index);
2788+ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
2789+ if (index_block(lv, tmp))
2790+ dprintk("(%d->%d)", i, index_block(lv, tmp));
2791+ tmp++;
2792+ }
2793+ dprintk(".]\n");
2794+}
2795+
2796+static int read_index_group (lv_t *lv, lv_lptr_t *index)
2797+{
2798+ lv_lptr_t *index_group, *tmp;
2799+ struct buffer_head *bh;
2800+ int i;
2801+
2802+ dprintk("reading index group <%s:%d>\n",
2803+ partition_name(index_dev(lv, index)), index_block(lv, index));
2804+
2805+ bh = bread(index_dev(lv, index), index_block(lv, index), HSM_BLOCKSIZE);
2806+ if (!bh) {
2807+ MD_BUG();
2808+ return -EIO;
2809+ }
2810+ if (!buffer_uptodate(bh))
2811+ MD_BUG();
2812+
2813+ index_group = (lv_lptr_t *) bh->b_data;
2814+ tmp = index_group;
2815+ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
2816+ if (index_block(lv, tmp)) {
2817+ dprintk("index group has BLOCK %d, non-present.\n", i);
2818+ tmp->cpu_addr = 0;
2819+ }
2820+ tmp++;
2821+ }
2822+ index->cpu_addr = ptr_to_cpuaddr(index_group);
2823+
2824+ dprintk("have read index group %p at block %d.\n",
2825+ index_group, index_block(lv, index));
2826+ print_index_list(lv, index);
2827+
2828+ return 0;
2829+}
2830+
2831+static int alloc_index_group (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
2832+{
2833+ struct buffer_head *bh;
2834+ lv_lptr_t * index_group;
2835+
2836+ if (get_free_block(lv, lblock, index))
2837+ return -ENOSPC;
2838+
2839+ dprintk("creating block for index group <%s:%d>\n",
2840+ partition_name(index_dev(lv, index)), index_block(lv, index));
2841+
2842+ bh = getblk(index_dev(lv, index),
2843+ index_block(lv, index), HSM_BLOCKSIZE);
2844+
2845+ index_group = (lv_lptr_t *) bh->b_data;
2846+ md_clear_page(index_group);
2847+ mark_buffer_uptodate(bh, 1);
2848+
2849+ index->cpu_addr = ptr_to_cpuaddr(index_group);
2850+
2851+ dprintk("allocated index group %p at block %d.\n",
2852+ index_group, index_block(lv, index));
2853+ return 0;
2854+}
2855+
2856+static lv_lptr_t * alloc_fixed_index (lv_t *lv, unsigned int lblock)
2857+{
2858+ lv_lptr_t * index = index_child(&lv->root_index);
2859+ int idx, l;
2860+
2861+ for (l = INDEX_DEPTH-1; l >= 0; l--) {
2862+ idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
2863+ index += idx;
2864+ if (!l)
2865+ break;
2866+ if (!index_present(index)) {
2867+ dprintk("no group, level %u, pos %u\n", l, idx);
2868+ if (alloc_index_group(lv, lblock, index))
2869+ return NULL;
2870+ }
2871+ index = index_child(index);
2872+ }
2873+ if (!index_block(lv,index)) {
2874+ dprintk("no data, pos %u\n", idx);
2875+ if (get_free_block(lv, lblock, index))
2876+ return NULL;
2877+ return index;
2878+ }
2879+ MD_BUG();
2880+ return index;
2881+}
2882+
2883+static lv_lptr_t * find_index (lv_t *lv, unsigned int lblock)
2884+{
2885+ lv_lptr_t * index = index_child(&lv->root_index);
2886+ int idx, l;
2887+
2888+ for (l = INDEX_DEPTH-1; l >= 0; l--) {
2889+ idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
2890+ index += idx;
2891+ if (!l)
2892+ break;
2893+ if (index_free(index))
2894+ return NULL;
2895+ if (!index_present(index))
2896+ read_index_group(lv, index);
2897+ if (!index_present(index)) {
2898+ MD_BUG();
2899+ return NULL;
2900+ }
2901+ index = index_child(index);
2902+ }
2903+ if (!index_block(lv,index))
2904+ return NULL;
2905+ return index;
2906+}
2907+
2908+static int read_root_index(lv_t *lv)
2909+{
2910+ int err;
2911+ lv_lptr_t *index = &lv->root_index;
2912+
2913+ if (!index_block(lv, index)) {
2914+ printk("LV has no root index yet, creating.\n");
2915+
2916+ err = alloc_index_group (lv, 0, index);
2917+ if (err) {
2918+ printk("could not create index group, err:%d\n", err);
2919+ return err;
2920+ }
2921+ lv->vg->vg_sb->lv_array[lv->log_id].lv_root_idx =
2922+ lv->root_index.data;
2923+ } else {
2924+ printk("LV already has a root index.\n");
2925+ printk("... at <%s:%d>.\n",
2926+ partition_name(index_dev(lv, index)),
2927+ index_block(lv, index));
2928+
2929+ read_index_group(lv, index);
2930+ }
2931+ return 0;
2932+}
2933+
2934+static int init_pv(pv_t *pv)
2935+{
2936+ struct buffer_head *bh;
2937+ pv_sb_t *pv_sb;
2938+
2939+ bh = bread (pv->dev, 0, HSM_BLOCKSIZE);
2940+ if (!bh) {
2941+ MD_BUG();
2942+ return -1;
2943+ }
2944+
2945+ pv_sb = (pv_sb_t *) bh->b_data;
2946+ pv->pv_sb = pv_sb;
2947+
2948+ if (pv_sb->pv_magic != HSM_PV_SB_MAGIC) {
2949+ printk("%s is not a PV, has magic %x instead of %x!\n",
2950+ partition_name(pv->dev), pv_sb->pv_magic,
2951+ HSM_PV_SB_MAGIC);
2952+ return -1;
2953+ }
2954+ printk("%s detected as a valid PV (#%d).\n", partition_name(pv->dev),
2955+ pv->phys_nr);
2956+ printk("... created under HSM version %d.%d.%d, at %x.\n",
2957+ pv_sb->pv_major, pv_sb->pv_minor, pv_sb->pv_patch, pv_sb->pv_ctime);
2958+ printk("... total # of blocks: %d (%d left unallocated).\n",
2959+ pv_sb->pv_total_size, pv_sb->pv_blocks_left);
2960+
2961+ printk("... block size: %d bytes.\n", pv_sb->pv_block_size);
2962+ printk("... block descriptor size: %d bytes.\n", pv_sb->pv_pptr_size);
2963+ printk("... block group size: %d blocks.\n", pv_sb->pv_bg_size);
2964+ printk("... # of block groups: %d.\n", pv_sb->pv_block_groups);
2965+
2966+ if (pv_sb->pv_block_groups*sizeof(pv_bg_desc_t) > PAGE_SIZE) {
2967+ MD_BUG();
2968+ return 1;
2969+ }
2970+ pv->bg_array = (pv_bg_desc_t *)__get_free_page(GFP_KERNEL);
2971+ if (!pv->bg_array) {
2972+ MD_BUG();
2973+ return 1;
2974+ }
2975+ memset(pv->bg_array, 0, PAGE_SIZE);
2976+
2977+ return 0;
2978+}
2979+
2980+static int free_pv(pv_t *pv)
2981+{
2982+ struct buffer_head *bh;
2983+
2984+ dprintk("freeing PV %d ...\n", pv->phys_nr);
2985+
2986+ if (pv->bg_array) {
2987+ int i;
2988+
2989+ dprintk(".... freeing BGs ...\n");
2990+ for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
2991+ unsigned int bg_pos = i * pv->pv_sb->pv_bg_size + 2;
2992+ pv_bg_desc_t *desc = pv->bg_array + i;
2993+
2994+ if (desc->bg) {
2995+ dprintk(".... freeing BG %d ...\n", i);
2996+ bh = getblk (pv->dev, bg_pos, HSM_BLOCKSIZE);
2997+ mark_buffer_dirty(bh, 1);
2998+ brelse(bh);
2999+ brelse(bh);
3000+ }
3001+ }
3002+ free_page((unsigned long)pv->bg_array);
3003+ } else
3004+ MD_BUG();
3005+
3006+ bh = getblk (pv->dev, 0, HSM_BLOCKSIZE);
3007+ if (!bh) {
3008+ MD_BUG();
3009+ return -1;
3010+ }
3011+ mark_buffer_dirty(bh, 1);
3012+ brelse(bh);
3013+ brelse(bh);
3014+
3015+ return 0;
3016+}
3017+
3018+struct semaphore hsm_sem = MUTEX;
3019+
3020+#define HSM_SECTORS (HSM_BLOCKSIZE/512)
3021+
3022+static int hsm_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
3023+ unsigned long *rsector, unsigned long bsectors)
3024+{
3025+ lv_t *lv = kdev_to_lv(dev);
3026+ lv_lptr_t *index;
3027+ unsigned int lblock = *rsector / HSM_SECTORS;
3028+ unsigned int offset = *rsector % HSM_SECTORS;
3029+ int err = -EIO;
3030+
3031+ if (!lv) {
3032+ printk("HSM: md%d not a Logical Volume!\n", mdidx(mddev));
3033+ goto out;
3034+ }
3035+ if (offset + bsectors > HSM_SECTORS) {
3036+ MD_BUG();
3037+ goto out;
3038+ }
3039+ down(&hsm_sem);
3040+ index = find_index(lv, lblock);
3041+ if (!index) {
3042+ printk("no block %u yet ... allocating\n", lblock);
3043+ index = alloc_fixed_index(lv, lblock);
3044+ }
3045+
3046+ err = 0;
3047+
3048+ printk(" %u <%s : %ld(%ld)> -> ", lblock,
3049+ partition_name(*rdev), *rsector, bsectors);
3050+
3051+ *rdev = index_dev(lv, index);
3052+ *rsector = index_block(lv, index) * HSM_SECTORS + offset;
3053+
3054+ printk(" <%s : %ld> %u\n",
3055+ partition_name(*rdev), *rsector, index_block(lv, index));
3056+
3057+ up(&hsm_sem);
3058+out:
3059+ return err;
3060+}
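/*
 * Editor's note, not part of the patch: hsm_map() above translates an LV
 * sector into a (PV device, sector) pair through the index tree. With
 * HSM_BLOCKSIZE = 4096, HSM_SECTORS is 8, so e.g. *rsector = 1037 gives
 * lblock = 1037 / 8 = 129 and offset = 1037 % 8 = 5; if logical block 129
 * maps to physical block 4711 on a PV, the request is rewritten to
 * *rsector = 4711 * 8 + 5 = 37693 on that PV. The block number 4711 is an
 * arbitrary illustration.
 */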
3061+
3062+static void free_index (lv_t *lv, lv_lptr_t * index)
3063+{
3064+ struct buffer_head *bh;
3065+
3066+ printk("trying to get cached block for index group <%s:%d>\n",
3067+ partition_name(index_dev(lv, index)), index_block(lv, index));
3068+
3069+ bh = getblk(index_dev(lv, index), index_block(lv, index),HSM_BLOCKSIZE);
3070+
3071+ printk("....FREEING ");
3072+ print_index_list(lv, index);
3073+
3074+ if (bh) {
3075+ if (!buffer_uptodate(bh))
3076+ MD_BUG();
3077+ if ((lv_lptr_t *)bh->b_data != index_child(index)) {
3078+ printk("huh? b_data is %p, index content is %p.\n",
3079+ bh->b_data, index_child(index));
3080+ } else
3081+ printk("good, b_data == index content == %p.\n",
3082+ index_child(index));
3083+ printk("b_count == %d, writing.\n", bh->b_count);
3084+ mark_buffer_dirty(bh, 1);
3085+ brelse(bh);
3086+ brelse(bh);
3087+ printk("done.\n");
3088+ } else {
3089+ printk("FAILED!\n");
3090+ }
3091+ print_index_list(lv, index);
3092+ index_child(index) = NULL;
3093+}
3094+
3095+static void free_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
3096+{
3097+ char dots [3*8];
3098+ lv_lptr_t * index;
3099+ int i, nr_dots;
3100+
3101+ nr_dots = (INDEX_DEPTH-level)*3;
3102+ memcpy(dots,"...............",nr_dots);
3103+ dots[nr_dots] = 0;
3104+
3105+ dprintk("%s level %d index group block:\n", dots, level);
3106+
3107+
3108+ index = index_0;
3109+ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
3110+ if (index->data.phys_block) {
3111+ dprintk("%s block <%u,%u,%x>\n", dots,
3112+ index->data.phys_nr,
3113+ index->data.phys_block,
3114+ index->cpu_addr);
3115+ if (level && index_present(index)) {
3116+ dprintk("%s==> deeper one level\n", dots);
3117+ free_index_group(lv, level-1,
3118+ index_child(index));
3119+ dprintk("%s freeing index group block %p ...",
3120+ dots, index_child(index));
3121+ free_index(lv, index);
3122+ }
3123+ }
3124+ index++;
3125+ }
3126+ dprintk("%s DONE: level %d index group block.\n", dots, level);
3127+}
3128+
3129+static void free_lv_indextree (lv_t *lv)
3130+{
3131+ dprintk("freeing LV %d ...\n", lv->log_id);
3132+ dprintk("..root index: %p\n", index_child(&lv->root_index));
3133+ dprintk("..INDEX TREE:\n");
3134+ free_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
3135+ dprintk("..freeing root index %p ...", index_child(&lv->root_index));
3136+ dprintk("root block <%u,%u,%x>\n", lv->root_index.data.phys_nr,
3137+ lv->root_index.data.phys_block, lv->root_index.cpu_addr);
3138+ free_index(lv, &lv->root_index);
3139+ dprintk("..INDEX TREE done.\n");
3140+ fsync_dev(lv->vg->pv_array[0].dev); /* fix me */
3141+ lv->vg->vg_sb->lv_array[lv->log_id].lv_free_indices = lv->free_indices;
3142+}
3143+
3144+static void print_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
3145+{
3146+ char dots [3*5];
3147+ lv_lptr_t * index;
3148+ int i, nr_dots;
3149+
3150+ nr_dots = (INDEX_DEPTH-level)*3;
3151+ memcpy(dots,"...............",nr_dots);
3152+ dots[nr_dots] = 0;
3153+
3154+ dprintk("%s level %d index group block:\n", dots, level);
3155+
3156+
3157+ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
3158+ index = index_0 + i;
3159+ if (index->data.phys_block) {
3160+ dprintk("%s block <%u,%u,%x>\n", dots,
3161+ index->data.phys_nr,
3162+ index->data.phys_block,
3163+ index->cpu_addr);
3164+ if (level && index_present(index)) {
3165+ dprintk("%s==> deeper one level\n", dots);
3166+ print_index_group(lv, level-1,
3167+ index_child(index));
3168+ }
3169+ }
3170+ }
3171+ dprintk("%s DONE: level %d index group block.\n", dots, level);
3172+}
3173+
3174+static void print_lv (lv_t *lv)
3175+{
3176+ dprintk("printing LV %d ...\n", lv->log_id);
3177+ dprintk("..root index: %p\n", index_child(&lv->root_index));
3178+ dprintk("..INDEX TREE:\n");
3179+ print_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
3180+ dprintk("..INDEX TREE done.\n");
3181+}
3182+
3183+static int map_lv (lv_t *lv)
3184+{
3185+ kdev_t dev = lv->dev;
3186+ unsigned int nr = MINOR(dev);
3187+ mddev_t *mddev = lv->vg->mddev;
3188+
3189+ if (MAJOR(dev) != MD_MAJOR) {
3190+ MD_BUG();
3191+ return -1;
3192+ }
3193+ if (kdev_to_mddev(dev)) {
3194+ MD_BUG();
3195+ return -1;
3196+ }
3197+ md_hd_struct[nr].start_sect = 0;
3198+ md_hd_struct[nr].nr_sects = md_size[mdidx(mddev)] << 1;
3199+ md_size[nr] = md_size[mdidx(mddev)];
3200+ add_mddev_mapping(mddev, dev, lv);
3201+
3202+ return 0;
3203+}
3204+
3205+static int unmap_lv (lv_t *lv)
3206+{
3207+ kdev_t dev = lv->dev;
3208+ unsigned int nr = MINOR(dev);
3209+
3210+ if (MAJOR(dev) != MD_MAJOR) {
3211+ MD_BUG();
3212+ return -1;
3213+ }
3214+ md_hd_struct[nr].start_sect = 0;
3215+ md_hd_struct[nr].nr_sects = 0;
3216+ md_size[nr] = 0;
3217+ del_mddev_mapping(lv->vg->mddev, dev);
3218+
3219+ return 0;
3220+}
3221+
3222+static int init_vg (vg_t *vg)
3223+{
3224+ int i;
3225+ lv_t *lv;
3226+ kdev_t dev;
3227+ vg_sb_t *vg_sb;
3228+ struct buffer_head *bh;
3229+ lv_descriptor_t *lv_desc;
3230+
3231+ /*
3232+ * fix me: read all PVs and compare the SB
3233+ */
3234+ dev = vg->pv_array[0].dev;
3235+ bh = bread (dev, 1, HSM_BLOCKSIZE);
3236+ if (!bh) {
3237+ MD_BUG();
3238+ return -1;
3239+ }
3240+
3241+ vg_sb = (vg_sb_t *) bh->b_data;
3242+ vg->vg_sb = vg_sb;
3243+
3244+ if (vg_sb->vg_magic != HSM_VG_SB_MAGIC) {
3245+ printk("%s is not a valid VG, has magic %x instead of %x!\n",
3246+ partition_name(dev), vg_sb->vg_magic,
3247+ HSM_VG_SB_MAGIC);
3248+ return -1;
3249+ }
3250+
3251+ vg->nr_lv = 0;
3252+ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
3253+ unsigned int id;
3254+ lv_desc = vg->vg_sb->lv_array + i;
3255+
3256+ id = lv_desc->lv_id;
3257+ if (!id) {
3258+ printk("... LV desc %d empty\n", i);
3259+ continue;
3260+ }
3261+ if (id >= HSM_MAX_LVS_PER_VG) {
3262+ MD_BUG();
3263+ continue;
3264+ }
3265+
3266+ lv = vg->lv_array + id;
3267+ if (lv->vg) {
3268+ MD_BUG();
3269+ continue;
3270+ }
3271+ lv->log_id = id;
3272+ lv->vg = vg;
3273+ lv->max_indices = lv_desc->lv_max_indices;
3274+ lv->free_indices = lv_desc->lv_free_indices;
3275+ lv->root_index.data = lv_desc->lv_root_idx;
3276+ lv->dev = MKDEV(MD_MAJOR, lv_desc->md_id);
3277+
3278+ vg->nr_lv++;
3279+
3280+ map_lv(lv);
3281+ if (read_root_index(lv)) {
3282+ vg->nr_lv--;
3283+ unmap_lv(lv);
3284+ memset(lv, 0, sizeof(*lv));
3285+ }
3286+ }
3287+ if (vg->nr_lv != vg_sb->nr_lvs)
3288+ MD_BUG();
3289+
3290+ return 0;
3291+}
3292+
3293+static int hsm_run (mddev_t *mddev)
3294+{
3295+ int i;
3296+ vg_t *vg;
3297+ mdk_rdev_t *rdev;
3298+
3299+ MOD_INC_USE_COUNT;
3300+
3301+ vg = kmalloc (sizeof (*vg), GFP_KERNEL);
3302+ if (!vg)
3303+ goto out;
3304+ memset(vg, 0, sizeof(*vg));
3305+ mddev->private = vg;
3306+ vg->mddev = mddev;
3307+
3308+ if (md_check_ordering(mddev)) {
3309+ printk("hsm: disks are not ordered, aborting!\n");
3310+ goto out;
3311+ }
3312+
3313+ set_blocksize (mddev_to_kdev(mddev), HSM_BLOCKSIZE);
3314+
3315+ vg->nr_pv = mddev->nb_dev;
3316+ ITERATE_RDEV_ORDERED(mddev,rdev,i) {
3317+ pv_t *pv = vg->pv_array + i;
3318+
3319+ pv->dev = rdev->dev;
3320+ fsync_dev (pv->dev);
3321+ set_blocksize (pv->dev, HSM_BLOCKSIZE);
3322+ pv->phys_nr = i;
3323+ if (init_pv(pv))
3324+ goto out;
3325+ }
3326+
3327+ init_vg(vg);
3328+
3329+ return 0;
3330+
3331+out:
3332+ if (vg) {
3333+ kfree(vg);
3334+ mddev->private = NULL;
3335+ }
3336+ MOD_DEC_USE_COUNT;
3337+
3338+ return 1;
3339+}
3340+
3341+static int hsm_stop (mddev_t *mddev)
3342+{
3343+ lv_t *lv;
3344+ vg_t *vg;
3345+ int i;
3346+
3347+ vg = mddev_to_vg(mddev);
3348+
3349+ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
3350+ lv = vg->lv_array + i;
3351+ if (!lv->log_id)
3352+ continue;
3353+ print_lv(lv);
3354+ free_lv_indextree(lv);
3355+ unmap_lv(lv);
3356+ }
3357+ for (i = 0; i < vg->nr_pv; i++)
3358+ free_pv(vg->pv_array + i);
3359+
3360+ kfree(vg);
3361+
3362+ MOD_DEC_USE_COUNT;
3363+
3364+ return 0;
3365+}
3366+
3367+
3368+static int hsm_status (char *page, mddev_t *mddev)
3369+{
3370+ int sz = 0, i;
3371+ lv_t *lv;
3372+ vg_t *vg;
3373+
3374+ vg = mddev_to_vg(mddev);
3375+
3376+ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
3377+ lv = vg->lv_array + i;
3378+ if (!lv->log_id)
3379+ continue;
3380+ sz += sprintf(page+sz, "<LV%d %d/%d blocks used> ", lv->log_id,
3381+ lv->max_indices - lv->free_indices, lv->max_indices);
3382+ }
3383+ return sz;
3384+}
3385+
3386+
3387+static mdk_personality_t hsm_personality=
3388+{
3389+ "hsm",
3390+ hsm_map,
3391+ NULL,
3392+ NULL,
3393+ hsm_run,
3394+ hsm_stop,
3395+ hsm_status,
3396+ NULL,
3397+ 0,
3398+ NULL,
3399+ NULL,
3400+ NULL,
3401+ NULL
3402+};
3403+
3404+#ifndef MODULE
3405+
3406+md__initfunc(void hsm_init (void))
3407+{
3408+ register_md_personality (HSM, &hsm_personality);
3409+}
3410+
3411+#else
3412+
3413+int init_module (void)
3414+{
3415+ return (register_md_personality (HSM, &hsm_personality));
3416+}
3417+
3418+void cleanup_module (void)
3419+{
3420+ unregister_md_personality (HSM);
3421+}
3422+
3423+#endif
3424+
3425+/*
3426+ * This Linus-trick catches bugs via the linker.
3427+ */
3428+
3429+extern void __BUG__in__hsm_dot_c_1(void);
3430+extern void __BUG__in__hsm_dot_c_2(void);
3431+extern void __BUG__in__hsm_dot_c_3(void);
3432+extern void __BUG__in__hsm_dot_c_4(void);
3433+extern void __BUG__in__hsm_dot_c_5(void);
3434+extern void __BUG__in__hsm_dot_c_6(void);
3435+extern void __BUG__in__hsm_dot_c_7(void);
3436+
3437+void bugcatcher (void)
3438+{
3439+ if (sizeof(pv_block_group_t) != HSM_BLOCKSIZE)
3440+ __BUG__in__hsm_dot_c_1();
3441+ if (sizeof(lv_index_block_t) != HSM_BLOCKSIZE)
3442+ __BUG__in__hsm_dot_c_2();
3443+
3444+ if (sizeof(pv_sb_t) != HSM_BLOCKSIZE)
3445+ __BUG__in__hsm_dot_c_4();
3446+ if (sizeof(lv_sb_t) != HSM_BLOCKSIZE)
3447+ __BUG__in__hsm_dot_c_3();
3448+ if (sizeof(vg_sb_t) != HSM_BLOCKSIZE)
3449+ __BUG__in__hsm_dot_c_6();
3450+
3451+ if (sizeof(lv_lptr_t) != 16)
3452+ __BUG__in__hsm_dot_c_5();
3453+ if (sizeof(pv_pptr_t) != 16)
3454+ __BUG__in__hsm_dot_c_6();
3455+}
3456+
3457--- linux/drivers/block/linear.c.orig Sat Nov 8 20:39:12 1997
3458+++ linux/drivers/block/linear.c Fri Nov 23 11:18:18 2001
3459@@ -1,4 +1,3 @@
3460-
3461 /*
3462 linear.c : Multiple Devices driver for Linux
3463 Copyright (C) 1994-96 Marc ZYNGIER
3464@@ -19,186 +18,207 @@
3465
3466 #include <linux/module.h>
3467
3468-#include <linux/md.h>
3469+#include <linux/raid/md.h>
3470 #include <linux/malloc.h>
3471-#include <linux/init.h>
3472
3473-#include "linear.h"
3474+#include <linux/raid/linear.h>
3475
3476 #define MAJOR_NR MD_MAJOR
3477 #define MD_DRIVER
3478 #define MD_PERSONALITY
3479
3480-static int linear_run (int minor, struct md_dev *mddev)
3481+static int linear_run (mddev_t *mddev)
3482 {
3483- int cur=0, i, size, dev0_size, nb_zone;
3484- struct linear_data *data;
3485-
3486- MOD_INC_USE_COUNT;
3487-
3488- mddev->private=kmalloc (sizeof (struct linear_data), GFP_KERNEL);
3489- data=(struct linear_data *) mddev->private;
3490-
3491- /*
3492- Find out the smallest device. This was previously done
3493- at registry time, but since it violates modularity,
3494- I moved it here... Any comment ? ;-)
3495- */
3496-
3497- data->smallest=mddev->devices;
3498- for (i=1; i<mddev->nb_dev; i++)
3499- if (data->smallest->size > mddev->devices[i].size)
3500- data->smallest=mddev->devices+i;
3501-
3502- nb_zone=data->nr_zones=
3503- md_size[minor]/data->smallest->size +
3504- (md_size[minor]%data->smallest->size ? 1 : 0);
3505-
3506- data->hash_table=kmalloc (sizeof (struct linear_hash)*nb_zone, GFP_KERNEL);
3507-
3508- size=mddev->devices[cur].size;
3509+ linear_conf_t *conf;
3510+ struct linear_hash *table;
3511+ mdk_rdev_t *rdev;
3512+ int size, i, j, nb_zone;
3513+ unsigned int curr_offset;
3514+
3515+ MOD_INC_USE_COUNT;
3516+
3517+ conf = kmalloc (sizeof (*conf), GFP_KERNEL);
3518+ if (!conf)
3519+ goto out;
3520+ mddev->private = conf;
3521+
3522+ if (md_check_ordering(mddev)) {
3523+ printk("linear: disks are not ordered, aborting!\n");
3524+ goto out;
3525+ }
3526+ /*
3527+ * Find the smallest device.
3528+ */
3529+
3530+ conf->smallest = NULL;
3531+ curr_offset = 0;
3532+ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
3533+ dev_info_t *disk = conf->disks + j;
3534+
3535+ disk->dev = rdev->dev;
3536+ disk->size = rdev->size;
3537+ disk->offset = curr_offset;
3538+
3539+ curr_offset += disk->size;
3540+
3541+ if (!conf->smallest || (disk->size < conf->smallest->size))
3542+ conf->smallest = disk;
3543+ }
3544+
3545+ nb_zone = conf->nr_zones =
3546+ md_size[mdidx(mddev)] / conf->smallest->size +
3547+ ((md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
3548+
3549+ conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
3550+ GFP_KERNEL);
3551+ if (!conf->hash_table)
3552+ goto out;
3553+
3554+ /*
3555+ * Here we generate the linear hash table
3556+ */
3557+ table = conf->hash_table;
3558+ i = 0;
3559+ size = 0;
3560+ for (j = 0; j < mddev->nb_dev; j++) {
3561+ dev_info_t *disk = conf->disks + j;
3562+
3563+ if (size < 0) {
3564+ table->dev1 = disk;
3565+ table++;
3566+ }
3567+ size += disk->size;
3568+
3569+ while (size) {
3570+ table->dev0 = disk;
3571+ size -= conf->smallest->size;
3572+ if (size < 0)
3573+ break;
3574+ table->dev1 = NULL;
3575+ table++;
3576+ }
3577+ }
3578+ table->dev1 = NULL;
3579+
3580+ return 0;
3581+
3582+out:
3583+ if (conf)
3584+ kfree(conf);
3585+ MOD_DEC_USE_COUNT;
3586+ return 1;
3587+}
3588+
3589+static int linear_stop (mddev_t *mddev)
3590+{
3591+ linear_conf_t *conf = mddev_to_conf(mddev);
3592+
3593+ kfree(conf->hash_table);
3594+ kfree(conf);
3595
3596- i=0;
3597- while (cur<mddev->nb_dev)
3598- {
3599- data->hash_table[i].dev0=mddev->devices+cur;
3600+ MOD_DEC_USE_COUNT;
3601
3602- if (size>=data->smallest->size) /* If we completely fill the slot */
3603- {
3604- data->hash_table[i++].dev1=NULL;
3605- size-=data->smallest->size;
3606-
3607- if (!size)
3608- {
3609- if (++cur==mddev->nb_dev) continue;
3610- size=mddev->devices[cur].size;
3611- }
3612-
3613- continue;
3614- }
3615-
3616- if (++cur==mddev->nb_dev) /* Last dev, set dev1 as NULL */
3617- {
3618- data->hash_table[i].dev1=NULL;
3619- continue;
3620- }
3621-
3622- dev0_size=size; /* Here, we use a 2nd dev to fill the slot */
3623- size=mddev->devices[cur].size;
3624- data->hash_table[i++].dev1=mddev->devices+cur;
3625- size-=(data->smallest->size - dev0_size);
3626- }
3627-
3628- return 0;
3629-}
3630-
3631-static int linear_stop (int minor, struct md_dev *mddev)
3632-{
3633- struct linear_data *data=(struct linear_data *) mddev->private;
3634-
3635- kfree (data->hash_table);
3636- kfree (data);
3637-
3638- MOD_DEC_USE_COUNT;
3639-
3640- return 0;
3641+ return 0;
3642 }
3643
3644
3645-static int linear_map (struct md_dev *mddev, kdev_t *rdev,
3646+static int linear_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
3647 unsigned long *rsector, unsigned long size)
3648 {
3649- struct linear_data *data=(struct linear_data *) mddev->private;
3650- struct linear_hash *hash;
3651- struct real_dev *tmp_dev;
3652- long block;
3653-
3654- block=*rsector >> 1;
3655- hash=data->hash_table+(block/data->smallest->size);
3656-
3657- if (block >= (hash->dev0->size + hash->dev0->offset))
3658- {
3659- if (!hash->dev1)
3660- {
3661- printk ("linear_map : hash->dev1==NULL for block %ld\n", block);
3662- return (-1);
3663- }
3664-
3665- tmp_dev=hash->dev1;
3666- }
3667- else
3668- tmp_dev=hash->dev0;
3669+ linear_conf_t *conf = mddev_to_conf(mddev);
3670+ struct linear_hash *hash;
3671+ dev_info_t *tmp_dev;
3672+ long block;
3673+
3674+ block = *rsector >> 1;
3675+ hash = conf->hash_table + (block / conf->smallest->size);
3676+
3677+ if (block >= (hash->dev0->size + hash->dev0->offset))
3678+ {
3679+ if (!hash->dev1)
3680+ {
3681+ printk ("linear_map : hash->dev1==NULL for block %ld\n",
3682+ block);
3683+ return -1;
3684+ }
3685+ tmp_dev = hash->dev1;
3686+ } else
3687+ tmp_dev = hash->dev0;
3688
3689- if (block >= (tmp_dev->size + tmp_dev->offset) || block < tmp_dev->offset)
3690- printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
3691- block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
3692+ if (block >= (tmp_dev->size + tmp_dev->offset)
3693+ || block < tmp_dev->offset)
3694+ printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
3695+ block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
3696
3697- *rdev=tmp_dev->dev;
3698- *rsector=(block-(tmp_dev->offset)) << 1;
3699+ *rdev = tmp_dev->dev;
3700+ *rsector = (block - tmp_dev->offset) << 1;
3701
3702- return (0);
3703+ return 0;
3704 }
3705
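/*
 * Editor's note, not part of the patch: a worked illustration of the hash
 * lookup in linear_map() above, under assumed component sizes of 300, 500
 * and 400 blocks (so conf->smallest->size is 300 and each slot covers 300
 * array blocks). Array block 850 selects slot 850 / 300 = 2, whose dev0 is
 * the 500-block disk (offset 300, ending at block 800). Since 850 >=
 * dev0->size + dev0->offset, the slot's dev1 -- the 400-block disk at
 * offset 800 -- is used instead, and the request becomes sector
 * (850 - 800) << 1 = 100 on that disk.
 */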
3706-static int linear_status (char *page, int minor, struct md_dev *mddev)
3707+static int linear_status (char *page, mddev_t *mddev)
3708 {
3709- int sz=0;
3710+ int sz=0;
3711
3712 #undef MD_DEBUG
3713 #ifdef MD_DEBUG
3714- int j;
3715- struct linear_data *data=(struct linear_data *) mddev->private;
3716+ int j;
3717+ linear_conf_t *conf = mddev_to_conf(mddev);
3718
3719- sz+=sprintf (page+sz, " ");
3720- for (j=0; j<data->nr_zones; j++)
3721- {
3722- sz+=sprintf (page+sz, "[%s",
3723- partition_name (data->hash_table[j].dev0->dev));
3724-
3725- if (data->hash_table[j].dev1)
3726- sz+=sprintf (page+sz, "/%s] ",
3727- partition_name(data->hash_table[j].dev1->dev));
3728- else
3729- sz+=sprintf (page+sz, "] ");
3730- }
3731-
3732- sz+=sprintf (page+sz, "\n");
3733+ sz += sprintf(page+sz, " ");
3734+ for (j = 0; j < conf->nr_zones; j++)
3735+ {
3736+ sz += sprintf(page+sz, "[%s",
3737+ partition_name(conf->hash_table[j].dev0->dev));
3738+
3739+ if (conf->hash_table[j].dev1)
3740+ sz += sprintf(page+sz, "/%s] ",
3741+ partition_name(conf->hash_table[j].dev1->dev));
3742+ else
3743+ sz += sprintf(page+sz, "] ");
3744+ }
3745+ sz += sprintf(page+sz, "\n");
3746 #endif
3747- sz+=sprintf (page+sz, " %dk rounding", 1<<FACTOR_SHIFT(FACTOR(mddev)));
3748- return sz;
3749+ sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
3750+ return sz;
3751 }
3752
3753
3754-static struct md_personality linear_personality=
3755+static mdk_personality_t linear_personality=
3756 {
3757- "linear",
3758- linear_map,
3759- NULL,
3760- NULL,
3761- linear_run,
3762- linear_stop,
3763- linear_status,
3764- NULL, /* no ioctls */
3765- 0
3766+ "linear",
3767+ linear_map,
3768+ NULL,
3769+ NULL,
3770+ linear_run,
3771+ linear_stop,
3772+ linear_status,
3773+ NULL,
3774+ 0,
3775+ NULL,
3776+ NULL,
3777+ NULL,
3778+ NULL
3779 };
3780
3781-
3782 #ifndef MODULE
3783
3784-__initfunc(void linear_init (void))
3785+md__initfunc(void linear_init (void))
3786 {
3787- register_md_personality (LINEAR, &linear_personality);
3788+ register_md_personality (LINEAR, &linear_personality);
3789 }
3790
3791 #else
3792
3793 int init_module (void)
3794 {
3795- return (register_md_personality (LINEAR, &linear_personality));
3796+ return (register_md_personality (LINEAR, &linear_personality));
3797 }
3798
3799 void cleanup_module (void)
3800 {
3801- unregister_md_personality (LINEAR);
3802+ unregister_md_personality (LINEAR);
3803 }
3804
3805 #endif
3806+
3807--- linux/drivers/block/linear.h.orig Fri Nov 22 15:07:23 1996
3808+++ linux/drivers/block/linear.h Fri Nov 23 11:18:18 2001
3809@@ -1,16 +0,0 @@
3810-#ifndef _LINEAR_H
3811-#define _LINEAR_H
3812-
3813-struct linear_hash
3814-{
3815- struct real_dev *dev0, *dev1;
3816-};
3817-
3818-struct linear_data
3819-{
3820- struct linear_hash *hash_table; /* Dynamically allocated */
3821- struct real_dev *smallest;
3822- int nr_zones;
3823-};
3824-
3825-#endif
3826--- linux/drivers/block/ll_rw_blk.c.orig Fri Nov 23 11:17:44 2001
3827+++ linux/drivers/block/ll_rw_blk.c Fri Nov 23 11:18:18 2001
3828@@ -23,6 +23,7 @@
3829 #include <asm/io.h>
3830 #include <asm/uaccess.h>
3831 #include <linux/blk.h>
3832+#include <linux/raid/md.h>
3833
3834 #ifdef CONFIG_POWERMAC
3835 #include <asm/ide.h>
3836@@ -57,6 +58,11 @@
3837 spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
3838
3839 /*
3840+ * per-major idle-IO detection
3841+ */
3842+unsigned long io_events[MAX_BLKDEV] = {0, };
3843+
3844+/*
3845 * used to wait on when there are no free requests
3846 */
3847 struct wait_queue * wait_for_request;
3848@@ -587,6 +593,8 @@
3849 return;
3850 /* Maybe the above fixes it, and maybe it doesn't boot. Life is interesting */
3851 lock_buffer(bh);
3852+ if (!buffer_lowprio(bh))
3853+ io_events[major]++;
3854
3855 if (blk_size[major]) {
3856 unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
3857@@ -836,7 +844,7 @@
3858 bh[i]->b_rsector=bh[i]->b_blocknr*(bh[i]->b_size >> 9);
3859 #ifdef CONFIG_BLK_DEV_MD
3860 if (major==MD_MAJOR &&
3861- md_map (MINOR(bh[i]->b_dev), &bh[i]->b_rdev,
3862+ md_map (bh[i]->b_dev, &bh[i]->b_rdev,
3863 &bh[i]->b_rsector, bh[i]->b_size >> 9)) {
3864 printk (KERN_ERR
3865 "Bad md_map in ll_rw_block\n");
3866@@ -856,7 +864,7 @@
3867 set_bit(BH_Req, &bh[i]->b_state);
3868 #ifdef CONFIG_BLK_DEV_MD
3869 if (MAJOR(bh[i]->b_dev) == MD_MAJOR) {
3870- md_make_request(MINOR (bh[i]->b_dev), rw, bh[i]);
3871+ md_make_request(bh[i], rw);
3872 continue;
3873 }
3874 #endif
3875--- linux/drivers/block/md.c.orig Mon Sep 4 19:39:16 2000
3876+++ linux/drivers/block/md.c Fri Nov 23 11:18:18 2001
3877@@ -1,21 +1,17 @@
3878-
3879 /*
3880 md.c : Multiple Devices driver for Linux
3881- Copyright (C) 1994-96 Marc ZYNGIER
3882- <zyngier@ufr-info-p7.ibp.fr> or
3883- <maz@gloups.fdn.fr>
3884+ Copyright (C) 1998, 1999 Ingo Molnar
3885
3886- A lot of inspiration came from hd.c ...
3887+ completely rewritten, based on the MD driver code from Marc Zyngier
3888
3889- kerneld support by Boris Tobotras <boris@xtalk.msk.su>
3890- boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
3891+ Changes:
3892
3893- RAID-1/RAID-5 extensions by:
3894- Ingo Molnar, Miguel de Icaza, Gadi Oxman
3895+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
3896+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
3897+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
3898+ - kmod support by: Cyrus Durgin
3899+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
3900
3901- Changes for kmod by:
3902- Cyrus Durgin
3903-
3904 This program is free software; you can redistribute it and/or modify
3905 it under the terms of the GNU General Public License as published by
3906 the Free Software Foundation; either version 2, or (at your option)
3907@@ -26,807 +22,3007 @@
3908 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3909 */
3910
3911-/*
3912- * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
3913- * the extra system load does not show up that much. Increase it if your
3914- * system can take more.
3915- */
3916-#define SPEED_LIMIT 1024
3917+#include <linux/raid/md.h>
3918+#include <linux/raid/xor.h>
3919
3920-#include <linux/config.h>
3921-#include <linux/module.h>
3922-#include <linux/version.h>
3923-#include <linux/malloc.h>
3924-#include <linux/mm.h>
3925-#include <linux/md.h>
3926-#include <linux/hdreg.h>
3927-#include <linux/stat.h>
3928-#include <linux/fs.h>
3929-#include <linux/proc_fs.h>
3930-#include <linux/blkdev.h>
3931-#include <linux/genhd.h>
3932-#include <linux/smp_lock.h>
3933 #ifdef CONFIG_KMOD
3934 #include <linux/kmod.h>
3935 #endif
3936-#include <linux/errno.h>
3937-#include <linux/init.h>
3938
3939 #define __KERNEL_SYSCALLS__
3940 #include <linux/unistd.h>
3941
3942+#include <asm/unaligned.h>
3943+
3944+extern asmlinkage int sys_sched_yield(void);
3945+extern asmlinkage int sys_setsid(void);
3946+
3947+extern unsigned long io_events[MAX_BLKDEV];
3948+
3949 #define MAJOR_NR MD_MAJOR
3950 #define MD_DRIVER
3951
3952 #include <linux/blk.h>
3953-#include <asm/uaccess.h>
3954-#include <asm/bitops.h>
3955-#include <asm/atomic.h>
3956
3957 #ifdef CONFIG_MD_BOOT
3958-extern kdev_t name_to_kdev_t(char *line) __init;
3959+extern kdev_t name_to_kdev_t(char *line) md__init;
3960 #endif
3961
3962-static struct hd_struct md_hd_struct[MAX_MD_DEV];
3963-static int md_blocksizes[MAX_MD_DEV];
3964-int md_maxreadahead[MAX_MD_DEV];
3965-#if SUPPORT_RECONSTRUCTION
3966-static struct md_thread *md_sync_thread = NULL;
3967-#endif /* SUPPORT_RECONSTRUCTION */
3968+static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, };
3969+
3970+/*
3971+ * these have to be allocated separately because external
3972+ * subsystems want to have a pre-defined structure
3973+ */
3974+struct hd_struct md_hd_struct[MAX_MD_DEVS];
3975+static int md_blocksizes[MAX_MD_DEVS];
3976+static int md_maxreadahead[MAX_MD_DEVS];
3977+static mdk_thread_t *md_recovery_thread = NULL;
3978
3979-int md_size[MAX_MD_DEV]={0, };
3980+int md_size[MAX_MD_DEVS] = {0, };
3981
3982 static void md_geninit (struct gendisk *);
3983
3984 static struct gendisk md_gendisk=
3985 {
3986- MD_MAJOR,
3987- "md",
3988- 0,
3989- 1,
3990- MAX_MD_DEV,
3991- md_geninit,
3992- md_hd_struct,
3993- md_size,
3994- MAX_MD_DEV,
3995- NULL,
3996- NULL
3997+ MD_MAJOR,
3998+ "md",
3999+ 0,
4000+ 1,
4001+ MAX_MD_DEVS,
4002+ md_geninit,
4003+ md_hd_struct,
4004+ md_size,
4005+ MAX_MD_DEVS,
4006+ NULL,
4007+ NULL
4008 };
4009
4010-static struct md_personality *pers[MAX_PERSONALITY]={NULL, };
4011-struct md_dev md_dev[MAX_MD_DEV];
4012-
4013-int md_thread(void * arg);
4014+/*
4015+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
4016+ * is 100 KB/sec, so the extra system load does not show up that much.
4017+ * Increase it if you want to have more _guaranteed_ speed. Note that
4018+ * the RAID driver will use the maximum available bandwidth if the IO
4019+ * subsystem is idle.
4020+ *
4021+ * you can change it via /proc/sys/dev/md/speed-limit
4022+ */
4023
4024-static struct gendisk *find_gendisk (kdev_t dev)
4025-{
4026- struct gendisk *tmp=gendisk_head;
4027+static int sysctl_speed_limit = 100;
4028
4029- while (tmp != NULL)
4030- {
4031- if (tmp->major==MAJOR(dev))
4032- return (tmp);
4033-
4034- tmp=tmp->next;
4035- }
4036+static struct ctl_table_header *md_table_header;
4037
4038- return (NULL);
4039-}
4040+static ctl_table md_table[] = {
4041+ {DEV_MD_SPEED_LIMIT, "speed-limit",
4042+ &sysctl_speed_limit, sizeof(int), 0644, NULL, &proc_dointvec},
4043+ {0}
4044+};
4045
4046-char *partition_name (kdev_t dev)
4047-{
4048- static char name[40]; /* This should be long
4049- enough for a device name ! */
4050- struct gendisk *hd = find_gendisk (dev);
4051+static ctl_table md_dir_table[] = {
4052+ {DEV_MD, "md", NULL, 0, 0555, md_table},
4053+ {0}
4054+};
4055
4056- if (!hd)
4057- {
4058- sprintf (name, "[dev %s]", kdevname(dev));
4059- return (name);
4060- }
4061+static ctl_table md_root_table[] = {
4062+ {CTL_DEV, "dev", NULL, 0, 0555, md_dir_table},
4063+ {0}
4064+};
4065
4066- return disk_name (hd, MINOR(dev), name); /* routine in genhd.c */
4067+static void md_register_sysctl(void)
4068+{
4069+ md_table_header = register_sysctl_table(md_root_table, 1);
4070 }
4071
4072-static int legacy_raid_sb (int minor, int pnum)
4073+void md_unregister_sysctl(void)
4074 {
4075- int i, factor;
4076+ unregister_sysctl_table(md_table_header);
4077+}
4078+
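The three nested ctl_table entries above register the limit as /proc/sys/dev/md/speed-limit. A minimal userspace sketch (not part of the patch) for tuning it; the path is taken from the tables above and the value is in KB/sec, matching sysctl_speed_limit:

/* set_speed_limit.c -- write a new md reconstruction speed limit (KB/sec) */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	const char *path = "/proc/sys/dev/md/speed-limit";	/* as registered by md_root_table */
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "%d\n", argc > 1 ? atoi(argv[1]) : 100);	/* default matches sysctl_speed_limit */
	fclose(f);
	return 0;
}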
4079+/*
4080+ * The mapping between kdev and mddev is not necessarily a simple
4081+ * one! Eg. HSM uses several sub-devices to implement Logical
4082+ * Volumes. All these sub-devices map to the same mddev.
4083+ */
4084+dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, };
4085
4086- factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
4087+void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
4088+{
4089+ unsigned int minor = MINOR(dev);
4090
4091- /*****
4092- * do size and offset calculations.
4093- */
4094- for (i=0; i<md_dev[minor].nb_dev; i++) {
4095- md_dev[minor].devices[i].size &= ~(factor - 1);
4096- md_size[minor] += md_dev[minor].devices[i].size;
4097- md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset +
4098- md_dev[minor].devices[i-1].size) : 0;
4099+ if (MAJOR(dev) != MD_MAJOR) {
4100+ MD_BUG();
4101+ return;
4102 }
4103- if (pnum == RAID0 >> PERSONALITY_SHIFT)
4104- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev;
4105- return 0;
4106+ if (mddev_map[minor].mddev != NULL) {
4107+ MD_BUG();
4108+ return;
4109+ }
4110+ mddev_map[minor].mddev = mddev;
4111+ mddev_map[minor].data = data;
4112 }
4113
4114-static void free_sb (struct md_dev *mddev)
4115+void del_mddev_mapping (mddev_t * mddev, kdev_t dev)
4116 {
4117- int i;
4118- struct real_dev *realdev;
4119+ unsigned int minor = MINOR(dev);
4120
4121- if (mddev->sb) {
4122- free_page((unsigned long) mddev->sb);
4123- mddev->sb = NULL;
4124+ if (MAJOR(dev) != MD_MAJOR) {
4125+ MD_BUG();
4126+ return;
4127 }
4128- for (i = 0; i <mddev->nb_dev; i++) {
4129- realdev = mddev->devices + i;
4130- if (realdev->sb) {
4131- free_page((unsigned long) realdev->sb);
4132- realdev->sb = NULL;
4133- }
4134+ if (mddev_map[minor].mddev != mddev) {
4135+ MD_BUG();
4136+ return;
4137 }
4138+ mddev_map[minor].mddev = NULL;
4139+ mddev_map[minor].data = NULL;
4140 }
4141
4142 /*
4143- * Check one RAID superblock for generic plausibility
4144+ * Enables to iterate over all existing md arrays
4145 */
4146+static MD_LIST_HEAD(all_mddevs);
4147
4148-#define BAD_MAGIC KERN_ERR \
4149-"md: %s: invalid raid superblock magic (%x) on block %u\n"
4150+static mddev_t * alloc_mddev (kdev_t dev)
4151+{
4152+ mddev_t * mddev;
4153
4154-#define OUT_OF_MEM KERN_ALERT \
4155-"md: out of memory.\n"
4156+ if (MAJOR(dev) != MD_MAJOR) {
4157+ MD_BUG();
4158+ return 0;
4159+ }
4160+ mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
4161+ if (!mddev)
4162+ return NULL;
4163+
4164+ memset(mddev, 0, sizeof(*mddev));
4165
4166-#define NO_DEVICE KERN_ERR \
4167-"md: disabled device %s\n"
4168+ mddev->__minor = MINOR(dev);
4169+ mddev->reconfig_sem = MUTEX;
4170+ mddev->recovery_sem = MUTEX;
4171+ mddev->resync_sem = MUTEX;
4172+ MD_INIT_LIST_HEAD(&mddev->disks);
4173+ /*
4174+ * The 'base' mddev is the one with data NULL.
4175+ * personalities can create additional mddevs
4176+ * if necessary.
4177+ */
4178+ add_mddev_mapping(mddev, dev, 0);
4179+ md_list_add(&mddev->all_mddevs, &all_mddevs);
4180
4181-#define SUCCESS 0
4182-#define FAILURE -1
4183+ return mddev;
4184+}
4185
4186-static int analyze_one_sb (struct real_dev * rdev)
4187+static void free_mddev (mddev_t *mddev)
4188 {
4189- int ret = FAILURE;
4190- struct buffer_head *bh;
4191- kdev_t dev = rdev->dev;
4192- md_superblock_t *sb;
4193+ if (!mddev) {
4194+ MD_BUG();
4195+ return;
4196+ }
4197
4198 /*
4199- * Read the superblock, it's at the end of the disk
4200+ * Make sure nobody else is using this mddev
4201+ * (careful, we rely on the global kernel lock here)
4202 */
4203- rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]);
4204- set_blocksize (dev, MD_SB_BYTES);
4205- bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
4206-
4207- if (bh) {
4208- sb = (md_superblock_t *) bh->b_data;
4209- if (sb->md_magic != MD_SB_MAGIC) {
4210- printk (BAD_MAGIC, kdevname(dev),
4211- sb->md_magic, rdev->sb_offset);
4212- goto abort;
4213- }
4214- rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
4215- if (!rdev->sb) {
4216- printk (OUT_OF_MEM);
4217- goto abort;
4218- }
4219- memcpy (rdev->sb, bh->b_data, MD_SB_BYTES);
4220+ while (md_atomic_read(&mddev->resync_sem.count) != 1)
4221+ schedule();
4222+ while (md_atomic_read(&mddev->recovery_sem.count) != 1)
4223+ schedule();
4224
4225- rdev->size = sb->size;
4226- } else
4227- printk (NO_DEVICE,kdevname(rdev->dev));
4228- ret = SUCCESS;
4229-abort:
4230- if (bh)
4231- brelse (bh);
4232- return ret;
4233+ del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
4234+ md_list_del(&mddev->all_mddevs);
4235+ MD_INIT_LIST_HEAD(&mddev->all_mddevs);
4236+ kfree(mddev);
4237 }
4238
4239-#undef SUCCESS
4240-#undef FAILURE
4241-
4242-#undef BAD_MAGIC
4243-#undef OUT_OF_MEM
4244-#undef NO_DEVICE
4245
4246-/*
4247- * Check a full RAID array for plausibility
4248- */
4249+struct gendisk * find_gendisk (kdev_t dev)
4250+{
4251+ struct gendisk *tmp = gendisk_head;
4252
4253-#define INCONSISTENT KERN_ERR \
4254-"md: superblock inconsistency -- run ckraid\n"
4255+ while (tmp != NULL) {
4256+ if (tmp->major == MAJOR(dev))
4257+ return (tmp);
4258+ tmp = tmp->next;
4259+ }
4260+ return (NULL);
4261+}
4262
4263-#define OUT_OF_DATE KERN_ERR \
4264-"md: superblock update time inconsistenty -- using the most recent one\n"
4265+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
4266+{
4267+ mdk_rdev_t * rdev;
4268+ struct md_list_head *tmp;
4269
4270-#define OLD_VERSION KERN_ALERT \
4271-"md: %s: unsupported raid array version %d.%d.%d\n"
4272+ ITERATE_RDEV(mddev,rdev,tmp) {
4273+ if (rdev->desc_nr == nr)
4274+ return rdev;
4275+ }
4276+ return NULL;
4277+}
4278
4279-#define NOT_CLEAN KERN_ERR \
4280-"md: %s: raid array is not clean -- run ckraid\n"
4281+mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
4282+{
4283+ struct md_list_head *tmp;
4284+ mdk_rdev_t *rdev;
4285
4286-#define NOT_CLEAN_IGNORE KERN_ERR \
4287-"md: %s: raid array is not clean -- reconstructing parity\n"
4288+ ITERATE_RDEV(mddev,rdev,tmp) {
4289+ if (rdev->dev == dev)
4290+ return rdev;
4291+ }
4292+ return NULL;
4293+}
4294
4295-#define UNKNOWN_LEVEL KERN_ERR \
4296-"md: %s: unsupported raid level %d\n"
4297+static MD_LIST_HEAD(device_names);
4298
4299-static int analyze_sbs (int minor, int pnum)
4300+char * partition_name (kdev_t dev)
4301 {
4302- struct md_dev *mddev = md_dev + minor;
4303- int i, N = mddev->nb_dev, out_of_date = 0;
4304- struct real_dev * disks = mddev->devices;
4305- md_superblock_t *sb, *freshest = NULL;
4306+ struct gendisk *hd;
4307+ static char nomem [] = "<nomem>";
4308+ dev_name_t *dname;
4309+ struct md_list_head *tmp = device_names.next;
4310
4311- /*
4312- * RAID-0 and linear don't use a RAID superblock
4313- */
4314- if (pnum == RAID0 >> PERSONALITY_SHIFT ||
4315- pnum == LINEAR >> PERSONALITY_SHIFT)
4316- return legacy_raid_sb (minor, pnum);
4317+ while (tmp != &device_names) {
4318+ dname = md_list_entry(tmp, dev_name_t, list);
4319+ if (dname->dev == dev)
4320+ return dname->name;
4321+ tmp = tmp->next;
4322+ }
4323+
4324+ dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
4325
4326+ if (!dname)
4327+ return nomem;
4328 /*
4329- * Verify the RAID superblock on each real device
4330+ * ok, add this new device name to the list
4331 */
4332- for (i = 0; i < N; i++)
4333- if (analyze_one_sb(disks+i))
4334- goto abort;
4335+ hd = find_gendisk (dev);
4336+
4337+ if (!hd)
4338+ sprintf (dname->name, "[dev %s]", kdevname(dev));
4339+ else
4340+ disk_name (hd, MINOR(dev), dname->name);
4341+
4342+ dname->dev = dev;
4343+ md_list_add(&dname->list, &device_names);
4344+
4345+ return dname->name;
4346+}
4347+
4348+static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
4349+ int persistent)
4350+{
4351+ unsigned int size = 0;
4352+
4353+ if (blk_size[MAJOR(dev)])
4354+ size = blk_size[MAJOR(dev)][MINOR(dev)];
4355+ if (persistent)
4356+ size = MD_NEW_SIZE_BLOCKS(size);
4357+ return size;
4358+}
4359+
4360+static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
4361+{
4362+ unsigned int size;
4363+
4364+ size = calc_dev_sboffset(dev, mddev, persistent);
4365+ if (!mddev->sb) {
4366+ MD_BUG();
4367+ return size;
4368+ }
4369+ if (mddev->sb->chunk_size)
4370+ size &= ~(mddev->sb->chunk_size/1024 - 1);
4371+ return size;
4372+}
4373+
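calc_dev_sboffset() places the persistent superblock at the end of the member device: MD_NEW_SIZE_BLOCKS() rounds the raw size down to the superblock reservation boundary and steps back one reservation, so the returned value is both the usable data size and the superblock offset in 1 KB blocks. A self-contained sketch of that arithmetic, assuming the traditional 64 KiB MD_RESERVED_BYTES window from md_p.h:

/* sb_offset_sketch.c -- illustrate the end-of-device superblock placement */
#include <stdio.h>

#define SKETCH_RESERVED_BLOCKS	64	/* 64 KiB expressed in 1 KB blocks (assumption) */

/* mirrors MD_NEW_SIZE_BLOCKS(): round down to the reservation boundary,
 * then reserve one more window at the end for the superblock itself */
static unsigned int new_size_blocks(unsigned int blocks)
{
	return (blocks & ~(SKETCH_RESERVED_BLOCKS - 1)) - SKETCH_RESERVED_BLOCKS;
}

int main(void)
{
	unsigned int dev_blocks = 4200000;	/* example partition size in 1 KB blocks */
	unsigned int sb_offset = new_size_blocks(dev_blocks);

	printf("raw size %u KB -> usable size and sb offset %u KB\n",
	       dev_blocks, sb_offset);
	return 0;
}

calc_dev_size() then additionally masks the result down to a chunk_size multiple, which is why it needs mddev->sb.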
4374+/*
4375+ * We check whether all devices are numbered from 0 to nb_dev-1. The
4376+ * order is guaranteed even after device name changes.
4377+ *
4378+ * Some personalities (raid0, linear) use this. Personalities that
4379+ * provide data have to be able to deal with loss of individual
4380+ * disks, so they do their checking themselves.
4381+ */
4382+int md_check_ordering (mddev_t *mddev)
4383+{
4384+ int i, c;
4385+ mdk_rdev_t *rdev;
4386+ struct md_list_head *tmp;
4387
4388 /*
4389- * The superblock constant part has to be the same
4390- * for all disks in the array.
4391+ * First, all devices must be fully functional
4392 */
4393- sb = NULL;
4394- for (i = 0; i < N; i++) {
4395- if (!disks[i].sb)
4396- continue;
4397- if (!sb) {
4398- sb = disks[i].sb;
4399- continue;
4400- }
4401- if (memcmp(sb,
4402- disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) {
4403- printk (INCONSISTENT);
4404+ ITERATE_RDEV(mddev,rdev,tmp) {
4405+ if (rdev->faulty) {
4406+ printk("md: md%d's device %s faulty, aborting.\n",
4407+ mdidx(mddev), partition_name(rdev->dev));
4408 goto abort;
4409 }
4410 }
4411
4412- /*
4413- * OK, we have all disks and the array is ready to run. Let's
4414- * find the freshest superblock, that one will be the superblock
4415- * that represents the whole array.
4416- */
4417- if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL)
4418+ c = 0;
4419+ ITERATE_RDEV(mddev,rdev,tmp) {
4420+ c++;
4421+ }
4422+ if (c != mddev->nb_dev) {
4423+ MD_BUG();
4424 goto abort;
4425- freshest = NULL;
4426- for (i = 0; i < N; i++) {
4427- if (!disks[i].sb)
4428- continue;
4429- if (!freshest) {
4430- freshest = disks[i].sb;
4431- continue;
4432- }
4433- /*
4434- * Find the newest superblock version
4435- */
4436- if (disks[i].sb->utime != freshest->utime) {
4437- out_of_date = 1;
4438- if (disks[i].sb->utime > freshest->utime)
4439- freshest = disks[i].sb;
4440- }
4441 }
4442- if (out_of_date)
4443- printk(OUT_OF_DATE);
4444- memcpy (sb, freshest, sizeof(*freshest));
4445-
4446- /*
4447- * Check if we can support this RAID array
4448- */
4449- if (sb->major_version != MD_MAJOR_VERSION ||
4450- sb->minor_version > MD_MINOR_VERSION) {
4451-
4452- printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)),
4453- sb->major_version, sb->minor_version,
4454- sb->patch_version);
4455+ if (mddev->nb_dev != mddev->sb->raid_disks) {
4456+ printk("md: md%d, array needs %d disks, has %d, aborting.\n",
4457+ mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
4458 goto abort;
4459 }
4460-
4461 /*
4462- * We need to add this as a superblock option.
4463+ * Now the numbering check
4464 */
4465-#if SUPPORT_RECONSTRUCTION
4466- if (sb->state != (1 << MD_SB_CLEAN)) {
4467- if (sb->level == 1) {
4468- printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
4469+ for (i = 0; i < mddev->nb_dev; i++) {
4470+ c = 0;
4471+ ITERATE_RDEV(mddev,rdev,tmp) {
4472+ if (rdev->desc_nr == i)
4473+ c++;
4474+ }
4475+ if (c == 0) {
4476+ printk("md: md%d, missing disk #%d, aborting.\n",
4477+ mdidx(mddev), i);
4478 goto abort;
4479- } else
4480- printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor)));
4481- }
4482-#else
4483- if (sb->state != (1 << MD_SB_CLEAN)) {
4484- printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
4485- goto abort;
4486- }
4487-#endif /* SUPPORT_RECONSTRUCTION */
4488-
4489- switch (sb->level) {
4490- case 1:
4491- md_size[minor] = sb->size;
4492- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD;
4493- break;
4494- case 4:
4495- case 5:
4496- md_size[minor] = sb->size * (sb->raid_disks - 1);
4497- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1);
4498- break;
4499- default:
4500- printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)),
4501- sb->level);
4502+ }
4503+ if (c > 1) {
4504+ printk("md: md%d, too many disks #%d, aborting.\n",
4505+ mdidx(mddev), i);
4506 goto abort;
4507+ }
4508 }
4509 return 0;
4510 abort:
4511- free_sb(mddev);
4512 return 1;
4513 }
4514
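md_check_ordering() above requires that raid0/linear members carry descriptor numbers 0..nb_dev-1 with no gaps or duplicates, because those personalities address members by position. A standalone sketch of the counting loop (the arrays are illustrative, not real descriptor data):

/* ordering_sketch.c -- every desc_nr 0..n-1 must occur exactly once */
#include <stdio.h>

static int check_ordering(const int *desc_nr, int n)
{
	int i, j, count;

	for (i = 0; i < n; i++) {
		count = 0;
		for (j = 0; j < n; j++)
			if (desc_nr[j] == i)
				count++;
		if (count != 1)
			return 1;	/* missing or duplicated disk -> abort */
	}
	return 0;
}

int main(void)
{
	int good[] = { 2, 0, 1 };
	int bad[]  = { 0, 0, 2 };	/* disk #1 missing, #0 duplicated */

	printf("good: %d, bad: %d\n", check_ordering(good, 3), check_ordering(bad, 3));
	return 0;
}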
4515-#undef INCONSISTENT
4516-#undef OUT_OF_DATE
4517-#undef OLD_VERSION
4518-#undef NOT_CLEAN
4519-#undef OLD_LEVEL
4520-
4521-int md_update_sb(int minor)
4522+static unsigned int zoned_raid_size (mddev_t *mddev)
4523 {
4524- struct md_dev *mddev = md_dev + minor;
4525- struct buffer_head *bh;
4526- md_superblock_t *sb = mddev->sb;
4527- struct real_dev *realdev;
4528- kdev_t dev;
4529- int i;
4530- u32 sb_offset;
4531+ unsigned int mask;
4532+ mdk_rdev_t * rdev;
4533+ struct md_list_head *tmp;
4534
4535- sb->utime = CURRENT_TIME;
4536- for (i = 0; i < mddev->nb_dev; i++) {
4537- realdev = mddev->devices + i;
4538- if (!realdev->sb)
4539- continue;
4540- dev = realdev->dev;
4541- sb_offset = realdev->sb_offset;
4542- set_blocksize(dev, MD_SB_BYTES);
4543- printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset);
4544- bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
4545- if (bh) {
4546- sb = (md_superblock_t *) bh->b_data;
4547- memcpy(sb, mddev->sb, MD_SB_BYTES);
4548- memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4);
4549- mark_buffer_uptodate(bh, 1);
4550- mark_buffer_dirty(bh, 1);
4551- ll_rw_block(WRITE, 1, &bh);
4552- wait_on_buffer(bh);
4553- bforget(bh);
4554- fsync_dev(dev);
4555- invalidate_buffers(dev);
4556- } else
4557- printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
4558+ if (!mddev->sb) {
4559+ MD_BUG();
4560+ return -EINVAL;
4561+ }
4562+ /*
4563+ * do size and offset calculations.
4564+ */
4565+ mask = ~(mddev->sb->chunk_size/1024 - 1);
4566+printk("mask %08x\n", mask);
4567+
4568+ ITERATE_RDEV(mddev,rdev,tmp) {
4569+printk(" rdev->size: %d\n", rdev->size);
4570+ rdev->size &= mask;
4571+printk(" masked rdev->size: %d\n", rdev->size);
4572+ md_size[mdidx(mddev)] += rdev->size;
4573+printk(" new md_size: %d\n", md_size[mdidx(mddev)]);
4574 }
4575 return 0;
4576 }
4577
4578-static int do_md_run (int minor, int repart)
4579+static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
4580 {
4581- int pnum, i, min, factor, err;
4582+ if (disk_active(disk)) {
4583+ sb->working_disks--;
4584+ } else {
4585+ if (disk_spare(disk)) {
4586+ sb->spare_disks--;
4587+ sb->working_disks--;
4588+ } else {
4589+ sb->failed_disks--;
4590+ }
4591+ }
4592+ sb->nr_disks--;
4593+ disk->major = 0;
4594+ disk->minor = 0;
4595+ mark_disk_removed(disk);
4596+}
4597
4598- if (!md_dev[minor].nb_dev)
4599- return -EINVAL;
4600-
4601- if (md_dev[minor].pers)
4602- return -EBUSY;
4603+#define BAD_MAGIC KERN_ERR \
4604+"md: invalid raid superblock magic on %s\n"
4605
4606- md_dev[minor].repartition=repart;
4607-
4608- if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT))
4609- >= MAX_PERSONALITY)
4610- return -EINVAL;
4611-
4612- /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
4613- if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){
4614- for (i = 0; i < md_dev [minor].nb_dev; i++)
4615- if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR)
4616- return -EINVAL;
4617- }
4618- if (!pers[pnum])
4619- {
4620-#ifdef CONFIG_KMOD
4621- char module_name[80];
4622- sprintf (module_name, "md-personality-%d", pnum);
4623- request_module (module_name);
4624- if (!pers[pnum])
4625-#endif
4626- return -EINVAL;
4627- }
4628-
4629- factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
4630-
4631- for (i=0; i<md_dev[minor].nb_dev; i++)
4632- if (md_dev[minor].devices[i].size<min)
4633- {
4634- printk ("Dev %s smaller than %dk, cannot shrink\n",
4635- partition_name (md_dev[minor].devices[i].dev), min);
4636- return -EINVAL;
4637- }
4638-
4639- for (i=0; i<md_dev[minor].nb_dev; i++) {
4640- fsync_dev(md_dev[minor].devices[i].dev);
4641- invalidate_buffers(md_dev[minor].devices[i].dev);
4642- }
4643-
4644- /* Resize devices according to the factor. It is used to align
4645- partitions size on a given chunk size. */
4646- md_size[minor]=0;
4647-
4648- /*
4649- * Analyze the raid superblock
4650- */
4651- if (analyze_sbs(minor, pnum))
4652- return -EINVAL;
4653+#define BAD_MINOR KERN_ERR \
4654+"md: %s: invalid raid minor (%x)\n"
4655
4656- md_dev[minor].pers=pers[pnum];
4657-
4658- if ((err=md_dev[minor].pers->run (minor, md_dev+minor)))
4659- {
4660- md_dev[minor].pers=NULL;
4661- free_sb(md_dev + minor);
4662- return (err);
4663- }
4664-
4665- if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT)
4666- {
4667- md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN);
4668- md_update_sb(minor);
4669- }
4670-
4671- /* FIXME : We assume here we have blocks
4672- that are twice as large as sectors.
4673- THIS MAY NOT BE TRUE !!! */
4674- md_hd_struct[minor].start_sect=0;
4675- md_hd_struct[minor].nr_sects=md_size[minor]<<1;
4676-
4677- read_ahead[MD_MAJOR] = 128;
4678- return (0);
4679-}
4680+#define OUT_OF_MEM KERN_ALERT \
4681+"md: out of memory.\n"
4682+
4683+#define NO_SB KERN_ERR \
4684+"md: disabled device %s, could not read superblock.\n"
4685
4686-static int do_md_stop (int minor, struct inode *inode)
4687+#define BAD_CSUM KERN_WARNING \
4688+"md: invalid superblock checksum on %s\n"
4689+
4690+static int alloc_array_sb (mddev_t * mddev)
4691 {
4692- int i;
4693-
4694- if (inode->i_count>1 || md_dev[minor].busy>1) {
4695- /*
4696- * ioctl : one open channel
4697- */
4698- printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
4699- minor, inode->i_count, md_dev[minor].busy);
4700- return -EBUSY;
4701- }
4702-
4703- if (md_dev[minor].pers) {
4704- /*
4705- * It is safe to call stop here, it only frees private
4706- * data. Also, it tells us if a device is unstoppable
4707- * (eg. resyncing is in progress)
4708- */
4709- if (md_dev[minor].pers->stop (minor, md_dev+minor))
4710- return -EBUSY;
4711- /*
4712- * The device won't exist anymore -> flush it now
4713- */
4714- fsync_dev (inode->i_rdev);
4715- invalidate_buffers (inode->i_rdev);
4716- if (md_dev[minor].sb) {
4717- md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
4718- md_update_sb(minor);
4719- }
4720+ if (mddev->sb) {
4721+ MD_BUG();
4722+ return 0;
4723 }
4724-
4725- /* Remove locks. */
4726- if (md_dev[minor].sb)
4727- free_sb(md_dev + minor);
4728- for (i=0; i<md_dev[minor].nb_dev; i++)
4729- clear_inode (md_dev[minor].devices[i].inode);
4730-
4731- md_dev[minor].nb_dev=md_size[minor]=0;
4732- md_hd_struct[minor].nr_sects=0;
4733- md_dev[minor].pers=NULL;
4734-
4735- read_ahead[MD_MAJOR] = 128;
4736-
4737- return (0);
4738+
4739+ mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
4740+ if (!mddev->sb)
4741+ return -ENOMEM;
4742+ md_clear_page((unsigned long)mddev->sb);
4743+ return 0;
4744 }
4745
4746-static int do_md_add (int minor, kdev_t dev)
4747+static int alloc_disk_sb (mdk_rdev_t * rdev)
4748 {
4749- int i;
4750- int hot_add=0;
4751- struct real_dev *realdev;
4752+ if (rdev->sb)
4753+ MD_BUG();
4754
4755- if (md_dev[minor].nb_dev==MAX_REAL)
4756+ rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
4757+ if (!rdev->sb) {
4758+ printk (OUT_OF_MEM);
4759 return -EINVAL;
4760+ }
4761+ md_clear_page((unsigned long)rdev->sb);
4762
4763- if (!fs_may_mount (dev))
4764- return -EBUSY;
4765+ return 0;
4766+}
4767
4768- if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) {
4769- printk("md_add(): zero device size, huh, bailing out.\n");
4770- return -EINVAL;
4771+static void free_disk_sb (mdk_rdev_t * rdev)
4772+{
4773+ if (rdev->sb) {
4774+ free_page((unsigned long) rdev->sb);
4775+ rdev->sb = NULL;
4776+ rdev->sb_offset = 0;
4777+ rdev->size = 0;
4778+ } else {
4779+ if (!rdev->faulty)
4780+ MD_BUG();
4781 }
4782+}
4783
4784- if (md_dev[minor].pers) {
4785- /*
4786- * The array is already running, hot-add the drive, or
4787- * bail out:
4788- */
4789- if (!md_dev[minor].pers->hot_add_disk)
4790- return -EBUSY;
4791- else
4792- hot_add=1;
4793+static void mark_rdev_faulty (mdk_rdev_t * rdev)
4794+{
4795+ unsigned long flags;
4796+
4797+ if (!rdev) {
4798+ MD_BUG();
4799+ return;
4800 }
4801+ save_flags(flags);
4802+ cli();
4803+ free_disk_sb(rdev);
4804+ rdev->faulty = 1;
4805+ restore_flags(flags);
4806+}
4807+
4808+static int read_disk_sb (mdk_rdev_t * rdev)
4809+{
4810+ int ret = -EINVAL;
4811+ struct buffer_head *bh = NULL;
4812+ kdev_t dev = rdev->dev;
4813+ mdp_super_t *sb;
4814+ u32 sb_offset;
4815
4816+ if (!rdev->sb) {
4817+ MD_BUG();
4818+ goto abort;
4819+ }
4820+
4821 /*
4822- * Careful. We cannot increase nb_dev for a running array.
4823+ * Calculate the position of the superblock,
4824+ * it's at the end of the disk
4825 */
4826- i=md_dev[minor].nb_dev;
4827- realdev = &md_dev[minor].devices[i];
4828- realdev->dev=dev;
4829-
4830- /* Lock the device by inserting a dummy inode. This doesn't
4831- smell very good, but I need to be consistent with the
4832- mount stuff, specially with fs_may_mount. If someone have
4833- a better idea, please help ! */
4834-
4835- realdev->inode=get_empty_inode ();
4836- realdev->inode->i_dev=dev; /* don't care about other fields */
4837- insert_inode_hash (realdev->inode);
4838-
4839- /* Sizes are now rounded at run time */
4840-
4841-/* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/
4842-
4843- realdev->size=blk_size[MAJOR(dev)][MINOR(dev)];
4844+ sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
4845+ rdev->sb_offset = sb_offset;
4846+ printk("(read) %s's sb offset: %d", partition_name(dev),
4847+ sb_offset);
4848+ fsync_dev(dev);
4849+ set_blocksize (dev, MD_SB_BYTES);
4850+ bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
4851
4852- if (hot_add) {
4853+ if (bh) {
4854+ sb = (mdp_super_t *) bh->b_data;
4855+ memcpy (rdev->sb, sb, MD_SB_BYTES);
4856+ } else {
4857+ printk (NO_SB,partition_name(rdev->dev));
4858+ goto abort;
4859+ }
4860+ printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events));
4861+ ret = 0;
4862+abort:
4863+ if (bh)
4864+ brelse (bh);
4865+ return ret;
4866+}
4867+
4868+static unsigned int calc_sb_csum (mdp_super_t * sb)
4869+{
4870+ unsigned int disk_csum, csum;
4871+
4872+ disk_csum = sb->sb_csum;
4873+ sb->sb_csum = 0;
4874+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
4875+ sb->sb_csum = disk_csum;
4876+ return csum;
4877+}
4878+
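calc_sb_csum() uses the common trick of checksumming a block that contains its own checksum field: save the on-disk value, zero the field, checksum the whole 4 KB superblock, then restore it. The sketch below shows only that pattern; the plain 32-bit word sum is a stand-in for the kernel's csum_partial(), and the struct layout is illustrative, not the real mdp_super_t:

/* csum_sketch.c -- the zero-the-field-then-sum pattern from calc_sb_csum() */
#include <stdio.h>
#include <string.h>

#define SB_BYTES 4096

struct sketch_sb {
	unsigned int sb_csum;			/* stands in for mdp_super_t->sb_csum */
	unsigned int pad[SB_BYTES / 4 - 1];
};

static unsigned int sketch_csum(struct sketch_sb *sb)
{
	unsigned int *p = (unsigned int *) sb;
	unsigned int saved = sb->sb_csum, sum = 0, i;

	sb->sb_csum = 0;			/* the field must not checksum itself */
	for (i = 0; i < SB_BYTES / 4; i++)	/* sum the whole 4 KB block */
		sum += p[i];
	sb->sb_csum = saved;			/* leave the on-disk value untouched */
	return sum;
}

int main(void)
{
	struct sketch_sb sb;

	memset(&sb, 0xab, sizeof(sb));
	sb.sb_csum = sketch_csum(&sb);		/* write side */
	printf("verify: %s\n", sketch_csum(&sb) == sb.sb_csum ? "ok" : "bad");
	return 0;
}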
4879+/*
4880+ * Check one RAID superblock for generic plausibility
4881+ */
4882+
4883+static int check_disk_sb (mdk_rdev_t * rdev)
4884+{
4885+ mdp_super_t *sb;
4886+ int ret = -EINVAL;
4887+
4888+ sb = rdev->sb;
4889+ if (!sb) {
4890+ MD_BUG();
4891+ goto abort;
4892+ }
4893+
4894+ if (sb->md_magic != MD_SB_MAGIC) {
4895+ printk (BAD_MAGIC, partition_name(rdev->dev));
4896+ goto abort;
4897+ }
4898+
4899+ if (sb->md_minor >= MAX_MD_DEVS) {
4900+ printk (BAD_MINOR, partition_name(rdev->dev),
4901+ sb->md_minor);
4902+ goto abort;
4903+ }
4904+
4905+ if (calc_sb_csum(sb) != sb->sb_csum)
4906+ printk(BAD_CSUM, partition_name(rdev->dev));
4907+ ret = 0;
4908+abort:
4909+ return ret;
4910+}
4911+
4912+static kdev_t dev_unit(kdev_t dev)
4913+{
4914+ unsigned int mask;
4915+ struct gendisk *hd = find_gendisk(dev);
4916+
4917+ if (!hd)
4918+ return 0;
4919+ mask = ~((1 << hd->minor_shift) - 1);
4920+
4921+ return MKDEV(MAJOR(dev), MINOR(dev) & mask);
4922+}
4923+
4924+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
4925+{
4926+ struct md_list_head *tmp;
4927+ mdk_rdev_t *rdev;
4928+
4929+ ITERATE_RDEV(mddev,rdev,tmp)
4930+ if (dev_unit(rdev->dev) == dev_unit(dev))
4931+ return rdev;
4932+
4933+ return NULL;
4934+}
4935+
4936+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
4937+{
4938+ struct md_list_head *tmp;
4939+ mdk_rdev_t *rdev;
4940+
4941+ ITERATE_RDEV(mddev1,rdev,tmp)
4942+ if (match_dev_unit(mddev2, rdev->dev))
4943+ return 1;
4944+
4945+ return 0;
4946+}
4947+
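match_dev_unit() above answers "do these two block devices live on the same physical disk?" by masking off the partition bits of the minor number with the gendisk's minor_shift. A standalone sketch, assuming an IDE-style minor_shift of 6 (64 minors per disk), since the real value comes from find_gendisk():

/* dev_unit_sketch.c -- partitions of one disk map to the same unit */
#include <stdio.h>

#define SKETCH_MINOR_SHIFT 6	/* IDE-style: hda = minors 0-63, hdb = 64-127 (assumption) */

static unsigned int whole_disk_minor(unsigned int minor)
{
	return minor & ~((1u << SKETCH_MINOR_SHIFT) - 1);
}

int main(void)
{
	/* on major 3: hda1 = minor 1, hda2 = minor 2, hdb1 = minor 65 */
	printf("hda1 vs hda2: %s\n",
	       whole_disk_minor(1) == whole_disk_minor(2) ? "same disk" : "different disks");
	printf("hda1 vs hdb1: %s\n",
	       whole_disk_minor(1) == whole_disk_minor(65) ? "same disk" : "different disks");
	return 0;
}

bind_rdev_to_array() below uses this to warn when two members of one array share a spindle, since that defeats the protection against a single-disk failure.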
4948+static MD_LIST_HEAD(all_raid_disks);
4949+static MD_LIST_HEAD(pending_raid_disks);
4950+
4951+static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
4952+{
4953+ mdk_rdev_t *same_pdev;
4954+
4955+ if (rdev->mddev) {
4956+ MD_BUG();
4957+ return;
4958+ }
4959+ same_pdev = match_dev_unit(mddev, rdev->dev);
4960+ if (same_pdev)
4961+ printk( KERN_WARNING
4962+"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
4963+" protection against single-disk failure might be compromised.\n",
4964+ mdidx(mddev), partition_name(rdev->dev),
4965+ partition_name(same_pdev->dev));
4966+
4967+ md_list_add(&rdev->same_set, &mddev->disks);
4968+ rdev->mddev = mddev;
4969+ mddev->nb_dev++;
4970+ printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
4971+}
4972+
4973+static void unbind_rdev_from_array (mdk_rdev_t * rdev)
4974+{
4975+ if (!rdev->mddev) {
4976+ MD_BUG();
4977+ return;
4978+ }
4979+ md_list_del(&rdev->same_set);
4980+ MD_INIT_LIST_HEAD(&rdev->same_set);
4981+ rdev->mddev->nb_dev--;
4982+ printk("unbind<%s,%d>\n", partition_name(rdev->dev),
4983+ rdev->mddev->nb_dev);
4984+ rdev->mddev = NULL;
4985+}
4986+
4987+/*
4988+ * prevent the device from being mounted, repartitioned or
4989+ * otherwise reused by a RAID array (or any other kernel
4990+ * subsystem), by opening the device. [simply getting an
4991+ * inode is not enough, the SCSI module usage code needs
4992+ * an explicit open() on the device]
4993+ */
4994+static int lock_rdev (mdk_rdev_t *rdev)
4995+{
4996+ int err = 0;
4997+
4998+ /*
4999+ * First insert a dummy inode.
5000+ */
5001+ if (rdev->inode)
5002+ MD_BUG();
5003+ rdev->inode = get_empty_inode();
5004+ /*
5005+ * we don't care about any other fields
5006+ */
5007+ rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev;
5008+ insert_inode_hash(rdev->inode);
5009+
5010+ memset(&rdev->filp, 0, sizeof(rdev->filp));
5011+ rdev->filp.f_mode = 3; /* read write */
5012+ err = blkdev_open(rdev->inode, &rdev->filp);
5013+ if (err) {
5014+ printk("blkdev_open() failed: %d\n", err);
5015+ clear_inode(rdev->inode);
5016+ rdev->inode = NULL;
5017+ }
5018+ return err;
5019+}
5020+
5021+static void unlock_rdev (mdk_rdev_t *rdev)
5022+{
5023+ blkdev_release(rdev->inode);
5024+ if (!rdev->inode)
5025+ MD_BUG();
5026+ clear_inode(rdev->inode);
5027+ rdev->inode = NULL;
5028+}
5029+
5030+static void export_rdev (mdk_rdev_t * rdev)
5031+{
5032+ printk("export_rdev(%s)\n",partition_name(rdev->dev));
5033+ if (rdev->mddev)
5034+ MD_BUG();
5035+ unlock_rdev(rdev);
5036+ free_disk_sb(rdev);
5037+ md_list_del(&rdev->all);
5038+ MD_INIT_LIST_HEAD(&rdev->all);
5039+ if (rdev->pending.next != &rdev->pending) {
5040+ printk("(%s was pending)\n",partition_name(rdev->dev));
5041+ md_list_del(&rdev->pending);
5042+ MD_INIT_LIST_HEAD(&rdev->pending);
5043+ }
5044+ rdev->dev = 0;
5045+ rdev->faulty = 0;
5046+ kfree(rdev);
5047+}
5048+
5049+static void kick_rdev_from_array (mdk_rdev_t * rdev)
5050+{
5051+ unbind_rdev_from_array(rdev);
5052+ export_rdev(rdev);
5053+}
5054+
5055+static void export_array (mddev_t *mddev)
5056+{
5057+ struct md_list_head *tmp;
5058+ mdk_rdev_t *rdev;
5059+ mdp_super_t *sb = mddev->sb;
5060+
5061+ if (mddev->sb) {
5062+ mddev->sb = NULL;
5063+ free_page((unsigned long) sb);
5064+ }
5065+
5066+ ITERATE_RDEV(mddev,rdev,tmp) {
5067+ if (!rdev->mddev) {
5068+ MD_BUG();
5069+ continue;
5070+ }
5071+ kick_rdev_from_array(rdev);
5072+ }
5073+ if (mddev->nb_dev)
5074+ MD_BUG();
5075+}
5076+
5077+#undef BAD_CSUM
5078+#undef BAD_MAGIC
5079+#undef OUT_OF_MEM
5080+#undef NO_SB
5081+
5082+static void print_desc(mdp_disk_t *desc)
5083+{
5084+ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
5085+ partition_name(MKDEV(desc->major,desc->minor)),
5086+ desc->major,desc->minor,desc->raid_disk,desc->state);
5087+}
5088+
5089+static void print_sb(mdp_super_t *sb)
5090+{
5091+ int i;
5092+
5093+ printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
5094+ sb->major_version, sb->minor_version, sb->patch_version,
5095+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
5096+ sb->ctime);
5097+ printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
5098+ sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
5099+ sb->layout, sb->chunk_size);
5100+ printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
5101+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
5102+ sb->failed_disks, sb->spare_disks,
5103+ sb->sb_csum, (unsigned long)get_unaligned(&sb->events));
5104+
5105+ for (i = 0; i < MD_SB_DISKS; i++) {
5106+ mdp_disk_t *desc;
5107+
5108+ desc = sb->disks + i;
5109+ printk(" D %2d: ", i);
5110+ print_desc(desc);
5111+ }
5112+ printk(" THIS: ");
5113+ print_desc(&sb->this_disk);
5114+
5115+}
5116+
5117+static void print_rdev(mdk_rdev_t *rdev)
5118+{
5119+ printk(" rdev %s: O:%s, SZ:%08d F:%d DN:%d ",
5120+ partition_name(rdev->dev), partition_name(rdev->old_dev),
5121+ rdev->size, rdev->faulty, rdev->desc_nr);
5122+ if (rdev->sb) {
5123+ printk("rdev superblock:\n");
5124+ print_sb(rdev->sb);
5125+ } else
5126+ printk("no rdev superblock!\n");
5127+}
5128+
5129+void md_print_devices (void)
5130+{
5131+ struct md_list_head *tmp, *tmp2;
5132+ mdk_rdev_t *rdev;
5133+ mddev_t *mddev;
5134+
5135+ printk("\n");
5136+ printk(" **********************************\n");
5137+ printk(" * <COMPLETE RAID STATE PRINTOUT> *\n");
5138+ printk(" **********************************\n");
5139+ ITERATE_MDDEV(mddev,tmp) {
5140+ printk("md%d: ", mdidx(mddev));
5141+
5142+ ITERATE_RDEV(mddev,rdev,tmp2)
5143+ printk("<%s>", partition_name(rdev->dev));
5144+
5145+ if (mddev->sb) {
5146+ printk(" array superblock:\n");
5147+ print_sb(mddev->sb);
5148+ } else
5149+ printk(" no array superblock.\n");
5150+
5151+ ITERATE_RDEV(mddev,rdev,tmp2)
5152+ print_rdev(rdev);
5153+ }
5154+ printk(" **********************************\n");
5155+ printk("\n");
5156+}
5157+
5158+static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
5159+{
5160+ int ret;
5161+ mdp_super_t *tmp1, *tmp2;
5162+
5163+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
5164+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
5165+
5166+ if (!tmp1 || !tmp2) {
5167+ ret = 0;
5168+ goto abort;
5169+ }
5170+
5171+ *tmp1 = *sb1;
5172+ *tmp2 = *sb2;
5173+
5174+ /*
5175+ * nr_disks is not constant
5176+ */
5177+ tmp1->nr_disks = 0;
5178+ tmp2->nr_disks = 0;
5179+
5180+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
5181+ ret = 0;
5182+ else
5183+ ret = 1;
5184+
5185+abort:
5186+ if (tmp1)
5187+ kfree(tmp1);
5188+ if (tmp2)
5189+ kfree(tmp2);
5190+
5191+ return ret;
5192+}
5193+
5194+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
5195+{
5196+ if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
5197+ (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
5198+ (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
5199+ (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
5200+
5201+ return 1;
5202+
5203+ return 0;
5204+}
5205+
5206+static mdk_rdev_t * find_rdev_all (kdev_t dev)
5207+{
5208+ struct md_list_head *tmp;
5209+ mdk_rdev_t *rdev;
5210+
5211+ tmp = all_raid_disks.next;
5212+ while (tmp != &all_raid_disks) {
5213+ rdev = md_list_entry(tmp, mdk_rdev_t, all);
5214+ if (rdev->dev == dev)
5215+ return rdev;
5216+ tmp = tmp->next;
5217+ }
5218+ return NULL;
5219+}
5220+
5221+#define GETBLK_FAILED KERN_ERR \
5222+"md: getblk failed for device %s\n"
5223+
5224+static int write_disk_sb(mdk_rdev_t * rdev)
5225+{
5226+ struct buffer_head *bh;
5227+ kdev_t dev;
5228+ u32 sb_offset, size;
5229+ mdp_super_t *sb;
5230+
5231+ if (!rdev->sb) {
5232+ MD_BUG();
5233+ return -1;
5234+ }
5235+ if (rdev->faulty) {
5236+ MD_BUG();
5237+ return -1;
5238+ }
5239+ if (rdev->sb->md_magic != MD_SB_MAGIC) {
5240+ MD_BUG();
5241+ return -1;
5242+ }
5243+
5244+ dev = rdev->dev;
5245+ sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
5246+ if (rdev->sb_offset != sb_offset) {
5247+ printk("%s's sb offset has changed from %d to %d, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
5248+ goto skip;
5249+ }
5250+ /*
5251+ * If the disk went offline meanwhile and it's just a spare, then
5252+ * its size has changed to zero silently, and the MD code does
5253+ * not yet know that it's faulty.
5254+ */
5255+ size = calc_dev_size(dev, rdev->mddev, 1);
5256+ if (size != rdev->size) {
5257+ printk("%s's size has changed from %d to %d since import, skipping\n", partition_name(dev), rdev->size, size);
5258+ goto skip;
5259+ }
5260+
5261+ printk("(write) %s's sb offset: %d\n", partition_name(dev), sb_offset);
5262+ fsync_dev(dev);
5263+ set_blocksize(dev, MD_SB_BYTES);
5264+ bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
5265+ if (!bh) {
5266+ printk(GETBLK_FAILED, partition_name(dev));
5267+ return 1;
5268+ }
5269+ memset(bh->b_data,0,bh->b_size);
5270+ sb = (mdp_super_t *) bh->b_data;
5271+ memcpy(sb, rdev->sb, MD_SB_BYTES);
5272+
5273+ mark_buffer_uptodate(bh, 1);
5274+ mark_buffer_dirty(bh, 1);
5275+ ll_rw_block(WRITE, 1, &bh);
5276+ wait_on_buffer(bh);
5277+ brelse(bh);
5278+ fsync_dev(dev);
5279+skip:
5280+ return 0;
5281+}
5282+#undef GETBLK_FAILED
5283+
5284+static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
5285+{
5286+ int i, ok = 0;
5287+ mdp_disk_t *desc;
5288+
5289+ for (i = 0; i < MD_SB_DISKS; i++) {
5290+ desc = mddev->sb->disks + i;
5291+#if 0
5292+ if (disk_faulty(desc)) {
5293+ if (MKDEV(desc->major,desc->minor) == rdev->dev)
5294+ ok = 1;
5295+ continue;
5296+ }
5297+#endif
5298+ if (MKDEV(desc->major,desc->minor) == rdev->dev) {
5299+ rdev->sb->this_disk = *desc;
5300+ rdev->desc_nr = desc->number;
5301+ ok = 1;
5302+ break;
5303+ }
5304+ }
5305+
5306+ if (!ok) {
5307+ MD_BUG();
5308+ }
5309+}
5310+
5311+static int sync_sbs(mddev_t * mddev)
5312+{
5313+ mdk_rdev_t *rdev;
5314+ mdp_super_t *sb;
5315+ struct md_list_head *tmp;
5316+
5317+ ITERATE_RDEV(mddev,rdev,tmp) {
5318+ if (rdev->faulty)
5319+ continue;
5320+ sb = rdev->sb;
5321+ *sb = *mddev->sb;
5322+ set_this_disk(mddev, rdev);
5323+ sb->sb_csum = calc_sb_csum(sb);
5324+ }
5325+ return 0;
5326+}
5327+
5328+int md_update_sb(mddev_t * mddev)
5329+{
5330+ int first, err, count = 100;
5331+ struct md_list_head *tmp;
5332+ mdk_rdev_t *rdev;
5333+ __u64 ev;
5334+
5335+repeat:
5336+ mddev->sb->utime = CURRENT_TIME;
5337+ ev = get_unaligned(&mddev->sb->events);
5338+ ++ev;
5339+ put_unaligned(ev,&mddev->sb->events);
5340+ if (ev == (__u64)0) {
5341+ /*
5342+ * oops, this 64-bit counter should never wrap.
5343+ * Either we are in around ~1 trillion A.C., assuming
5344+ * 1 reboot per second, or we have a bug:
5345+ */
5346+ MD_BUG();
5347+ --ev;
5348+ put_unaligned(ev,&mddev->sb->events);
5349+ }
5350+ sync_sbs(mddev);
5351+
5352+ /*
5353+ * do not write anything to disk if using
5354+ * nonpersistent superblocks
5355+ */
5356+ if (mddev->sb->not_persistent)
5357+ return 0;
5358+
5359+ printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
5360+ mdidx(mddev));
5361+
5362+ first = 1;
5363+ err = 0;
5364+ ITERATE_RDEV(mddev,rdev,tmp) {
5365+ if (!first) {
5366+ printk(", ");
5367+ }
5368+ first = 0;
5369+ if (rdev->faulty)
5370+ printk("(skipping faulty ");
5371+ printk("%s ", partition_name(rdev->dev));
5372+ if (!rdev->faulty) {
5373+ printk("[events: %08lx]",
5374+ (unsigned long)get_unaligned(&rdev->sb->events));
5375+ err += write_disk_sb(rdev);
5376+ } else
5377+ printk(")\n");
5378+ }
5379+ printk(".\n");
5380+ if (err) {
5381+ printk("errors occurred during superblock update, repeating\n");
5382+ if (--count)
5383+ goto repeat;
5384+ printk("excessive errors occurred during superblock update, exiting\n");
5385+ }
5386+ return 0;
5387+}
5388+
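md_update_sb() bumps a monotonically increasing 64-bit event counter and writes it to every member, so after a crash the member with the highest counter carries the authoritative superblock and members that missed updates can be spotted (analyze_sbs() below kicks a disk whose counter is more than one event behind). A small sketch of the selection step; the struct is illustrative, not the real mdp_super_t:

/* events_sketch.c -- pick the freshest member by event counter */
#include <stdio.h>

struct member {
	const char *name;
	unsigned long long events;	/* stands in for sb->events */
};

static const struct member *pick_freshest(const struct member *m, int n)
{
	const struct member *best = &m[0];
	int i;

	for (i = 1; i < n; i++)
		if (m[i].events > best->events)
			best = &m[i];
	return best;
}

int main(void)
{
	struct member disks[] = {
		{ "member0", 42 },
		{ "member1", 42 },
		{ "member2", 40 },	/* missed two updates, will be kicked as non-fresh */
	};
	const struct member *fresh = pick_freshest(disks, 3);

	printf("freshest: %s (events %llu)\n", fresh->name, fresh->events);
	return 0;
}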
5389+/*
5390+ * Import a device. If 'on_disk', then sanity check the superblock
5391+ *
5392+ * mark the device faulty if:
5393+ *
5394+ * - the device is nonexistent (zero size)
5395+ * - the device has no valid superblock
5396+ *
5397+ * a faulty rdev _never_ has rdev->sb set.
5398+ */
5399+static int md_import_device (kdev_t newdev, int on_disk)
5400+{
5401+ int err;
5402+ mdk_rdev_t *rdev;
5403+ unsigned int size;
5404+
5405+ if (find_rdev_all(newdev))
5406+ return -EEXIST;
5407+
5408+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
5409+ if (!rdev) {
5410+ printk("could not alloc mem for %s!\n", partition_name(newdev));
5411+ return -ENOMEM;
5412+ }
5413+ memset(rdev, 0, sizeof(*rdev));
5414+
5415+ if (!fs_may_mount(newdev)) {
5416+ printk("md: can not import %s, has active inodes!\n",
5417+ partition_name(newdev));
5418+ err = -EBUSY;
5419+ goto abort_free;
5420+ }
5421+
5422+ if ((err = alloc_disk_sb(rdev)))
5423+ goto abort_free;
5424+
5425+ rdev->dev = newdev;
5426+ if (lock_rdev(rdev)) {
5427+ printk("md: could not lock %s, zero-size? Marking faulty.\n",
5428+ partition_name(newdev));
5429+ err = -EINVAL;
5430+ goto abort_free;
5431+ }
5432+ rdev->desc_nr = -1;
5433+ rdev->faulty = 0;
5434+
5435+ size = 0;
5436+ if (blk_size[MAJOR(newdev)])
5437+ size = blk_size[MAJOR(newdev)][MINOR(newdev)];
5438+ if (!size) {
5439+ printk("md: %s has zero size, marking faulty!\n",
5440+ partition_name(newdev));
5441+ err = -EINVAL;
5442+ goto abort_free;
5443+ }
5444+
5445+ if (on_disk) {
5446+ if ((err = read_disk_sb(rdev))) {
5447+ printk("md: could not read %s's sb, not importing!\n",
5448+ partition_name(newdev));
5449+ goto abort_free;
5450+ }
5451+ if ((err = check_disk_sb(rdev))) {
5452+ printk("md: %s has invalid sb, not importing!\n",
5453+ partition_name(newdev));
5454+ goto abort_free;
5455+ }
5456+
5457+ rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
5458+ rdev->sb->this_disk.minor);
5459+ rdev->desc_nr = rdev->sb->this_disk.number;
5460+ }
5461+ md_list_add(&rdev->all, &all_raid_disks);
5462+ MD_INIT_LIST_HEAD(&rdev->pending);
5463+
5464+ if (rdev->faulty && rdev->sb)
5465+ free_disk_sb(rdev);
5466+ return 0;
5467+
5468+abort_free:
5469+ if (rdev->sb) {
5470+ if (rdev->inode)
5471+ unlock_rdev(rdev);
5472+ free_disk_sb(rdev);
5473+ }
5474+ kfree(rdev);
5475+ return err;
5476+}
5477+
5478+/*
5479+ * Check a full RAID array for plausibility
5480+ */
5481+
5482+#define INCONSISTENT KERN_ERR \
5483+"md: fatal superblock inconsistency in %s -- removing from array\n"
5484+
5485+#define OUT_OF_DATE KERN_ERR \
5486+"md: superblock update time inconsistency -- using the most recent one\n"
5487+
5488+#define OLD_VERSION KERN_ALERT \
5489+"md: md%d: unsupported raid array version %d.%d.%d\n"
5490+
5491+#define NOT_CLEAN_IGNORE KERN_ERR \
5492+"md: md%d: raid array is not clean -- starting background reconstruction\n"
5493+
5494+#define UNKNOWN_LEVEL KERN_ERR \
5495+"md: md%d: unsupported raid level %d\n"
5496+
5497+static int analyze_sbs (mddev_t * mddev)
5498+{
5499+ int out_of_date = 0, i;
5500+ struct md_list_head *tmp, *tmp2;
5501+ mdk_rdev_t *rdev, *rdev2, *freshest;
5502+ mdp_super_t *sb;
5503+
5504+ /*
5505+ * Verify the RAID superblock on each real device
5506+ */
5507+ ITERATE_RDEV(mddev,rdev,tmp) {
5508+ if (rdev->faulty) {
5509+ MD_BUG();
5510+ goto abort;
5511+ }
5512+ if (!rdev->sb) {
5513+ MD_BUG();
5514+ goto abort;
5515+ }
5516+ if (check_disk_sb(rdev))
5517+ goto abort;
5518+ }
5519+
5520+ /*
5521+ * The superblock constant part has to be the same
5522+ * for all disks in the array.
5523+ */
5524+ sb = NULL;
5525+
5526+ ITERATE_RDEV(mddev,rdev,tmp) {
5527+ if (!sb) {
5528+ sb = rdev->sb;
5529+ continue;
5530+ }
5531+ if (!sb_equal(sb, rdev->sb)) {
5532+ printk (INCONSISTENT, partition_name(rdev->dev));
5533+ kick_rdev_from_array(rdev);
5534+ continue;
5535+ }
5536+ }
5537+
5538+ /*
5539+ * OK, we have all disks and the array is ready to run. Let's
5540+ * find the freshest superblock, that one will be the superblock
5541+ * that represents the whole array.
5542+ */
5543+ if (!mddev->sb)
5544+ if (alloc_array_sb(mddev))
5545+ goto abort;
5546+ sb = mddev->sb;
5547+ freshest = NULL;
5548+
5549+ ITERATE_RDEV(mddev,rdev,tmp) {
5550+ __u64 ev1, ev2;
5551+ /*
5552+ * if the checksum is invalid, use the superblock
5553+ * only as a last resort. (decrease its age by
5554+ * one event)
5555+ */
5556+ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
5557+ __u64 ev = get_unaligned(&rdev->sb->events);
5558+ if (ev != (__u64)0) {
5559+ --ev;
5560+ put_unaligned(ev,&rdev->sb->events);
5561+ }
5562+ }
5563+
5564+ printk("%s's event counter: %08lx\n", partition_name(rdev->dev),
5565+ (unsigned long)get_unaligned(&rdev->sb->events));
5566+ if (!freshest) {
5567+ freshest = rdev;
5568+ continue;
5569+ }
5570+ /*
5571+ * Find the newest superblock version
5572+ */
5573+ ev1 = get_unaligned(&rdev->sb->events);
5574+ ev2 = get_unaligned(&freshest->sb->events);
5575+ if (ev1 != ev2) {
5576+ out_of_date = 1;
5577+ if (ev1 > ev2)
5578+ freshest = rdev;
5579+ }
5580+ }
5581+ if (out_of_date) {
5582+ printk(OUT_OF_DATE);
5583+ printk("freshest: %s\n", partition_name(freshest->dev));
5584+ }
5585+ memcpy (sb, freshest->sb, sizeof(*sb));
5586+
5587+ /*
5588+ * at this point we have picked the 'best' superblock
5589+ * from all available superblocks.
5590+ * now we validate this superblock and kick out possibly
5591+ * failed disks.
5592+ */
5593+ ITERATE_RDEV(mddev,rdev,tmp) {
5594+ /*
5595+ * Kick all non-fresh devices faulty
5596+ */
5597+ __u64 ev1, ev2;
5598+ ev1 = get_unaligned(&rdev->sb->events);
5599+ ev2 = get_unaligned(&sb->events);
5600+ ++ev1;
5601+ if (ev1 < ev2) {
5602+ printk("md: kicking non-fresh %s from array!\n",
5603+ partition_name(rdev->dev));
5604+ kick_rdev_from_array(rdev);
5605+ continue;
5606+ }
5607+ }
5608+
5609+ /*
5610+ * Fix up changed device names ... but only if this disk has a
5611+ * recent update time. Disks with a faulty checksum are accepted too.
5612+ */
5613+ ITERATE_RDEV(mddev,rdev,tmp) {
5614+ __u64 ev1, ev2, ev3;
5615+ if (rdev->faulty) { /* REMOVEME */
5616+ MD_BUG();
5617+ goto abort;
5618+ }
5619+ ev1 = get_unaligned(&rdev->sb->events);
5620+ ev2 = get_unaligned(&sb->events);
5621+ ev3 = ev2;
5622+ --ev3;
5623+ if ((rdev->dev != rdev->old_dev) &&
5624+ ((ev1 == ev2) || (ev1 == ev3))) {
5625+ mdp_disk_t *desc;
5626+
5627+ printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev));
5628+ if (rdev->desc_nr == -1) {
5629+ MD_BUG();
5630+ goto abort;
5631+ }
5632+ desc = &sb->disks[rdev->desc_nr];
5633+ if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
5634+ MD_BUG();
5635+ goto abort;
5636+ }
5637+ desc->major = MAJOR(rdev->dev);
5638+ desc->minor = MINOR(rdev->dev);
5639+ desc = &rdev->sb->this_disk;
5640+ desc->major = MAJOR(rdev->dev);
5641+ desc->minor = MINOR(rdev->dev);
5642+ }
5643+ }
5644+
5645+ /*
5646+ * Remove unavailable and faulty devices ...
5647+ *
5648+ * note that if an array becomes completely unrunnable due to
5649+ * missing devices, we do not write the superblock back, so the
5650+ * administrator has a chance to fix things up. The removal thus
5651+ * only happens if it's nonfatal to the contents of the array.
5652+ */
5653+ for (i = 0; i < MD_SB_DISKS; i++) {
5654+ int found;
5655+ mdp_disk_t *desc;
5656+ kdev_t dev;
5657+
5658+ desc = sb->disks + i;
5659+ dev = MKDEV(desc->major, desc->minor);
5660+
5661+ /*
5662+ * We kick faulty devices/descriptors immediately.
5663+ */
5664+ if (disk_faulty(desc)) {
5665+ found = 0;
5666+ ITERATE_RDEV(mddev,rdev,tmp) {
5667+ if (rdev->desc_nr != desc->number)
5668+ continue;
5669+ printk("md%d: kicking faulty %s!\n",
5670+ mdidx(mddev),partition_name(rdev->dev));
5671+ kick_rdev_from_array(rdev);
5672+ found = 1;
5673+ break;
5674+ }
5675+ if (!found) {
5676+ if (dev == MKDEV(0,0))
5677+ continue;
5678+ printk("md%d: removing former faulty %s!\n",
5679+ mdidx(mddev), partition_name(dev));
5680+ }
5681+ remove_descriptor(desc, sb);
5682+ continue;
5683+ }
5684+
5685+ if (dev == MKDEV(0,0))
5686+ continue;
5687+ /*
5688+ * Is this device present in the rdev ring?
5689+ */
5690+ found = 0;
5691+ ITERATE_RDEV(mddev,rdev,tmp) {
5692+ if (rdev->desc_nr == desc->number) {
5693+ found = 1;
5694+ break;
5695+ }
5696+ }
5697+ if (found)
5698+ continue;
5699+
5700+ printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev));
5701+ remove_descriptor(desc, sb);
5702+ }
5703+
5704+ /*
5705+ * Double check whether all devices mentioned in the
5706+ * superblock are in the rdev ring.
5707+ */
5708+ for (i = 0; i < MD_SB_DISKS; i++) {
5709+ mdp_disk_t *desc;
5710+ kdev_t dev;
5711+
5712+ desc = sb->disks + i;
5713+ dev = MKDEV(desc->major, desc->minor);
5714+
5715+ if (dev == MKDEV(0,0))
5716+ continue;
5717+
5718+ if (disk_faulty(desc)) {
5719+ MD_BUG();
5720+ goto abort;
5721+ }
5722+
5723+ rdev = find_rdev(mddev, dev);
5724+ if (!rdev) {
5725+ MD_BUG();
5726+ goto abort;
5727+ }
5728+ }
5729+
5730+ /*
5731+ * Do a final reality check.
5732+ */
5733+ ITERATE_RDEV(mddev,rdev,tmp) {
5734+ if (rdev->desc_nr == -1) {
5735+ MD_BUG();
5736+ goto abort;
5737+ }
5738+ /*
5739+ * is the desc_nr unique?
5740+ */
5741+ ITERATE_RDEV(mddev,rdev2,tmp2) {
5742+ if ((rdev2 != rdev) &&
5743+ (rdev2->desc_nr == rdev->desc_nr)) {
5744+ MD_BUG();
5745+ goto abort;
5746+ }
5747+ }
5748+ /*
5749+ * is the device unique?
5750+ */
5751+ ITERATE_RDEV(mddev,rdev2,tmp2) {
5752+ if ((rdev2 != rdev) &&
5753+ (rdev2->dev == rdev->dev)) {
5754+ MD_BUG();
5755+ goto abort;
5756+ }
5757+ }
5758+ }
5759+
5760+ /*
5761+ * Check if we can support this RAID array
5762+ */
5763+ if (sb->major_version != MD_MAJOR_VERSION ||
5764+ sb->minor_version > MD_MINOR_VERSION) {
5765+
5766+ printk (OLD_VERSION, mdidx(mddev), sb->major_version,
5767+ sb->minor_version, sb->patch_version);
5768+ goto abort;
5769+ }
5770+
5771+ if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
5772+ (sb->level == 4) || (sb->level == 5)))
5773+ printk (NOT_CLEAN_IGNORE, mdidx(mddev));
5774+
5775+ return 0;
5776+abort:
5777+ return 1;
5778+}
5779+
5780+#undef INCONSISTENT
5781+#undef OUT_OF_DATE
5782+#undef OLD_VERSION
5783+#undef OLD_LEVEL
5784+
5785+static int device_size_calculation (mddev_t * mddev)
5786+{
5787+ int data_disks = 0, persistent;
5788+ unsigned int readahead;
5789+ mdp_super_t *sb = mddev->sb;
5790+ struct md_list_head *tmp;
5791+ mdk_rdev_t *rdev;
5792+
5793+ /*
5794+ * Do device size calculation. Bail out if too small.
5795+ * (we have to do this after having validated chunk_size,
5796+ * because the device size is rounded down to a multiple of chunk_size)
5797+ */
5798+ persistent = !mddev->sb->not_persistent;
5799+ ITERATE_RDEV(mddev,rdev,tmp) {
5800+ if (rdev->faulty)
5801+ continue;
5802+ if (rdev->size) {
5803+ MD_BUG();
5804+ continue;
5805+ }
5806+ rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
5807+ if (rdev->size < sb->chunk_size / 1024) {
5808+ printk (KERN_WARNING
5809+ "Dev %s smaller than chunk_size: %dk < %dk\n",
5810+ partition_name(rdev->dev),
5811+ rdev->size, sb->chunk_size / 1024);
5812+ return -EINVAL;
5813+ }
5814+ }
5815+
5816+ switch (sb->level) {
5817+ case -3:
5818+ data_disks = 1;
5819+ break;
5820+ case -2:
5821+ data_disks = 1;
5822+ break;
5823+ case -1:
5824+ zoned_raid_size(mddev);
5825+ data_disks = 1;
5826+ break;
5827+ case 0:
5828+ zoned_raid_size(mddev);
5829+ data_disks = sb->raid_disks;
5830+ break;
5831+ case 1:
5832+ data_disks = 1;
5833+ break;
5834+ case 4:
5835+ case 5:
5836+ data_disks = sb->raid_disks-1;
5837+ break;
5838+ default:
5839+ printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
5840+ goto abort;
5841+ }
5842+ if (!md_size[mdidx(mddev)])
5843+ md_size[mdidx(mddev)] = sb->size * data_disks;
5844+
5845+ readahead = MD_READAHEAD;
5846+ if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5))
5847+ readahead = mddev->sb->chunk_size * 4 * data_disks;
5848+ if (readahead < data_disks * MAX_SECTORS*512*2)
5849+ readahead = data_disks * MAX_SECTORS*512*2;
5850+ else {
5851+ if (sb->level == -3)
5852+ readahead = 0;
5853+ }
5854+ md_maxreadahead[mdidx(mddev)] = readahead;
5855+
5856+ printk(KERN_INFO "md%d: max total readahead window set to %dk\n",
5857+ mdidx(mddev), readahead/1024);
5858+
5859+ printk(KERN_INFO
5860+ "md%d: %d data-disks, max readahead per data-disk: %dk\n",
5861+ mdidx(mddev), data_disks, readahead/data_disks/1024);
5862+ return 0;
5863+abort:
5864+ return 1;
5865+}
5866+
5867+
5868+#define TOO_BIG_CHUNKSIZE KERN_ERR \
5869+"too big chunk_size: %d > %d\n"
5870+
5871+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
5872+"too small chunk_size: %d < %ld\n"
5873+
5874+#define BAD_CHUNKSIZE KERN_ERR \
5875+"no chunksize specified, see 'man raidtab'\n"
5876+
5877+static int do_md_run (mddev_t * mddev)
5878+{
5879+ int pnum, err;
5880+ int chunk_size;
5881+ struct md_list_head *tmp;
5882+ mdk_rdev_t *rdev;
5883+
5884+
5885+ if (!mddev->nb_dev) {
5886+ MD_BUG();
5887+ return -EINVAL;
5888+ }
5889+
5890+ if (mddev->pers)
5891+ return -EBUSY;
5892+
5893+ /*
5894+ * Resize disks to align partition sizes on a given
5895+ * chunk size.
5896+ */
5897+ md_size[mdidx(mddev)] = 0;
5898+
5899+ /*
5900+ * Analyze all RAID superblock(s)
5901+ */
5902+ if (analyze_sbs(mddev)) {
5903+ MD_BUG();
5904+ return -EINVAL;
5905+ }
5906+
5907+ chunk_size = mddev->sb->chunk_size;
5908+ pnum = level_to_pers(mddev->sb->level);
5909+
5910+ mddev->param.chunk_size = chunk_size;
5911+ mddev->param.personality = pnum;
5912+
5913+ if (chunk_size > MAX_CHUNK_SIZE) {
5914+ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
5915+ return -EINVAL;
5916+ }
5917+ /*
5918+ * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
5919+ */
5920+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
5921+ MD_BUG();
5922+ return -EINVAL;
5923+ }
5924+ if (chunk_size < PAGE_SIZE) {
5925+ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
5926+ return -EINVAL;
5927+ }
5928+
5929+ if (pnum >= MAX_PERSONALITY) {
5930+ MD_BUG();
5931+ return -EINVAL;
5932+ }
5933+
5934+ if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
5935+ /*
5936+ * 'default chunksize' in the old md code used to
5937+ * be PAGE_SIZE, baaad.
5938+ * we abort here to be on the safe side. We don't
5939+ * want to continue the bad practice.
5940+ */
5941+ printk(BAD_CHUNKSIZE);
5942+ return -EINVAL;
5943+ }
5944+
5945+ if (!pers[pnum])
5946+ {
5947+#ifdef CONFIG_KMOD
5948+ char module_name[80];
5949+ sprintf (module_name, "md-personality-%d", pnum);
5950+ request_module (module_name);
5951+ if (!pers[pnum])
5952+#endif
5953+ return -EINVAL;
5954+ }
5955+
5956+ if (device_size_calculation(mddev))
5957+ return -EINVAL;
5958+
5959+ /*
5960+ * Drop all container device buffers, from now on
5961+ * the only valid external interface is through the md
5962+ * device.
5963+ */
5964+ ITERATE_RDEV(mddev,rdev,tmp) {
5965+ if (rdev->faulty)
5966+ continue;
5967+ fsync_dev(rdev->dev);
5968+ invalidate_buffers(rdev->dev);
5969+ }
5970+
5971+ mddev->pers = pers[pnum];
5972+
5973+ err = mddev->pers->run(mddev);
5974+ if (err) {
5975+ printk("pers->run() failed ...\n");
5976+ mddev->pers = NULL;
5977+ return -EINVAL;
5978+ }
5979+
5980+ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
5981+ md_update_sb(mddev);
5982+
5983+ /*
5984+ * md_size has units of 1K blocks, which are
5985+ * twice as large as sectors.
5986+ */
5987+ md_hd_struct[mdidx(mddev)].start_sect = 0;
5988+ md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1;
5989+
5990+ read_ahead[MD_MAJOR] = 1024;
5991+ return (0);
5992+}
5993+
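do_md_run() validates the chunk size with (1 << ffz(~chunk_size)) != chunk_size: ffz(~x) is the index of the lowest set bit of x, and shifting 1 back up reproduces x only when x is a power of two. The sketch below is the portable equivalent using the x & (x - 1) trick, together with the minimum-size check; the MAX_CHUNK_SIZE limit is left out to keep it self-contained:

/* chunk_sketch.c -- the power-of-two / PAGE_SIZE checks from do_md_run() */
#include <stdio.h>

static int valid_chunk_size(unsigned int chunk, unsigned int page_size)
{
	if (chunk == 0)
		return 0;
	if (chunk & (chunk - 1))	/* not a power of two */
		return 0;
	if (chunk < page_size)		/* mirrors the TOO_SMALL_CHUNKSIZE check */
		return 0;
	return 1;
}

int main(void)
{
	printf("64K: %d  48K: %d  2K: %d\n",
	       valid_chunk_size(64 * 1024, 4096),
	       valid_chunk_size(48 * 1024, 4096),
	       valid_chunk_size(2 * 1024, 4096));
	return 0;
}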
5994+#undef TOO_BIG_CHUNKSIZE
5995+#undef BAD_CHUNKSIZE
5996+
5997+#define OUT(x) do { err = (x); goto out; } while (0)
5998+
5999+static int restart_array (mddev_t *mddev)
6000+{
6001+ int err = 0;
6002+
6003+ /*
6004+ * Complain if it has no devices
6005+ */
6006+ if (!mddev->nb_dev)
6007+ OUT(-ENXIO);
6008+
6009+ if (mddev->pers) {
6010+ if (!mddev->ro)
6011+ OUT(-EBUSY);
6012+
6013+ mddev->ro = 0;
6014+ set_device_ro(mddev_to_kdev(mddev), 0);
6015+
6016+ printk (KERN_INFO
6017+ "md%d switched to read-write mode.\n", mdidx(mddev));
6018+ /*
6019+ * Kick recovery or resync if necessary
6020+ */
6021+ md_recover_arrays();
6022+ if (mddev->pers->restart_resync)
6023+ mddev->pers->restart_resync(mddev);
6024+ } else
6025+ err = -EINVAL;
6026+
6027+out:
6028+ return err;
6029+}
6030+
6031+#define STILL_MOUNTED KERN_WARNING \
6032+"md: md%d still mounted.\n"
6033+
6034+static int do_md_stop (mddev_t * mddev, int ro)
6035+{
6036+ int err = 0, resync_interrupted = 0;
6037+ kdev_t dev = mddev_to_kdev(mddev);
6038+
6039+ if (!ro && !fs_may_mount (dev)) {
6040+ printk (STILL_MOUNTED, mdidx(mddev));
6041+ OUT(-EBUSY);
6042+ }
6043+
6044+ /*
6045+ * complain if it's already stopped
6046+ */
6047+ if (!mddev->nb_dev)
6048+ OUT(-ENXIO);
6049+
6050+ if (mddev->pers) {
6051+ /*
6052+ * It is safe to call stop here, it only frees private
6053+ * data. Also, it tells us if a device is unstoppable
6054+ * (eg. resyncing is in progress)
6055+ */
6056+ if (mddev->pers->stop_resync)
6057+ if (mddev->pers->stop_resync(mddev))
6058+ resync_interrupted = 1;
6059+
6060+ if (mddev->recovery_running)
6061+ md_interrupt_thread(md_recovery_thread);
6062+
6063+ /*
6064+ * This synchronizes with signal delivery to the
6065+ * resync or reconstruction thread. It also nicely
6066+ * hangs the process if some reconstruction has not
6067+ * finished.
6068+ */
6069+ down(&mddev->recovery_sem);
6070+ up(&mddev->recovery_sem);
6071+
6072+ /*
6073+ * sync and invalidate buffers because we cannot kill the
6074+ * main thread with valid IO transfers still around.
6075+ * the kernel lock protects us from new requests being
6076+ * added after invalidate_buffers().
6077+ */
6078+ fsync_dev (mddev_to_kdev(mddev));
6079+ fsync_dev (dev);
6080+ invalidate_buffers (dev);
6081+
6082+ if (ro) {
6083+ if (mddev->ro)
6084+ OUT(-ENXIO);
6085+ mddev->ro = 1;
6086+ } else {
6087+ if (mddev->ro)
6088+ set_device_ro(dev, 0);
6089+ if (mddev->pers->stop(mddev)) {
6090+ if (mddev->ro)
6091+ set_device_ro(dev, 1);
6092+ OUT(-EBUSY);
6093+ }
6094+ if (mddev->ro)
6095+ mddev->ro = 0;
6096+ }
6097+ if (mddev->sb) {
6098+ /*
6099+ * mark it clean only if there was no resync
6100+ * interrupted.
6101+ */
6102+ if (!mddev->recovery_running && !resync_interrupted) {
6103+ printk("marking sb clean...\n");
6104+ mddev->sb->state |= 1 << MD_SB_CLEAN;
6105+ }
6106+ md_update_sb(mddev);
6107+ }
6108+ if (ro)
6109+ set_device_ro(dev, 1);
6110+ }
6111+
6112+ /*
6113+ * Free resources if final stop
6114+ */
6115+ if (!ro) {
6116+ export_array(mddev);
6117+ md_size[mdidx(mddev)] = 0;
6118+ md_hd_struct[mdidx(mddev)].nr_sects = 0;
6119+ free_mddev(mddev);
6120+
6121+ printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
6122+ } else
6123+ printk (KERN_INFO
6124+ "md%d switched to read-only mode.\n", mdidx(mddev));
6125+out:
6126+ return err;
6127+}
6128+
6129+#undef OUT
6130+
6131+/*
6132+ * We have to safely support old arrays too.
6133+ */
6134+int detect_old_array (mdp_super_t *sb)
6135+{
6136+ if (sb->major_version > 0)
6137+ return 0;
6138+ if (sb->minor_version >= 90)
6139+ return 0;
6140+
6141+ return -EINVAL;
6142+}
6143+
6144+
6145+static void autorun_array (mddev_t *mddev)
6146+{
6147+ mdk_rdev_t *rdev;
6148+ struct md_list_head *tmp;
6149+ int err;
6150+
6151+ if (mddev->disks.prev == &mddev->disks) {
6152+ MD_BUG();
6153+ return;
6154+ }
6155+
6156+ printk("running: ");
6157+
6158+ ITERATE_RDEV(mddev,rdev,tmp) {
6159+ printk("<%s>", partition_name(rdev->dev));
6160+ }
6161+ printk("\nnow!\n");
6162+
6163+ err = do_md_run (mddev);
6164+ if (err) {
6165+ printk("do_md_run() returned %d\n", err);
6166+ /*
6167+ * prevent the writeback of an unrunnable array
6168+ */
6169+ mddev->sb_dirty = 0;
6170+ do_md_stop (mddev, 0);
6171+ }
6172+}
6173+
6174+/*
6175+ * let's try to run arrays based on all disks that have arrived
6176+ * until now. (those are in the ->pending list)
6177+ *
6178+ * the method: pick the first pending disk, collect all disks with
6179+ * the same UUID, remove all from the pending list and put them into
6180+ * the 'same_array' list. Then order this list based on superblock
6181+ * update time (freshest comes first), kick out 'old' disks and
6182+ * compare superblocks. If everything's fine then run it.
6183+ */
6184+static void autorun_devices (void)
6185+{
6186+ struct md_list_head candidates;
6187+ struct md_list_head *tmp;
6188+ mdk_rdev_t *rdev0, *rdev;
6189+ mddev_t *mddev;
6190+ kdev_t md_kdev;
6191+
6192+
6193+ printk("autorun ...\n");
6194+ while (pending_raid_disks.next != &pending_raid_disks) {
6195+ rdev0 = md_list_entry(pending_raid_disks.next,
6196+ mdk_rdev_t, pending);
6197+
6198+ printk("considering %s ...\n", partition_name(rdev0->dev));
6199+ MD_INIT_LIST_HEAD(&candidates);
6200+ ITERATE_RDEV_PENDING(rdev,tmp) {
6201+ if (uuid_equal(rdev0, rdev)) {
6202+ if (!sb_equal(rdev0->sb, rdev->sb)) {
6203+ printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev));
6204+ continue;
6205+ }
6206+ printk(" adding %s ...\n", partition_name(rdev->dev));
6207+ md_list_del(&rdev->pending);
6208+ md_list_add(&rdev->pending, &candidates);
6209+ }
6210+ }
6211 /*
6212- * Check the superblock for consistency.
6213- * The personality itself has to check whether it's getting
6214- * added with the proper flags. The personality has to be
6215- * checked too. ;)
6216+ * now we have a set of devices, with all of them having
6217+ * mostly sane superblocks. It's time to allocate the
6218+ * mddev.
6219 */
6220- if (analyze_one_sb (realdev))
6221+ md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
6222+ mddev = kdev_to_mddev(md_kdev);
6223+ if (mddev) {
6224+ printk("md%d already running, cannot run %s\n",
6225+ mdidx(mddev), partition_name(rdev0->dev));
6226+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
6227+ export_rdev(rdev);
6228+ continue;
6229+ }
6230+ mddev = alloc_mddev(md_kdev);
6231+ printk("created md%d\n", mdidx(mddev));
6232+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
6233+ bind_rdev_to_array(rdev, mddev);
6234+ md_list_del(&rdev->pending);
6235+ MD_INIT_LIST_HEAD(&rdev->pending);
6236+ }
6237+ autorun_array(mddev);
6238+ }
6239+ printk("... autorun DONE.\n");
6240+}
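The comment above describes the grouping step: take the first pending disk and pull every other pending disk with the same set UUID onto a candidate list. A simplified sketch of that step using plain arrays instead of the kernel list macros (hypothetical types and names, for illustration only):

#include <string.h>

#define MAX_PENDING 32

struct pending_disk {
        unsigned char uuid[16]; /* set UUID from the on-disk superblock */
        int used;               /* already moved into a candidate group? */
};

/*
 * Take the first unused pending disk and collect every other pending disk
 * with the same UUID into 'group'.  Returns the number of group members.
 */
static int collect_same_uuid(struct pending_disk *pending, int nr_pending,
                             struct pending_disk **group, int max_group)
{
        struct pending_disk *first = NULL;
        int i, n = 0;

        for (i = 0; i < nr_pending; i++) {
                if (!pending[i].used) {
                        first = &pending[i];
                        break;
                }
        }
        if (!first)
                return 0;

        for (i = 0; i < nr_pending && n < max_group; i++) {
                if (pending[i].used)
                        continue;
                if (memcmp(pending[i].uuid, first->uuid, 16) == 0) {
                        pending[i].used = 1;
                        group[n++] = &pending[i];
                }
        }
        return n;
}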
6241+
6242+/*
6243+ * import RAID devices based on one partition
6244+ * if possible, the array gets run as well.
6245+ */
6246+
6247+#define BAD_VERSION KERN_ERR \
6248+"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
6249+
6250+#define OUT_OF_MEM KERN_ALERT \
6251+"md: out of memory.\n"
6252+
6253+#define NO_DEVICE KERN_ERR \
6254+"md: disabled device %s\n"
6255+
6256+#define AUTOADD_FAILED KERN_ERR \
6257+"md: auto-adding devices to md%d FAILED (error %d).\n"
6258+
6259+#define AUTOADD_FAILED_USED KERN_ERR \
6260+"md: cannot auto-add device %s to md%d, already used.\n"
6261+
6262+#define AUTORUN_FAILED KERN_ERR \
6263+"md: auto-running md%d FAILED (error %d).\n"
6264+
6265+#define MDDEV_BUSY KERN_ERR \
6266+"md: cannot auto-add to md%d, already running.\n"
6267+
6268+#define AUTOADDING KERN_INFO \
6269+"md: auto-adding devices to md%d, based on %s's superblock.\n"
6270+
6271+#define AUTORUNNING KERN_INFO \
6272+"md: auto-running md%d.\n"
6273+
6274+static int autostart_array (kdev_t startdev)
6275+{
6276+ int err = -EINVAL, i;
6277+ mdp_super_t *sb = NULL;
6278+ mdk_rdev_t *start_rdev = NULL, *rdev;
6279+
6280+ if (md_import_device(startdev, 1)) {
6281+ printk("could not import %s!\n", partition_name(startdev));
6282+ goto abort;
6283+ }
6284+
6285+ start_rdev = find_rdev_all(startdev);
6286+ if (!start_rdev) {
6287+ MD_BUG();
6288+ goto abort;
6289+ }
6290+ if (start_rdev->faulty) {
6291+ printk("can not autostart based on faulty %s!\n",
6292+ partition_name(startdev));
6293+ goto abort;
6294+ }
6295+ md_list_add(&start_rdev->pending, &pending_raid_disks);
6296+
6297+ sb = start_rdev->sb;
6298+
6299+ err = detect_old_array(sb);
6300+ if (err) {
6301+ printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
6302+ goto abort;
6303+ }
6304+
6305+ for (i = 0; i < MD_SB_DISKS; i++) {
6306+ mdp_disk_t *desc;
6307+ kdev_t dev;
6308+
6309+ desc = sb->disks + i;
6310+ dev = MKDEV(desc->major, desc->minor);
6311+
6312+ if (dev == MKDEV(0,0))
6313+ continue;
6314+ if (dev == startdev)
6315+ continue;
6316+ if (md_import_device(dev, 1)) {
6317+ printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev));
6318+ continue;
6319+ }
6320+ rdev = find_rdev_all(dev);
6321+ if (!rdev) {
6322+ MD_BUG();
6323+ goto abort;
6324+ }
6325+ md_list_add(&rdev->pending, &pending_raid_disks);
6326+ }
6327+
6328+ /*
6329+ * possibly return codes
6330+ */
6331+ autorun_devices();
6332+ return 0;
6333+
6334+abort:
6335+ if (start_rdev)
6336+ export_rdev(start_rdev);
6337+ return err;
6338+}
6339+
6340+#undef BAD_VERSION
6341+#undef OUT_OF_MEM
6342+#undef NO_DEVICE
6343+#undef AUTOADD_FAILED_USED
6344+#undef AUTOADD_FAILED
6345+#undef AUTORUN_FAILED
6346+#undef AUTOADDING
6347+#undef AUTORUNNING
6348+
6349+struct {
6350+ int set;
6351+ int noautodetect;
6352+
6353+} raid_setup_args md__initdata = { 0, 0 };
6354+
6355+/*
6356+ * Searches all registered partitions for autorun RAID arrays
6357+ * at boot time.
6358+ */
6359+md__initfunc(void autodetect_raid(void))
6360+{
6361+#ifdef CONFIG_AUTODETECT_RAID
6362+ struct gendisk *disk;
6363+ mdk_rdev_t *rdev;
6364+ int i;
6365+
6366+ if (raid_setup_args.noautodetect) {
6367+ printk(KERN_INFO "skipping autodetection of RAID arrays\n");
6368+ return;
6369+ }
6370+ printk(KERN_INFO "autodetecting RAID arrays\n");
6371+
6372+ for (disk = gendisk_head ; disk ; disk = disk->next) {
6373+ for (i = 0; i < disk->max_p*disk->max_nr; i++) {
6374+ kdev_t dev = MKDEV(disk->major,i);
6375+
6376+ if (disk->part[i].type == LINUX_OLD_RAID_PARTITION) {
6377+ printk(KERN_ALERT
6378+"md: %s's partition type has to be changed from type 0x86 to type 0xfd\n"
6379+" to maintain interoperability with other OSs! Autodetection support for\n"
6380+" type 0x86 will be deleted after some migration timeout. Sorry.\n",
6381+ partition_name(dev));
6382+ disk->part[i].type = LINUX_RAID_PARTITION;
6383+ }
6384+ if (disk->part[i].type != LINUX_RAID_PARTITION)
6385+ continue;
6386+
6387+ if (md_import_device(dev,1)) {
6388+ printk(KERN_ALERT "could not import %s!\n",
6389+ partition_name(dev));
6390+ continue;
6391+ }
6392+ /*
6393+ * Sanity checks:
6394+ */
6395+ rdev = find_rdev_all(dev);
6396+ if (!rdev) {
6397+ MD_BUG();
6398+ continue;
6399+ }
6400+ if (rdev->faulty) {
6401+ MD_BUG();
6402+ continue;
6403+ }
6404+ md_list_add(&rdev->pending, &pending_raid_disks);
6405+ }
6406+ }
6407+
6408+ autorun_devices();
6409+#endif
6410+}
6411+
6412+static int get_version (void * arg)
6413+{
6414+ mdu_version_t ver;
6415+
6416+ ver.major = MD_MAJOR_VERSION;
6417+ ver.minor = MD_MINOR_VERSION;
6418+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
6419+
6420+ if (md_copy_to_user(arg, &ver, sizeof(ver)))
6421+ return -EFAULT;
6422+
6423+ return 0;
6424+}
6425+
6426+#define SET_FROM_SB(x) info.x = mddev->sb->x
6427+static int get_array_info (mddev_t * mddev, void * arg)
6428+{
6429+ mdu_array_info_t info;
6430+
6431+ if (!mddev->sb)
6432+ return -EINVAL;
6433+
6434+ SET_FROM_SB(major_version);
6435+ SET_FROM_SB(minor_version);
6436+ SET_FROM_SB(patch_version);
6437+ SET_FROM_SB(ctime);
6438+ SET_FROM_SB(level);
6439+ SET_FROM_SB(size);
6440+ SET_FROM_SB(nr_disks);
6441+ SET_FROM_SB(raid_disks);
6442+ SET_FROM_SB(md_minor);
6443+ SET_FROM_SB(not_persistent);
6444+
6445+ SET_FROM_SB(utime);
6446+ SET_FROM_SB(state);
6447+ SET_FROM_SB(active_disks);
6448+ SET_FROM_SB(working_disks);
6449+ SET_FROM_SB(failed_disks);
6450+ SET_FROM_SB(spare_disks);
6451+
6452+ SET_FROM_SB(layout);
6453+ SET_FROM_SB(chunk_size);
6454+
6455+ if (md_copy_to_user(arg, &info, sizeof(info)))
6456+ return -EFAULT;
6457+
6458+ return 0;
6459+}
6460+#undef SET_FROM_SB
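The SET_FROM_SB()/SET_SB() macros here are defined and #undef'ed locally around each helper so the field list only has to be written once per copy direction. A tiny stand-alone sketch of the idiom (hypothetical struct names):

typedef struct { int level; int chunk_size; } hypo_sb_t;
typedef struct { int level; int chunk_size; } hypo_info_t;

#define SET_FROM_SB(x) (info.x = sb->x) /* copy one identically named field */

static hypo_info_t fill_info(const hypo_sb_t *sb)
{
        hypo_info_t info;

        SET_FROM_SB(level);
        SET_FROM_SB(chunk_size);
        return info;
}
#undef SET_FROM_SB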
6461+
6462+#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
6463+static int get_disk_info (mddev_t * mddev, void * arg)
6464+{
6465+ mdu_disk_info_t info;
6466+ unsigned int nr;
6467+
6468+ if (!mddev->sb)
6469+ return -EINVAL;
6470+
6471+ if (md_copy_from_user(&info, arg, sizeof(info)))
6472+ return -EFAULT;
6473+
6474+ nr = info.number;
6475+ if (nr >= mddev->sb->nr_disks)
6476+ return -EINVAL;
6477+
6478+ SET_FROM_SB(major);
6479+ SET_FROM_SB(minor);
6480+ SET_FROM_SB(raid_disk);
6481+ SET_FROM_SB(state);
6482+
6483+ if (md_copy_to_user(arg, &info, sizeof(info)))
6484+ return -EFAULT;
6485+
6486+ return 0;
6487+}
6488+#undef SET_FROM_SB
6489+
6490+#define SET_SB(x) mddev->sb->disks[nr].x = info.x
6491+
6492+static int add_new_disk (mddev_t * mddev, void * arg)
6493+{
6494+ int err, size, persistent;
6495+ mdu_disk_info_t info;
6496+ mdk_rdev_t *rdev;
6497+ unsigned int nr;
6498+ kdev_t dev;
6499+
6500+ if (!mddev->sb)
6501+ return -EINVAL;
6502+
6503+ if (md_copy_from_user(&info, arg, sizeof(info)))
6504+ return -EFAULT;
6505+
6506+ nr = info.number;
6507+ if (nr >= mddev->sb->nr_disks)
6508+ return -EINVAL;
6509+
6510+ dev = MKDEV(info.major,info.minor);
6511+
6512+ if (find_rdev_all(dev)) {
6513+ printk("device %s already used in a RAID array!\n",
6514+ partition_name(dev));
6515+ return -EBUSY;
6516+ }
6517+
6518+ SET_SB(number);
6519+ SET_SB(major);
6520+ SET_SB(minor);
6521+ SET_SB(raid_disk);
6522+ SET_SB(state);
6523+
6524+ if ((info.state & (1<<MD_DISK_FAULTY))==0) {
6525+ err = md_import_device (dev, 0);
6526+ if (err) {
6527+ printk("md: error, md_import_device() returned %d\n", err);
6528+ return -EINVAL;
6529+ }
6530+ rdev = find_rdev_all(dev);
6531+ if (!rdev) {
6532+ MD_BUG();
6533 return -EINVAL;
6534+ }
6535+
6536+ rdev->old_dev = dev;
6537+ rdev->desc_nr = info.number;
6538+
6539+ bind_rdev_to_array(rdev, mddev);
6540+
6541+ persistent = !mddev->sb->not_persistent;
6542+ if (!persistent)
6543+ printk("nonpersistent superblock ...\n");
6544+ if (!mddev->sb->chunk_size)
6545+ printk("no chunksize?\n");
6546+
6547+ size = calc_dev_size(dev, mddev, persistent);
6548+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
6549+
6550+ if (!mddev->sb->size || (mddev->sb->size > size))
6551+ mddev->sb->size = size;
6552+ }
6553+
6554+ /*
6555+ * sync all other superblocks with the main superblock
6556+ */
6557+ sync_sbs(mddev);
6558+
6559+ return 0;
6560+}
6561+#undef SET_SB
6562+
6563+static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
6564+{
6565+ int err;
6566+ mdk_rdev_t *rdev;
6567+ mdp_disk_t *disk;
6568+
6569+ if (!mddev->pers)
6570+ return -ENODEV;
6571+
6572+ printk("trying to remove %s from md%d ... \n",
6573+ partition_name(dev), mdidx(mddev));
6574+
6575+ if (!mddev->pers->diskop) {
6576+ printk("md%d: personality does not support diskops!\n",
6577+ mdidx(mddev));
6578+ return -EINVAL;
6579+ }
6580+
6581+ rdev = find_rdev(mddev, dev);
6582+ if (!rdev)
6583+ return -ENXIO;
6584+
6585+ if (rdev->desc_nr == -1) {
6586+ MD_BUG();
6587+ return -EINVAL;
6588+ }
6589+ disk = &mddev->sb->disks[rdev->desc_nr];
6590+ if (disk_active(disk))
6591+ goto busy;
6592+ if (disk_removed(disk)) {
6593+ MD_BUG();
6594+ return -EINVAL;
6595+ }
6596+
6597+ err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
6598+ if (err == -EBUSY)
6599+ goto busy;
6600+ if (err) {
6601+ MD_BUG();
6602+ return -EINVAL;
6603+ }
6604+
6605+ remove_descriptor(disk, mddev->sb);
6606+ kick_rdev_from_array(rdev);
6607+ mddev->sb_dirty = 1;
6608+ md_update_sb(mddev);
6609+
6610+ return 0;
6611+busy:
6612+ printk("cannot remove active disk %s from md%d ... \n",
6613+ partition_name(dev), mdidx(mddev));
6614+ return -EBUSY;
6615+}
6616+
6617+static int hot_add_disk (mddev_t * mddev, kdev_t dev)
6618+{
6619+ int i, err, persistent;
6620+ unsigned int size;
6621+ mdk_rdev_t *rdev;
6622+ mdp_disk_t *disk;
6623+
6624+ if (!mddev->pers)
6625+ return -ENODEV;
6626+
6627+ printk("trying to hot-add %s to md%d ... \n",
6628+ partition_name(dev), mdidx(mddev));
6629+
6630+ if (!mddev->pers->diskop) {
6631+ printk("md%d: personality does not support diskops!\n",
6632+ mdidx(mddev));
6633+ return -EINVAL;
6634+ }
6635+
6636+ persistent = !mddev->sb->not_persistent;
6637+ size = calc_dev_size(dev, mddev, persistent);
6638+
6639+ if (size < mddev->sb->size) {
6640+ printk("md%d: disk size %d blocks < array size %d\n",
6641+ mdidx(mddev), size, mddev->sb->size);
6642+ return -ENOSPC;
6643+ }
6644+
6645+ rdev = find_rdev(mddev, dev);
6646+ if (rdev)
6647+ return -EBUSY;
6648+
6649+ err = md_import_device (dev, 0);
6650+ if (err) {
6651+ printk("md: error, md_import_device() returned %d\n", err);
6652+ return -EINVAL;
6653+ }
6654+ rdev = find_rdev_all(dev);
6655+ if (!rdev) {
6656+ MD_BUG();
6657+ return -EINVAL;
6658+ }
6659+ if (rdev->faulty) {
6660+ printk("md: can not hot-add faulty %s disk to md%d!\n",
6661+ partition_name(dev), mdidx(mddev));
6662+ err = -EINVAL;
6663+ goto abort_export;
6664+ }
6665+ bind_rdev_to_array(rdev, mddev);
6666+
6667+ /*
6668+ * The rest should better be atomic, we can have disk failures
6669+ * noticed in interrupt contexts ...
6670+ */
6671+ cli();
6672+ rdev->old_dev = dev;
6673+ rdev->size = size;
6674+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
6675+
6676+ disk = mddev->sb->disks + mddev->sb->raid_disks;
6677+ for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
6678+ disk = mddev->sb->disks + i;
6679+
6680+ if (!disk->major && !disk->minor)
6681+ break;
6682+ if (disk_removed(disk))
6683+ break;
6684+ }
6685+ if (i == MD_SB_DISKS) {
6686+ sti();
6687+ printk("md%d: can not hot-add to full array!\n", mdidx(mddev));
6688+ err = -EBUSY;
6689+ goto abort_unbind_export;
6690+ }
6691+
6692+ if (disk_removed(disk)) {
6693 /*
6694- * hot_add has to bump up nb_dev itself
6695+ * reuse slot
6696 */
6697- if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) {
6698- /*
6699- * FIXME: here we should free up the inode and stuff
6700- */
6701- printk ("FIXME\n");
6702- return -EINVAL;
6703+ if (disk->number != i) {
6704+ sti();
6705+ MD_BUG();
6706+ err = -EINVAL;
6707+ goto abort_unbind_export;
6708 }
6709- } else
6710- md_dev[minor].nb_dev++;
6711+ } else {
6712+ disk->number = i;
6713+ }
6714
6715- printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
6716- return (0);
6717+ disk->raid_disk = disk->number;
6718+ disk->major = MAJOR(dev);
6719+ disk->minor = MINOR(dev);
6720+
6721+ if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
6722+ sti();
6723+ MD_BUG();
6724+ err = -EINVAL;
6725+ goto abort_unbind_export;
6726+ }
6727+
6728+ mark_disk_spare(disk);
6729+ mddev->sb->nr_disks++;
6730+ mddev->sb->spare_disks++;
6731+ mddev->sb->working_disks++;
6732+
6733+ mddev->sb_dirty = 1;
6734+
6735+ sti();
6736+ md_update_sb(mddev);
6737+
6738+ /*
6739+ * Kick recovery, maybe this spare has to be added to the
6740+ * array immediately.
6741+ */
6742+ md_recover_arrays();
6743+
6744+ return 0;
6745+
6746+abort_unbind_export:
6747+ unbind_rdev_from_array(rdev);
6748+
6749+abort_export:
6750+ export_rdev(rdev);
6751+ return err;
6752+}
6753+
6754+#define SET_SB(x) mddev->sb->x = info.x
6755+static int set_array_info (mddev_t * mddev, void * arg)
6756+{
6757+ mdu_array_info_t info;
6758+
6759+ if (mddev->sb) {
6760+ printk("array md%d already has a superblock!\n",
6761+ mdidx(mddev));
6762+ return -EBUSY;
6763+ }
6764+
6765+ if (md_copy_from_user(&info, arg, sizeof(info)))
6766+ return -EFAULT;
6767+
6768+ if (alloc_array_sb(mddev))
6769+ return -ENOMEM;
6770+
6771+ mddev->sb->major_version = MD_MAJOR_VERSION;
6772+ mddev->sb->minor_version = MD_MINOR_VERSION;
6773+ mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
6774+ mddev->sb->ctime = CURRENT_TIME;
6775+
6776+ SET_SB(level);
6777+ SET_SB(size);
6778+ SET_SB(nr_disks);
6779+ SET_SB(raid_disks);
6780+ SET_SB(md_minor);
6781+ SET_SB(not_persistent);
6782+
6783+ SET_SB(state);
6784+ SET_SB(active_disks);
6785+ SET_SB(working_disks);
6786+ SET_SB(failed_disks);
6787+ SET_SB(spare_disks);
6788+
6789+ SET_SB(layout);
6790+ SET_SB(chunk_size);
6791+
6792+ mddev->sb->md_magic = MD_SB_MAGIC;
6793+
6794+ /*
6795+ * Generate a 128 bit UUID
6796+ */
6797+ get_random_bytes(&mddev->sb->set_uuid0, 4);
6798+ get_random_bytes(&mddev->sb->set_uuid1, 4);
6799+ get_random_bytes(&mddev->sb->set_uuid2, 4);
6800+ get_random_bytes(&mddev->sb->set_uuid3, 4);
6801+
6802+ return 0;
6803+}
6804+#undef SET_SB
6805+
6806+static int set_disk_info (mddev_t * mddev, void * arg)
6807+{
6808+ printk("not yet");
6809+ return -EINVAL;
6810+}
6811+
6812+static int clear_array (mddev_t * mddev)
6813+{
6814+ printk("not yet");
6815+ return -EINVAL;
6816+}
6817+
6818+static int write_raid_info (mddev_t * mddev)
6819+{
6820+ printk("not yet");
6821+ return -EINVAL;
6822+}
6823+
6824+static int protect_array (mddev_t * mddev)
6825+{
6826+ printk("not yet");
6827+ return -EINVAL;
6828+}
6829+
6830+static int unprotect_array (mddev_t * mddev)
6831+{
6832+ printk("not yet");
6833+ return -EINVAL;
6834+}
6835+
6836+static int set_disk_faulty (mddev_t *mddev, kdev_t dev)
6837+{
6838+ int ret;
6839+
6840+ fsync_dev(mddev_to_kdev(mddev));
6841+ ret = md_error(mddev_to_kdev(mddev), dev);
6842+ return ret;
6843 }
6844
6845 static int md_ioctl (struct inode *inode, struct file *file,
6846 unsigned int cmd, unsigned long arg)
6847 {
6848- int minor, err;
6849- struct hd_geometry *loc = (struct hd_geometry *) arg;
6850+ unsigned int minor;
6851+ int err = 0;
6852+ struct hd_geometry *loc = (struct hd_geometry *) arg;
6853+ mddev_t *mddev = NULL;
6854+ kdev_t dev;
6855
6856- if (!capable(CAP_SYS_ADMIN))
6857- return -EACCES;
6858+ if (!md_capable_admin())
6859+ return -EACCES;
6860
6861- if (((minor=MINOR(inode->i_rdev)) & 0x80) &&
6862- (minor & 0x7f) < MAX_PERSONALITY &&
6863- pers[minor & 0x7f] &&
6864- pers[minor & 0x7f]->ioctl)
6865- return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg));
6866-
6867- if (minor >= MAX_MD_DEV)
6868- return -EINVAL;
6869+ dev = inode->i_rdev;
6870+ minor = MINOR(dev);
6871+ if (minor >= MAX_MD_DEVS)
6872+ return -EINVAL;
6873
6874- switch (cmd)
6875- {
6876- case REGISTER_DEV:
6877- return do_md_add (minor, to_kdev_t ((dev_t) arg));
6878+ /*
6879+ * Commands dealing with the RAID driver but not any
6880+ * particular array:
6881+ */
6882+ switch (cmd)
6883+ {
6884+ case RAID_VERSION:
6885+ err = get_version((void *)arg);
6886+ goto done;
6887+
6888+ case PRINT_RAID_DEBUG:
6889+ err = 0;
6890+ md_print_devices();
6891+ goto done_unlock;
6892+
6893+ case BLKGETSIZE: /* Return device size */
6894+ if (!arg) {
6895+ err = -EINVAL;
6896+ goto abort;
6897+ }
6898+ err = md_put_user(md_hd_struct[minor].nr_sects,
6899+ (long *) arg);
6900+ goto done;
6901
6902- case START_MD:
6903- return do_md_run (minor, (int) arg);
6904+ case BLKFLSBUF:
6905+ fsync_dev(dev);
6906+ invalidate_buffers(dev);
6907+ goto done;
6908
6909- case STOP_MD:
6910- return do_md_stop (minor, inode);
6911-
6912- case BLKGETSIZE: /* Return device size */
6913- if (!arg) return -EINVAL;
6914- err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg);
6915- if (err)
6916- return err;
6917- break;
6918-
6919- case BLKFLSBUF:
6920- fsync_dev (inode->i_rdev);
6921- invalidate_buffers (inode->i_rdev);
6922- break;
6923-
6924- case BLKRASET:
6925- if (arg > 0xff)
6926- return -EINVAL;
6927- read_ahead[MAJOR(inode->i_rdev)] = arg;
6928- return 0;
6929-
6930- case BLKRAGET:
6931- if (!arg) return -EINVAL;
6932- err = put_user (read_ahead[MAJOR(inode->i_rdev)], (long *) arg);
6933- if (err)
6934- return err;
6935- break;
6936-
6937- /* We have a problem here : there is no easy way to give a CHS
6938- virtual geometry. We currently pretend that we have a 2 heads
6939- 4 sectors (with a BIG number of cylinders...). This drives dosfs
6940- just mad... ;-) */
6941-
6942- case HDIO_GETGEO:
6943- if (!loc) return -EINVAL;
6944- err = put_user (2, (char *) &loc->heads);
6945- if (err)
6946- return err;
6947- err = put_user (4, (char *) &loc->sectors);
6948- if (err)
6949- return err;
6950- err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders);
6951- if (err)
6952- return err;
6953- err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect,
6954- (long *) &loc->start);
6955- if (err)
6956- return err;
6957- break;
6958-
6959- RO_IOCTLS(inode->i_rdev,arg);
6960+ case BLKRASET:
6961+ if (arg > 0xff) {
6962+ err = -EINVAL;
6963+ goto abort;
6964+ }
6965+ read_ahead[MAJOR(dev)] = arg;
6966+ goto done;
6967
6968- default:
6969- return -EINVAL;
6970- }
6971+ case BLKRAGET:
6972+ if (!arg) {
6973+ err = -EINVAL;
6974+ goto abort;
6975+ }
6976+ err = md_put_user (read_ahead[
6977+ MAJOR(dev)], (long *) arg);
6978+ goto done;
6979+ default:
6980+ }
6981+
6982+ /*
6983+ * Commands creating/starting a new array:
6984+ */
6985+
6986+ mddev = kdev_to_mddev(dev);
6987+
6988+ switch (cmd)
6989+ {
6990+ case SET_ARRAY_INFO:
6991+ case START_ARRAY:
6992+ if (mddev) {
6993+ printk("array md%d already exists!\n",
6994+ mdidx(mddev));
6995+ err = -EEXIST;
6996+ goto abort;
6997+ }
6998+ default:
6999+ }
7000+
7001+ switch (cmd)
7002+ {
7003+ case SET_ARRAY_INFO:
7004+ mddev = alloc_mddev(dev);
7005+ if (!mddev) {
7006+ err = -ENOMEM;
7007+ goto abort;
7008+ }
7009+ /*
7010+ * alloc_mddev() should possibly self-lock.
7011+ */
7012+ err = lock_mddev(mddev);
7013+ if (err) {
7014+ printk("ioctl, reason %d, cmd %d\n", err, cmd);
7015+ goto abort;
7016+ }
7017+ err = set_array_info(mddev, (void *)arg);
7018+ if (err) {
7019+ printk("couldn't set array info. %d\n", err);
7020+ goto abort;
7021+ }
7022+ goto done_unlock;
7023+
7024+ case START_ARRAY:
7025+ /*
7026+ * possibly make it lock the array ...
7027+ */
7028+ err = autostart_array((kdev_t)arg);
7029+ if (err) {
7030+ printk("autostart %s failed!\n",
7031+ partition_name((kdev_t)arg));
7032+ goto abort;
7033+ }
7034+ goto done;
7035+
7036+ default:
7037+ }
7038+
7039+ /*
7040+ * Commands querying/configuring an existing array:
7041+ */
7042+
7043+ if (!mddev) {
7044+ err = -ENODEV;
7045+ goto abort;
7046+ }
7047+ err = lock_mddev(mddev);
7048+ if (err) {
7049+ printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
7050+ goto abort;
7051+ }
7052+
7053+ /*
7054+ * Commands even a read-only array can execute:
7055+ */
7056+ switch (cmd)
7057+ {
7058+ case GET_ARRAY_INFO:
7059+ err = get_array_info(mddev, (void *)arg);
7060+ goto done_unlock;
7061+
7062+ case GET_DISK_INFO:
7063+ err = get_disk_info(mddev, (void *)arg);
7064+ goto done_unlock;
7065+
7066+ case RESTART_ARRAY_RW:
7067+ err = restart_array(mddev);
7068+ goto done_unlock;
7069+
7070+ case STOP_ARRAY:
7071+ err = do_md_stop (mddev, 0);
7072+ goto done_unlock;
7073+
7074+ case STOP_ARRAY_RO:
7075+ err = do_md_stop (mddev, 1);
7076+ goto done_unlock;
7077+
7078+ /*
7079+ * We have a problem here : there is no easy way to give a CHS
7080+ * virtual geometry. We currently pretend that we have a 2 heads
7081+ * 4 sectors (with a BIG number of cylinders...). This drives
7082+ * dosfs just mad... ;-)
7083+ */
7084+ case HDIO_GETGEO:
7085+ if (!loc) {
7086+ err = -EINVAL;
7087+ goto abort_unlock;
7088+ }
7089+ err = md_put_user (2, (char *) &loc->heads);
7090+ if (err)
7091+ goto abort_unlock;
7092+ err = md_put_user (4, (char *) &loc->sectors);
7093+ if (err)
7094+ goto abort_unlock;
7095+ err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
7096+ (short *) &loc->cylinders);
7097+ if (err)
7098+ goto abort_unlock;
7099+ err = md_put_user (md_hd_struct[minor].start_sect,
7100+ (long *) &loc->start);
7101+ goto done_unlock;
7102+ }
7103
7104- return (0);
7105+ /*
7106+ * The remaining ioctls are changing the state of the
7107+ * superblock, so we do not allow read-only arrays
7108+ * here:
7109+ */
7110+ if (mddev->ro) {
7111+ err = -EROFS;
7112+ goto abort_unlock;
7113+ }
7114+
7115+ switch (cmd)
7116+ {
7117+ case CLEAR_ARRAY:
7118+ err = clear_array(mddev);
7119+ goto done_unlock;
7120+
7121+ case ADD_NEW_DISK:
7122+ err = add_new_disk(mddev, (void *)arg);
7123+ goto done_unlock;
7124+
7125+ case HOT_REMOVE_DISK:
7126+ err = hot_remove_disk(mddev, (kdev_t)arg);
7127+ goto done_unlock;
7128+
7129+ case HOT_ADD_DISK:
7130+ err = hot_add_disk(mddev, (kdev_t)arg);
7131+ goto done_unlock;
7132+
7133+ case SET_DISK_INFO:
7134+ err = set_disk_info(mddev, (void *)arg);
7135+ goto done_unlock;
7136+
7137+ case WRITE_RAID_INFO:
7138+ err = write_raid_info(mddev);
7139+ goto done_unlock;
7140+
7141+ case UNPROTECT_ARRAY:
7142+ err = unprotect_array(mddev);
7143+ goto done_unlock;
7144+
7145+ case PROTECT_ARRAY:
7146+ err = protect_array(mddev);
7147+ goto done_unlock;
7148+
7149+ case SET_DISK_FAULTY:
7150+ err = set_disk_faulty(mddev, (kdev_t)arg);
7151+ goto done_unlock;
7152+
7153+ case RUN_ARRAY:
7154+ {
7155+ mdu_param_t param;
7156+
7157+ err = md_copy_from_user(&param, (mdu_param_t *)arg,
7158+ sizeof(param));
7159+ if (err)
7160+ goto abort_unlock;
7161+
7162+ err = do_md_run (mddev);
7163+ /*
7164+ * we have to clean up the mess if
7165+ * the array cannot be run for some
7166+ * reason ...
7167+ */
7168+ if (err) {
7169+ mddev->sb_dirty = 0;
7170+ do_md_stop (mddev, 0);
7171+ }
7172+ goto done_unlock;
7173+ }
7174+
7175+ default:
7176+ printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ioctls.\n", current->comm, current->pid);
7177+ err = -EINVAL;
7178+ goto abort_unlock;
7179+ }
7180+
7181+done_unlock:
7182+abort_unlock:
7183+ if (mddev)
7184+ unlock_mddev(mddev);
7185+ else
7186+ printk("huh11?\n");
7187+
7188+ return err;
7189+done:
7190+ if (err)
7191+ printk("huh12?\n");
7192+abort:
7193+ return err;
7194 }
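From user space the restructured md_ioctl() above is driven through the md block device node. A minimal sketch of querying a running array, assuming the GET_ARRAY_INFO ioctl and mdu_array_info_t are exported by the patched <linux/raid/md_u.h> (the header path may differ on a given installation):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/raid/md_u.h>    /* assumed location of GET_ARRAY_INFO, mdu_array_info_t */

int main(void)
{
        mdu_array_info_t info;
        int fd = open("/dev/md0", O_RDONLY);

        if (fd < 0) {
                perror("open /dev/md0");
                return 1;
        }
        /* handled by get_array_info() in the driver code above */
        if (ioctl(fd, GET_ARRAY_INFO, &info) < 0) {
                perror("GET_ARRAY_INFO");
                close(fd);
                return 1;
        }
        printf("raid level %d, %d raid disks, %d active\n",
               info.level, info.raid_disks, info.active_disks);
        close(fd);
        return 0;
}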
7195
7196+
7197+#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
7198+
7199 static int md_open (struct inode *inode, struct file *file)
7200 {
7201- int minor=MINOR(inode->i_rdev);
7202+ /*
7203+ * Always succeed
7204+ */
7205+ return (0);
7206+}
7207+
7208+static void md_release (struct inode *inode, struct file *file)
7209+{
7210+ sync_dev(inode->i_rdev);
7211+}
7212+
7213+
7214+static int md_read (struct inode *inode, struct file *file,
7215+ char *buf, int count)
7216+{
7217+ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7218
7219- md_dev[minor].busy++;
7220- return (0); /* Always succeed */
7221+ if (!mddev || !mddev->pers)
7222+ return -ENXIO;
7223+
7224+ return block_read (inode, file, buf, count);
7225 }
7226
7227+static int md_write (struct inode *inode, struct file *file,
7228+ const char *buf, int count)
7229+{
7230+ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7231+
7232+ if (!mddev || !mddev->pers)
7233+ return -ENXIO;
7234
7235-static int md_release (struct inode *inode, struct file *file)
7236+ return block_write (inode, file, buf, count);
7237+}
7238+
7239+static struct file_operations md_fops=
7240 {
7241- int minor=MINOR(inode->i_rdev);
7242+ NULL,
7243+ md_read,
7244+ md_write,
7245+ NULL,
7246+ NULL,
7247+ md_ioctl,
7248+ NULL,
7249+ md_open,
7250+ md_release,
7251+ block_fsync
7252+};
7253+
7254+#else
7255
7256- sync_dev (inode->i_rdev);
7257- md_dev[minor].busy--;
7258- return 0;
7259+static int md_open (struct inode *inode, struct file *file)
7260+{
7261+ /*
7262+ * Always succeed
7263+ */
7264+ return (0);
7265 }
7266
7267+static int md_release (struct inode *inode, struct file *file)
7268+{
7269+ sync_dev(inode->i_rdev);
7270+ return 0;
7271+}
7272
7273 static ssize_t md_read (struct file *file, char *buf, size_t count,
7274 loff_t *ppos)
7275 {
7276- int minor=MINOR(file->f_dentry->d_inode->i_rdev);
7277+ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7278
7279- if (!md_dev[minor].pers) /* Check if device is being run */
7280- return -ENXIO;
7281+ if (!mddev || !mddev->pers)
7282+ return -ENXIO;
7283
7284- return block_read(file, buf, count, ppos);
7285+ return block_read(file, buf, count, ppos);
7286 }
7287
7288 static ssize_t md_write (struct file *file, const char *buf,
7289 size_t count, loff_t *ppos)
7290 {
7291- int minor=MINOR(file->f_dentry->d_inode->i_rdev);
7292+ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7293
7294- if (!md_dev[minor].pers) /* Check if device is being run */
7295- return -ENXIO;
7296+ if (!mddev || !mddev->pers)
7297+ return -ENXIO;
7298
7299- return block_write(file, buf, count, ppos);
7300+ return block_write(file, buf, count, ppos);
7301 }
7302
7303 static struct file_operations md_fops=
7304 {
7305- NULL,
7306- md_read,
7307- md_write,
7308- NULL,
7309- NULL,
7310- md_ioctl,
7311- NULL,
7312- md_open,
7313- NULL,
7314- md_release,
7315- block_fsync
7316+ NULL,
7317+ md_read,
7318+ md_write,
7319+ NULL,
7320+ NULL,
7321+ md_ioctl,
7322+ NULL,
7323+ md_open,
7324+ NULL,
7325+ md_release,
7326+ block_fsync
7327 };
7328
7329-int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size)
7330+#endif
7331+
7332+int md_map (kdev_t dev, kdev_t *rdev,
7333+ unsigned long *rsector, unsigned long size)
7334 {
7335- if ((unsigned int) minor >= MAX_MD_DEV)
7336- {
7337- printk ("Bad md device %d\n", minor);
7338- return (-1);
7339- }
7340-
7341- if (!md_dev[minor].pers)
7342- {
7343- printk ("Oops ! md%d not running, giving up !\n", minor);
7344- return (-1);
7345- }
7346+ int err;
7347+ mddev_t *mddev = kdev_to_mddev(dev);
7348
7349- return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size));
7350+ if (!mddev || !mddev->pers) {
7351+ err = -ENXIO;
7352+ goto out;
7353+ }
7354+
7355+ err = mddev->pers->map(mddev, dev, rdev, rsector, size);
7356+out:
7357+ return err;
7358 }
7359
7360-int md_make_request (int minor, int rw, struct buffer_head * bh)
7361+int md_make_request (struct buffer_head * bh, int rw)
7362 {
7363- if (md_dev [minor].pers->make_request) {
7364- if (buffer_locked(bh))
7365- return 0;
7366+ int err;
7367+ mddev_t *mddev = kdev_to_mddev(bh->b_dev);
7368+
7369+ if (!mddev || !mddev->pers) {
7370+ err = -ENXIO;
7371+ goto out;
7372+ }
7373+
7374+ if (mddev->pers->make_request) {
7375+ if (buffer_locked(bh)) {
7376+ err = 0;
7377+ goto out;
7378+ }
7379 set_bit(BH_Lock, &bh->b_state);
7380 if (rw == WRITE || rw == WRITEA) {
7381 if (!buffer_dirty(bh)) {
7382- bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
7383- return 0;
7384+ bh->b_end_io(bh, buffer_uptodate(bh));
7385+ err = 0;
7386+ goto out;
7387 }
7388 }
7389 if (rw == READ || rw == READA) {
7390 if (buffer_uptodate(bh)) {
7391- bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
7392- return 0;
7393+ bh->b_end_io(bh, buffer_uptodate(bh));
7394+ err = 0;
7395+ goto out;
7396 }
7397 }
7398- return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh));
7399+ err = mddev->pers->make_request(mddev, rw, bh);
7400 } else {
7401 make_request (MAJOR(bh->b_rdev), rw, bh);
7402- return 0;
7403+ err = 0;
7404 }
7405+out:
7406+ return err;
7407 }
7408
7409 static void do_md_request (void)
7410 {
7411- printk ("Got md request, not good...");
7412- return;
7413+ printk(KERN_ALERT "Got md request, not good...");
7414+ return;
7415+}
7416+
7417+int md_thread(void * arg)
7418+{
7419+ mdk_thread_t *thread = arg;
7420+
7421+ md_lock_kernel();
7422+ exit_mm(current);
7423+ exit_files(current);
7424+ exit_fs(current);
7425+
7426+ /*
7427+ * Detach thread
7428+ */
7429+ sys_setsid();
7430+ sprintf(current->comm, thread->name);
7431+ md_init_signals();
7432+ md_flush_signals();
7433+ thread->tsk = current;
7434+
7435+ /*
7436+ * md_thread is a 'system-thread', its priority should be very
7437+ * high. We avoid resource deadlocks individually in each
7438+ * raid personality. (RAID5 does preallocation) We also use RR and
7439+ * the very same RT priority as kswapd, thus we will never get
7440+ * into a priority inversion deadlock.
7441+ *
7442+ * we definitely have to have equal or higher priority than
7443+ * bdflush, otherwise bdflush will deadlock if there are too
7444+ * many dirty RAID5 blocks.
7445+ */
7446+ current->policy = SCHED_OTHER;
7447+ current->priority = 40;
7448+
7449+ up(thread->sem);
7450+
7451+ for (;;) {
7452+ cli();
7453+ if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
7454+ if (!thread->run)
7455+ break;
7456+ interruptible_sleep_on(&thread->wqueue);
7457+ }
7458+ sti();
7459+ clear_bit(THREAD_WAKEUP, &thread->flags);
7460+ if (thread->run) {
7461+ thread->run(thread->data);
7462+ run_task_queue(&tq_disk);
7463+ }
7464+ if (md_signal_pending(current)) {
7465+ printk("%8s(%d) flushing signals.\n", current->comm,
7466+ current->pid);
7467+ md_flush_signals();
7468+ }
7469+ }
7470+ sti();
7471+ up(thread->sem);
7472+ return 0;
7473 }
7474
7475-void md_wakeup_thread(struct md_thread *thread)
7476+void md_wakeup_thread(mdk_thread_t *thread)
7477 {
7478 set_bit(THREAD_WAKEUP, &thread->flags);
7479 wake_up(&thread->wqueue);
7480 }
7481
7482-struct md_thread *md_register_thread (void (*run) (void *), void *data)
7483+mdk_thread_t *md_register_thread (void (*run) (void *),
7484+ void *data, const char *name)
7485 {
7486- struct md_thread *thread = (struct md_thread *)
7487- kmalloc(sizeof(struct md_thread), GFP_KERNEL);
7488+ mdk_thread_t *thread;
7489 int ret;
7490 struct semaphore sem = MUTEX_LOCKED;
7491
7492- if (!thread) return NULL;
7493+ thread = (mdk_thread_t *) kmalloc
7494+ (sizeof(mdk_thread_t), GFP_KERNEL);
7495+ if (!thread)
7496+ return NULL;
7497
7498- memset(thread, 0, sizeof(struct md_thread));
7499+ memset(thread, 0, sizeof(mdk_thread_t));
7500 init_waitqueue(&thread->wqueue);
7501
7502 thread->sem = &sem;
7503 thread->run = run;
7504 thread->data = data;
7505+ thread->name = name;
7506 ret = kernel_thread(md_thread, thread, 0);
7507 if (ret < 0) {
7508 kfree(thread);
7509@@ -836,270 +3032,407 @@
7510 return thread;
7511 }
7512
7513-void md_unregister_thread (struct md_thread *thread)
7514+void md_interrupt_thread (mdk_thread_t *thread)
7515+{
7516+ if (!thread->tsk) {
7517+ MD_BUG();
7518+ return;
7519+ }
7520+ printk("interrupting MD-thread pid %d\n", thread->tsk->pid);
7521+ send_sig(SIGKILL, thread->tsk, 1);
7522+}
7523+
7524+void md_unregister_thread (mdk_thread_t *thread)
7525 {
7526 struct semaphore sem = MUTEX_LOCKED;
7527
7528 thread->sem = &sem;
7529 thread->run = NULL;
7530- if (thread->tsk)
7531- printk("Killing md_thread %d %p %s\n",
7532- thread->tsk->pid, thread->tsk, thread->tsk->comm);
7533- else
7534- printk("Aiee. md_thread has 0 tsk\n");
7535- send_sig(SIGKILL, thread->tsk, 1);
7536- printk("downing on %p\n", &sem);
7537+ thread->name = NULL;
7538+ if (!thread->tsk) {
7539+ MD_BUG();
7540+ return;
7541+ }
7542+ md_interrupt_thread(thread);
7543 down(&sem);
7544 }
7545
7546-#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))
7547-
7548-int md_thread(void * arg)
7549+void md_recover_arrays (void)
7550 {
7551- struct md_thread *thread = arg;
7552-
7553- lock_kernel();
7554- exit_mm(current);
7555- exit_files(current);
7556- exit_fs(current);
7557-
7558- current->session = 1;
7559- current->pgrp = 1;
7560- sprintf(current->comm, "md_thread");
7561- siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
7562- thread->tsk = current;
7563- up(thread->sem);
7564-
7565- for (;;) {
7566- cli();
7567- if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
7568- do {
7569- spin_lock(&current->sigmask_lock);
7570- flush_signals(current);
7571- spin_unlock(&current->sigmask_lock);
7572- interruptible_sleep_on(&thread->wqueue);
7573- cli();
7574- if (test_bit(THREAD_WAKEUP, &thread->flags))
7575- break;
7576- if (!thread->run) {
7577- sti();
7578- up(thread->sem);
7579- return 0;
7580- }
7581- } while (signal_pending(current));
7582- }
7583- sti();
7584- clear_bit(THREAD_WAKEUP, &thread->flags);
7585- if (thread->run) {
7586- thread->run(thread->data);
7587- run_task_queue(&tq_disk);
7588- }
7589+ if (!md_recovery_thread) {
7590+ MD_BUG();
7591+ return;
7592 }
7593+ md_wakeup_thread(md_recovery_thread);
7594 }
7595
7596-EXPORT_SYMBOL(md_size);
7597-EXPORT_SYMBOL(md_maxreadahead);
7598-EXPORT_SYMBOL(register_md_personality);
7599-EXPORT_SYMBOL(unregister_md_personality);
7600-EXPORT_SYMBOL(partition_name);
7601-EXPORT_SYMBOL(md_dev);
7602-EXPORT_SYMBOL(md_error);
7603-EXPORT_SYMBOL(md_register_thread);
7604-EXPORT_SYMBOL(md_unregister_thread);
7605-EXPORT_SYMBOL(md_update_sb);
7606-EXPORT_SYMBOL(md_map);
7607-EXPORT_SYMBOL(md_wakeup_thread);
7608-EXPORT_SYMBOL(md_do_sync);
7609
7610-#ifdef CONFIG_PROC_FS
7611-static struct proc_dir_entry proc_md = {
7612- PROC_MD, 6, "mdstat",
7613- S_IFREG | S_IRUGO, 1, 0, 0,
7614- 0, &proc_array_inode_operations,
7615-};
7616+int md_error (kdev_t dev, kdev_t rdev)
7617+{
7618+ mddev_t *mddev = kdev_to_mddev(dev);
7619+ mdk_rdev_t * rrdev;
7620+ int rc;
7621+
7622+ if (!mddev) {
7623+ MD_BUG();
7624+ return 0;
7625+ }
7626+ rrdev = find_rdev(mddev, rdev);
7627+ mark_rdev_faulty(rrdev);
7628+ /*
7629+ * if recovery was running, stop it now.
7630+ */
7631+ if (mddev->pers->stop_resync)
7632+ mddev->pers->stop_resync(mddev);
7633+ if (mddev->recovery_running)
7634+ md_interrupt_thread(md_recovery_thread);
7635+ if (mddev->pers->error_handler) {
7636+ rc = mddev->pers->error_handler(mddev, rdev);
7637+ md_recover_arrays();
7638+ return rc;
7639+ }
7640+#if 0
7641+ /*
7642+ * Drop all buffers in the failed array.
7643+ * _not_. This is called from IRQ handlers ...
7644+ */
7645+ invalidate_buffers(rdev);
7646 #endif
7647+ return 0;
7648+}
7649
7650-static void md_geninit (struct gendisk *gdisk)
7651+static int status_unused (char * page)
7652 {
7653- int i;
7654-
7655- for(i=0;i<MAX_MD_DEV;i++)
7656- {
7657- md_blocksizes[i] = 1024;
7658- md_maxreadahead[i] = MD_DEFAULT_DISK_READAHEAD;
7659- md_gendisk.part[i].start_sect=-1; /* avoid partition check */
7660- md_gendisk.part[i].nr_sects=0;
7661- md_dev[i].pers=NULL;
7662- }
7663+ int sz = 0, i = 0;
7664+ mdk_rdev_t *rdev;
7665+ struct md_list_head *tmp;
7666
7667- blksize_size[MD_MAJOR] = md_blocksizes;
7668- max_readahead[MD_MAJOR] = md_maxreadahead;
7669+ sz += sprintf(page + sz, "unused devices: ");
7670
7671-#ifdef CONFIG_PROC_FS
7672- proc_register(&proc_root, &proc_md);
7673-#endif
7674+ ITERATE_RDEV_ALL(rdev,tmp) {
7675+ if (!rdev->same_set.next && !rdev->same_set.prev) {
7676+ /*
7677+ * The device is not yet used by any array.
7678+ */
7679+ i++;
7680+ sz += sprintf(page + sz, "%s ",
7681+ partition_name(rdev->dev));
7682+ }
7683+ }
7684+ if (!i)
7685+ sz += sprintf(page + sz, "<none>");
7686+
7687+ sz += sprintf(page + sz, "\n");
7688+ return sz;
7689 }
7690
7691-int md_error (kdev_t mddev, kdev_t rdev)
7692+
7693+static int status_resync (char * page, mddev_t * mddev)
7694 {
7695- unsigned int minor = MINOR (mddev);
7696- int rc;
7697+ int sz = 0;
7698+ unsigned int blocksize, max_blocks, resync, res, dt, tt, et;
7699
7700- if (MAJOR(mddev) != MD_MAJOR || minor > MAX_MD_DEV)
7701- panic ("md_error gets unknown device\n");
7702- if (!md_dev [minor].pers)
7703- panic ("md_error gets an error for an unknown device\n");
7704- if (md_dev [minor].pers->error_handler) {
7705- rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev);
7706-#if SUPPORT_RECONSTRUCTION
7707- md_wakeup_thread(md_sync_thread);
7708-#endif /* SUPPORT_RECONSTRUCTION */
7709- return rc;
7710- }
7711- return 0;
7712+ resync = mddev->curr_resync;
7713+ blocksize = blksize_size[MD_MAJOR][mdidx(mddev)];
7714+ max_blocks = blk_size[MD_MAJOR][mdidx(mddev)] / (blocksize >> 10);
7715+
7716+ /*
7717+ * Should not happen.
7718+ */
7719+ if (!max_blocks) {
7720+ MD_BUG();
7721+ return 0;
7722+ }
7723+ res = resync*100/max_blocks;
7724+ if (!mddev->recovery_running)
7725+ /*
7726+ * true resync
7727+ */
7728+ sz += sprintf(page + sz, " resync=%u%%", res);
7729+ else
7730+ /*
7731+ * recovery ...
7732+ */
7733+ sz += sprintf(page + sz, " recovery=%u%%", res);
7734+
7735+ /*
7736+ * We do not want to overflow, so the order of operands and
7737+ * the * 100 / 100 trick are important. We do a +1 to be
7738+ * safe against division by zero. We only estimate anyway.
7739+ *
7740+ * dt: time until now
7741+ * tt: total time
7742+ * et: estimated finish time
7743+ */
7744+ dt = ((jiffies - mddev->resync_start) / HZ);
7745+ tt = (dt * (max_blocks / (resync/100+1)))/100;
7746+ if (tt > dt)
7747+ et = tt - dt;
7748+ else
7749+ /*
7750+ * ignore rounding effects near finish time
7751+ */
7752+ et = 0;
7753+
7754+ sz += sprintf(page + sz, " finish=%u.%umin", et / 60, (et % 60)/6);
7755+
7756+ return sz;
7757 }
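The finish-time estimate above is pure integer arithmetic; a small worked example with made-up numbers shows how the fields combine (not taken from a real array):

/* made-up snapshot: 1000000 blocks total, 250000 resynced, 600 seconds elapsed */
static unsigned int estimate_seconds_left(void)
{
        unsigned int max_blocks = 1000000, resync = 250000, dt = 600;
        unsigned int res = resync * 100 / max_blocks;                     /* 25 (%) */
        unsigned int tt = (dt * (max_blocks / (resync / 100 + 1))) / 100; /* 2394 s estimated total */

        (void)res;
        return tt > dt ? tt - dt : 0;   /* 1794 s left, printed as finish=29.9min */
}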
7758
7759 int get_md_status (char *page)
7760 {
7761- int sz=0, i, j, size;
7762-
7763- sz+=sprintf( page+sz, "Personalities : ");
7764- for (i=0; i<MAX_PERSONALITY; i++)
7765- if (pers[i])
7766- sz+=sprintf (page+sz, "[%d %s] ", i, pers[i]->name);
7767-
7768- page[sz-1]='\n';
7769-
7770- sz+=sprintf (page+sz, "read_ahead ");
7771- if (read_ahead[MD_MAJOR]==INT_MAX)
7772- sz+=sprintf (page+sz, "not set\n");
7773- else
7774- sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
7775+ int sz = 0, j, size;
7776+ struct md_list_head *tmp, *tmp2;
7777+ mdk_rdev_t *rdev;
7778+ mddev_t *mddev;
7779+
7780+ sz += sprintf(page + sz, "Personalities : ");
7781+ for (j = 0; j < MAX_PERSONALITY; j++)
7782+ if (pers[j])
7783+ sz += sprintf(page+sz, "[%s] ", pers[j]->name);
7784+
7785+ sz += sprintf(page+sz, "\n");
7786+
7787+
7788+ sz += sprintf(page+sz, "read_ahead ");
7789+ if (read_ahead[MD_MAJOR] == INT_MAX)
7790+ sz += sprintf(page+sz, "not set\n");
7791+ else
7792+ sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
7793
7794- for (i=0; i<MAX_MD_DEV; i++)
7795- {
7796- sz+=sprintf (page+sz, "md%d : %sactive", i, md_dev[i].pers ? "" : "in");
7797-
7798- if (md_dev[i].pers)
7799- sz+=sprintf (page+sz, " %s", md_dev[i].pers->name);
7800+ ITERATE_MDDEV(mddev,tmp) {
7801+ sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
7802+ mddev->pers ? "" : "in");
7803+ if (mddev->pers) {
7804+ if (mddev->ro)
7805+ sz += sprintf(page + sz, " (read-only)");
7806+ sz += sprintf(page + sz, " %s", mddev->pers->name);
7807+ }
7808
7809- size=0;
7810- for (j=0; j<md_dev[i].nb_dev; j++)
7811- {
7812- sz+=sprintf (page+sz, " %s",
7813- partition_name(md_dev[i].devices[j].dev));
7814- size+=md_dev[i].devices[j].size;
7815- }
7816+ size = 0;
7817+ ITERATE_RDEV(mddev,rdev,tmp2) {
7818+ sz += sprintf(page + sz, " %s[%d]",
7819+ partition_name(rdev->dev), rdev->desc_nr);
7820+ if (rdev->faulty) {
7821+ sz += sprintf(page + sz, "(F)");
7822+ continue;
7823+ }
7824+ size += rdev->size;
7825+ }
7826
7827- if (md_dev[i].nb_dev) {
7828- if (md_dev[i].pers)
7829- sz+=sprintf (page+sz, " %d blocks", md_size[i]);
7830- else
7831- sz+=sprintf (page+sz, " %d blocks", size);
7832- }
7833+ if (mddev->nb_dev) {
7834+ if (mddev->pers)
7835+ sz += sprintf(page + sz, " %d blocks",
7836+ md_size[mdidx(mddev)]);
7837+ else
7838+ sz += sprintf(page + sz, " %d blocks", size);
7839+ }
7840
7841- if (!md_dev[i].pers)
7842- {
7843- sz+=sprintf (page+sz, "\n");
7844- continue;
7845- }
7846+ if (!mddev->pers) {
7847+ sz += sprintf(page+sz, "\n");
7848+ continue;
7849+ }
7850
7851- if (md_dev[i].pers->max_invalid_dev)
7852- sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i));
7853+ sz += mddev->pers->status (page+sz, mddev);
7854
7855- sz+=md_dev[i].pers->status (page+sz, i, md_dev+i);
7856- sz+=sprintf (page+sz, "\n");
7857- }
7858+ if (mddev->curr_resync)
7859+ sz += status_resync (page+sz, mddev);
7860+ else {
7861+ if (md_atomic_read(&mddev->resync_sem.count) != 1)
7862+ sz += sprintf(page + sz, " resync=DELAYED");
7863+ }
7864+ sz += sprintf(page + sz, "\n");
7865+ }
7866+ sz += status_unused (page + sz);
7867
7868- return (sz);
7869+ return (sz);
7870 }
7871
7872-int register_md_personality (int p_num, struct md_personality *p)
7873+int register_md_personality (int pnum, mdk_personality_t *p)
7874 {
7875- int i=(p_num >> PERSONALITY_SHIFT);
7876-
7877- if (i >= MAX_PERSONALITY)
7878- return -EINVAL;
7879+ if (pnum >= MAX_PERSONALITY)
7880+ return -EINVAL;
7881
7882- if (pers[i])
7883- return -EBUSY;
7884+ if (pers[pnum])
7885+ return -EBUSY;
7886
7887- pers[i]=p;
7888- printk ("%s personality registered\n", p->name);
7889- return 0;
7890+ pers[pnum] = p;
7891+ printk(KERN_INFO "%s personality registered\n", p->name);
7892+ return 0;
7893 }
7894
7895-int unregister_md_personality (int p_num)
7896+int unregister_md_personality (int pnum)
7897 {
7898- int i=(p_num >> PERSONALITY_SHIFT);
7899-
7900- if (i >= MAX_PERSONALITY)
7901- return -EINVAL;
7902+ if (pnum >= MAX_PERSONALITY)
7903+ return -EINVAL;
7904
7905- printk ("%s personality unregistered\n", pers[i]->name);
7906- pers[i]=NULL;
7907- return 0;
7908+ printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
7909+ pers[pnum] = NULL;
7910+ return 0;
7911 }
7912
7913-static md_descriptor_t *get_spare(struct md_dev *mddev)
7914+static mdp_disk_t *get_spare(mddev_t *mddev)
7915 {
7916- int i;
7917- md_superblock_t *sb = mddev->sb;
7918- md_descriptor_t *descriptor;
7919- struct real_dev *realdev;
7920-
7921- for (i = 0; i < mddev->nb_dev; i++) {
7922- realdev = &mddev->devices[i];
7923- if (!realdev->sb)
7924+ mdp_super_t *sb = mddev->sb;
7925+ mdp_disk_t *disk;
7926+ mdk_rdev_t *rdev;
7927+ struct md_list_head *tmp;
7928+
7929+ ITERATE_RDEV(mddev,rdev,tmp) {
7930+ if (rdev->faulty)
7931+ continue;
7932+ if (!rdev->sb) {
7933+ MD_BUG();
7934 continue;
7935- descriptor = &sb->disks[realdev->sb->descriptor.number];
7936- if (descriptor->state & (1 << MD_FAULTY_DEVICE))
7937+ }
7938+ disk = &sb->disks[rdev->desc_nr];
7939+ if (disk_faulty(disk)) {
7940+ MD_BUG();
7941 continue;
7942- if (descriptor->state & (1 << MD_ACTIVE_DEVICE))
7943+ }
7944+ if (disk_active(disk))
7945 continue;
7946- return descriptor;
7947+ return disk;
7948 }
7949 return NULL;
7950 }
7951
7952+static int is_mddev_idle (mddev_t *mddev)
7953+{
7954+ mdk_rdev_t * rdev;
7955+ struct md_list_head *tmp;
7956+ int idle;
7957+ unsigned long curr_events;
7958+
7959+ idle = 1;
7960+ ITERATE_RDEV(mddev,rdev,tmp) {
7961+ curr_events = io_events[MAJOR(rdev->dev)];
7962+
7963+ if (curr_events != rdev->last_events) {
7964+// printk("!I(%d)", curr_events-rdev->last_events);
7965+ rdev->last_events = curr_events;
7966+ idle = 0;
7967+ }
7968+ }
7969+ return idle;
7970+}
7971+
7972 /*
7973 * parallel resyncing thread.
7974- *
7975- * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
7976- * - fix read error handing
7977 */
7978
7979-int md_do_sync(struct md_dev *mddev)
7980+/*
7981+ * Determine correct block size for this device.
7982+ */
7983+unsigned int device_bsize (kdev_t dev)
7984+{
7985+ unsigned int i, correct_size;
7986+
7987+ correct_size = BLOCK_SIZE;
7988+ if (blksize_size[MAJOR(dev)]) {
7989+ i = blksize_size[MAJOR(dev)][MINOR(dev)];
7990+ if (i)
7991+ correct_size = i;
7992+ }
7993+
7994+ return correct_size;
7995+}
7996+
7997+static struct wait_queue *resync_wait = (struct wait_queue *)NULL;
7998+
7999+#define RA_ORDER (1)
8000+#define RA_PAGE_SIZE (PAGE_SIZE*(1<<RA_ORDER))
8001+#define MAX_NR_BLOCKS (RA_PAGE_SIZE/sizeof(struct buffer_head *))
8002+
8003+int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
8004 {
8005- struct buffer_head *bh;
8006- int max_blocks, blocksize, curr_bsize, percent=1, j;
8007- kdev_t read_disk = MKDEV(MD_MAJOR, mddev - md_dev);
8008+ mddev_t *mddev2;
8009+ struct buffer_head **bh;
8010+ unsigned int max_blocks, blocksize, curr_bsize,
8011+ i, ii, j, k, chunk, window, nr_blocks, err, serialize;
8012+ kdev_t read_disk = mddev_to_kdev(mddev);
8013 int major = MAJOR(read_disk), minor = MINOR(read_disk);
8014 unsigned long starttime;
8015+ int max_read_errors = 2*MAX_NR_BLOCKS,
8016+ max_write_errors = 2*MAX_NR_BLOCKS;
8017+ struct md_list_head *tmp;
8018+
8019+retry_alloc:
8020+ bh = (struct buffer_head **) md__get_free_pages(GFP_KERNEL, RA_ORDER);
8021+ if (!bh) {
8022+ printk(KERN_ERR
8023+ "could not alloc bh array for reconstruction ... retrying!\n");
8024+ goto retry_alloc;
8025+ }
8026+
8027+ err = down_interruptible(&mddev->resync_sem);
8028+ if (err)
8029+ goto out_nolock;
8030+
8031+recheck:
8032+ serialize = 0;
8033+ ITERATE_MDDEV(mddev2,tmp) {
8034+ if (mddev2 == mddev)
8035+ continue;
8036+ if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
8037+ printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2));
8038+ serialize = 1;
8039+ break;
8040+ }
8041+ }
8042+ if (serialize) {
8043+ interruptible_sleep_on(&resync_wait);
8044+ if (md_signal_pending(current)) {
8045+ md_flush_signals();
8046+ err = -EINTR;
8047+ goto out;
8048+ }
8049+ goto recheck;
8050+ }
8051+
8052+ mddev->curr_resync = 1;
8053
8054- blocksize = blksize_size[major][minor];
8055+ blocksize = device_bsize(read_disk);
8056 max_blocks = blk_size[major][minor] / (blocksize >> 10);
8057
8058- printk("... resync log\n");
8059- printk(" .... mddev->nb_dev: %d\n", mddev->nb_dev);
8060- printk(" .... raid array: %s\n", kdevname(read_disk));
8061- printk(" .... max_blocks: %d blocksize: %d\n", max_blocks, blocksize);
8062- printk("md: syncing RAID array %s\n", kdevname(read_disk));
8063+ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
8064+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec.\n",
8065+ sysctl_speed_limit);
8066+ printk(KERN_INFO "md: using maximum available idle IO bandwidth for reconstruction.\n");
8067+
8068+ /*
8069+ * Resync has low priority.
8070+ */
8071+ current->priority = 1;
8072+
8073+ is_mddev_idle(mddev); /* this also initializes IO event counters */
8074+ starttime = jiffies;
8075+ mddev->resync_start = starttime;
8076
8077- mddev->busy++;
8078+ /*
8079+ * Tune reconstruction:
8080+ */
8081+ window = md_maxreadahead[mdidx(mddev)]/1024;
8082+ nr_blocks = window / (blocksize >> 10);
8083+ if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
8084+ nr_blocks = MAX_NR_BLOCKS;
8085+ printk(KERN_INFO "md: using %dk window.\n",window);
8086
8087- starttime=jiffies;
8088- for (j = 0; j < max_blocks; j++) {
8089+ for (j = 0; j < max_blocks; j += nr_blocks) {
8090
8091+ if (j)
8092+ mddev->curr_resync = j;
8093 /*
8094 * B careful. When some1 mounts a non-'blocksize' filesystem
8095 * then we get the blocksize changed right under us. Go deal
8096 * with it transparently, recalculate 'blocksize', 'j' and
8097 * 'max_blocks':
8098 */
8099- curr_bsize = blksize_size[major][minor];
8100+ curr_bsize = device_bsize(read_disk);
8101 if (curr_bsize != blocksize) {
8102- diff_blocksize:
8103+ printk(KERN_INFO "md%d: blocksize changed\n",
8104+ mdidx(mddev));
8105+retry_read:
8106 if (curr_bsize > blocksize)
8107 /*
8108 * this is safe, rounds downwards.
8109@@ -1109,114 +3442,384 @@
8110 j *= blocksize/curr_bsize;
8111
8112 blocksize = curr_bsize;
8113+ nr_blocks = window / (blocksize >> 10);
8114+ if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
8115+ nr_blocks = MAX_NR_BLOCKS;
8116 max_blocks = blk_size[major][minor] / (blocksize >> 10);
8117- }
8118- if ((bh = breada (read_disk, j, blocksize, j * blocksize,
8119- max_blocks * blocksize)) != NULL) {
8120- mark_buffer_dirty(bh, 1);
8121- brelse(bh);
8122- } else {
8123+ printk("nr_blocks changed to %d (blocksize %d, j %d, max_blocks %d)\n",
8124+ nr_blocks, blocksize, j, max_blocks);
8125 /*
8126- * FIXME: Ugly, but set_blocksize() isnt safe ...
8127+ * We will retry the current block-group
8128 */
8129- curr_bsize = blksize_size[major][minor];
8130- if (curr_bsize != blocksize)
8131- goto diff_blocksize;
8132+ }
8133
8134- /*
8135- * It's a real read problem. FIXME, handle this
8136- * a better way.
8137- */
8138- printk ( KERN_ALERT
8139- "read error, stopping reconstruction.\n");
8140- mddev->busy--;
8141- return 1;
8142+ /*
8143+ * Cleanup routines expect this
8144+ */
8145+ for (k = 0; k < nr_blocks; k++)
8146+ bh[k] = NULL;
8147+
8148+ chunk = nr_blocks;
8149+ if (chunk > max_blocks-j)
8150+ chunk = max_blocks-j;
8151+
8152+ /*
8153+ * request buffer heads ...
8154+ */
8155+ for (i = 0; i < chunk; i++) {
8156+ bh[i] = getblk (read_disk, j+i, blocksize);
8157+ if (!bh[i])
8158+ goto read_error;
8159+ if (!buffer_dirty(bh[i]))
8160+ mark_buffer_lowprio(bh[i]);
8161 }
8162
8163 /*
8164- * Let's sleep some if we are faster than our speed limit:
8165+ * read buffer heads ...
8166 */
8167- while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT)
8168- {
8169- current->state = TASK_INTERRUPTIBLE;
8170- schedule_timeout(1);
8171+ ll_rw_block (READ, chunk, bh);
8172+ run_task_queue(&tq_disk);
8173+
8174+ /*
8175+ * verify that all of them are OK ...
8176+ */
8177+ for (i = 0; i < chunk; i++) {
8178+ ii = chunk-i-1;
8179+ wait_on_buffer(bh[ii]);
8180+ if (!buffer_uptodate(bh[ii]))
8181+ goto read_error;
8182+ }
8183+
8184+retry_write:
8185+ for (i = 0; i < chunk; i++)
8186+ mark_buffer_dirty_lowprio(bh[i]);
8187+
8188+ ll_rw_block(WRITE, chunk, bh);
8189+ run_task_queue(&tq_disk);
8190+
8191+ for (i = 0; i < chunk; i++) {
8192+ ii = chunk-i-1;
8193+ wait_on_buffer(bh[ii]);
8194+
8195+ if (spare && disk_faulty(spare)) {
8196+ for (k = 0; k < chunk; k++)
8197+ brelse(bh[k]);
8198+ printk(" <SPARE FAILED!>\n ");
8199+ err = -EIO;
8200+ goto out;
8201+ }
8202+
8203+ if (!buffer_uptodate(bh[ii])) {
8204+ curr_bsize = device_bsize(read_disk);
8205+ if (curr_bsize != blocksize) {
8206+ printk(KERN_INFO
8207+ "md%d: blocksize changed during write\n",
8208+ mdidx(mddev));
8209+ for (k = 0; k < chunk; k++)
8210+ if (bh[k]) {
8211+ if (buffer_lowprio(bh[k]))
8212+ mark_buffer_clean(bh[k]);
8213+ brelse(bh[k]);
8214+ }
8215+ goto retry_read;
8216+ }
8217+ printk(" BAD WRITE %8d>\n", j);
8218+ /*
8219+ * Ouch, write error, retry or bail out.
8220+ */
8221+ if (max_write_errors) {
8222+ max_write_errors--;
8223+ printk ( KERN_WARNING "md%d: write error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
8224+ goto retry_write;
8225+ }
8226+ printk ( KERN_ALERT
8227+ "too many write errors, stopping reconstruction.\n");
8228+ for (k = 0; k < chunk; k++)
8229+ if (bh[k]) {
8230+ if (buffer_lowprio(bh[k]))
8231+ mark_buffer_clean(bh[k]);
8232+ brelse(bh[k]);
8233+ }
8234+ err = -EIO;
8235+ goto out;
8236+ }
8237 }
8238
8239 /*
8240- * FIXME: put this status bar thing into /proc
8241+ * This is the normal 'everything went OK' case
8242+ * do 'free-behind' logic, we sure don't need
8243+ * this buffer if it was the only user.
8244 */
8245- if (!(j%(max_blocks/100))) {
8246- if (!(percent%10))
8247- printk (" %03d%% done.\n",percent);
8248+ for (i = 0; i < chunk; i++)
8249+ if (buffer_dirty(bh[i]))
8250+ brelse(bh[i]);
8251 else
8252- printk (".");
8253- percent++;
8254+ bforget(bh[i]);
8255+
8256+
8257+ if (md_signal_pending(current)) {
8258+ /*
8259+ * got a signal, exit.
8260+ */
8261+ mddev->curr_resync = 0;
8262+ printk("md_do_sync() got signal ... exiting\n");
8263+ md_flush_signals();
8264+ err = -EINTR;
8265+ goto out;
8266 }
8267+
8268+ /*
8269+ * this loop exits only when we are slower than
8270+ * the 'hard' speed limit, or the system was IO-idle for
8271+ * a jiffy.
8272+ * the system might be non-idle CPU-wise, but we only care
8273+ * about not overloading the IO subsystem. (things like an
8274+ * e2fsck being done on the RAID array should execute fast)
8275+ */
8276+repeat:
8277+ if (md_need_resched(current))
8278+ schedule();
8279+
8280+ if ((blocksize/1024)*j/((jiffies-starttime)/HZ + 1) + 1
8281+ > sysctl_speed_limit) {
8282+ current->priority = 1;
8283+
8284+ if (!is_mddev_idle(mddev)) {
8285+ current->state = TASK_INTERRUPTIBLE;
8286+ md_schedule_timeout(HZ/2);
8287+ if (!md_signal_pending(current))
8288+ goto repeat;
8289+ }
8290+ } else
8291+ current->priority = 40;
8292 }
8293 fsync_dev(read_disk);
8294- printk("md: %s: sync done.\n", kdevname(read_disk));
8295- mddev->busy--;
8296- return 0;
8297+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
8298+ err = 0;
8299+ /*
8300+ * this also signals 'finished resyncing' to md_stop
8301+ */
8302+out:
8303+ up(&mddev->resync_sem);
8304+out_nolock:
8305+ free_pages((unsigned long)bh, RA_ORDER);
8306+ mddev->curr_resync = 0;
8307+ wake_up(&resync_wait);
8308+ return err;
8309+
8310+read_error:
8311+ /*
8312+ * set_blocksize() might change the blocksize. This
8313+ * should not happen often, but it happens when eg.
8314+ * someone mounts a filesystem that has non-1k
8315+ * blocksize. set_blocksize() doesnt touch our
8316+ * buffer, but to avoid aliasing problems we change
8317+ * our internal blocksize too and retry the read.
8318+ */
8319+ curr_bsize = device_bsize(read_disk);
8320+ if (curr_bsize != blocksize) {
8321+ printk(KERN_INFO "md%d: blocksize changed during read\n",
8322+ mdidx(mddev));
8323+ for (k = 0; k < chunk; k++)
8324+ if (bh[k]) {
8325+ if (buffer_lowprio(bh[k]))
8326+ mark_buffer_clean(bh[k]);
8327+ brelse(bh[k]);
8328+ }
8329+ goto retry_read;
8330+ }
8331+
8332+ /*
8333+ * It's a real read problem. We retry and bail out
8334+ * only if it's excessive.
8335+ */
8336+ if (max_read_errors) {
8337+ max_read_errors--;
8338+ printk ( KERN_WARNING "md%d: read error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
8339+ for (k = 0; k < chunk; k++)
8340+ if (bh[k]) {
8341+ if (buffer_lowprio(bh[k]))
8342+ mark_buffer_clean(bh[k]);
8343+ brelse(bh[k]);
8344+ }
8345+ goto retry_read;
8346+ }
8347+ printk ( KERN_ALERT "too many read errors, stopping reconstruction.\n");
8348+ for (k = 0; k < chunk; k++)
8349+ if (bh[k]) {
8350+ if (buffer_lowprio(bh[k]))
8351+ mark_buffer_clean(bh[k]);
8352+ brelse(bh[k]);
8353+ }
8354+ err = -EIO;
8355+ goto out;
8356 }
8357
8358+#undef MAX_NR_BLOCKS
8359+
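md_do_sync() above throttles reconstruction against a sysctl: it derives the achieved rate in KB/s from the block counter and the elapsed jiffies, and only backs off (lower priority plus a half-second sleep) when that rate is above the limit and the array is not otherwise idle. A minimal user-space sketch of the same arithmetic, with made-up values standing in for jiffies, HZ and the block counter:

#include <stdio.h>

#define HZ 100                                 /* ticks per second (typical)  */

int main(void)
{
        unsigned long jiffies   = 123456;      /* hypothetical current time   */
        unsigned long starttime = 120000;      /* when the resync began       */
        unsigned long j = 40000;               /* blocks written so far       */
        int blocksize = 1024;                  /* bytes per block             */
        int sysctl_speed_limit = 100;          /* KB/s soft ceiling           */

        /* Same expression as the hunk above: KB written divided by elapsed
         * seconds, with +1 in the divisor to survive the first second. */
        unsigned long rate = (blocksize / 1024) * j
                           / ((jiffies - starttime) / HZ + 1) + 1;

        printf("resync rate %lu KB/s, limit %d KB/s\n", rate, sysctl_speed_limit);

        if (rate > (unsigned long)sysctl_speed_limit)
                printf("above limit: drop priority, sleep unless the array is idle\n");
        else
                printf("below limit: restore normal priority and keep going\n");
        return 0;
}

The effect is that a resync running alone on the array is allowed to exceed the soft limit, while one competing with real I/O (an e2fsck, say) is held back to roughly sysctl_speed_limit KB/s.
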
8360 /*
8361- * This is a kernel thread which: syncs a spare disk with the active array
8362+ * This is a kernel thread which syncs a spare disk with the active array
8363 *
8364 * the amount of foolproofing might seem to be a tad excessive, but an
8365 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
8366 * of my root partition with the first 0.5 gigs of my /home partition ... so
8367 * i'm a bit nervous ;)
8368 */
8369-void mdsyncd (void *data)
8370+void md_do_recovery (void *data)
8371 {
8372- int i;
8373- struct md_dev *mddev;
8374- md_superblock_t *sb;
8375- md_descriptor_t *spare;
8376+ int err;
8377+ mddev_t *mddev;
8378+ mdp_super_t *sb;
8379+ mdp_disk_t *spare;
8380 unsigned long flags;
8381+ struct md_list_head *tmp;
8382
8383- for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) {
8384- if ((sb = mddev->sb) == NULL)
8385+ printk(KERN_INFO "md: recovery thread got woken up ...\n");
8386+restart:
8387+ ITERATE_MDDEV(mddev,tmp) {
8388+ sb = mddev->sb;
8389+ if (!sb)
8390+ continue;
8391+ if (mddev->recovery_running)
8392 continue;
8393 if (sb->active_disks == sb->raid_disks)
8394 continue;
8395- if (!sb->spare_disks)
8396+ if (!sb->spare_disks) {
8397+ printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
8398 continue;
8399+ }
8400+ /*
8401+ * now here we get the spare and resync it.
8402+ */
8403 if ((spare = get_spare(mddev)) == NULL)
8404 continue;
8405- if (!mddev->pers->mark_spare)
8406+ printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
8407+ if (!mddev->pers->diskop)
8408 continue;
8409- if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE))
8410+ if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
8411 continue;
8412- if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) {
8413- mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE);
8414+ down(&mddev->recovery_sem);
8415+ mddev->recovery_running = 1;
8416+ err = md_do_sync(mddev, spare);
8417+ if (err == -EIO) {
8418+ printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
8419+ if (!disk_faulty(spare)) {
8420+ mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
8421+ mark_disk_faulty(spare);
8422+ mark_disk_nonsync(spare);
8423+ mark_disk_inactive(spare);
8424+ sb->spare_disks--;
8425+ sb->working_disks--;
8426+ sb->failed_disks++;
8427+ }
8428+ } else
8429+ if (disk_faulty(spare))
8430+ mddev->pers->diskop(mddev, &spare,
8431+ DISKOP_SPARE_INACTIVE);
8432+ if (err == -EINTR) {
8433+ /*
8434+ * Recovery got interrupted ...
8435+ * signal back that we have finished using the array.
8436+ */
8437+ mddev->pers->diskop(mddev, &spare,
8438+ DISKOP_SPARE_INACTIVE);
8439+ up(&mddev->recovery_sem);
8440+ mddev->recovery_running = 0;
8441 continue;
8442+ } else {
8443+ mddev->recovery_running = 0;
8444+ up(&mddev->recovery_sem);
8445 }
8446 save_flags(flags);
8447 cli();
8448- mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE);
8449- spare->state |= (1 << MD_SYNC_DEVICE);
8450- spare->state |= (1 << MD_ACTIVE_DEVICE);
8451- sb->spare_disks--;
8452- sb->active_disks++;
8453- mddev->sb_dirty = 1;
8454- md_update_sb(mddev - md_dev);
8455+ if (!disk_faulty(spare)) {
8456+ /*
8457+ * the SPARE_ACTIVE diskop possibly changes the
8458+ * pointer too
8459+ */
8460+ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
8461+ mark_disk_sync(spare);
8462+ mark_disk_active(spare);
8463+ sb->active_disks++;
8464+ sb->spare_disks--;
8465+ }
8466 restore_flags(flags);
8467+ mddev->sb_dirty = 1;
8468+ md_update_sb(mddev);
8469+ goto restart;
8470 }
8471+ printk(KERN_INFO "md: recovery thread finished ...\n");
8472
8473 }
8474
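The recovery loop above reduces to a small decision on md_do_sync()'s return value: -EIO faults the spare and moves on to the next one, -EINTR backs out and releases the array, and success promotes the spare via DISKOP_SPARE_ACTIVE before the superblock is rewritten. A hypothetical condensation of that decision (after_sync() is an illustrative name, not a kernel function):

#include <errno.h>
#include <stdio.h>

static const char *after_sync(int err)
{
        if (err == -EIO)
                return "spare itself failed: mark it faulty/nonsync/inactive, try the next spare";
        if (err == -EINTR)
                return "recovery interrupted: deactivate the spare and release the array";
        if (err == 0)
                return "resync complete: activate the spare and write the superblock";
        return "unexpected error";
}

int main(void)
{
        int codes[] = { -EIO, -EINTR, 0 };
        int i;

        for (i = 0; i < 3; i++)
                printf("md_do_sync() -> %d: %s\n", codes[i], after_sync(codes[i]));
        return 0;
}
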
8475+int md_notify_reboot(struct notifier_block *this,
8476+ unsigned long code, void *x)
8477+{
8478+ struct md_list_head *tmp;
8479+ mddev_t *mddev;
8480+
8481+ if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
8482+ || (code == MD_SYS_POWER_OFF)) {
8483+
8484+ printk(KERN_INFO "stopping all md devices.\n");
8485+
8486+ ITERATE_MDDEV(mddev,tmp)
8487+ do_md_stop (mddev, 1);
8488+ /*
8489+ * certain more exotic SCSI devices are known to be
8490+ * volatile wrt too early system reboots. While the
8491+ * right place to handle this issue is the given
8492+ * driver, we do want to have a safe RAID driver ...
8493+ */
8494+ md_mdelay(1000*1);
8495+ }
8496+ return NOTIFY_DONE;
8497+}
8498+
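md_notify_reboot() above hooks the reboot notifier chain so every array is cleanly stopped (and its superblock written) before the machine goes down, followed by a one second delay as a safety margin for slow SCSI devices. A hypothetical user-space miniature of the notifier-chain pattern it relies on; the struct layout mirrors the static initializer visible below, and the event names here are stand-ins:

#include <stdio.h>

#define NOTIFY_DONE 0

struct notifier_block {
        int (*notifier_call)(struct notifier_block *self, unsigned long code, void *data);
        struct notifier_block *next;
        int priority;
};

enum { SYS_DOWN = 1, SYS_HALT, SYS_POWER_OFF };

static int md_notify_reboot(struct notifier_block *self, unsigned long code, void *data)
{
        (void)self; (void)data;
        if (code == SYS_DOWN || code == SYS_HALT || code == SYS_POWER_OFF)
                printf("stopping all md devices, then waiting ~1s for slow disks\n");
        return NOTIFY_DONE;
}

static struct notifier_block md_notifier = { md_notify_reboot, NULL, 0 };

/* Walk the chain the way the kernel does when the shutdown event fires. */
static void call_chain(struct notifier_block *nb, unsigned long code)
{
        for (; nb; nb = nb->next)
                nb->notifier_call(nb, code, NULL);
}

int main(void)
{
        call_chain(&md_notifier, SYS_DOWN);
        return 0;
}
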
8499+struct notifier_block md_notifier = {
8500+ md_notify_reboot,
8501+ NULL,
8502+ 0
8503+};
8504+
8505+md__initfunc(void raid_setup(char *str, int *ints))
8506+{
8507+ char tmpline[100];
8508+ int len, pos, nr, i;
8509+
8510+ len = strlen(str) + 1;
8511+ nr = 0;
8512+ pos = 0;
8513+
8514+ for (i = 0; i < len; i++) {
8515+ char c = str[i];
8516+
8517+ if (c == ',' || !c) {
8518+ tmpline[pos] = 0;
8519+ if (!strcmp(tmpline,"noautodetect"))
8520+ raid_setup_args.noautodetect = 1;
8521+ nr++;
8522+ pos = 0;
8523+ continue;
8524+ }
8525+ tmpline[pos] = c;
8526+ pos++;
8527+ }
8528+ raid_setup_args.set = 1;
8529+ return;
8530+}
8531+
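raid_setup() above parses the new "raid=" boot option: the string is tokenized on commas and, so far, only the "noautodetect" keyword is acted on (it suppresses the partition-type based autostart). A stand-alone sketch of the same scan, with a hypothetical input string and the token buffer bounded for safety (the kernel copy does not bound it):

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *str = "noautodetect";      /* example "raid=" argument    */
        char tmpline[100];
        int noautodetect = 0;
        int len = strlen(str) + 1;             /* include the terminating 0   */
        int pos = 0, i;

        /* Walk the string including its terminator; ',' or '\0' closes the
         * current token, which is then matched against known keywords. */
        for (i = 0; i < len; i++) {
                char c = str[i];

                if (c == ',' || !c) {
                        tmpline[pos] = 0;
                        if (!strcmp(tmpline, "noautodetect"))
                                noautodetect = 1;
                        pos = 0;
                        continue;
                }
                if (pos < (int)sizeof(tmpline) - 1)    /* bounded here */
                        tmpline[pos++] = c;
        }
        printf("noautodetect = %d\n", noautodetect);
        return 0;
}
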
8532 #ifdef CONFIG_MD_BOOT
8533 struct {
8534 int set;
8535 int ints[100];
8536 char str[100];
8537-} md_setup_args __initdata = {
8538+} md_setup_args md__initdata = {
8539 0,{0},{0}
8540 };
8541
8542 /* called from init/main.c */
8543-__initfunc(void md_setup(char *str,int *ints))
8544+md__initfunc(void md_setup(char *str,int *ints))
8545 {
8546 int i;
8547 for(i=0;i<=ints[0];i++) {
8548@@ -1228,21 +3831,24 @@
8549 return;
8550 }
8551
8552-__initfunc(void do_md_setup(char *str,int *ints))
8553+md__initfunc(void do_md_setup(char *str,int *ints))
8554 {
8555- int minor, pers, factor, fault;
8556+#if 0
8557+ int minor, pers, chunk_size, fault;
8558 kdev_t dev;
8559 int i=1;
8560
8561+ printk("i plan to phase this out --mingo\n");
8562+
8563 if(ints[0] < 4) {
8564- printk ("md: Too few Arguments (%d).\n", ints[0]);
8565+ printk (KERN_WARNING "md: Too few Arguments (%d).\n", ints[0]);
8566 return;
8567 }
8568
8569 minor=ints[i++];
8570
8571- if (minor >= MAX_MD_DEV) {
8572- printk ("md: Minor device number too high.\n");
8573+ if ((unsigned int)minor >= MAX_MD_DEVS) {
8574+ printk (KERN_WARNING "md: Minor device number too high.\n");
8575 return;
8576 }
8577
8578@@ -1252,18 +3858,20 @@
8579 case -1:
8580 #ifdef CONFIG_MD_LINEAR
8581 pers = LINEAR;
8582- printk ("md: Setting up md%d as linear device.\n",minor);
8583+ printk (KERN_INFO "md: Setting up md%d as linear device.\n",
8584+ minor);
8585 #else
8586- printk ("md: Linear mode not configured."
8587+ printk (KERN_WARNING "md: Linear mode not configured."
8588 "Recompile the kernel with linear mode enabled!\n");
8589 #endif
8590 break;
8591 case 0:
8592 pers = STRIPED;
8593 #ifdef CONFIG_MD_STRIPED
8594- printk ("md: Setting up md%d as a striped device.\n",minor);
8595+ printk (KERN_INFO "md: Setting up md%d as a striped device.\n",
8596+ minor);
8597 #else
8598- printk ("md: Striped mode not configured."
8599+ printk (KERN_WARNING "md: Striped mode not configured."
8600 "Recompile the kernel with striped mode enabled!\n");
8601 #endif
8602 break;
8603@@ -1278,79 +3886,145 @@
8604 break;
8605 */
8606 default:
8607- printk ("md: Unknown or not supported raid level %d.\n", ints[--i]);
8608+ printk (KERN_WARNING "md: Unknown or not supported raid level %d.\n", ints[--i]);
8609 return;
8610 }
8611
8612- if(pers) {
8613+ if (pers) {
8614
8615- factor=ints[i++]; /* Chunksize */
8616- fault =ints[i++]; /* Faultlevel */
8617+ chunk_size = ints[i++]; /* Chunksize */
8618+ fault = ints[i++]; /* Faultlevel */
8619
8620- pers=pers | factor | (fault << FAULT_SHIFT);
8621+ pers = pers | chunk_size | (fault << FAULT_SHIFT);
8622
8623- while( str && (dev = name_to_kdev_t(str))) {
8624- do_md_add (minor, dev);
8625- if((str = strchr (str, ',')) != NULL)
8626- str++;
8627- }
8628+ while( str && (dev = name_to_kdev_t(str))) {
8629+ do_md_add (minor, dev);
8630+ if((str = strchr (str, ',')) != NULL)
8631+ str++;
8632+ }
8633
8634- do_md_run (minor, pers);
8635- printk ("md: Loading md%d.\n",minor);
8636+ do_md_run (minor, pers);
8637+ printk (KERN_INFO "md: Loading md%d.\n",minor);
8638 }
8639-
8640+#endif
8641 }
8642 #endif
8643
8644+void hsm_init (void);
8645+void translucent_init (void);
8646 void linear_init (void);
8647 void raid0_init (void);
8648 void raid1_init (void);
8649 void raid5_init (void);
8650
8651-__initfunc(int md_init (void))
8652+md__initfunc(int md_init (void))
8653 {
8654- printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
8655- MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION,
8656- MAX_MD_DEV, MAX_REAL);
8657-
8658- if (register_blkdev (MD_MAJOR, "md", &md_fops))
8659- {
8660- printk ("Unable to get major %d for md\n", MD_MAJOR);
8661- return (-1);
8662- }
8663-
8664- blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST;
8665- blk_dev[MD_MAJOR].current_request=NULL;
8666- read_ahead[MD_MAJOR]=INT_MAX;
8667- memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev));
8668- md_gendisk.next=gendisk_head;
8669-
8670- gendisk_head=&md_gendisk;
8671-
8672-#if SUPPORT_RECONSTRUCTION
8673- if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL)
8674- printk("md: bug: md_sync_thread == NULL\n");
8675-#endif /* SUPPORT_RECONSTRUCTION */
8676+ static char * name = "mdrecoveryd";
8677+
8678+ printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n",
8679+ MD_MAJOR_VERSION, MD_MINOR_VERSION,
8680+ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL);
8681+
8682+ if (register_blkdev (MD_MAJOR, "md", &md_fops))
8683+ {
8684+ printk (KERN_ALERT "Unable to get major %d for md\n", MD_MAJOR);
8685+ return (-1);
8686+ }
8687+
8688+ blk_dev[MD_MAJOR].request_fn = DEVICE_REQUEST;
8689+ blk_dev[MD_MAJOR].current_request = NULL;
8690+ read_ahead[MD_MAJOR] = INT_MAX;
8691+ md_gendisk.next = gendisk_head;
8692+
8693+ gendisk_head = &md_gendisk;
8694+
8695+ md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
8696+ if (!md_recovery_thread)
8697+ printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n");
8698
8699+ md_register_reboot_notifier(&md_notifier);
8700+ md_register_sysctl();
8701+
8702+#ifdef CONFIG_MD_HSM
8703+ hsm_init ();
8704+#endif
8705+#ifdef CONFIG_MD_TRANSLUCENT
8706+ translucent_init ();
8707+#endif
8708 #ifdef CONFIG_MD_LINEAR
8709- linear_init ();
8710+ linear_init ();
8711 #endif
8712 #ifdef CONFIG_MD_STRIPED
8713- raid0_init ();
8714+ raid0_init ();
8715 #endif
8716 #ifdef CONFIG_MD_MIRRORING
8717- raid1_init ();
8718+ raid1_init ();
8719 #endif
8720 #ifdef CONFIG_MD_RAID5
8721- raid5_init ();
8722+ raid5_init ();
8723+#endif
8724+#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE)
8725+ /*
8726+ * pick a XOR routine, runtime.
8727+ */
8728+ calibrate_xor_block();
8729 #endif
8730- return (0);
8731+
8732+ return (0);
8733 }
8734
8735 #ifdef CONFIG_MD_BOOT
8736-__initfunc(void md_setup_drive(void))
8737+md__initfunc(void md_setup_drive(void))
8738 {
8739 if(md_setup_args.set)
8740 do_md_setup(md_setup_args.str, md_setup_args.ints);
8741 }
8742 #endif
8743+
8744+MD_EXPORT_SYMBOL(md_size);
8745+MD_EXPORT_SYMBOL(register_md_personality);
8746+MD_EXPORT_SYMBOL(unregister_md_personality);
8747+MD_EXPORT_SYMBOL(partition_name);
8748+MD_EXPORT_SYMBOL(md_error);
8749+MD_EXPORT_SYMBOL(md_recover_arrays);
8750+MD_EXPORT_SYMBOL(md_register_thread);
8751+MD_EXPORT_SYMBOL(md_unregister_thread);
8752+MD_EXPORT_SYMBOL(md_update_sb);
8753+MD_EXPORT_SYMBOL(md_map);
8754+MD_EXPORT_SYMBOL(md_wakeup_thread);
8755+MD_EXPORT_SYMBOL(md_do_sync);
8756+MD_EXPORT_SYMBOL(md_print_devices);
8757+MD_EXPORT_SYMBOL(find_rdev_nr);
8758+MD_EXPORT_SYMBOL(md_check_ordering);
8759+MD_EXPORT_SYMBOL(md_interrupt_thread);
8760+MD_EXPORT_SYMBOL(mddev_map);
8761+
8762+#ifdef CONFIG_PROC_FS
8763+static struct proc_dir_entry proc_md = {
8764+ PROC_MD, 6, "mdstat",
8765+ S_IFREG | S_IRUGO, 1, 0, 0,
8766+ 0, &proc_array_inode_operations,
8767+};
8768+#endif
8769+
8770+static void md_geninit (struct gendisk *gdisk)
8771+{
8772+ int i;
8773+
8774+ for(i = 0; i < MAX_MD_DEVS; i++) {
8775+ md_blocksizes[i] = 1024;
8776+ md_maxreadahead[i] = MD_READAHEAD;
8777+ md_gendisk.part[i].start_sect = -1; /* avoid partition check */
8778+ md_gendisk.part[i].nr_sects = 0;
8779+ }
8780+
8781+ printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
8782+
8783+ blksize_size[MD_MAJOR] = md_blocksizes;
8784+ md_set_global_readahead(md_maxreadahead);
8785+
8786+#ifdef CONFIG_PROC_FS
8787+ proc_register(&proc_root, &proc_md);
8788+#endif
8789+}
8790+
8791--- linux/drivers/block/raid0.c.orig Mon Sep 4 19:39:16 2000
8792+++ linux/drivers/block/raid0.c Fri Nov 23 11:18:18 2001
8793@@ -1,4 +1,3 @@
8794-
8795 /*
8796 raid0.c : Multiple Devices driver for Linux
8797 Copyright (C) 1994-96 Marc ZYNGIER
8798@@ -18,146 +17,201 @@
8799 */
8800
8801 #include <linux/module.h>
8802-#include <linux/md.h>
8803-#include <linux/raid0.h>
8804-#include <linux/vmalloc.h>
8805+#include <linux/raid/raid0.h>
8806
8807 #define MAJOR_NR MD_MAJOR
8808 #define MD_DRIVER
8809 #define MD_PERSONALITY
8810
8811-static int create_strip_zones (int minor, struct md_dev *mddev)
8812+static int create_strip_zones (mddev_t *mddev)
8813 {
8814- int i, j, c=0;
8815- int current_offset=0;
8816- struct real_dev *smallest_by_zone;
8817- struct raid0_data *data=(struct raid0_data *) mddev->private;
8818-
8819- data->nr_strip_zones=1;
8820-
8821- for (i=1; i<mddev->nb_dev; i++)
8822- {
8823- for (j=0; j<i; j++)
8824- if (mddev->devices[i].size==mddev->devices[j].size)
8825- {
8826- c=1;
8827- break;
8828- }
8829-
8830- if (!c)
8831- data->nr_strip_zones++;
8832-
8833- c=0;
8834- }
8835-
8836- if ((data->strip_zone=vmalloc(sizeof(struct strip_zone)*data->nr_strip_zones)) == NULL)
8837- return 1;
8838-
8839- data->smallest=NULL;
8840-
8841- for (i=0; i<data->nr_strip_zones; i++)
8842- {
8843- data->strip_zone[i].dev_offset=current_offset;
8844- smallest_by_zone=NULL;
8845- c=0;
8846-
8847- for (j=0; j<mddev->nb_dev; j++)
8848- if (mddev->devices[j].size>current_offset)
8849- {
8850- data->strip_zone[i].dev[c++]=mddev->devices+j;
8851- if (!smallest_by_zone ||
8852- smallest_by_zone->size > mddev->devices[j].size)
8853- smallest_by_zone=mddev->devices+j;
8854- }
8855-
8856- data->strip_zone[i].nb_dev=c;
8857- data->strip_zone[i].size=(smallest_by_zone->size-current_offset)*c;
8858-
8859- if (!data->smallest ||
8860- data->smallest->size > data->strip_zone[i].size)
8861- data->smallest=data->strip_zone+i;
8862-
8863- data->strip_zone[i].zone_offset=i ? (data->strip_zone[i-1].zone_offset+
8864- data->strip_zone[i-1].size) : 0;
8865- current_offset=smallest_by_zone->size;
8866- }
8867- return 0;
8868+ int i, c, j, j1, j2;
8869+ int current_offset, curr_zone_offset;
8870+ raid0_conf_t *conf = mddev_to_conf(mddev);
8871+ mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
8872+
8873+ /*
8874+ * The number of 'same size groups'
8875+ */
8876+ conf->nr_strip_zones = 0;
8877+
8878+ ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
8879+ printk("raid0: looking at %s\n", partition_name(rdev1->dev));
8880+ c = 0;
8881+ ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
8882+ printk("raid0: comparing %s(%d) with %s(%d)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size);
8883+ if (rdev2 == rdev1) {
8884+ printk("raid0: END\n");
8885+ break;
8886+ }
8887+ if (rdev2->size == rdev1->size)
8888+ {
8889+ /*
8890+ * Not unique, dont count it as a new
8891+ * group
8892+ */
8893+ printk("raid0: EQUAL\n");
8894+ c = 1;
8895+ break;
8896+ }
8897+ printk("raid0: NOT EQUAL\n");
8898+ }
8899+ if (!c) {
8900+ printk("raid0: ==> UNIQUE\n");
8901+ conf->nr_strip_zones++;
8902+ printk("raid0: %d zones\n", conf->nr_strip_zones);
8903+ }
8904+ }
8905+ printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
8906+
8907+ conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
8908+ conf->nr_strip_zones);
8909+ if (!conf->strip_zone)
8910+ return 1;
8911+
8912+
8913+ conf->smallest = NULL;
8914+ current_offset = 0;
8915+ curr_zone_offset = 0;
8916+
8917+ for (i = 0; i < conf->nr_strip_zones; i++)
8918+ {
8919+ struct strip_zone *zone = conf->strip_zone + i;
8920+
8921+ printk("zone %d\n", i);
8922+ zone->dev_offset = current_offset;
8923+ smallest = NULL;
8924+ c = 0;
8925+
8926+ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
8927+
8928+ printk(" checking %s ...", partition_name(rdev->dev));
8929+ if (rdev->size > current_offset)
8930+ {
8931+ printk(" contained as device %d\n", c);
8932+ zone->dev[c] = rdev;
8933+ c++;
8934+ if (!smallest || (rdev->size <smallest->size)) {
8935+ smallest = rdev;
8936+ printk(" (%d) is smallest!.\n", rdev->size);
8937+ }
8938+ } else
8939+ printk(" nope.\n");
8940+ }
8941+
8942+ zone->nb_dev = c;
8943+ zone->size = (smallest->size - current_offset) * c;
8944+ printk(" zone->nb_dev: %d, size: %d\n",zone->nb_dev,zone->size);
8945+
8946+ if (!conf->smallest || (zone->size < conf->smallest->size))
8947+ conf->smallest = zone;
8948+
8949+ zone->zone_offset = curr_zone_offset;
8950+ curr_zone_offset += zone->size;
8951+
8952+ current_offset = smallest->size;
8953+ printk("current zone offset: %d\n", current_offset);
8954+ }
8955+ printk("done.\n");
8956+ return 0;
8957 }
8958
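create_strip_zones() above builds one zone per distinct member size: a zone starts where the previously smallest device ran out, includes every device still larger than that offset, and contributes (smallest remaining size - offset) * nb_dev blocks. With hypothetical, already size-ordered members, the same bookkeeping looks like this:

#include <stdio.h>

int main(void)
{
        /* Hypothetical member sizes in blocks (what rdev->size holds). */
        long size[4] = { 1000, 1000, 3000, 4000 };
        int nb_dev = 4;
        long current_offset = 0, zone_offset = 0;
        int zone = 0;

        while (current_offset < size[nb_dev - 1]) {
                long smallest = 0, zsize;
                int c = 0, i;

                /* Count devices still extending past the current offset and
                 * remember the smallest of them, as the loop above does. */
                for (i = 0; i < nb_dev; i++)
                        if (size[i] > current_offset) {
                                c++;
                                if (!smallest || size[i] < smallest)
                                        smallest = size[i];
                        }

                zsize = (smallest - current_offset) * c;
                printf("zone %d: dev_offset=%ld nb_dev=%d size=%ld zone_offset=%ld\n",
                       zone, current_offset, c, zsize, zone_offset);

                zone_offset += zsize;
                current_offset = smallest;
                zone++;
        }
        return 0;
}

For the sizes above this yields three zones (striped over 4, 2 and 1 devices respectively), matching the "one zone per distinct size" count computed in the first loop of the hunk.
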
8959-static int raid0_run (int minor, struct md_dev *mddev)
8960+static int raid0_run (mddev_t *mddev)
8961 {
8962- int cur=0, i=0, size, zone0_size, nb_zone;
8963- struct raid0_data *data;
8964-
8965- MOD_INC_USE_COUNT;
8966+ int cur=0, i=0, size, zone0_size, nb_zone;
8967+ raid0_conf_t *conf;
8968
8969- if ((mddev->private=vmalloc (sizeof (struct raid0_data))) == NULL) return 1;
8970- data=(struct raid0_data *) mddev->private;
8971-
8972- if (create_strip_zones (minor, mddev))
8973- {
8974- vfree(data);
8975- return 1;
8976- }
8977-
8978- nb_zone=data->nr_zones=
8979- md_size[minor]/data->smallest->size +
8980- (md_size[minor]%data->smallest->size ? 1 : 0);
8981-
8982- printk ("raid0 : Allocating %ld bytes for hash.\n",(long)sizeof(struct raid0_hash)*nb_zone);
8983- if ((data->hash_table=vmalloc (sizeof (struct raid0_hash)*nb_zone)) == NULL)
8984- {
8985- vfree(data->strip_zone);
8986- vfree(data);
8987- return 1;
8988- }
8989- size=data->strip_zone[cur].size;
8990-
8991- i=0;
8992- while (cur<data->nr_strip_zones)
8993- {
8994- data->hash_table[i].zone0=data->strip_zone+cur;
8995-
8996- if (size>=data->smallest->size)/* If we completely fill the slot */
8997- {
8998- data->hash_table[i++].zone1=NULL;
8999- size-=data->smallest->size;
9000-
9001- if (!size)
9002- {
9003- if (++cur==data->nr_strip_zones) continue;
9004- size=data->strip_zone[cur].size;
9005- }
9006-
9007- continue;
9008- }
9009-
9010- if (++cur==data->nr_strip_zones) /* Last dev, set unit1 as NULL */
9011- {
9012- data->hash_table[i].zone1=NULL;
9013- continue;
9014- }
9015-
9016- zone0_size=size; /* Here, we use a 2nd dev to fill the slot */
9017- size=data->strip_zone[cur].size;
9018- data->hash_table[i++].zone1=data->strip_zone+cur;
9019- size-=(data->smallest->size - zone0_size);
9020- }
9021+ MOD_INC_USE_COUNT;
9022
9023- return (0);
9024+ conf = vmalloc(sizeof (raid0_conf_t));
9025+ if (!conf)
9026+ goto out;
9027+ mddev->private = (void *)conf;
9028+
9029+ if (md_check_ordering(mddev)) {
9030+ printk("raid0: disks are not ordered, aborting!\n");
9031+ goto out_free_conf;
9032+ }
9033+
9034+ if (create_strip_zones (mddev))
9035+ goto out_free_conf;
9036+
9037+ printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]);
9038+ printk("raid0 : conf->smallest->size is %d blocks.\n", conf->smallest->size);
9039+ nb_zone = md_size[mdidx(mddev)]/conf->smallest->size +
9040+ (md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
9041+ printk("raid0 : nb_zone is %d.\n", nb_zone);
9042+ conf->nr_zones = nb_zone;
9043+
9044+ printk("raid0 : Allocating %d bytes for hash.\n",
9045+ sizeof(struct raid0_hash)*nb_zone);
9046+
9047+ conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
9048+ if (!conf->hash_table)
9049+ goto out_free_zone_conf;
9050+ size = conf->strip_zone[cur].size;
9051+
9052+ i = 0;
9053+ while (cur < conf->nr_strip_zones) {
9054+ conf->hash_table[i].zone0 = conf->strip_zone + cur;
9055+
9056+ /*
9057+ * If we completely fill the slot
9058+ */
9059+ if (size >= conf->smallest->size) {
9060+ conf->hash_table[i++].zone1 = NULL;
9061+ size -= conf->smallest->size;
9062+
9063+ if (!size) {
9064+ if (++cur == conf->nr_strip_zones)
9065+ continue;
9066+ size = conf->strip_zone[cur].size;
9067+ }
9068+ continue;
9069+ }
9070+ if (++cur == conf->nr_strip_zones) {
9071+ /*
9072+ * Last dev, set unit1 as NULL
9073+ */
9074+ conf->hash_table[i].zone1=NULL;
9075+ continue;
9076+ }
9077+
9078+ /*
9079+ * Here we use a 2nd dev to fill the slot
9080+ */
9081+ zone0_size = size;
9082+ size = conf->strip_zone[cur].size;
9083+ conf->hash_table[i++].zone1 = conf->strip_zone + cur;
9084+ size -= (conf->smallest->size - zone0_size);
9085+ }
9086+ return 0;
9087+
9088+out_free_zone_conf:
9089+ vfree(conf->strip_zone);
9090+ conf->strip_zone = NULL;
9091+
9092+out_free_conf:
9093+ vfree(conf);
9094+ mddev->private = NULL;
9095+out:
9096+ MOD_DEC_USE_COUNT;
9097+ return 1;
9098 }
9099
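raid0_run() above sizes its hash table so that each slot covers conf->smallest->size logical blocks; a slot normally points at one zone, and at two when it straddles a zone boundary, which is what lets raid0_map() locate the zone with a single division. A hypothetical fill of such a table, mirroring the loop above with two made-up zones:

#include <stdio.h>

struct zone { long zone_offset, size; };
struct slot { struct zone *zone0, *zone1; };

int main(void)
{
        /* Hypothetical zones as create_strip_zones() might produce them. */
        struct zone z[2] = { { 0, 2500 }, { 2500, 1000 } };
        int nr_zones = 2;
        long smallest = 1000;                  /* size of the smallest zone   */
        long md_size = 3500;                   /* total array size, blocks    */
        int nb_slot = md_size / smallest + (md_size % smallest ? 1 : 0);
        struct slot hash[8];                   /* big enough for the demo     */
        int i = 0, cur = 0;
        long size = z[0].size, zone0_size;

        /* Same fill loop as raid0_run(): each slot covers 'smallest' logical
         * blocks; a slot crossing a zone boundary records both zones. */
        while (cur < nr_zones) {
                hash[i].zone0 = &z[cur];
                if (size >= smallest) {
                        hash[i++].zone1 = NULL;
                        size -= smallest;
                        if (!size && ++cur < nr_zones)
                                size = z[cur].size;
                        continue;
                }
                if (++cur == nr_zones) {
                        hash[i].zone1 = NULL;
                        continue;
                }
                zone0_size = size;
                size = z[cur].size;
                hash[i++].zone1 = &z[cur];
                size -= smallest - zone0_size;
        }

        for (i = 0; i < nb_slot; i++)
                printf("slot %d: zone at %ld%s\n", i, hash[i].zone0->zone_offset,
                       hash[i].zone1 ? " + next zone (boundary slot)" : "");
        return 0;
}

Because every slot is exactly one smallest-zone in size, any logical block maps to exactly one slot, so the lookup costs one divide plus at most one comparison.
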
9100-
9101-static int raid0_stop (int minor, struct md_dev *mddev)
9102+static int raid0_stop (mddev_t *mddev)
9103 {
9104- struct raid0_data *data=(struct raid0_data *) mddev->private;
9105+ raid0_conf_t *conf = mddev_to_conf(mddev);
9106
9107- vfree (data->hash_table);
9108- vfree (data->strip_zone);
9109- vfree (data);
9110+ vfree (conf->hash_table);
9111+ conf->hash_table = NULL;
9112+ vfree (conf->strip_zone);
9113+ conf->strip_zone = NULL;
9114+ vfree (conf);
9115+ mddev->private = NULL;
9116
9117- MOD_DEC_USE_COUNT;
9118- return 0;
9119+ MOD_DEC_USE_COUNT;
9120+ return 0;
9121 }
9122
9123 /*
9124@@ -167,135 +221,140 @@
9125 * Of course, those facts may not be valid anymore (and surely won't...)
9126 * Hey guys, there's some work out there ;-)
9127 */
9128-static int raid0_map (struct md_dev *mddev, kdev_t *rdev,
9129+static int raid0_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
9130 unsigned long *rsector, unsigned long size)
9131 {
9132- struct raid0_data *data=(struct raid0_data *) mddev->private;
9133- static struct raid0_hash *hash;
9134- struct strip_zone *zone;
9135- struct real_dev *tmp_dev;
9136- int blk_in_chunk, factor, chunk, chunk_size;
9137- long block, rblock;
9138-
9139- factor=FACTOR(mddev);
9140- chunk_size=(1UL << FACTOR_SHIFT(factor));
9141- block=*rsector >> 1;
9142- hash=data->hash_table+(block/data->smallest->size);
9143-
9144- if (hash - data->hash_table > data->nr_zones)
9145- {
9146- printk(KERN_DEBUG "raid0_map: invalid block %li\n", block);
9147- return -1;
9148- }
9149-
9150- /* Sanity check */
9151- if ((chunk_size*2)<(*rsector % (chunk_size*2))+size)
9152- {
9153- printk ("raid0_convert : can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
9154- return (-1);
9155- }
9156-
9157- if (block >= (hash->zone0->size +
9158- hash->zone0->zone_offset))
9159- {
9160- if (!hash->zone1)
9161- {
9162- printk ("raid0_convert : hash->zone1==NULL for block %ld\n", block);
9163- return (-1);
9164- }
9165-
9166- zone=hash->zone1;
9167- }
9168- else
9169- zone=hash->zone0;
9170+ raid0_conf_t *conf = mddev_to_conf(mddev);
9171+ struct raid0_hash *hash;
9172+ struct strip_zone *zone;
9173+ mdk_rdev_t *tmp_dev;
9174+ int blk_in_chunk, chunksize_bits, chunk, chunk_size;
9175+ long block, rblock;
9176+
9177+ chunk_size = mddev->param.chunk_size >> 10;
9178+ chunksize_bits = ffz(~chunk_size);
9179+ block = *rsector >> 1;
9180+ hash = conf->hash_table + block / conf->smallest->size;
9181+
9182+ if (hash - conf->hash_table > conf->nr_zones) {
9183+ printk(KERN_DEBUG "raid0_map: invalid block %lu\n", block);
9184+ return -1;
9185+ }
9186+
9187+ /* Sanity check */
9188+ if ((chunk_size * 2) < (*rsector % (chunk_size * 2)) + size)
9189+ goto bad_map;
9190+
9191+ if (!hash)
9192+ goto bad_hash;
9193+
9194+ if (!hash->zone0)
9195+ goto bad_zone0;
9196+
9197+ if (block >= (hash->zone0->size + hash->zone0->zone_offset)) {
9198+ if (!hash->zone1)
9199+ goto bad_zone1;
9200+ zone = hash->zone1;
9201+ } else
9202+ zone = hash->zone0;
9203
9204- blk_in_chunk=block & (chunk_size -1);
9205- chunk=(block - zone->zone_offset) / (zone->nb_dev<<FACTOR_SHIFT(factor));
9206- tmp_dev=zone->dev[(block >> FACTOR_SHIFT(factor)) % zone->nb_dev];
9207- rblock=(chunk << FACTOR_SHIFT(factor)) + blk_in_chunk + zone->dev_offset;
9208+ blk_in_chunk = block & (chunk_size -1);
9209+ chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
9210+ tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev];
9211+ rblock = (chunk << chunksize_bits) + blk_in_chunk + zone->dev_offset;
9212
9213- *rdev=tmp_dev->dev;
9214- *rsector=rblock<<1;
9215+ *rdev = tmp_dev->dev;
9216+ *rsector = rblock << 1;
9217
9218- return (0);
9219+ return 0;
9220+
9221+bad_map:
9222+ printk ("raid0_map bug: can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
9223+ return -1;
9224+bad_hash:
9225+ printk("raid0_map bug: hash==NULL for block %ld\n", block);
9226+ return -1;
9227+bad_zone0:
9228+ printk ("raid0_map bug: hash->zone0==NULL for block %ld\n", block);
9229+ return -1;
9230+bad_zone1:
9231+ printk ("raid0_map bug: hash->zone1==NULL for block %ld\n", block);
9232+ return -1;
9233 }
9234
9235
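raid0_map() above drops the old FACTOR_SHIFT macros in favour of chunksize_bits = ffz(~chunk_size), i.e. log2 of the chunk size in 1K blocks, and then splits a logical block into chunk number, block-within-chunk and member index. The arithmetic in isolation, with hypothetical zone parameters:

#include <stdio.h>

int main(void)
{
        int  chunk_size = 32;                  /* chunk size in 1K blocks      */
        int  chunksize_bits = 0;
        int  nb_dev = 3;                       /* devices in the chosen zone   */
        long block = 12345;                    /* logical 1K block to map      */
        long zone_offset = 0, dev_offset = 0;  /* where the zone starts        */
        long blk_in_chunk, chunk, rblock;
        int  dev;

        /* ffz(~x) in the hunk above is log2(x) for a power-of-two chunk
         * size; open-coded here. */
        while ((1 << chunksize_bits) < chunk_size)
                chunksize_bits++;

        blk_in_chunk = block & (chunk_size - 1);
        chunk  = (block - zone_offset) / (nb_dev << chunksize_bits);
        dev    = (block >> chunksize_bits) % nb_dev;
        rblock = (chunk << chunksize_bits) + blk_in_chunk + dev_offset;

        printf("logical block %ld -> member %d, physical block %ld\n",
               block, dev, rblock);
        return 0;
}
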
9236-static int raid0_status (char *page, int minor, struct md_dev *mddev)
9237+static int raid0_status (char *page, mddev_t *mddev)
9238 {
9239- int sz=0;
9240+ int sz = 0;
9241 #undef MD_DEBUG
9242 #ifdef MD_DEBUG
9243- int j, k;
9244- struct raid0_data *data=(struct raid0_data *) mddev->private;
9245+ int j, k;
9246+ raid0_conf_t *conf = mddev_to_conf(mddev);
9247
9248- sz+=sprintf (page+sz, " ");
9249- for (j=0; j<data->nr_zones; j++)
9250- {
9251- sz+=sprintf (page+sz, "[z%d",
9252- data->hash_table[j].zone0-data->strip_zone);
9253- if (data->hash_table[j].zone1)
9254- sz+=sprintf (page+sz, "/z%d] ",
9255- data->hash_table[j].zone1-data->strip_zone);
9256- else
9257- sz+=sprintf (page+sz, "] ");
9258- }
9259+ sz += sprintf(page + sz, " ");
9260+ for (j = 0; j < conf->nr_zones; j++) {
9261+ sz += sprintf(page + sz, "[z%d",
9262+ conf->hash_table[j].zone0 - conf->strip_zone);
9263+ if (conf->hash_table[j].zone1)
9264+ sz += sprintf(page+sz, "/z%d] ",
9265+ conf->hash_table[j].zone1 - conf->strip_zone);
9266+ else
9267+ sz += sprintf(page+sz, "] ");
9268+ }
9269
9270- sz+=sprintf (page+sz, "\n");
9271+ sz += sprintf(page + sz, "\n");
9272
9273- for (j=0; j<data->nr_strip_zones; j++)
9274- {
9275- sz+=sprintf (page+sz, " z%d=[", j);
9276- for (k=0; k<data->strip_zone[j].nb_dev; k++)
9277- sz+=sprintf (page+sz, "%s/",
9278- partition_name(data->strip_zone[j].dev[k]->dev));
9279- sz--;
9280- sz+=sprintf (page+sz, "] zo=%d do=%d s=%d\n",
9281- data->strip_zone[j].zone_offset,
9282- data->strip_zone[j].dev_offset,
9283- data->strip_zone[j].size);
9284- }
9285+ for (j = 0; j < conf->nr_strip_zones; j++) {
9286+ sz += sprintf(page + sz, " z%d=[", j);
9287+ for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
9288+ sz += sprintf (page+sz, "%s/", partition_name(
9289+ conf->strip_zone[j].dev[k]->dev));
9290+ sz--;
9291+ sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
9292+ conf->strip_zone[j].zone_offset,
9293+ conf->strip_zone[j].dev_offset,
9294+ conf->strip_zone[j].size);
9295+ }
9296 #endif
9297- sz+=sprintf (page+sz, " %dk chunks", 1<<FACTOR_SHIFT(FACTOR(mddev)));
9298- return sz;
9299+ sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
9300+ return sz;
9301 }
9302
9303-
9304-static struct md_personality raid0_personality=
9305+static mdk_personality_t raid0_personality=
9306 {
9307- "raid0",
9308- raid0_map,
9309- NULL, /* no special make_request */
9310- NULL, /* no special end_request */
9311- raid0_run,
9312- raid0_stop,
9313- raid0_status,
9314- NULL, /* no ioctls */
9315- 0,
9316- NULL, /* no error_handler */
9317- NULL, /* hot_add_disk */
9318- NULL, /* hot_remove_disk */
9319- NULL /* mark_spare */
9320+ "raid0",
9321+ raid0_map,
9322+ NULL, /* no special make_request */
9323+ NULL, /* no special end_request */
9324+ raid0_run,
9325+ raid0_stop,
9326+ raid0_status,
9327+ NULL, /* no ioctls */
9328+ 0,
9329+ NULL, /* no error_handler */
9330+ NULL, /* no diskop */
9331+ NULL, /* no stop resync */
9332+ NULL /* no restart resync */
9333 };
9334
9335-
9336 #ifndef MODULE
9337
9338 void raid0_init (void)
9339 {
9340- register_md_personality (RAID0, &raid0_personality);
9341+ register_md_personality (RAID0, &raid0_personality);
9342 }
9343
9344 #else
9345
9346 int init_module (void)
9347 {
9348- return (register_md_personality (RAID0, &raid0_personality));
9349+ return (register_md_personality (RAID0, &raid0_personality));
9350 }
9351
9352 void cleanup_module (void)
9353 {
9354- unregister_md_personality (RAID0);
9355+ unregister_md_personality (RAID0);
9356 }
9357
9358 #endif
9359+
9360--- linux/drivers/block/raid1.c.orig Mon Dec 11 01:49:41 2000
9361+++ linux/drivers/block/raid1.c Fri Nov 23 11:18:18 2001
9362@@ -1,6 +1,6 @@
9363-/************************************************************************
9364+/*
9365 * raid1.c : Multiple Devices driver for Linux
9366- * Copyright (C) 1996 Ingo Molnar, Miguel de Icaza, Gadi Oxman
9367+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
9368 *
9369 * RAID-1 management functions.
9370 *
9371@@ -15,50 +15,55 @@
9372 */
9373
9374 #include <linux/module.h>
9375-#include <linux/locks.h>
9376 #include <linux/malloc.h>
9377-#include <linux/md.h>
9378-#include <linux/raid1.h>
9379-#include <asm/bitops.h>
9380+#include <linux/raid/raid1.h>
9381 #include <asm/atomic.h>
9382
9383 #define MAJOR_NR MD_MAJOR
9384 #define MD_DRIVER
9385 #define MD_PERSONALITY
9386
9387-/*
9388- * The following can be used to debug the driver
9389- */
9390-/*#define RAID1_DEBUG*/
9391-#ifdef RAID1_DEBUG
9392-#define PRINTK(x) do { printk x; } while (0);
9393-#else
9394-#define PRINTK(x) do { ; } while (0);
9395-#endif
9396+#define MAX_LINEAR_SECTORS 128
9397
9398 #define MAX(a,b) ((a) > (b) ? (a) : (b))
9399 #define MIN(a,b) ((a) < (b) ? (a) : (b))
9400
9401-static struct md_personality raid1_personality;
9402-static struct md_thread *raid1_thread = NULL;
9403+static mdk_personality_t raid1_personality;
9404 struct buffer_head *raid1_retry_list = NULL;
9405
9406-static int __raid1_map (struct md_dev *mddev, kdev_t *rdev,
9407+static void * raid1_kmalloc (int size)
9408+{
9409+ void * ptr;
9410+ /*
9411+ * now we are rather fault tolerant than nice, but
9412+ * there are a couple of places in the RAID code where we
9413+ * simply can not afford to fail an allocation because
9414+ * there is no failure return path (eg. make_request())
9415+ */
9416+ while (!(ptr = kmalloc (sizeof (raid1_conf_t), GFP_BUFFER))) {
9417+ printk ("raid1: out of memory, retrying...\n");
9418+ current->state = TASK_UNINTERRUPTIBLE;
9419+ schedule_timeout(HZ/10);
9420+ }
9421+
9422+ memset(ptr, 0, size);
9423+ return ptr;
9424+}
9425+
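raid1_kmalloc() above centralizes the "never fail, just retry" allocation the driver needs on paths with no error return (note that, as written, it passes sizeof(raid1_conf_t) to kmalloc() rather than its size argument, so every caller gets a conf-sized buffer regardless of what it asked for). A user-space sketch of the intended retry pattern, allocating the requested size:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Retry-until-success allocator: never return NULL, just keep asking and
 * back off on failure. The kernel version sleeps for HZ/10 between tries;
 * the back-off is left as a comment here. */
static void *retry_alloc(size_t size)
{
        void *ptr;

        while (!(ptr = malloc(size))) {
                fprintf(stderr, "out of memory, retrying...\n");
                /* sleep briefly before the next attempt */
        }
        memset(ptr, 0, size);
        return ptr;
}

int main(void)
{
        char *buf = retry_alloc(64);

        printf("got a zeroed %d byte buffer at %p\n", 64, (void *)buf);
        free(buf);
        return 0;
}
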
9426+static int __raid1_map (mddev_t *mddev, kdev_t *rdev,
9427 unsigned long *rsector, unsigned long size)
9428 {
9429- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9430- int i, n = raid_conf->raid_disks;
9431+ raid1_conf_t *conf = mddev_to_conf(mddev);
9432+ int i, disks = MD_SB_DISKS;
9433
9434 /*
9435 * Later we do read balancing on the read side
9436 * now we use the first available disk.
9437 */
9438
9439- PRINTK(("raid1_map().\n"));
9440-
9441- for (i=0; i<n; i++) {
9442- if (raid_conf->mirrors[i].operational) {
9443- *rdev = raid_conf->mirrors[i].dev;
9444+ for (i = 0; i < disks; i++) {
9445+ if (conf->mirrors[i].operational) {
9446+ *rdev = conf->mirrors[i].dev;
9447 return (0);
9448 }
9449 }
9450@@ -67,29 +72,29 @@
9451 return (-1);
9452 }
9453
9454-static int raid1_map (struct md_dev *mddev, kdev_t *rdev,
9455+static int raid1_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
9456 unsigned long *rsector, unsigned long size)
9457 {
9458 return 0;
9459 }
9460
9461-void raid1_reschedule_retry (struct buffer_head *bh)
9462+static void raid1_reschedule_retry (struct buffer_head *bh)
9463 {
9464 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
9465-
9466- PRINTK(("raid1_reschedule_retry().\n"));
9467+ mddev_t *mddev = r1_bh->mddev;
9468+ raid1_conf_t *conf = mddev_to_conf(mddev);
9469
9470 r1_bh->next_retry = raid1_retry_list;
9471 raid1_retry_list = bh;
9472- md_wakeup_thread(raid1_thread);
9473+ md_wakeup_thread(conf->thread);
9474 }
9475
9476 /*
9477- * raid1_end_buffer_io() is called when we have finished servicing a mirrored
9478+ * raid1_end_bh_io() is called when we have finished servicing a mirrored
9479 * operation and are ready to return a success/failure code to the buffer
9480 * cache layer.
9481 */
9482-static inline void raid1_end_buffer_io(struct raid1_bh *r1_bh, int uptodate)
9483+static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
9484 {
9485 struct buffer_head *bh = r1_bh->master_bh;
9486
9487@@ -97,8 +102,6 @@
9488 kfree(r1_bh);
9489 }
9490
9491-int raid1_one_error=0;
9492-
9493 void raid1_end_request (struct buffer_head *bh, int uptodate)
9494 {
9495 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
9496@@ -106,12 +109,7 @@
9497
9498 save_flags(flags);
9499 cli();
9500- PRINTK(("raid1_end_request().\n"));
9501
9502- if (raid1_one_error) {
9503- raid1_one_error=0;
9504- uptodate=0;
9505- }
9506 /*
9507 * this branch is our 'one mirror IO has finished' event handler:
9508 */
9509@@ -136,15 +134,11 @@
9510 */
9511
9512 if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
9513-
9514- PRINTK(("raid1_end_request(), read branch.\n"));
9515-
9516 /*
9517 * we have only one buffer_head on the read side
9518 */
9519 if (uptodate) {
9520- PRINTK(("raid1_end_request(), read branch, uptodate.\n"));
9521- raid1_end_buffer_io(r1_bh, uptodate);
9522+ raid1_end_bh_io(r1_bh, uptodate);
9523 restore_flags(flags);
9524 return;
9525 }
9526@@ -152,71 +146,56 @@
9527 * oops, read error:
9528 */
9529 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
9530- kdevname(bh->b_dev), bh->b_blocknr);
9531- raid1_reschedule_retry (bh);
9532+ partition_name(bh->b_dev), bh->b_blocknr);
9533+ raid1_reschedule_retry(bh);
9534 restore_flags(flags);
9535 return;
9536 }
9537
9538 /*
9539- * WRITE or WRITEA.
9540- */
9541- PRINTK(("raid1_end_request(), write branch.\n"));
9542-
9543- /*
9544+ * WRITE:
9545+ *
9546 * Let's see if all mirrored write operations have finished
9547- * already [we have irqs off, so we can decrease]:
9548+ * already.
9549 */
9550
9551- if (!--r1_bh->remaining) {
9552- struct md_dev *mddev = r1_bh->mddev;
9553- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9554- int i, n = raid_conf->raid_disks;
9555-
9556- PRINTK(("raid1_end_request(), remaining == 0.\n"));
9557+ if (atomic_dec_and_test(&r1_bh->remaining)) {
9558+ int i, disks = MD_SB_DISKS;
9559
9560- for ( i=0; i<n; i++)
9561- if (r1_bh->mirror_bh[i]) kfree(r1_bh->mirror_bh[i]);
9562+ for ( i = 0; i < disks; i++)
9563+ if (r1_bh->mirror_bh[i])
9564+ kfree(r1_bh->mirror_bh[i]);
9565
9566- raid1_end_buffer_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
9567+ raid1_end_bh_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
9568 }
9569- else PRINTK(("raid1_end_request(), remaining == %u.\n", r1_bh->remaining));
9570 restore_flags(flags);
9571 }
9572
9573-/* This routine checks if the undelying device is an md device and in that
9574- * case it maps the blocks before putting the request on the queue
9575+/*
9576+ * This routine checks if the undelying device is an md device
9577+ * and in that case it maps the blocks before putting the
9578+ * request on the queue
9579 */
9580-static inline void
9581-map_and_make_request (int rw, struct buffer_head *bh)
9582+static void map_and_make_request (int rw, struct buffer_head *bh)
9583 {
9584 if (MAJOR (bh->b_rdev) == MD_MAJOR)
9585- md_map (MINOR (bh->b_rdev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
9586+ md_map (bh->b_rdev, &bh->b_rdev,
9587+ &bh->b_rsector, bh->b_size >> 9);
9588 clear_bit(BH_Lock, &bh->b_state);
9589 make_request (MAJOR (bh->b_rdev), rw, bh);
9590 }
9591
9592-static int
9593-raid1_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
9594+static int raid1_make_request (mddev_t *mddev, int rw,
9595+ struct buffer_head * bh)
9596 {
9597-
9598- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9599+ raid1_conf_t *conf = mddev_to_conf(mddev);
9600 struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req;
9601 struct raid1_bh * r1_bh;
9602- int n = raid_conf->raid_disks, i, sum_bhs = 0, switch_disks = 0, sectors;
9603+ int disks = MD_SB_DISKS;
9604+ int i, sum_bhs = 0, switch_disks = 0, sectors, lowprio = 0;
9605 struct mirror_info *mirror;
9606
9607- PRINTK(("raid1_make_request().\n"));
9608-
9609- while (!( /* FIXME: now we are rather fault tolerant than nice */
9610- r1_bh = kmalloc (sizeof (struct raid1_bh), GFP_BUFFER)
9611- ) )
9612- {
9613- printk ("raid1_make_request(#1): out of memory\n");
9614- current->policy |= SCHED_YIELD;
9615- schedule();
9616- }
9617- memset (r1_bh, 0, sizeof (struct raid1_bh));
9618+ r1_bh = raid1_kmalloc (sizeof (struct raid1_bh));
9619
9620 /*
9621 * make_request() can abort the operation when READA or WRITEA are being
9622@@ -227,43 +206,65 @@
9623 if (rw == READA) rw = READ;
9624 if (rw == WRITEA) rw = WRITE;
9625
9626- if (rw == WRITE || rw == WRITEA)
9627- mark_buffer_clean(bh); /* Too early ? */
9628+ if (rw == WRITE) {
9629+ /*
9630+ * Too early ?
9631+ */
9632+ mark_buffer_clean(bh);
9633+ /*
9634+ * not too early. we _first_ clean the bh, then we start
9635+ * the IO, then when the IO has finished, we unlock the
9636+ * bh and mark it uptodate. This way we do not miss the
9637+ * case when the bh got dirty again during the IO.
9638+ */
9639+ }
9640+
9641+ /*
9642+ * special flag for 'lowprio' reconstruction requests ...
9643+ */
9644+ if (buffer_lowprio(bh))
9645+ lowprio = 1;
9646
9647 /*
9648- * i think the read and write branch should be separated completely, since we want
9649- * to do read balancing on the read side for example. Comments? :) --mingo
9650+ * i think the read and write branch should be separated completely,
9651+ * since we want to do read balancing on the read side for example.
9652+ * Comments? :) --mingo
9653 */
9654
9655 r1_bh->master_bh=bh;
9656 r1_bh->mddev=mddev;
9657 r1_bh->cmd = rw;
9658
9659- if (rw==READ || rw==READA) {
9660- int last_used = raid_conf->last_used;
9661- PRINTK(("raid1_make_request(), read branch.\n"));
9662- mirror = raid_conf->mirrors + last_used;
9663+ if (rw==READ) {
9664+ int last_used = conf->last_used;
9665+
9666+ /*
9667+ * read balancing logic:
9668+ */
9669+ mirror = conf->mirrors + last_used;
9670 bh->b_rdev = mirror->dev;
9671 sectors = bh->b_size >> 9;
9672- if (bh->b_blocknr * sectors == raid_conf->next_sect) {
9673- raid_conf->sect_count += sectors;
9674- if (raid_conf->sect_count >= mirror->sect_limit)
9675+
9676+ if (bh->b_blocknr * sectors == conf->next_sect) {
9677+ conf->sect_count += sectors;
9678+ if (conf->sect_count >= mirror->sect_limit)
9679 switch_disks = 1;
9680 } else
9681 switch_disks = 1;
9682- raid_conf->next_sect = (bh->b_blocknr + 1) * sectors;
9683- if (switch_disks) {
9684- PRINTK(("read-balancing: switching %d -> %d (%d sectors)\n", last_used, mirror->next, raid_conf->sect_count));
9685- raid_conf->sect_count = 0;
9686- last_used = raid_conf->last_used = mirror->next;
9687+ conf->next_sect = (bh->b_blocknr + 1) * sectors;
9688+ /*
9689+ * Do not switch disks if full resync is in progress ...
9690+ */
9691+ if (switch_disks && !conf->resync_mirrors) {
9692+ conf->sect_count = 0;
9693+ last_used = conf->last_used = mirror->next;
9694 /*
9695- * Do not switch to write-only disks ... resyncing
9696- * is in progress
9697+ * Do not switch to write-only disks ...
9698+ * reconstruction is in progress
9699 */
9700- while (raid_conf->mirrors[last_used].write_only)
9701- raid_conf->last_used = raid_conf->mirrors[last_used].next;
9702+ while (conf->mirrors[last_used].write_only)
9703+ conf->last_used = conf->mirrors[last_used].next;
9704 }
9705- PRINTK (("raid1 read queue: %d %d\n", MAJOR (bh->b_rdev), MINOR (bh->b_rdev)));
9706 bh_req = &r1_bh->bh_req;
9707 memcpy(bh_req, bh, sizeof(*bh));
9708 bh_req->b_end_io = raid1_end_request;
9709@@ -273,13 +274,12 @@
9710 }
9711
9712 /*
9713- * WRITE or WRITEA.
9714+ * WRITE:
9715 */
9716- PRINTK(("raid1_make_request(n=%d), write branch.\n",n));
9717
9718- for (i = 0; i < n; i++) {
9719+ for (i = 0; i < disks; i++) {
9720
9721- if (!raid_conf->mirrors [i].operational) {
9722+ if (!conf->mirrors[i].operational) {
9723 /*
9724 * the r1_bh->mirror_bh[i] pointer remains NULL
9725 */
9726@@ -287,89 +287,91 @@
9727 continue;
9728 }
9729
9730+ /*
9731+ * special case for reconstruction ...
9732+ */
9733+ if (lowprio && (i == conf->last_used)) {
9734+ mirror_bh[i] = NULL;
9735+ continue;
9736+ }
9737+
9738+ /*
9739+ * We should use a private pool (size depending on NR_REQUEST),
9740+ * to avoid writes filling up the memory with bhs
9741+ *
9742+ * Such pools are much faster than kmalloc anyways (so we waste
9743+ * almost nothing by not using the master bh when writing and
9744+ * win alot of cleanness) but for now we are cool enough. --mingo
9745+ *
9746+ * It's safe to sleep here, buffer heads cannot be used in a shared
9747+ * manner in the write branch. Look how we lock the buffer at the
9748+ * beginning of this function to grok the difference ;)
9749+ */
9750+ mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head));
9751+ /*
9752+ * prepare mirrored bh (fields ordered for max mem throughput):
9753+ */
9754+ mirror_bh[i]->b_blocknr = bh->b_blocknr;
9755+ mirror_bh[i]->b_dev = bh->b_dev;
9756+ mirror_bh[i]->b_rdev = conf->mirrors[i].dev;
9757+ mirror_bh[i]->b_rsector = bh->b_rsector;
9758+ mirror_bh[i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
9759+ if (lowprio)
9760+ mirror_bh[i]->b_state |= (1<<BH_LowPrio);
9761+
9762+ mirror_bh[i]->b_count = 1;
9763+ mirror_bh[i]->b_size = bh->b_size;
9764+ mirror_bh[i]->b_data = bh->b_data;
9765+ mirror_bh[i]->b_list = BUF_LOCKED;
9766+ mirror_bh[i]->b_end_io = raid1_end_request;
9767+ mirror_bh[i]->b_dev_id = r1_bh;
9768+
9769+ r1_bh->mirror_bh[i] = mirror_bh[i];
9770+ sum_bhs++;
9771+ }
9772+
9773+ md_atomic_set(&r1_bh->remaining, sum_bhs);
9774+
9775 /*
9776- * We should use a private pool (size depending on NR_REQUEST),
9777- * to avoid writes filling up the memory with bhs
9778- *
9779- * Such pools are much faster than kmalloc anyways (so we waste almost
9780- * nothing by not using the master bh when writing and win alot of cleanness)
9781- *
9782- * but for now we are cool enough. --mingo
9783- *
9784- * It's safe to sleep here, buffer heads cannot be used in a shared
9785- * manner in the write branch. Look how we lock the buffer at the beginning
9786- * of this function to grok the difference ;)
9787- */
9788- while (!( /* FIXME: now we are rather fault tolerant than nice */
9789- mirror_bh[i] = kmalloc (sizeof (struct buffer_head), GFP_BUFFER)
9790- ) )
9791- {
9792- printk ("raid1_make_request(#2): out of memory\n");
9793- current->policy |= SCHED_YIELD;
9794- schedule();
9795- }
9796- memset (mirror_bh[i], 0, sizeof (struct buffer_head));
9797-
9798- /*
9799- * prepare mirrored bh (fields ordered for max mem throughput):
9800- */
9801- mirror_bh [i]->b_blocknr = bh->b_blocknr;
9802- mirror_bh [i]->b_dev = bh->b_dev;
9803- mirror_bh [i]->b_rdev = raid_conf->mirrors [i].dev;
9804- mirror_bh [i]->b_rsector = bh->b_rsector;
9805- mirror_bh [i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
9806- mirror_bh [i]->b_count = 1;
9807- mirror_bh [i]->b_size = bh->b_size;
9808- mirror_bh [i]->b_data = bh->b_data;
9809- mirror_bh [i]->b_list = BUF_LOCKED;
9810- mirror_bh [i]->b_end_io = raid1_end_request;
9811- mirror_bh [i]->b_dev_id = r1_bh;
9812-
9813- r1_bh->mirror_bh[i] = mirror_bh[i];
9814- sum_bhs++;
9815- }
9816-
9817- r1_bh->remaining = sum_bhs;
9818-
9819- PRINTK(("raid1_make_request(), write branch, sum_bhs=%d.\n",sum_bhs));
9820-
9821- /*
9822- * We have to be a bit careful about the semaphore above, thats why we
9823- * start the requests separately. Since kmalloc() could fail, sleep and
9824- * make_request() can sleep too, this is the safer solution. Imagine,
9825- * end_request decreasing the semaphore before we could have set it up ...
9826- * We could play tricks with the semaphore (presetting it and correcting
9827- * at the end if sum_bhs is not 'n' but we have to do end_request by hand
9828- * if all requests finish until we had a chance to set up the semaphore
9829- * correctly ... lots of races).
9830- */
9831- for (i = 0; i < n; i++)
9832- if (mirror_bh [i] != NULL)
9833- map_and_make_request (rw, mirror_bh [i]);
9834+ * We have to be a bit careful about the semaphore above, thats
9835+ * why we start the requests separately. Since kmalloc() could
9836+ * fail, sleep and make_request() can sleep too, this is the
9837+ * safer solution. Imagine, end_request decreasing the semaphore
9838+ * before we could have set it up ... We could play tricks with
9839+ * the semaphore (presetting it and correcting at the end if
9840+ * sum_bhs is not 'n' but we have to do end_request by hand if
9841+ * all requests finish until we had a chance to set up the
9842+ * semaphore correctly ... lots of races).
9843+ */
9844+ for (i = 0; i < disks; i++)
9845+ if (mirror_bh[i])
9846+ map_and_make_request(rw, mirror_bh[i]);
9847
9848 return (0);
9849 }
9850
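The write branch above clones the master buffer head once per operational mirror (skipping the reconstruction target for low-priority resync writes), counts the clones into an atomic "remaining" field, and only completes the master request when the last mirror write finishes. A hypothetical condensation of that completion accounting:

#include <stdio.h>

/* One clone per live mirror; the master request completes when the shared
 * counter (atomic_dec_and_test() in the kernel) reaches zero. */
struct r1_request {
        int remaining;
        int uptodate;
};

static void end_one_mirror_write(struct r1_request *r1, int ok)
{
        if (!ok)
                r1->uptodate = 0;
        if (--r1->remaining == 0)
                printf("last mirror done, completing master request (%s)\n",
                       r1->uptodate ? "uptodate" : "error");
}

int main(void)
{
        int operational[4] = { 1, 1, 0, 1 };   /* mirror 2 is missing/faulty  */
        struct r1_request r1 = { 0, 1 };
        int i;

        for (i = 0; i < 4; i++)
                if (operational[i])
                        r1.remaining++;        /* one clone bh per live mirror */

        for (i = 0; i < 4; i++)
                if (operational[i])
                        end_one_mirror_write(&r1, 1);
        return 0;
}
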
9851-static int raid1_status (char *page, int minor, struct md_dev *mddev)
9852+static int raid1_status (char *page, mddev_t *mddev)
9853 {
9854- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9855+ raid1_conf_t *conf = mddev_to_conf(mddev);
9856 int sz = 0, i;
9857
9858- sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
9859- for (i = 0; i < raid_conf->raid_disks; i++)
9860- sz += sprintf (page+sz, "%s", raid_conf->mirrors [i].operational ? "U" : "_");
9861+ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
9862+ conf->working_disks);
9863+ for (i = 0; i < conf->raid_disks; i++)
9864+ sz += sprintf (page+sz, "%s",
9865+ conf->mirrors[i].operational ? "U" : "_");
9866 sz += sprintf (page+sz, "]");
9867 return sz;
9868 }
9869
9870-static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index)
9871+static void unlink_disk (raid1_conf_t *conf, int target)
9872 {
9873- int disks = raid_conf->raid_disks;
9874- int j;
9875+ int disks = MD_SB_DISKS;
9876+ int i;
9877
9878- for (j = 0; j < disks; j++)
9879- if (raid_conf->mirrors [j].next == failed_index)
9880- raid_conf->mirrors [j].next = raid_conf->mirrors [failed_index].next;
9881+ for (i = 0; i < disks; i++)
9882+ if (conf->mirrors[i].next == target)
9883+ conf->mirrors[i].next = conf->mirrors[target].next;
9884 }
9885
9886 #define LAST_DISK KERN_ALERT \
9887@@ -388,48 +390,53 @@
9888 #define ALREADY_SYNCING KERN_INFO \
9889 "raid1: syncing already in progress.\n"
9890
9891-static int raid1_error (struct md_dev *mddev, kdev_t dev)
9892+static void mark_disk_bad (mddev_t *mddev, int failed)
9893 {
9894- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9895- struct mirror_info *mirror;
9896- md_superblock_t *sb = mddev->sb;
9897- int disks = raid_conf->raid_disks;
9898- int i;
9899+ raid1_conf_t *conf = mddev_to_conf(mddev);
9900+ struct mirror_info *mirror = conf->mirrors+failed;
9901+ mdp_super_t *sb = mddev->sb;
9902+
9903+ mirror->operational = 0;
9904+ unlink_disk(conf, failed);
9905+ mark_disk_faulty(sb->disks+mirror->number);
9906+ mark_disk_nonsync(sb->disks+mirror->number);
9907+ mark_disk_inactive(sb->disks+mirror->number);
9908+ sb->active_disks--;
9909+ sb->working_disks--;
9910+ sb->failed_disks++;
9911+ mddev->sb_dirty = 1;
9912+ md_wakeup_thread(conf->thread);
9913+ conf->working_disks--;
9914+ printk (DISK_FAILED, partition_name (mirror->dev),
9915+ conf->working_disks);
9916+}
9917
9918- PRINTK(("raid1_error called\n"));
9919+static int raid1_error (mddev_t *mddev, kdev_t dev)
9920+{
9921+ raid1_conf_t *conf = mddev_to_conf(mddev);
9922+ struct mirror_info * mirrors = conf->mirrors;
9923+ int disks = MD_SB_DISKS;
9924+ int i;
9925
9926- if (raid_conf->working_disks == 1) {
9927+ if (conf->working_disks == 1) {
9928 /*
9929 * Uh oh, we can do nothing if this is our last disk, but
9930 * first check if this is a queued request for a device
9931 * which has just failed.
9932 */
9933- for (i = 0, mirror = raid_conf->mirrors; i < disks;
9934- i++, mirror++)
9935- if (mirror->dev == dev && !mirror->operational)
9936+ for (i = 0; i < disks; i++) {
9937+ if (mirrors[i].dev==dev && !mirrors[i].operational)
9938 return 0;
9939+ }
9940 printk (LAST_DISK);
9941 } else {
9942- /* Mark disk as unusable */
9943- for (i = 0, mirror = raid_conf->mirrors; i < disks;
9944- i++, mirror++) {
9945- if (mirror->dev == dev && mirror->operational){
9946- mirror->operational = 0;
9947- raid1_fix_links (raid_conf, i);
9948- sb->disks[mirror->number].state |=
9949- (1 << MD_FAULTY_DEVICE);
9950- sb->disks[mirror->number].state &=
9951- ~(1 << MD_SYNC_DEVICE);
9952- sb->disks[mirror->number].state &=
9953- ~(1 << MD_ACTIVE_DEVICE);
9954- sb->active_disks--;
9955- sb->working_disks--;
9956- sb->failed_disks++;
9957- mddev->sb_dirty = 1;
9958- md_wakeup_thread(raid1_thread);
9959- raid_conf->working_disks--;
9960- printk (DISK_FAILED, kdevname (dev),
9961- raid_conf->working_disks);
9962+ /*
9963+ * Mark disk as unusable
9964+ */
9965+ for (i = 0; i < disks; i++) {
9966+ if (mirrors[i].dev==dev && mirrors[i].operational) {
9967+ mark_disk_bad (mddev, i);
9968+ break;
9969 }
9970 }
9971 }
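
mark_disk_bad() above gathers what the old raid1_error() did inline: flip the descriptor's state bits (faulty set, sync and active cleared), adjust the superblock counters, and wake the recovery thread so a spare can be pulled in. A sketch of that bookkeeping with stand-in bit positions (the real flag values live in the md headers and are not shown in this hunk):

#include <stdio.h>

#define FAULTY 0
#define ACTIVE 1
#define SYNC   2

struct disk  { unsigned int state; };
struct super { int active_disks, working_disks, failed_disks; };

static void mark_disk_bad(struct super *sb, struct disk *d)
{
        d->state |=  (1 << FAULTY);     /* mark_disk_faulty()   */
        d->state &= ~(1 << SYNC);       /* mark_disk_nonsync()  */
        d->state &= ~(1 << ACTIVE);     /* mark_disk_inactive() */
        sb->active_disks--;
        sb->working_disks--;
        sb->failed_disks++;
}

int main(void)
{
        struct super sb = { 2, 2, 0 };
        struct disk  d  = { (1 << ACTIVE) | (1 << SYNC) };

        mark_disk_bad(&sb, &d);
        printf("state=0x%x active=%d working=%d failed=%d\n",
               d.state, sb.active_disks, sb.working_disks, sb.failed_disks);
        return 0;
}
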
9972@@ -442,219 +449,396 @@
9973 #undef START_SYNCING
9974
9975 /*
9976- * This is the personality-specific hot-addition routine
9977+ * Insert the spare disk into the drive-ring
9978 */
9979+static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror)
9980+{
9981+ int j, next;
9982+ int disks = MD_SB_DISKS;
9983+ struct mirror_info *p = conf->mirrors;
9984
9985-#define NO_SUPERBLOCK KERN_ERR \
9986-"raid1: cannot hot-add disk to the array with no RAID superblock\n"
9987+ for (j = 0; j < disks; j++, p++)
9988+ if (p->operational && !p->write_only) {
9989+ next = p->next;
9990+ p->next = mirror->raid_disk;
9991+ mirror->next = next;
9992+ return;
9993+ }
9994
9995-#define WRONG_LEVEL KERN_ERR \
9996-"raid1: hot-add: level of disk is not RAID-1\n"
9997+ printk("raid1: bug: no read-operational devices\n");
9998+}
9999
10000-#define HOT_ADD_SUCCEEDED KERN_INFO \
10001-"raid1: device %s hot-added\n"
10002+static void print_raid1_conf (raid1_conf_t *conf)
10003+{
10004+ int i;
10005+ struct mirror_info *tmp;
10006
10007-static int raid1_hot_add_disk (struct md_dev *mddev, kdev_t dev)
10008+ printk("RAID1 conf printout:\n");
10009+ if (!conf) {
10010+ printk("(conf==NULL)\n");
10011+ return;
10012+ }
10013+ printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
10014+ conf->raid_disks, conf->nr_disks);
10015+
10016+ for (i = 0; i < MD_SB_DISKS; i++) {
10017+ tmp = conf->mirrors + i;
10018+ printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
10019+ i, tmp->spare,tmp->operational,
10020+ tmp->number,tmp->raid_disk,tmp->used_slot,
10021+ partition_name(tmp->dev));
10022+ }
10023+}
10024+
10025+static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
10026 {
10027+ int err = 0;
10028+ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
10029+ raid1_conf_t *conf = mddev->private;
10030+ struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
10031 unsigned long flags;
10032- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
10033- struct mirror_info *mirror;
10034- md_superblock_t *sb = mddev->sb;
10035- struct real_dev * realdev;
10036- int n;
10037+ mdp_super_t *sb = mddev->sb;
10038+ mdp_disk_t *failed_desc, *spare_desc, *added_desc;
10039+
10040+ save_flags(flags);
10041+ cli();
10042
10043+ print_raid1_conf(conf);
10044 /*
10045- * The device has its superblock already read and it was found
10046- * to be consistent for generic RAID usage. Now we check whether
10047- * it's usable for RAID-1 hot addition.
10048+ * find the disk ...
10049 */
10050+ switch (state) {
10051
10052- n = mddev->nb_dev++;
10053- realdev = &mddev->devices[n];
10054- if (!realdev->sb) {
10055- printk (NO_SUPERBLOCK);
10056- return -EINVAL;
10057- }
10058- if (realdev->sb->level != 1) {
10059- printk (WRONG_LEVEL);
10060- return -EINVAL;
10061+ case DISKOP_SPARE_ACTIVE:
10062+
10063+ /*
10064+ * Find the failed disk within the RAID1 configuration ...
10065+ * (this can only be in the first conf->working_disks part)
10066+ */
10067+ for (i = 0; i < conf->raid_disks; i++) {
10068+ tmp = conf->mirrors + i;
10069+ if ((!tmp->operational && !tmp->spare) ||
10070+ !tmp->used_slot) {
10071+ failed_disk = i;
10072+ break;
10073+ }
10074+ }
10075+ /*
10076+ * When we activate a spare disk we _must_ have a disk in
10077+ * the lower (active) part of the array to replace.
10078+ */
10079+ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
10080+ MD_BUG();
10081+ err = 1;
10082+ goto abort;
10083+ }
10084+ /* fall through */
10085+
10086+ case DISKOP_SPARE_WRITE:
10087+ case DISKOP_SPARE_INACTIVE:
10088+
10089+ /*
10090+ * Find the spare disk ... (can only be in the 'high'
10091+ * area of the array)
10092+ */
10093+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
10094+ tmp = conf->mirrors + i;
10095+ if (tmp->spare && tmp->number == (*d)->number) {
10096+ spare_disk = i;
10097+ break;
10098+ }
10099+ }
10100+ if (spare_disk == -1) {
10101+ MD_BUG();
10102+ err = 1;
10103+ goto abort;
10104+ }
10105+ break;
10106+
10107+ case DISKOP_HOT_REMOVE_DISK:
10108+
10109+ for (i = 0; i < MD_SB_DISKS; i++) {
10110+ tmp = conf->mirrors + i;
10111+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
10112+ if (tmp->operational) {
10113+ err = -EBUSY;
10114+ goto abort;
10115+ }
10116+ removed_disk = i;
10117+ break;
10118+ }
10119+ }
10120+ if (removed_disk == -1) {
10121+ MD_BUG();
10122+ err = 1;
10123+ goto abort;
10124+ }
10125+ break;
10126+
10127+ case DISKOP_HOT_ADD_DISK:
10128+
10129+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
10130+ tmp = conf->mirrors + i;
10131+ if (!tmp->used_slot) {
10132+ added_disk = i;
10133+ break;
10134+ }
10135+ }
10136+ if (added_disk == -1) {
10137+ MD_BUG();
10138+ err = 1;
10139+ goto abort;
10140+ }
10141+ break;
10142 }
10143- /* FIXME: are there other things left we could sanity-check? */
10144
10145+ switch (state) {
10146 /*
10147- * We have to disable interrupts, as our RAID-1 state is used
10148- * from irq handlers as well.
10149+ * Switch the spare disk to write-only mode:
10150 */
10151- save_flags(flags);
10152- cli();
10153+ case DISKOP_SPARE_WRITE:
10154+ sdisk = conf->mirrors + spare_disk;
10155+ sdisk->operational = 1;
10156+ sdisk->write_only = 1;
10157+ break;
10158+ /*
10159+ * Deactivate a spare disk:
10160+ */
10161+ case DISKOP_SPARE_INACTIVE:
10162+ sdisk = conf->mirrors + spare_disk;
10163+ sdisk->operational = 0;
10164+ sdisk->write_only = 0;
10165+ break;
10166+ /*
10167+ * Activate (mark read-write) the (now sync) spare disk,
10168+	 * which means we switch its 'raid position' (->raid_disk)
10169+ * with the failed disk. (only the first 'conf->nr_disks'
10170+ * slots are used for 'real' disks and we must preserve this
10171+ * property)
10172+ */
10173+ case DISKOP_SPARE_ACTIVE:
10174
10175- raid_conf->raid_disks++;
10176- mirror = raid_conf->mirrors+n;
10177+ sdisk = conf->mirrors + spare_disk;
10178+ fdisk = conf->mirrors + failed_disk;
10179
10180- mirror->number=n;
10181- mirror->raid_disk=n;
10182- mirror->dev=dev;
10183- mirror->next=0; /* FIXME */
10184- mirror->sect_limit=128;
10185-
10186- mirror->operational=0;
10187- mirror->spare=1;
10188- mirror->write_only=0;
10189-
10190- sb->disks[n].state |= (1 << MD_FAULTY_DEVICE);
10191- sb->disks[n].state &= ~(1 << MD_SYNC_DEVICE);
10192- sb->disks[n].state &= ~(1 << MD_ACTIVE_DEVICE);
10193- sb->nr_disks++;
10194- sb->spare_disks++;
10195+ spare_desc = &sb->disks[sdisk->number];
10196+ failed_desc = &sb->disks[fdisk->number];
10197
10198- restore_flags(flags);
10199+ if (spare_desc != *d) {
10200+ MD_BUG();
10201+ err = 1;
10202+ goto abort;
10203+ }
10204
10205- md_update_sb(MINOR(dev));
10206+ if (spare_desc->raid_disk != sdisk->raid_disk) {
10207+ MD_BUG();
10208+ err = 1;
10209+ goto abort;
10210+ }
10211+
10212+ if (sdisk->raid_disk != spare_disk) {
10213+ MD_BUG();
10214+ err = 1;
10215+ goto abort;
10216+ }
10217
10218- printk (HOT_ADD_SUCCEEDED, kdevname(realdev->dev));
10219+ if (failed_desc->raid_disk != fdisk->raid_disk) {
10220+ MD_BUG();
10221+ err = 1;
10222+ goto abort;
10223+ }
10224
10225- return 0;
10226-}
10227+ if (fdisk->raid_disk != failed_disk) {
10228+ MD_BUG();
10229+ err = 1;
10230+ goto abort;
10231+ }
10232
10233-#undef NO_SUPERBLOCK
10234-#undef WRONG_LEVEL
10235-#undef HOT_ADD_SUCCEEDED
10236+ /*
10237+ * do the switch finally
10238+ */
10239+ xchg_values(*spare_desc, *failed_desc);
10240+ xchg_values(*fdisk, *sdisk);
10241
10242-/*
10243- * Insert the spare disk into the drive-ring
10244- */
10245-static void add_ring(struct raid1_data *raid_conf, struct mirror_info *mirror)
10246-{
10247- int j, next;
10248- struct mirror_info *p = raid_conf->mirrors;
10249+ /*
10250+ * (careful, 'failed' and 'spare' are switched from now on)
10251+ *
10252+ * we want to preserve linear numbering and we want to
10253+ * give the proper raid_disk number to the now activated
10254+ * disk. (this means we switch back these values)
10255+ */
10256+
10257+ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
10258+ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
10259+ xchg_values(spare_desc->number, failed_desc->number);
10260+ xchg_values(sdisk->number, fdisk->number);
10261
10262- for (j = 0; j < raid_conf->raid_disks; j++, p++)
10263- if (p->operational && !p->write_only) {
10264- next = p->next;
10265- p->next = mirror->raid_disk;
10266- mirror->next = next;
10267- return;
10268- }
10269- printk("raid1: bug: no read-operational devices\n");
10270-}
10271+ *d = failed_desc;
10272
10273-static int raid1_mark_spare(struct md_dev *mddev, md_descriptor_t *spare,
10274- int state)
10275-{
10276- int i = 0, failed_disk = -1;
10277- struct raid1_data *raid_conf = mddev->private;
10278- struct mirror_info *mirror = raid_conf->mirrors;
10279- md_descriptor_t *descriptor;
10280- unsigned long flags;
10281+ if (sdisk->dev == MKDEV(0,0))
10282+ sdisk->used_slot = 0;
10283+ /*
10284+ * this really activates the spare.
10285+ */
10286+ fdisk->spare = 0;
10287+ fdisk->write_only = 0;
10288+ link_disk(conf, fdisk);
10289
10290- for (i = 0; i < MD_SB_DISKS; i++, mirror++) {
10291- if (mirror->spare && mirror->number == spare->number)
10292- goto found;
10293- }
10294- return 1;
10295-found:
10296- for (i = 0, mirror = raid_conf->mirrors; i < raid_conf->raid_disks;
10297- i++, mirror++)
10298- if (!mirror->operational)
10299- failed_disk = i;
10300+ /*
10301+ * if we activate a spare, we definitely replace a
10302+ * non-operational disk slot in the 'low' area of
10303+ * the disk array.
10304+ */
10305
10306- save_flags(flags);
10307- cli();
10308- switch (state) {
10309- case SPARE_WRITE:
10310- mirror->operational = 1;
10311- mirror->write_only = 1;
10312- raid_conf->raid_disks = MAX(raid_conf->raid_disks,
10313- mirror->raid_disk + 1);
10314- break;
10315- case SPARE_INACTIVE:
10316- mirror->operational = 0;
10317- mirror->write_only = 0;
10318- break;
10319- case SPARE_ACTIVE:
10320- mirror->spare = 0;
10321- mirror->write_only = 0;
10322- raid_conf->working_disks++;
10323- add_ring(raid_conf, mirror);
10324-
10325- if (failed_disk != -1) {
10326- descriptor = &mddev->sb->disks[raid_conf->mirrors[failed_disk].number];
10327- i = spare->raid_disk;
10328- spare->raid_disk = descriptor->raid_disk;
10329- descriptor->raid_disk = i;
10330- }
10331- break;
10332- default:
10333- printk("raid1_mark_spare: bug: state == %d\n", state);
10334- restore_flags(flags);
10335- return 1;
10336+ conf->working_disks++;
10337+
10338+ break;
10339+
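The descriptor/mirror swap just above is easy to misread: the whole structures are exchanged first, and then the number/raid_disk fields are exchanged back, so the activated spare's device ends up in the failed slot while the slot numbering in the superblock stays linear. A minimal user-space sketch of that trick, with a simplified stand-in struct and a local xchg_values macro (only the field names number and raid_disk are taken from the patch; everything else is illustrative):

#include <stdio.h>

/* local stand-in for the driver's simple swap macro */
#define xchg_values(a, b) do { __typeof__(a) __t = (a); (a) = (b); (b) = __t; } while (0)

struct desc { int number; int raid_disk; char dev[24]; };

int main(void)
{
	struct desc failed = { 0, 0, "hda1 (failed)" };
	struct desc spare  = { 2, 2, "hdc1 (spare)" };

	/* 1) swap the descriptors wholesale ... */
	xchg_values(spare, failed);
	/* 2) ... then hand each slot its original identity back */
	xchg_values(spare.raid_disk, failed.raid_disk);
	xchg_values(spare.number, failed.number);

	/* slot 0 keeps number/raid_disk 0 but now carries the spare's device */
	printf("slot 0: number=%d raid_disk=%d dev=%s\n",
	       failed.number, failed.raid_disk, failed.dev);
	printf("slot 2: number=%d raid_disk=%d dev=%s\n",
	       spare.number, spare.raid_disk, spare.dev);
	return 0;
}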
10340+ case DISKOP_HOT_REMOVE_DISK:
10341+ rdisk = conf->mirrors + removed_disk;
10342+
10343+ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
10344+ MD_BUG();
10345+ err = 1;
10346+ goto abort;
10347+ }
10348+ rdisk->dev = MKDEV(0,0);
10349+ rdisk->used_slot = 0;
10350+ conf->nr_disks--;
10351+ break;
10352+
10353+ case DISKOP_HOT_ADD_DISK:
10354+ adisk = conf->mirrors + added_disk;
10355+ added_desc = *d;
10356+
10357+ if (added_disk != added_desc->number) {
10358+ MD_BUG();
10359+ err = 1;
10360+ goto abort;
10361+ }
10362+
10363+ adisk->number = added_desc->number;
10364+ adisk->raid_disk = added_desc->raid_disk;
10365+ adisk->dev = MKDEV(added_desc->major,added_desc->minor);
10366+
10367+ adisk->operational = 0;
10368+ adisk->write_only = 0;
10369+ adisk->spare = 1;
10370+ adisk->used_slot = 1;
10371+ conf->nr_disks++;
10372+
10373+ break;
10374+
10375+ default:
10376+ MD_BUG();
10377+ err = 1;
10378+ goto abort;
10379 }
10380+abort:
10381 restore_flags(flags);
10382- return 0;
10383+ print_raid1_conf(conf);
10384+ return err;
10385 }
10386
10387+
10388+#define IO_ERROR KERN_ALERT \
10389+"raid1: %s: unrecoverable I/O read error for block %lu\n"
10390+
10391+#define REDIRECT_SECTOR KERN_ERR \
10392+"raid1: %s: redirecting sector %lu to another mirror\n"
10393+
10394 /*
10395 * This is a kernel thread which:
10396 *
10397 * 1. Retries failed read operations on working mirrors.
10398 * 2. Updates the raid superblock when problems are encountered.
10399 */
10400-void raid1d (void *data)
10401+static void raid1d (void *data)
10402 {
10403 struct buffer_head *bh;
10404 kdev_t dev;
10405 unsigned long flags;
10406- struct raid1_bh * r1_bh;
10407- struct md_dev *mddev;
10408+ struct raid1_bh *r1_bh;
10409+ mddev_t *mddev;
10410
10411- PRINTK(("raid1d() active\n"));
10412- save_flags(flags);
10413- cli();
10414 while (raid1_retry_list) {
10415+ save_flags(flags);
10416+ cli();
10417 bh = raid1_retry_list;
10418 r1_bh = (struct raid1_bh *)(bh->b_dev_id);
10419 raid1_retry_list = r1_bh->next_retry;
10420 restore_flags(flags);
10421
10422- mddev = md_dev + MINOR(bh->b_dev);
10423+ mddev = kdev_to_mddev(bh->b_dev);
10424 if (mddev->sb_dirty) {
10425- printk("dirty sb detected, updating.\n");
10426+ printk(KERN_INFO "dirty sb detected, updating.\n");
10427 mddev->sb_dirty = 0;
10428- md_update_sb(MINOR(bh->b_dev));
10429+ md_update_sb(mddev);
10430 }
10431 dev = bh->b_rdev;
10432- __raid1_map (md_dev + MINOR(bh->b_dev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
10433+ __raid1_map (mddev, &bh->b_rdev, &bh->b_rsector,
10434+ bh->b_size >> 9);
10435 if (bh->b_rdev == dev) {
10436- printk (KERN_ALERT
10437- "raid1: %s: unrecoverable I/O read error for block %lu\n",
10438- kdevname(bh->b_dev), bh->b_blocknr);
10439- raid1_end_buffer_io(r1_bh, 0);
10440+ printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
10441+ raid1_end_bh_io(r1_bh, 0);
10442 } else {
10443- printk (KERN_ERR "raid1: %s: redirecting sector %lu to another mirror\n",
10444- kdevname(bh->b_dev), bh->b_blocknr);
10445+ printk (REDIRECT_SECTOR,
10446+ partition_name(bh->b_dev), bh->b_blocknr);
10447 map_and_make_request (r1_bh->cmd, bh);
10448 }
10449- cli();
10450 }
10451- restore_flags(flags);
10452+}
10453+#undef IO_ERROR
10454+#undef REDIRECT_SECTOR
10455+
10456+/*
10457+ * Private kernel thread to reconstruct mirrors after an unclean
10458+ * shutdown.
10459+ */
10460+static void raid1syncd (void *data)
10461+{
10462+ raid1_conf_t *conf = data;
10463+ mddev_t *mddev = conf->mddev;
10464+
10465+ if (!conf->resync_mirrors)
10466+ return;
10467+ if (conf->resync_mirrors == 2)
10468+ return;
10469+ down(&mddev->recovery_sem);
10470+ if (md_do_sync(mddev, NULL)) {
10471+ up(&mddev->recovery_sem);
10472+ return;
10473+ }
10474+ /*
10475+ * Only if everything went Ok.
10476+ */
10477+ conf->resync_mirrors = 0;
10478+ up(&mddev->recovery_sem);
10479 }
10480
10481+
10482 /*
10483 * This will catch the scenario in which one of the mirrors was
10484 * mounted as a normal device rather than as a part of a raid set.
10485+ *
10486+ * check_consistency is very personality-dependent, e.g. RAID5 cannot
10487+ * do this check; it uses another method.
10488 */
10489-static int __check_consistency (struct md_dev *mddev, int row)
10490+static int __check_consistency (mddev_t *mddev, int row)
10491 {
10492- struct raid1_data *raid_conf = mddev->private;
10493+ raid1_conf_t *conf = mddev_to_conf(mddev);
10494+ int disks = MD_SB_DISKS;
10495 kdev_t dev;
10496 struct buffer_head *bh = NULL;
10497 int i, rc = 0;
10498 char *buffer = NULL;
10499
10500- for (i = 0; i < raid_conf->raid_disks; i++) {
10501- if (!raid_conf->mirrors[i].operational)
10502+ for (i = 0; i < disks; i++) {
10503+ printk("(checking disk %d)\n",i);
10504+ if (!conf->mirrors[i].operational)
10505 continue;
10506- dev = raid_conf->mirrors[i].dev;
10507+ printk("(really checking disk %d)\n",i);
10508+ dev = conf->mirrors[i].dev;
10509 set_blocksize(dev, 4096);
10510 if ((bh = bread(dev, row / 4, 4096)) == NULL)
10511 break;
10512@@ -683,167 +867,342 @@
10513 return rc;
10514 }
10515
10516-static int check_consistency (struct md_dev *mddev)
10517+static int check_consistency (mddev_t *mddev)
10518 {
10519- int size = mddev->sb->size;
10520- int row;
10521+ if (__check_consistency(mddev, 0))
10522+/*
10523+ * we do not do this currently, as it's perfectly possible to
10524+ * have an inconsistent array when it's freshly created. Only
10525+ * newly written data has to be consistent.
10526+ */
10527+ return 0;
10528
10529- for (row = 0; row < size; row += size / 8)
10530- if (__check_consistency(mddev, row))
10531- return 1;
10532 return 0;
10533 }
10534
10535-static int raid1_run (int minor, struct md_dev *mddev)
10536+#define INVALID_LEVEL KERN_WARNING \
10537+"raid1: md%d: raid level not set to mirroring (%d)\n"
10538+
10539+#define NO_SB KERN_ERR \
10540+"raid1: disabled mirror %s (couldn't access raid superblock)\n"
10541+
10542+#define ERRORS KERN_ERR \
10543+"raid1: disabled mirror %s (errors detected)\n"
10544+
10545+#define NOT_IN_SYNC KERN_ERR \
10546+"raid1: disabled mirror %s (not in sync)\n"
10547+
10548+#define INCONSISTENT KERN_ERR \
10549+"raid1: disabled mirror %s (inconsistent descriptor)\n"
10550+
10551+#define ALREADY_RUNNING KERN_ERR \
10552+"raid1: disabled mirror %s (mirror %d already operational)\n"
10553+
10554+#define OPERATIONAL KERN_INFO \
10555+"raid1: device %s operational as mirror %d\n"
10556+
10557+#define MEM_ERROR KERN_ERR \
10558+"raid1: couldn't allocate memory for md%d\n"
10559+
10560+#define SPARE KERN_INFO \
10561+"raid1: spare disk %s\n"
10562+
10563+#define NONE_OPERATIONAL KERN_ERR \
10564+"raid1: no operational mirrors for md%d\n"
10565+
10566+#define RUNNING_CKRAID KERN_ERR \
10567+"raid1: detected mirror differences -- running resync\n"
10568+
10569+#define ARRAY_IS_ACTIVE KERN_INFO \
10570+"raid1: raid set md%d active with %d out of %d mirrors\n"
10571+
10572+#define THREAD_ERROR KERN_ERR \
10573+"raid1: couldn't allocate thread for md%d\n"
10574+
10575+#define START_RESYNC KERN_WARNING \
10576+"raid1: raid set md%d not clean; reconstructing mirrors\n"
10577+
10578+static int raid1_run (mddev_t *mddev)
10579 {
10580- struct raid1_data *raid_conf;
10581- int i, j, raid_disk;
10582- md_superblock_t *sb = mddev->sb;
10583- md_descriptor_t *descriptor;
10584- struct real_dev *realdev;
10585+ raid1_conf_t *conf;
10586+ int i, j, disk_idx;
10587+ struct mirror_info *disk;
10588+ mdp_super_t *sb = mddev->sb;
10589+ mdp_disk_t *descriptor;
10590+ mdk_rdev_t *rdev;
10591+ struct md_list_head *tmp;
10592+ int start_recovery = 0;
10593
10594 MOD_INC_USE_COUNT;
10595
10596 if (sb->level != 1) {
10597- printk("raid1: %s: raid level not set to mirroring (%d)\n",
10598- kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
10599- MOD_DEC_USE_COUNT;
10600- return -EIO;
10601- }
10602- /****
10603- * copy the now verified devices into our private RAID1 bookkeeping
10604- * area. [whatever we allocate in raid1_run(), should be freed in
10605- * raid1_stop()]
10606+ printk(INVALID_LEVEL, mdidx(mddev), sb->level);
10607+ goto out;
10608+ }
10609+ /*
10610+ * copy the already verified devices into our private RAID1
10611+ * bookkeeping area. [whatever we allocate in raid1_run(),
10612+ * should be freed in raid1_stop()]
10613 */
10614
10615- while (!( /* FIXME: now we are rather fault tolerant than nice */
10616- mddev->private = kmalloc (sizeof (struct raid1_data), GFP_KERNEL)
10617- ) )
10618- {
10619- printk ("raid1_run(): out of memory\n");
10620- current->policy |= SCHED_YIELD;
10621- schedule();
10622- }
10623- raid_conf = mddev->private;
10624- memset(raid_conf, 0, sizeof(*raid_conf));
10625-
10626- PRINTK(("raid1_run(%d) called.\n", minor));
10627-
10628- for (i = 0; i < mddev->nb_dev; i++) {
10629- realdev = &mddev->devices[i];
10630- if (!realdev->sb) {
10631- printk(KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
10632+ conf = raid1_kmalloc(sizeof(raid1_conf_t));
10633+ mddev->private = conf;
10634+ if (!conf) {
10635+ printk(MEM_ERROR, mdidx(mddev));
10636+ goto out;
10637+ }
10638+
10639+ ITERATE_RDEV(mddev,rdev,tmp) {
10640+ if (rdev->faulty) {
10641+ printk(ERRORS, partition_name(rdev->dev));
10642+ } else {
10643+ if (!rdev->sb) {
10644+ MD_BUG();
10645+ continue;
10646+ }
10647+ }
10648+ if (rdev->desc_nr == -1) {
10649+ MD_BUG();
10650 continue;
10651 }
10652-
10653- /*
10654- * This is important -- we are using the descriptor on
10655- * the disk only to get a pointer to the descriptor on
10656- * the main superblock, which might be more recent.
10657- */
10658- descriptor = &sb->disks[realdev->sb->descriptor.number];
10659- if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
10660- printk(KERN_ERR "raid1: disabled mirror %s (errors detected)\n", kdevname(realdev->dev));
10661+ descriptor = &sb->disks[rdev->desc_nr];
10662+ disk_idx = descriptor->raid_disk;
10663+ disk = conf->mirrors + disk_idx;
10664+
10665+ if (disk_faulty(descriptor)) {
10666+ disk->number = descriptor->number;
10667+ disk->raid_disk = disk_idx;
10668+ disk->dev = rdev->dev;
10669+ disk->sect_limit = MAX_LINEAR_SECTORS;
10670+ disk->operational = 0;
10671+ disk->write_only = 0;
10672+ disk->spare = 0;
10673+ disk->used_slot = 1;
10674 continue;
10675 }
10676- if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
10677- if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
10678- printk(KERN_ERR "raid1: disabled mirror %s (not in sync)\n", kdevname(realdev->dev));
10679+ if (disk_active(descriptor)) {
10680+ if (!disk_sync(descriptor)) {
10681+ printk(NOT_IN_SYNC,
10682+ partition_name(rdev->dev));
10683 continue;
10684 }
10685- raid_disk = descriptor->raid_disk;
10686- if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
10687- printk(KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n", kdevname(realdev->dev));
10688+ if ((descriptor->number > MD_SB_DISKS) ||
10689+ (disk_idx > sb->raid_disks)) {
10690+
10691+ printk(INCONSISTENT,
10692+ partition_name(rdev->dev));
10693 continue;
10694 }
10695- if (raid_conf->mirrors[raid_disk].operational) {
10696- printk(KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n", kdevname(realdev->dev), raid_disk);
10697+ if (disk->operational) {
10698+ printk(ALREADY_RUNNING,
10699+ partition_name(rdev->dev),
10700+ disk_idx);
10701 continue;
10702 }
10703- printk(KERN_INFO "raid1: device %s operational as mirror %d\n", kdevname(realdev->dev), raid_disk);
10704- raid_conf->mirrors[raid_disk].number = descriptor->number;
10705- raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
10706- raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
10707- raid_conf->mirrors[raid_disk].operational = 1;
10708- raid_conf->mirrors[raid_disk].sect_limit = 128;
10709- raid_conf->working_disks++;
10710+ printk(OPERATIONAL, partition_name(rdev->dev),
10711+ disk_idx);
10712+ disk->number = descriptor->number;
10713+ disk->raid_disk = disk_idx;
10714+ disk->dev = rdev->dev;
10715+ disk->sect_limit = MAX_LINEAR_SECTORS;
10716+ disk->operational = 1;
10717+ disk->write_only = 0;
10718+ disk->spare = 0;
10719+ disk->used_slot = 1;
10720+ conf->working_disks++;
10721 } else {
10722 /*
10723 * Must be a spare disk ..
10724 */
10725- printk(KERN_INFO "raid1: spare disk %s\n", kdevname(realdev->dev));
10726- raid_disk = descriptor->raid_disk;
10727- raid_conf->mirrors[raid_disk].number = descriptor->number;
10728- raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
10729- raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
10730- raid_conf->mirrors[raid_disk].sect_limit = 128;
10731-
10732- raid_conf->mirrors[raid_disk].operational = 0;
10733- raid_conf->mirrors[raid_disk].write_only = 0;
10734- raid_conf->mirrors[raid_disk].spare = 1;
10735- }
10736- }
10737- if (!raid_conf->working_disks) {
10738- printk(KERN_ERR "raid1: no operational mirrors for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
10739- kfree(raid_conf);
10740- mddev->private = NULL;
10741- MOD_DEC_USE_COUNT;
10742- return -EIO;
10743- }
10744-
10745- raid_conf->raid_disks = sb->raid_disks;
10746- raid_conf->mddev = mddev;
10747-
10748- for (j = 0; !raid_conf->mirrors[j].operational; j++);
10749- raid_conf->last_used = j;
10750- for (i = raid_conf->raid_disks - 1; i >= 0; i--) {
10751- if (raid_conf->mirrors[i].operational) {
10752- PRINTK(("raid_conf->mirrors[%d].next == %d\n", i, j));
10753- raid_conf->mirrors[i].next = j;
10754+ printk(SPARE, partition_name(rdev->dev));
10755+ disk->number = descriptor->number;
10756+ disk->raid_disk = disk_idx;
10757+ disk->dev = rdev->dev;
10758+ disk->sect_limit = MAX_LINEAR_SECTORS;
10759+ disk->operational = 0;
10760+ disk->write_only = 0;
10761+ disk->spare = 1;
10762+ disk->used_slot = 1;
10763+ }
10764+ }
10765+ if (!conf->working_disks) {
10766+ printk(NONE_OPERATIONAL, mdidx(mddev));
10767+ goto out_free_conf;
10768+ }
10769+
10770+ conf->raid_disks = sb->raid_disks;
10771+ conf->nr_disks = sb->nr_disks;
10772+ conf->mddev = mddev;
10773+
10774+ for (i = 0; i < MD_SB_DISKS; i++) {
10775+
10776+ descriptor = sb->disks+i;
10777+ disk_idx = descriptor->raid_disk;
10778+ disk = conf->mirrors + disk_idx;
10779+
10780+ if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
10781+ !disk->used_slot) {
10782+
10783+ disk->number = descriptor->number;
10784+ disk->raid_disk = disk_idx;
10785+ disk->dev = MKDEV(0,0);
10786+
10787+ disk->operational = 0;
10788+ disk->write_only = 0;
10789+ disk->spare = 0;
10790+ disk->used_slot = 1;
10791+ }
10792+ }
10793+
10794+ /*
10795+ * find the first working one and use it as a starting point
10796+ * to read balancing.
10797+ */
10798+ for (j = 0; !conf->mirrors[j].operational; j++)
10799+ /* nothing */;
10800+ conf->last_used = j;
10801+
10802+ /*
10803+ * initialize the 'working disks' list.
10804+ */
10805+ for (i = conf->raid_disks - 1; i >= 0; i--) {
10806+ if (conf->mirrors[i].operational) {
10807+ conf->mirrors[i].next = j;
10808 j = i;
10809 }
10810 }
10811
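The two loops above first pick the lowest operational mirror as last_used and then, walking downwards, give every operational mirror a 'next' pointer to the next operational one (with wrap-around); that is the ring the read-balancing code later follows. A stand-alone sketch of the same construction, assuming a plain operational[] array in place of the mirror_info structures:

#include <stdio.h>

#define RAID_DISKS 4

int main(void)
{
	int operational[RAID_DISKS] = { 1, 0, 1, 1 };	/* disk 1 has failed */
	int next[RAID_DISKS] = { 0 };
	int i, j, last_used;

	/* first working disk becomes the starting point for read balancing */
	for (j = 0; !operational[j]; j++)
		/* nothing */;
	last_used = j;

	/* link the working disks into a ring, mirroring the loop in raid1_run() */
	for (i = RAID_DISKS - 1; i >= 0; i--) {
		if (operational[i]) {
			next[i] = j;
			j = i;
		}
	}

	for (i = 0; i < RAID_DISKS; i++)
		if (operational[i])
			printf("mirror %d -> next %d\n", i, next[i]);
	printf("last_used = %d\n", last_used);
	return 0;
}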
10812- if (check_consistency(mddev)) {
10813- printk(KERN_ERR "raid1: detected mirror differences -- run ckraid\n");
10814- sb->state |= 1 << MD_SB_ERRORS;
10815- kfree(raid_conf);
10816- mddev->private = NULL;
10817- MOD_DEC_USE_COUNT;
10818- return -EIO;
10819+ if (conf->working_disks != sb->raid_disks) {
10820+ printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
10821+ start_recovery = 1;
10822 }
10823
10824+ if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
10825+ /*
10826+ * we do sanity checks even if the device says
10827+ * it's clean ...
10828+ */
10829+ if (check_consistency(mddev)) {
10830+ printk(RUNNING_CKRAID);
10831+ sb->state &= ~(1 << MD_SB_CLEAN);
10832+ }
10833+ }
10834+
10835+ {
10836+ const char * name = "raid1d";
10837+
10838+ conf->thread = md_register_thread(raid1d, conf, name);
10839+ if (!conf->thread) {
10840+ printk(THREAD_ERROR, mdidx(mddev));
10841+ goto out_free_conf;
10842+ }
10843+ }
10844+
10845+ if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
10846+ const char * name = "raid1syncd";
10847+
10848+ conf->resync_thread = md_register_thread(raid1syncd, conf,name);
10849+ if (!conf->resync_thread) {
10850+ printk(THREAD_ERROR, mdidx(mddev));
10851+ goto out_free_conf;
10852+ }
10853+
10854+ printk(START_RESYNC, mdidx(mddev));
10855+ conf->resync_mirrors = 1;
10856+ md_wakeup_thread(conf->resync_thread);
10857+ }
10858+
10859 /*
10860 * Regenerate the "device is in sync with the raid set" bit for
10861 * each device.
10862 */
10863- for (i = 0; i < sb->nr_disks ; i++) {
10864- sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
10865+ for (i = 0; i < MD_SB_DISKS; i++) {
10866+ mark_disk_nonsync(sb->disks+i);
10867 for (j = 0; j < sb->raid_disks; j++) {
10868- if (!raid_conf->mirrors[j].operational)
10869+ if (!conf->mirrors[j].operational)
10870 continue;
10871- if (sb->disks[i].number == raid_conf->mirrors[j].number)
10872- sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
10873+ if (sb->disks[i].number == conf->mirrors[j].number)
10874+ mark_disk_sync(sb->disks+i);
10875 }
10876 }
10877- sb->active_disks = raid_conf->working_disks;
10878+ sb->active_disks = conf->working_disks;
10879
10880- printk("raid1: raid set %s active with %d out of %d mirrors\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks);
10881- /* Ok, everything is just fine now */
10882- return (0);
10883+ if (start_recovery)
10884+ md_recover_arrays();
10885+
10886+
10887+ printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
10888+ /*
10889+ * Ok, everything is just fine now
10890+ */
10891+ return 0;
10892+
10893+out_free_conf:
10894+ kfree(conf);
10895+ mddev->private = NULL;
10896+out:
10897+ MOD_DEC_USE_COUNT;
10898+ return -EIO;
10899+}
10900+
10901+#undef INVALID_LEVEL
10902+#undef NO_SB
10903+#undef ERRORS
10904+#undef NOT_IN_SYNC
10905+#undef INCONSISTENT
10906+#undef ALREADY_RUNNING
10907+#undef OPERATIONAL
10908+#undef SPARE
10909+#undef NONE_OPERATIONAL
10910+#undef RUNNING_CKRAID
10911+#undef ARRAY_IS_ACTIVE
10912+
10913+static int raid1_stop_resync (mddev_t *mddev)
10914+{
10915+ raid1_conf_t *conf = mddev_to_conf(mddev);
10916+
10917+ if (conf->resync_thread) {
10918+ if (conf->resync_mirrors) {
10919+ conf->resync_mirrors = 2;
10920+ md_interrupt_thread(conf->resync_thread);
10921+ printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
10922+ return 1;
10923+ }
10924+ return 0;
10925+ }
10926+ return 0;
10927+}
10928+
10929+static int raid1_restart_resync (mddev_t *mddev)
10930+{
10931+ raid1_conf_t *conf = mddev_to_conf(mddev);
10932+
10933+ if (conf->resync_mirrors) {
10934+ if (!conf->resync_thread) {
10935+ MD_BUG();
10936+ return 0;
10937+ }
10938+ conf->resync_mirrors = 1;
10939+ md_wakeup_thread(conf->resync_thread);
10940+ return 1;
10941+ }
10942+ return 0;
10943 }
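Taken together, raid1syncd(), raid1_stop_resync() and raid1_restart_resync() appear to use conf->resync_mirrors as a three-valued flag: 0 when no resync is owed, 1 while a resync is pending or running, and 2 when a running resync was interrupted and must wait for a restart. A compact sketch of that protocol; the enum labels are informal names for the values used above, not symbols from the patch:

#include <stdio.h>

/* informal labels for the values of conf->resync_mirrors used above */
enum { RESYNC_NONE = 0, RESYNC_NEEDED = 1, RESYNC_SUSPENDED = 2 };

static int resync_mirrors = RESYNC_NEEDED;

static void syncd_would_run(void)	/* raid1syncd() */
{
	if (resync_mirrors != RESYNC_NEEDED)
		return;				/* 0 or 2: nothing to do now */
	/* ... md_do_sync() would run here ... */
	resync_mirrors = RESYNC_NONE;		/* only if everything went Ok */
}

static int stop_resync(void)		/* raid1_stop_resync() */
{
	if (resync_mirrors == RESYNC_NEEDED) {
		resync_mirrors = RESYNC_SUSPENDED;
		return 1;			/* "restarting next time" */
	}
	return 0;
}

static int restart_resync(void)		/* raid1_restart_resync() */
{
	if (resync_mirrors) {
		resync_mirrors = RESYNC_NEEDED;	/* wake the sync thread again */
		return 1;
	}
	return 0;
}

int main(void)
{
	printf("stop: %d\n", stop_resync());		/* 1 -> suspended */
	printf("restart: %d\n", restart_resync());	/* 1 -> needed again */
	syncd_would_run();
	printf("final state: %d\n", resync_mirrors);	/* 0 */
	return 0;
}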
10944
10945-static int raid1_stop (int minor, struct md_dev *mddev)
10946+static int raid1_stop (mddev_t *mddev)
10947 {
10948- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
10949+ raid1_conf_t *conf = mddev_to_conf(mddev);
10950
10951- kfree (raid_conf);
10952+ md_unregister_thread(conf->thread);
10953+ if (conf->resync_thread)
10954+ md_unregister_thread(conf->resync_thread);
10955+ kfree(conf);
10956 mddev->private = NULL;
10957 MOD_DEC_USE_COUNT;
10958 return 0;
10959 }
10960
10961-static struct md_personality raid1_personality=
10962+static mdk_personality_t raid1_personality=
10963 {
10964 "raid1",
10965 raid1_map,
10966@@ -855,15 +1214,13 @@
10967 NULL, /* no ioctls */
10968 0,
10969 raid1_error,
10970- raid1_hot_add_disk,
10971- /* raid1_hot_remove_drive */ NULL,
10972- raid1_mark_spare
10973+ raid1_diskop,
10974+ raid1_stop_resync,
10975+ raid1_restart_resync
10976 };
10977
10978 int raid1_init (void)
10979 {
10980- if ((raid1_thread = md_register_thread(raid1d, NULL)) == NULL)
10981- return -EBUSY;
10982 return register_md_personality (RAID1, &raid1_personality);
10983 }
10984
10985@@ -875,7 +1232,6 @@
10986
10987 void cleanup_module (void)
10988 {
10989- md_unregister_thread (raid1_thread);
10990 unregister_md_personality (RAID1);
10991 }
10992 #endif
10993--- linux/drivers/block/raid5.c.orig Fri May 8 09:17:13 1998
10994+++ linux/drivers/block/raid5.c Fri Nov 23 11:18:20 2001
10995@@ -1,4 +1,4 @@
10996-/*****************************************************************************
10997+/*
10998 * raid5.c : Multiple Devices driver for Linux
10999 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
11000 *
11001@@ -14,16 +14,15 @@
11002 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
11003 */
11004
11005+
11006 #include <linux/module.h>
11007 #include <linux/locks.h>
11008 #include <linux/malloc.h>
11009-#include <linux/md.h>
11010-#include <linux/raid5.h>
11011+#include <linux/raid/raid5.h>
11012 #include <asm/bitops.h>
11013 #include <asm/atomic.h>
11014-#include <asm/md.h>
11015
11016-static struct md_personality raid5_personality;
11017+static mdk_personality_t raid5_personality;
11018
11019 /*
11020 * Stripe cache
11021@@ -33,7 +32,7 @@
11022 #define HASH_PAGES_ORDER 0
11023 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
11024 #define HASH_MASK (NR_HASH - 1)
11025-#define stripe_hash(raid_conf, sect, size) ((raid_conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
11026+#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
11027
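stripe_hash() above only has to turn a stripe's start sector into one of NR_HASH buckets: divide by the number of 512-byte sectors per buffer and mask with HASH_MASK (NR_HASH is a power of two). A quick stand-alone check of that arithmetic, with illustrative table and buffer sizes:

#include <stdio.h>

#define NR_HASH   256			/* illustrative; must be a power of two */
#define HASH_MASK (NR_HASH - 1)

/* same arithmetic as the stripe_hash() macro, minus the table lookup */
static unsigned int stripe_bucket(unsigned long sector, int size)
{
	return (sector / (size >> 9)) & HASH_MASK;	/* size in bytes, sectors of 512B */
}

int main(void)
{
	int size = 4096;		/* 8 sectors per stripe buffer */
	unsigned long s;

	for (s = 0; s < 4 * 8; s += 8)	/* four consecutive stripes */
		printf("sector %3lu -> bucket %u\n", s, stripe_bucket(s, size));
	return 0;
}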
11028 /*
11029 * The following can be used to debug the driver
11030@@ -46,6 +45,8 @@
11031 #define PRINTK(x) do { ; } while (0)
11032 #endif
11033
11034+static void print_raid5_conf (raid5_conf_t *conf);
11035+
11036 static inline int stripe_locked(struct stripe_head *sh)
11037 {
11038 return test_bit(STRIPE_LOCKED, &sh->state);
11039@@ -61,32 +62,32 @@
11040 */
11041 static inline void lock_stripe(struct stripe_head *sh)
11042 {
11043- struct raid5_data *raid_conf = sh->raid_conf;
11044- if (!test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
11045+ raid5_conf_t *conf = sh->raid_conf;
11046+ if (!md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
11047 PRINTK(("locking stripe %lu\n", sh->sector));
11048- raid_conf->nr_locked_stripes++;
11049+ conf->nr_locked_stripes++;
11050 }
11051 }
11052
11053 static inline void unlock_stripe(struct stripe_head *sh)
11054 {
11055- struct raid5_data *raid_conf = sh->raid_conf;
11056- if (test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
11057+ raid5_conf_t *conf = sh->raid_conf;
11058+ if (md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
11059 PRINTK(("unlocking stripe %lu\n", sh->sector));
11060- raid_conf->nr_locked_stripes--;
11061+ conf->nr_locked_stripes--;
11062 wake_up(&sh->wait);
11063 }
11064 }
11065
11066 static inline void finish_stripe(struct stripe_head *sh)
11067 {
11068- struct raid5_data *raid_conf = sh->raid_conf;
11069+ raid5_conf_t *conf = sh->raid_conf;
11070 unlock_stripe(sh);
11071 sh->cmd = STRIPE_NONE;
11072 sh->phase = PHASE_COMPLETE;
11073- raid_conf->nr_pending_stripes--;
11074- raid_conf->nr_cached_stripes++;
11075- wake_up(&raid_conf->wait_for_stripe);
11076+ conf->nr_pending_stripes--;
11077+ conf->nr_cached_stripes++;
11078+ wake_up(&conf->wait_for_stripe);
11079 }
11080
11081 void __wait_on_stripe(struct stripe_head *sh)
11082@@ -114,7 +115,7 @@
11083 __wait_on_stripe(sh);
11084 }
11085
11086-static inline void remove_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
11087+static inline void remove_hash(raid5_conf_t *conf, struct stripe_head *sh)
11088 {
11089 PRINTK(("remove_hash(), stripe %lu\n", sh->sector));
11090
11091@@ -123,21 +124,22 @@
11092 sh->hash_next->hash_pprev = sh->hash_pprev;
11093 *sh->hash_pprev = sh->hash_next;
11094 sh->hash_pprev = NULL;
11095- raid_conf->nr_hashed_stripes--;
11096+ conf->nr_hashed_stripes--;
11097 }
11098 }
11099
11100-static inline void insert_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
11101+static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
11102 {
11103- struct stripe_head **shp = &stripe_hash(raid_conf, sh->sector, sh->size);
11104+ struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size);
11105
11106- PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", sh->sector, raid_conf->nr_hashed_stripes));
11107+ PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n",
11108+ sh->sector, conf->nr_hashed_stripes));
11109
11110 if ((sh->hash_next = *shp) != NULL)
11111 (*shp)->hash_pprev = &sh->hash_next;
11112 *shp = sh;
11113 sh->hash_pprev = shp;
11114- raid_conf->nr_hashed_stripes++;
11115+ conf->nr_hashed_stripes++;
11116 }
11117
11118 static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size)
11119@@ -145,13 +147,15 @@
11120 struct buffer_head *bh;
11121 unsigned long flags;
11122
11123- save_flags(flags);
11124- cli();
11125- if ((bh = sh->buffer_pool) == NULL)
11126- return NULL;
11127+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
11128+ bh = sh->buffer_pool;
11129+ if (!bh)
11130+ goto out_unlock;
11131 sh->buffer_pool = bh->b_next;
11132 bh->b_size = b_size;
11133- restore_flags(flags);
11134+out_unlock:
11135+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11136+
11137 return bh;
11138 }
11139
11140@@ -160,12 +164,14 @@
11141 struct buffer_head *bh;
11142 unsigned long flags;
11143
11144- save_flags(flags);
11145- cli();
11146- if ((bh = sh->bh_pool) == NULL)
11147- return NULL;
11148+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
11149+ bh = sh->bh_pool;
11150+ if (!bh)
11151+ goto out_unlock;
11152 sh->bh_pool = bh->b_next;
11153- restore_flags(flags);
11154+out_unlock:
11155+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11156+
11157 return bh;
11158 }
11159
11160@@ -173,54 +179,52 @@
11161 {
11162 unsigned long flags;
11163
11164- save_flags(flags);
11165- cli();
11166+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
11167 bh->b_next = sh->buffer_pool;
11168 sh->buffer_pool = bh;
11169- restore_flags(flags);
11170+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11171 }
11172
11173 static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh)
11174 {
11175 unsigned long flags;
11176
11177- save_flags(flags);
11178- cli();
11179+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
11180 bh->b_next = sh->bh_pool;
11181 sh->bh_pool = bh;
11182- restore_flags(flags);
11183+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11184 }
11185
11186-static struct stripe_head *get_free_stripe(struct raid5_data *raid_conf)
11187+static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
11188 {
11189 struct stripe_head *sh;
11190 unsigned long flags;
11191
11192 save_flags(flags);
11193 cli();
11194- if ((sh = raid_conf->free_sh_list) == NULL) {
11195+ if ((sh = conf->free_sh_list) == NULL) {
11196 restore_flags(flags);
11197 return NULL;
11198 }
11199- raid_conf->free_sh_list = sh->free_next;
11200- raid_conf->nr_free_sh--;
11201- if (!raid_conf->nr_free_sh && raid_conf->free_sh_list)
11202+ conf->free_sh_list = sh->free_next;
11203+ conf->nr_free_sh--;
11204+ if (!conf->nr_free_sh && conf->free_sh_list)
11205 printk ("raid5: bug: free_sh_list != NULL, nr_free_sh == 0\n");
11206 restore_flags(flags);
11207- if (sh->hash_pprev || sh->nr_pending || sh->count)
11208+ if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || sh->count)
11209 printk("get_free_stripe(): bug\n");
11210 return sh;
11211 }
11212
11213-static void put_free_stripe(struct raid5_data *raid_conf, struct stripe_head *sh)
11214+static void put_free_stripe(raid5_conf_t *conf, struct stripe_head *sh)
11215 {
11216 unsigned long flags;
11217
11218 save_flags(flags);
11219 cli();
11220- sh->free_next = raid_conf->free_sh_list;
11221- raid_conf->free_sh_list = sh;
11222- raid_conf->nr_free_sh++;
11223+ sh->free_next = conf->free_sh_list;
11224+ conf->free_sh_list = sh;
11225+ conf->nr_free_sh++;
11226 restore_flags(flags);
11227 }
11228
11229@@ -324,8 +328,8 @@
11230
11231 static void kfree_stripe(struct stripe_head *sh)
11232 {
11233- struct raid5_data *raid_conf = sh->raid_conf;
11234- int disks = raid_conf->raid_disks, j;
11235+ raid5_conf_t *conf = sh->raid_conf;
11236+ int disks = conf->raid_disks, j;
11237
11238 PRINTK(("kfree_stripe called, stripe %lu\n", sh->sector));
11239 if (sh->phase != PHASE_COMPLETE || stripe_locked(sh) || sh->count) {
11240@@ -338,19 +342,19 @@
11241 if (sh->bh_new[j] || sh->bh_copy[j])
11242 printk("raid5: bug: sector %lu, new %p, copy %p\n", sh->sector, sh->bh_new[j], sh->bh_copy[j]);
11243 }
11244- remove_hash(raid_conf, sh);
11245- put_free_stripe(raid_conf, sh);
11246+ remove_hash(conf, sh);
11247+ put_free_stripe(conf, sh);
11248 }
11249
11250-static int shrink_stripe_cache(struct raid5_data *raid_conf, int nr)
11251+static int shrink_stripe_cache(raid5_conf_t *conf, int nr)
11252 {
11253 struct stripe_head *sh;
11254 int i, count = 0;
11255
11256- PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, raid_conf->nr_hashed_stripes, raid_conf->clock));
11257+ PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, conf->nr_hashed_stripes, conf->clock));
11258 for (i = 0; i < NR_HASH; i++) {
11259 repeat:
11260- sh = raid_conf->stripe_hashtbl[(i + raid_conf->clock) & HASH_MASK];
11261+ sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK];
11262 for (; sh; sh = sh->hash_next) {
11263 if (sh->phase != PHASE_COMPLETE)
11264 continue;
11265@@ -360,30 +364,30 @@
11266 continue;
11267 kfree_stripe(sh);
11268 if (++count == nr) {
11269- PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
11270- raid_conf->clock = (i + raid_conf->clock) & HASH_MASK;
11271+ PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
11272+ conf->clock = (i + conf->clock) & HASH_MASK;
11273 return nr;
11274 }
11275 goto repeat;
11276 }
11277 }
11278- PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
11279+ PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
11280 return count;
11281 }
11282
11283-static struct stripe_head *find_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
11284+static struct stripe_head *find_stripe(raid5_conf_t *conf, unsigned long sector, int size)
11285 {
11286 struct stripe_head *sh;
11287
11288- if (raid_conf->buffer_size != size) {
11289- PRINTK(("switching size, %d --> %d\n", raid_conf->buffer_size, size));
11290- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
11291- raid_conf->buffer_size = size;
11292+ if (conf->buffer_size != size) {
11293+ PRINTK(("switching size, %d --> %d\n", conf->buffer_size, size));
11294+ shrink_stripe_cache(conf, conf->max_nr_stripes);
11295+ conf->buffer_size = size;
11296 }
11297
11298 PRINTK(("find_stripe, sector %lu\n", sector));
11299- for (sh = stripe_hash(raid_conf, sector, size); sh; sh = sh->hash_next)
11300- if (sh->sector == sector && sh->raid_conf == raid_conf) {
11301+ for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next)
11302+ if (sh->sector == sector && sh->raid_conf == conf) {
11303 if (sh->size == size) {
11304 PRINTK(("found stripe %lu\n", sector));
11305 return sh;
11306@@ -397,7 +401,7 @@
11307 return NULL;
11308 }
11309
11310-static int grow_stripes(struct raid5_data *raid_conf, int num, int priority)
11311+static int grow_stripes(raid5_conf_t *conf, int num, int priority)
11312 {
11313 struct stripe_head *sh;
11314
11315@@ -405,62 +409,64 @@
11316 if ((sh = kmalloc(sizeof(struct stripe_head), priority)) == NULL)
11317 return 1;
11318 memset(sh, 0, sizeof(*sh));
11319- if (grow_buffers(sh, 2 * raid_conf->raid_disks, PAGE_SIZE, priority)) {
11320- shrink_buffers(sh, 2 * raid_conf->raid_disks);
11321+ sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED;
11322+
11323+ if (grow_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) {
11324+ shrink_buffers(sh, 2 * conf->raid_disks);
11325 kfree(sh);
11326 return 1;
11327 }
11328- if (grow_bh(sh, raid_conf->raid_disks, priority)) {
11329- shrink_buffers(sh, 2 * raid_conf->raid_disks);
11330- shrink_bh(sh, raid_conf->raid_disks);
11331+ if (grow_bh(sh, conf->raid_disks, priority)) {
11332+ shrink_buffers(sh, 2 * conf->raid_disks);
11333+ shrink_bh(sh, conf->raid_disks);
11334 kfree(sh);
11335 return 1;
11336 }
11337- put_free_stripe(raid_conf, sh);
11338- raid_conf->nr_stripes++;
11339+ put_free_stripe(conf, sh);
11340+ conf->nr_stripes++;
11341 }
11342 return 0;
11343 }
11344
11345-static void shrink_stripes(struct raid5_data *raid_conf, int num)
11346+static void shrink_stripes(raid5_conf_t *conf, int num)
11347 {
11348 struct stripe_head *sh;
11349
11350 while (num--) {
11351- sh = get_free_stripe(raid_conf);
11352+ sh = get_free_stripe(conf);
11353 if (!sh)
11354 break;
11355- shrink_buffers(sh, raid_conf->raid_disks * 2);
11356- shrink_bh(sh, raid_conf->raid_disks);
11357+ shrink_buffers(sh, conf->raid_disks * 2);
11358+ shrink_bh(sh, conf->raid_disks);
11359 kfree(sh);
11360- raid_conf->nr_stripes--;
11361+ conf->nr_stripes--;
11362 }
11363 }
11364
11365-static struct stripe_head *kmalloc_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
11366+static struct stripe_head *kmalloc_stripe(raid5_conf_t *conf, unsigned long sector, int size)
11367 {
11368 struct stripe_head *sh = NULL, *tmp;
11369 struct buffer_head *buffer_pool, *bh_pool;
11370
11371 PRINTK(("kmalloc_stripe called\n"));
11372
11373- while ((sh = get_free_stripe(raid_conf)) == NULL) {
11374- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes / 8);
11375- if ((sh = get_free_stripe(raid_conf)) != NULL)
11376+ while ((sh = get_free_stripe(conf)) == NULL) {
11377+ shrink_stripe_cache(conf, conf->max_nr_stripes / 8);
11378+ if ((sh = get_free_stripe(conf)) != NULL)
11379 break;
11380- if (!raid_conf->nr_pending_stripes)
11381+ if (!conf->nr_pending_stripes)
11382 printk("raid5: bug: nr_free_sh == 0, nr_pending_stripes == 0\n");
11383- md_wakeup_thread(raid_conf->thread);
11384+ md_wakeup_thread(conf->thread);
11385 PRINTK(("waiting for some stripes to complete\n"));
11386- sleep_on(&raid_conf->wait_for_stripe);
11387+ sleep_on(&conf->wait_for_stripe);
11388 }
11389
11390 /*
11391 * The above might have slept, so perhaps another process
11392 * already created the stripe for us..
11393 */
11394- if ((tmp = find_stripe(raid_conf, sector, size)) != NULL) {
11395- put_free_stripe(raid_conf, sh);
11396+ if ((tmp = find_stripe(conf, sector, size)) != NULL) {
11397+ put_free_stripe(conf, sh);
11398 wait_on_stripe(tmp);
11399 return tmp;
11400 }
11401@@ -472,25 +478,25 @@
11402 sh->bh_pool = bh_pool;
11403 sh->phase = PHASE_COMPLETE;
11404 sh->cmd = STRIPE_NONE;
11405- sh->raid_conf = raid_conf;
11406+ sh->raid_conf = conf;
11407 sh->sector = sector;
11408 sh->size = size;
11409- raid_conf->nr_cached_stripes++;
11410- insert_hash(raid_conf, sh);
11411+ conf->nr_cached_stripes++;
11412+ insert_hash(conf, sh);
11413 } else printk("raid5: bug: kmalloc_stripe() == NULL\n");
11414 return sh;
11415 }
11416
11417-static struct stripe_head *get_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
11418+static struct stripe_head *get_stripe(raid5_conf_t *conf, unsigned long sector, int size)
11419 {
11420 struct stripe_head *sh;
11421
11422 PRINTK(("get_stripe, sector %lu\n", sector));
11423- sh = find_stripe(raid_conf, sector, size);
11424+ sh = find_stripe(conf, sector, size);
11425 if (sh)
11426 wait_on_stripe(sh);
11427 else
11428- sh = kmalloc_stripe(raid_conf, sector, size);
11429+ sh = kmalloc_stripe(conf, sector, size);
11430 return sh;
11431 }
11432
11433@@ -523,7 +529,7 @@
11434 bh->b_end_io(bh, uptodate);
11435 if (!uptodate)
11436 printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for "
11437- "block %lu\n", kdevname(bh->b_dev), bh->b_blocknr);
11438+ "block %lu\n", partition_name(bh->b_dev), bh->b_blocknr);
11439 }
11440
11441 static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate)
11442@@ -537,36 +543,35 @@
11443 static void raid5_end_request (struct buffer_head * bh, int uptodate)
11444 {
11445 struct stripe_head *sh = bh->b_dev_id;
11446- struct raid5_data *raid_conf = sh->raid_conf;
11447- int disks = raid_conf->raid_disks, i;
11448+ raid5_conf_t *conf = sh->raid_conf;
11449+ int disks = conf->raid_disks, i;
11450 unsigned long flags;
11451
11452 PRINTK(("end_request %lu, nr_pending %d\n", sh->sector, sh->nr_pending));
11453- save_flags(flags);
11454- cli();
11455+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
11456 raid5_mark_buffer_uptodate(bh, uptodate);
11457- --sh->nr_pending;
11458- if (!sh->nr_pending) {
11459- md_wakeup_thread(raid_conf->thread);
11460- atomic_inc(&raid_conf->nr_handle);
11461+ if (atomic_dec_and_test(&sh->nr_pending)) {
11462+ md_wakeup_thread(conf->thread);
11463+ atomic_inc(&conf->nr_handle);
11464 }
11465- if (!uptodate)
11466+ if (!uptodate) {
11467 md_error(bh->b_dev, bh->b_rdev);
11468- if (raid_conf->failed_disks) {
11469+ }
11470+ if (conf->failed_disks) {
11471 for (i = 0; i < disks; i++) {
11472- if (raid_conf->disks[i].operational)
11473+ if (conf->disks[i].operational)
11474 continue;
11475 if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i])
11476 continue;
11477- if (bh->b_rdev != raid_conf->disks[i].dev)
11478+ if (bh->b_rdev != conf->disks[i].dev)
11479 continue;
11480 set_bit(STRIPE_ERROR, &sh->state);
11481 }
11482 }
11483- restore_flags(flags);
11484+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11485 }
11486
11487-static int raid5_map (struct md_dev *mddev, kdev_t *rdev,
11488+static int raid5_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
11489 unsigned long *rsector, unsigned long size)
11490 {
11491 /* No complex mapping used: the core of the work is done in the
11492@@ -577,11 +582,10 @@
11493
11494 static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i)
11495 {
11496- struct raid5_data *raid_conf = sh->raid_conf;
11497- struct md_dev *mddev = raid_conf->mddev;
11498- int minor = (int) (mddev - md_dev);
11499+ raid5_conf_t *conf = sh->raid_conf;
11500+ mddev_t *mddev = conf->mddev;
11501 char *b_data;
11502- kdev_t dev = MKDEV(MD_MAJOR, minor);
11503+ kdev_t dev = mddev_to_kdev(mddev);
11504 int block = sh->sector / (sh->size >> 9);
11505
11506 b_data = ((volatile struct buffer_head *) bh)->b_data;
11507@@ -589,7 +593,7 @@
11508 init_buffer(bh, dev, block, raid5_end_request, sh);
11509 ((volatile struct buffer_head *) bh)->b_data = b_data;
11510
11511- bh->b_rdev = raid_conf->disks[i].dev;
11512+ bh->b_rdev = conf->disks[i].dev;
11513 bh->b_rsector = sh->sector;
11514
11515 bh->b_state = (1 << BH_Req);
11516@@ -597,33 +601,62 @@
11517 bh->b_list = BUF_LOCKED;
11518 }
11519
11520-static int raid5_error (struct md_dev *mddev, kdev_t dev)
11521+static int raid5_error (mddev_t *mddev, kdev_t dev)
11522 {
11523- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
11524- md_superblock_t *sb = mddev->sb;
11525+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
11526+ mdp_super_t *sb = mddev->sb;
11527 struct disk_info *disk;
11528 int i;
11529
11530 PRINTK(("raid5_error called\n"));
11531- raid_conf->resync_parity = 0;
11532- for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
11533+ conf->resync_parity = 0;
11534+ for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
11535 if (disk->dev == dev && disk->operational) {
11536 disk->operational = 0;
11537- sb->disks[disk->number].state |= (1 << MD_FAULTY_DEVICE);
11538- sb->disks[disk->number].state &= ~(1 << MD_SYNC_DEVICE);
11539- sb->disks[disk->number].state &= ~(1 << MD_ACTIVE_DEVICE);
11540+ mark_disk_faulty(sb->disks+disk->number);
11541+ mark_disk_nonsync(sb->disks+disk->number);
11542+ mark_disk_inactive(sb->disks+disk->number);
11543 sb->active_disks--;
11544 sb->working_disks--;
11545 sb->failed_disks++;
11546 mddev->sb_dirty = 1;
11547- raid_conf->working_disks--;
11548- raid_conf->failed_disks++;
11549- md_wakeup_thread(raid_conf->thread);
11550+ conf->working_disks--;
11551+ conf->failed_disks++;
11552+ md_wakeup_thread(conf->thread);
11553 printk (KERN_ALERT
11554- "RAID5: Disk failure on %s, disabling device."
11555- "Operation continuing on %d devices\n",
11556- kdevname (dev), raid_conf->working_disks);
11557+ "raid5: Disk failure on %s, disabling device."
11558+ " Operation continuing on %d devices\n",
11559+ partition_name (dev), conf->working_disks);
11560+ return -EIO;
11561 }
11562+ }
11563+ /*
11564+ * handle errors in spares (during reconstruction)
11565+ */
11566+ if (conf->spare) {
11567+ disk = conf->spare;
11568+ if (disk->dev == dev) {
11569+ printk (KERN_ALERT
11570+ "raid5: Disk failure on spare %s\n",
11571+ partition_name (dev));
11572+ if (!conf->spare->operational) {
11573+ MD_BUG();
11574+ return -EIO;
11575+ }
11576+ disk->operational = 0;
11577+ disk->write_only = 0;
11578+ conf->spare = NULL;
11579+ mark_disk_faulty(sb->disks+disk->number);
11580+ mark_disk_nonsync(sb->disks+disk->number);
11581+ mark_disk_inactive(sb->disks+disk->number);
11582+ sb->spare_disks--;
11583+ sb->working_disks--;
11584+ sb->failed_disks++;
11585+
11586+ return -EIO;
11587+ }
11588+ }
11589+ MD_BUG();
11590 return 0;
11591 }
11592
11593@@ -634,12 +667,12 @@
11594 static inline unsigned long
11595 raid5_compute_sector (int r_sector, unsigned int raid_disks, unsigned int data_disks,
11596 unsigned int * dd_idx, unsigned int * pd_idx,
11597- struct raid5_data *raid_conf)
11598+ raid5_conf_t *conf)
11599 {
11600 unsigned int stripe;
11601 int chunk_number, chunk_offset;
11602 unsigned long new_sector;
11603- int sectors_per_chunk = raid_conf->chunk_size >> 9;
11604+ int sectors_per_chunk = conf->chunk_size >> 9;
11605
11606 /* First compute the information on this sector */
11607
11608@@ -662,9 +695,9 @@
11609 /*
11610 * Select the parity disk based on the user selected algorithm.
11611 */
11612- if (raid_conf->level == 4)
11613+ if (conf->level == 4)
11614 *pd_idx = data_disks;
11615- else switch (raid_conf->algorithm) {
11616+ else switch (conf->algorithm) {
11617 case ALGORITHM_LEFT_ASYMMETRIC:
11618 *pd_idx = data_disks - stripe % raid_disks;
11619 if (*dd_idx >= *pd_idx)
11620@@ -684,7 +717,7 @@
11621 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
11622 break;
11623 default:
11624- printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
11625+ printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
11626 }
11627
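The switch above only decides where the parity chunk sits inside each stripe. For the left-asymmetric layout, the visible formula puts parity on disk data_disks - (stripe % raid_disks) and bumps the data index past it. A small user-space rendering of that placement rule, assuming the usual chunk -> (stripe, dd_idx) split that raid5_compute_sector() performs earlier; the array size is illustrative:

#include <stdio.h>

int main(void)
{
	const int raid_disks = 4;		/* illustrative array size */
	const int data_disks = raid_disks - 1;	/* RAID-5: one parity chunk per stripe */
	int chunk;

	printf("chunk  stripe  dd_idx  pd_idx   (left-asymmetric)\n");
	for (chunk = 0; chunk < 12; chunk++) {
		int stripe = chunk / data_disks;	/* which stripe the chunk lives in */
		int dd_idx = chunk % data_disks;	/* data slot before adjustment */
		int pd_idx = data_disks - stripe % raid_disks;	/* parity disk */

		if (dd_idx >= pd_idx)			/* skip over the parity disk */
			dd_idx++;
		printf("%5d  %6d  %6d  %6d\n", chunk, stripe, dd_idx, pd_idx);
	}
	return 0;
}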
11628 /*
11629@@ -705,16 +738,16 @@
11630
11631 static unsigned long compute_blocknr(struct stripe_head *sh, int i)
11632 {
11633- struct raid5_data *raid_conf = sh->raid_conf;
11634- int raid_disks = raid_conf->raid_disks, data_disks = raid_disks - 1;
11635+ raid5_conf_t *conf = sh->raid_conf;
11636+ int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
11637 unsigned long new_sector = sh->sector, check;
11638- int sectors_per_chunk = raid_conf->chunk_size >> 9;
11639+ int sectors_per_chunk = conf->chunk_size >> 9;
11640 unsigned long stripe = new_sector / sectors_per_chunk;
11641 int chunk_offset = new_sector % sectors_per_chunk;
11642 int chunk_number, dummy1, dummy2, dd_idx = i;
11643 unsigned long r_sector, blocknr;
11644
11645- switch (raid_conf->algorithm) {
11646+ switch (conf->algorithm) {
11647 case ALGORITHM_LEFT_ASYMMETRIC:
11648 case ALGORITHM_RIGHT_ASYMMETRIC:
11649 if (i > sh->pd_idx)
11650@@ -727,14 +760,14 @@
11651 i -= (sh->pd_idx + 1);
11652 break;
11653 default:
11654- printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
11655+ printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
11656 }
11657
11658 chunk_number = stripe * data_disks + i;
11659 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
11660 blocknr = r_sector / (sh->size >> 9);
11661
11662- check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, raid_conf);
11663+ check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
11664 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
11665 printk("compute_blocknr: map not correct\n");
11666 return 0;
11667@@ -742,36 +775,11 @@
11668 return blocknr;
11669 }
11670
11671-#ifdef HAVE_ARCH_XORBLOCK
11672-static void xor_block(struct buffer_head *dest, struct buffer_head *source)
11673-{
11674- __xor_block((char *) dest->b_data, (char *) source->b_data, dest->b_size);
11675-}
11676-#else
11677-static void xor_block(struct buffer_head *dest, struct buffer_head *source)
11678-{
11679- long lines = dest->b_size / (sizeof (long)) / 8, i;
11680- long *destp = (long *) dest->b_data, *sourcep = (long *) source->b_data;
11681-
11682- for (i = lines; i > 0; i--) {
11683- *(destp + 0) ^= *(sourcep + 0);
11684- *(destp + 1) ^= *(sourcep + 1);
11685- *(destp + 2) ^= *(sourcep + 2);
11686- *(destp + 3) ^= *(sourcep + 3);
11687- *(destp + 4) ^= *(sourcep + 4);
11688- *(destp + 5) ^= *(sourcep + 5);
11689- *(destp + 6) ^= *(sourcep + 6);
11690- *(destp + 7) ^= *(sourcep + 7);
11691- destp += 8;
11692- sourcep += 8;
11693- }
11694-}
11695-#endif
11696-
11697 static void compute_block(struct stripe_head *sh, int dd_idx)
11698 {
11699- struct raid5_data *raid_conf = sh->raid_conf;
11700- int i, disks = raid_conf->raid_disks;
11701+ raid5_conf_t *conf = sh->raid_conf;
11702+ int i, count, disks = conf->raid_disks;
11703+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
11704
11705 PRINTK(("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx));
11706
11707@@ -780,69 +788,100 @@
11708 raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx);
11709
11710 memset(sh->bh_old[dd_idx]->b_data, 0, sh->size);
11711+ bh_ptr[0] = sh->bh_old[dd_idx];
11712+ count = 1;
11713 for (i = 0; i < disks; i++) {
11714 if (i == dd_idx)
11715 continue;
11716 if (sh->bh_old[i]) {
11717- xor_block(sh->bh_old[dd_idx], sh->bh_old[i]);
11718- continue;
11719- } else
11720+ bh_ptr[count++] = sh->bh_old[i];
11721+ } else {
11722 printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
11723+ }
11724+ if (count == MAX_XOR_BLOCKS) {
11725+ xor_block(count, &bh_ptr[0]);
11726+ count = 1;
11727+ }
11728+ }
11729+ if(count != 1) {
11730+ xor_block(count, &bh_ptr[0]);
11731 }
11732 raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1);
11733 }
11734
11735 static void compute_parity(struct stripe_head *sh, int method)
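compute_block() above rebuilds a missing block by zeroing it and XOR-ing in every surviving block, collecting buffer pointers in bh_ptr[] and flushing through xor_block() whenever MAX_XOR_BLOCKS pointers have piled up, with slot 0 always holding the destination. The same batching pattern over plain byte arrays, as a stand-alone check (the xor_block() semantics are inferred from the calls above; the sizes are illustrative):

#include <stdio.h>
#include <string.h>

#define MAX_XOR_BLOCKS 5	/* illustrative batch limit */
#define DISKS          8
#define SIZE          16

/* XOR ptr[1..count-1] into ptr[0], as the xor_block() calls above imply */
static void xor_block(int count, unsigned char **ptr)
{
	int i, b;
	for (i = 1; i < count; i++)
		for (b = 0; b < SIZE; b++)
			ptr[0][b] ^= ptr[i][b];
}

int main(void)
{
	unsigned char disk[DISKS][SIZE], lost[SIZE];
	unsigned char *bh_ptr[MAX_XOR_BLOCKS];
	int i, b, count, dd_idx = 3;		/* the block to rebuild */

	/* make disk[dd_idx] the XOR of all the other blocks, as RAID-5 parity
	 * guarantees, then forget it and rebuild it from the survivors */
	for (i = 0; i < DISKS; i++)
		memset(disk[i], 0x10 + i, SIZE);
	memset(disk[dd_idx], 0, SIZE);
	for (i = 0; i < DISKS; i++)
		if (i != dd_idx)
			for (b = 0; b < SIZE; b++)
				disk[dd_idx][b] ^= disk[i][b];
	memcpy(lost, disk[dd_idx], SIZE);
	memset(disk[dd_idx], 0, SIZE);

	/* the batching loop from compute_block(), over byte arrays */
	bh_ptr[0] = disk[dd_idx];
	count = 1;
	for (i = 0; i < DISKS; i++) {
		if (i == dd_idx)
			continue;
		bh_ptr[count++] = disk[i];
		if (count == MAX_XOR_BLOCKS) {
			xor_block(count, bh_ptr);
			count = 1;
		}
	}
	if (count != 1)
		xor_block(count, bh_ptr);

	printf("rebuilt block %s the original\n",
	       memcmp(disk[dd_idx], lost, SIZE) ? "differs from" : "matches");
	return 0;
}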
11736 {
11737- struct raid5_data *raid_conf = sh->raid_conf;
11738- int i, pd_idx = sh->pd_idx, disks = raid_conf->raid_disks;
11739+ raid5_conf_t *conf = sh->raid_conf;
11740+ int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, lowprio, count;
11741+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
11742
11743 PRINTK(("compute_parity, stripe %lu, method %d\n", sh->sector, method));
11744+ lowprio = 1;
11745 for (i = 0; i < disks; i++) {
11746 if (i == pd_idx || !sh->bh_new[i])
11747 continue;
11748 if (!sh->bh_copy[i])
11749 sh->bh_copy[i] = raid5_kmalloc_buffer(sh, sh->size);
11750 raid5_build_block(sh, sh->bh_copy[i], i);
11751+ if (!buffer_lowprio(sh->bh_new[i]))
11752+ lowprio = 0;
11753+ else
11754+ mark_buffer_lowprio(sh->bh_copy[i]);
11755 mark_buffer_clean(sh->bh_new[i]);
11756 memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size);
11757 }
11758 if (sh->bh_copy[pd_idx] == NULL)
11759 sh->bh_copy[pd_idx] = raid5_kmalloc_buffer(sh, sh->size);
11760 raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx);
11761+ if (lowprio)
11762+ mark_buffer_lowprio(sh->bh_copy[pd_idx]);
11763
11764 if (method == RECONSTRUCT_WRITE) {
11765 memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size);
11766+ bh_ptr[0] = sh->bh_copy[pd_idx];
11767+ count = 1;
11768 for (i = 0; i < disks; i++) {
11769 if (i == sh->pd_idx)
11770 continue;
11771 if (sh->bh_new[i]) {
11772- xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
11773- continue;
11774+ bh_ptr[count++] = sh->bh_copy[i];
11775+ } else if (sh->bh_old[i]) {
11776+ bh_ptr[count++] = sh->bh_old[i];
11777 }
11778- if (sh->bh_old[i]) {
11779- xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
11780- continue;
11781+ if (count == MAX_XOR_BLOCKS) {
11782+ xor_block(count, &bh_ptr[0]);
11783+ count = 1;
11784 }
11785 }
11786+ if (count != 1) {
11787+ xor_block(count, &bh_ptr[0]);
11788+ }
11789 } else if (method == READ_MODIFY_WRITE) {
11790 memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size);
11791+ bh_ptr[0] = sh->bh_copy[pd_idx];
11792+ count = 1;
11793 for (i = 0; i < disks; i++) {
11794 if (i == sh->pd_idx)
11795 continue;
11796 if (sh->bh_new[i] && sh->bh_old[i]) {
11797- xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
11798- xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
11799- continue;
11800+ bh_ptr[count++] = sh->bh_copy[i];
11801+ bh_ptr[count++] = sh->bh_old[i];
11802+ }
11803+ if (count >= (MAX_XOR_BLOCKS - 1)) {
11804+ xor_block(count, &bh_ptr[0]);
11805+ count = 1;
11806 }
11807 }
11808+ if (count != 1) {
11809+ xor_block(count, &bh_ptr[0]);
11810+ }
11811 }
11812 raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1);
11813 }
11814
11815 static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
11816 {
11817- struct raid5_data *raid_conf = sh->raid_conf;
11818+ raid5_conf_t *conf = sh->raid_conf;
11819 struct buffer_head *bh_req;
11820
11821 if (sh->bh_new[dd_idx]) {
11822@@ -860,19 +899,22 @@
11823 if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) {
11824 sh->phase = PHASE_BEGIN;
11825 sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE;
11826- raid_conf->nr_pending_stripes++;
11827- atomic_inc(&raid_conf->nr_handle);
11828+ conf->nr_pending_stripes++;
11829+ atomic_inc(&conf->nr_handle);
11830 }
11831 sh->bh_new[dd_idx] = bh;
11832 sh->bh_req[dd_idx] = bh_req;
11833 sh->cmd_new[dd_idx] = rw;
11834 sh->new[dd_idx] = 1;
11835+
11836+ if (buffer_lowprio(bh))
11837+ mark_buffer_lowprio(bh_req);
11838 }
11839
11840 static void complete_stripe(struct stripe_head *sh)
11841 {
11842- struct raid5_data *raid_conf = sh->raid_conf;
11843- int disks = raid_conf->raid_disks;
11844+ raid5_conf_t *conf = sh->raid_conf;
11845+ int disks = conf->raid_disks;
11846 int i, new = 0;
11847
11848 PRINTK(("complete_stripe %lu\n", sh->sector));
11849@@ -909,6 +951,22 @@
11850 }
11851 }
11852
11853+
11854+static int is_stripe_lowprio(struct stripe_head *sh, int disks)
11855+{
11856+ int i, lowprio = 1;
11857+
11858+ for (i = 0; i < disks; i++) {
11859+ if (sh->bh_new[i])
11860+ if (!buffer_lowprio(sh->bh_new[i]))
11861+ lowprio = 0;
11862+ if (sh->bh_old[i])
11863+ if (!buffer_lowprio(sh->bh_old[i]))
11864+ lowprio = 0;
11865+ }
11866+ return lowprio;
11867+}
11868+
11869 /*
11870 * handle_stripe() is our main logic routine. Note that:
11871 *
11872@@ -919,28 +977,27 @@
11873 * 2. We should be careful to set sh->nr_pending whenever we sleep,
11874 * to prevent re-entry of handle_stripe() for the same sh.
11875 *
11876- * 3. raid_conf->failed_disks and disk->operational can be changed
11877+ * 3. conf->failed_disks and disk->operational can be changed
11878 * from an interrupt. This complicates things a bit, but it allows
11879 * us to stop issuing requests for a failed drive as soon as possible.
11880 */
11881 static void handle_stripe(struct stripe_head *sh)
11882 {
11883- struct raid5_data *raid_conf = sh->raid_conf;
11884- struct md_dev *mddev = raid_conf->mddev;
11885- int minor = (int) (mddev - md_dev);
11886+ raid5_conf_t *conf = sh->raid_conf;
11887+ mddev_t *mddev = conf->mddev;
11888 struct buffer_head *bh;
11889- int disks = raid_conf->raid_disks;
11890- int i, nr = 0, nr_read = 0, nr_write = 0;
11891+ int disks = conf->raid_disks;
11892+ int i, nr = 0, nr_read = 0, nr_write = 0, lowprio;
11893 int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0, parity = 0;
11894 int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
11895 int reading = 0, nr_writing = 0;
11896 int method1 = INT_MAX, method2 = INT_MAX;
11897 int block;
11898 unsigned long flags;
11899- int operational[MD_SB_DISKS], failed_disks = raid_conf->failed_disks;
11900+ int operational[MD_SB_DISKS], failed_disks = conf->failed_disks;
11901
11902 PRINTK(("handle_stripe(), stripe %lu\n", sh->sector));
11903- if (sh->nr_pending) {
11904+ if (md_atomic_read(&sh->nr_pending)) {
11905 printk("handle_stripe(), stripe %lu, io still pending\n", sh->sector);
11906 return;
11907 }
11908@@ -949,9 +1006,9 @@
11909 return;
11910 }
11911
11912- atomic_dec(&raid_conf->nr_handle);
11913+ atomic_dec(&conf->nr_handle);
11914
11915- if (test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
11916+ if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
11917 printk("raid5: restarting stripe %lu\n", sh->sector);
11918 sh->phase = PHASE_BEGIN;
11919 }
11920@@ -969,11 +1026,11 @@
11921 save_flags(flags);
11922 cli();
11923 for (i = 0; i < disks; i++) {
11924- operational[i] = raid_conf->disks[i].operational;
11925- if (i == sh->pd_idx && raid_conf->resync_parity)
11926+ operational[i] = conf->disks[i].operational;
11927+ if (i == sh->pd_idx && conf->resync_parity)
11928 operational[i] = 0;
11929 }
11930- failed_disks = raid_conf->failed_disks;
11931+ failed_disks = conf->failed_disks;
11932 restore_flags(flags);
11933
11934 if (failed_disks > 1) {
11935@@ -1017,7 +1074,7 @@
11936 }
11937
11938 if (nr_write && nr_read)
11939- printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
11940+		printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
11941
11942 if (nr_write) {
11943 /*
11944@@ -1030,7 +1087,7 @@
11945 if (sh->bh_new[i])
11946 continue;
11947 block = (int) compute_blocknr(sh, i);
11948- bh = find_buffer(MKDEV(MD_MAJOR, minor), block, sh->size);
11949+ bh = find_buffer(mddev_to_kdev(mddev), block, sh->size);
11950 if (bh && bh->b_count == 0 && buffer_dirty(bh) && !buffer_locked(bh)) {
11951 PRINTK(("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block));
11952 add_stripe_bh(sh, bh, i, WRITE);
11953@@ -1064,21 +1121,22 @@
11954
11955 if (!method1 || !method2) {
11956 lock_stripe(sh);
11957- sh->nr_pending++;
11958+ lowprio = is_stripe_lowprio(sh, disks);
11959+ atomic_inc(&sh->nr_pending);
11960 sh->phase = PHASE_WRITE;
11961 compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
11962 for (i = 0; i < disks; i++) {
11963- if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
11964+ if (!operational[i] && !conf->spare && !conf->resync_parity)
11965 continue;
11966 if (i == sh->pd_idx || sh->bh_new[i])
11967 nr_writing++;
11968 }
11969
11970- sh->nr_pending = nr_writing;
11971- PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, sh->nr_pending));
11972+ md_atomic_set(&sh->nr_pending, nr_writing);
11973+ PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
11974
11975 for (i = 0; i < disks; i++) {
11976- if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
11977+ if (!operational[i] && !conf->spare && !conf->resync_parity)
11978 continue;
11979 bh = sh->bh_copy[i];
11980 if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL)))
11981@@ -1089,18 +1147,30 @@
11982 bh->b_state |= (1<<BH_Dirty);
11983 PRINTK(("making request for buffer %d\n", i));
11984 clear_bit(BH_Lock, &bh->b_state);
11985- if (!operational[i] && !raid_conf->resync_parity) {
11986- bh->b_rdev = raid_conf->spare->dev;
11987- make_request(MAJOR(raid_conf->spare->dev), WRITE, bh);
11988- } else
11989- make_request(MAJOR(raid_conf->disks[i].dev), WRITE, bh);
11990+ if (!operational[i] && !conf->resync_parity) {
11991+ bh->b_rdev = conf->spare->dev;
11992+ make_request(MAJOR(conf->spare->dev), WRITE, bh);
11993+ } else {
11994+#if 0
11995+ make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
11996+#else
11997+ if (!lowprio || (i==sh->pd_idx))
11998+ make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
11999+ else {
12000+ mark_buffer_clean(bh);
12001+ raid5_end_request(bh,1);
12002+ sh->new[i] = 0;
12003+ }
12004+#endif
12005+ }
12006 }
12007 }
12008 return;
12009 }
12010
12011 lock_stripe(sh);
12012- sh->nr_pending++;
12013+ lowprio = is_stripe_lowprio(sh, disks);
12014+ atomic_inc(&sh->nr_pending);
12015 if (method1 < method2) {
12016 sh->write_method = RECONSTRUCT_WRITE;
12017 for (i = 0; i < disks; i++) {
12018@@ -1110,6 +1180,8 @@
12019 continue;
12020 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
12021 raid5_build_block(sh, sh->bh_old[i], i);
12022+ if (lowprio)
12023+ mark_buffer_lowprio(sh->bh_old[i]);
12024 reading++;
12025 }
12026 } else {
12027@@ -1121,19 +1193,21 @@
12028 continue;
12029 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
12030 raid5_build_block(sh, sh->bh_old[i], i);
12031+ if (lowprio)
12032+ mark_buffer_lowprio(sh->bh_old[i]);
12033 reading++;
12034 }
12035 }
12036 sh->phase = PHASE_READ_OLD;
12037- sh->nr_pending = reading;
12038- PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, sh->nr_pending));
12039+ md_atomic_set(&sh->nr_pending, reading);
12040+ PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)));
12041 for (i = 0; i < disks; i++) {
12042 if (!sh->bh_old[i])
12043 continue;
12044 if (buffer_uptodate(sh->bh_old[i]))
12045 continue;
12046 clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
12047- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
12048+ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
12049 }
12050 } else {
12051 /*
12052@@ -1141,7 +1215,8 @@
12053 */
12054 method1 = nr_read - nr_cache_overwrite;
12055 lock_stripe(sh);
12056- sh->nr_pending++;
12057+ lowprio = is_stripe_lowprio(sh,disks);
12058+ atomic_inc(&sh->nr_pending);
12059
12060 PRINTK(("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1));
12061 if (!method1 || (method1 == 1 && nr_cache == disks - 1)) {
12062@@ -1149,18 +1224,22 @@
12063 for (i = 0; i < disks; i++) {
12064 if (!sh->bh_new[i])
12065 continue;
12066- if (!sh->bh_old[i])
12067+ if (!sh->bh_old[i]) {
12068 compute_block(sh, i);
12069+ if (lowprio)
12070+ mark_buffer_lowprio
12071+ (sh->bh_old[i]);
12072+ }
12073 memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
12074 }
12075- sh->nr_pending--;
12076+ atomic_dec(&sh->nr_pending);
12077 complete_stripe(sh);
12078 return;
12079 }
12080 if (nr_failed_overwrite) {
12081 sh->phase = PHASE_READ_OLD;
12082- sh->nr_pending = (disks - 1) - nr_cache;
12083- PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, sh->nr_pending));
12084+ md_atomic_set(&sh->nr_pending, (disks - 1) - nr_cache);
12085+ PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
12086 for (i = 0; i < disks; i++) {
12087 if (sh->bh_old[i])
12088 continue;
12089@@ -1168,13 +1247,16 @@
12090 continue;
12091 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
12092 raid5_build_block(sh, sh->bh_old[i], i);
12093+ if (lowprio)
12094+ mark_buffer_lowprio(sh->bh_old[i]);
12095 clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
12096- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
12097+ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
12098 }
12099 } else {
12100 sh->phase = PHASE_READ;
12101- sh->nr_pending = nr_read - nr_cache_overwrite;
12102- PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, sh->nr_pending));
12103+ md_atomic_set(&sh->nr_pending,
12104+ nr_read - nr_cache_overwrite);
12105+ PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
12106 for (i = 0; i < disks; i++) {
12107 if (!sh->bh_new[i])
12108 continue;
12109@@ -1182,16 +1264,16 @@
12110 memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
12111 continue;
12112 }
12113- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_req[i]);
12114+ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_req[i]);
12115 }
12116 }
12117 }
12118 }
12119
12120-static int raid5_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
12121+static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
12122 {
12123- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
12124- const unsigned int raid_disks = raid_conf->raid_disks;
12125+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
12126+ const unsigned int raid_disks = conf->raid_disks;
12127 const unsigned int data_disks = raid_disks - 1;
12128 unsigned int dd_idx, pd_idx;
12129 unsigned long new_sector;
12130@@ -1202,15 +1284,15 @@
12131 if (rw == WRITEA) rw = WRITE;
12132
12133 new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks,
12134- &dd_idx, &pd_idx, raid_conf);
12135+ &dd_idx, &pd_idx, conf);
12136
12137 PRINTK(("raid5_make_request, sector %lu\n", new_sector));
12138 repeat:
12139- sh = get_stripe(raid_conf, new_sector, bh->b_size);
12140+ sh = get_stripe(conf, new_sector, bh->b_size);
12141 if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) {
12142 PRINTK(("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd));
12143 lock_stripe(sh);
12144- if (!sh->nr_pending)
12145+ if (!md_atomic_read(&sh->nr_pending))
12146 handle_stripe(sh);
12147 goto repeat;
12148 }
12149@@ -1221,24 +1303,24 @@
12150 printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector);
12151 printk("raid5: bh %p, bh_new %p\n", bh, sh->bh_new[dd_idx]);
12152 lock_stripe(sh);
12153- md_wakeup_thread(raid_conf->thread);
12154+ md_wakeup_thread(conf->thread);
12155 wait_on_stripe(sh);
12156 goto repeat;
12157 }
12158 add_stripe_bh(sh, bh, dd_idx, rw);
12159
12160- md_wakeup_thread(raid_conf->thread);
12161+ md_wakeup_thread(conf->thread);
12162 return 0;
12163 }
12164
12165 static void unplug_devices(struct stripe_head *sh)
12166 {
12167 #if 0
12168- struct raid5_data *raid_conf = sh->raid_conf;
12169+ raid5_conf_t *conf = sh->raid_conf;
12170 int i;
12171
12172- for (i = 0; i < raid_conf->raid_disks; i++)
12173- unplug_device(blk_dev + MAJOR(raid_conf->disks[i].dev));
12174+ for (i = 0; i < conf->raid_disks; i++)
12175+ unplug_device(blk_dev + MAJOR(conf->disks[i].dev));
12176 #endif
12177 }
12178
12179@@ -1252,8 +1334,8 @@
12180 static void raid5d (void *data)
12181 {
12182 struct stripe_head *sh;
12183- struct raid5_data *raid_conf = data;
12184- struct md_dev *mddev = raid_conf->mddev;
12185+ raid5_conf_t *conf = data;
12186+ mddev_t *mddev = conf->mddev;
12187 int i, handled = 0, unplug = 0;
12188 unsigned long flags;
12189
12190@@ -1261,47 +1343,47 @@
12191
12192 if (mddev->sb_dirty) {
12193 mddev->sb_dirty = 0;
12194- md_update_sb((int) (mddev - md_dev));
12195+ md_update_sb(mddev);
12196 }
12197 for (i = 0; i < NR_HASH; i++) {
12198 repeat:
12199- sh = raid_conf->stripe_hashtbl[i];
12200+ sh = conf->stripe_hashtbl[i];
12201 for (; sh; sh = sh->hash_next) {
12202- if (sh->raid_conf != raid_conf)
12203+ if (sh->raid_conf != conf)
12204 continue;
12205 if (sh->phase == PHASE_COMPLETE)
12206 continue;
12207- if (sh->nr_pending)
12208+ if (md_atomic_read(&sh->nr_pending))
12209 continue;
12210- if (sh->sector == raid_conf->next_sector) {
12211- raid_conf->sector_count += (sh->size >> 9);
12212- if (raid_conf->sector_count >= 128)
12213+ if (sh->sector == conf->next_sector) {
12214+ conf->sector_count += (sh->size >> 9);
12215+ if (conf->sector_count >= 128)
12216 unplug = 1;
12217 } else
12218 unplug = 1;
12219 if (unplug) {
12220- PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, raid_conf->sector_count));
12221+ PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, conf->sector_count));
12222 unplug_devices(sh);
12223 unplug = 0;
12224- raid_conf->sector_count = 0;
12225+ conf->sector_count = 0;
12226 }
12227- raid_conf->next_sector = sh->sector + (sh->size >> 9);
12228+ conf->next_sector = sh->sector + (sh->size >> 9);
12229 handled++;
12230 handle_stripe(sh);
12231 goto repeat;
12232 }
12233 }
12234- if (raid_conf) {
12235- PRINTK(("%d stripes handled, nr_handle %d\n", handled, atomic_read(&raid_conf->nr_handle)));
12236+ if (conf) {
12237+ PRINTK(("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle)));
12238 save_flags(flags);
12239 cli();
12240- if (!atomic_read(&raid_conf->nr_handle))
12241- clear_bit(THREAD_WAKEUP, &raid_conf->thread->flags);
12242+ if (!md_atomic_read(&conf->nr_handle))
12243+ clear_bit(THREAD_WAKEUP, &conf->thread->flags);
12244+ restore_flags(flags);
12245 }
12246 PRINTK(("--- raid5d inactive\n"));
12247 }
12248
12249-#if SUPPORT_RECONSTRUCTION
12250 /*
12251 * Private kernel thread for parity reconstruction after an unclean
12252 * shutdown. Reconstruction on spare drives in case of a failed drive
12253@@ -1309,44 +1391,64 @@
12254 */
12255 static void raid5syncd (void *data)
12256 {
12257- struct raid5_data *raid_conf = data;
12258- struct md_dev *mddev = raid_conf->mddev;
12259+ raid5_conf_t *conf = data;
12260+ mddev_t *mddev = conf->mddev;
12261
12262- if (!raid_conf->resync_parity)
12263+ if (!conf->resync_parity)
12264+ return;
12265+ if (conf->resync_parity == 2)
12266+ return;
12267+ down(&mddev->recovery_sem);
12268+ if (md_do_sync(mddev,NULL)) {
12269+ up(&mddev->recovery_sem);
12270+ printk("raid5: resync aborted!\n");
12271 return;
12272- md_do_sync(mddev);
12273- raid_conf->resync_parity = 0;
12274+ }
12275+ conf->resync_parity = 0;
12276+ up(&mddev->recovery_sem);
12277+ printk("raid5: resync finished.\n");
12278 }
12279-#endif /* SUPPORT_RECONSTRUCTION */
12280
12281-static int __check_consistency (struct md_dev *mddev, int row)
12282+static int __check_consistency (mddev_t *mddev, int row)
12283 {
12284- struct raid5_data *raid_conf = mddev->private;
12285+ raid5_conf_t *conf = mddev->private;
12286 kdev_t dev;
12287 struct buffer_head *bh[MD_SB_DISKS], tmp;
12288- int i, rc = 0, nr = 0;
12289+ int i, rc = 0, nr = 0, count;
12290+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
12291
12292- if (raid_conf->working_disks != raid_conf->raid_disks)
12293+ if (conf->working_disks != conf->raid_disks)
12294 return 0;
12295 tmp.b_size = 4096;
12296 if ((tmp.b_data = (char *) get_free_page(GFP_KERNEL)) == NULL)
12297 return 0;
12298+ md_clear_page((unsigned long)tmp.b_data);
12299 memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
12300- for (i = 0; i < raid_conf->raid_disks; i++) {
12301- dev = raid_conf->disks[i].dev;
12302+ for (i = 0; i < conf->raid_disks; i++) {
12303+ dev = conf->disks[i].dev;
12304 set_blocksize(dev, 4096);
12305 if ((bh[i] = bread(dev, row / 4, 4096)) == NULL)
12306 break;
12307 nr++;
12308 }
12309- if (nr == raid_conf->raid_disks) {
12310- for (i = 1; i < nr; i++)
12311- xor_block(&tmp, bh[i]);
12312+ if (nr == conf->raid_disks) {
12313+ bh_ptr[0] = &tmp;
12314+ count = 1;
12315+ for (i = 1; i < nr; i++) {
12316+ bh_ptr[count++] = bh[i];
12317+ if (count == MAX_XOR_BLOCKS) {
12318+ xor_block(count, &bh_ptr[0]);
12319+ count = 1;
12320+ }
12321+ }
12322+ if (count != 1) {
12323+ xor_block(count, &bh_ptr[0]);
12324+ }
12325 if (memcmp(tmp.b_data, bh[0]->b_data, 4096))
12326 rc = 1;
12327 }
12328- for (i = 0; i < raid_conf->raid_disks; i++) {
12329- dev = raid_conf->disks[i].dev;
12330+ for (i = 0; i < conf->raid_disks; i++) {
12331+ dev = conf->disks[i].dev;
12332 if (bh[i]) {
12333 bforget(bh[i]);
12334 bh[i] = NULL;
12335@@ -1358,285 +1460,607 @@
12336 return rc;
12337 }
12338
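The consistency check above rests on the basic RAID-4/5 invariant: the parity block is the bytewise XOR of all data blocks in a stripe, so XOR-ing every block of the stripe together must give zero. __check_consistency exploits this by XOR-ing blocks 1..n-1 into a scratch page and memcmp()-ing the result against block 0. A minimal stand-alone C sketch of the same invariant (user-space illustration only, with an arbitrary block size and disk count, not the kernel code):

    #include <stdio.h>
    #include <string.h>

    #define NDISKS 4          /* 3 data blocks + 1 parity block */
    #define BLKSZ  16         /* tiny block size, just for illustration */

    int main(void)
    {
        unsigned char blk[NDISKS][BLKSZ];
        unsigned char acc[BLKSZ];
        int d, i;

        /* fill the data blocks with arbitrary content */
        for (d = 0; d < NDISKS - 1; d++)
            for (i = 0; i < BLKSZ; i++)
                blk[d][i] = (unsigned char)(d * 37 + i);

        /* parity block = XOR of all data blocks */
        memset(blk[NDISKS - 1], 0, BLKSZ);
        for (d = 0; d < NDISKS - 1; d++)
            for (i = 0; i < BLKSZ; i++)
                blk[NDISKS - 1][i] ^= blk[d][i];

        /* the check: XOR of *all* blocks in the stripe must be zero */
        memset(acc, 0, BLKSZ);
        for (d = 0; d < NDISKS; d++)
            for (i = 0; i < BLKSZ; i++)
                acc[i] ^= blk[d][i];

        for (i = 0; i < BLKSZ; i++)
            if (acc[i]) {
                printf("stripe inconsistent\n");
                return 1;
            }
        printf("stripe consistent\n");
        return 0;
    }
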
12339-static int check_consistency (struct md_dev *mddev)
12340+static int check_consistency (mddev_t *mddev)
12341 {
12342- int size = mddev->sb->size;
12343- int row;
12344+ if (__check_consistency(mddev, 0))
12345+/*
12346+ * We are not checking this currently, as it's legitimate to have
12347+ * an inconsistent array, at creation time.
12348+ */
12349+ return 0;
12350
12351- for (row = 0; row < size; row += size / 8)
12352- if (__check_consistency(mddev, row))
12353- return 1;
12354 return 0;
12355 }
12356
12357-static int raid5_run (int minor, struct md_dev *mddev)
12358+static int raid5_run (mddev_t *mddev)
12359 {
12360- struct raid5_data *raid_conf;
12361+ raid5_conf_t *conf;
12362 int i, j, raid_disk, memory;
12363- md_superblock_t *sb = mddev->sb;
12364- md_descriptor_t *descriptor;
12365- struct real_dev *realdev;
12366+ mdp_super_t *sb = mddev->sb;
12367+ mdp_disk_t *desc;
12368+ mdk_rdev_t *rdev;
12369+ struct disk_info *disk;
12370+ struct md_list_head *tmp;
12371+ int start_recovery = 0;
12372
12373 MOD_INC_USE_COUNT;
12374
12375 if (sb->level != 5 && sb->level != 4) {
12376- printk("raid5: %s: raid level not set to 4/5 (%d)\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
12377+ printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
12378 MOD_DEC_USE_COUNT;
12379 return -EIO;
12380 }
12381
12382- mddev->private = kmalloc (sizeof (struct raid5_data), GFP_KERNEL);
12383- if ((raid_conf = mddev->private) == NULL)
12384+ mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
12385+ if ((conf = mddev->private) == NULL)
12386 goto abort;
12387- memset (raid_conf, 0, sizeof (*raid_conf));
12388- raid_conf->mddev = mddev;
12389+ memset (conf, 0, sizeof (*conf));
12390+ conf->mddev = mddev;
12391
12392- if ((raid_conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
12393+ if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
12394 goto abort;
12395- memset(raid_conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
12396+ memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
12397
12398- init_waitqueue(&raid_conf->wait_for_stripe);
12399- PRINTK(("raid5_run(%d) called.\n", minor));
12400-
12401- for (i = 0; i < mddev->nb_dev; i++) {
12402- realdev = &mddev->devices[i];
12403- if (!realdev->sb) {
12404- printk(KERN_ERR "raid5: disabled device %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
12405- continue;
12406- }
12407+ init_waitqueue(&conf->wait_for_stripe);
12408+ PRINTK(("raid5_run(md%d) called.\n", mdidx(mddev)));
12409
12410+ ITERATE_RDEV(mddev,rdev,tmp) {
12411 /*
12412 * This is important -- we are using the descriptor on
12413 * the disk only to get a pointer to the descriptor on
12414 * the main superblock, which might be more recent.
12415 */
12416- descriptor = &sb->disks[realdev->sb->descriptor.number];
12417- if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
12418- printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", kdevname(realdev->dev));
12419+ desc = sb->disks + rdev->desc_nr;
12420+ raid_disk = desc->raid_disk;
12421+ disk = conf->disks + raid_disk;
12422+
12423+ if (disk_faulty(desc)) {
12424+ printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
12425+ if (!rdev->faulty) {
12426+ MD_BUG();
12427+ goto abort;
12428+ }
12429+ disk->number = desc->number;
12430+ disk->raid_disk = raid_disk;
12431+ disk->dev = rdev->dev;
12432+
12433+ disk->operational = 0;
12434+ disk->write_only = 0;
12435+ disk->spare = 0;
12436+ disk->used_slot = 1;
12437 continue;
12438 }
12439- if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
12440- if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
12441- printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", kdevname(realdev->dev));
12442- continue;
12443+ if (disk_active(desc)) {
12444+ if (!disk_sync(desc)) {
12445+ printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
12446+ MD_BUG();
12447+ goto abort;
12448 }
12449- raid_disk = descriptor->raid_disk;
12450- if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
12451- printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", kdevname(realdev->dev));
12452+ if (raid_disk > sb->raid_disks) {
12453+ printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
12454 continue;
12455 }
12456- if (raid_conf->disks[raid_disk].operational) {
12457- printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", kdevname(realdev->dev), raid_disk);
12458+ if (disk->operational) {
12459+ printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
12460 continue;
12461 }
12462- printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", kdevname(realdev->dev), raid_disk);
12463+ printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
12464
12465- raid_conf->disks[raid_disk].number = descriptor->number;
12466- raid_conf->disks[raid_disk].raid_disk = raid_disk;
12467- raid_conf->disks[raid_disk].dev = mddev->devices[i].dev;
12468- raid_conf->disks[raid_disk].operational = 1;
12469+ disk->number = desc->number;
12470+ disk->raid_disk = raid_disk;
12471+ disk->dev = rdev->dev;
12472+ disk->operational = 1;
12473+ disk->used_slot = 1;
12474
12475- raid_conf->working_disks++;
12476+ conf->working_disks++;
12477 } else {
12478 /*
12479 * Must be a spare disk ..
12480 */
12481- printk(KERN_INFO "raid5: spare disk %s\n", kdevname(realdev->dev));
12482- raid_disk = descriptor->raid_disk;
12483- raid_conf->disks[raid_disk].number = descriptor->number;
12484- raid_conf->disks[raid_disk].raid_disk = raid_disk;
12485- raid_conf->disks[raid_disk].dev = mddev->devices [i].dev;
12486-
12487- raid_conf->disks[raid_disk].operational = 0;
12488- raid_conf->disks[raid_disk].write_only = 0;
12489- raid_conf->disks[raid_disk].spare = 1;
12490- }
12491- }
12492- raid_conf->raid_disks = sb->raid_disks;
12493- raid_conf->failed_disks = raid_conf->raid_disks - raid_conf->working_disks;
12494- raid_conf->mddev = mddev;
12495- raid_conf->chunk_size = sb->chunk_size;
12496- raid_conf->level = sb->level;
12497- raid_conf->algorithm = sb->parity_algorithm;
12498- raid_conf->max_nr_stripes = NR_STRIPES;
12499+ printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
12500+ disk->number = desc->number;
12501+ disk->raid_disk = raid_disk;
12502+ disk->dev = rdev->dev;
12503
12504- if (raid_conf->working_disks != sb->raid_disks && sb->state != (1 << MD_SB_CLEAN)) {
12505- printk(KERN_ALERT "raid5: raid set %s not clean and not all disks are operational -- run ckraid\n", kdevname(MKDEV(MD_MAJOR, minor)));
12506- goto abort;
12507+ disk->operational = 0;
12508+ disk->write_only = 0;
12509+ disk->spare = 1;
12510+ disk->used_slot = 1;
12511+ }
12512 }
12513- if (!raid_conf->chunk_size || raid_conf->chunk_size % 4) {
12514- printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", raid_conf->chunk_size, kdevname(MKDEV(MD_MAJOR, minor)));
12515+
12516+ for (i = 0; i < MD_SB_DISKS; i++) {
12517+ desc = sb->disks + i;
12518+ raid_disk = desc->raid_disk;
12519+ disk = conf->disks + raid_disk;
12520+
12521+ if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
12522+ !conf->disks[raid_disk].used_slot) {
12523+
12524+ disk->number = desc->number;
12525+ disk->raid_disk = raid_disk;
12526+ disk->dev = MKDEV(0,0);
12527+
12528+ disk->operational = 0;
12529+ disk->write_only = 0;
12530+ disk->spare = 0;
12531+ disk->used_slot = 1;
12532+ }
12533+ }
12534+
12535+ conf->raid_disks = sb->raid_disks;
12536+ /*
12537+ * 0 for a fully functional array, 1 for a degraded array.
12538+ */
12539+ conf->failed_disks = conf->raid_disks - conf->working_disks;
12540+ conf->mddev = mddev;
12541+ conf->chunk_size = sb->chunk_size;
12542+ conf->level = sb->level;
12543+ conf->algorithm = sb->layout;
12544+ conf->max_nr_stripes = NR_STRIPES;
12545+
12546+#if 0
12547+ for (i = 0; i < conf->raid_disks; i++) {
12548+ if (!conf->disks[i].used_slot) {
12549+ MD_BUG();
12550+ goto abort;
12551+ }
12552+ }
12553+#endif
12554+ if (!conf->chunk_size || conf->chunk_size % 4) {
12555+ printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
12556 goto abort;
12557 }
12558- if (raid_conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
12559- printk(KERN_ERR "raid5: unsupported parity algorithm %d for %s\n", raid_conf->algorithm, kdevname(MKDEV(MD_MAJOR, minor)));
12560+ if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
12561+ printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
12562 goto abort;
12563 }
12564- if (raid_conf->failed_disks > 1) {
12565- printk(KERN_ERR "raid5: not enough operational devices for %s (%d/%d failed)\n", kdevname(MKDEV(MD_MAJOR, minor)), raid_conf->failed_disks, raid_conf->raid_disks);
12566+ if (conf->failed_disks > 1) {
12567+ printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
12568 goto abort;
12569 }
12570
12571- if ((sb->state & (1 << MD_SB_CLEAN)) && check_consistency(mddev)) {
12572- printk(KERN_ERR "raid5: detected raid-5 xor inconsistenty -- run ckraid\n");
12573- sb->state |= 1 << MD_SB_ERRORS;
12574- goto abort;
12575+ if (conf->working_disks != sb->raid_disks) {
12576+ printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
12577+ start_recovery = 1;
12578 }
12579
12580- if ((raid_conf->thread = md_register_thread(raid5d, raid_conf)) == NULL) {
12581- printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
12582- goto abort;
12583+ if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) &&
12584+ check_consistency(mddev)) {
12585+ printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n");
12586+ sb->state &= ~(1 << MD_SB_CLEAN);
12587 }
12588
12589-#if SUPPORT_RECONSTRUCTION
12590- if ((raid_conf->resync_thread = md_register_thread(raid5syncd, raid_conf)) == NULL) {
12591- printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
12592- goto abort;
12593+ {
12594+ const char * name = "raid5d";
12595+
12596+ conf->thread = md_register_thread(raid5d, conf, name);
12597+ if (!conf->thread) {
12598+ printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
12599+ goto abort;
12600+ }
12601 }
12602-#endif /* SUPPORT_RECONSTRUCTION */
12603
12604- memory = raid_conf->max_nr_stripes * (sizeof(struct stripe_head) +
12605- raid_conf->raid_disks * (sizeof(struct buffer_head) +
12606+ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
12607+ conf->raid_disks * (sizeof(struct buffer_head) +
12608 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
12609- if (grow_stripes(raid_conf, raid_conf->max_nr_stripes, GFP_KERNEL)) {
12610+ if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
12611 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
12612- shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
12613+ shrink_stripes(conf, conf->max_nr_stripes);
12614 goto abort;
12615 } else
12616- printk(KERN_INFO "raid5: allocated %dkB for %s\n", memory, kdevname(MKDEV(MD_MAJOR, minor)));
12617+ printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
12618
12619 /*
12620 * Regenerate the "device is in sync with the raid set" bit for
12621 * each device.
12622 */
12623- for (i = 0; i < sb->nr_disks ; i++) {
12624- sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
12625+ for (i = 0; i < MD_SB_DISKS ; i++) {
12626+ mark_disk_nonsync(sb->disks + i);
12627 for (j = 0; j < sb->raid_disks; j++) {
12628- if (!raid_conf->disks[j].operational)
12629+ if (!conf->disks[j].operational)
12630 continue;
12631- if (sb->disks[i].number == raid_conf->disks[j].number)
12632- sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
12633+ if (sb->disks[i].number == conf->disks[j].number)
12634+ mark_disk_sync(sb->disks + i);
12635 }
12636 }
12637- sb->active_disks = raid_conf->working_disks;
12638+ sb->active_disks = conf->working_disks;
12639
12640 if (sb->active_disks == sb->raid_disks)
12641- printk("raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
12642+ printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
12643 else
12644- printk(KERN_ALERT "raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
12645+ printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
12646+
12647+ if (!start_recovery && ((sb->state & (1 << MD_SB_CLEAN))==0)) {
12648+ const char * name = "raid5syncd";
12649+
12650+ conf->resync_thread = md_register_thread(raid5syncd, conf,name);
12651+ if (!conf->resync_thread) {
12652+ printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
12653+ goto abort;
12654+ }
12655
12656- if ((sb->state & (1 << MD_SB_CLEAN)) == 0) {
12657- printk("raid5: raid set %s not clean; re-constructing parity\n", kdevname(MKDEV(MD_MAJOR, minor)));
12658- raid_conf->resync_parity = 1;
12659-#if SUPPORT_RECONSTRUCTION
12660- md_wakeup_thread(raid_conf->resync_thread);
12661-#endif /* SUPPORT_RECONSTRUCTION */
12662+ printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
12663+ conf->resync_parity = 1;
12664+ md_wakeup_thread(conf->resync_thread);
12665 }
12666
12667+ print_raid5_conf(conf);
12668+ if (start_recovery)
12669+ md_recover_arrays();
12670+ print_raid5_conf(conf);
12671+
12672 /* Ok, everything is just fine now */
12673 return (0);
12674 abort:
12675- if (raid_conf) {
12676- if (raid_conf->stripe_hashtbl)
12677- free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
12678- kfree(raid_conf);
12679+ if (conf) {
12680+ print_raid5_conf(conf);
12681+ if (conf->stripe_hashtbl)
12682+ free_pages((unsigned long) conf->stripe_hashtbl,
12683+ HASH_PAGES_ORDER);
12684+ kfree(conf);
12685 }
12686 mddev->private = NULL;
12687- printk(KERN_ALERT "raid5: failed to run raid set %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
12688+ printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
12689 MOD_DEC_USE_COUNT;
12690 return -EIO;
12691 }
12692
12693-static int raid5_stop (int minor, struct md_dev *mddev)
12694+static int raid5_stop_resync (mddev_t *mddev)
12695+{
12696+ raid5_conf_t *conf = mddev_to_conf(mddev);
12697+ mdk_thread_t *thread = conf->resync_thread;
12698+
12699+ if (thread) {
12700+ if (conf->resync_parity) {
12701+ conf->resync_parity = 2;
12702+ md_interrupt_thread(thread);
12703+ printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
12704+ return 1;
12705+ }
12706+ return 0;
12707+ }
12708+ return 0;
12709+}
12710+
12711+static int raid5_restart_resync (mddev_t *mddev)
12712 {
12713- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
12714+ raid5_conf_t *conf = mddev_to_conf(mddev);
12715
12716- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
12717- shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
12718- md_unregister_thread(raid_conf->thread);
12719-#if SUPPORT_RECONSTRUCTION
12720- md_unregister_thread(raid_conf->resync_thread);
12721-#endif /* SUPPORT_RECONSTRUCTION */
12722- free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
12723- kfree(raid_conf);
12724+ if (conf->resync_parity) {
12725+ if (!conf->resync_thread) {
12726+ MD_BUG();
12727+ return 0;
12728+ }
12729+ printk("raid5: waking up raid5resync.\n");
12730+ conf->resync_parity = 1;
12731+ md_wakeup_thread(conf->resync_thread);
12732+ return 1;
12733+ } else
12734+ printk("raid5: no restart-resync needed.\n");
12735+ return 0;
12736+}
12737+
12738+
12739+static int raid5_stop (mddev_t *mddev)
12740+{
12741+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
12742+
12743+ shrink_stripe_cache(conf, conf->max_nr_stripes);
12744+ shrink_stripes(conf, conf->max_nr_stripes);
12745+ md_unregister_thread(conf->thread);
12746+ if (conf->resync_thread)
12747+ md_unregister_thread(conf->resync_thread);
12748+ free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
12749+ kfree(conf);
12750 mddev->private = NULL;
12751 MOD_DEC_USE_COUNT;
12752 return 0;
12753 }
12754
12755-static int raid5_status (char *page, int minor, struct md_dev *mddev)
12756+static int raid5_status (char *page, mddev_t *mddev)
12757 {
12758- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
12759- md_superblock_t *sb = mddev->sb;
12760+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
12761+ mdp_super_t *sb = mddev->sb;
12762 int sz = 0, i;
12763
12764- sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->parity_algorithm);
12765- sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
12766- for (i = 0; i < raid_conf->raid_disks; i++)
12767- sz += sprintf (page+sz, "%s", raid_conf->disks[i].operational ? "U" : "_");
12768+ sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
12769+ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
12770+ for (i = 0; i < conf->raid_disks; i++)
12771+ sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
12772 sz += sprintf (page+sz, "]");
12773 return sz;
12774 }
12775
12776-static int raid5_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, int state)
12777+static void print_raid5_conf (raid5_conf_t *conf)
12778+{
12779+ int i;
12780+ struct disk_info *tmp;
12781+
12782+ printk("RAID5 conf printout:\n");
12783+ if (!conf) {
12784+ printk("(conf==NULL)\n");
12785+ return;
12786+ }
12787+ printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
12788+ conf->working_disks, conf->failed_disks);
12789+
12790+ for (i = 0; i < MD_SB_DISKS; i++) {
12791+ tmp = conf->disks + i;
12792+ printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
12793+ i, tmp->spare,tmp->operational,
12794+ tmp->number,tmp->raid_disk,tmp->used_slot,
12795+ partition_name(tmp->dev));
12796+ }
12797+}
12798+
12799+static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
12800 {
12801- int i = 0, failed_disk = -1;
12802- struct raid5_data *raid_conf = mddev->private;
12803- struct disk_info *disk = raid_conf->disks;
12804+ int err = 0;
12805+ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
12806+ raid5_conf_t *conf = mddev->private;
12807+ struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
12808 unsigned long flags;
12809- md_superblock_t *sb = mddev->sb;
12810- md_descriptor_t *descriptor;
12811+ mdp_super_t *sb = mddev->sb;
12812+ mdp_disk_t *failed_desc, *spare_desc, *added_desc;
12813
12814- for (i = 0; i < MD_SB_DISKS; i++, disk++) {
12815- if (disk->spare && disk->number == spare->number)
12816- goto found;
12817- }
12818- return 1;
12819-found:
12820- for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
12821- if (!disk->operational)
12822- failed_disk = i;
12823- if (failed_disk == -1)
12824- return 1;
12825 save_flags(flags);
12826 cli();
12827+
12828+ print_raid5_conf(conf);
12829+ /*
12830+ * find the disk ...
12831+ */
12832 switch (state) {
12833- case SPARE_WRITE:
12834- disk->operational = 1;
12835- disk->write_only = 1;
12836- raid_conf->spare = disk;
12837- break;
12838- case SPARE_INACTIVE:
12839- disk->operational = 0;
12840- disk->write_only = 0;
12841- raid_conf->spare = NULL;
12842- break;
12843- case SPARE_ACTIVE:
12844- disk->spare = 0;
12845- disk->write_only = 0;
12846
12847- descriptor = &sb->disks[raid_conf->disks[failed_disk].number];
12848- i = spare->raid_disk;
12849- disk->raid_disk = spare->raid_disk = descriptor->raid_disk;
12850- if (disk->raid_disk != failed_disk)
12851- printk("raid5: disk->raid_disk != failed_disk");
12852- descriptor->raid_disk = i;
12853-
12854- raid_conf->spare = NULL;
12855- raid_conf->working_disks++;
12856- raid_conf->failed_disks--;
12857- raid_conf->disks[failed_disk] = *disk;
12858- break;
12859- default:
12860- printk("raid5_mark_spare: bug: state == %d\n", state);
12861- restore_flags(flags);
12862- return 1;
12863+ case DISKOP_SPARE_ACTIVE:
12864+
12865+ /*
12866+ * Find the failed disk within the RAID5 configuration ...
12867+ * (this can only be in the first conf->raid_disks part)
12868+ */
12869+ for (i = 0; i < conf->raid_disks; i++) {
12870+ tmp = conf->disks + i;
12871+ if ((!tmp->operational && !tmp->spare) ||
12872+ !tmp->used_slot) {
12873+ failed_disk = i;
12874+ break;
12875+ }
12876+ }
12877+ /*
12878+ * When we activate a spare disk we _must_ have a disk in
12879+ * the lower (active) part of the array to replace.
12880+ */
12881+ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
12882+ MD_BUG();
12883+ err = 1;
12884+ goto abort;
12885+ }
12886+ /* fall through */
12887+
12888+ case DISKOP_SPARE_WRITE:
12889+ case DISKOP_SPARE_INACTIVE:
12890+
12891+ /*
12892+ * Find the spare disk ... (can only be in the 'high'
12893+ * area of the array)
12894+ */
12895+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
12896+ tmp = conf->disks + i;
12897+ if (tmp->spare && tmp->number == (*d)->number) {
12898+ spare_disk = i;
12899+ break;
12900+ }
12901+ }
12902+ if (spare_disk == -1) {
12903+ MD_BUG();
12904+ err = 1;
12905+ goto abort;
12906+ }
12907+ break;
12908+
12909+ case DISKOP_HOT_REMOVE_DISK:
12910+
12911+ for (i = 0; i < MD_SB_DISKS; i++) {
12912+ tmp = conf->disks + i;
12913+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
12914+ if (tmp->operational) {
12915+ err = -EBUSY;
12916+ goto abort;
12917+ }
12918+ removed_disk = i;
12919+ break;
12920+ }
12921+ }
12922+ if (removed_disk == -1) {
12923+ MD_BUG();
12924+ err = 1;
12925+ goto abort;
12926+ }
12927+ break;
12928+
12929+ case DISKOP_HOT_ADD_DISK:
12930+
12931+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
12932+ tmp = conf->disks + i;
12933+ if (!tmp->used_slot) {
12934+ added_disk = i;
12935+ break;
12936+ }
12937+ }
12938+ if (added_disk == -1) {
12939+ MD_BUG();
12940+ err = 1;
12941+ goto abort;
12942+ }
12943+ break;
12944+ }
12945+
12946+ switch (state) {
12947+ /*
12948+ * Switch the spare disk to write-only mode:
12949+ */
12950+ case DISKOP_SPARE_WRITE:
12951+ if (conf->spare) {
12952+ MD_BUG();
12953+ err = 1;
12954+ goto abort;
12955+ }
12956+ sdisk = conf->disks + spare_disk;
12957+ sdisk->operational = 1;
12958+ sdisk->write_only = 1;
12959+ conf->spare = sdisk;
12960+ break;
12961+ /*
12962+ * Deactivate a spare disk:
12963+ */
12964+ case DISKOP_SPARE_INACTIVE:
12965+ sdisk = conf->disks + spare_disk;
12966+ sdisk->operational = 0;
12967+ sdisk->write_only = 0;
12968+ /*
12969+ * Was the spare being resynced?
12970+ */
12971+ if (conf->spare == sdisk)
12972+ conf->spare = NULL;
12973+ break;
12974+ /*
12975+ * Activate (mark read-write) the (now sync) spare disk,
12976+	 * which means we switch its 'raid position' (->raid_disk)
12977+ * with the failed disk. (only the first 'conf->raid_disks'
12978+ * slots are used for 'real' disks and we must preserve this
12979+ * property)
12980+ */
12981+ case DISKOP_SPARE_ACTIVE:
12982+ if (!conf->spare) {
12983+ MD_BUG();
12984+ err = 1;
12985+ goto abort;
12986+ }
12987+ sdisk = conf->disks + spare_disk;
12988+ fdisk = conf->disks + failed_disk;
12989+
12990+ spare_desc = &sb->disks[sdisk->number];
12991+ failed_desc = &sb->disks[fdisk->number];
12992+
12993+ if (spare_desc != *d) {
12994+ MD_BUG();
12995+ err = 1;
12996+ goto abort;
12997+ }
12998+
12999+ if (spare_desc->raid_disk != sdisk->raid_disk) {
13000+ MD_BUG();
13001+ err = 1;
13002+ goto abort;
13003+ }
13004+
13005+ if (sdisk->raid_disk != spare_disk) {
13006+ MD_BUG();
13007+ err = 1;
13008+ goto abort;
13009+ }
13010+
13011+ if (failed_desc->raid_disk != fdisk->raid_disk) {
13012+ MD_BUG();
13013+ err = 1;
13014+ goto abort;
13015+ }
13016+
13017+ if (fdisk->raid_disk != failed_disk) {
13018+ MD_BUG();
13019+ err = 1;
13020+ goto abort;
13021+ }
13022+
13023+ /*
13024+ * do the switch finally
13025+ */
13026+ xchg_values(*spare_desc, *failed_desc);
13027+ xchg_values(*fdisk, *sdisk);
13028+
13029+ /*
13030+ * (careful, 'failed' and 'spare' are switched from now on)
13031+ *
13032+ * we want to preserve linear numbering and we want to
13033+ * give the proper raid_disk number to the now activated
13034+ * disk. (this means we switch back these values)
13035+ */
13036+
13037+ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
13038+ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
13039+ xchg_values(spare_desc->number, failed_desc->number);
13040+ xchg_values(sdisk->number, fdisk->number);
13041+
13042+ *d = failed_desc;
13043+
13044+ if (sdisk->dev == MKDEV(0,0))
13045+ sdisk->used_slot = 0;
13046+
13047+ /*
13048+ * this really activates the spare.
13049+ */
13050+ fdisk->spare = 0;
13051+ fdisk->write_only = 0;
13052+
13053+ /*
13054+ * if we activate a spare, we definitely replace a
13055+ * non-operational disk slot in the 'low' area of
13056+ * the disk array.
13057+ */
13058+ conf->failed_disks--;
13059+ conf->working_disks++;
13060+ conf->spare = NULL;
13061+
13062+ break;
13063+
13064+ case DISKOP_HOT_REMOVE_DISK:
13065+ rdisk = conf->disks + removed_disk;
13066+
13067+ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
13068+ MD_BUG();
13069+ err = 1;
13070+ goto abort;
13071+ }
13072+ rdisk->dev = MKDEV(0,0);
13073+ rdisk->used_slot = 0;
13074+
13075+ break;
13076+
13077+ case DISKOP_HOT_ADD_DISK:
13078+ adisk = conf->disks + added_disk;
13079+ added_desc = *d;
13080+
13081+ if (added_disk != added_desc->number) {
13082+ MD_BUG();
13083+ err = 1;
13084+ goto abort;
13085+ }
13086+
13087+ adisk->number = added_desc->number;
13088+ adisk->raid_disk = added_desc->raid_disk;
13089+ adisk->dev = MKDEV(added_desc->major,added_desc->minor);
13090+
13091+ adisk->operational = 0;
13092+ adisk->write_only = 0;
13093+ adisk->spare = 1;
13094+ adisk->used_slot = 1;
13095+
13096+
13097+ break;
13098+
13099+ default:
13100+ MD_BUG();
13101+ err = 1;
13102+ goto abort;
13103 }
13104+abort:
13105 restore_flags(flags);
13106- return 0;
13107+ print_raid5_conf(conf);
13108+ return err;
13109 }
13110
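In the DISKOP_SPARE_ACTIVE branch above, the descriptor and disk_info structures of the failed slot and the spare are swapped wholesale, and then raid_disk and number are swapped back, so the array position keeps its identity while the device-specific state moves into it. xchg_values() is defined elsewhere in this patch; assuming it is a plain value swap, the net effect is as sketched below (stand-alone C with stand-in types, using the GCC __typeof__ extension as kernel code does; these are not the kernel structures):

    #include <stdio.h>

    /* assumed semantics of xchg_values(): swap two lvalues of the same type */
    #define xchg_values(x, y) \
        do { __typeof__(x) __tmp = (x); (x) = (y); (y) = __tmp; } while (0)

    struct slot { int raid_disk; int number; int operational; };

    int main(void)
    {
        struct slot failed = { .raid_disk = 2, .number = 2, .operational = 0 };
        struct slot spare  = { .raid_disk = 5, .number = 5, .operational = 1 };

        /* step 1: swap the structures wholesale */
        xchg_values(failed, spare);
        /* step 2: swap raid_disk and number back, so the slot identity
         * (which array position this is) stays put while everything else
         * (operational state, device, ...) has moved */
        xchg_values(failed.raid_disk, spare.raid_disk);
        xchg_values(failed.number, spare.number);

        printf("slot 2: raid_disk=%d number=%d operational=%d\n",
               failed.raid_disk, failed.number, failed.operational);
        printf("slot 5: raid_disk=%d number=%d operational=%d\n",
               spare.raid_disk, spare.number, spare.operational);
        return 0;
    }

After the two extra swaps, slot 2 still reports raid_disk=2/number=2 but now carries the spare's operational state, which is exactly what activating the spare in place of the failed disk requires.
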
13111-static struct md_personality raid5_personality=
13112+static mdk_personality_t raid5_personality=
13113 {
13114 "raid5",
13115 raid5_map,
13116@@ -1648,14 +2072,19 @@
13117 NULL, /* no ioctls */
13118 0,
13119 raid5_error,
13120- /* raid5_hot_add_disk, */ NULL,
13121- /* raid1_hot_remove_drive */ NULL,
13122- raid5_mark_spare
13123+ raid5_diskop,
13124+ raid5_stop_resync,
13125+ raid5_restart_resync
13126 };
13127
13128 int raid5_init (void)
13129 {
13130- return register_md_personality (RAID5, &raid5_personality);
13131+ int err;
13132+
13133+ err = register_md_personality (RAID5, &raid5_personality);
13134+ if (err)
13135+ return err;
13136+ return 0;
13137 }
13138
13139 #ifdef MODULE
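A recurring change in the raid5.c hunks above (compute_parity, __check_consistency) is the switch from pairwise xor_block(dst, src) calls to the batched interface provided by the new drivers/block/xor.c below: xor_block(count, bh_ptr) XORs bh_ptr[1..count-1] into bh_ptr[0], and callers gather up to MAX_XOR_BLOCKS buffers before flushing. A minimal user-space sketch of that accumulate-and-flush pattern, with a stand-in buffer_head and a trivial byte-wise xor_block (illustration only, not the kernel implementation):

    #include <stdio.h>
    #include <string.h>

    #define MAX_XOR_BLOCKS 5
    #define BLKSZ 32

    /* stand-in for the kernel's struct buffer_head */
    struct buffer_head { unsigned char b_data[BLKSZ]; };

    /* xor bh_ptr[1..count-1] into bh_ptr[0] -- the interface xor.c provides,
     * minus the CPU-specific fast paths */
    static void xor_block(unsigned int count, struct buffer_head **bh_ptr)
    {
        unsigned int i, j;
        for (i = 1; i < count; i++)
            for (j = 0; j < BLKSZ; j++)
                bh_ptr[0]->b_data[j] ^= bh_ptr[i]->b_data[j];
    }

    int main(void)
    {
        struct buffer_head disks[8], parity;
        struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
        unsigned int i, count;

        for (i = 0; i < 8; i++)
            memset(disks[i].b_data, i + 1, BLKSZ);
        memset(parity.b_data, 0, BLKSZ);

        /* accumulate sources, flushing whenever the pointer array is full;
         * slot 0 always holds the destination */
        bh_ptr[0] = &parity;
        count = 1;
        for (i = 0; i < 8; i++) {
            bh_ptr[count++] = &disks[i];
            if (count == MAX_XOR_BLOCKS) {
                xor_block(count, bh_ptr);
                count = 1;
            }
        }
        if (count != 1)
            xor_block(count, bh_ptr);

        printf("parity[0] = %#x\n", parity.b_data[0]);  /* 1^2^...^8 = 8 */
        return 0;
    }
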
13140--- linux/drivers/block/translucent.c.orig Fri Nov 23 11:18:20 2001
13141+++ linux/drivers/block/translucent.c Fri Nov 23 11:18:20 2001
13142@@ -0,0 +1,136 @@
13143+/*
13144+ translucent.c : Translucent RAID driver for Linux
13145+ Copyright (C) 1998 Ingo Molnar
13146+
13147+ Translucent mode management functions.
13148+
13149+ This program is free software; you can redistribute it and/or modify
13150+ it under the terms of the GNU General Public License as published by
13151+ the Free Software Foundation; either version 2, or (at your option)
13152+ any later version.
13153+
13154+ You should have received a copy of the GNU General Public License
13155+ (for example /usr/src/linux/COPYING); if not, write to the Free
13156+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13157+*/
13158+
13159+#include <linux/module.h>
13160+
13161+#include <linux/raid/md.h>
13162+#include <linux/malloc.h>
13163+
13164+#include <linux/raid/translucent.h>
13165+
13166+#define MAJOR_NR MD_MAJOR
13167+#define MD_DRIVER
13168+#define MD_PERSONALITY
13169+
13170+static int translucent_run (mddev_t *mddev)
13171+{
13172+ translucent_conf_t *conf;
13173+ mdk_rdev_t *rdev;
13174+ int i;
13175+
13176+ MOD_INC_USE_COUNT;
13177+
13178+ conf = kmalloc (sizeof (*conf), GFP_KERNEL);
13179+ if (!conf)
13180+ goto out;
13181+ mddev->private = conf;
13182+
13183+ if (mddev->nb_dev != 2) {
13184+ printk("translucent: this mode needs 2 disks, aborting!\n");
13185+ goto out;
13186+ }
13187+
13188+ if (md_check_ordering(mddev)) {
13189+ printk("translucent: disks are not ordered, aborting!\n");
13190+ goto out;
13191+ }
13192+
13193+ ITERATE_RDEV_ORDERED(mddev,rdev,i) {
13194+ dev_info_t *disk = conf->disks + i;
13195+
13196+ disk->dev = rdev->dev;
13197+ disk->size = rdev->size;
13198+ }
13199+
13200+ return 0;
13201+
13202+out:
13203+ if (conf)
13204+ kfree(conf);
13205+
13206+ MOD_DEC_USE_COUNT;
13207+ return 1;
13208+}
13209+
13210+static int translucent_stop (mddev_t *mddev)
13211+{
13212+ translucent_conf_t *conf = mddev_to_conf(mddev);
13213+
13214+ kfree(conf);
13215+
13216+ MOD_DEC_USE_COUNT;
13217+
13218+ return 0;
13219+}
13220+
13221+
13222+static int translucent_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
13223+ unsigned long *rsector, unsigned long size)
13224+{
13225+ translucent_conf_t *conf = mddev_to_conf(mddev);
13226+
13227+ *rdev = conf->disks[0].dev;
13228+
13229+ return 0;
13230+}
13231+
13232+static int translucent_status (char *page, mddev_t *mddev)
13233+{
13234+ int sz = 0;
13235+
13236+ sz += sprintf(page+sz, " %d%% full", 10);
13237+ return sz;
13238+}
13239+
13240+
13241+static mdk_personality_t translucent_personality=
13242+{
13243+ "translucent",
13244+ translucent_map,
13245+ NULL,
13246+ NULL,
13247+ translucent_run,
13248+ translucent_stop,
13249+ translucent_status,
13250+ NULL,
13251+ 0,
13252+ NULL,
13253+ NULL,
13254+ NULL,
13255+ NULL
13256+};
13257+
13258+#ifndef MODULE
13259+
13260+md__initfunc(void translucent_init (void))
13261+{
13262+ register_md_personality (TRANSLUCENT, &translucent_personality);
13263+}
13264+
13265+#else
13266+
13267+int init_module (void)
13268+{
13269+ return (register_md_personality (TRANSLUCENT, &translucent_personality));
13270+}
13271+
13272+void cleanup_module (void)
13273+{
13274+ unregister_md_personality (TRANSLUCENT);
13275+}
13276+
13277+#endif
13278+
13279--- linux/drivers/block/xor.c.orig Fri Nov 23 11:18:20 2001
13280+++ linux/drivers/block/xor.c Fri Nov 23 11:18:20 2001
13281@@ -0,0 +1,1894 @@
13282+/*
13283+ * xor.c : Multiple Devices driver for Linux
13284+ *
13285+ * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek
13286+ *
13287+ *
13288+ * optimized RAID-5 checksumming functions.
13289+ *
13290+ * This program is free software; you can redistribute it and/or modify
13291+ * it under the terms of the GNU General Public License as published by
13292+ * the Free Software Foundation; either version 2, or (at your option)
13293+ * any later version.
13294+ *
13295+ * You should have received a copy of the GNU General Public License
13296+ * (for example /usr/src/linux/COPYING); if not, write to the Free
13297+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13298+ */
13299+#include <linux/module.h>
13300+#include <linux/raid/md.h>
13301+#ifdef __sparc_v9__
13302+#include <asm/head.h>
13303+#include <asm/asi.h>
13304+#include <asm/visasm.h>
13305+#endif
13306+
13307+/*
13308+ * we use the 'XOR function template' to register multiple xor
13309+ * functions at runtime. The kernel measures their speed at bootup
13310+ * and decides which one to use. (compile-time registration is
13311+ * not enough, as certain CPU features like MMX can only be detected
13312+ * at runtime)
13313+ *
13314+ * this architecture makes it pretty easy to add new routines
13315+ * that are faster on certain CPUs, without killing other CPUs'
13316+ * 'native' routines. Although the current routines are believed
13317+ * to be the physically fastest ones on all CPUs tested, feel
13318+ * free to prove me wrong and add yet another routine =B-)
13319+ * --mingo
13320+ */
13321+
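The comment above describes the selection scheme: every candidate routine is wrapped in an xor_block_template (declared just below), the templates are chained on the xor_functions list, and boot-time calibration (later in this file, outside this excerpt) benchmarks them and points the global xor_block pointer at the fastest one. A rough user-space sketch of that pick-the-fastest idea, with a toy clock()-based benchmark standing in for the kernel's calibration (illustration only):

    #include <stdio.h>
    #include <time.h>

    typedef void (*xor_fn_t)(unsigned char *dst, const unsigned char *src,
                             unsigned int len);

    struct xor_template {
        const char *name;
        xor_fn_t fn;
        double speed;               /* iterations completed in the test window */
        struct xor_template *next;
    };

    static void xor_byte(unsigned char *dst, const unsigned char *src,
                         unsigned int len)
    {
        while (len--) *dst++ ^= *src++;
    }

    static void xor_word(unsigned char *dst, const unsigned char *src,
                         unsigned int len)
    {
        unsigned long *d = (unsigned long *)dst;
        const unsigned long *s = (const unsigned long *)src;
        for (len /= sizeof(long); len--; )
            *d++ ^= *s++;
    }

    static struct xor_template t_word = { "word-at-a-time", xor_word, 0, NULL };
    static struct xor_template t_byte = { "byte-at-a-time", xor_byte, 0, &t_word };
    static struct xor_template *xor_functions = &t_byte;   /* head of the chain */

    int main(void)
    {
        static unsigned long bufa[512], bufb[512];   /* aligned, zero-filled */
        struct xor_template *t, *best = NULL;

        /* time each registered routine for ~0.1s and remember the fastest */
        for (t = xor_functions; t; t = t->next) {
            clock_t end = clock() + CLOCKS_PER_SEC / 10;
            while (clock() < end) {
                t->fn((unsigned char *)bufa, (const unsigned char *)bufb,
                      sizeof(bufa));
                t->speed += 1.0;
            }
            if (!best || t->speed > best->speed)
                best = t;
            printf("%-16s %.0f iterations\n", t->name, t->speed);
        }
        printf("using %s\n", best->name);
        return 0;
    }
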
13322+#define MAX_XOR_BLOCKS 5
13323+
13324+#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr)
13325+
13326+typedef void (*xor_block_t) XOR_ARGS;
13327+xor_block_t xor_block = NULL;
13328+
13329+#ifndef __sparc_v9__
13330+
13331+struct xor_block_template;
13332+
13333+struct xor_block_template {
13334+ char * name;
13335+ xor_block_t xor_block;
13336+ int speed;
13337+ struct xor_block_template * next;
13338+};
13339+
13340+struct xor_block_template * xor_functions = NULL;
13341+
13342+#define XORBLOCK_TEMPLATE(x) \
13343+static void xor_block_##x XOR_ARGS; \
13344+static struct xor_block_template t_xor_block_##x = \
13345+ { #x, xor_block_##x, 0, NULL }; \
13346+static void xor_block_##x XOR_ARGS
13347+
13348+#ifdef __i386__
13349+
13350+#ifdef CONFIG_X86_XMM
13351+/*
13352+ * Cache avoiding checksumming functions utilizing KNI instructions
13353+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
13354+ */
13355+
13356+XORBLOCK_TEMPLATE(pIII_kni)
13357+{
13358+ char xmm_save[16*4];
13359+ int cr0;
13360+ int lines = (bh_ptr[0]->b_size>>8);
13361+
13362+ __asm__ __volatile__ (
13363+ "movl %%cr0,%0 ;\n\t"
13364+ "clts ;\n\t"
13365+ "movups %%xmm0,(%1) ;\n\t"
13366+ "movups %%xmm1,0x10(%1) ;\n\t"
13367+ "movups %%xmm2,0x20(%1) ;\n\t"
13368+ "movups %%xmm3,0x30(%1) ;\n\t"
13369+ : "=r" (cr0)
13370+ : "r" (xmm_save)
13371+ : "memory" );
13372+
13373+#define OFFS(x) "8*("#x"*2)"
13374+#define PF0(x) \
13375+ " prefetcht0 "OFFS(x)"(%1) ;\n"
13376+#define LD(x,y) \
13377+ " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
13378+#define ST(x,y) \
13379+ " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
13380+#define PF1(x) \
13381+ " prefetchnta "OFFS(x)"(%2) ;\n"
13382+#define PF2(x) \
13383+ " prefetchnta "OFFS(x)"(%3) ;\n"
13384+#define PF3(x) \
13385+ " prefetchnta "OFFS(x)"(%4) ;\n"
13386+#define PF4(x) \
13387+ " prefetchnta "OFFS(x)"(%5) ;\n"
13388+#define PF5(x) \
13389+ " prefetchnta "OFFS(x)"(%6) ;\n"
13390+#define XO1(x,y) \
13391+ " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
13392+#define XO2(x,y) \
13393+ " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
13394+#define XO3(x,y) \
13395+ " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
13396+#define XO4(x,y) \
13397+ " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
13398+#define XO5(x,y) \
13399+ " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
13400+
13401+ switch(count) {
13402+ case 2:
13403+ __asm__ __volatile__ (
13404+#undef BLOCK
13405+#define BLOCK(i) \
13406+ LD(i,0) \
13407+ LD(i+1,1) \
13408+ PF1(i) \
13409+ PF1(i+2) \
13410+ LD(i+2,2) \
13411+ LD(i+3,3) \
13412+ PF0(i+4) \
13413+ PF0(i+6) \
13414+ XO1(i,0) \
13415+ XO1(i+1,1) \
13416+ XO1(i+2,2) \
13417+ XO1(i+3,3) \
13418+ ST(i,0) \
13419+ ST(i+1,1) \
13420+ ST(i+2,2) \
13421+ ST(i+3,3) \
13422+
13423+
13424+ PF0(0)
13425+ PF0(2)
13426+
13427+ " .align 32,0x90 ;\n"
13428+ " 1: ;\n"
13429+
13430+ BLOCK(0)
13431+ BLOCK(4)
13432+ BLOCK(8)
13433+ BLOCK(12)
13434+
13435+ " addl $256, %1 ;\n"
13436+ " addl $256, %2 ;\n"
13437+ " decl %0 ;\n"
13438+ " jnz 1b ;\n"
13439+
13440+ :
13441+ : "r" (lines),
13442+ "r" (bh_ptr[0]->b_data),
13443+ "r" (bh_ptr[1]->b_data)
13444+ : "memory" );
13445+ break;
13446+ case 3:
13447+ __asm__ __volatile__ (
13448+#undef BLOCK
13449+#define BLOCK(i) \
13450+ PF1(i) \
13451+ PF1(i+2) \
13452+ LD(i,0) \
13453+ LD(i+1,1) \
13454+ LD(i+2,2) \
13455+ LD(i+3,3) \
13456+ PF2(i) \
13457+ PF2(i+2) \
13458+ PF0(i+4) \
13459+ PF0(i+6) \
13460+ XO1(i,0) \
13461+ XO1(i+1,1) \
13462+ XO1(i+2,2) \
13463+ XO1(i+3,3) \
13464+ XO2(i,0) \
13465+ XO2(i+1,1) \
13466+ XO2(i+2,2) \
13467+ XO2(i+3,3) \
13468+ ST(i,0) \
13469+ ST(i+1,1) \
13470+ ST(i+2,2) \
13471+ ST(i+3,3) \
13472+
13473+
13474+ PF0(0)
13475+ PF0(2)
13476+
13477+ " .align 32,0x90 ;\n"
13478+ " 1: ;\n"
13479+
13480+ BLOCK(0)
13481+ BLOCK(4)
13482+ BLOCK(8)
13483+ BLOCK(12)
13484+
13485+ " addl $256, %1 ;\n"
13486+ " addl $256, %2 ;\n"
13487+ " addl $256, %3 ;\n"
13488+ " decl %0 ;\n"
13489+ " jnz 1b ;\n"
13490+ :
13491+ : "r" (lines),
13492+ "r" (bh_ptr[0]->b_data),
13493+ "r" (bh_ptr[1]->b_data),
13494+ "r" (bh_ptr[2]->b_data)
13495+ : "memory" );
13496+ break;
13497+ case 4:
13498+ __asm__ __volatile__ (
13499+#undef BLOCK
13500+#define BLOCK(i) \
13501+ PF1(i) \
13502+ PF1(i+2) \
13503+ LD(i,0) \
13504+ LD(i+1,1) \
13505+ LD(i+2,2) \
13506+ LD(i+3,3) \
13507+ PF2(i) \
13508+ PF2(i+2) \
13509+ XO1(i,0) \
13510+ XO1(i+1,1) \
13511+ XO1(i+2,2) \
13512+ XO1(i+3,3) \
13513+ PF3(i) \
13514+ PF3(i+2) \
13515+ PF0(i+4) \
13516+ PF0(i+6) \
13517+ XO2(i,0) \
13518+ XO2(i+1,1) \
13519+ XO2(i+2,2) \
13520+ XO2(i+3,3) \
13521+ XO3(i,0) \
13522+ XO3(i+1,1) \
13523+ XO3(i+2,2) \
13524+ XO3(i+3,3) \
13525+ ST(i,0) \
13526+ ST(i+1,1) \
13527+ ST(i+2,2) \
13528+ ST(i+3,3) \
13529+
13530+
13531+ PF0(0)
13532+ PF0(2)
13533+
13534+ " .align 32,0x90 ;\n"
13535+ " 1: ;\n"
13536+
13537+ BLOCK(0)
13538+ BLOCK(4)
13539+ BLOCK(8)
13540+ BLOCK(12)
13541+
13542+ " addl $256, %1 ;\n"
13543+ " addl $256, %2 ;\n"
13544+ " addl $256, %3 ;\n"
13545+ " addl $256, %4 ;\n"
13546+ " decl %0 ;\n"
13547+ " jnz 1b ;\n"
13548+
13549+ :
13550+ : "r" (lines),
13551+ "r" (bh_ptr[0]->b_data),
13552+ "r" (bh_ptr[1]->b_data),
13553+ "r" (bh_ptr[2]->b_data),
13554+ "r" (bh_ptr[3]->b_data)
13555+ : "memory" );
13556+ break;
13557+ case 5:
13558+ __asm__ __volatile__ (
13559+#undef BLOCK
13560+#define BLOCK(i) \
13561+ PF1(i) \
13562+ PF1(i+2) \
13563+ LD(i,0) \
13564+ LD(i+1,1) \
13565+ LD(i+2,2) \
13566+ LD(i+3,3) \
13567+ PF2(i) \
13568+ PF2(i+2) \
13569+ XO1(i,0) \
13570+ XO1(i+1,1) \
13571+ XO1(i+2,2) \
13572+ XO1(i+3,3) \
13573+ PF3(i) \
13574+ PF3(i+2) \
13575+ XO2(i,0) \
13576+ XO2(i+1,1) \
13577+ XO2(i+2,2) \
13578+ XO2(i+3,3) \
13579+ PF4(i) \
13580+ PF4(i+2) \
13581+ PF0(i+4) \
13582+ PF0(i+6) \
13583+ XO3(i,0) \
13584+ XO3(i+1,1) \
13585+ XO3(i+2,2) \
13586+ XO3(i+3,3) \
13587+ XO4(i,0) \
13588+ XO4(i+1,1) \
13589+ XO4(i+2,2) \
13590+ XO4(i+3,3) \
13591+ ST(i,0) \
13592+ ST(i+1,1) \
13593+ ST(i+2,2) \
13594+ ST(i+3,3) \
13595+
13596+
13597+ PF0(0)
13598+ PF0(2)
13599+
13600+ " .align 32,0x90 ;\n"
13601+ " 1: ;\n"
13602+
13603+ BLOCK(0)
13604+ BLOCK(4)
13605+ BLOCK(8)
13606+ BLOCK(12)
13607+
13608+ " addl $256, %1 ;\n"
13609+ " addl $256, %2 ;\n"
13610+ " addl $256, %3 ;\n"
13611+ " addl $256, %4 ;\n"
13612+ " addl $256, %5 ;\n"
13613+ " decl %0 ;\n"
13614+ " jnz 1b ;\n"
13615+
13616+ :
13617+ : "r" (lines),
13618+ "r" (bh_ptr[0]->b_data),
13619+ "r" (bh_ptr[1]->b_data),
13620+ "r" (bh_ptr[2]->b_data),
13621+ "r" (bh_ptr[3]->b_data),
13622+ "r" (bh_ptr[4]->b_data)
13623+ : "memory");
13624+ break;
13625+ }
13626+
13627+ __asm__ __volatile__ (
13628+ "sfence ;\n\t"
13629+ "movups (%1),%%xmm0 ;\n\t"
13630+ "movups 0x10(%1),%%xmm1 ;\n\t"
13631+ "movups 0x20(%1),%%xmm2 ;\n\t"
13632+ "movups 0x30(%1),%%xmm3 ;\n\t"
13633+ "movl %0,%%cr0 ;\n\t"
13634+ :
13635+ : "r" (cr0), "r" (xmm_save)
13636+ : "memory" );
13637+}
13638+
13639+#undef OFFS
13640+#undef LD
13641+#undef ST
13642+#undef PF0
13643+#undef PF1
13644+#undef PF2
13645+#undef PF3
13646+#undef PF4
13647+#undef PF5
13648+#undef XO1
13649+#undef XO2
13650+#undef XO3
13651+#undef XO4
13652+#undef XO5
13653+#undef BLOCK
13654+
13655+#endif /* CONFIG_X86_XMM */
13656+
13657+/*
13658+ * high-speed RAID5 checksumming functions utilizing MMX instructions
13659+ * Copyright (C) 1998 Ingo Molnar
13660+ */
13661+XORBLOCK_TEMPLATE(pII_mmx)
13662+{
13663+ char fpu_save[108];
13664+ int lines = (bh_ptr[0]->b_size>>7);
13665+
13666+ if (!(current->flags & PF_USEDFPU))
13667+ __asm__ __volatile__ ( " clts;\n");
13668+
13669+ __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
13670+
13671+#define LD(x,y) \
13672+ " movq 8*("#x")(%1), %%mm"#y" ;\n"
13673+#define ST(x,y) \
13674+ " movq %%mm"#y", 8*("#x")(%1) ;\n"
13675+#define XO1(x,y) \
13676+ " pxor 8*("#x")(%2), %%mm"#y" ;\n"
13677+#define XO2(x,y) \
13678+ " pxor 8*("#x")(%3), %%mm"#y" ;\n"
13679+#define XO3(x,y) \
13680+ " pxor 8*("#x")(%4), %%mm"#y" ;\n"
13681+#define XO4(x,y) \
13682+ " pxor 8*("#x")(%5), %%mm"#y" ;\n"
13683+
13684+ switch(count) {
13685+ case 2:
13686+ __asm__ __volatile__ (
13687+#undef BLOCK
13688+#define BLOCK(i) \
13689+ LD(i,0) \
13690+ LD(i+1,1) \
13691+ LD(i+2,2) \
13692+ LD(i+3,3) \
13693+ XO1(i,0) \
13694+ ST(i,0) \
13695+ XO1(i+1,1) \
13696+ ST(i+1,1) \
13697+ XO1(i+2,2) \
13698+ ST(i+2,2) \
13699+ XO1(i+3,3) \
13700+ ST(i+3,3)
13701+
13702+ " .align 32,0x90 ;\n"
13703+ " 1: ;\n"
13704+
13705+ BLOCK(0)
13706+ BLOCK(4)
13707+ BLOCK(8)
13708+ BLOCK(12)
13709+
13710+ " addl $128, %1 ;\n"
13711+ " addl $128, %2 ;\n"
13712+ " decl %0 ;\n"
13713+ " jnz 1b ;\n"
13714+ :
13715+ : "r" (lines),
13716+ "r" (bh_ptr[0]->b_data),
13717+ "r" (bh_ptr[1]->b_data)
13718+ : "memory");
13719+ break;
13720+ case 3:
13721+ __asm__ __volatile__ (
13722+#undef BLOCK
13723+#define BLOCK(i) \
13724+ LD(i,0) \
13725+ LD(i+1,1) \
13726+ LD(i+2,2) \
13727+ LD(i+3,3) \
13728+ XO1(i,0) \
13729+ XO1(i+1,1) \
13730+ XO1(i+2,2) \
13731+ XO1(i+3,3) \
13732+ XO2(i,0) \
13733+ ST(i,0) \
13734+ XO2(i+1,1) \
13735+ ST(i+1,1) \
13736+ XO2(i+2,2) \
13737+ ST(i+2,2) \
13738+ XO2(i+3,3) \
13739+ ST(i+3,3)
13740+
13741+ " .align 32,0x90 ;\n"
13742+ " 1: ;\n"
13743+
13744+ BLOCK(0)
13745+ BLOCK(4)
13746+ BLOCK(8)
13747+ BLOCK(12)
13748+
13749+ " addl $128, %1 ;\n"
13750+ " addl $128, %2 ;\n"
13751+ " addl $128, %3 ;\n"
13752+ " decl %0 ;\n"
13753+ " jnz 1b ;\n"
13754+ :
13755+ : "r" (lines),
13756+ "r" (bh_ptr[0]->b_data),
13757+ "r" (bh_ptr[1]->b_data),
13758+ "r" (bh_ptr[2]->b_data)
13759+ : "memory");
13760+ break;
13761+ case 4:
13762+ __asm__ __volatile__ (
13763+#undef BLOCK
13764+#define BLOCK(i) \
13765+ LD(i,0) \
13766+ LD(i+1,1) \
13767+ LD(i+2,2) \
13768+ LD(i+3,3) \
13769+ XO1(i,0) \
13770+ XO1(i+1,1) \
13771+ XO1(i+2,2) \
13772+ XO1(i+3,3) \
13773+ XO2(i,0) \
13774+ XO2(i+1,1) \
13775+ XO2(i+2,2) \
13776+ XO2(i+3,3) \
13777+ XO3(i,0) \
13778+ ST(i,0) \
13779+ XO3(i+1,1) \
13780+ ST(i+1,1) \
13781+ XO3(i+2,2) \
13782+ ST(i+2,2) \
13783+ XO3(i+3,3) \
13784+ ST(i+3,3)
13785+
13786+ " .align 32,0x90 ;\n"
13787+ " 1: ;\n"
13788+
13789+ BLOCK(0)
13790+ BLOCK(4)
13791+ BLOCK(8)
13792+ BLOCK(12)
13793+
13794+ " addl $128, %1 ;\n"
13795+ " addl $128, %2 ;\n"
13796+ " addl $128, %3 ;\n"
13797+ " addl $128, %4 ;\n"
13798+ " decl %0 ;\n"
13799+ " jnz 1b ;\n"
13800+ :
13801+ : "r" (lines),
13802+ "r" (bh_ptr[0]->b_data),
13803+ "r" (bh_ptr[1]->b_data),
13804+ "r" (bh_ptr[2]->b_data),
13805+ "r" (bh_ptr[3]->b_data)
13806+ : "memory");
13807+ break;
13808+ case 5:
13809+ __asm__ __volatile__ (
13810+#undef BLOCK
13811+#define BLOCK(i) \
13812+ LD(i,0) \
13813+ LD(i+1,1) \
13814+ LD(i+2,2) \
13815+ LD(i+3,3) \
13816+ XO1(i,0) \
13817+ XO1(i+1,1) \
13818+ XO1(i+2,2) \
13819+ XO1(i+3,3) \
13820+ XO2(i,0) \
13821+ XO2(i+1,1) \
13822+ XO2(i+2,2) \
13823+ XO2(i+3,3) \
13824+ XO3(i,0) \
13825+ XO3(i+1,1) \
13826+ XO3(i+2,2) \
13827+ XO3(i+3,3) \
13828+ XO4(i,0) \
13829+ ST(i,0) \
13830+ XO4(i+1,1) \
13831+ ST(i+1,1) \
13832+ XO4(i+2,2) \
13833+ ST(i+2,2) \
13834+ XO4(i+3,3) \
13835+ ST(i+3,3)
13836+
13837+ " .align 32,0x90 ;\n"
13838+ " 1: ;\n"
13839+
13840+ BLOCK(0)
13841+ BLOCK(4)
13842+ BLOCK(8)
13843+ BLOCK(12)
13844+
13845+ " addl $128, %1 ;\n"
13846+ " addl $128, %2 ;\n"
13847+ " addl $128, %3 ;\n"
13848+ " addl $128, %4 ;\n"
13849+ " addl $128, %5 ;\n"
13850+ " decl %0 ;\n"
13851+ " jnz 1b ;\n"
13852+ :
13853+ : "r" (lines),
13854+ "r" (bh_ptr[0]->b_data),
13855+ "r" (bh_ptr[1]->b_data),
13856+ "r" (bh_ptr[2]->b_data),
13857+ "r" (bh_ptr[3]->b_data),
13858+ "r" (bh_ptr[4]->b_data)
13859+ : "memory");
13860+ break;
13861+ }
13862+
13863+ __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
13864+
13865+ if (!(current->flags & PF_USEDFPU))
13866+ stts();
13867+}
13868+
13869+#undef LD
13870+#undef XO1
13871+#undef XO2
13872+#undef XO3
13873+#undef XO4
13874+#undef ST
13875+#undef BLOCK
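Both MMX routines here (pII_mmx above and p5_mmx below) bracket their XOR loops with the same FPU ownership dance: clear CR0.TS with clts so MMX instructions do not fault, save the current FPU/MMX state with fsave, run the loop, then frstor the saved state and re-set TS with stts unless the current task already owned the FPU (PF_USEDFPU). A minimal sketch of that bracket, with hypothetical helper names, purely for illustration:

/* Sketch only: kernel_mmx_begin()/kernel_mmx_end() are hypothetical
 * names; the bodies mirror what pII_mmx/p5_mmx do inline. */
static inline void kernel_mmx_begin(char *fpu_save)
{
	if (!(current->flags & PF_USEDFPU))
		__asm__ __volatile__("clts");
	/* save the interrupted FPU/MMX state into the 108-byte buffer */
	__asm__ __volatile__("fsave %0; fwait" : "=m" (fpu_save[0]));
}

static inline void kernel_mmx_end(char *fpu_save)
{
	/* restore the saved state and give TS back if we took it */
	__asm__ __volatile__("frstor %0" : : "m" (fpu_save[0]));
	if (!(current->flags & PF_USEDFPU))
		stts();
}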
13876+
13877+XORBLOCK_TEMPLATE(p5_mmx)
13878+{
13879+ char fpu_save[108];
13880+ int lines = (bh_ptr[0]->b_size>>6);
13881+
13882+ if (!(current->flags & PF_USEDFPU))
13883+ __asm__ __volatile__ ( " clts;\n");
13884+
13885+ __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
13886+
13887+ switch(count) {
13888+ case 2:
13889+ __asm__ __volatile__ (
13890+
13891+ " .align 32,0x90 ;\n"
13892+ " 1: ;\n"
13893+ " movq (%1), %%mm0 ;\n"
13894+ " movq 8(%1), %%mm1 ;\n"
13895+ " pxor (%2), %%mm0 ;\n"
13896+ " movq 16(%1), %%mm2 ;\n"
13897+ " movq %%mm0, (%1) ;\n"
13898+ " pxor 8(%2), %%mm1 ;\n"
13899+ " movq 24(%1), %%mm3 ;\n"
13900+ " movq %%mm1, 8(%1) ;\n"
13901+ " pxor 16(%2), %%mm2 ;\n"
13902+ " movq 32(%1), %%mm4 ;\n"
13903+ " movq %%mm2, 16(%1) ;\n"
13904+ " pxor 24(%2), %%mm3 ;\n"
13905+ " movq 40(%1), %%mm5 ;\n"
13906+ " movq %%mm3, 24(%1) ;\n"
13907+ " pxor 32(%2), %%mm4 ;\n"
13908+ " movq 48(%1), %%mm6 ;\n"
13909+ " movq %%mm4, 32(%1) ;\n"
13910+ " pxor 40(%2), %%mm5 ;\n"
13911+ " movq 56(%1), %%mm7 ;\n"
13912+ " movq %%mm5, 40(%1) ;\n"
13913+ " pxor 48(%2), %%mm6 ;\n"
13914+ " pxor 56(%2), %%mm7 ;\n"
13915+ " movq %%mm6, 48(%1) ;\n"
13916+ " movq %%mm7, 56(%1) ;\n"
13917+
13918+ " addl $64, %1 ;\n"
13919+ " addl $64, %2 ;\n"
13920+ " decl %0 ;\n"
13921+ " jnz 1b ;\n"
13922+
13923+ :
13924+ : "r" (lines),
13925+ "r" (bh_ptr[0]->b_data),
13926+ "r" (bh_ptr[1]->b_data)
13927+ : "memory" );
13928+ break;
13929+ case 3:
13930+ __asm__ __volatile__ (
13931+
13932+ " .align 32,0x90 ;\n"
13933+ " 1: ;\n"
13934+ " movq (%1), %%mm0 ;\n"
13935+ " movq 8(%1), %%mm1 ;\n"
13936+ " pxor (%2), %%mm0 ;\n"
13937+ " movq 16(%1), %%mm2 ;\n"
13938+ " pxor 8(%2), %%mm1 ;\n"
13939+ " pxor (%3), %%mm0 ;\n"
13940+ " pxor 16(%2), %%mm2 ;\n"
13941+ " movq %%mm0, (%1) ;\n"
13942+ " pxor 8(%3), %%mm1 ;\n"
13943+ " pxor 16(%3), %%mm2 ;\n"
13944+ " movq 24(%1), %%mm3 ;\n"
13945+ " movq %%mm1, 8(%1) ;\n"
13946+ " movq 32(%1), %%mm4 ;\n"
13947+ " movq 40(%1), %%mm5 ;\n"
13948+ " pxor 24(%2), %%mm3 ;\n"
13949+ " movq %%mm2, 16(%1) ;\n"
13950+ " pxor 32(%2), %%mm4 ;\n"
13951+ " pxor 24(%3), %%mm3 ;\n"
13952+ " pxor 40(%2), %%mm5 ;\n"
13953+ " movq %%mm3, 24(%1) ;\n"
13954+ " pxor 32(%3), %%mm4 ;\n"
13955+ " pxor 40(%3), %%mm5 ;\n"
13956+ " movq 48(%1), %%mm6 ;\n"
13957+ " movq %%mm4, 32(%1) ;\n"
13958+ " movq 56(%1), %%mm7 ;\n"
13959+ " pxor 48(%2), %%mm6 ;\n"
13960+ " movq %%mm5, 40(%1) ;\n"
13961+ " pxor 56(%2), %%mm7 ;\n"
13962+ " pxor 48(%3), %%mm6 ;\n"
13963+ " pxor 56(%3), %%mm7 ;\n"
13964+ " movq %%mm6, 48(%1) ;\n"
13965+ " movq %%mm7, 56(%1) ;\n"
13966+
13967+ " addl $64, %1 ;\n"
13968+ " addl $64, %2 ;\n"
13969+ " addl $64, %3 ;\n"
13970+ " decl %0 ;\n"
13971+ " jnz 1b ;\n"
13972+
13973+ :
13974+ : "r" (lines),
13975+ "r" (bh_ptr[0]->b_data),
13976+ "r" (bh_ptr[1]->b_data),
13977+ "r" (bh_ptr[2]->b_data)
13978+ : "memory" );
13979+ break;
13980+ case 4:
13981+ __asm__ __volatile__ (
13982+
13983+ " .align 32,0x90 ;\n"
13984+ " 1: ;\n"
13985+ " movq (%1), %%mm0 ;\n"
13986+ " movq 8(%1), %%mm1 ;\n"
13987+ " pxor (%2), %%mm0 ;\n"
13988+ " movq 16(%1), %%mm2 ;\n"
13989+ " pxor 8(%2), %%mm1 ;\n"
13990+ " pxor (%3), %%mm0 ;\n"
13991+ " pxor 16(%2), %%mm2 ;\n"
13992+ " pxor 8(%3), %%mm1 ;\n"
13993+ " pxor (%4), %%mm0 ;\n"
13994+ " movq 24(%1), %%mm3 ;\n"
13995+ " pxor 16(%3), %%mm2 ;\n"
13996+ " pxor 8(%4), %%mm1 ;\n"
13997+ " movq %%mm0, (%1) ;\n"
13998+ " movq 32(%1), %%mm4 ;\n"
13999+ " pxor 24(%2), %%mm3 ;\n"
14000+ " pxor 16(%4), %%mm2 ;\n"
14001+ " movq %%mm1, 8(%1) ;\n"
14002+ " movq 40(%1), %%mm5 ;\n"
14003+ " pxor 32(%2), %%mm4 ;\n"
14004+ " pxor 24(%3), %%mm3 ;\n"
14005+ " movq %%mm2, 16(%1) ;\n"
14006+ " pxor 40(%2), %%mm5 ;\n"
14007+ " pxor 32(%3), %%mm4 ;\n"
14008+ " pxor 24(%4), %%mm3 ;\n"
14009+ " movq %%mm3, 24(%1) ;\n"
14010+ " movq 56(%1), %%mm7 ;\n"
14011+ " movq 48(%1), %%mm6 ;\n"
14012+ " pxor 40(%3), %%mm5 ;\n"
14013+ " pxor 32(%4), %%mm4 ;\n"
14014+ " pxor 48(%2), %%mm6 ;\n"
14015+ " movq %%mm4, 32(%1) ;\n"
14016+ " pxor 56(%2), %%mm7 ;\n"
14017+ " pxor 40(%4), %%mm5 ;\n"
14018+ " pxor 48(%3), %%mm6 ;\n"
14019+ " pxor 56(%3), %%mm7 ;\n"
14020+ " movq %%mm5, 40(%1) ;\n"
14021+ " pxor 48(%4), %%mm6 ;\n"
14022+ " pxor 56(%4), %%mm7 ;\n"
14023+ " movq %%mm6, 48(%1) ;\n"
14024+ " movq %%mm7, 56(%1) ;\n"
14025+
14026+ " addl $64, %1 ;\n"
14027+ " addl $64, %2 ;\n"
14028+ " addl $64, %3 ;\n"
14029+ " addl $64, %4 ;\n"
14030+ " decl %0 ;\n"
14031+ " jnz 1b ;\n"
14032+
14033+ :
14034+ : "r" (lines),
14035+ "r" (bh_ptr[0]->b_data),
14036+ "r" (bh_ptr[1]->b_data),
14037+ "r" (bh_ptr[2]->b_data),
14038+ "r" (bh_ptr[3]->b_data)
14039+ : "memory" );
14040+ break;
14041+ case 5:
14042+ __asm__ __volatile__ (
14043+
14044+ " .align 32,0x90 ;\n"
14045+ " 1: ;\n"
14046+ " movq (%1), %%mm0 ;\n"
14047+ " movq 8(%1), %%mm1 ;\n"
14048+ " pxor (%2), %%mm0 ;\n"
14049+ " pxor 8(%2), %%mm1 ;\n"
14050+ " movq 16(%1), %%mm2 ;\n"
14051+ " pxor (%3), %%mm0 ;\n"
14052+ " pxor 8(%3), %%mm1 ;\n"
14053+ " pxor 16(%2), %%mm2 ;\n"
14054+ " pxor (%4), %%mm0 ;\n"
14055+ " pxor 8(%4), %%mm1 ;\n"
14056+ " pxor 16(%3), %%mm2 ;\n"
14057+ " movq 24(%1), %%mm3 ;\n"
14058+ " pxor (%5), %%mm0 ;\n"
14059+ " pxor 8(%5), %%mm1 ;\n"
14060+ " movq %%mm0, (%1) ;\n"
14061+ " pxor 16(%4), %%mm2 ;\n"
14062+ " pxor 24(%2), %%mm3 ;\n"
14063+ " movq %%mm1, 8(%1) ;\n"
14064+ " pxor 16(%5), %%mm2 ;\n"
14065+ " pxor 24(%3), %%mm3 ;\n"
14066+ " movq 32(%1), %%mm4 ;\n"
14067+ " movq %%mm2, 16(%1) ;\n"
14068+ " pxor 24(%4), %%mm3 ;\n"
14069+ " pxor 32(%2), %%mm4 ;\n"
14070+ " movq 40(%1), %%mm5 ;\n"
14071+ " pxor 24(%5), %%mm3 ;\n"
14072+ " pxor 32(%3), %%mm4 ;\n"
14073+ " pxor 40(%2), %%mm5 ;\n"
14074+ " movq %%mm3, 24(%1) ;\n"
14075+ " pxor 32(%4), %%mm4 ;\n"
14076+ " pxor 40(%3), %%mm5 ;\n"
14077+ " movq 48(%1), %%mm6 ;\n"
14078+ " movq 56(%1), %%mm7 ;\n"
14079+ " pxor 32(%5), %%mm4 ;\n"
14080+ " pxor 40(%4), %%mm5 ;\n"
14081+ " pxor 48(%2), %%mm6 ;\n"
14082+ " pxor 56(%2), %%mm7 ;\n"
14083+ " movq %%mm4, 32(%1) ;\n"
14084+ " pxor 48(%3), %%mm6 ;\n"
14085+ " pxor 56(%3), %%mm7 ;\n"
14086+ " pxor 40(%5), %%mm5 ;\n"
14087+ " pxor 48(%4), %%mm6 ;\n"
14088+ " pxor 56(%4), %%mm7 ;\n"
14089+ " movq %%mm5, 40(%1) ;\n"
14090+ " pxor 48(%5), %%mm6 ;\n"
14091+ " pxor 56(%5), %%mm7 ;\n"
14092+ " movq %%mm6, 48(%1) ;\n"
14093+ " movq %%mm7, 56(%1) ;\n"
14094+
14095+ " addl $64, %1 ;\n"
14096+ " addl $64, %2 ;\n"
14097+ " addl $64, %3 ;\n"
14098+ " addl $64, %4 ;\n"
14099+ " addl $64, %5 ;\n"
14100+ " decl %0 ;\n"
14101+ " jnz 1b ;\n"
14102+
14103+ :
14104+ : "r" (lines),
14105+ "r" (bh_ptr[0]->b_data),
14106+ "r" (bh_ptr[1]->b_data),
14107+ "r" (bh_ptr[2]->b_data),
14108+ "r" (bh_ptr[3]->b_data),
14109+ "r" (bh_ptr[4]->b_data)
14110+ : "memory" );
14111+ break;
14112+ }
14113+
14114+ __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
14115+
14116+ if (!(current->flags & PF_USEDFPU))
14117+ stts();
14118+}
14119+#endif /* __i386__ */
14120+#endif /* !__sparc_v9__ */
14121+
14122+#ifdef __sparc_v9__
14123+/*
14124+ * High speed xor_block operation for RAID4/5 utilizing the
14125+ * UltraSparc Visual Instruction Set.
14126+ *
14127+ * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
14128+ *
14129+ * Requirements:
14130+ * !(((long)dest | (long)sourceN) & (64 - 1)) &&
14131+ * !(len & 127) && len >= 256
14132+ *
14133+ * It is done in pure assembly, as otherwise gcc makes it
14134+ * a non-leaf function, which is not what we want.
14135+ * Also, we don't measure the speeds as on other architectures,
14136+ * as the measuring routine does not take into account cold caches
14137+ * and the fact that xor_block_VIS bypasses the caches.
14138+ * xor_block_32regs might be 5% faster for count 2 if caches are hot
14139+ * and things are just right (for count 3 VIS is about as fast as 32regs
14140+ * for hot caches, and for counts 4 and 5 VIS is always faster by a good
14141+ * margin), but I think it is better not to pollute the caches.
14142+ * Actually, if I were just fighting for speed on hot caches, I could
14143+ * write a hybrid VIS/integer routine, which would always do two
14144+ * 64B blocks in VIS and two in the IEUs, but I really care more about
14145+ * the caches.
14146+ */
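The alignment and length requirements quoted above are the caller's responsibility. Purely as an illustrative sketch (not part of this patch), a check of those preconditions could look like the following, using the same buffer_head fields the rest of this file already relies on:

/* Sketch only: returns non-zero when the stated xor_block_VIS
 * preconditions hold for all count buffers. */
static int vis_xor_ok(int count, struct buffer_head *bh_ptr[])
{
	unsigned long addrs = 0;
	int len = bh_ptr[0]->b_size;
	int i;

	for (i = 0; i < count; i++)
		addrs |= (unsigned long) bh_ptr[i]->b_data;

	return !(addrs & (64 - 1)) &&	/* every buffer 64-byte aligned */
	       !(len & 127) &&		/* length a multiple of 128     */
	       len >= 256;		/* and at least 256 bytes       */
}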
14147+extern void *VISenter(void);
14148+extern void xor_block_VIS XOR_ARGS;
14149+
14150+void __xor_block_VIS(void)
14151+{
14152+__asm__ ("
14153+ .globl xor_block_VIS
14154+xor_block_VIS:
14155+ ldx [%%o1 + 0], %%o4
14156+ ldx [%%o1 + 8], %%o3
14157+ ldx [%%o4 + %1], %%g5
14158+ ldx [%%o4 + %0], %%o4
14159+ ldx [%%o3 + %0], %%o3
14160+ rd %%fprs, %%o5
14161+ andcc %%o5, %2, %%g0
14162+ be,pt %%icc, 297f
14163+ sethi %%hi(%5), %%g1
14164+ jmpl %%g1 + %%lo(%5), %%g7
14165+ add %%g7, 8, %%g7
14166+297: wr %%g0, %4, %%fprs
14167+ membar #LoadStore|#StoreLoad|#StoreStore
14168+ sub %%g5, 64, %%g5
14169+ ldda [%%o4] %3, %%f0
14170+ ldda [%%o3] %3, %%f16
14171+ cmp %%o0, 4
14172+ bgeu,pt %%xcc, 10f
14173+ cmp %%o0, 3
14174+ be,pn %%xcc, 13f
14175+ mov -64, %%g1
14176+ sub %%g5, 64, %%g5
14177+ rd %%asi, %%g1
14178+ wr %%g0, %3, %%asi
14179+
14180+2: ldda [%%o4 + 64] %%asi, %%f32
14181+ fxor %%f0, %%f16, %%f16
14182+ fxor %%f2, %%f18, %%f18
14183+ fxor %%f4, %%f20, %%f20
14184+ fxor %%f6, %%f22, %%f22
14185+ fxor %%f8, %%f24, %%f24
14186+ fxor %%f10, %%f26, %%f26
14187+ fxor %%f12, %%f28, %%f28
14188+ fxor %%f14, %%f30, %%f30
14189+ stda %%f16, [%%o4] %3
14190+ ldda [%%o3 + 64] %%asi, %%f48
14191+ ldda [%%o4 + 128] %%asi, %%f0
14192+ fxor %%f32, %%f48, %%f48
14193+ fxor %%f34, %%f50, %%f50
14194+ add %%o4, 128, %%o4
14195+ fxor %%f36, %%f52, %%f52
14196+ add %%o3, 128, %%o3
14197+ fxor %%f38, %%f54, %%f54
14198+ subcc %%g5, 128, %%g5
14199+ fxor %%f40, %%f56, %%f56
14200+ fxor %%f42, %%f58, %%f58
14201+ fxor %%f44, %%f60, %%f60
14202+ fxor %%f46, %%f62, %%f62
14203+ stda %%f48, [%%o4 - 64] %%asi
14204+ bne,pt %%xcc, 2b
14205+ ldda [%%o3] %3, %%f16
14206+
14207+ ldda [%%o4 + 64] %%asi, %%f32
14208+ fxor %%f0, %%f16, %%f16
14209+ fxor %%f2, %%f18, %%f18
14210+ fxor %%f4, %%f20, %%f20
14211+ fxor %%f6, %%f22, %%f22
14212+ fxor %%f8, %%f24, %%f24
14213+ fxor %%f10, %%f26, %%f26
14214+ fxor %%f12, %%f28, %%f28
14215+ fxor %%f14, %%f30, %%f30
14216+ stda %%f16, [%%o4] %3
14217+ ldda [%%o3 + 64] %%asi, %%f48
14218+ membar #Sync
14219+ fxor %%f32, %%f48, %%f48
14220+ fxor %%f34, %%f50, %%f50
14221+ fxor %%f36, %%f52, %%f52
14222+ fxor %%f38, %%f54, %%f54
14223+ fxor %%f40, %%f56, %%f56
14224+ fxor %%f42, %%f58, %%f58
14225+ fxor %%f44, %%f60, %%f60
14226+ fxor %%f46, %%f62, %%f62
14227+ stda %%f48, [%%o4 + 64] %%asi
14228+ membar #Sync|#StoreStore|#StoreLoad
14229+ wr %%g0, 0, %%fprs
14230+ retl
14231+ wr %%g1, %%g0, %%asi
14232+
14233+13: ldx [%%o1 + 16], %%o2
14234+ ldx [%%o2 + %0], %%o2
14235+
14236+3: ldda [%%o2] %3, %%f32
14237+ fxor %%f0, %%f16, %%f48
14238+ fxor %%f2, %%f18, %%f50
14239+ add %%o4, 64, %%o4
14240+ fxor %%f4, %%f20, %%f52
14241+ fxor %%f6, %%f22, %%f54
14242+ add %%o3, 64, %%o3
14243+ fxor %%f8, %%f24, %%f56
14244+ fxor %%f10, %%f26, %%f58
14245+ fxor %%f12, %%f28, %%f60
14246+ fxor %%f14, %%f30, %%f62
14247+ ldda [%%o4] %3, %%f0
14248+ fxor %%f48, %%f32, %%f48
14249+ fxor %%f50, %%f34, %%f50
14250+ fxor %%f52, %%f36, %%f52
14251+ fxor %%f54, %%f38, %%f54
14252+ add %%o2, 64, %%o2
14253+ fxor %%f56, %%f40, %%f56
14254+ fxor %%f58, %%f42, %%f58
14255+ subcc %%g5, 64, %%g5
14256+ fxor %%f60, %%f44, %%f60
14257+ fxor %%f62, %%f46, %%f62
14258+ stda %%f48, [%%o4 + %%g1] %3
14259+ bne,pt %%xcc, 3b
14260+ ldda [%%o3] %3, %%f16
14261+
14262+ ldda [%%o2] %3, %%f32
14263+ fxor %%f0, %%f16, %%f48
14264+ fxor %%f2, %%f18, %%f50
14265+ fxor %%f4, %%f20, %%f52
14266+ fxor %%f6, %%f22, %%f54
14267+ fxor %%f8, %%f24, %%f56
14268+ fxor %%f10, %%f26, %%f58
14269+ fxor %%f12, %%f28, %%f60
14270+ fxor %%f14, %%f30, %%f62
14271+ membar #Sync
14272+ fxor %%f48, %%f32, %%f48
14273+ fxor %%f50, %%f34, %%f50
14274+ fxor %%f52, %%f36, %%f52
14275+ fxor %%f54, %%f38, %%f54
14276+ fxor %%f56, %%f40, %%f56
14277+ fxor %%f58, %%f42, %%f58
14278+ fxor %%f60, %%f44, %%f60
14279+ fxor %%f62, %%f46, %%f62
14280+ stda %%f48, [%%o4] %3
14281+ membar #Sync|#StoreStore|#StoreLoad
14282+ retl
14283+ wr %%g0, 0, %%fprs
14284+
14285+10: cmp %%o0, 5
14286+ be,pt %%xcc, 15f
14287+ mov -64, %%g1
14288+
14289+14: ldx [%%o1 + 16], %%o2
14290+ ldx [%%o1 + 24], %%o0
14291+ ldx [%%o2 + %0], %%o2
14292+ ldx [%%o0 + %0], %%o0
14293+
14294+4: ldda [%%o2] %3, %%f32
14295+ fxor %%f0, %%f16, %%f16
14296+ fxor %%f2, %%f18, %%f18
14297+ add %%o4, 64, %%o4
14298+ fxor %%f4, %%f20, %%f20
14299+ fxor %%f6, %%f22, %%f22
14300+ add %%o3, 64, %%o3
14301+ fxor %%f8, %%f24, %%f24
14302+ fxor %%f10, %%f26, %%f26
14303+ fxor %%f12, %%f28, %%f28
14304+ fxor %%f14, %%f30, %%f30
14305+ ldda [%%o0] %3, %%f48
14306+ fxor %%f16, %%f32, %%f32
14307+ fxor %%f18, %%f34, %%f34
14308+ fxor %%f20, %%f36, %%f36
14309+ fxor %%f22, %%f38, %%f38
14310+ add %%o2, 64, %%o2
14311+ fxor %%f24, %%f40, %%f40
14312+ fxor %%f26, %%f42, %%f42
14313+ fxor %%f28, %%f44, %%f44
14314+ fxor %%f30, %%f46, %%f46
14315+ ldda [%%o4] %3, %%f0
14316+ fxor %%f32, %%f48, %%f48
14317+ fxor %%f34, %%f50, %%f50
14318+ fxor %%f36, %%f52, %%f52
14319+ add %%o0, 64, %%o0
14320+ fxor %%f38, %%f54, %%f54
14321+ fxor %%f40, %%f56, %%f56
14322+ fxor %%f42, %%f58, %%f58
14323+ subcc %%g5, 64, %%g5
14324+ fxor %%f44, %%f60, %%f60
14325+ fxor %%f46, %%f62, %%f62
14326+ stda %%f48, [%%o4 + %%g1] %3
14327+ bne,pt %%xcc, 4b
14328+ ldda [%%o3] %3, %%f16
14329+
14330+ ldda [%%o2] %3, %%f32
14331+ fxor %%f0, %%f16, %%f16
14332+ fxor %%f2, %%f18, %%f18
14333+ fxor %%f4, %%f20, %%f20
14334+ fxor %%f6, %%f22, %%f22
14335+ fxor %%f8, %%f24, %%f24
14336+ fxor %%f10, %%f26, %%f26
14337+ fxor %%f12, %%f28, %%f28
14338+ fxor %%f14, %%f30, %%f30
14339+ ldda [%%o0] %3, %%f48
14340+ fxor %%f16, %%f32, %%f32
14341+ fxor %%f18, %%f34, %%f34
14342+ fxor %%f20, %%f36, %%f36
14343+ fxor %%f22, %%f38, %%f38
14344+ fxor %%f24, %%f40, %%f40
14345+ fxor %%f26, %%f42, %%f42
14346+ fxor %%f28, %%f44, %%f44
14347+ fxor %%f30, %%f46, %%f46
14348+ membar #Sync
14349+ fxor %%f32, %%f48, %%f48
14350+ fxor %%f34, %%f50, %%f50
14351+ fxor %%f36, %%f52, %%f52
14352+ fxor %%f38, %%f54, %%f54
14353+ fxor %%f40, %%f56, %%f56
14354+ fxor %%f42, %%f58, %%f58
14355+ fxor %%f44, %%f60, %%f60
14356+ fxor %%f46, %%f62, %%f62
14357+ stda %%f48, [%%o4] %3
14358+ membar #Sync|#StoreStore|#StoreLoad
14359+ retl
14360+ wr %%g0, 0, %%fprs
14361+
14362+15: ldx [%%o1 + 16], %%o2
14363+ ldx [%%o1 + 24], %%o0
14364+ ldx [%%o1 + 32], %%o1
14365+ ldx [%%o2 + %0], %%o2
14366+ ldx [%%o0 + %0], %%o0
14367+ ldx [%%o1 + %0], %%o1
14368+
14369+5: ldda [%%o2] %3, %%f32
14370+ fxor %%f0, %%f16, %%f48
14371+ fxor %%f2, %%f18, %%f50
14372+ add %%o4, 64, %%o4
14373+ fxor %%f4, %%f20, %%f52
14374+ fxor %%f6, %%f22, %%f54
14375+ add %%o3, 64, %%o3
14376+ fxor %%f8, %%f24, %%f56
14377+ fxor %%f10, %%f26, %%f58
14378+ fxor %%f12, %%f28, %%f60
14379+ fxor %%f14, %%f30, %%f62
14380+ ldda [%%o0] %3, %%f16
14381+ fxor %%f48, %%f32, %%f48
14382+ fxor %%f50, %%f34, %%f50
14383+ fxor %%f52, %%f36, %%f52
14384+ fxor %%f54, %%f38, %%f54
14385+ add %%o2, 64, %%o2
14386+ fxor %%f56, %%f40, %%f56
14387+ fxor %%f58, %%f42, %%f58
14388+ fxor %%f60, %%f44, %%f60
14389+ fxor %%f62, %%f46, %%f62
14390+ ldda [%%o1] %3, %%f32
14391+ fxor %%f48, %%f16, %%f48
14392+ fxor %%f50, %%f18, %%f50
14393+ add %%o0, 64, %%o0
14394+ fxor %%f52, %%f20, %%f52
14395+ fxor %%f54, %%f22, %%f54
14396+ add %%o1, 64, %%o1
14397+ fxor %%f56, %%f24, %%f56
14398+ fxor %%f58, %%f26, %%f58
14399+ fxor %%f60, %%f28, %%f60
14400+ fxor %%f62, %%f30, %%f62
14401+ ldda [%%o4] %3, %%f0
14402+ fxor %%f48, %%f32, %%f48
14403+ fxor %%f50, %%f34, %%f50
14404+ fxor %%f52, %%f36, %%f52
14405+ fxor %%f54, %%f38, %%f54
14406+ fxor %%f56, %%f40, %%f56
14407+ fxor %%f58, %%f42, %%f58
14408+ subcc %%g5, 64, %%g5
14409+ fxor %%f60, %%f44, %%f60
14410+ fxor %%f62, %%f46, %%f62
14411+ stda %%f48, [%%o4 + %%g1] %3
14412+ bne,pt %%xcc, 5b
14413+ ldda [%%o3] %3, %%f16
14414+
14415+ ldda [%%o2] %3, %%f32
14416+ fxor %%f0, %%f16, %%f48
14417+ fxor %%f2, %%f18, %%f50
14418+ fxor %%f4, %%f20, %%f52
14419+ fxor %%f6, %%f22, %%f54
14420+ fxor %%f8, %%f24, %%f56
14421+ fxor %%f10, %%f26, %%f58
14422+ fxor %%f12, %%f28, %%f60
14423+ fxor %%f14, %%f30, %%f62
14424+ ldda [%%o0] %3, %%f16
14425+ fxor %%f48, %%f32, %%f48
14426+ fxor %%f50, %%f34, %%f50
14427+ fxor %%f52, %%f36, %%f52
14428+ fxor %%f54, %%f38, %%f54
14429+ fxor %%f56, %%f40, %%f56
14430+ fxor %%f58, %%f42, %%f58
14431+ fxor %%f60, %%f44, %%f60
14432+ fxor %%f62, %%f46, %%f62
14433+ ldda [%%o1] %3, %%f32
14434+ fxor %%f48, %%f16, %%f48
14435+ fxor %%f50, %%f18, %%f50
14436+ fxor %%f52, %%f20, %%f52
14437+ fxor %%f54, %%f22, %%f54
14438+ fxor %%f56, %%f24, %%f56
14439+ fxor %%f58, %%f26, %%f58
14440+ fxor %%f60, %%f28, %%f60
14441+ fxor %%f62, %%f30, %%f62
14442+ membar #Sync
14443+ fxor %%f48, %%f32, %%f48
14444+ fxor %%f50, %%f34, %%f50
14445+ fxor %%f52, %%f36, %%f52
14446+ fxor %%f54, %%f38, %%f54
14447+ fxor %%f56, %%f40, %%f56
14448+ fxor %%f58, %%f42, %%f58
14449+ fxor %%f60, %%f44, %%f60
14450+ fxor %%f62, %%f46, %%f62
14451+ stda %%f48, [%%o4] %3
14452+ membar #Sync|#StoreStore|#StoreLoad
14453+ retl
14454+ wr %%g0, 0, %%fprs
14455+ " : :
14456+ "i" (&((struct buffer_head *)0)->b_data),
14457+ "i" (&((struct buffer_head *)0)->b_size),
14458+ "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P),
14459+ "i" (FPRS_FEF), "i" (VISenter));
14460+}
14461+#endif /* __sparc_v9__ */
14462+
14463+#if defined(__sparc__) && !defined(__sparc_v9__)
14464+/*
14465+ * High speed xor_block operation for RAID4/5 utilizing the
14466+ * ldd/std SPARC instructions.
14467+ *
14468+ * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
14469+ *
14470+ */
14471+
14472+XORBLOCK_TEMPLATE(SPARC)
14473+{
14474+ int size = bh_ptr[0]->b_size;
14475+ int lines = size / (sizeof (long)) / 8, i;
14476+ long *destp = (long *) bh_ptr[0]->b_data;
14477+ long *source1 = (long *) bh_ptr[1]->b_data;
14478+ long *source2, *source3, *source4;
14479+
14480+ switch (count) {
14481+ case 2:
14482+ for (i = lines; i > 0; i--) {
14483+ __asm__ __volatile__("
14484+ ldd [%0 + 0x00], %%g2
14485+ ldd [%0 + 0x08], %%g4
14486+ ldd [%0 + 0x10], %%o0
14487+ ldd [%0 + 0x18], %%o2
14488+ ldd [%1 + 0x00], %%o4
14489+ ldd [%1 + 0x08], %%l0
14490+ ldd [%1 + 0x10], %%l2
14491+ ldd [%1 + 0x18], %%l4
14492+ xor %%g2, %%o4, %%g2
14493+ xor %%g3, %%o5, %%g3
14494+ xor %%g4, %%l0, %%g4
14495+ xor %%g5, %%l1, %%g5
14496+ xor %%o0, %%l2, %%o0
14497+ xor %%o1, %%l3, %%o1
14498+ xor %%o2, %%l4, %%o2
14499+ xor %%o3, %%l5, %%o3
14500+ std %%g2, [%0 + 0x00]
14501+ std %%g4, [%0 + 0x08]
14502+ std %%o0, [%0 + 0x10]
14503+ std %%o2, [%0 + 0x18]
14504+ " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0",
14505+ "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5");
14506+ destp += 8;
14507+ source1 += 8;
14508+ }
14509+ break;
14510+ case 3:
14511+ source2 = (long *) bh_ptr[2]->b_data;
14512+ for (i = lines; i > 0; i--) {
14513+ __asm__ __volatile__("
14514+ ldd [%0 + 0x00], %%g2
14515+ ldd [%0 + 0x08], %%g4
14516+ ldd [%0 + 0x10], %%o0
14517+ ldd [%0 + 0x18], %%o2
14518+ ldd [%1 + 0x00], %%o4
14519+ ldd [%1 + 0x08], %%l0
14520+ ldd [%1 + 0x10], %%l2
14521+ ldd [%1 + 0x18], %%l4
14522+ xor %%g2, %%o4, %%g2
14523+ xor %%g3, %%o5, %%g3
14524+ ldd [%2 + 0x00], %%o4
14525+ xor %%g4, %%l0, %%g4
14526+ xor %%g5, %%l1, %%g5
14527+ ldd [%2 + 0x08], %%l0
14528+ xor %%o0, %%l2, %%o0
14529+ xor %%o1, %%l3, %%o1
14530+ ldd [%2 + 0x10], %%l2
14531+ xor %%o2, %%l4, %%o2
14532+ xor %%o3, %%l5, %%o3
14533+ ldd [%2 + 0x18], %%l4
14534+ xor %%g2, %%o4, %%g2
14535+ xor %%g3, %%o5, %%g3
14536+ xor %%g4, %%l0, %%g4
14537+ xor %%g5, %%l1, %%g5
14538+ xor %%o0, %%l2, %%o0
14539+ xor %%o1, %%l3, %%o1
14540+ xor %%o2, %%l4, %%o2
14541+ xor %%o3, %%l5, %%o3
14542+ std %%g2, [%0 + 0x00]
14543+ std %%g4, [%0 + 0x08]
14544+ std %%o0, [%0 + 0x10]
14545+ std %%o2, [%0 + 0x18]
14546+ " : : "r" (destp), "r" (source1), "r" (source2)
14547+ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
14548+ "l0", "l1", "l2", "l3", "l4", "l5");
14549+ destp += 8;
14550+ source1 += 8;
14551+ source2 += 8;
14552+ }
14553+ break;
14554+ case 4:
14555+ source2 = (long *) bh_ptr[2]->b_data;
14556+ source3 = (long *) bh_ptr[3]->b_data;
14557+ for (i = lines; i > 0; i--) {
14558+ __asm__ __volatile__("
14559+ ldd [%0 + 0x00], %%g2
14560+ ldd [%0 + 0x08], %%g4
14561+ ldd [%0 + 0x10], %%o0
14562+ ldd [%0 + 0x18], %%o2
14563+ ldd [%1 + 0x00], %%o4
14564+ ldd [%1 + 0x08], %%l0
14565+ ldd [%1 + 0x10], %%l2
14566+ ldd [%1 + 0x18], %%l4
14567+ xor %%g2, %%o4, %%g2
14568+ xor %%g3, %%o5, %%g3
14569+ ldd [%2 + 0x00], %%o4
14570+ xor %%g4, %%l0, %%g4
14571+ xor %%g5, %%l1, %%g5
14572+ ldd [%2 + 0x08], %%l0
14573+ xor %%o0, %%l2, %%o0
14574+ xor %%o1, %%l3, %%o1
14575+ ldd [%2 + 0x10], %%l2
14576+ xor %%o2, %%l4, %%o2
14577+ xor %%o3, %%l5, %%o3
14578+ ldd [%2 + 0x18], %%l4
14579+ xor %%g2, %%o4, %%g2
14580+ xor %%g3, %%o5, %%g3
14581+ ldd [%3 + 0x00], %%o4
14582+ xor %%g4, %%l0, %%g4
14583+ xor %%g5, %%l1, %%g5
14584+ ldd [%3 + 0x08], %%l0
14585+ xor %%o0, %%l2, %%o0
14586+ xor %%o1, %%l3, %%o1
14587+ ldd [%3 + 0x10], %%l2
14588+ xor %%o2, %%l4, %%o2
14589+ xor %%o3, %%l5, %%o3
14590+ ldd [%3 + 0x18], %%l4
14591+ xor %%g2, %%o4, %%g2
14592+ xor %%g3, %%o5, %%g3
14593+ xor %%g4, %%l0, %%g4
14594+ xor %%g5, %%l1, %%g5
14595+ xor %%o0, %%l2, %%o0
14596+ xor %%o1, %%l3, %%o1
14597+ xor %%o2, %%l4, %%o2
14598+ xor %%o3, %%l5, %%o3
14599+ std %%g2, [%0 + 0x00]
14600+ std %%g4, [%0 + 0x08]
14601+ std %%o0, [%0 + 0x10]
14602+ std %%o2, [%0 + 0x18]
14603+ " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3)
14604+ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
14605+ "l0", "l1", "l2", "l3", "l4", "l5");
14606+ destp += 8;
14607+ source1 += 8;
14608+ source2 += 8;
14609+ source3 += 8;
14610+ }
14611+ break;
14612+ case 5:
14613+ source2 = (long *) bh_ptr[2]->b_data;
14614+ source3 = (long *) bh_ptr[3]->b_data;
14615+ source4 = (long *) bh_ptr[4]->b_data;
14616+ for (i = lines; i > 0; i--) {
14617+ __asm__ __volatile__("
14618+ ldd [%0 + 0x00], %%g2
14619+ ldd [%0 + 0x08], %%g4
14620+ ldd [%0 + 0x10], %%o0
14621+ ldd [%0 + 0x18], %%o2
14622+ ldd [%1 + 0x00], %%o4
14623+ ldd [%1 + 0x08], %%l0
14624+ ldd [%1 + 0x10], %%l2
14625+ ldd [%1 + 0x18], %%l4
14626+ xor %%g2, %%o4, %%g2
14627+ xor %%g3, %%o5, %%g3
14628+ ldd [%2 + 0x00], %%o4
14629+ xor %%g4, %%l0, %%g4
14630+ xor %%g5, %%l1, %%g5
14631+ ldd [%2 + 0x08], %%l0
14632+ xor %%o0, %%l2, %%o0
14633+ xor %%o1, %%l3, %%o1
14634+ ldd [%2 + 0x10], %%l2
14635+ xor %%o2, %%l4, %%o2
14636+ xor %%o3, %%l5, %%o3
14637+ ldd [%2 + 0x18], %%l4
14638+ xor %%g2, %%o4, %%g2
14639+ xor %%g3, %%o5, %%g3
14640+ ldd [%3 + 0x00], %%o4
14641+ xor %%g4, %%l0, %%g4
14642+ xor %%g5, %%l1, %%g5
14643+ ldd [%3 + 0x08], %%l0
14644+ xor %%o0, %%l2, %%o0
14645+ xor %%o1, %%l3, %%o1
14646+ ldd [%3 + 0x10], %%l2
14647+ xor %%o2, %%l4, %%o2
14648+ xor %%o3, %%l5, %%o3
14649+ ldd [%3 + 0x18], %%l4
14650+ xor %%g2, %%o4, %%g2
14651+ xor %%g3, %%o5, %%g3
14652+ ldd [%4 + 0x00], %%o4
14653+ xor %%g4, %%l0, %%g4
14654+ xor %%g5, %%l1, %%g5
14655+ ldd [%4 + 0x08], %%l0
14656+ xor %%o0, %%l2, %%o0
14657+ xor %%o1, %%l3, %%o1
14658+ ldd [%4 + 0x10], %%l2
14659+ xor %%o2, %%l4, %%o2
14660+ xor %%o3, %%l5, %%o3
14661+ ldd [%4 + 0x18], %%l4
14662+ xor %%g2, %%o4, %%g2
14663+ xor %%g3, %%o5, %%g3
14664+ xor %%g4, %%l0, %%g4
14665+ xor %%g5, %%l1, %%g5
14666+ xor %%o0, %%l2, %%o0
14667+ xor %%o1, %%l3, %%o1
14668+ xor %%o2, %%l4, %%o2
14669+ xor %%o3, %%l5, %%o3
14670+ std %%g2, [%0 + 0x00]
14671+ std %%g4, [%0 + 0x08]
14672+ std %%o0, [%0 + 0x10]
14673+ std %%o2, [%0 + 0x18]
14674+ " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4)
14675+ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
14676+ "l0", "l1", "l2", "l3", "l4", "l5");
14677+ destp += 8;
14678+ source1 += 8;
14679+ source2 += 8;
14680+ source3 += 8;
14681+ source4 += 8;
14682+ }
14683+ break;
14684+ }
14685+}
14686+#endif /* __sparc_v[78]__ */
14687+
14688+#ifndef __sparc_v9__
14689+
14690+/*
14691+ * this one works reasonably on any x86 CPU
14692+ * (send me an assembly version for inclusion if you can make it faster)
14693+ *
14694+ * this one is just as fast as if it were written in pure assembly on x86.
14695+ * the reason for this separate version is that the
14696+ * fast open-coded xor routine "32regs" produces suboptimal code
14697+ * on x86, due to the lack of registers.
14698+ */
14699+XORBLOCK_TEMPLATE(8regs)
14700+{
14701+ int len = bh_ptr[0]->b_size;
14702+ long *destp = (long *) bh_ptr[0]->b_data;
14703+ long *source1, *source2, *source3, *source4;
14704+ long lines = len / (sizeof (long)) / 8, i;
14705+
14706+ switch(count) {
14707+ case 2:
14708+ source1 = (long *) bh_ptr[1]->b_data;
14709+ for (i = lines; i > 0; i--) {
14710+ *(destp + 0) ^= *(source1 + 0);
14711+ *(destp + 1) ^= *(source1 + 1);
14712+ *(destp + 2) ^= *(source1 + 2);
14713+ *(destp + 3) ^= *(source1 + 3);
14714+ *(destp + 4) ^= *(source1 + 4);
14715+ *(destp + 5) ^= *(source1 + 5);
14716+ *(destp + 6) ^= *(source1 + 6);
14717+ *(destp + 7) ^= *(source1 + 7);
14718+ source1 += 8;
14719+ destp += 8;
14720+ }
14721+ break;
14722+ case 3:
14723+ source2 = (long *) bh_ptr[2]->b_data;
14724+ source1 = (long *) bh_ptr[1]->b_data;
14725+ for (i = lines; i > 0; i--) {
14726+ *(destp + 0) ^= *(source1 + 0);
14727+ *(destp + 0) ^= *(source2 + 0);
14728+ *(destp + 1) ^= *(source1 + 1);
14729+ *(destp + 1) ^= *(source2 + 1);
14730+ *(destp + 2) ^= *(source1 + 2);
14731+ *(destp + 2) ^= *(source2 + 2);
14732+ *(destp + 3) ^= *(source1 + 3);
14733+ *(destp + 3) ^= *(source2 + 3);
14734+ *(destp + 4) ^= *(source1 + 4);
14735+ *(destp + 4) ^= *(source2 + 4);
14736+ *(destp + 5) ^= *(source1 + 5);
14737+ *(destp + 5) ^= *(source2 + 5);
14738+ *(destp + 6) ^= *(source1 + 6);
14739+ *(destp + 6) ^= *(source2 + 6);
14740+ *(destp + 7) ^= *(source1 + 7);
14741+ *(destp + 7) ^= *(source2 + 7);
14742+ source1 += 8;
14743+ source2 += 8;
14744+ destp += 8;
14745+ }
14746+ break;
14747+ case 4:
14748+ source3 = (long *) bh_ptr[3]->b_data;
14749+ source2 = (long *) bh_ptr[2]->b_data;
14750+ source1 = (long *) bh_ptr[1]->b_data;
14751+ for (i = lines; i > 0; i--) {
14752+ *(destp + 0) ^= *(source1 + 0);
14753+ *(destp + 0) ^= *(source2 + 0);
14754+ *(destp + 0) ^= *(source3 + 0);
14755+ *(destp + 1) ^= *(source1 + 1);
14756+ *(destp + 1) ^= *(source2 + 1);
14757+ *(destp + 1) ^= *(source3 + 1);
14758+ *(destp + 2) ^= *(source1 + 2);
14759+ *(destp + 2) ^= *(source2 + 2);
14760+ *(destp + 2) ^= *(source3 + 2);
14761+ *(destp + 3) ^= *(source1 + 3);
14762+ *(destp + 3) ^= *(source2 + 3);
14763+ *(destp + 3) ^= *(source3 + 3);
14764+ *(destp + 4) ^= *(source1 + 4);
14765+ *(destp + 4) ^= *(source2 + 4);
14766+ *(destp + 4) ^= *(source3 + 4);
14767+ *(destp + 5) ^= *(source1 + 5);
14768+ *(destp + 5) ^= *(source2 + 5);
14769+ *(destp + 5) ^= *(source3 + 5);
14770+ *(destp + 6) ^= *(source1 + 6);
14771+ *(destp + 6) ^= *(source2 + 6);
14772+ *(destp + 6) ^= *(source3 + 6);
14773+ *(destp + 7) ^= *(source1 + 7);
14774+ *(destp + 7) ^= *(source2 + 7);
14775+ *(destp + 7) ^= *(source3 + 7);
14776+ source1 += 8;
14777+ source2 += 8;
14778+ source3 += 8;
14779+ destp += 8;
14780+ }
14781+ break;
14782+ case 5:
14783+ source4 = (long *) bh_ptr[4]->b_data;
14784+ source3 = (long *) bh_ptr[3]->b_data;
14785+ source2 = (long *) bh_ptr[2]->b_data;
14786+ source1 = (long *) bh_ptr[1]->b_data;
14787+ for (i = lines; i > 0; i--) {
14788+ *(destp + 0) ^= *(source1 + 0);
14789+ *(destp + 0) ^= *(source2 + 0);
14790+ *(destp + 0) ^= *(source3 + 0);
14791+ *(destp + 0) ^= *(source4 + 0);
14792+ *(destp + 1) ^= *(source1 + 1);
14793+ *(destp + 1) ^= *(source2 + 1);
14794+ *(destp + 1) ^= *(source3 + 1);
14795+ *(destp + 1) ^= *(source4 + 1);
14796+ *(destp + 2) ^= *(source1 + 2);
14797+ *(destp + 2) ^= *(source2 + 2);
14798+ *(destp + 2) ^= *(source3 + 2);
14799+ *(destp + 2) ^= *(source4 + 2);
14800+ *(destp + 3) ^= *(source1 + 3);
14801+ *(destp + 3) ^= *(source2 + 3);
14802+ *(destp + 3) ^= *(source3 + 3);
14803+ *(destp + 3) ^= *(source4 + 3);
14804+ *(destp + 4) ^= *(source1 + 4);
14805+ *(destp + 4) ^= *(source2 + 4);
14806+ *(destp + 4) ^= *(source3 + 4);
14807+ *(destp + 4) ^= *(source4 + 4);
14808+ *(destp + 5) ^= *(source1 + 5);
14809+ *(destp + 5) ^= *(source2 + 5);
14810+ *(destp + 5) ^= *(source3 + 5);
14811+ *(destp + 5) ^= *(source4 + 5);
14812+ *(destp + 6) ^= *(source1 + 6);
14813+ *(destp + 6) ^= *(source2 + 6);
14814+ *(destp + 6) ^= *(source3 + 6);
14815+ *(destp + 6) ^= *(source4 + 6);
14816+ *(destp + 7) ^= *(source1 + 7);
14817+ *(destp + 7) ^= *(source2 + 7);
14818+ *(destp + 7) ^= *(source3 + 7);
14819+ *(destp + 7) ^= *(source4 + 7);
14820+ source1 += 8;
14821+ source2 += 8;
14822+ source3 += 8;
14823+ source4 += 8;
14824+ destp += 8;
14825+ }
14826+ break;
14827+ }
14828+}
14829+
14830+/*
14831+ * platform-independent RAID5 checksum calculation; this should
14832+ * be very fast on any platform that has a decent number of
14833+ * registers (32 or more).
14834+ */
14835+XORBLOCK_TEMPLATE(32regs)
14836+{
14837+ int size = bh_ptr[0]->b_size;
14838+ int lines = size / (sizeof (long)) / 8, i;
14839+ long *destp = (long *) bh_ptr[0]->b_data;
14840+ long *source1, *source2, *source3, *source4;
14841+
14842+ /* LOTS of registers available...
14843+	   We do explicit loop-unrolling here for code which
14844+	   favours RISC machines. In fact this is almost direct
14845+ RISC assembly on Alpha and SPARC :-) */
14846+
14847+
14848+ switch(count) {
14849+ case 2:
14850+ source1 = (long *) bh_ptr[1]->b_data;
14851+ for (i = lines; i > 0; i--) {
14852+ register long d0, d1, d2, d3, d4, d5, d6, d7;
14853+ d0 = destp[0]; /* Pull the stuff into registers */
14854+ d1 = destp[1]; /* ... in bursts, if possible. */
14855+ d2 = destp[2];
14856+ d3 = destp[3];
14857+ d4 = destp[4];
14858+ d5 = destp[5];
14859+ d6 = destp[6];
14860+ d7 = destp[7];
14861+ d0 ^= source1[0];
14862+ d1 ^= source1[1];
14863+ d2 ^= source1[2];
14864+ d3 ^= source1[3];
14865+ d4 ^= source1[4];
14866+ d5 ^= source1[5];
14867+ d6 ^= source1[6];
14868+ d7 ^= source1[7];
14869+			destp[0] = d0;	/* Store the result (in bursts) */
14870+ destp[1] = d1;
14871+ destp[2] = d2;
14872+ destp[3] = d3;
14873+			destp[4] = d4;	/* Store the result (in bursts) */
14874+ destp[5] = d5;
14875+ destp[6] = d6;
14876+ destp[7] = d7;
14877+ source1 += 8;
14878+ destp += 8;
14879+ }
14880+ break;
14881+ case 3:
14882+ source2 = (long *) bh_ptr[2]->b_data;
14883+ source1 = (long *) bh_ptr[1]->b_data;
14884+ for (i = lines; i > 0; i--) {
14885+ register long d0, d1, d2, d3, d4, d5, d6, d7;
14886+ d0 = destp[0]; /* Pull the stuff into registers */
14887+ d1 = destp[1]; /* ... in bursts, if possible. */
14888+ d2 = destp[2];
14889+ d3 = destp[3];
14890+ d4 = destp[4];
14891+ d5 = destp[5];
14892+ d6 = destp[6];
14893+ d7 = destp[7];
14894+ d0 ^= source1[0];
14895+ d1 ^= source1[1];
14896+ d2 ^= source1[2];
14897+ d3 ^= source1[3];
14898+ d4 ^= source1[4];
14899+ d5 ^= source1[5];
14900+ d6 ^= source1[6];
14901+ d7 ^= source1[7];
14902+ d0 ^= source2[0];
14903+ d1 ^= source2[1];
14904+ d2 ^= source2[2];
14905+ d3 ^= source2[3];
14906+ d4 ^= source2[4];
14907+ d5 ^= source2[5];
14908+ d6 ^= source2[6];
14909+ d7 ^= source2[7];
14910+			destp[0] = d0;	/* Store the result (in bursts) */
14911+ destp[1] = d1;
14912+ destp[2] = d2;
14913+ destp[3] = d3;
14914+			destp[4] = d4;	/* Store the result (in bursts) */
14915+ destp[5] = d5;
14916+ destp[6] = d6;
14917+ destp[7] = d7;
14918+ source1 += 8;
14919+ source2 += 8;
14920+ destp += 8;
14921+ }
14922+ break;
14923+ case 4:
14924+ source3 = (long *) bh_ptr[3]->b_data;
14925+ source2 = (long *) bh_ptr[2]->b_data;
14926+ source1 = (long *) bh_ptr[1]->b_data;
14927+ for (i = lines; i > 0; i--) {
14928+ register long d0, d1, d2, d3, d4, d5, d6, d7;
14929+ d0 = destp[0]; /* Pull the stuff into registers */
14930+ d1 = destp[1]; /* ... in bursts, if possible. */
14931+ d2 = destp[2];
14932+ d3 = destp[3];
14933+ d4 = destp[4];
14934+ d5 = destp[5];
14935+ d6 = destp[6];
14936+ d7 = destp[7];
14937+ d0 ^= source1[0];
14938+ d1 ^= source1[1];
14939+ d2 ^= source1[2];
14940+ d3 ^= source1[3];
14941+ d4 ^= source1[4];
14942+ d5 ^= source1[5];
14943+ d6 ^= source1[6];
14944+ d7 ^= source1[7];
14945+ d0 ^= source2[0];
14946+ d1 ^= source2[1];
14947+ d2 ^= source2[2];
14948+ d3 ^= source2[3];
14949+ d4 ^= source2[4];
14950+ d5 ^= source2[5];
14951+ d6 ^= source2[6];
14952+ d7 ^= source2[7];
14953+ d0 ^= source3[0];
14954+ d1 ^= source3[1];
14955+ d2 ^= source3[2];
14956+ d3 ^= source3[3];
14957+ d4 ^= source3[4];
14958+ d5 ^= source3[5];
14959+ d6 ^= source3[6];
14960+ d7 ^= source3[7];
14961+			destp[0] = d0;	/* Store the result (in bursts) */
14962+ destp[1] = d1;
14963+ destp[2] = d2;
14964+ destp[3] = d3;
14965+			destp[4] = d4;	/* Store the result (in bursts) */
14966+ destp[5] = d5;
14967+ destp[6] = d6;
14968+ destp[7] = d7;
14969+ source1 += 8;
14970+ source2 += 8;
14971+ source3 += 8;
14972+ destp += 8;
14973+ }
14974+ break;
14975+ case 5:
14976+ source4 = (long *) bh_ptr[4]->b_data;
14977+ source3 = (long *) bh_ptr[3]->b_data;
14978+ source2 = (long *) bh_ptr[2]->b_data;
14979+ source1 = (long *) bh_ptr[1]->b_data;
14980+ for (i = lines; i > 0; i--) {
14981+ register long d0, d1, d2, d3, d4, d5, d6, d7;
14982+ d0 = destp[0]; /* Pull the stuff into registers */
14983+ d1 = destp[1]; /* ... in bursts, if possible. */
14984+ d2 = destp[2];
14985+ d3 = destp[3];
14986+ d4 = destp[4];
14987+ d5 = destp[5];
14988+ d6 = destp[6];
14989+ d7 = destp[7];
14990+ d0 ^= source1[0];
14991+ d1 ^= source1[1];
14992+ d2 ^= source1[2];
14993+ d3 ^= source1[3];
14994+ d4 ^= source1[4];
14995+ d5 ^= source1[5];
14996+ d6 ^= source1[6];
14997+ d7 ^= source1[7];
14998+ d0 ^= source2[0];
14999+ d1 ^= source2[1];
15000+ d2 ^= source2[2];
15001+ d3 ^= source2[3];
15002+ d4 ^= source2[4];
15003+ d5 ^= source2[5];
15004+ d6 ^= source2[6];
15005+ d7 ^= source2[7];
15006+ d0 ^= source3[0];
15007+ d1 ^= source3[1];
15008+ d2 ^= source3[2];
15009+ d3 ^= source3[3];
15010+ d4 ^= source3[4];
15011+ d5 ^= source3[5];
15012+ d6 ^= source3[6];
15013+ d7 ^= source3[7];
15014+ d0 ^= source4[0];
15015+ d1 ^= source4[1];
15016+ d2 ^= source4[2];
15017+ d3 ^= source4[3];
15018+ d4 ^= source4[4];
15019+ d5 ^= source4[5];
15020+ d6 ^= source4[6];
15021+ d7 ^= source4[7];
15022+			destp[0] = d0;	/* Store the result (in bursts) */
15023+ destp[1] = d1;
15024+ destp[2] = d2;
15025+ destp[3] = d3;
15026+			destp[4] = d4;	/* Store the result (in bursts) */
15027+ destp[5] = d5;
15028+ destp[6] = d6;
15029+ destp[7] = d7;
15030+ source1 += 8;
15031+ source2 += 8;
15032+ source3 += 8;
15033+ source4 += 8;
15034+ destp += 8;
15035+ }
15036+ break;
15037+ }
15038+}
15039+
15040+/*
15041+ * (the -6*32 shift factor colors the cache)
15042+ */
15043+#define SIZE (PAGE_SIZE-6*32)
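A quick worked number (assuming the common 4 KB PAGE_SIZE and 32-byte cache lines): SIZE = 4096 - 6*32 = 3904 bytes. Since b2 is later placed at b1 + 2*PAGE_SIZE + SIZE, the second trial buffer starts 6 cache lines before a page boundary while the first starts on one, so the two working sets are shifted relative to each other instead of landing on exactly the same cache sets -- which is what "colors the cache" refers to.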
15044+
15045+static void xor_speed ( struct xor_block_template * func,
15046+ struct buffer_head *b1, struct buffer_head *b2)
15047+{
15048+ int speed;
15049+ unsigned long now;
15050+ int i, count, max;
15051+ struct buffer_head *bh_ptr[6];
15052+
15053+ func->next = xor_functions;
15054+ xor_functions = func;
15055+ bh_ptr[0] = b1;
15056+ bh_ptr[1] = b2;
15057+
15058+ /*
15059+ * count the number of XORs done during a whole jiffy.
15060+ * calculate the speed of checksumming from this.
15061+	 * (we use an order-2, i.e. four-page, allocation to get a
15062+	 * guaranteed L1-cache color layout)
15063+ */
15064+ max = 0;
15065+ for (i = 0; i < 5; i++) {
15066+ now = jiffies;
15067+ count = 0;
15068+ while (jiffies == now) {
15069+ mb();
15070+ func->xor_block(2,bh_ptr);
15071+ mb();
15072+ count++;
15073+ mb();
15074+ }
15075+ if (count > max)
15076+ max = count;
15077+ }
15078+
15079+ speed = max * (HZ*SIZE/1024);
15080+ func->speed = speed;
15081+
15082+ printk( " %-10s: %5d.%03d MB/sec\n", func->name,
15083+ speed / 1000, speed % 1000);
15084+}
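A rough worked example of the bookkeeping above (assuming HZ=100 and SIZE=3904, i.e. a 4 KB page): if the best of the five trial jiffies completes max = 1000 calls, then speed = 1000 * (100*3904/1024) = 381000 and the printk reports "381.000 MB/sec". The scaling mixes a divide by 1024 with a decimal display, so the printed figure slightly understates the true throughput of roughly 390 million bytes per second; that is harmless here because the value is only used to rank the candidate routines against each other.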
15085+
15086+static inline void pick_fastest_function(void)
15087+{
15088+ struct xor_block_template *f, *fastest;
15089+
15090+ fastest = xor_functions;
15091+ for (f = fastest; f; f = f->next) {
15092+ if (f->speed > fastest->speed)
15093+ fastest = f;
15094+ }
15095+#ifdef CONFIG_X86_XMM
15096+ if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
15097+ fastest = &t_xor_block_pIII_kni;
15098+ }
15099+#endif
15100+ xor_block = fastest->xor_block;
15101+ printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
15102+ fastest->speed / 1000, fastest->speed % 1000);
15103+}
15104+
15105+
15106+void calibrate_xor_block(void)
15107+{
15108+ struct buffer_head b1, b2;
15109+
15110+ memset(&b1,0,sizeof(b1));
15111+ b2 = b1;
15112+
15113+ b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2);
15114+ if (!b1.b_data) {
15115+ pick_fastest_function();
15116+ return;
15117+ }
15118+ b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE;
15119+
15120+ b1.b_size = SIZE;
15121+
15122+ printk(KERN_INFO "raid5: measuring checksumming speed\n");
15123+
15124+ sti(); /* should be safe */
15125+
15126+#if defined(__sparc__) && !defined(__sparc_v9__)
15127+ printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n");
15128+ xor_speed(&t_xor_block_SPARC,&b1,&b2);
15129+#endif
15130+
15131+#ifdef CONFIG_X86_XMM
15132+ if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
15133+ printk(KERN_INFO
15134+ "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n");
15135+		/* we force the use of the KNI xor block because it
15136+		   can write around the L2 cache. we may also be able
15137+		   to load into the L1 cache only, depending on how
15138+		   the cpu deals with a load to a line that is
15139+		   being prefetched.
15140+ */
15141+ xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
15142+ }
15143+#endif /* CONFIG_X86_XMM */
15144+
15145+#ifdef __i386__
15146+
15147+ if (md_cpu_has_mmx()) {
15148+ printk(KERN_INFO
15149+ "raid5: MMX detected, trying high-speed MMX checksum routines\n");
15150+ xor_speed(&t_xor_block_pII_mmx,&b1,&b2);
15151+ xor_speed(&t_xor_block_p5_mmx,&b1,&b2);
15152+ }
15153+
15154+#endif /* __i386__ */
15155+
15156+
15157+ xor_speed(&t_xor_block_8regs,&b1,&b2);
15158+ xor_speed(&t_xor_block_32regs,&b1,&b2);
15159+
15160+ free_pages((unsigned long)b1.b_data,2);
15161+ pick_fastest_function();
15162+}
15163+
15164+#else /* __sparc_v9__ */
15165+
15166+void calibrate_xor_block(void)
15167+{
15168+ printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n");
15169+ xor_block = xor_block_VIS;
15170+}
15171+
15172+#endif /* __sparc_v9__ */
15173+
15174+MD_EXPORT_SYMBOL(xor_block);
15175+
15176--- linux/arch/i386/defconfig.orig Fri Nov 23 11:17:42 2001
15177+++ linux/arch/i386/defconfig Fri Nov 23 11:18:20 2001
15178@@ -97,7 +97,15 @@
15179 #
15180 # CONFIG_BLK_DEV_LOOP is not set
15181 # CONFIG_BLK_DEV_NBD is not set
15182-# CONFIG_BLK_DEV_MD is not set
15183+CONFIG_BLK_DEV_MD=y
15184+CONFIG_AUTODETECT_RAID=y
15185+CONFIG_MD_TRANSLUCENT=y
15186+CONFIG_MD_LINEAR=y
15187+CONFIG_MD_STRIPED=y
15188+CONFIG_MD_MIRRORING=y
15189+CONFIG_MD_RAID5=y
15190+CONFIG_MD_BOOT=y
15191+CONFIG_BLK_DEV_HSM=y
15192 # CONFIG_BLK_DEV_RAM is not set
15193 # CONFIG_BLK_DEV_XD is not set
15194 # CONFIG_BLK_DEV_DAC960 is not set
15195--- linux/arch/sparc/config.in.orig Fri Nov 23 11:17:42 2001
15196+++ linux/arch/sparc/config.in Fri Nov 23 11:18:20 2001
15197@@ -88,10 +88,16 @@
15198
15199 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
15200 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
15201+ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
15202 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
15203 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
15204 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
15205 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
15206+ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
15207+ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
15208+fi
15209+if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
15210+ bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
15211 fi
15212
15213 tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
15214--- linux/arch/sparc/defconfig.orig Fri Nov 23 11:17:42 2001
15215+++ linux/arch/sparc/defconfig Fri Nov 23 11:18:20 2001
15216@@ -89,10 +89,13 @@
15217 #
15218 CONFIG_BLK_DEV_FD=y
15219 CONFIG_BLK_DEV_MD=y
15220+# CONFIG_AUTODETECT_RAID is not set
15221 CONFIG_MD_LINEAR=m
15222 CONFIG_MD_STRIPED=m
15223 CONFIG_MD_MIRRORING=m
15224 CONFIG_MD_RAID5=m
15225+# CONFIG_MD_TRANSLUCENT is not set
15226+# CONFIG_MD_HSM is not set
15227 CONFIG_BLK_DEV_RAM=y
15228 CONFIG_BLK_DEV_RAM_SIZE=4096
15229 CONFIG_BLK_DEV_INITRD=y
15230--- linux/arch/sparc64/config.in.orig Fri Nov 23 11:17:42 2001
15231+++ linux/arch/sparc64/config.in Fri Nov 23 11:18:20 2001
15232@@ -103,10 +103,16 @@
15233
15234 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
15235 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
15236+ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
15237 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
15238 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
15239 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
15240 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
15241+ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
15242+ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
15243+fi
15244+if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
15245+ bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
15246 fi
15247
15248 tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
15249--- linux/arch/sparc64/defconfig.orig Fri Nov 23 11:17:42 2001
15250+++ linux/arch/sparc64/defconfig Fri Nov 23 11:18:20 2001
15251@@ -107,10 +107,13 @@
15252 #
15253 CONFIG_BLK_DEV_FD=y
15254 CONFIG_BLK_DEV_MD=y
15255+# CONFIG_AUTODETECT_RAID is not set
15256 CONFIG_MD_LINEAR=m
15257 CONFIG_MD_STRIPED=m
15258 CONFIG_MD_MIRRORING=m
15259 CONFIG_MD_RAID5=m
15260+# CONFIG_MD_TRANSLUCENT is not set
15261+# CONFIG_MD_HSM is not set
15262 CONFIG_BLK_DEV_RAM=y
15263 CONFIG_BLK_DEV_RAM_SIZE=4096
15264 CONFIG_BLK_DEV_INITRD=y
15265--- linux/Documentation/Configure.help.orig Fri Nov 23 11:17:44 2001
15266+++ linux/Documentation/Configure.help Fri Nov 23 11:18:20 2001
15267@@ -1054,6 +1054,13 @@
15268
15269 If unsure, say N.
15270
15271+Autodetect RAID partitions
15272+CONFIG_AUTODETECT_RAID
15273+  This feature lets the kernel detect RAID partitions at boot time.
15274+  An autodetected RAID partition is a normal partition whose partition
15275+  type is 0xfd. Say Y here if you want to boot from RAID devices or
15276+  want them to be started automatically at boot.
15277+
15278 Linear (append) mode
15279 CONFIG_MD_LINEAR
15280 If you say Y here, then your multiple devices driver will be able to
15281@@ -1132,6 +1139,21 @@
15282 Documentation/modules.txt.
15283
15284 If unsure, say Y.
15285+
15286+Translucent Block Device Support (EXPERIMENTAL)
15287+CONFIG_MD_TRANSLUCENT
15288+ DO NOT USE THIS STUFF YET!
15289+
15290+  Currently there is only a placeholder here, as the implementation
15291+  is not yet usable.
15292+
15293+Hierarchical Storage Management support (EXPERIMENTAL)
15294+CONFIG_MD_HSM
15295+ DO NOT USE THIS STUFF YET!
15296+
15297+  I have released this so that people can comment on the architecture,
15298+  but the user-space tools are still unusable, so there is not much
15299+  you can do with it yet.
15300
15301 Boot support (linear, striped)
15302 CONFIG_MD_BOOT