1--- linux/init/main.c.orig Tue Jan 16 13:30:09 2001
2+++ linux/init/main.c Tue Jan 16 13:42:03 2001
3@@ -19,6 +19,7 @@
4 #include <linux/utsname.h>
5 #include <linux/ioport.h>
6 #include <linux/init.h>
7+#include <linux/raid/md.h>
8 #include <linux/smp_lock.h>
9 #include <linux/blk.h>
10 #include <linux/hdreg.h>
11@@ -549,7 +550,7 @@
12 #ifdef CONFIG_BLK_DEV_FD
13 { "fd", 0x0200 },
14 #endif
15-#ifdef CONFIG_MD_BOOT
16+#if CONFIG_MD_BOOT || CONFIG_AUTODETECT_RAID
17 { "md", 0x0900 },
18 #endif
19 #ifdef CONFIG_BLK_DEV_XD
20@@ -1060,6 +1061,9 @@
21 #ifdef CONFIG_MD_BOOT
22 { "md=", md_setup},
23 #endif
24+#if CONFIG_BLK_DEV_MD
25+ { "raid=", raid_setup},
26+#endif
27 #ifdef CONFIG_ADBMOUSE
28 { "adb_buttons=", adb_mouse_setup },
29 #endif
30@@ -1637,6 +1641,9 @@
31 while (pid != wait(&i));
32 if (MAJOR(real_root_dev) != RAMDISK_MAJOR
33 || MINOR(real_root_dev) != 0) {
34+#ifdef CONFIG_BLK_DEV_MD
35+ autodetect_raid();
36+#endif
37 error = change_root(real_root_dev,"/initrd");
38 if (error)
39 printk(KERN_ERR "Change root to /initrd: "
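
Not part of the patch: the hunks above add "raid=" to the boot-option table in init/main.c and call autodetect_raid() before the initrd root change. The handler's signature, void raid_setup(char *str, int *ints), is the 2.2-era boot-option calling convention (it is also declared that way in raid/md.h below). The stand-alone sketch that follows only illustrates that convention; the handler name, the "noautodetect" keyword and the default are assumptions, not the md driver's code.

/* user-space sketch of a "raid=" style boot-option handler (illustrative) */
#include <stdio.h>
#include <string.h>

static int raid_autodetect = 1;          /* assumed default: autodetect enabled */

static void raid_setup_sketch(char *str, int *ints)
{
        (void)ints;                      /* "raid=" carries no numeric arguments */
        if (strcmp(str, "noautodetect") == 0)
                raid_autodetect = 0;
}

int main(void)
{
        raid_setup_sketch("noautodetect", NULL);
        printf("boot-time RAID autodetection: %s\n",
               raid_autodetect ? "on" : "off");
        return 0;
}
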
40--- linux/include/linux/raid/hsm.h.orig Tue Jan 16 13:42:03 2001
41+++ linux/include/linux/raid/hsm.h Tue Jan 16 13:42:03 2001
42@@ -0,0 +1,65 @@
43+#ifndef _HSM_H
44+#define _HSM_H
45+
46+#include <linux/raid/md.h>
47+
48+#if __alpha__
49+#error fix cpu_addr on Alpha first
50+#endif
51+
52+#include <linux/raid/hsm_p.h>
53+
54+#define index_pv(lv,index) ((lv)->vg->pv_array+(index)->data.phys_nr)
55+#define index_dev(lv,index) index_pv((lv),(index))->dev
56+#define index_block(lv,index) (index)->data.phys_block
57+#define index_child(index) ((lv_lptr_t *)((index)->cpu_addr))
58+
59+#define ptr_to_cpuaddr(ptr) ((__u32) (ptr))
60+
61+
62+typedef struct pv_bg_desc_s {
63+ unsigned int free_blocks;
64+ pv_block_group_t *bg;
65+} pv_bg_desc_t;
66+
67+typedef struct pv_s pv_t;
68+typedef struct vg_s vg_t;
69+typedef struct lv_s lv_t;
70+
71+struct pv_s
72+{
73+ int phys_nr;
74+ kdev_t dev;
75+ pv_sb_t *pv_sb;
76+ pv_bg_desc_t *bg_array;
77+};
78+
79+struct lv_s
80+{
81+ int log_id;
82+ vg_t *vg;
83+
84+ unsigned int max_indices;
85+ unsigned int free_indices;
86+ lv_lptr_t root_index;
87+
88+ kdev_t dev;
89+};
90+
91+struct vg_s
92+{
93+ int nr_pv;
94+ pv_t pv_array [MD_SB_DISKS];
95+
96+ int nr_lv;
97+ lv_t lv_array [HSM_MAX_LVS_PER_VG];
98+
99+ vg_sb_t *vg_sb;
100+ mddev_t *mddev;
101+};
102+
103+#define kdev_to_lv(dev) ((lv_t *) mddev_map[MINOR(dev)].data)
104+#define mddev_to_vg(mddev) ((vg_t *) mddev->private)
105+
106+#endif
107+
108--- linux/include/linux/raid/hsm_p.h.orig Tue Jan 16 13:42:03 2001
109+++ linux/include/linux/raid/hsm_p.h Tue Jan 16 13:42:03 2001
110@@ -0,0 +1,237 @@
111+#ifndef _HSM_P_H
112+#define _HSM_P_H
113+
114+#define HSM_BLOCKSIZE 4096
115+#define HSM_BLOCKSIZE_WORDS (HSM_BLOCKSIZE/4)
116+#define PACKED __attribute__ ((packed))
117+
118+/*
119+ * Identifies a block in physical space
120+ */
121+typedef struct phys_idx_s {
122+ __u16 phys_nr;
123+ __u32 phys_block;
124+
125+} PACKED phys_idx_t;
126+
127+/*
128+ * Identifies a block in logical space
129+ */
130+typedef struct log_idx_s {
131+ __u16 log_id;
132+ __u32 log_index;
133+
134+} PACKED log_idx_t;
135+
136+/*
137+ * Describes one PV
138+ */
139+#define HSM_PV_SB_MAGIC 0xf091ae9fU
140+
141+#define HSM_PV_SB_GENERIC_WORDS 32
142+#define HSM_PV_SB_RESERVED_WORDS \
143+ (HSM_BLOCKSIZE_WORDS - HSM_PV_SB_GENERIC_WORDS)
144+
145+/*
146+ * On-disk PV identification data, on block 0 in any PV.
147+ */
148+typedef struct pv_sb_s
149+{
150+ __u32 pv_magic; /* 0 */
151+
152+ __u32 pv_uuid0; /* 1 */
153+ __u32 pv_uuid1; /* 2 */
154+ __u32 pv_uuid2; /* 3 */
155+ __u32 pv_uuid3; /* 4 */
156+
157+ __u32 pv_major; /* 5 */
158+ __u32 pv_minor; /* 6 */
159+ __u32 pv_patch; /* 7 */
160+
161+ __u32 pv_ctime; /* 8 Creation time */
162+
163+ __u32 pv_total_size; /* 9 size of this PV, in blocks */
164+ __u32 pv_first_free; /* 10 first free block */
165+ __u32 pv_first_used; /* 11 first used block */
166+ __u32 pv_blocks_left; /* 12 unallocated blocks */
167+ __u32 pv_bg_size; /* 13 size of a block group, in blocks */
168+ __u32 pv_block_size; /* 14 size of blocks, in bytes */
169+ __u32 pv_pptr_size; /* 15 size of block descriptor, in bytes */
170+ __u32 pv_block_groups; /* 16 number of block groups */
171+
172+ __u32 __reserved1[HSM_PV_SB_GENERIC_WORDS - 17];
173+
174+ /*
175+ * Reserved
176+ */
177+ __u32 __reserved2[HSM_PV_SB_RESERVED_WORDS];
178+
179+} PACKED pv_sb_t;
180+
181+/*
182+ * this is pretty much arbitrary, but has to be less than ~64
183+ */
184+#define HSM_MAX_LVS_PER_VG 32
185+
186+#define HSM_VG_SB_GENERIC_WORDS 32
187+
188+#define LV_DESCRIPTOR_WORDS 8
189+#define HSM_VG_SB_RESERVED_WORDS (HSM_BLOCKSIZE_WORDS - \
190+ LV_DESCRIPTOR_WORDS*HSM_MAX_LVS_PER_VG - HSM_VG_SB_GENERIC_WORDS)
191+
192+#if (HSM_VG_SB_RESERVED_WORDS < 0)
193+#error you messed this one up dude ...
194+#endif
195+
196+typedef struct lv_descriptor_s
197+{
198+ __u32 lv_id; /* 0 */
199+ phys_idx_t lv_root_idx; /* 1 */
200+ __u16 __reserved; /* 2 */
201+ __u32 lv_max_indices; /* 3 */
202+ __u32 lv_free_indices; /* 4 */
203+ __u32 md_id; /* 5 */
204+
205+ __u32 reserved[LV_DESCRIPTOR_WORDS - 6];
206+
207+} PACKED lv_descriptor_t;
208+
209+#define HSM_VG_SB_MAGIC 0x98320d7aU
210+/*
211+ * On-disk VG identification data, in block 1 on all PVs
212+ */
213+typedef struct vg_sb_s
214+{
215+ __u32 vg_magic; /* 0 */
216+ __u32 nr_lvs; /* 1 */
217+
218+ __u32 __reserved1[HSM_VG_SB_GENERIC_WORDS - 2];
219+
220+ lv_descriptor_t lv_array [HSM_MAX_LVS_PER_VG];
221+ /*
222+ * Reserved
223+ */
224+ __u32 __reserved2[HSM_VG_SB_RESERVED_WORDS];
225+
226+} PACKED vg_sb_t;
227+
228+/*
229+ * Describes one LV
230+ */
231+
232+#define HSM_LV_SB_MAGIC 0xe182bd8aU
233+
234+/* do we need lv_sb_t? */
235+
236+typedef struct lv_sb_s
237+{
238+ /*
239+ * On-disk LV identifier
240+ */
241+ __u32 lv_magic; /* 0 LV identifier */
242+ __u32 lv_uuid0; /* 1 */
243+ __u32 lv_uuid1; /* 2 */
244+ __u32 lv_uuid2; /* 3 */
245+ __u32 lv_uuid3; /* 4 */
246+
247+ __u32 lv_major; /* 5 PV identifier */
248+ __u32 lv_minor; /* 6 PV identifier */
249+ __u32 lv_patch; /* 7 PV identifier */
250+
251+ __u32 ctime; /* 8 Creation time */
252+ __u32 size; /* 9 size of this LV, in blocks */
253+ phys_idx_t start; /* 10 position of root index block */
254+ log_idx_t first_free; /* 11-12 first free index */
255+
256+ /*
257+ * Reserved
258+ */
259+ __u32 reserved[HSM_BLOCKSIZE_WORDS-13];
260+
261+} PACKED lv_sb_t;
262+
263+/*
264+ * Pointer pointing from the physical space, points to
265+ * the LV owning this block. It also contains various
266+ * statistics about the physical block.
267+ */
268+typedef struct pv_pptr_s
269+{
270+ union {
271+ /* case 1 */
272+ struct {
273+ log_idx_t owner;
274+ log_idx_t predicted;
275+ __u32 last_referenced;
276+ } used;
277+ /* case 2 */
278+ struct {
279+ __u16 log_id;
280+ __u16 __unused1;
281+ __u32 next_free;
282+ __u32 __unused2;
283+ __u32 __unused3;
284+ } free;
285+ } u;
286+} PACKED pv_pptr_t;
287+
288+static __inline__ int pv_pptr_free (const pv_pptr_t * pptr)
289+{
290+ return !pptr->u.free.log_id;
291+}
292+
293+
294+#define DATA_BLOCKS_PER_BG ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1))
295+
296+#define TOTAL_BLOCKS_PER_BG (DATA_BLOCKS_PER_BG+1)
297+/*
298+ * A table of pointers filling up a single block, managing
299+ * the next DATA_BLOCKS_PER_BG physical blocks. Such block
300+ * groups form the physical space of blocks.
301+ */
302+typedef struct pv_block_group_s
303+{
304+ __u8 used_bitmap[(DATA_BLOCKS_PER_BG+7)/8];
305+
306+ pv_pptr_t blocks[DATA_BLOCKS_PER_BG];
307+
308+} PACKED pv_block_group_t;
309+
310+/*
311+ * Pointer from the logical space, points to
312+ * the (PV,block) containing this logical block
313+ */
314+typedef struct lv_lptr_s
315+{
316+ phys_idx_t data;
317+ __u16 __reserved;
318+ __u32 cpu_addr;
319+ __u32 __reserved2;
320+
321+} PACKED lv_lptr_t;
322+
323+static __inline__ int index_free (const lv_lptr_t * index)
324+{
325+ return !index->data.phys_block;
326+}
327+
328+static __inline__ int index_present (const lv_lptr_t * index)
329+{
330+ return index->cpu_addr;
331+}
332+
333+
334+#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t))
335+/*
336+ * A table of pointers filling up a single block, managing
337+ * HSM_LPTRS_PER_BLOCK logical blocks. Such block groups form
338+ * the logical space of blocks.
339+ */
340+typedef struct lv_index_block_s
341+{
342+ lv_lptr_t blocks[HSM_LPTRS_PER_BLOCK];
343+
344+} PACKED lv_index_block_t;
345+
346+#endif
347+
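
The packed on-disk types above make the block-group arithmetic come out exactly: both pv_pptr_t and lv_lptr_t are 16 bytes, so one 4096-byte block holds a 32-byte used bitmap plus 254 physical pointers, or 256 logical pointers. The stand-alone sketch below recomputes DATA_BLOCKS_PER_BG and HSM_LPTRS_PER_BLOCK with trimmed local copies of the structures (uint16_t/uint32_t stand in for the kernel __u16/__u32 types); it is a verification aid, not code from the patch.

/* Stand-alone check of the hsm_p.h block-group arithmetic (sketch). */
#include <stdio.h>
#include <stdint.h>

#define HSM_BLOCKSIZE 4096
#define PACKED __attribute__ ((packed))

typedef struct { uint16_t phys_nr; uint32_t phys_block; } PACKED phys_idx_t;
typedef struct { uint16_t log_id;  uint32_t log_index;  } PACKED log_idx_t;

typedef struct {
        union {
                struct { log_idx_t owner, predicted; uint32_t last_referenced; } used;
                struct { uint16_t log_id, unused1; uint32_t next_free, unused2, unused3; } free;
        } u;
} PACKED pv_pptr_t;

typedef struct {
        phys_idx_t data;
        uint16_t reserved;
        uint32_t cpu_addr, reserved2;
} PACKED lv_lptr_t;

#define DATA_BLOCKS_PER_BG  ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1))
#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t))

int main(void)
{
        size_t bitmap = (DATA_BLOCKS_PER_BG + 7) / 8;

        printf("sizeof(pv_pptr_t)   = %zu\n", sizeof(pv_pptr_t));            /* 16   */
        printf("DATA_BLOCKS_PER_BG  = %zu\n", (size_t)DATA_BLOCKS_PER_BG);   /* 254  */
        printf("block group bytes   = %zu\n",
               bitmap + DATA_BLOCKS_PER_BG * sizeof(pv_pptr_t));             /* 4096 */
        printf("HSM_LPTRS_PER_BLOCK = %zu\n", (size_t)HSM_LPTRS_PER_BLOCK);  /* 256  */
        return 0;
}
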
348--- linux/include/linux/raid/linear.h.orig Tue Jan 16 13:42:03 2001
349+++ linux/include/linux/raid/linear.h Tue Jan 16 13:44:37 2001
350@@ -0,0 +1,32 @@
351+#ifndef _LINEAR_H
352+#define _LINEAR_H
353+
354+#include <linux/raid/md.h>
355+
356+struct dev_info {
357+ kdev_t dev;
358+ int size;
359+ unsigned int offset;
360+};
361+
362+typedef struct dev_info dev_info_t;
363+
364+struct linear_hash
365+{
366+ dev_info_t *dev0, *dev1;
367+};
368+
369+struct linear_private_data
370+{
371+ struct linear_hash *hash_table;
372+ dev_info_t disks[MD_SB_DISKS];
373+ dev_info_t *smallest;
374+ int nr_zones;
375+};
376+
377+
378+typedef struct linear_private_data linear_conf_t;
379+
380+#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
381+
382+#endif
383--- linux/include/linux/raid/md.h.orig Tue Jan 16 13:42:03 2001
384+++ linux/include/linux/raid/md.h Tue Jan 16 13:47:20 2001
385@@ -0,0 +1,96 @@
386+/*
387+ md.h : Multiple Devices driver for Linux
388+ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
389+ Copyright (C) 1994-96 Marc ZYNGIER
390+ <zyngier@ufr-info-p7.ibp.fr> or
391+ <maz@gloups.fdn.fr>
392+
393+ This program is free software; you can redistribute it and/or modify
394+ it under the terms of the GNU General Public License as published by
395+ the Free Software Foundation; either version 2, or (at your option)
396+ any later version.
397+
398+ You should have received a copy of the GNU General Public License
399+ (for example /usr/src/linux/COPYING); if not, write to the Free
400+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
401+*/
402+
403+#ifndef _MD_H
404+#define _MD_H
405+
406+#include <linux/mm.h>
407+#include <linux/fs.h>
408+#include <linux/blkdev.h>
409+#include <asm/semaphore.h>
410+#include <linux/major.h>
411+#include <linux/ioctl.h>
412+#include <linux/types.h>
413+#include <asm/bitops.h>
414+#include <linux/module.h>
415+#include <linux/mm.h>
416+#include <linux/hdreg.h>
417+#include <linux/sysctl.h>
418+#include <linux/fs.h>
419+#include <linux/proc_fs.h>
420+#include <linux/smp_lock.h>
421+#include <linux/delay.h>
422+#include <net/checksum.h>
423+#include <linux/random.h>
424+#include <linux/locks.h>
425+#include <asm/io.h>
426+
427+#include <linux/raid/md_compatible.h>
428+/*
429+ * 'md_p.h' holds the 'physical' layout of RAID devices
430+ * 'md_u.h' holds the user <=> kernel API
431+ *
432+ * 'md_k.h' holds kernel internal definitions
433+ */
434+
435+#include <linux/raid/md_p.h>
436+#include <linux/raid/md_u.h>
437+#include <linux/raid/md_k.h>
438+
439+/*
440+ * Different major versions are not compatible.
441+ * Different minor versions are only downward compatible.
442+ * Different patchlevel versions are downward and upward compatible.
443+ */
444+#define MD_MAJOR_VERSION 0
445+#define MD_MINOR_VERSION 90
446+#define MD_PATCHLEVEL_VERSION 0
447+
448+extern int md_size[MAX_MD_DEVS];
449+extern struct hd_struct md_hd_struct[MAX_MD_DEVS];
450+
451+extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
452+extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev);
453+extern char * partition_name (kdev_t dev);
454+extern int register_md_personality (int p_num, mdk_personality_t *p);
455+extern int unregister_md_personality (int p_num);
456+extern mdk_thread_t * md_register_thread (void (*run) (void *data),
457+ void *data, const char *name);
458+extern void md_unregister_thread (mdk_thread_t *thread);
459+extern void md_wakeup_thread(mdk_thread_t *thread);
460+extern void md_interrupt_thread (mdk_thread_t *thread);
461+extern int md_update_sb (mddev_t *mddev);
462+extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
463+extern void md_recover_arrays (void);
464+extern int md_check_ordering (mddev_t *mddev);
465+extern void autodetect_raid(void);
466+extern struct gendisk * find_gendisk (kdev_t dev);
467+extern int md_notify_reboot(struct notifier_block *this,
468+ unsigned long code, void *x);
 469+#if CONFIG_BLK_DEV_MD
 470+extern void raid_setup(char *str,int *ints) md__init;
 471+#endif
472+#ifdef CONFIG_MD_BOOT
473+extern void md_setup(char *str,int *ints) md__init;
474+#endif
475+
476+extern void md_print_devices (void);
477+
478+#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
479+
 480+#endif /* _MD_H */
481+
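
md.h states the superblock compatibility policy: a different major version is incompatible, a different minor version is only downward compatible, and the patchlevel never matters. One reading of that rule, expressed as a stand-alone check (illustrative only, not a routine from the patch):

/* Illustrative only: the compatibility rule from md.h, expressed as a check. */
#include <stdio.h>

#define MD_MAJOR_VERSION      0
#define MD_MINOR_VERSION     90

/* A superblock is usable iff its major matches and its minor is not newer. */
static int sb_version_supported(unsigned sb_major, unsigned sb_minor)
{
        if (sb_major != MD_MAJOR_VERSION)
                return 0;               /* different major: incompatible       */
        if (sb_minor > MD_MINOR_VERSION)
                return 0;               /* newer minor: only downward compat   */
        return 1;                       /* patchlevel differences always fine  */
}

int main(void)
{
        printf("0.90 superblock: %s\n", sb_version_supported(0, 90) ? "ok" : "reject");
        printf("0.91 superblock: %s\n", sb_version_supported(0, 91) ? "ok" : "reject");
        printf("1.0  superblock: %s\n", sb_version_supported(1,  0) ? "ok" : "reject");
        return 0;
}
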
482--- linux/include/linux/raid/md_compatible.h.orig Tue Jan 16 13:42:03 2001
483+++ linux/include/linux/raid/md_compatible.h Tue Jan 16 13:47:19 2001
484@@ -0,0 +1,387 @@
485+
486+/*
487+ md.h : Multiple Devices driver compatibility layer for Linux 2.0/2.2
488+ Copyright (C) 1998 Ingo Molnar
489+
490+ This program is free software; you can redistribute it and/or modify
491+ it under the terms of the GNU General Public License as published by
492+ the Free Software Foundation; either version 2, or (at your option)
493+ any later version.
494+
495+ You should have received a copy of the GNU General Public License
496+ (for example /usr/src/linux/COPYING); if not, write to the Free
497+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
498+*/
499+
500+#include <linux/version.h>
501+
502+#ifndef _MD_COMPATIBLE_H
503+#define _MD_COMPATIBLE_H
504+
505+#define LinuxVersionCode(v, p, s) (((v)<<16)+((p)<<8)+(s))
506+
507+#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
508+
509+/* 000 */
510+#define md__get_free_pages(x,y) __get_free_pages(x,y,GFP_KERNEL)
511+
512+#ifdef __i386__
513+/* 001 */
514+extern __inline__ int md_cpu_has_mmx(void)
515+{
516+ return x86_capability & 0x00800000;
517+}
518+#endif
519+
520+/* 002 */
521+#define md_clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
522+
523+/* 003 */
524+/*
525+ * someone please suggest a sane compatibility layer for modules
526+ */
527+#define MD_EXPORT_SYMBOL(x)
528+
529+/* 004 */
530+static inline unsigned long
531+md_copy_from_user(void *to, const void *from, unsigned long n)
532+{
533+ int err;
534+
535+ err = verify_area(VERIFY_READ,from,n);
536+ if (!err)
537+ memcpy_fromfs(to, from, n);
538+ return err;
539+}
540+
541+/* 005 */
542+extern inline unsigned long
543+md_copy_to_user(void *to, const void *from, unsigned long n)
544+{
545+ int err;
546+
547+ err = verify_area(VERIFY_WRITE,to,n);
548+ if (!err)
549+ memcpy_tofs(to, from, n);
550+ return err;
551+}
552+
553+/* 006 */
554+#define md_put_user(x,ptr) \
555+({ \
556+ int __err; \
557+ \
558+ __err = verify_area(VERIFY_WRITE,ptr,sizeof(*ptr)); \
559+ if (!__err) \
560+ put_user(x,ptr); \
561+ __err; \
562+})
563+
564+/* 007 */
565+extern inline int md_capable_admin(void)
566+{
567+ return suser();
568+}
569+
570+/* 008 */
571+#define MD_FILE_TO_INODE(file) ((file)->f_inode)
572+
573+/* 009 */
574+extern inline void md_flush_signals (void)
575+{
576+ current->signal = 0;
577+}
578+
579+/* 010 */
580+#define __S(nr) (1<<((nr)-1))
581+extern inline void md_init_signals (void)
582+{
583+ current->exit_signal = SIGCHLD;
584+ current->blocked = ~(__S(SIGKILL));
585+}
586+#undef __S
587+
588+/* 011 */
589+extern inline unsigned long md_signal_pending (struct task_struct * tsk)
590+{
591+ return (tsk->signal & ~tsk->blocked);
592+}
593+
594+/* 012 */
595+#define md_set_global_readahead(x) read_ahead[MD_MAJOR] = MD_READAHEAD
596+
597+/* 013 */
598+#define md_mdelay(n) (\
599+ {unsigned long msec=(n); while (msec--) udelay(1000);})
600+
601+/* 014 */
602+#define MD_SYS_DOWN 0
603+#define MD_SYS_HALT 0
604+#define MD_SYS_POWER_OFF 0
605+
606+/* 015 */
607+#define md_register_reboot_notifier(x)
608+
609+/* 016 */
610+extern __inline__ unsigned long
611+md_test_and_set_bit(int nr, void * addr)
612+{
613+ unsigned long flags;
614+ unsigned long oldbit;
615+
616+ save_flags(flags);
617+ cli();
618+ oldbit = test_bit(nr,addr);
619+ set_bit(nr,addr);
620+ restore_flags(flags);
621+ return oldbit;
622+}
623+
624+/* 017 */
625+extern __inline__ unsigned long
626+md_test_and_clear_bit(int nr, void * addr)
627+{
628+ unsigned long flags;
629+ unsigned long oldbit;
630+
631+ save_flags(flags);
632+ cli();
633+ oldbit = test_bit(nr,addr);
634+ clear_bit(nr,addr);
635+ restore_flags(flags);
636+ return oldbit;
637+}
638+
639+/* 018 */
640+#define md_atomic_read(x) (*(volatile int *)(x))
641+#define md_atomic_set(x,y) (*(volatile int *)(x) = (y))
642+
643+/* 019 */
644+extern __inline__ void md_lock_kernel (void)
645+{
646+#if __SMP__
647+ lock_kernel();
648+ syscall_count++;
649+#endif
650+}
651+
652+extern __inline__ void md_unlock_kernel (void)
653+{
654+#if __SMP__
655+ syscall_count--;
656+ unlock_kernel();
657+#endif
658+}
659+/* 020 */
660+
661+#define md__init
662+#define md__initdata
663+#define md__initfunc(__arginit) __arginit
664+
665+/* 021 */
666+
667+/* 022 */
668+
669+struct md_list_head {
670+ struct md_list_head *next, *prev;
671+};
672+
673+#define MD_LIST_HEAD(name) \
674+ struct md_list_head name = { &name, &name }
675+
676+#define MD_INIT_LIST_HEAD(ptr) do { \
677+ (ptr)->next = (ptr); (ptr)->prev = (ptr); \
678+} while (0)
679+
680+static __inline__ void md__list_add(struct md_list_head * new,
681+ struct md_list_head * prev,
682+ struct md_list_head * next)
683+{
684+ next->prev = new;
685+ new->next = next;
686+ new->prev = prev;
687+ prev->next = new;
688+}
689+
690+static __inline__ void md_list_add(struct md_list_head *new,
691+ struct md_list_head *head)
692+{
693+ md__list_add(new, head, head->next);
694+}
695+
696+static __inline__ void md__list_del(struct md_list_head * prev,
697+ struct md_list_head * next)
698+{
699+ next->prev = prev;
700+ prev->next = next;
701+}
702+
703+static __inline__ void md_list_del(struct md_list_head *entry)
704+{
705+ md__list_del(entry->prev, entry->next);
706+}
707+
708+static __inline__ int md_list_empty(struct md_list_head *head)
709+{
710+ return head->next == head;
711+}
712+
713+#define md_list_entry(ptr, type, member) \
714+ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
715+
716+/* 023 */
717+
718+static __inline__ signed long md_schedule_timeout(signed long timeout)
719+{
720+ current->timeout = jiffies + timeout;
721+ schedule();
722+ return 0;
723+}
724+
725+/* 024 */
726+#define md_need_resched(tsk) (need_resched)
727+
728+/* 025 */
729+typedef struct { int gcc_is_buggy; } md_spinlock_t;
730+#define MD_SPIN_LOCK_UNLOCKED (md_spinlock_t) { 0 }
731+
732+#define md_spin_lock_irq cli
733+#define md_spin_unlock_irq sti
734+#define md_spin_unlock_irqrestore(x,flags) restore_flags(flags)
735+#define md_spin_lock_irqsave(x,flags) do { save_flags(flags); cli(); } while (0)
736+
737+/* END */
738+
739+#else
740+
741+#include <linux/reboot.h>
742+#include <linux/vmalloc.h>
743+
744+/* 000 */
745+#define md__get_free_pages(x,y) __get_free_pages(x,y)
746+
747+#ifdef __i386__
748+/* 001 */
749+extern __inline__ int md_cpu_has_mmx(void)
750+{
751+ return boot_cpu_data.x86_capability & X86_FEATURE_MMX;
752+}
753+#endif
754+
755+/* 002 */
756+#define md_clear_page(page) clear_page(page)
757+
758+/* 003 */
759+#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x)
760+
761+/* 004 */
762+#define md_copy_to_user(x,y,z) copy_to_user(x,y,z)
763+
764+/* 005 */
765+#define md_copy_from_user(x,y,z) copy_from_user(x,y,z)
766+
767+/* 006 */
768+#define md_put_user put_user
769+
770+/* 007 */
771+extern inline int md_capable_admin(void)
772+{
773+ return capable(CAP_SYS_ADMIN);
774+}
775+
776+/* 008 */
777+#define MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode)
778+
779+/* 009 */
780+extern inline void md_flush_signals (void)
781+{
782+ spin_lock(&current->sigmask_lock);
783+ flush_signals(current);
784+ spin_unlock(&current->sigmask_lock);
785+}
786+
787+/* 010 */
788+extern inline void md_init_signals (void)
789+{
790+ current->exit_signal = SIGCHLD;
791+ siginitsetinv(&current->blocked, sigmask(SIGKILL));
792+}
793+
794+/* 011 */
795+#define md_signal_pending signal_pending
796+
797+/* 012 */
798+extern inline void md_set_global_readahead(int * table)
799+{
800+ max_readahead[MD_MAJOR] = table;
801+}
802+
803+/* 013 */
804+#define md_mdelay(x) mdelay(x)
805+
806+/* 014 */
807+#define MD_SYS_DOWN SYS_DOWN
808+#define MD_SYS_HALT SYS_HALT
809+#define MD_SYS_POWER_OFF SYS_POWER_OFF
810+
811+/* 015 */
812+#define md_register_reboot_notifier register_reboot_notifier
813+
814+/* 016 */
815+#define md_test_and_set_bit test_and_set_bit
816+
817+/* 017 */
818+#define md_test_and_clear_bit test_and_clear_bit
819+
820+/* 018 */
821+#define md_atomic_read atomic_read
822+#define md_atomic_set atomic_set
823+
824+/* 019 */
825+#define md_lock_kernel lock_kernel
826+#define md_unlock_kernel unlock_kernel
827+
828+/* 020 */
829+
830+#include <linux/init.h>
831+
832+#define md__init __init
833+#define md__initdata __initdata
834+#define md__initfunc(__arginit) __initfunc(__arginit)
835+
836+/* 021 */
837+
838+
839+/* 022 */
840+
841+#define md_list_head list_head
842+#define MD_LIST_HEAD(name) LIST_HEAD(name)
843+#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr)
844+#define md_list_add list_add
845+#define md_list_del list_del
846+#define md_list_empty list_empty
847+
848+#define md_list_entry(ptr, type, member) list_entry(ptr, type, member)
849+
850+/* 023 */
851+
852+#define md_schedule_timeout schedule_timeout
853+
854+/* 024 */
855+#define md_need_resched(tsk) ((tsk)->need_resched)
856+
857+/* 025 */
858+#define md_spinlock_t spinlock_t
859+#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED
860+
861+#define md_spin_lock_irq spin_lock_irq
862+#define md_spin_unlock_irq spin_unlock_irq
863+#define md_spin_unlock_irqrestore spin_unlock_irqrestore
864+#define md_spin_lock_irqsave spin_lock_irqsave
865+
866+/* END */
867+
868+#endif
869+
 870+#endif /* _MD_COMPATIBLE_H */
871+
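
On 2.0 kernels the compatibility layer above carries its own copy of the circular doubly-linked list; on 2.2 the md_list_* names simply alias list_head and friends. The user-space sketch below exercises the 2.0-style primitives, re-declared locally and trimmed to what the example needs; struct rdev here is a stand-in for mdk_rdev_t, not the kernel structure.

/* User-space sketch of the md_list_head primitives from the 2.0 branch. */
#include <stdio.h>

struct md_list_head { struct md_list_head *next, *prev; };

#define MD_LIST_HEAD(name) struct md_list_head name = { &name, &name }

#define md_list_entry(ptr, type, member) \
        ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))

static void md_list_add(struct md_list_head *new, struct md_list_head *head)
{
        new->next = head->next;
        new->prev = head;
        head->next->prev = new;
        head->next = new;
}

struct rdev { int minor; struct md_list_head same_set; };

MD_LIST_HEAD(disks);

int main(void)
{
        struct rdev a = { 0 }, b = { 1 };
        struct md_list_head *p;

        md_list_add(&a.same_set, &disks);
        md_list_add(&b.same_set, &disks);

        /* walk the ring the way ITERATE_RDEV_GENERIC does, just more plainly */
        for (p = disks.next; p != &disks; p = p->next)
                printf("rdev minor %d\n",
                       md_list_entry(p, struct rdev, same_set)->minor);
        return 0;
}
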
872--- linux/include/linux/raid/md_k.h.orig Tue Jan 16 13:42:03 2001
873+++ linux/include/linux/raid/md_k.h Tue Jan 16 13:42:03 2001
874@@ -0,0 +1,338 @@
875+/*
876+ md_k.h : kernel internal structure of the Linux MD driver
877+ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
878+
879+ This program is free software; you can redistribute it and/or modify
880+ it under the terms of the GNU General Public License as published by
881+ the Free Software Foundation; either version 2, or (at your option)
882+ any later version.
883+
884+ You should have received a copy of the GNU General Public License
885+ (for example /usr/src/linux/COPYING); if not, write to the Free
886+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
887+*/
888+
889+#ifndef _MD_K_H
890+#define _MD_K_H
891+
892+#define MD_RESERVED 0UL
893+#define LINEAR 1UL
894+#define STRIPED 2UL
895+#define RAID0 STRIPED
896+#define RAID1 3UL
897+#define RAID5 4UL
898+#define TRANSLUCENT 5UL
899+#define HSM 6UL
900+#define MAX_PERSONALITY 7UL
901+
902+extern inline int pers_to_level (int pers)
903+{
904+ switch (pers) {
905+ case HSM: return -3;
906+ case TRANSLUCENT: return -2;
907+ case LINEAR: return -1;
908+ case RAID0: return 0;
909+ case RAID1: return 1;
910+ case RAID5: return 5;
911+ }
912+ panic("pers_to_level()");
913+}
914+
915+extern inline int level_to_pers (int level)
916+{
917+ switch (level) {
918+ case -3: return HSM;
919+ case -2: return TRANSLUCENT;
920+ case -1: return LINEAR;
921+ case 0: return RAID0;
922+ case 1: return RAID1;
923+ case 4:
924+ case 5: return RAID5;
925+ }
926+ return MD_RESERVED;
927+}
928+
929+typedef struct mddev_s mddev_t;
930+typedef struct mdk_rdev_s mdk_rdev_t;
931+
932+#if (MINORBITS != 8)
 933+#error MD does not handle bigger kdev yet
934+#endif
935+
936+#define MAX_REAL 12 /* Max number of disks per md dev */
937+#define MAX_MD_DEVS (1<<MINORBITS) /* Max number of md dev */
938+
939+/*
940+ * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
941+ * the personality. (eg. HSM uses this to identify individual LVs)
942+ */
943+typedef struct dev_mapping_s {
944+ mddev_t *mddev;
945+ void *data;
946+} dev_mapping_t;
947+
948+extern dev_mapping_t mddev_map [MAX_MD_DEVS];
949+
950+extern inline mddev_t * kdev_to_mddev (kdev_t dev)
951+{
952+ return mddev_map[MINOR(dev)].mddev;
953+}
954+
955+/*
956+ * options passed in raidrun:
957+ */
958+
959+#define MAX_CHUNK_SIZE (4096*1024)
960+
961+/*
962+ * default readahead
963+ */
964+#define MD_READAHEAD (256 * 512)
965+
966+extern inline int disk_faulty(mdp_disk_t * d)
967+{
968+ return d->state & (1 << MD_DISK_FAULTY);
969+}
970+
971+extern inline int disk_active(mdp_disk_t * d)
972+{
973+ return d->state & (1 << MD_DISK_ACTIVE);
974+}
975+
976+extern inline int disk_sync(mdp_disk_t * d)
977+{
978+ return d->state & (1 << MD_DISK_SYNC);
979+}
980+
981+extern inline int disk_spare(mdp_disk_t * d)
982+{
983+ return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
984+}
985+
986+extern inline int disk_removed(mdp_disk_t * d)
987+{
988+ return d->state & (1 << MD_DISK_REMOVED);
989+}
990+
991+extern inline void mark_disk_faulty(mdp_disk_t * d)
992+{
993+ d->state |= (1 << MD_DISK_FAULTY);
994+}
995+
996+extern inline void mark_disk_active(mdp_disk_t * d)
997+{
998+ d->state |= (1 << MD_DISK_ACTIVE);
999+}
1000+
1001+extern inline void mark_disk_sync(mdp_disk_t * d)
1002+{
1003+ d->state |= (1 << MD_DISK_SYNC);
1004+}
1005+
1006+extern inline void mark_disk_spare(mdp_disk_t * d)
1007+{
1008+ d->state = 0;
1009+}
1010+
1011+extern inline void mark_disk_removed(mdp_disk_t * d)
1012+{
1013+ d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
1014+}
1015+
1016+extern inline void mark_disk_inactive(mdp_disk_t * d)
1017+{
1018+ d->state &= ~(1 << MD_DISK_ACTIVE);
1019+}
1020+
1021+extern inline void mark_disk_nonsync(mdp_disk_t * d)
1022+{
1023+ d->state &= ~(1 << MD_DISK_SYNC);
1024+}
1025+
1026+/*
1027+ * MD's 'extended' device
1028+ */
1029+struct mdk_rdev_s
1030+{
1031+ struct md_list_head same_set; /* RAID devices within the same set */
1032+ struct md_list_head all; /* all RAID devices */
1033+ struct md_list_head pending; /* undetected RAID devices */
1034+
1035+ kdev_t dev; /* Device number */
1036+ kdev_t old_dev; /* "" when it was last imported */
1037+ int size; /* Device size (in blocks) */
1038+ mddev_t *mddev; /* RAID array if running */
1039+ unsigned long last_events; /* IO event timestamp */
1040+
1041+ struct inode *inode; /* Lock inode */
1042+ struct file filp; /* Lock file */
1043+
1044+ mdp_super_t *sb;
1045+ int sb_offset;
1046+
1047+ int faulty; /* if faulty do not issue IO requests */
1048+ int desc_nr; /* descriptor index in the superblock */
1049+};
1050+
1051+
1052+/*
1053+ * disk operations in a working array:
1054+ */
1055+#define DISKOP_SPARE_INACTIVE 0
1056+#define DISKOP_SPARE_WRITE 1
1057+#define DISKOP_SPARE_ACTIVE 2
1058+#define DISKOP_HOT_REMOVE_DISK 3
1059+#define DISKOP_HOT_ADD_DISK 4
1060+
1061+typedef struct mdk_personality_s mdk_personality_t;
1062+
1063+struct mddev_s
1064+{
1065+ void *private;
1066+ mdk_personality_t *pers;
1067+ int __minor;
1068+ mdp_super_t *sb;
1069+ int nb_dev;
1070+ struct md_list_head disks;
1071+ int sb_dirty;
1072+ mdu_param_t param;
1073+ int ro;
1074+ unsigned int curr_resync;
1075+ unsigned long resync_start;
1076+ char *name;
1077+ int recovery_running;
1078+ struct semaphore reconfig_sem;
1079+ struct semaphore recovery_sem;
1080+ struct semaphore resync_sem;
1081+ struct md_list_head all_mddevs;
1082+};
1083+
1084+struct mdk_personality_s
1085+{
1086+ char *name;
1087+ int (*map)(mddev_t *mddev, kdev_t dev, kdev_t *rdev,
1088+ unsigned long *rsector, unsigned long size);
1089+ int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh);
1090+ void (*end_request)(struct buffer_head * bh, int uptodate);
1091+ int (*run)(mddev_t *mddev);
1092+ int (*stop)(mddev_t *mddev);
1093+ int (*status)(char *page, mddev_t *mddev);
1094+ int (*ioctl)(struct inode *inode, struct file *file,
1095+ unsigned int cmd, unsigned long arg);
1096+ int max_invalid_dev;
1097+ int (*error_handler)(mddev_t *mddev, kdev_t dev);
1098+
1099+/*
1100+ * Some personalities (RAID-1, RAID-5) can have disks hot-added and
1101+ * hot-removed. Hot removal is different from failure. (failure marks
1102+ * a disk inactive, but the disk is still part of the array) The interface
1103+ * to such operations is the 'pers->diskop()' function, can be NULL.
1104+ *
1105+ * the diskop function can change the pointer pointing to the incoming
1106+ * descriptor, but must do so very carefully. (currently only
1107+ * SPARE_ACTIVE expects such a change)
1108+ */
1109+ int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
1110+
1111+ int (*stop_resync)(mddev_t *mddev);
1112+ int (*restart_resync)(mddev_t *mddev);
1113+};
1114+
1115+
1116+/*
1117+ * Currently we index md_array directly, based on the minor
1118+ * number. This will have to change to dynamic allocation
1119+ * once we start supporting partitioning of md devices.
1120+ */
1121+extern inline int mdidx (mddev_t * mddev)
1122+{
1123+ return mddev->__minor;
1124+}
1125+
1126+extern inline kdev_t mddev_to_kdev(mddev_t * mddev)
1127+{
1128+ return MKDEV(MD_MAJOR, mdidx(mddev));
1129+}
1130+
1131+extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev);
1132+extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
1133+
1134+/*
1135+ * iterates through some rdev ringlist. It's safe to remove the
1136+ * current 'rdev'. Don't touch 'tmp' though.
1137+ */
1138+#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \
1139+ \
1140+ for (tmp = head.next; \
1141+ rdev = md_list_entry(tmp, mdk_rdev_t, field), \
1142+ tmp = tmp->next, tmp->prev != &head \
1143+ ; )
1144+/*
1145+ * iterates through the 'same array disks' ringlist
1146+ */
1147+#define ITERATE_RDEV(mddev,rdev,tmp) \
1148+ ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
1149+
1150+/*
1151+ * Same as above, but assumes that the device has rdev->desc_nr numbered
1152+ * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order.
1153+ */
1154+#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \
1155+ for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
1156+
1157+
1158+/*
1159+ * Iterates through all 'RAID managed disks'
1160+ */
1161+#define ITERATE_RDEV_ALL(rdev,tmp) \
1162+ ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
1163+
1164+/*
1165+ * Iterates through 'pending RAID disks'
1166+ */
1167+#define ITERATE_RDEV_PENDING(rdev,tmp) \
1168+ ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
1169+
1170+/*
1171+ * iterates through all used mddevs in the system.
1172+ */
1173+#define ITERATE_MDDEV(mddev,tmp) \
1174+ \
1175+ for (tmp = all_mddevs.next; \
1176+ mddev = md_list_entry(tmp, mddev_t, all_mddevs), \
1177+ tmp = tmp->next, tmp->prev != &all_mddevs \
1178+ ; )
1179+
1180+extern inline int lock_mddev (mddev_t * mddev)
1181+{
1182+ return down_interruptible(&mddev->reconfig_sem);
1183+}
1184+
1185+extern inline void unlock_mddev (mddev_t * mddev)
1186+{
1187+ up(&mddev->reconfig_sem);
1188+}
1189+
1190+#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
1191+ x = y; y = __tmp; } while (0)
1192+
1193+typedef struct mdk_thread_s {
1194+ void (*run) (void *data);
1195+ void *data;
1196+ struct wait_queue *wqueue;
1197+ unsigned long flags;
1198+ struct semaphore *sem;
1199+ struct task_struct *tsk;
1200+ const char *name;
1201+} mdk_thread_t;
1202+
1203+#define THREAD_WAKEUP 0
1204+
1205+typedef struct dev_name_s {
1206+ struct md_list_head list;
1207+ kdev_t dev;
1208+ char name [MAX_DISKNAME_LEN];
1209+} dev_name_t;
1210+
1211+#endif /* _MD_K_H */
1212+
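
md_k.h keeps two small translation routines between the on-disk RAID "level" number and the internal personality index; the negative levels are how LINEAR, TRANSLUCENT and HSM are encoded in the superblock, and level 4 is folded into the RAID5 personality. A stand-alone copy of the two switches, round-tripping every personality (where the kernel would panic on an unknown personality, the sketch just returns MD_RESERVED):

/* Stand-alone copy of the level <-> personality mapping from md_k.h. */
#include <stdio.h>

#define MD_RESERVED  0
#define LINEAR       1
#define RAID0        2
#define RAID1        3
#define RAID5        4
#define TRANSLUCENT  5
#define HSM          6

static int pers_to_level(int pers)
{
        switch (pers) {
        case HSM:         return -3;
        case TRANSLUCENT: return -2;
        case LINEAR:      return -1;
        case RAID0:       return 0;
        case RAID1:       return 1;
        case RAID5:       return 5;
        }
        return MD_RESERVED;   /* the kernel panics here; a sketch just gives up */
}

static int level_to_pers(int level)
{
        switch (level) {
        case -3: return HSM;
        case -2: return TRANSLUCENT;
        case -1: return LINEAR;
        case  0: return RAID0;
        case  1: return RAID1;
        case  4:
        case  5: return RAID5;
        }
        return MD_RESERVED;
}

int main(void)
{
        int pers;

        for (pers = LINEAR; pers <= HSM; pers++)
                printf("personality %d <-> level %d\n", pers, pers_to_level(pers));
        printf("level 4 maps to personality %d (treated as RAID5)\n",
               level_to_pers(4));
        return 0;
}
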
1213--- linux/include/linux/raid/md_p.h.orig Tue Jan 16 13:42:03 2001
1214+++ linux/include/linux/raid/md_p.h Tue Jan 16 13:42:03 2001
1215@@ -0,0 +1,161 @@
1216+/*
1217+ md_p.h : physical layout of Linux RAID devices
1218+ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
1219+
1220+ This program is free software; you can redistribute it and/or modify
1221+ it under the terms of the GNU General Public License as published by
1222+ the Free Software Foundation; either version 2, or (at your option)
1223+ any later version.
1224+
1225+ You should have received a copy of the GNU General Public License
1226+ (for example /usr/src/linux/COPYING); if not, write to the Free
1227+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1228+*/
1229+
1230+#ifndef _MD_P_H
1231+#define _MD_P_H
1232+
1233+/*
1234+ * RAID superblock.
1235+ *
1236+ * The RAID superblock maintains some statistics on each RAID configuration.
1237+ * Each real device in the RAID set contains it near the end of the device.
1238+ * Some of the ideas are copied from the ext2fs implementation.
1239+ *
1240+ * We currently use 4096 bytes as follows:
1241+ *
1242+ * word offset function
1243+ *
1244+ * 0 - 31 Constant generic RAID device information.
1245+ * 32 - 63 Generic state information.
1246+ * 64 - 127 Personality specific information.
1247+ * 128 - 511 12 32-words descriptors of the disks in the raid set.
1248+ * 512 - 911 Reserved.
1249+ * 912 - 1023 Disk specific descriptor.
1250+ */
1251+
1252+/*
1253+ * If x is the real device size in bytes, we return an apparent size of:
1254+ *
1255+ * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
1256+ *
1257+ * and place the 4kB superblock at offset y.
1258+ */
1259+#define MD_RESERVED_BYTES (64 * 1024)
1260+#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
1261+#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
1262+
1263+#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
1264+#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
1265+
1266+#define MD_SB_BYTES 4096
1267+#define MD_SB_WORDS (MD_SB_BYTES / 4)
1268+#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
1269+#define MD_SB_SECTORS (MD_SB_BYTES / 512)
1270+
1271+/*
1272+ * The following are counted in 32-bit words
1273+ */
1274+#define MD_SB_GENERIC_OFFSET 0
1275+#define MD_SB_PERSONALITY_OFFSET 64
1276+#define MD_SB_DISKS_OFFSET 128
1277+#define MD_SB_DESCRIPTOR_OFFSET 992
1278+
1279+#define MD_SB_GENERIC_CONSTANT_WORDS 32
1280+#define MD_SB_GENERIC_STATE_WORDS 32
1281+#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
1282+#define MD_SB_PERSONALITY_WORDS 64
1283+#define MD_SB_DISKS_WORDS 384
1284+#define MD_SB_DESCRIPTOR_WORDS 32
1285+#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
1286+#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
1287+#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
1288+
1289+/*
1290+ * Device "operational" state bits
1291+ */
1292+#define MD_DISK_FAULTY 0 /* disk is faulty / operational */
1293+#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */
1294+#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
1295+#define MD_DISK_REMOVED 3 /* disk has been removed from the raid set */
1296+
1297+typedef struct mdp_device_descriptor_s {
1298+ __u32 number; /* 0 Device number in the entire set */
1299+ __u32 major; /* 1 Device major number */
1300+ __u32 minor; /* 2 Device minor number */
1301+ __u32 raid_disk; /* 3 The role of the device in the raid set */
1302+ __u32 state; /* 4 Operational state */
1303+ __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
1304+} mdp_disk_t;
1305+
1306+#define MD_SB_MAGIC 0xa92b4efc
1307+
1308+/*
1309+ * Superblock state bits
1310+ */
1311+#define MD_SB_CLEAN 0
1312+#define MD_SB_ERRORS 1
1313+
1314+typedef struct mdp_superblock_s {
1315+ /*
1316+ * Constant generic information
1317+ */
1318+ __u32 md_magic; /* 0 MD identifier */
1319+ __u32 major_version; /* 1 major version to which the set conforms */
1320+ __u32 minor_version; /* 2 minor version ... */
1321+ __u32 patch_version; /* 3 patchlevel version ... */
1322+ __u32 gvalid_words; /* 4 Number of used words in this section */
1323+ __u32 set_uuid0; /* 5 Raid set identifier */
1324+ __u32 ctime; /* 6 Creation time */
1325+ __u32 level; /* 7 Raid personality */
1326+ __u32 size; /* 8 Apparent size of each individual disk */
1327+ __u32 nr_disks; /* 9 total disks in the raid set */
1328+ __u32 raid_disks; /* 10 disks in a fully functional raid set */
1329+ __u32 md_minor; /* 11 preferred MD minor device number */
1330+ __u32 not_persistent; /* 12 does it have a persistent superblock */
1331+ __u32 set_uuid1; /* 13 Raid set identifier #2 */
1332+ __u32 set_uuid2; /* 14 Raid set identifier #3 */
1333+ __u32 set_uuid3; /* 15 Raid set identifier #4 */
1334+ __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
1335+
1336+ /*
1337+ * Generic state information
1338+ */
1339+ __u32 utime; /* 0 Superblock update time */
1340+ __u32 state; /* 1 State bits (clean, ...) */
1341+ __u32 active_disks; /* 2 Number of currently active disks */
1342+ __u32 working_disks; /* 3 Number of working disks */
1343+ __u32 failed_disks; /* 4 Number of failed disks */
1344+ __u32 spare_disks; /* 5 Number of spare disks */
1345+ __u32 sb_csum; /* 6 checksum of the whole superblock */
1346+ __u64 events; /* 7 number of superblock updates (64-bit!) */
1347+ __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9];
1348+
1349+ /*
1350+ * Personality information
1351+ */
1352+ __u32 layout; /* 0 the array's physical layout */
1353+ __u32 chunk_size; /* 1 chunk size in bytes */
1354+ __u32 root_pv; /* 2 LV root PV */
1355+ __u32 root_block; /* 3 LV root block */
1356+ __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
1357+
1358+ /*
1359+ * Disks information
1360+ */
1361+ mdp_disk_t disks[MD_SB_DISKS];
1362+
1363+ /*
1364+ * Reserved
1365+ */
1366+ __u32 reserved[MD_SB_RESERVED_WORDS];
1367+
1368+ /*
1369+ * Active descriptor
1370+ */
1371+ mdp_disk_t this_disk;
1372+
1373+} mdp_super_t;
1374+
1375+#endif /* _MD_P_H */
1376+
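
md_p.h pins down where the 4 KB 0.90 superblock lives: the real device size is rounded down to a 64 KB boundary, another 64 KB is subtracted as the reserved tail, and the superblock starts at the resulting "apparent size". A quick stand-alone computation for an arbitrary example device (the device size is made up; the macro is the one from the header, with parentheses added around the argument):

/* Where does the 0.90 superblock land? Stand-alone version of the md_p.h math. */
#include <stdio.h>

#define MD_RESERVED_BYTES    (64 * 1024)
#define MD_RESERVED_SECTORS  (MD_RESERVED_BYTES / 512)

#define MD_NEW_SIZE_SECTORS(x) (((x) & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)

int main(void)
{
        unsigned long dev_sectors = 17921835UL;   /* arbitrary example device   */
        unsigned long sb_sector   = MD_NEW_SIZE_SECTORS(dev_sectors);

        printf("device size   : %lu sectors\n", dev_sectors);
        printf("apparent size : %lu sectors\n", sb_sector);
        printf("superblock at : sector %lu (byte offset %llu)\n",
               sb_sector, (unsigned long long)sb_sector * 512);
        printf("reserved tail : %lu sectors\n", dev_sectors - sb_sector);
        return 0;
}
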
1377--- linux/include/linux/raid/md_u.h.orig Tue Jan 16 13:42:03 2001
1378+++ linux/include/linux/raid/md_u.h Tue Jan 16 13:42:03 2001
1379@@ -0,0 +1,115 @@
1380+/*
1381+ md_u.h : user <=> kernel API between Linux raidtools and RAID drivers
1382+ Copyright (C) 1998 Ingo Molnar
1383+
1384+ This program is free software; you can redistribute it and/or modify
1385+ it under the terms of the GNU General Public License as published by
1386+ the Free Software Foundation; either version 2, or (at your option)
1387+ any later version.
1388+
1389+ You should have received a copy of the GNU General Public License
1390+ (for example /usr/src/linux/COPYING); if not, write to the Free
1391+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1392+*/
1393+
1394+#ifndef _MD_U_H
1395+#define _MD_U_H
1396+
1397+/* ioctls */
1398+
1399+/* status */
1400+#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t)
1401+#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t)
1402+#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t)
1403+#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13)
1404+
1405+/* configuration */
1406+#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20)
1407+#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t)
1408+#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22)
1409+#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t)
1410+#define SET_DISK_INFO _IO (MD_MAJOR, 0x24)
1411+#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25)
1412+#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26)
1413+#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27)
1414+#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28)
1415+#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29)
1416+
1417+/* usage */
1418+#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t)
1419+#define START_ARRAY _IO (MD_MAJOR, 0x31)
1420+#define STOP_ARRAY _IO (MD_MAJOR, 0x32)
1421+#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33)
1422+#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34)
1423+
1424+typedef struct mdu_version_s {
1425+ int major;
1426+ int minor;
1427+ int patchlevel;
1428+} mdu_version_t;
1429+
1430+typedef struct mdu_array_info_s {
1431+ /*
1432+ * Generic constant information
1433+ */
1434+ int major_version;
1435+ int minor_version;
1436+ int patch_version;
1437+ int ctime;
1438+ int level;
1439+ int size;
1440+ int nr_disks;
1441+ int raid_disks;
1442+ int md_minor;
1443+ int not_persistent;
1444+
1445+ /*
1446+ * Generic state information
1447+ */
1448+ int utime; /* 0 Superblock update time */
1449+ int state; /* 1 State bits (clean, ...) */
1450+ int active_disks; /* 2 Number of currently active disks */
1451+ int working_disks; /* 3 Number of working disks */
1452+ int failed_disks; /* 4 Number of failed disks */
1453+ int spare_disks; /* 5 Number of spare disks */
1454+
1455+ /*
1456+ * Personality information
1457+ */
1458+ int layout; /* 0 the array's physical layout */
1459+ int chunk_size; /* 1 chunk size in bytes */
1460+
1461+} mdu_array_info_t;
1462+
1463+typedef struct mdu_disk_info_s {
1464+ /*
1465+ * configuration/status of one particular disk
1466+ */
1467+ int number;
1468+ int major;
1469+ int minor;
1470+ int raid_disk;
1471+ int state;
1472+
1473+} mdu_disk_info_t;
1474+
1475+typedef struct mdu_start_info_s {
1476+ /*
1477+ * configuration/status of one particular disk
1478+ */
1479+ int major;
1480+ int minor;
1481+ int raid_disk;
1482+ int state;
1483+
1484+} mdu_start_info_t;
1485+
1486+typedef struct mdu_param_s
1487+{
1488+ int personality; /* 1,2,3,4 */
1489+ int chunk_size; /* in bytes */
1490+ int max_fault; /* unused for now */
1491+} mdu_param_t;
1492+
1493+#endif /* _MD_U_H */
1494+
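
md_u.h is the user <=> kernel contract used by raidtools. A minimal user-space query of a running array via GET_ARRAY_INFO could look like the sketch below; it re-declares the pieces it needs from this header so it builds stand-alone, and it assumes /dev/md0 exists and that the caller is allowed to open it.

/* Minimal GET_ARRAY_INFO query, re-declared from md_u.h (sketch). */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

#define MD_MAJOR 9

typedef struct mdu_array_info_s {
        int major_version, minor_version, patch_version, ctime;
        int level, size, nr_disks, raid_disks, md_minor, not_persistent;
        int utime, state, active_disks, working_disks, failed_disks, spare_disks;
        int layout, chunk_size;
} mdu_array_info_t;

#define GET_ARRAY_INFO _IOR(MD_MAJOR, 0x11, mdu_array_info_t)

int main(void)
{
        mdu_array_info_t info;
        int fd = open("/dev/md0", O_RDONLY);

        if (fd < 0) {
                perror("open /dev/md0");
                return 1;
        }
        if (ioctl(fd, GET_ARRAY_INFO, &info) < 0) {
                perror("GET_ARRAY_INFO");
                close(fd);
                return 1;
        }
        printf("raid level %d, %d/%d disks working, chunk %d bytes\n",
               info.level, info.working_disks, info.raid_disks, info.chunk_size);
        close(fd);
        return 0;
}
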
1495--- linux/include/linux/raid/raid0.h.orig Tue Jan 16 13:42:03 2001
1496+++ linux/include/linux/raid/raid0.h Tue Jan 16 13:44:37 2001
1497@@ -0,0 +1,33 @@
1498+#ifndef _RAID0_H
1499+#define _RAID0_H
1500+
1501+#include <linux/raid/md.h>
1502+
1503+struct strip_zone
1504+{
1505+ int zone_offset; /* Zone offset in md_dev */
1506+ int dev_offset; /* Zone offset in real dev */
1507+ int size; /* Zone size */
1508+ int nb_dev; /* # of devices attached to the zone */
1509+ mdk_rdev_t *dev[MAX_REAL]; /* Devices attached to the zone */
1510+};
1511+
1512+struct raid0_hash
1513+{
1514+ struct strip_zone *zone0, *zone1;
1515+};
1516+
1517+struct raid0_private_data
1518+{
1519+ struct raid0_hash *hash_table; /* Dynamically allocated */
1520+ struct strip_zone *strip_zone; /* This one too */
1521+ int nr_strip_zones;
1522+ struct strip_zone *smallest;
1523+ int nr_zones;
1524+};
1525+
1526+typedef struct raid0_private_data raid0_conf_t;
1527+
1528+#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
1529+
1530+#endif
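
raid0.h only declares the zone bookkeeping; the mapping itself lives in raid0.c, which is not part of this header set. As a rough, hedged illustration of what chunked striping within one strip zone means (the textbook RAID-0 arithmetic for equal-sized members, i.e. a single zone, not the driver's actual routine):

/*
 * Hedged sketch of plain chunked striping within a single strip zone.
 * This is the textbook RAID-0 mapping, not code from raid0.c.
 */
#include <stdio.h>

int main(void)
{
        unsigned long chunk_sectors = 64;    /* 32 KB chunks, an example value */
        unsigned long nb_dev        = 3;     /* disks in the (single) zone     */
        unsigned long sector        = 1000;  /* array sector to translate      */

        unsigned long chunk  = sector / chunk_sectors;       /* which chunk   */
        unsigned long offset = sector % chunk_sectors;        /* inside chunk  */
        unsigned long disk   = chunk % nb_dev;                 /* round robin   */
        unsigned long dsect  = (chunk / nb_dev) * chunk_sectors + offset;

        printf("array sector %lu -> disk %lu, sector %lu\n", sector, disk, dsect);
        return 0;
}
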
1531--- linux/include/linux/raid/raid1.h.orig Tue Jan 16 13:42:03 2001
1532+++ linux/include/linux/raid/raid1.h Tue Jan 16 13:44:37 2001
1533@@ -0,0 +1,64 @@
1534+#ifndef _RAID1_H
1535+#define _RAID1_H
1536+
1537+#include <linux/raid/md.h>
1538+
1539+struct mirror_info {
1540+ int number;
1541+ int raid_disk;
1542+ kdev_t dev;
1543+ int next;
1544+ int sect_limit;
1545+
1546+ /*
1547+ * State bits:
1548+ */
1549+ int operational;
1550+ int write_only;
1551+ int spare;
1552+
1553+ int used_slot;
1554+};
1555+
1556+struct raid1_private_data {
1557+ mddev_t *mddev;
1558+ struct mirror_info mirrors[MD_SB_DISKS];
1559+ int nr_disks;
1560+ int raid_disks;
1561+ int working_disks;
1562+ int last_used;
1563+ unsigned long next_sect;
1564+ int sect_count;
1565+ mdk_thread_t *thread, *resync_thread;
1566+ int resync_mirrors;
1567+ struct mirror_info *spare;
1568+};
1569+
1570+typedef struct raid1_private_data raid1_conf_t;
1571+
1572+/*
1573+ * this is the only point in the RAID code where we violate
1574+ * C type safety. mddev->private is an 'opaque' pointer.
1575+ */
1576+#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private)
1577+
1578+/*
1579+ * this is our 'private' 'collective' RAID1 buffer head.
1580+ * it contains information about what kind of IO operations were started
1581+ * for this RAID1 operation, and about their status:
1582+ */
1583+
1584+struct raid1_bh {
1585+ atomic_t remaining; /* 'have we finished' count,
1586+ * used from IRQ handlers
1587+ */
1588+ int cmd;
1589+ unsigned long state;
1590+ mddev_t *mddev;
1591+ struct buffer_head *master_bh;
1592+ struct buffer_head *mirror_bh [MD_SB_DISKS];
1593+ struct buffer_head bh_req;
1594+ struct buffer_head *next_retry;
1595+};
1596+
1597+#endif
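
The raid1_bh comment above explains its role: one master buffer_head fans out into one mirror_bh per disk, and 'remaining' counts how many of those are still in flight, decremented from the end_request IRQ path until the master can be completed. A single-threaded sketch of that completion-count pattern (in the driver 'remaining' is an atomic_t touched from interrupt context; a plain int stands in here):

/* Completion-count pattern behind raid1_bh->remaining (illustrative). */
#include <stdio.h>

#define NR_MIRRORS 3

struct fake_r1bh {
        int remaining;                  /* driver: atomic_t, touched from IRQ */
        int master_done;
};

static void mirror_write_done(struct fake_r1bh *r1bh, int disk)
{
        printf("mirror %d completed\n", disk);
        if (--r1bh->remaining == 0) {
                r1bh->master_done = 1;  /* last mirror finished: end master bh */
                printf("all mirrors done, completing master request\n");
        }
}

int main(void)
{
        struct fake_r1bh r1bh = { NR_MIRRORS, 0 };
        int disk;

        for (disk = 0; disk < NR_MIRRORS; disk++)  /* completions arrive one by one */
                mirror_write_done(&r1bh, disk);
        return r1bh.master_done ? 0 : 1;
}
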
1598--- linux/include/linux/raid/raid5.h.orig Tue Jan 16 13:42:03 2001
1599+++ linux/include/linux/raid/raid5.h Tue Jan 16 13:44:37 2001
1600@@ -0,0 +1,113 @@
1601+#ifndef _RAID5_H
1602+#define _RAID5_H
1603+
1604+#include <linux/raid/md.h>
1605+#include <linux/raid/xor.h>
1606+
1607+struct disk_info {
1608+ kdev_t dev;
1609+ int operational;
1610+ int number;
1611+ int raid_disk;
1612+ int write_only;
1613+ int spare;
1614+ int used_slot;
1615+};
1616+
1617+struct stripe_head {
1618+ md_spinlock_t stripe_lock;
1619+ struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
1620+ struct stripe_head *free_next; /* pool of free sh's */
1621+ struct buffer_head *buffer_pool; /* pool of free buffers */
1622+ struct buffer_head *bh_pool; /* pool of free bh's */
1623+ struct raid5_private_data *raid_conf;
1624+ struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
1625+ struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
1626+ struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
1627+ struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
1628+ int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
1629+ int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
1630+ unsigned long sector; /* sector of this row */
1631+ int size; /* buffers size */
1632+ int pd_idx; /* parity disk index */
1633+ atomic_t nr_pending; /* nr of pending cmds */
1634+ unsigned long state; /* state flags */
1635+ int cmd; /* stripe cmd */
1636+ int count; /* nr of waiters */
1637+ int write_method; /* reconstruct-write / read-modify-write */
1638+ int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
1639+ struct wait_queue *wait; /* processes waiting for this stripe */
1640+};
1641+
1642+/*
1643+ * Phase
1644+ */
1645+#define PHASE_BEGIN 0
1646+#define PHASE_READ_OLD 1
1647+#define PHASE_WRITE 2
1648+#define PHASE_READ 3
1649+#define PHASE_COMPLETE 4
1650+
1651+/*
1652+ * Write method
1653+ */
1654+#define METHOD_NONE 0
1655+#define RECONSTRUCT_WRITE 1
1656+#define READ_MODIFY_WRITE 2
1657+
1658+/*
1659+ * Stripe state
1660+ */
1661+#define STRIPE_LOCKED 0
1662+#define STRIPE_ERROR 1
1663+
1664+/*
1665+ * Stripe commands
1666+ */
1667+#define STRIPE_NONE 0
1668+#define STRIPE_WRITE 1
1669+#define STRIPE_READ 2
1670+
1671+struct raid5_private_data {
1672+ struct stripe_head **stripe_hashtbl;
1673+ mddev_t *mddev;
1674+ mdk_thread_t *thread, *resync_thread;
1675+ struct disk_info disks[MD_SB_DISKS];
1676+ struct disk_info *spare;
1677+ int buffer_size;
1678+ int chunk_size, level, algorithm;
1679+ int raid_disks, working_disks, failed_disks;
1680+ int sector_count;
1681+ unsigned long next_sector;
1682+ atomic_t nr_handle;
1683+ struct stripe_head *next_free_stripe;
1684+ int nr_stripes;
1685+ int resync_parity;
1686+ int max_nr_stripes;
1687+ int clock;
1688+ int nr_hashed_stripes;
1689+ int nr_locked_stripes;
1690+ int nr_pending_stripes;
1691+ int nr_cached_stripes;
1692+
1693+ /*
1694+ * Free stripes pool
1695+ */
1696+ int nr_free_sh;
1697+ struct stripe_head *free_sh_list;
1698+ struct wait_queue *wait_for_stripe;
1699+};
1700+
1701+typedef struct raid5_private_data raid5_conf_t;
1702+
1703+#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
1704+
1705+/*
1706+ * Our supported algorithms
1707+ */
1708+#define ALGORITHM_LEFT_ASYMMETRIC 0
1709+#define ALGORITHM_RIGHT_ASYMMETRIC 1
1710+#define ALGORITHM_LEFT_SYMMETRIC 2
1711+#define ALGORITHM_RIGHT_SYMMETRIC 3
1712+
1713+#endif
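
raid5.h only names the four parity layouts; the per-stripe parity placement is computed in raid5.c, which is not part of this header set. As a hedged illustration of what "left asymmetric" conventionally means (parity starts on the last disk and rotates towards disk 0, data filled left to right around it), the table-printer below shows the idea; treat the exact formula as an assumption, not the driver's code.

/* Hedged sketch: conventional LEFT_ASYMMETRIC parity placement. */
#include <stdio.h>

int main(void)
{
        int raid_disks = 4, stripes = 6, stripe, disk;

        printf("stripe | disks 0..%d (P = parity)\n", raid_disks - 1);
        for (stripe = 0; stripe < stripes; stripe++) {
                int pd_idx = (raid_disks - 1) - (stripe % raid_disks);
                int d = 0;

                printf("%6d |", stripe);
                for (disk = 0; disk < raid_disks; disk++) {
                        if (disk == pd_idx)
                                printf("  P ");
                        else
                                printf(" D%d ", d++);
                }
                printf("\n");
        }
        return 0;
}
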
1714--- linux/include/linux/raid/translucent.h.orig Tue Jan 16 13:42:03 2001
1715+++ linux/include/linux/raid/translucent.h Tue Jan 16 13:42:03 2001
1716@@ -0,0 +1,23 @@
1717+#ifndef _TRANSLUCENT_H
1718+#define _TRANSLUCENT_H
1719+
1720+#include <linux/raid/md.h>
1721+
1722+typedef struct dev_info dev_info_t;
1723+
1724+struct dev_info {
1725+ kdev_t dev;
1726+ int size;
1727+};
1728+
1729+struct translucent_private_data
1730+{
1731+ dev_info_t disks[MD_SB_DISKS];
1732+};
1733+
1734+
1735+typedef struct translucent_private_data translucent_conf_t;
1736+
1737+#define mddev_to_conf(mddev) ((translucent_conf_t *) mddev->private)
1738+
1739+#endif
1740--- linux/include/linux/raid/xor.h.orig Tue Jan 16 13:42:03 2001
1741+++ linux/include/linux/raid/xor.h Tue Jan 16 13:44:35 2001
1742@@ -0,0 +1,12 @@
1743+#ifndef _XOR_H
1744+#define _XOR_H
1745+
1746+#include <linux/raid/md.h>
1747+
1748+#define MAX_XOR_BLOCKS 5
1749+
1750+extern void calibrate_xor_block(void);
1751+extern void (*xor_block)(unsigned int count,
1752+ struct buffer_head **bh_ptr);
1753+
1754+#endif
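
xor.h exposes a single function pointer selected at boot by calibrate_xor_block(). The convention the RAID-5 code relies on is that the sources are XORed into the first buffer, which is exactly the parity relation P = D0 ^ D1 ^ ... The byte-level sketch below shows that convention on plain arrays; the destination-is-first detail is an assumption drawn from the driver, not stated in this header.

/* Byte-level sketch of the xor_block() convention: sources XORed into block 0. */
#include <stdio.h>

#define BLK 8   /* tiny "block" size for the demo */

static void xor_block_sketch(unsigned int count, unsigned char *blocks[])
{
        unsigned int b, i;

        for (b = 1; b < count; b++)             /* blocks[0] is the destination */
                for (i = 0; i < BLK; i++)
                        blocks[0][i] ^= blocks[b][i];
}

int main(void)
{
        unsigned char p[BLK] = { 0 };           /* pre-zeroed so p becomes parity */
        unsigned char d0[BLK] = "AAAAAAA", d1[BLK] = "BBBBBBB";
        unsigned char *blocks[3] = { p, d0, d1 };
        unsigned int i;

        xor_block_sketch(3, blocks);
        for (i = 0; i < BLK; i++)
                printf("%02x ", p[i]);          /* parity of the two data blocks */
        printf("\n");
        return 0;
}
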
1755--- linux/include/linux/blkdev.h.orig Mon Dec 11 01:49:44 2000
1756+++ linux/include/linux/blkdev.h Tue Jan 16 13:47:19 2001
1757@@ -91,8 +91,9 @@
1758 extern void make_request(int major,int rw, struct buffer_head * bh);
1759
1760 /* md needs this function to remap requests */
1761-extern int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size);
1762-extern int md_make_request (int minor, int rw, struct buffer_head * bh);
1763+extern int md_map (kdev_t dev, kdev_t *rdev,
1764+ unsigned long *rsector, unsigned long size);
1765+extern int md_make_request (struct buffer_head * bh, int rw);
1766 extern int md_error (kdev_t mddev, kdev_t rdev);
1767
1768 extern int * blk_size[MAX_BLKDEV];
1769--- linux/include/linux/fs.h.orig Tue Jan 16 13:30:09 2001
1770+++ linux/include/linux/fs.h Tue Jan 16 13:47:18 2001
1771@@ -191,6 +191,7 @@
1772 #define BH_Req 3 /* 0 if the buffer has been invalidated */
1773 #define BH_Protected 6 /* 1 if the buffer is protected */
1774 #define BH_Wait_IO 7 /* 1 if we should throttle on this buffer */
1775+#define BH_LowPrio 8 /* 1 if the buffer is lowprio */
1776
1777 /*
1778 * Try to keep the most commonly used fields in single cache lines (16
1779@@ -784,6 +785,7 @@
1780 extern void refile_buffer(struct buffer_head * buf);
1781 extern void set_writetime(struct buffer_head * buf, int flag);
1782 extern int try_to_free_buffers(struct page *, int);
1783+extern void cache_drop_behind(struct buffer_head *bh);
1784
1785 extern int nr_buffers;
1786 extern long buffermem;
1787@@ -814,6 +816,25 @@
1788 }
1789 }
1790
1791+extern inline void mark_buffer_highprio(struct buffer_head * bh)
1792+{
1793+ clear_bit(BH_LowPrio, &bh->b_state);
1794+}
1795+
1796+extern inline void mark_buffer_lowprio(struct buffer_head * bh)
1797+{
1798+ /*
1799+ * dirty buffers cannot be marked lowprio.
1800+ */
1801+ if (!buffer_dirty(bh))
1802+ set_bit(BH_LowPrio, &bh->b_state);
1803+}
1804+
1805+static inline int buffer_lowprio(struct buffer_head * bh)
1806+{
1807+ return test_bit(BH_LowPrio, &bh->b_state);
1808+}
1809+
1810 extern inline void mark_buffer_dirty(struct buffer_head * bh, int flag)
1811 {
1812 if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1813@@ -821,6 +842,23 @@
1814 if (bh->b_list != BUF_DIRTY)
1815 refile_buffer(bh);
1816 }
1817+ /*
1818+ * if a buffer gets marked dirty then it has to lose
1819+ * its lowprio state.
1820+ */
1821+ mark_buffer_highprio(bh);
1822+}
1823+
1824+extern inline void mark_buffer_dirty_lowprio(struct buffer_head * bh)
1825+{
1826+ if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1827+ if (bh->b_list != BUF_DIRTY)
1828+ refile_buffer(bh);
1829+ /*
1830+ * Mark it lowprio only if it was not dirty before!
1831+ */
1832+ set_bit(BH_LowPrio, &bh->b_state);
1833+ }
1834 }
1835
1836 extern int check_disk_change(kdev_t dev);
1837@@ -898,6 +936,7 @@
1838 extern struct buffer_head * find_buffer(kdev_t dev, int block, int size);
1839 extern void ll_rw_block(int, int, struct buffer_head * bh[]);
1840 extern int is_read_only(kdev_t);
1841+extern int is_device_idle(kdev_t);
1842 extern void __brelse(struct buffer_head *);
1843 extern inline void brelse(struct buffer_head *buf)
1844 {
1845@@ -913,8 +952,12 @@
1846 extern void set_blocksize(kdev_t dev, int size);
1847 extern unsigned int get_hardblocksize(kdev_t dev);
1848 extern struct buffer_head * bread(kdev_t dev, int block, int size);
1849+extern struct buffer_head * buffer_ready (kdev_t dev, int block, int size);
1850+extern void bread_ahead (kdev_t dev, int block, int size);
1851 extern struct buffer_head * breada(kdev_t dev,int block, int size,
1852 unsigned int pos, unsigned int filesize);
1853+extern struct buffer_head * breada_blocks(kdev_t dev,int block,
1854+ int size, int blocks);
1855
1856 extern int brw_page(int, struct page *, kdev_t, int [], int, int);
1857
1858--- linux/include/linux/md.h.orig Fri May 8 09:17:13 1998
1859+++ linux/include/linux/md.h Tue Jan 16 13:42:03 2001
1860@@ -1,300 +0,0 @@
1861-/*
1862- md.h : Multiple Devices driver for Linux
1863- Copyright (C) 1994-96 Marc ZYNGIER
1864- <zyngier@ufr-info-p7.ibp.fr> or
1865- <maz@gloups.fdn.fr>
1866-
1867- This program is free software; you can redistribute it and/or modify
1868- it under the terms of the GNU General Public License as published by
1869- the Free Software Foundation; either version 2, or (at your option)
1870- any later version.
1871-
1872- You should have received a copy of the GNU General Public License
1873- (for example /usr/src/linux/COPYING); if not, write to the Free
1874- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1875-*/
1876-
1877-#ifndef _MD_H
1878-#define _MD_H
1879-
1880-#include <linux/major.h>
1881-#include <linux/ioctl.h>
1882-#include <linux/types.h>
1883-
1884-/*
1885- * Different major versions are not compatible.
1886- * Different minor versions are only downward compatible.
1887- * Different patchlevel versions are downward and upward compatible.
1888- */
1889-#define MD_MAJOR_VERSION 0
1890-#define MD_MINOR_VERSION 36
1891-#define MD_PATCHLEVEL_VERSION 6
1892-
1893-#define MD_DEFAULT_DISK_READAHEAD (256 * 1024)
1894-
1895-/* ioctls */
1896-#define REGISTER_DEV _IO (MD_MAJOR, 1)
1897-#define START_MD _IO (MD_MAJOR, 2)
1898-#define STOP_MD _IO (MD_MAJOR, 3)
1899-#define REGISTER_DEV_NEW _IO (MD_MAJOR, 4)
1900-
1901-/*
1902- personalities :
1903- Byte 0 : Chunk size factor
1904- Byte 1 : Fault tolerance count for each physical device
1905- ( 0 means no fault tolerance,
1906- 0xFF means always tolerate faults), not used by now.
1907- Byte 2 : Personality
1908- Byte 3 : Reserved.
1909- */
1910-
1911-#define FAULT_SHIFT 8
1912-#define PERSONALITY_SHIFT 16
1913-
1914-#define FACTOR_MASK 0x000000FFUL
1915-#define FAULT_MASK 0x0000FF00UL
1916-#define PERSONALITY_MASK 0x00FF0000UL
1917-
1918-#define MD_RESERVED 0 /* Not used by now */
1919-#define LINEAR (1UL << PERSONALITY_SHIFT)
1920-#define STRIPED (2UL << PERSONALITY_SHIFT)
1921-#define RAID0 STRIPED
1922-#define RAID1 (3UL << PERSONALITY_SHIFT)
1923-#define RAID5 (4UL << PERSONALITY_SHIFT)
1924-#define MAX_PERSONALITY 5
1925-
1926-/*
1927- * MD superblock.
1928- *
1929- * The MD superblock maintains some statistics on each MD configuration.
1930- * Each real device in the MD set contains it near the end of the device.
1931- * Some of the ideas are copied from the ext2fs implementation.
1932- *
1933- * We currently use 4096 bytes as follows:
1934- *
1935- * word offset function
1936- *
1937- * 0 - 31 Constant generic MD device information.
1938- * 32 - 63 Generic state information.
1939- * 64 - 127 Personality specific information.
1940- * 128 - 511 12 32-words descriptors of the disks in the raid set.
1941- * 512 - 911 Reserved.
1942- * 912 - 1023 Disk specific descriptor.
1943- */
1944-
1945-/*
1946- * If x is the real device size in bytes, we return an apparent size of:
1947- *
1948- * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
1949- *
1950- * and place the 4kB superblock at offset y.
1951- */
1952-#define MD_RESERVED_BYTES (64 * 1024)
1953-#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
1954-#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
1955-
1956-#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
1957-#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
1958-
1959-#define MD_SB_BYTES 4096
1960-#define MD_SB_WORDS (MD_SB_BYTES / 4)
1961-#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
1962-#define MD_SB_SECTORS (MD_SB_BYTES / 512)
1963-
1964-/*
1965- * The following are counted in 32-bit words
1966- */
1967-#define MD_SB_GENERIC_OFFSET 0
1968-#define MD_SB_PERSONALITY_OFFSET 64
1969-#define MD_SB_DISKS_OFFSET 128
1970-#define MD_SB_DESCRIPTOR_OFFSET 992
1971-
1972-#define MD_SB_GENERIC_CONSTANT_WORDS 32
1973-#define MD_SB_GENERIC_STATE_WORDS 32
1974-#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
1975-#define MD_SB_PERSONALITY_WORDS 64
1976-#define MD_SB_DISKS_WORDS 384
1977-#define MD_SB_DESCRIPTOR_WORDS 32
1978-#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
1979-#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
1980-#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
1981-
1982-/*
1983- * Device "operational" state bits
1984- */
1985-#define MD_FAULTY_DEVICE 0 /* Device is faulty / operational */
1986-#define MD_ACTIVE_DEVICE 1 /* Device is a part or the raid set / spare disk */
1987-#define MD_SYNC_DEVICE 2 /* Device is in sync with the raid set */
1988-
1989-typedef struct md_device_descriptor_s {
1990- __u32 number; /* 0 Device number in the entire set */
1991- __u32 major; /* 1 Device major number */
1992- __u32 minor; /* 2 Device minor number */
1993- __u32 raid_disk; /* 3 The role of the device in the raid set */
1994- __u32 state; /* 4 Operational state */
1995- __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
1996-} md_descriptor_t;
1997-
1998-#define MD_SB_MAGIC 0xa92b4efc
1999-
2000-/*
2001- * Superblock state bits
2002- */
2003-#define MD_SB_CLEAN 0
2004-#define MD_SB_ERRORS 1
2005-
2006-typedef struct md_superblock_s {
2007-
2008- /*
2009- * Constant generic information
2010- */
2011- __u32 md_magic; /* 0 MD identifier */
2012- __u32 major_version; /* 1 major version to which the set conforms */
2013- __u32 minor_version; /* 2 minor version to which the set conforms */
2014- __u32 patch_version; /* 3 patchlevel version to which the set conforms */
2015- __u32 gvalid_words; /* 4 Number of non-reserved words in this section */
2016- __u32 set_magic; /* 5 Raid set identifier */
2017- __u32 ctime; /* 6 Creation time */
2018- __u32 level; /* 7 Raid personality (mirroring, raid5, ...) */
2019- __u32 size; /* 8 Apparent size of each individual disk, in kB */
2020- __u32 nr_disks; /* 9 Number of total disks in the raid set */
2021- __u32 raid_disks; /* 10 Number of disks in a fully functional raid set */
2022- __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 11];
2023-
2024- /*
2025- * Generic state information
2026- */
2027- __u32 utime; /* 0 Superblock update time */
2028- __u32 state; /* 1 State bits (clean, ...) */
2029- __u32 active_disks; /* 2 Number of currently active disks (some non-faulty disks might not be in sync) */
2030- __u32 working_disks; /* 3 Number of working disks */
2031- __u32 failed_disks; /* 4 Number of failed disks */
2032- __u32 spare_disks; /* 5 Number of spare disks */
2033- __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 6];
2034-
2035- /*
2036- * Personality information
2037- */
2038- __u32 parity_algorithm;
2039- __u32 chunk_size;
2040- __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 2];
2041-
2042- /*
2043- * Disks information
2044- */
2045- md_descriptor_t disks[MD_SB_DISKS];
2046-
2047- /*
2048- * Reserved
2049- */
2050- __u32 reserved[MD_SB_RESERVED_WORDS];
2051-
2052- /*
2053- * Active descriptor
2054- */
2055- md_descriptor_t descriptor;
2056-} md_superblock_t;
2057-
2058-#ifdef __KERNEL__
2059-
2060-#include <linux/mm.h>
2061-#include <linux/fs.h>
2062-#include <linux/blkdev.h>
2063-#include <asm/semaphore.h>
2064-
2065-/*
2066- * Kernel-based reconstruction is mostly working, but still requires
2067- * some additional work.
2068- */
2069-#define SUPPORT_RECONSTRUCTION 0
2070-
2071-#define MAX_REAL 8 /* Max number of physical dev per md dev */
2072-#define MAX_MD_DEV 4 /* Max number of md dev */
2073-
2074-#define FACTOR(a) ((a)->repartition & FACTOR_MASK)
2075-#define MAX_FAULT(a) (((a)->repartition & FAULT_MASK)>>8)
2076-#define PERSONALITY(a) ((a)->repartition & PERSONALITY_MASK)
2077-
2078-#define FACTOR_SHIFT(a) (PAGE_SHIFT + (a) - 10)
2079-
2080-struct real_dev
2081-{
2082- kdev_t dev; /* Device number */
2083- int size; /* Device size (in blocks) */
2084- int offset; /* Real device offset (in blocks) in md dev
2085- (only used in linear mode) */
2086- struct inode *inode; /* Lock inode */
2087- md_superblock_t *sb;
2088- u32 sb_offset;
2089-};
2090-
2091-struct md_dev;
2092-
2093-#define SPARE_INACTIVE 0
2094-#define SPARE_WRITE 1
2095-#define SPARE_ACTIVE 2
2096-
2097-struct md_personality
2098-{
2099- char *name;
2100- int (*map)(struct md_dev *mddev, kdev_t *rdev,
2101- unsigned long *rsector, unsigned long size);
2102- int (*make_request)(struct md_dev *mddev, int rw, struct buffer_head * bh);
2103- void (*end_request)(struct buffer_head * bh, int uptodate);
2104- int (*run)(int minor, struct md_dev *mddev);
2105- int (*stop)(int minor, struct md_dev *mddev);
2106- int (*status)(char *page, int minor, struct md_dev *mddev);
2107- int (*ioctl)(struct inode *inode, struct file *file,
2108- unsigned int cmd, unsigned long arg);
2109- int max_invalid_dev;
2110- int (*error_handler)(struct md_dev *mddev, kdev_t dev);
2111-
2112-/*
2113- * Some personalities (RAID-1, RAID-5) can get disks hot-added and
2114- * hot-removed. Hot removal is different from failure. (failure marks
2115- * a disk inactive, but the disk is still part of the array)
2116- */
2117- int (*hot_add_disk) (struct md_dev *mddev, kdev_t dev);
2118- int (*hot_remove_disk) (struct md_dev *mddev, kdev_t dev);
2119- int (*mark_spare) (struct md_dev *mddev, md_descriptor_t *descriptor, int state);
2120-};
2121-
2122-struct md_dev
2123-{
2124- struct real_dev devices[MAX_REAL];
2125- struct md_personality *pers;
2126- md_superblock_t *sb;
2127- int sb_dirty;
2128- int repartition;
2129- int busy;
2130- int nb_dev;
2131- void *private;
2132-};
2133-
2134-struct md_thread {
2135- void (*run) (void *data);
2136- void *data;
2137- struct wait_queue *wqueue;
2138- unsigned long flags;
2139- struct semaphore *sem;
2140- struct task_struct *tsk;
2141-};
2142-
2143-#define THREAD_WAKEUP 0
2144-
2145-extern struct md_dev md_dev[MAX_MD_DEV];
2146-extern int md_size[MAX_MD_DEV];
2147-extern int md_maxreadahead[MAX_MD_DEV];
2148-
2149-extern char *partition_name (kdev_t dev);
2150-
2151-extern int register_md_personality (int p_num, struct md_personality *p);
2152-extern int unregister_md_personality (int p_num);
2153-extern struct md_thread *md_register_thread (void (*run) (void *data), void *data);
2154-extern void md_unregister_thread (struct md_thread *thread);
2155-extern void md_wakeup_thread(struct md_thread *thread);
2156-extern int md_update_sb (int minor);
2157-extern int md_do_sync(struct md_dev *mddev);
2158-
2159-#endif __KERNEL__
2160-#endif _MD_H
2161--- linux/include/linux/raid0.h.orig Tue Oct 29 14:20:24 1996
2162+++ linux/include/linux/raid0.h Tue Jan 16 13:42:03 2001
2163@@ -1,27 +0,0 @@
2164-#ifndef _RAID0_H
2165-#define _RAID0_H
2166-
2167-struct strip_zone
2168-{
2169- int zone_offset; /* Zone offset in md_dev */
2170- int dev_offset; /* Zone offset in real dev */
2171- int size; /* Zone size */
2172- int nb_dev; /* Number of devices attached to the zone */
2173- struct real_dev *dev[MAX_REAL]; /* Devices attached to the zone */
2174-};
2175-
2176-struct raid0_hash
2177-{
2178- struct strip_zone *zone0, *zone1;
2179-};
2180-
2181-struct raid0_data
2182-{
2183- struct raid0_hash *hash_table; /* Dynamically allocated */
2184- struct strip_zone *strip_zone; /* This one too */
2185- int nr_strip_zones;
2186- struct strip_zone *smallest;
2187- int nr_zones;
2188-};
2189-
2190-#endif
2191--- linux/include/linux/raid1.h.orig Fri May 8 09:17:13 1998
2192+++ linux/include/linux/raid1.h Tue Jan 16 13:42:03 2001
2193@@ -1,49 +0,0 @@
2194-#ifndef _RAID1_H
2195-#define _RAID1_H
2196-
2197-#include <linux/md.h>
2198-
2199-struct mirror_info {
2200- int number;
2201- int raid_disk;
2202- kdev_t dev;
2203- int next;
2204- int sect_limit;
2205-
2206- /*
2207- * State bits:
2208- */
2209- int operational;
2210- int write_only;
2211- int spare;
2212-};
2213-
2214-struct raid1_data {
2215- struct md_dev *mddev;
2216- struct mirror_info mirrors[MD_SB_DISKS]; /* RAID1 devices, 2 to MD_SB_DISKS */
2217- int raid_disks;
2218- int working_disks; /* Number of working disks */
2219- int last_used;
2220- unsigned long next_sect;
2221- int sect_count;
2222- int resync_running;
2223-};
2224-
2225-/*
2226- * this is our 'private' 'collective' RAID1 buffer head.
2227- * it contains information about what kind of IO operations were started
2228- * for this RAID5 operation, and about their status:
2229- */
2230-
2231-struct raid1_bh {
2232- unsigned int remaining;
2233- int cmd;
2234- unsigned long state;
2235- struct md_dev *mddev;
2236- struct buffer_head *master_bh;
2237- struct buffer_head *mirror_bh [MD_SB_DISKS];
2238- struct buffer_head bh_req;
2239- struct buffer_head *next_retry;
2240-};
2241-
2242-#endif
2243--- linux/include/linux/raid5.h.orig Fri May 8 09:17:13 1998
2244+++ linux/include/linux/raid5.h Tue Jan 16 13:42:03 2001
2245@@ -1,110 +0,0 @@
2246-#ifndef _RAID5_H
2247-#define _RAID5_H
2248-
2249-#ifdef __KERNEL__
2250-#include <linux/md.h>
2251-#include <asm/atomic.h>
2252-
2253-struct disk_info {
2254- kdev_t dev;
2255- int operational;
2256- int number;
2257- int raid_disk;
2258- int write_only;
2259- int spare;
2260-};
2261-
2262-struct stripe_head {
2263- struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
2264- struct stripe_head *free_next; /* pool of free sh's */
2265- struct buffer_head *buffer_pool; /* pool of free buffers */
2266- struct buffer_head *bh_pool; /* pool of free bh's */
2267- struct raid5_data *raid_conf;
2268- struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
2269- struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
2270- struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
2271- struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
2272- int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
2273- int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
2274- unsigned long sector; /* sector of this row */
2275- int size; /* buffers size */
2276- int pd_idx; /* parity disk index */
2277- int nr_pending; /* nr of pending cmds */
2278- unsigned long state; /* state flags */
2279- int cmd; /* stripe cmd */
2280- int count; /* nr of waiters */
2281- int write_method; /* reconstruct-write / read-modify-write */
2282- int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
2283- struct wait_queue *wait; /* processes waiting for this stripe */
2284-};
2285-
2286-/*
2287- * Phase
2288- */
2289-#define PHASE_BEGIN 0
2290-#define PHASE_READ_OLD 1
2291-#define PHASE_WRITE 2
2292-#define PHASE_READ 3
2293-#define PHASE_COMPLETE 4
2294-
2295-/*
2296- * Write method
2297- */
2298-#define METHOD_NONE 0
2299-#define RECONSTRUCT_WRITE 1
2300-#define READ_MODIFY_WRITE 2
2301-
2302-/*
2303- * Stripe state
2304- */
2305-#define STRIPE_LOCKED 0
2306-#define STRIPE_ERROR 1
2307-
2308-/*
2309- * Stripe commands
2310- */
2311-#define STRIPE_NONE 0
2312-#define STRIPE_WRITE 1
2313-#define STRIPE_READ 2
2314-
2315-struct raid5_data {
2316- struct stripe_head **stripe_hashtbl;
2317- struct md_dev *mddev;
2318- struct md_thread *thread, *resync_thread;
2319- struct disk_info disks[MD_SB_DISKS];
2320- struct disk_info *spare;
2321- int buffer_size;
2322- int chunk_size, level, algorithm;
2323- int raid_disks, working_disks, failed_disks;
2324- int sector_count;
2325- unsigned long next_sector;
2326- atomic_t nr_handle;
2327- struct stripe_head *next_free_stripe;
2328- int nr_stripes;
2329- int resync_parity;
2330- int max_nr_stripes;
2331- int clock;
2332- int nr_hashed_stripes;
2333- int nr_locked_stripes;
2334- int nr_pending_stripes;
2335- int nr_cached_stripes;
2336-
2337- /*
2338- * Free stripes pool
2339- */
2340- int nr_free_sh;
2341- struct stripe_head *free_sh_list;
2342- struct wait_queue *wait_for_stripe;
2343-};
2344-
2345-#endif
2346-
2347-/*
2348- * Our supported algorithms
2349- */
2350-#define ALGORITHM_LEFT_ASYMMETRIC 0
2351-#define ALGORITHM_RIGHT_ASYMMETRIC 1
2352-#define ALGORITHM_LEFT_SYMMETRIC 2
2353-#define ALGORITHM_RIGHT_SYMMETRIC 3
2354-
2355-#endif
2356--- linux/include/linux/sysctl.h.orig Mon Dec 11 01:49:44 2000
2357+++ linux/include/linux/sysctl.h Tue Jan 16 13:42:03 2001
2358@@ -435,6 +435,7 @@
2359 enum {
2360 DEV_CDROM=1,
2361 DEV_HWMON=2,
2362+ DEV_MD=3,
2363 DEV_MAC_HID=5
2364 };
2365
2366@@ -446,6 +447,11 @@
2367 DEV_CDROM_DEBUG=4,
2368 DEV_CDROM_LOCK=5,
2369 DEV_CDROM_CHECK_MEDIA=6
2370+};
2371+
2372+/* /proc/sys/dev/md */
2373+enum {
2374+ DEV_MD_SPEED_LIMIT=1
2375 };
2376
2377 /* /proc/sys/dev/mac_hid */
2378--- linux/include/asm-i386/md.h.orig Fri May 8 09:17:13 1998
2379+++ linux/include/asm-i386/md.h Tue Jan 16 13:42:03 2001
2380@@ -1,13 +0,0 @@
2381-/* $Id$
2382- * md.h: High speed xor_block operation for RAID4/5
2383- *
2384- */
2385-
2386-#ifndef __ASM_MD_H
2387-#define __ASM_MD_H
2388-
2389-/* #define HAVE_ARCH_XORBLOCK */
2390-
2391-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2392-
2393-#endif /* __ASM_MD_H */
2394--- linux/include/asm-alpha/md.h.orig Fri May 8 09:17:13 1998
2395+++ linux/include/asm-alpha/md.h Tue Jan 16 13:42:03 2001
2396@@ -1,13 +0,0 @@
2397-/* $Id$
2398- * md.h: High speed xor_block operation for RAID4/5
2399- *
2400- */
2401-
2402-#ifndef __ASM_MD_H
2403-#define __ASM_MD_H
2404-
2405-/* #define HAVE_ARCH_XORBLOCK */
2406-
2407-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2408-
2409-#endif /* __ASM_MD_H */
2410--- linux/include/asm-m68k/md.h.orig Fri May 8 09:15:22 1998
2411+++ linux/include/asm-m68k/md.h Tue Jan 16 13:42:04 2001
2412@@ -1,13 +0,0 @@
2413-/* $Id$
2414- * md.h: High speed xor_block operation for RAID4/5
2415- *
2416- */
2417-
2418-#ifndef __ASM_MD_H
2419-#define __ASM_MD_H
2420-
2421-/* #define HAVE_ARCH_XORBLOCK */
2422-
2423-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2424-
2425-#endif /* __ASM_MD_H */
2426--- linux/include/asm-sparc/md.h.orig Tue Jan 13 00:15:54 1998
2427+++ linux/include/asm-sparc/md.h Tue Jan 16 13:42:04 2001
2428@@ -1,13 +0,0 @@
2429-/* $Id$
2430- * md.h: High speed xor_block operation for RAID4/5
2431- *
2432- */
2433-
2434-#ifndef __ASM_MD_H
2435-#define __ASM_MD_H
2436-
2437-/* #define HAVE_ARCH_XORBLOCK */
2438-
2439-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2440-
2441-#endif /* __ASM_MD_H */
2442--- linux/include/asm-ppc/md.h.orig Wed Oct 27 02:53:42 1999
2443+++ linux/include/asm-ppc/md.h Tue Jan 16 13:42:04 2001
2444@@ -1,13 +0,0 @@
2445-/* $Id$
2446- * md.h: High speed xor_block operation for RAID4/5
2447- *
2448- */
2449-
2450-#ifndef __ASM_MD_H
2451-#define __ASM_MD_H
2452-
2453-/* #define HAVE_ARCH_XORBLOCK */
2454-
2455-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2456-
2457-#endif /* __ASM_MD_H */
2458--- linux/include/asm-sparc64/md.h.orig Tue Jan 13 00:15:58 1998
2459+++ linux/include/asm-sparc64/md.h Tue Jan 16 13:42:04 2001
2460@@ -1,91 +0,0 @@
2461-/* $Id$
2462- * md.h: High speed xor_block operation for RAID4/5
2463- * utilizing the UltraSparc Visual Instruction Set.
2464- *
2465- * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
2466- */
2467-
2468-#ifndef __ASM_MD_H
2469-#define __ASM_MD_H
2470-
2471-#include <asm/head.h>
2472-#include <asm/asi.h>
2473-
2474-#define HAVE_ARCH_XORBLOCK
2475-
2476-#define MD_XORBLOCK_ALIGNMENT 64
2477-
2478-/* void __xor_block (char *dest, char *src, long len)
2479- * {
2480- * while (len--) *dest++ ^= *src++;
2481- * }
2482- *
2483- * Requirements:
2484- * !(((long)dest | (long)src) & (MD_XORBLOCK_ALIGNMENT - 1)) &&
2485- * !(len & 127) && len >= 256
2486- */
2487-
2488-static inline void __xor_block (char *dest, char *src, long len)
2489-{
2490- __asm__ __volatile__ ("
2491- wr %%g0, %3, %%fprs
2492- wr %%g0, %4, %%asi
2493- membar #LoadStore|#StoreLoad|#StoreStore
2494- sub %2, 128, %2
2495- ldda [%0] %4, %%f0
2496- ldda [%1] %4, %%f16
2497-1: ldda [%0 + 64] %%asi, %%f32
2498- fxor %%f0, %%f16, %%f16
2499- fxor %%f2, %%f18, %%f18
2500- fxor %%f4, %%f20, %%f20
2501- fxor %%f6, %%f22, %%f22
2502- fxor %%f8, %%f24, %%f24
2503- fxor %%f10, %%f26, %%f26
2504- fxor %%f12, %%f28, %%f28
2505- fxor %%f14, %%f30, %%f30
2506- stda %%f16, [%0] %4
2507- ldda [%1 + 64] %%asi, %%f48
2508- ldda [%0 + 128] %%asi, %%f0
2509- fxor %%f32, %%f48, %%f48
2510- fxor %%f34, %%f50, %%f50
2511- add %0, 128, %0
2512- fxor %%f36, %%f52, %%f52
2513- add %1, 128, %1
2514- fxor %%f38, %%f54, %%f54
2515- subcc %2, 128, %2
2516- fxor %%f40, %%f56, %%f56
2517- fxor %%f42, %%f58, %%f58
2518- fxor %%f44, %%f60, %%f60
2519- fxor %%f46, %%f62, %%f62
2520- stda %%f48, [%0 - 64] %%asi
2521- bne,pt %%xcc, 1b
2522- ldda [%1] %4, %%f16
2523- ldda [%0 + 64] %%asi, %%f32
2524- fxor %%f0, %%f16, %%f16
2525- fxor %%f2, %%f18, %%f18
2526- fxor %%f4, %%f20, %%f20
2527- fxor %%f6, %%f22, %%f22
2528- fxor %%f8, %%f24, %%f24
2529- fxor %%f10, %%f26, %%f26
2530- fxor %%f12, %%f28, %%f28
2531- fxor %%f14, %%f30, %%f30
2532- stda %%f16, [%0] %4
2533- ldda [%1 + 64] %%asi, %%f48
2534- membar #Sync
2535- fxor %%f32, %%f48, %%f48
2536- fxor %%f34, %%f50, %%f50
2537- fxor %%f36, %%f52, %%f52
2538- fxor %%f38, %%f54, %%f54
2539- fxor %%f40, %%f56, %%f56
2540- fxor %%f42, %%f58, %%f58
2541- fxor %%f44, %%f60, %%f60
2542- fxor %%f46, %%f62, %%f62
2543- stda %%f48, [%0 + 64] %%asi
2544- membar #Sync|#StoreStore|#StoreLoad
2545- wr %%g0, 0, %%fprs
2546- " : :
2547- "r" (dest), "r" (src), "r" (len), "i" (FPRS_FEF), "i" (ASI_BLK_P) :
2548- "cc", "memory");
2549-}
2550-
2551-#endif /* __ASM_MD_H */
2552--- linux/drivers/block/Config.in.orig Mon Dec 11 01:49:41 2000
2553+++ linux/drivers/block/Config.in Tue Jan 16 13:42:04 2001
2554@@ -103,10 +103,13 @@
2555 fi
2556 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
2557 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
2558+ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
2559 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
2560 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
2561 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
2562 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
2563+ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
2564+ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
2565 fi
2566 if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
2567 bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
2568--- linux/drivers/block/Makefile.orig Mon Dec 11 01:49:41 2000
2569+++ linux/drivers/block/Makefile Tue Jan 16 13:42:04 2001
2570@@ -294,10 +294,28 @@
2571 endif
2572
2573 ifeq ($(CONFIG_MD_RAID5),y)
2574+LX_OBJS += xor.o
2575 L_OBJS += raid5.o
2576 else
2577 ifeq ($(CONFIG_MD_RAID5),m)
2578+ LX_OBJS += xor.o
2579 M_OBJS += raid5.o
2580+ endif
2581+endif
2582+
2583+ifeq ($(CONFIG_MD_TRANSLUCENT),y)
2584+L_OBJS += translucent.o
2585+else
2586+ ifeq ($(CONFIG_MD_TRANSLUCENT),m)
2587+ M_OBJS += translucent.o
2588+ endif
2589+endif
2590+
2591+ifeq ($(CONFIG_MD_HSM),y)
2592+L_OBJS += hsm.o
2593+else
2594+ ifeq ($(CONFIG_MD_HSM),m)
2595+ M_OBJS += hsm.o
2596 endif
2597 endif
2598
2599--- linux/drivers/block/genhd.c.orig Tue Jan 16 13:30:06 2001
2600+++ linux/drivers/block/genhd.c Tue Jan 16 13:42:04 2001
2601@@ -28,6 +28,7 @@
2602 #include <linux/string.h>
2603 #include <linux/blk.h>
2604 #include <linux/init.h>
2605+#include <linux/raid/md.h>
2606
2607 #ifdef CONFIG_ARCH_S390
2608 #include <asm/dasd.h>
2609@@ -1785,6 +1786,9 @@
2610 else
2611 #endif
2612 rd_load();
2613+#endif
2614+#ifdef CONFIG_BLK_DEV_MD
2615+ autodetect_raid();
2616 #endif
2617 #ifdef CONFIG_MD_BOOT
2618 md_setup_drive();
2619--- linux/drivers/block/hsm.c.orig Tue Jan 16 13:42:04 2001
2620+++ linux/drivers/block/hsm.c Tue Jan 16 13:42:04 2001
2621@@ -0,0 +1,840 @@
2622+/*
2623+ hsm.c : HSM RAID driver for Linux
2624+ Copyright (C) 1998 Ingo Molnar
2625+
2626+ HSM mode management functions.
2627+
2628+ This program is free software; you can redistribute it and/or modify
2629+ it under the terms of the GNU General Public License as published by
2630+ the Free Software Foundation; either version 2, or (at your option)
2631+ any later version.
2632+
2633+ You should have received a copy of the GNU General Public License
2634+ (for example /usr/src/linux/COPYING); if not, write to the Free
2635+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
2636+*/
2637+
2638+#include <linux/module.h>
2639+
2640+#include <linux/raid/md.h>
2641+#include <linux/malloc.h>
2642+
2643+#include <linux/raid/hsm.h>
2644+#include <linux/blk.h>
2645+
2646+#define MAJOR_NR MD_MAJOR
2647+#define MD_DRIVER
2648+#define MD_PERSONALITY
2649+
2650+
2651+#define DEBUG_HSM 1
2652+
2653+#if DEBUG_HSM
2654+#define dprintk(x,y...) printk(x,##y)
2655+#else
2656+#define dprintk(x,y...) do { } while (0)
2657+#endif
2658+
2659+void print_bh(struct buffer_head *bh)
2660+{
2661+ dprintk("bh %p: %lx %lx %x %x %lx %p %lx %p %x %p %x %lx\n", bh,
2662+ bh->b_blocknr, bh->b_size, bh->b_dev, bh->b_rdev,
2663+ bh->b_rsector, bh->b_this_page, bh->b_state,
2664+ bh->b_next_free, bh->b_count, bh->b_data,
2665+ bh->b_list, bh->b_flushtime
2666+ );
2667+}
2668+
2669+static int check_bg (pv_t *pv, pv_block_group_t * bg)
2670+{
2671+ int i, free = 0;
2672+
2673+ dprintk("checking bg ...\n");
2674+
2675+ for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
2676+ if (pv_pptr_free(bg->blocks + i)) {
2677+ free++;
2678+ if (test_bit(i, bg->used_bitmap)) {
2679+ printk("hm, bit %d set?\n", i);
2680+ }
2681+ } else {
2682+ if (!test_bit(i, bg->used_bitmap)) {
2683+ printk("hm, bit %d not set?\n", i);
2684+ }
2685+ }
2686+ }
2687+ dprintk("%d free blocks in bg ...\n", free);
2688+ return free;
2689+}
2690+
2691+static void get_bg (pv_t *pv, pv_bg_desc_t *desc, int nr)
2692+{
2693+ unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
2694+ struct buffer_head *bh;
2695+
2696+ dprintk("... getting BG at %u ...\n", bg_pos);
2697+
2698+ bh = bread (pv->dev, bg_pos, HSM_BLOCKSIZE);
2699+ if (!bh) {
2700+ MD_BUG();
2701+ return;
2702+ }
2703+ desc->bg = (pv_block_group_t *) bh->b_data;
2704+ desc->free_blocks = check_bg(pv, desc->bg);
2705+}
2706+
2707+static int find_free_block (lv_t *lv, pv_t *pv, pv_bg_desc_t *desc, int nr,
2708+ unsigned int lblock, lv_lptr_t * index)
2709+{
2710+ int i;
2711+
2712+ for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
2713+ pv_pptr_t * bptr = desc->bg->blocks + i;
2714+ if (pv_pptr_free(bptr)) {
2715+ unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
2716+
2717+ if (test_bit(i, desc->bg->used_bitmap)) {
2718+ MD_BUG();
2719+ continue;
2720+ }
2721+ bptr->u.used.owner.log_id = lv->log_id;
2722+ bptr->u.used.owner.log_index = lblock;
2723+ index->data.phys_nr = pv->phys_nr;
2724+ index->data.phys_block = bg_pos + i + 1;
2725+ set_bit(i, desc->bg->used_bitmap);
2726+ desc->free_blocks--;
2727+ dprintk(".....free blocks left in bg %p: %d\n",
2728+ desc->bg, desc->free_blocks);
2729+ return 0;
2730+ }
2731+ }
2732+ return -ENOSPC;
2733+}
2734+
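Putting the constants of get_bg() and find_free_block() together with init_pv()/init_vg() further down (which read blocks 0 and 1): block 0 of a PV holds the PV superblock, block 1 the VG superblock, group nr's descriptor block sits at nr * pv_bg_size + 2, and slot i of that group maps to the block just past the descriptor plus i. A standalone sketch of that arithmetic with a made-up group size:

    #include <stdio.h>

    /* mirrors the arithmetic in get_bg() and find_free_block() */
    static unsigned int bg_desc_block(unsigned int nr, unsigned int bg_size)
    {
        return nr * bg_size + 2;                    /* blocks 0/1: PV and VG superblocks */
    }

    static unsigned int slot_block(unsigned int nr, unsigned int i, unsigned int bg_size)
    {
        return bg_desc_block(nr, bg_size) + i + 1;  /* data blocks follow the descriptor */
    }

    int main(void)
    {
        unsigned int bg_size = 1024;                /* made-up pv_bg_size */

        printf("group 0 descriptor at block %u\n", bg_desc_block(0, bg_size));
        printf("group 3, slot 17 -> block %u\n", slot_block(3, 17, bg_size));
        return 0;
    }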
2735+static int __get_free_block (lv_t *lv, pv_t *pv,
2736+ unsigned int lblock, lv_lptr_t * index)
2737+{
2738+ int i;
2739+
2740+ dprintk("trying to get free block for lblock %d ...\n", lblock);
2741+
2742+ for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
2743+ pv_bg_desc_t *desc = pv->bg_array + i;
2744+
2745+ dprintk("looking at desc #%d (%p)...\n", i, desc->bg);
2746+ if (!desc->bg)
2747+ get_bg(pv, desc, i);
2748+
2749+ if (desc->bg && desc->free_blocks)
2750+ return find_free_block(lv, pv, desc, i,
2751+ lblock, index);
2752+ }
2753+ dprintk("hsm: pv %s full!\n", partition_name(pv->dev));
2754+ return -ENOSPC;
2755+}
2756+
2757+static int get_free_block (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
2758+{
2759+ int err;
2760+
2761+ if (!lv->free_indices)
2762+ return -ENOSPC;
2763+
2764+ /* fix me */
2765+ err = __get_free_block(lv, lv->vg->pv_array + 0, lblock, index);
2766+
2767+ if (err || !index->data.phys_block) {
2768+ MD_BUG();
2769+ return -ENOSPC;
2770+ }
2771+
2772+ lv->free_indices--;
2773+
2774+ return 0;
2775+}
2776+
2777+/*
2778+ * fix me: wordsize assumptions ...
2779+ */
2780+#define INDEX_BITS 8
2781+#define INDEX_DEPTH (32/INDEX_BITS)
2782+#define INDEX_MASK ((1<<INDEX_BITS) - 1)
2783+
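find_index() and alloc_fixed_index() below walk the index tree by treating the 32-bit logical block number as INDEX_DEPTH = 4 digits of INDEX_BITS = 8 bits each, the most significant digit selecting the slot in the root group. A standalone sketch of that decomposition (the sample block number is arbitrary):

    #include <stdio.h>

    #define INDEX_BITS  8
    #define INDEX_DEPTH (32 / INDEX_BITS)
    #define INDEX_MASK  ((1 << INDEX_BITS) - 1)

    int main(void)
    {
        unsigned int lblock = 0x00a1b2c3;   /* arbitrary logical block number */
        int l;

        /* same loop as find_index(): level INDEX_DEPTH-1 indexes the root group */
        for (l = INDEX_DEPTH - 1; l >= 0; l--)
            printf("level %d -> slot %u\n", l, (lblock >> (INDEX_BITS * l)) & INDEX_MASK);
        return 0;
    }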
2784+static void print_index_list (lv_t *lv, lv_lptr_t *index)
2785+{
2786+ lv_lptr_t *tmp;
2787+ int i;
2788+
2789+ dprintk("... block <%u,%u,%x> [.", index->data.phys_nr,
2790+ index->data.phys_block, index->cpu_addr);
2791+
2792+ tmp = index_child(index);
2793+ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
2794+ if (index_block(lv, tmp))
2795+ dprintk("(%d->%d)", i, index_block(lv, tmp));
2796+ tmp++;
2797+ }
2798+ dprintk(".]\n");
2799+}
2800+
2801+static int read_index_group (lv_t *lv, lv_lptr_t *index)
2802+{
2803+ lv_lptr_t *index_group, *tmp;
2804+ struct buffer_head *bh;
2805+ int i;
2806+
2807+ dprintk("reading index group <%s:%d>\n",
2808+ partition_name(index_dev(lv, index)), index_block(lv, index));
2809+
2810+ bh = bread(index_dev(lv, index), index_block(lv, index), HSM_BLOCKSIZE);
2811+ if (!bh) {
2812+ MD_BUG();
2813+ return -EIO;
2814+ }
2815+ if (!buffer_uptodate(bh))
2816+ MD_BUG();
2817+
2818+ index_group = (lv_lptr_t *) bh->b_data;
2819+ tmp = index_group;
2820+ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
2821+ if (index_block(lv, tmp)) {
2822+ dprintk("index group has BLOCK %d, non-present.\n", i);
2823+ tmp->cpu_addr = 0;
2824+ }
2825+ tmp++;
2826+ }
2827+ index->cpu_addr = ptr_to_cpuaddr(index_group);
2828+
2829+ dprintk("have read index group %p at block %d.\n",
2830+ index_group, index_block(lv, index));
2831+ print_index_list(lv, index);
2832+
2833+ return 0;
2834+}
2835+
2836+static int alloc_index_group (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
2837+{
2838+ struct buffer_head *bh;
2839+ lv_lptr_t * index_group;
2840+
2841+ if (get_free_block(lv, lblock, index))
2842+ return -ENOSPC;
2843+
2844+ dprintk("creating block for index group <%s:%d>\n",
2845+ partition_name(index_dev(lv, index)), index_block(lv, index));
2846+
2847+ bh = getblk(index_dev(lv, index),
2848+ index_block(lv, index), HSM_BLOCKSIZE);
2849+
2850+ index_group = (lv_lptr_t *) bh->b_data;
2851+ md_clear_page(index_group);
2852+ mark_buffer_uptodate(bh, 1);
2853+
2854+ index->cpu_addr = ptr_to_cpuaddr(index_group);
2855+
2856+ dprintk("allocated index group %p at block %d.\n",
2857+ index_group, index_block(lv, index));
2858+ return 0;
2859+}
2860+
2861+static lv_lptr_t * alloc_fixed_index (lv_t *lv, unsigned int lblock)
2862+{
2863+ lv_lptr_t * index = index_child(&lv->root_index);
2864+ int idx, l;
2865+
2866+ for (l = INDEX_DEPTH-1; l >= 0; l--) {
2867+ idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
2868+ index += idx;
2869+ if (!l)
2870+ break;
2871+ if (!index_present(index)) {
2872+ dprintk("no group, level %u, pos %u\n", l, idx);
2873+ if (alloc_index_group(lv, lblock, index))
2874+ return NULL;
2875+ }
2876+ index = index_child(index);
2877+ }
2878+ if (!index_block(lv,index)) {
2879+ dprintk("no data, pos %u\n", idx);
2880+ if (get_free_block(lv, lblock, index))
2881+ return NULL;
2882+ return index;
2883+ }
2884+ MD_BUG();
2885+ return index;
2886+}
2887+
2888+static lv_lptr_t * find_index (lv_t *lv, unsigned int lblock)
2889+{
2890+ lv_lptr_t * index = index_child(&lv->root_index);
2891+ int idx, l;
2892+
2893+ for (l = INDEX_DEPTH-1; l >= 0; l--) {
2894+ idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
2895+ index += idx;
2896+ if (!l)
2897+ break;
2898+ if (index_free(index))
2899+ return NULL;
2900+ if (!index_present(index))
2901+ read_index_group(lv, index);
2902+ if (!index_present(index)) {
2903+ MD_BUG();
2904+ return NULL;
2905+ }
2906+ index = index_child(index);
2907+ }
2908+ if (!index_block(lv,index))
2909+ return NULL;
2910+ return index;
2911+}
2912+
2913+static int read_root_index(lv_t *lv)
2914+{
2915+ int err;
2916+ lv_lptr_t *index = &lv->root_index;
2917+
2918+ if (!index_block(lv, index)) {
2919+ printk("LV has no root index yet, creating.\n");
2920+
2921+ err = alloc_index_group (lv, 0, index);
2922+ if (err) {
2923+ printk("could not create index group, err:%d\n", err);
2924+ return err;
2925+ }
2926+ lv->vg->vg_sb->lv_array[lv->log_id].lv_root_idx =
2927+ lv->root_index.data;
2928+ } else {
2929+ printk("LV already has a root index.\n");
2930+ printk("... at <%s:%d>.\n",
2931+ partition_name(index_dev(lv, index)),
2932+ index_block(lv, index));
2933+
2934+ read_index_group(lv, index);
2935+ }
2936+ return 0;
2937+}
2938+
2939+static int init_pv(pv_t *pv)
2940+{
2941+ struct buffer_head *bh;
2942+ pv_sb_t *pv_sb;
2943+
2944+ bh = bread (pv->dev, 0, HSM_BLOCKSIZE);
2945+ if (!bh) {
2946+ MD_BUG();
2947+ return -1;
2948+ }
2949+
2950+ pv_sb = (pv_sb_t *) bh->b_data;
2951+ pv->pv_sb = pv_sb;
2952+
2953+ if (pv_sb->pv_magic != HSM_PV_SB_MAGIC) {
2954+ printk("%s is not a PV, has magic %x instead of %x!\n",
2955+ partition_name(pv->dev), pv_sb->pv_magic,
2956+ HSM_PV_SB_MAGIC);
2957+ return -1;
2958+ }
2959+ printk("%s detected as a valid PV (#%d).\n", partition_name(pv->dev),
2960+ pv->phys_nr);
2961+ printk("... created under HSM version %d.%d.%d, at %x.\n",
2962+ pv_sb->pv_major, pv_sb->pv_minor, pv_sb->pv_patch, pv_sb->pv_ctime);
2963+ printk("... total # of blocks: %d (%d left unallocated).\n",
2964+ pv_sb->pv_total_size, pv_sb->pv_blocks_left);
2965+
2966+ printk("... block size: %d bytes.\n", pv_sb->pv_block_size);
2967+ printk("... block descriptor size: %d bytes.\n", pv_sb->pv_pptr_size);
2968+ printk("... block group size: %d blocks.\n", pv_sb->pv_bg_size);
2969+ printk("... # of block groups: %d.\n", pv_sb->pv_block_groups);
2970+
2971+ if (pv_sb->pv_block_groups*sizeof(pv_bg_desc_t) > PAGE_SIZE) {
2972+ MD_BUG();
2973+ return 1;
2974+ }
2975+ pv->bg_array = (pv_bg_desc_t *)__get_free_page(GFP_KERNEL);
2976+ if (!pv->bg_array) {
2977+ MD_BUG();
2978+ return 1;
2979+ }
2980+ memset(pv->bg_array, 0, PAGE_SIZE);
2981+
2982+ return 0;
2983+}
2984+
2985+static int free_pv(pv_t *pv)
2986+{
2987+ struct buffer_head *bh;
2988+
2989+ dprintk("freeing PV %d ...\n", pv->phys_nr);
2990+
2991+ if (pv->bg_array) {
2992+ int i;
2993+
2994+ dprintk(".... freeing BGs ...\n");
2995+ for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
2996+ unsigned int bg_pos = i * pv->pv_sb->pv_bg_size + 2;
2997+ pv_bg_desc_t *desc = pv->bg_array + i;
2998+
2999+ if (desc->bg) {
3000+ dprintk(".... freeing BG %d ...\n", i);
3001+ bh = getblk (pv->dev, bg_pos, HSM_BLOCKSIZE);
3002+ mark_buffer_dirty(bh, 1);
3003+ brelse(bh);
3004+ brelse(bh);
3005+ }
3006+ }
3007+ free_page((unsigned long)pv->bg_array);
3008+ } else
3009+ MD_BUG();
3010+
3011+ bh = getblk (pv->dev, 0, HSM_BLOCKSIZE);
3012+ if (!bh) {
3013+ MD_BUG();
3014+ return -1;
3015+ }
3016+ mark_buffer_dirty(bh, 1);
3017+ brelse(bh);
3018+ brelse(bh);
3019+
3020+ return 0;
3021+}
3022+
3023+struct semaphore hsm_sem = MUTEX;
3024+
3025+#define HSM_SECTORS (HSM_BLOCKSIZE/512)
3026+
3027+static int hsm_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
3028+ unsigned long *rsector, unsigned long bsectors)
3029+{
3030+ lv_t *lv = kdev_to_lv(dev);
3031+ lv_lptr_t *index;
3032+ unsigned int lblock = *rsector / HSM_SECTORS;
3033+ unsigned int offset = *rsector % HSM_SECTORS;
3034+ int err = -EIO;
3035+
3036+ if (!lv) {
3037+ printk("HSM: md%d not a Logical Volume!\n", mdidx(mddev));
3038+ goto out;
3039+ }
3040+ if (offset + bsectors > HSM_SECTORS) {
3041+ MD_BUG();
3042+ goto out;
3043+ }
3044+ down(&hsm_sem);
3045+ index = find_index(lv, lblock);
3046+ if (!index) {
3047+ printk("no block %u yet ... allocating\n", lblock);
3048+ index = alloc_fixed_index(lv, lblock);
3049+ }
3050+
3051+ err = 0;
3052+
3053+ printk(" %u <%s : %ld(%ld)> -> ", lblock,
3054+ partition_name(*rdev), *rsector, bsectors);
3055+
3056+ *rdev = index_dev(lv, index);
3057+ *rsector = index_block(lv, index) * HSM_SECTORS + offset;
3058+
3059+ printk(" <%s : %ld> %u\n",
3060+ partition_name(*rdev), *rsector, index_block(lv, index));
3061+
3062+ up(&hsm_sem);
3063+out:
3064+ return err;
3065+}
3066+
3067+static void free_index (lv_t *lv, lv_lptr_t * index)
3068+{
3069+ struct buffer_head *bh;
3070+
3071+ printk("trying to get cached block for index group <%s:%d>\n",
3072+ partition_name(index_dev(lv, index)), index_block(lv, index));
3073+
3074+ bh = getblk(index_dev(lv, index), index_block(lv, index),HSM_BLOCKSIZE);
3075+
3076+ printk("....FREEING ");
3077+ print_index_list(lv, index);
3078+
3079+ if (bh) {
3080+ if (!buffer_uptodate(bh))
3081+ MD_BUG();
3082+ if ((lv_lptr_t *)bh->b_data != index_child(index)) {
3083+ printk("huh? b_data is %p, index content is %p.\n",
3084+ bh->b_data, index_child(index));
3085+ } else
3086+ printk("good, b_data == index content == %p.\n",
3087+ index_child(index));
3088+ printk("b_count == %d, writing.\n", bh->b_count);
3089+ mark_buffer_dirty(bh, 1);
3090+ brelse(bh);
3091+ brelse(bh);
3092+ printk("done.\n");
3093+ } else {
3094+ printk("FAILED!\n");
3095+ }
3096+ print_index_list(lv, index);
3097+ index_child(index) = NULL;
3098+}
3099+
3100+static void free_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
3101+{
3102+ char dots [3*8];
3103+ lv_lptr_t * index;
3104+ int i, nr_dots;
3105+
3106+ nr_dots = (INDEX_DEPTH-level)*3;
3107+ memcpy(dots,"...............",nr_dots);
3108+ dots[nr_dots] = 0;
3109+
3110+ dprintk("%s level %d index group block:\n", dots, level);
3111+
3112+
3113+ index = index_0;
3114+ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
3115+ if (index->data.phys_block) {
3116+ dprintk("%s block <%u,%u,%x>\n", dots,
3117+ index->data.phys_nr,
3118+ index->data.phys_block,
3119+ index->cpu_addr);
3120+ if (level && index_present(index)) {
3121+ dprintk("%s==> deeper one level\n", dots);
3122+ free_index_group(lv, level-1,
3123+ index_child(index));
3124+ dprintk("%s freeing index group block %p ...",
3125+ dots, index_child(index));
3126+ free_index(lv, index);
3127+ }
3128+ }
3129+ index++;
3130+ }
3131+ dprintk("%s DONE: level %d index group block.\n", dots, level);
3132+}
3133+
3134+static void free_lv_indextree (lv_t *lv)
3135+{
3136+ dprintk("freeing LV %d ...\n", lv->log_id);
3137+ dprintk("..root index: %p\n", index_child(&lv->root_index));
3138+ dprintk("..INDEX TREE:\n");
3139+ free_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
3140+ dprintk("..freeing root index %p ...", index_child(&lv->root_index));
3141+ dprintk("root block <%u,%u,%x>\n", lv->root_index.data.phys_nr,
3142+ lv->root_index.data.phys_block, lv->root_index.cpu_addr);
3143+ free_index(lv, &lv->root_index);
3144+ dprintk("..INDEX TREE done.\n");
3145+ fsync_dev(lv->vg->pv_array[0].dev); /* fix me */
3146+ lv->vg->vg_sb->lv_array[lv->log_id].lv_free_indices = lv->free_indices;
3147+}
3148+
3149+static void print_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
3150+{
3151+ char dots [3*5];
3152+ lv_lptr_t * index;
3153+ int i, nr_dots;
3154+
3155+ nr_dots = (INDEX_DEPTH-level)*3;
3156+ memcpy(dots,"...............",nr_dots);
3157+ dots[nr_dots] = 0;
3158+
3159+ dprintk("%s level %d index group block:\n", dots, level);
3160+
3161+
3162+ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
3163+ index = index_0 + i;
3164+ if (index->data.phys_block) {
3165+ dprintk("%s block <%u,%u,%x>\n", dots,
3166+ index->data.phys_nr,
3167+ index->data.phys_block,
3168+ index->cpu_addr);
3169+ if (level && index_present(index)) {
3170+ dprintk("%s==> deeper one level\n", dots);
3171+ print_index_group(lv, level-1,
3172+ index_child(index));
3173+ }
3174+ }
3175+ }
3176+ dprintk("%s DONE: level %d index group block.\n", dots, level);
3177+}
3178+
3179+static void print_lv (lv_t *lv)
3180+{
3181+ dprintk("printing LV %d ...\n", lv->log_id);
3182+ dprintk("..root index: %p\n", index_child(&lv->root_index));
3183+ dprintk("..INDEX TREE:\n");
3184+ print_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
3185+ dprintk("..INDEX TREE done.\n");
3186+}
3187+
3188+static int map_lv (lv_t *lv)
3189+{
3190+ kdev_t dev = lv->dev;
3191+ unsigned int nr = MINOR(dev);
3192+ mddev_t *mddev = lv->vg->mddev;
3193+
3194+ if (MAJOR(dev) != MD_MAJOR) {
3195+ MD_BUG();
3196+ return -1;
3197+ }
3198+ if (kdev_to_mddev(dev)) {
3199+ MD_BUG();
3200+ return -1;
3201+ }
3202+ md_hd_struct[nr].start_sect = 0;
3203+ md_hd_struct[nr].nr_sects = md_size[mdidx(mddev)] << 1;
3204+ md_size[nr] = md_size[mdidx(mddev)];
3205+ add_mddev_mapping(mddev, dev, lv);
3206+
3207+ return 0;
3208+}
3209+
3210+static int unmap_lv (lv_t *lv)
3211+{
3212+ kdev_t dev = lv->dev;
3213+ unsigned int nr = MINOR(dev);
3214+
3215+ if (MAJOR(dev) != MD_MAJOR) {
3216+ MD_BUG();
3217+ return -1;
3218+ }
3219+ md_hd_struct[nr].start_sect = 0;
3220+ md_hd_struct[nr].nr_sects = 0;
3221+ md_size[nr] = 0;
3222+ del_mddev_mapping(lv->vg->mddev, dev);
3223+
3224+ return 0;
3225+}
3226+
3227+static int init_vg (vg_t *vg)
3228+{
3229+ int i;
3230+ lv_t *lv;
3231+ kdev_t dev;
3232+ vg_sb_t *vg_sb;
3233+ struct buffer_head *bh;
3234+ lv_descriptor_t *lv_desc;
3235+
3236+ /*
3237+ * fix me: read all PVs and compare the SB
3238+ */
3239+ dev = vg->pv_array[0].dev;
3240+ bh = bread (dev, 1, HSM_BLOCKSIZE);
3241+ if (!bh) {
3242+ MD_BUG();
3243+ return -1;
3244+ }
3245+
3246+ vg_sb = (vg_sb_t *) bh->b_data;
3247+ vg->vg_sb = vg_sb;
3248+
3249+ if (vg_sb->vg_magic != HSM_VG_SB_MAGIC) {
3250+ printk("%s is not a valid VG, has magic %x instead of %x!\n",
3251+ partition_name(dev), vg_sb->vg_magic,
3252+ HSM_VG_SB_MAGIC);
3253+ return -1;
3254+ }
3255+
3256+ vg->nr_lv = 0;
3257+ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
3258+ unsigned int id;
3259+ lv_desc = vg->vg_sb->lv_array + i;
3260+
3261+ id = lv_desc->lv_id;
3262+ if (!id) {
3263+ printk("... LV desc %d empty\n", i);
3264+ continue;
3265+ }
3266+ if (id >= HSM_MAX_LVS_PER_VG) {
3267+ MD_BUG();
3268+ continue;
3269+ }
3270+
3271+ lv = vg->lv_array + id;
3272+ if (lv->vg) {
3273+ MD_BUG();
3274+ continue;
3275+ }
3276+ lv->log_id = id;
3277+ lv->vg = vg;
3278+ lv->max_indices = lv_desc->lv_max_indices;
3279+ lv->free_indices = lv_desc->lv_free_indices;
3280+ lv->root_index.data = lv_desc->lv_root_idx;
3281+ lv->dev = MKDEV(MD_MAJOR, lv_desc->md_id);
3282+
3283+ vg->nr_lv++;
3284+
3285+ map_lv(lv);
3286+ if (read_root_index(lv)) {
3287+ vg->nr_lv--;
3288+ unmap_lv(lv);
3289+ memset(lv, 0, sizeof(*lv));
3290+ }
3291+ }
3292+ if (vg->nr_lv != vg_sb->nr_lvs)
3293+ MD_BUG();
3294+
3295+ return 0;
3296+}
3297+
3298+static int hsm_run (mddev_t *mddev)
3299+{
3300+ int i;
3301+ vg_t *vg;
3302+ mdk_rdev_t *rdev;
3303+
3304+ MOD_INC_USE_COUNT;
3305+
3306+ vg = kmalloc (sizeof (*vg), GFP_KERNEL);
3307+ if (!vg)
3308+ goto out;
3309+ memset(vg, 0, sizeof(*vg));
3310+ mddev->private = vg;
3311+ vg->mddev = mddev;
3312+
3313+ if (md_check_ordering(mddev)) {
3314+ printk("hsm: disks are not ordered, aborting!\n");
3315+ goto out;
3316+ }
3317+
3318+ set_blocksize (mddev_to_kdev(mddev), HSM_BLOCKSIZE);
3319+
3320+ vg->nr_pv = mddev->nb_dev;
3321+ ITERATE_RDEV_ORDERED(mddev,rdev,i) {
3322+ pv_t *pv = vg->pv_array + i;
3323+
3324+ pv->dev = rdev->dev;
3325+ fsync_dev (pv->dev);
3326+ set_blocksize (pv->dev, HSM_BLOCKSIZE);
3327+ pv->phys_nr = i;
3328+ if (init_pv(pv))
3329+ goto out;
3330+ }
3331+
3332+ init_vg(vg);
3333+
3334+ return 0;
3335+
3336+out:
3337+ if (vg) {
3338+ kfree(vg);
3339+ mddev->private = NULL;
3340+ }
3341+ MOD_DEC_USE_COUNT;
3342+
3343+ return 1;
3344+}
3345+
3346+static int hsm_stop (mddev_t *mddev)
3347+{
3348+ lv_t *lv;
3349+ vg_t *vg;
3350+ int i;
3351+
3352+ vg = mddev_to_vg(mddev);
3353+
3354+ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
3355+ lv = vg->lv_array + i;
3356+ if (!lv->log_id)
3357+ continue;
3358+ print_lv(lv);
3359+ free_lv_indextree(lv);
3360+ unmap_lv(lv);
3361+ }
3362+ for (i = 0; i < vg->nr_pv; i++)
3363+ free_pv(vg->pv_array + i);
3364+
3365+ kfree(vg);
3366+
3367+ MOD_DEC_USE_COUNT;
3368+
3369+ return 0;
3370+}
3371+
3372+
3373+static int hsm_status (char *page, mddev_t *mddev)
3374+{
3375+ int sz = 0, i;
3376+ lv_t *lv;
3377+ vg_t *vg;
3378+
3379+ vg = mddev_to_vg(mddev);
3380+
3381+ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
3382+ lv = vg->lv_array + i;
3383+ if (!lv->log_id)
3384+ continue;
3385+ sz += sprintf(page+sz, "<LV%d %d/%d blocks used> ", lv->log_id,
3386+ lv->max_indices - lv->free_indices, lv->max_indices);
3387+ }
3388+ return sz;
3389+}
3390+
3391+
3392+static mdk_personality_t hsm_personality=
3393+{
3394+ "hsm",
3395+ hsm_map,
3396+ NULL,
3397+ NULL,
3398+ hsm_run,
3399+ hsm_stop,
3400+ hsm_status,
3401+ NULL,
3402+ 0,
3403+ NULL,
3404+ NULL,
3405+ NULL,
3406+ NULL
3407+};
3408+
3409+#ifndef MODULE
3410+
3411+md__initfunc(void hsm_init (void))
3412+{
3413+ register_md_personality (HSM, &hsm_personality);
3414+}
3415+
3416+#else
3417+
3418+int init_module (void)
3419+{
3420+ return (register_md_personality (HSM, &hsm_personality));
3421+}
3422+
3423+void cleanup_module (void)
3424+{
3425+ unregister_md_personality (HSM);
3426+}
3427+
3428+#endif
3429+
3430+/*
3431+ * This Linus-trick catches bugs via the linker.
3432+ */
3433+
3434+extern void __BUG__in__hsm_dot_c_1(void);
3435+extern void __BUG__in__hsm_dot_c_2(void);
3436+extern void __BUG__in__hsm_dot_c_3(void);
3437+extern void __BUG__in__hsm_dot_c_4(void);
3438+extern void __BUG__in__hsm_dot_c_5(void);
3439+extern void __BUG__in__hsm_dot_c_6(void);
3440+extern void __BUG__in__hsm_dot_c_7(void);
3441+
3442+void bugcatcher (void)
3443+{
3444+ if (sizeof(pv_block_group_t) != HSM_BLOCKSIZE)
3445+ __BUG__in__hsm_dot_c_1();
3446+ if (sizeof(lv_index_block_t) != HSM_BLOCKSIZE)
3447+ __BUG__in__hsm_dot_c_2();
3448+
3449+ if (sizeof(pv_sb_t) != HSM_BLOCKSIZE)
3450+ __BUG__in__hsm_dot_c_4();
3451+ if (sizeof(lv_sb_t) != HSM_BLOCKSIZE)
3452+ __BUG__in__hsm_dot_c_3();
3453+ if (sizeof(vg_sb_t) != HSM_BLOCKSIZE)
3454+ __BUG__in__hsm_dot_c_6();
3455+
3456+ if (sizeof(lv_lptr_t) != 16)
3457+ __BUG__in__hsm_dot_c_5();
3458+ if (sizeof(pv_pptr_t) != 16)
3459+ __BUG__in__hsm_dot_c_6();
3460+}
3461+
3462--- linux/drivers/block/linear.c.orig Sat Nov 8 20:39:12 1997
3463+++ linux/drivers/block/linear.c Tue Jan 16 13:42:04 2001
3464@@ -1,4 +1,3 @@
3465-
3466 /*
3467 linear.c : Multiple Devices driver for Linux
3468 Copyright (C) 1994-96 Marc ZYNGIER
3469@@ -19,186 +18,207 @@
3470
3471 #include <linux/module.h>
3472
3473-#include <linux/md.h>
3474+#include <linux/raid/md.h>
3475 #include <linux/malloc.h>
3476-#include <linux/init.h>
3477
3478-#include "linear.h"
3479+#include <linux/raid/linear.h>
3480
3481 #define MAJOR_NR MD_MAJOR
3482 #define MD_DRIVER
3483 #define MD_PERSONALITY
3484
3485-static int linear_run (int minor, struct md_dev *mddev)
3486+static int linear_run (mddev_t *mddev)
3487 {
3488- int cur=0, i, size, dev0_size, nb_zone;
3489- struct linear_data *data;
3490-
3491- MOD_INC_USE_COUNT;
3492-
3493- mddev->private=kmalloc (sizeof (struct linear_data), GFP_KERNEL);
3494- data=(struct linear_data *) mddev->private;
3495-
3496- /*
3497- Find out the smallest device. This was previously done
3498- at registry time, but since it violates modularity,
3499- I moved it here... Any comment ? ;-)
3500- */
3501-
3502- data->smallest=mddev->devices;
3503- for (i=1; i<mddev->nb_dev; i++)
3504- if (data->smallest->size > mddev->devices[i].size)
3505- data->smallest=mddev->devices+i;
3506-
3507- nb_zone=data->nr_zones=
3508- md_size[minor]/data->smallest->size +
3509- (md_size[minor]%data->smallest->size ? 1 : 0);
3510-
3511- data->hash_table=kmalloc (sizeof (struct linear_hash)*nb_zone, GFP_KERNEL);
3512-
3513- size=mddev->devices[cur].size;
3514+ linear_conf_t *conf;
3515+ struct linear_hash *table;
3516+ mdk_rdev_t *rdev;
3517+ int size, i, j, nb_zone;
3518+ unsigned int curr_offset;
3519+
3520+ MOD_INC_USE_COUNT;
3521+
3522+ conf = kmalloc (sizeof (*conf), GFP_KERNEL);
3523+ if (!conf)
3524+ goto out;
3525+ mddev->private = conf;
3526+
3527+ if (md_check_ordering(mddev)) {
3528+ printk("linear: disks are not ordered, aborting!\n");
3529+ goto out;
3530+ }
3531+ /*
3532+ * Find the smallest device.
3533+ */
3534+
3535+ conf->smallest = NULL;
3536+ curr_offset = 0;
3537+ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
3538+ dev_info_t *disk = conf->disks + j;
3539+
3540+ disk->dev = rdev->dev;
3541+ disk->size = rdev->size;
3542+ disk->offset = curr_offset;
3543+
3544+ curr_offset += disk->size;
3545+
3546+ if (!conf->smallest || (disk->size < conf->smallest->size))
3547+ conf->smallest = disk;
3548+ }
3549+
3550+ nb_zone = conf->nr_zones =
3551+ md_size[mdidx(mddev)] / conf->smallest->size +
3552+ ((md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
3553+
3554+ conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
3555+ GFP_KERNEL);
3556+ if (!conf->hash_table)
3557+ goto out;
3558+
3559+ /*
3560+ * Here we generate the linear hash table
3561+ */
3562+ table = conf->hash_table;
3563+ i = 0;
3564+ size = 0;
3565+ for (j = 0; j < mddev->nb_dev; j++) {
3566+ dev_info_t *disk = conf->disks + j;
3567+
3568+ if (size < 0) {
3569+ table->dev1 = disk;
3570+ table++;
3571+ }
3572+ size += disk->size;
3573+
3574+ while (size) {
3575+ table->dev0 = disk;
3576+ size -= conf->smallest->size;
3577+ if (size < 0)
3578+ break;
3579+ table->dev1 = NULL;
3580+ table++;
3581+ }
3582+ }
3583+ table->dev1 = NULL;
3584+
3585+ return 0;
3586+
3587+out:
3588+ if (conf)
3589+ kfree(conf);
3590+ MOD_DEC_USE_COUNT;
3591+ return 1;
3592+}
3593+
3594+static int linear_stop (mddev_t *mddev)
3595+{
3596+ linear_conf_t *conf = mddev_to_conf(mddev);
3597+
3598+ kfree(conf->hash_table);
3599+ kfree(conf);
3600
3601- i=0;
3602- while (cur<mddev->nb_dev)
3603- {
3604- data->hash_table[i].dev0=mddev->devices+cur;
3605+ MOD_DEC_USE_COUNT;
3606
3607- if (size>=data->smallest->size) /* If we completely fill the slot */
3608- {
3609- data->hash_table[i++].dev1=NULL;
3610- size-=data->smallest->size;
3611-
3612- if (!size)
3613- {
3614- if (++cur==mddev->nb_dev) continue;
3615- size=mddev->devices[cur].size;
3616- }
3617-
3618- continue;
3619- }
3620-
3621- if (++cur==mddev->nb_dev) /* Last dev, set dev1 as NULL */
3622- {
3623- data->hash_table[i].dev1=NULL;
3624- continue;
3625- }
3626-
3627- dev0_size=size; /* Here, we use a 2nd dev to fill the slot */
3628- size=mddev->devices[cur].size;
3629- data->hash_table[i++].dev1=mddev->devices+cur;
3630- size-=(data->smallest->size - dev0_size);
3631- }
3632-
3633- return 0;
3634-}
3635-
3636-static int linear_stop (int minor, struct md_dev *mddev)
3637-{
3638- struct linear_data *data=(struct linear_data *) mddev->private;
3639-
3640- kfree (data->hash_table);
3641- kfree (data);
3642-
3643- MOD_DEC_USE_COUNT;
3644-
3645- return 0;
3646+ return 0;
3647 }
3648
3649
3650-static int linear_map (struct md_dev *mddev, kdev_t *rdev,
3651+static int linear_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
3652 unsigned long *rsector, unsigned long size)
3653 {
3654- struct linear_data *data=(struct linear_data *) mddev->private;
3655- struct linear_hash *hash;
3656- struct real_dev *tmp_dev;
3657- long block;
3658-
3659- block=*rsector >> 1;
3660- hash=data->hash_table+(block/data->smallest->size);
3661-
3662- if (block >= (hash->dev0->size + hash->dev0->offset))
3663- {
3664- if (!hash->dev1)
3665- {
3666- printk ("linear_map : hash->dev1==NULL for block %ld\n", block);
3667- return (-1);
3668- }
3669-
3670- tmp_dev=hash->dev1;
3671- }
3672- else
3673- tmp_dev=hash->dev0;
3674+ linear_conf_t *conf = mddev_to_conf(mddev);
3675+ struct linear_hash *hash;
3676+ dev_info_t *tmp_dev;
3677+ long block;
3678+
3679+ block = *rsector >> 1;
3680+ hash = conf->hash_table + (block / conf->smallest->size);
3681+
3682+ if (block >= (hash->dev0->size + hash->dev0->offset))
3683+ {
3684+ if (!hash->dev1)
3685+ {
3686+ printk ("linear_map : hash->dev1==NULL for block %ld\n",
3687+ block);
3688+ return -1;
3689+ }
3690+ tmp_dev = hash->dev1;
3691+ } else
3692+ tmp_dev = hash->dev0;
3693
3694- if (block >= (tmp_dev->size + tmp_dev->offset) || block < tmp_dev->offset)
3695- printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
3696- block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
3697+ if (block >= (tmp_dev->size + tmp_dev->offset)
3698+ || block < tmp_dev->offset)
3699+ printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
3700+ block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
3701
3702- *rdev=tmp_dev->dev;
3703- *rsector=(block-(tmp_dev->offset)) << 1;
3704+ *rdev = tmp_dev->dev;
3705+ *rsector = (block - tmp_dev->offset) << 1;
3706
3707- return (0);
3708+ return 0;
3709 }
3710
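In the rewritten driver every zone is conf->smallest->size blocks long; each hash slot records the disk a zone starts on (dev0) and, when the zone crosses a disk boundary, the disk it spills onto (dev1), so linear_map() needs one division plus one bounds check. A standalone worked example of the lookup, with two hypothetical members of 1500 and 1000 blocks so that zone 1 straddles the boundary:

    #include <stdio.h>
    #include <stddef.h>

    /* stand-ins for dev_info_t / struct linear_hash, sizes in 1 kB blocks */
    struct disk { const char *name; long size, offset; };
    struct zone { struct disk *dev0, *dev1; };

    int main(void)
    {
        struct disk d0 = { "hda1", 1500, 0 };       /* hypothetical members */
        struct disk d1 = { "hdb1", 1000, 1500 };
        long smallest = 1000;                       /* conf->smallest->size */

        /* the table linear_run() would build: zone 1 spills from hda1 onto hdb1 */
        struct zone ht[3] = { { &d0, NULL }, { &d0, &d1 }, { &d1, NULL } };

        unsigned long rsector = 3400;               /* sector on the md device */
        long block = rsector >> 1;                  /* 1 kB block, as in linear_map() */
        struct zone *hash = &ht[block / smallest];
        struct disk *dev = hash->dev0;

        if (block >= dev->size + dev->offset)       /* past dev0: use the spill disk */
            dev = hash->dev1;

        printf("md sector %lu -> %s sector %lu\n", rsector, dev->name,
               (unsigned long) ((block - dev->offset) << 1));
        return 0;
    }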
3711-static int linear_status (char *page, int minor, struct md_dev *mddev)
3712+static int linear_status (char *page, mddev_t *mddev)
3713 {
3714- int sz=0;
3715+ int sz=0;
3716
3717 #undef MD_DEBUG
3718 #ifdef MD_DEBUG
3719- int j;
3720- struct linear_data *data=(struct linear_data *) mddev->private;
3721+ int j;
3722+ linear_conf_t *conf = mddev_to_conf(mddev);
3723
3724- sz+=sprintf (page+sz, " ");
3725- for (j=0; j<data->nr_zones; j++)
3726- {
3727- sz+=sprintf (page+sz, "[%s",
3728- partition_name (data->hash_table[j].dev0->dev));
3729-
3730- if (data->hash_table[j].dev1)
3731- sz+=sprintf (page+sz, "/%s] ",
3732- partition_name(data->hash_table[j].dev1->dev));
3733- else
3734- sz+=sprintf (page+sz, "] ");
3735- }
3736-
3737- sz+=sprintf (page+sz, "\n");
3738+ sz += sprintf(page+sz, " ");
3739+ for (j = 0; j < conf->nr_zones; j++)
3740+ {
3741+ sz += sprintf(page+sz, "[%s",
3742+ partition_name(conf->hash_table[j].dev0->dev));
3743+
3744+ if (conf->hash_table[j].dev1)
3745+ sz += sprintf(page+sz, "/%s] ",
3746+ partition_name(conf->hash_table[j].dev1->dev));
3747+ else
3748+ sz += sprintf(page+sz, "] ");
3749+ }
3750+ sz += sprintf(page+sz, "\n");
3751 #endif
3752- sz+=sprintf (page+sz, " %dk rounding", 1<<FACTOR_SHIFT(FACTOR(mddev)));
3753- return sz;
3754+ sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
3755+ return sz;
3756 }
3757
3758
3759-static struct md_personality linear_personality=
3760+static mdk_personality_t linear_personality=
3761 {
3762- "linear",
3763- linear_map,
3764- NULL,
3765- NULL,
3766- linear_run,
3767- linear_stop,
3768- linear_status,
3769- NULL, /* no ioctls */
3770- 0
3771+ "linear",
3772+ linear_map,
3773+ NULL,
3774+ NULL,
3775+ linear_run,
3776+ linear_stop,
3777+ linear_status,
3778+ NULL,
3779+ 0,
3780+ NULL,
3781+ NULL,
3782+ NULL,
3783+ NULL
3784 };
3785
3786-
3787 #ifndef MODULE
3788
3789-__initfunc(void linear_init (void))
3790+md__initfunc(void linear_init (void))
3791 {
3792- register_md_personality (LINEAR, &linear_personality);
3793+ register_md_personality (LINEAR, &linear_personality);
3794 }
3795
3796 #else
3797
3798 int init_module (void)
3799 {
3800- return (register_md_personality (LINEAR, &linear_personality));
3801+ return (register_md_personality (LINEAR, &linear_personality));
3802 }
3803
3804 void cleanup_module (void)
3805 {
3806- unregister_md_personality (LINEAR);
3807+ unregister_md_personality (LINEAR);
3808 }
3809
3810 #endif
3811+
3812--- linux/drivers/block/linear.h.orig Fri Nov 22 15:07:23 1996
3813+++ linux/drivers/block/linear.h Tue Jan 16 13:42:04 2001
3814@@ -1,16 +0,0 @@
3815-#ifndef _LINEAR_H
3816-#define _LINEAR_H
3817-
3818-struct linear_hash
3819-{
3820- struct real_dev *dev0, *dev1;
3821-};
3822-
3823-struct linear_data
3824-{
3825- struct linear_hash *hash_table; /* Dynamically allocated */
3826- struct real_dev *smallest;
3827- int nr_zones;
3828-};
3829-
3830-#endif
3831--- linux/drivers/block/ll_rw_blk.c.orig Mon Dec 11 01:49:41 2000
3832+++ linux/drivers/block/ll_rw_blk.c Tue Jan 16 13:42:04 2001
3833@@ -23,6 +23,7 @@
3834 #include <asm/io.h>
3835 #include <asm/uaccess.h>
3836 #include <linux/blk.h>
3837+#include <linux/raid/md.h>
3838
3839 #include <linux/module.h>
3840
3841@@ -53,6 +54,11 @@
3842 spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
3843
3844 /*
3845+ * per-major idle-IO detection
3846+ */
3847+unsigned long io_events[MAX_BLKDEV] = {0, };
3848+
3849+/*
3850 * used to wait on when there are no free requests
3851 */
3852 struct wait_queue * wait_for_request;
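io_events[] gives md a cheap per-major activity signal: the hunk below bumps the counter for every buffer that reaches ll_rw_block() without the low-priority mark (presumably the mark that resync requests carry), so code elsewhere in the patch can sample the counter and treat an unchanged value as an idle device. That consumer is outside this excerpt; the standalone sketch below only illustrates the sampling idea, and other_io_seen() is a hypothetical name:

    #include <stdio.h>

    #define MAX_BLKDEV 128          /* illustrative; the kernel constant lives in major.h */

    /* stand-ins for the kernel counters so the sampling idea runs standalone */
    static unsigned long io_events[MAX_BLKDEV];
    static unsigned long last_events[MAX_BLKDEV];

    /* hypothetical helper: has any counted IO hit this major since the last call? */
    static int other_io_seen(int major)
    {
        int busy = io_events[major] != last_events[major];

        last_events[major] = io_events[major];
        return busy;
    }

    int main(void)
    {
        int first, second;

        io_events[9] = 42;          /* pretend ll_rw_block() counted IO on major 9 (md) */
        first = other_io_seen(9);   /* sees the new activity */
        second = other_io_seen(9);  /* nothing new since the previous sample */
        printf("busy=%d, then busy=%d\n", first, second);
        return 0;
    }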
3853@@ -583,6 +589,8 @@
3854 return;
3855 /* Maybe the above fixes it, and maybe it doesn't boot. Life is interesting */
3856 lock_buffer(bh);
3857+ if (!buffer_lowprio(bh))
3858+ io_events[major]++;
3859
3860 if (blk_size[major]) {
3861 unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
3862@@ -832,7 +840,7 @@
3863 bh[i]->b_rsector=bh[i]->b_blocknr*(bh[i]->b_size >> 9);
3864 #ifdef CONFIG_BLK_DEV_MD
3865 if (major==MD_MAJOR &&
3866- md_map (MINOR(bh[i]->b_dev), &bh[i]->b_rdev,
3867+ md_map (bh[i]->b_dev, &bh[i]->b_rdev,
3868 &bh[i]->b_rsector, bh[i]->b_size >> 9)) {
3869 printk (KERN_ERR
3870 "Bad md_map in ll_rw_block\n");
3871@@ -852,7 +860,7 @@
3872 set_bit(BH_Req, &bh[i]->b_state);
3873 #ifdef CONFIG_BLK_DEV_MD
3874 if (MAJOR(bh[i]->b_dev) == MD_MAJOR) {
3875- md_make_request(MINOR (bh[i]->b_dev), rw, bh[i]);
3876+ md_make_request(bh[i], rw);
3877 continue;
3878 }
3879 #endif
3880--- linux/drivers/block/md.c.orig Mon Sep 4 19:39:16 2000
3881+++ linux/drivers/block/md.c Tue Jan 16 13:42:04 2001
3882@@ -1,21 +1,17 @@
3883-
3884 /*
3885 md.c : Multiple Devices driver for Linux
3886- Copyright (C) 1994-96 Marc ZYNGIER
3887- <zyngier@ufr-info-p7.ibp.fr> or
3888- <maz@gloups.fdn.fr>
3889+ Copyright (C) 1998, 1999 Ingo Molnar
3890
3891- A lot of inspiration came from hd.c ...
3892+ completely rewritten, based on the MD driver code from Marc Zyngier
3893
3894- kerneld support by Boris Tobotras <boris@xtalk.msk.su>
3895- boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
3896+ Changes:
3897
3898- RAID-1/RAID-5 extensions by:
3899- Ingo Molnar, Miguel de Icaza, Gadi Oxman
3900+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
3901+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
3902+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
3903+ - kmod support by: Cyrus Durgin
3904+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
3905
3906- Changes for kmod by:
3907- Cyrus Durgin
3908-
3909 This program is free software; you can redistribute it and/or modify
3910 it under the terms of the GNU General Public License as published by
3911 the Free Software Foundation; either version 2, or (at your option)
3912@@ -26,807 +22,3007 @@
3913 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3914 */
3915
3916-/*
3917- * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
3918- * the extra system load does not show up that much. Increase it if your
3919- * system can take more.
3920- */
3921-#define SPEED_LIMIT 1024
3922+#include <linux/raid/md.h>
3923+#include <linux/raid/xor.h>
3924
3925-#include <linux/config.h>
3926-#include <linux/module.h>
3927-#include <linux/version.h>
3928-#include <linux/malloc.h>
3929-#include <linux/mm.h>
3930-#include <linux/md.h>
3931-#include <linux/hdreg.h>
3932-#include <linux/stat.h>
3933-#include <linux/fs.h>
3934-#include <linux/proc_fs.h>
3935-#include <linux/blkdev.h>
3936-#include <linux/genhd.h>
3937-#include <linux/smp_lock.h>
3938 #ifdef CONFIG_KMOD
3939 #include <linux/kmod.h>
3940 #endif
3941-#include <linux/errno.h>
3942-#include <linux/init.h>
3943
3944 #define __KERNEL_SYSCALLS__
3945 #include <linux/unistd.h>
3946
3947+#include <asm/unaligned.h>
3948+
3949+extern asmlinkage int sys_sched_yield(void);
3950+extern asmlinkage int sys_setsid(void);
3951+
3952+extern unsigned long io_events[MAX_BLKDEV];
3953+
3954 #define MAJOR_NR MD_MAJOR
3955 #define MD_DRIVER
3956
3957 #include <linux/blk.h>
3958-#include <asm/uaccess.h>
3959-#include <asm/bitops.h>
3960-#include <asm/atomic.h>
3961
3962 #ifdef CONFIG_MD_BOOT
3963-extern kdev_t name_to_kdev_t(char *line) __init;
3964+extern kdev_t name_to_kdev_t(char *line) md__init;
3965 #endif
3966
3967-static struct hd_struct md_hd_struct[MAX_MD_DEV];
3968-static int md_blocksizes[MAX_MD_DEV];
3969-int md_maxreadahead[MAX_MD_DEV];
3970-#if SUPPORT_RECONSTRUCTION
3971-static struct md_thread *md_sync_thread = NULL;
3972-#endif /* SUPPORT_RECONSTRUCTION */
3973+static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, };
3974+
3975+/*
3976+ * these have to be allocated separately because external
3977+ * subsystems want to have a pre-defined structure
3978+ */
3979+struct hd_struct md_hd_struct[MAX_MD_DEVS];
3980+static int md_blocksizes[MAX_MD_DEVS];
3981+static int md_maxreadahead[MAX_MD_DEVS];
3982+static mdk_thread_t *md_recovery_thread = NULL;
3983
3984-int md_size[MAX_MD_DEV]={0, };
3985+int md_size[MAX_MD_DEVS] = {0, };
3986
3987 static void md_geninit (struct gendisk *);
3988
3989 static struct gendisk md_gendisk=
3990 {
3991- MD_MAJOR,
3992- "md",
3993- 0,
3994- 1,
3995- MAX_MD_DEV,
3996- md_geninit,
3997- md_hd_struct,
3998- md_size,
3999- MAX_MD_DEV,
4000- NULL,
4001- NULL
4002+ MD_MAJOR,
4003+ "md",
4004+ 0,
4005+ 1,
4006+ MAX_MD_DEVS,
4007+ md_geninit,
4008+ md_hd_struct,
4009+ md_size,
4010+ MAX_MD_DEVS,
4011+ NULL,
4012+ NULL
4013 };
4014
4015-static struct md_personality *pers[MAX_PERSONALITY]={NULL, };
4016-struct md_dev md_dev[MAX_MD_DEV];
4017-
4018-int md_thread(void * arg);
4019+/*
4020+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
4021+ * is 100 KB/sec, so the extra system load does not show up that much.
4022+ * Increase it if you want to have more _guaranteed_ speed. Note that
4023+ * the RAID driver will use the maximum available bandwidth if the IO
4024+ * subsystem is idle.
4025+ *
4026+ * you can change it via /proc/sys/dev/md/speed-limit
4027+ */
4028
4029-static struct gendisk *find_gendisk (kdev_t dev)
4030-{
4031- struct gendisk *tmp=gendisk_head;
4032+static int sysctl_speed_limit = 100;
4033
4034- while (tmp != NULL)
4035- {
4036- if (tmp->major==MAJOR(dev))
4037- return (tmp);
4038-
4039- tmp=tmp->next;
4040- }
4041+static struct ctl_table_header *md_table_header;
4042
4043- return (NULL);
4044-}
4045+static ctl_table md_table[] = {
4046+ {DEV_MD_SPEED_LIMIT, "speed-limit",
4047+ &sysctl_speed_limit, sizeof(int), 0644, NULL, &proc_dointvec},
4048+ {0}
4049+};
4050
4051-char *partition_name (kdev_t dev)
4052-{
4053- static char name[40]; /* This should be long
4054- enough for a device name ! */
4055- struct gendisk *hd = find_gendisk (dev);
4056+static ctl_table md_dir_table[] = {
4057+ {DEV_MD, "md", NULL, 0, 0555, md_table},
4058+ {0}
4059+};
4060
4061- if (!hd)
4062- {
4063- sprintf (name, "[dev %s]", kdevname(dev));
4064- return (name);
4065- }
4066+static ctl_table md_root_table[] = {
4067+ {CTL_DEV, "dev", NULL, 0, 0555, md_dir_table},
4068+ {0}
4069+};
4070
4071- return disk_name (hd, MINOR(dev), name); /* routine in genhd.c */
4072+static void md_register_sysctl(void)
4073+{
4074+ md_table_header = register_sysctl_table(md_root_table, 1);
4075 }
4076
4077-static int legacy_raid_sb (int minor, int pnum)
4078+void md_unregister_sysctl(void)
4079 {
4080- int i, factor;
4081+ unregister_sysctl_table(md_table_header);
4082+}
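The tunable above is meant to be consulted by the resync code; a minimal throttle sketch, assuming a hypothetical per-second counter of resynced kilobytes (kb_resynced_this_second is not a symbol from this patch):

	if (kb_resynced_this_second > sysctl_speed_limit) {
		/* over the guaranteed-speed budget: back off briefly so
		 * normal IO can use the bandwidth (illustration only) */
		current->state = TASK_INTERRUPTIBLE;
		schedule_timeout(HZ/10);
	}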
4083+
4084+/*
4085+ * The mapping between kdev and mddev is not necessarily a simple
4086+ * one! E.g. HSM uses several sub-devices to implement Logical
4087+ * Volumes. All these sub-devices map to the same mddev.
4088+ */
4089+dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, };
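Given this table, mapping a kdev_t back to its mddev is a plain minor-number lookup; a minimal sketch (the name is illustrative, the patch's real accessor lives in the raid headers), mirroring the MAJOR/MINOR checks used below:

	static inline mddev_t * kdev_to_mddev_sketch (kdev_t dev)
	{
		if (MAJOR(dev) != MD_MAJOR)
			return NULL;			/* only md minors are mapped */
		return mddev_map[MINOR(dev)].mddev;	/* NULL until add_mddev_mapping() */
	}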
4090
4091- factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
4092+void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
4093+{
4094+ unsigned int minor = MINOR(dev);
4095
4096- /*****
4097- * do size and offset calculations.
4098- */
4099- for (i=0; i<md_dev[minor].nb_dev; i++) {
4100- md_dev[minor].devices[i].size &= ~(factor - 1);
4101- md_size[minor] += md_dev[minor].devices[i].size;
4102- md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset +
4103- md_dev[minor].devices[i-1].size) : 0;
4104+ if (MAJOR(dev) != MD_MAJOR) {
4105+ MD_BUG();
4106+ return;
4107 }
4108- if (pnum == RAID0 >> PERSONALITY_SHIFT)
4109- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev;
4110- return 0;
4111+ if (mddev_map[minor].mddev != NULL) {
4112+ MD_BUG();
4113+ return;
4114+ }
4115+ mddev_map[minor].mddev = mddev;
4116+ mddev_map[minor].data = data;
4117 }
4118
4119-static void free_sb (struct md_dev *mddev)
4120+void del_mddev_mapping (mddev_t * mddev, kdev_t dev)
4121 {
4122- int i;
4123- struct real_dev *realdev;
4124+ unsigned int minor = MINOR(dev);
4125
4126- if (mddev->sb) {
4127- free_page((unsigned long) mddev->sb);
4128- mddev->sb = NULL;
4129+ if (MAJOR(dev) != MD_MAJOR) {
4130+ MD_BUG();
4131+ return;
4132 }
4133- for (i = 0; i <mddev->nb_dev; i++) {
4134- realdev = mddev->devices + i;
4135- if (realdev->sb) {
4136- free_page((unsigned long) realdev->sb);
4137- realdev->sb = NULL;
4138- }
4139+ if (mddev_map[minor].mddev != mddev) {
4140+ MD_BUG();
4141+ return;
4142 }
4143+ mddev_map[minor].mddev = NULL;
4144+ mddev_map[minor].data = NULL;
4145 }
4146
4147 /*
4148- * Check one RAID superblock for generic plausibility
4149+ * Enables to iterate over all existing md arrays
4150 */
4151+static MD_LIST_HEAD(all_mddevs);
4152
4153-#define BAD_MAGIC KERN_ERR \
4154-"md: %s: invalid raid superblock magic (%x) on block %u\n"
4155+static mddev_t * alloc_mddev (kdev_t dev)
4156+{
4157+ mddev_t * mddev;
4158
4159-#define OUT_OF_MEM KERN_ALERT \
4160-"md: out of memory.\n"
4161+ if (MAJOR(dev) != MD_MAJOR) {
4162+ MD_BUG();
4163+ return 0;
4164+ }
4165+ mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
4166+ if (!mddev)
4167+ return NULL;
4168+
4169+ memset(mddev, 0, sizeof(*mddev));
4170
4171-#define NO_DEVICE KERN_ERR \
4172-"md: disabled device %s\n"
4173+ mddev->__minor = MINOR(dev);
4174+ mddev->reconfig_sem = MUTEX;
4175+ mddev->recovery_sem = MUTEX;
4176+ mddev->resync_sem = MUTEX;
4177+ MD_INIT_LIST_HEAD(&mddev->disks);
4178+ /*
4179+ * The 'base' mddev is the one with data NULL.
4180+ * personalities can create additional mddevs
4181+ * if necessary.
4182+ */
4183+ add_mddev_mapping(mddev, dev, 0);
4184+ md_list_add(&mddev->all_mddevs, &all_mddevs);
4185
4186-#define SUCCESS 0
4187-#define FAILURE -1
4188+ return mddev;
4189+}
4190
4191-static int analyze_one_sb (struct real_dev * rdev)
4192+static void free_mddev (mddev_t *mddev)
4193 {
4194- int ret = FAILURE;
4195- struct buffer_head *bh;
4196- kdev_t dev = rdev->dev;
4197- md_superblock_t *sb;
4198+ if (!mddev) {
4199+ MD_BUG();
4200+ return;
4201+ }
4202
4203 /*
4204- * Read the superblock, it's at the end of the disk
4205+ * Make sure nobody else is using this mddev
4206+ * (careful, we rely on the global kernel lock here)
4207 */
4208- rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]);
4209- set_blocksize (dev, MD_SB_BYTES);
4210- bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
4211-
4212- if (bh) {
4213- sb = (md_superblock_t *) bh->b_data;
4214- if (sb->md_magic != MD_SB_MAGIC) {
4215- printk (BAD_MAGIC, kdevname(dev),
4216- sb->md_magic, rdev->sb_offset);
4217- goto abort;
4218- }
4219- rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
4220- if (!rdev->sb) {
4221- printk (OUT_OF_MEM);
4222- goto abort;
4223- }
4224- memcpy (rdev->sb, bh->b_data, MD_SB_BYTES);
4225+ while (md_atomic_read(&mddev->resync_sem.count) != 1)
4226+ schedule();
4227+ while (md_atomic_read(&mddev->recovery_sem.count) != 1)
4228+ schedule();
4229
4230- rdev->size = sb->size;
4231- } else
4232- printk (NO_DEVICE,kdevname(rdev->dev));
4233- ret = SUCCESS;
4234-abort:
4235- if (bh)
4236- brelse (bh);
4237- return ret;
4238+ del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
4239+ md_list_del(&mddev->all_mddevs);
4240+ MD_INIT_LIST_HEAD(&mddev->all_mddevs);
4241+ kfree(mddev);
4242 }
4243
4244-#undef SUCCESS
4245-#undef FAILURE
4246-
4247-#undef BAD_MAGIC
4248-#undef OUT_OF_MEM
4249-#undef NO_DEVICE
4250
4251-/*
4252- * Check a full RAID array for plausibility
4253- */
4254+struct gendisk * find_gendisk (kdev_t dev)
4255+{
4256+ struct gendisk *tmp = gendisk_head;
4257
4258-#define INCONSISTENT KERN_ERR \
4259-"md: superblock inconsistency -- run ckraid\n"
4260+ while (tmp != NULL) {
4261+ if (tmp->major == MAJOR(dev))
4262+ return (tmp);
4263+ tmp = tmp->next;
4264+ }
4265+ return (NULL);
4266+}
4267
4268-#define OUT_OF_DATE KERN_ERR \
4269-"md: superblock update time inconsistenty -- using the most recent one\n"
4270+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
4271+{
4272+ mdk_rdev_t * rdev;
4273+ struct md_list_head *tmp;
4274
4275-#define OLD_VERSION KERN_ALERT \
4276-"md: %s: unsupported raid array version %d.%d.%d\n"
4277+ ITERATE_RDEV(mddev,rdev,tmp) {
4278+ if (rdev->desc_nr == nr)
4279+ return rdev;
4280+ }
4281+ return NULL;
4282+}
4283
4284-#define NOT_CLEAN KERN_ERR \
4285-"md: %s: raid array is not clean -- run ckraid\n"
4286+mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
4287+{
4288+ struct md_list_head *tmp;
4289+ mdk_rdev_t *rdev;
4290
4291-#define NOT_CLEAN_IGNORE KERN_ERR \
4292-"md: %s: raid array is not clean -- reconstructing parity\n"
4293+ ITERATE_RDEV(mddev,rdev,tmp) {
4294+ if (rdev->dev == dev)
4295+ return rdev;
4296+ }
4297+ return NULL;
4298+}
4299
4300-#define UNKNOWN_LEVEL KERN_ERR \
4301-"md: %s: unsupported raid level %d\n"
4302+static MD_LIST_HEAD(device_names);
4303
4304-static int analyze_sbs (int minor, int pnum)
4305+char * partition_name (kdev_t dev)
4306 {
4307- struct md_dev *mddev = md_dev + minor;
4308- int i, N = mddev->nb_dev, out_of_date = 0;
4309- struct real_dev * disks = mddev->devices;
4310- md_superblock_t *sb, *freshest = NULL;
4311+ struct gendisk *hd;
4312+ static char nomem [] = "<nomem>";
4313+ dev_name_t *dname;
4314+ struct md_list_head *tmp = device_names.next;
4315
4316- /*
4317- * RAID-0 and linear don't use a RAID superblock
4318- */
4319- if (pnum == RAID0 >> PERSONALITY_SHIFT ||
4320- pnum == LINEAR >> PERSONALITY_SHIFT)
4321- return legacy_raid_sb (minor, pnum);
4322+ while (tmp != &device_names) {
4323+ dname = md_list_entry(tmp, dev_name_t, list);
4324+ if (dname->dev == dev)
4325+ return dname->name;
4326+ tmp = tmp->next;
4327+ }
4328+
4329+ dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
4330
4331+ if (!dname)
4332+ return nomem;
4333 /*
4334- * Verify the RAID superblock on each real device
4335+ * ok, add this new device name to the list
4336 */
4337- for (i = 0; i < N; i++)
4338- if (analyze_one_sb(disks+i))
4339- goto abort;
4340+ hd = find_gendisk (dev);
4341+
4342+ if (!hd)
4343+ sprintf (dname->name, "[dev %s]", kdevname(dev));
4344+ else
4345+ disk_name (hd, MINOR(dev), dname->name);
4346+
4347+ dname->dev = dev;
4348+ md_list_add(&dname->list, &device_names);
4349+
4350+ return dname->name;
4351+}
4352+
4353+static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
4354+ int persistent)
4355+{
4356+ unsigned int size = 0;
4357+
4358+ if (blk_size[MAJOR(dev)])
4359+ size = blk_size[MAJOR(dev)][MINOR(dev)];
4360+ if (persistent)
4361+ size = MD_NEW_SIZE_BLOCKS(size);
4362+ return size;
4363+}
4364+
4365+static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
4366+{
4367+ unsigned int size;
4368+
4369+ size = calc_dev_sboffset(dev, mddev, persistent);
4370+ if (!mddev->sb) {
4371+ MD_BUG();
4372+ return size;
4373+ }
4374+ if (mddev->sb->chunk_size)
4375+ size &= ~(mddev->sb->chunk_size/1024 - 1);
4376+ return size;
4377+}
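For a concrete feel of the chunk rounding in calc_dev_size(): with a 64 KB chunk_size the mask is ~(chunk_size/1024 - 1) = ~63, so for example:

	/* chunk_size = 65536 bytes  ->  chunk_size/1024 = 64 (sizes are in KB) */
	/* a 100000 KB partition:    100000 & ~63 == 99968 KB                   */
	/* i.e. 1562 whole 64 KB chunks, the trailing 32 KB are left unused     */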
4378+
4379+/*
4380+ * We check whether all devices are numbered from 0 to nb_dev-1. The
4381+ * order is guaranteed even after device name changes.
4382+ *
4383+ * Some personalities (raid0, linear) use this. Personalities that
4384+ * provide data have to be able to deal with loss of individual
4385+ * disks, so they do their checking themselves.
4386+ */
4387+int md_check_ordering (mddev_t *mddev)
4388+{
4389+ int i, c;
4390+ mdk_rdev_t *rdev;
4391+ struct md_list_head *tmp;
4392
4393 /*
4394- * The superblock constant part has to be the same
4395- * for all disks in the array.
4396+ * First, all devices must be fully functional
4397 */
4398- sb = NULL;
4399- for (i = 0; i < N; i++) {
4400- if (!disks[i].sb)
4401- continue;
4402- if (!sb) {
4403- sb = disks[i].sb;
4404- continue;
4405- }
4406- if (memcmp(sb,
4407- disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) {
4408- printk (INCONSISTENT);
4409+ ITERATE_RDEV(mddev,rdev,tmp) {
4410+ if (rdev->faulty) {
4411+ printk("md: md%d's device %s faulty, aborting.\n",
4412+ mdidx(mddev), partition_name(rdev->dev));
4413 goto abort;
4414 }
4415 }
4416
4417- /*
4418- * OK, we have all disks and the array is ready to run. Let's
4419- * find the freshest superblock, that one will be the superblock
4420- * that represents the whole array.
4421- */
4422- if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL)
4423+ c = 0;
4424+ ITERATE_RDEV(mddev,rdev,tmp) {
4425+ c++;
4426+ }
4427+ if (c != mddev->nb_dev) {
4428+ MD_BUG();
4429 goto abort;
4430- freshest = NULL;
4431- for (i = 0; i < N; i++) {
4432- if (!disks[i].sb)
4433- continue;
4434- if (!freshest) {
4435- freshest = disks[i].sb;
4436- continue;
4437- }
4438- /*
4439- * Find the newest superblock version
4440- */
4441- if (disks[i].sb->utime != freshest->utime) {
4442- out_of_date = 1;
4443- if (disks[i].sb->utime > freshest->utime)
4444- freshest = disks[i].sb;
4445- }
4446 }
4447- if (out_of_date)
4448- printk(OUT_OF_DATE);
4449- memcpy (sb, freshest, sizeof(*freshest));
4450-
4451- /*
4452- * Check if we can support this RAID array
4453- */
4454- if (sb->major_version != MD_MAJOR_VERSION ||
4455- sb->minor_version > MD_MINOR_VERSION) {
4456-
4457- printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)),
4458- sb->major_version, sb->minor_version,
4459- sb->patch_version);
4460+ if (mddev->nb_dev != mddev->sb->raid_disks) {
4461+ printk("md: md%d, array needs %d disks, has %d, aborting.\n",
4462+ mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
4463 goto abort;
4464 }
4465-
4466 /*
4467- * We need to add this as a superblock option.
4468+ * Now the numbering check
4469 */
4470-#if SUPPORT_RECONSTRUCTION
4471- if (sb->state != (1 << MD_SB_CLEAN)) {
4472- if (sb->level == 1) {
4473- printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
4474+ for (i = 0; i < mddev->nb_dev; i++) {
4475+ c = 0;
4476+ ITERATE_RDEV(mddev,rdev,tmp) {
4477+ if (rdev->desc_nr == i)
4478+ c++;
4479+ }
4480+ if (c == 0) {
4481+ printk("md: md%d, missing disk #%d, aborting.\n",
4482+ mdidx(mddev), i);
4483 goto abort;
4484- } else
4485- printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor)));
4486- }
4487-#else
4488- if (sb->state != (1 << MD_SB_CLEAN)) {
4489- printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
4490- goto abort;
4491- }
4492-#endif /* SUPPORT_RECONSTRUCTION */
4493-
4494- switch (sb->level) {
4495- case 1:
4496- md_size[minor] = sb->size;
4497- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD;
4498- break;
4499- case 4:
4500- case 5:
4501- md_size[minor] = sb->size * (sb->raid_disks - 1);
4502- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1);
4503- break;
4504- default:
4505- printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)),
4506- sb->level);
4507+ }
4508+ if (c > 1) {
4509+ printk("md: md%d, too many disks #%d, aborting.\n",
4510+ mdidx(mddev), i);
4511 goto abort;
4512+ }
4513 }
4514 return 0;
4515 abort:
4516- free_sb(mddev);
4517 return 1;
4518 }
4519
4520-#undef INCONSISTENT
4521-#undef OUT_OF_DATE
4522-#undef OLD_VERSION
4523-#undef NOT_CLEAN
4524-#undef OLD_LEVEL
4525-
4526-int md_update_sb(int minor)
4527+static unsigned int zoned_raid_size (mddev_t *mddev)
4528 {
4529- struct md_dev *mddev = md_dev + minor;
4530- struct buffer_head *bh;
4531- md_superblock_t *sb = mddev->sb;
4532- struct real_dev *realdev;
4533- kdev_t dev;
4534- int i;
4535- u32 sb_offset;
4536+ unsigned int mask;
4537+ mdk_rdev_t * rdev;
4538+ struct md_list_head *tmp;
4539
4540- sb->utime = CURRENT_TIME;
4541- for (i = 0; i < mddev->nb_dev; i++) {
4542- realdev = mddev->devices + i;
4543- if (!realdev->sb)
4544- continue;
4545- dev = realdev->dev;
4546- sb_offset = realdev->sb_offset;
4547- set_blocksize(dev, MD_SB_BYTES);
4548- printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset);
4549- bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
4550- if (bh) {
4551- sb = (md_superblock_t *) bh->b_data;
4552- memcpy(sb, mddev->sb, MD_SB_BYTES);
4553- memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4);
4554- mark_buffer_uptodate(bh, 1);
4555- mark_buffer_dirty(bh, 1);
4556- ll_rw_block(WRITE, 1, &bh);
4557- wait_on_buffer(bh);
4558- bforget(bh);
4559- fsync_dev(dev);
4560- invalidate_buffers(dev);
4561- } else
4562- printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
4563+ if (!mddev->sb) {
4564+ MD_BUG();
4565+ return -EINVAL;
4566+ }
4567+ /*
4568+ * do size and offset calculations.
4569+ */
4570+ mask = ~(mddev->sb->chunk_size/1024 - 1);
4571+printk("mask %08x\n", mask);
4572+
4573+ ITERATE_RDEV(mddev,rdev,tmp) {
4574+printk(" rdev->size: %d\n", rdev->size);
4575+ rdev->size &= mask;
4576+printk(" masked rdev->size: %d\n", rdev->size);
4577+ md_size[mdidx(mddev)] += rdev->size;
4578+printk(" new md_size: %d\n", md_size[mdidx(mddev)]);
4579 }
4580 return 0;
4581 }
4582
4583-static int do_md_run (int minor, int repart)
4584+static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
4585 {
4586- int pnum, i, min, factor, err;
4587+ if (disk_active(disk)) {
4588+ sb->working_disks--;
4589+ } else {
4590+ if (disk_spare(disk)) {
4591+ sb->spare_disks--;
4592+ sb->working_disks--;
4593+ } else {
4594+ sb->failed_disks--;
4595+ }
4596+ }
4597+ sb->nr_disks--;
4598+ disk->major = 0;
4599+ disk->minor = 0;
4600+ mark_disk_removed(disk);
4601+}
4602
4603- if (!md_dev[minor].nb_dev)
4604- return -EINVAL;
4605-
4606- if (md_dev[minor].pers)
4607- return -EBUSY;
4608+#define BAD_MAGIC KERN_ERR \
4609+"md: invalid raid superblock magic on %s\n"
4610
4611- md_dev[minor].repartition=repart;
4612-
4613- if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT))
4614- >= MAX_PERSONALITY)
4615- return -EINVAL;
4616-
4617- /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
4618- if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){
4619- for (i = 0; i < md_dev [minor].nb_dev; i++)
4620- if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR)
4621- return -EINVAL;
4622- }
4623- if (!pers[pnum])
4624- {
4625-#ifdef CONFIG_KMOD
4626- char module_name[80];
4627- sprintf (module_name, "md-personality-%d", pnum);
4628- request_module (module_name);
4629- if (!pers[pnum])
4630-#endif
4631- return -EINVAL;
4632- }
4633-
4634- factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
4635-
4636- for (i=0; i<md_dev[minor].nb_dev; i++)
4637- if (md_dev[minor].devices[i].size<min)
4638- {
4639- printk ("Dev %s smaller than %dk, cannot shrink\n",
4640- partition_name (md_dev[minor].devices[i].dev), min);
4641- return -EINVAL;
4642- }
4643-
4644- for (i=0; i<md_dev[minor].nb_dev; i++) {
4645- fsync_dev(md_dev[minor].devices[i].dev);
4646- invalidate_buffers(md_dev[minor].devices[i].dev);
4647- }
4648-
4649- /* Resize devices according to the factor. It is used to align
4650- partitions size on a given chunk size. */
4651- md_size[minor]=0;
4652-
4653- /*
4654- * Analyze the raid superblock
4655- */
4656- if (analyze_sbs(minor, pnum))
4657- return -EINVAL;
4658+#define BAD_MINOR KERN_ERR \
4659+"md: %s: invalid raid minor (%x)\n"
4660
4661- md_dev[minor].pers=pers[pnum];
4662-
4663- if ((err=md_dev[minor].pers->run (minor, md_dev+minor)))
4664- {
4665- md_dev[minor].pers=NULL;
4666- free_sb(md_dev + minor);
4667- return (err);
4668- }
4669-
4670- if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT)
4671- {
4672- md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN);
4673- md_update_sb(minor);
4674- }
4675-
4676- /* FIXME : We assume here we have blocks
4677- that are twice as large as sectors.
4678- THIS MAY NOT BE TRUE !!! */
4679- md_hd_struct[minor].start_sect=0;
4680- md_hd_struct[minor].nr_sects=md_size[minor]<<1;
4681-
4682- read_ahead[MD_MAJOR] = 128;
4683- return (0);
4684-}
4685+#define OUT_OF_MEM KERN_ALERT \
4686+"md: out of memory.\n"
4687+
4688+#define NO_SB KERN_ERR \
4689+"md: disabled device %s, could not read superblock.\n"
4690
4691-static int do_md_stop (int minor, struct inode *inode)
4692+#define BAD_CSUM KERN_WARNING \
4693+"md: invalid superblock checksum on %s\n"
4694+
4695+static int alloc_array_sb (mddev_t * mddev)
4696 {
4697- int i;
4698-
4699- if (inode->i_count>1 || md_dev[minor].busy>1) {
4700- /*
4701- * ioctl : one open channel
4702- */
4703- printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
4704- minor, inode->i_count, md_dev[minor].busy);
4705- return -EBUSY;
4706- }
4707-
4708- if (md_dev[minor].pers) {
4709- /*
4710- * It is safe to call stop here, it only frees private
4711- * data. Also, it tells us if a device is unstoppable
4712- * (eg. resyncing is in progress)
4713- */
4714- if (md_dev[minor].pers->stop (minor, md_dev+minor))
4715- return -EBUSY;
4716- /*
4717- * The device won't exist anymore -> flush it now
4718- */
4719- fsync_dev (inode->i_rdev);
4720- invalidate_buffers (inode->i_rdev);
4721- if (md_dev[minor].sb) {
4722- md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
4723- md_update_sb(minor);
4724- }
4725+ if (mddev->sb) {
4726+ MD_BUG();
4727+ return 0;
4728 }
4729-
4730- /* Remove locks. */
4731- if (md_dev[minor].sb)
4732- free_sb(md_dev + minor);
4733- for (i=0; i<md_dev[minor].nb_dev; i++)
4734- clear_inode (md_dev[minor].devices[i].inode);
4735-
4736- md_dev[minor].nb_dev=md_size[minor]=0;
4737- md_hd_struct[minor].nr_sects=0;
4738- md_dev[minor].pers=NULL;
4739-
4740- read_ahead[MD_MAJOR] = 128;
4741-
4742- return (0);
4743+
4744+ mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
4745+ if (!mddev->sb)
4746+ return -ENOMEM;
4747+ md_clear_page((unsigned long)mddev->sb);
4748+ return 0;
4749 }
4750
4751-static int do_md_add (int minor, kdev_t dev)
4752+static int alloc_disk_sb (mdk_rdev_t * rdev)
4753 {
4754- int i;
4755- int hot_add=0;
4756- struct real_dev *realdev;
4757+ if (rdev->sb)
4758+ MD_BUG();
4759
4760- if (md_dev[minor].nb_dev==MAX_REAL)
4761+ rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
4762+ if (!rdev->sb) {
4763+ printk (OUT_OF_MEM);
4764 return -EINVAL;
4765+ }
4766+ md_clear_page((unsigned long)rdev->sb);
4767
4768- if (!fs_may_mount (dev))
4769- return -EBUSY;
4770+ return 0;
4771+}
4772
4773- if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) {
4774- printk("md_add(): zero device size, huh, bailing out.\n");
4775- return -EINVAL;
4776+static void free_disk_sb (mdk_rdev_t * rdev)
4777+{
4778+ if (rdev->sb) {
4779+ free_page((unsigned long) rdev->sb);
4780+ rdev->sb = NULL;
4781+ rdev->sb_offset = 0;
4782+ rdev->size = 0;
4783+ } else {
4784+ if (!rdev->faulty)
4785+ MD_BUG();
4786 }
4787+}
4788
4789- if (md_dev[minor].pers) {
4790- /*
4791- * The array is already running, hot-add the drive, or
4792- * bail out:
4793- */
4794- if (!md_dev[minor].pers->hot_add_disk)
4795- return -EBUSY;
4796- else
4797- hot_add=1;
4798+static void mark_rdev_faulty (mdk_rdev_t * rdev)
4799+{
4800+ unsigned long flags;
4801+
4802+ if (!rdev) {
4803+ MD_BUG();
4804+ return;
4805 }
4806+ save_flags(flags);
4807+ cli();
4808+ free_disk_sb(rdev);
4809+ rdev->faulty = 1;
4810+ restore_flags(flags);
4811+}
4812+
4813+static int read_disk_sb (mdk_rdev_t * rdev)
4814+{
4815+ int ret = -EINVAL;
4816+ struct buffer_head *bh = NULL;
4817+ kdev_t dev = rdev->dev;
4818+ mdp_super_t *sb;
4819+ u32 sb_offset;
4820
4821+ if (!rdev->sb) {
4822+ MD_BUG();
4823+ goto abort;
4824+ }
4825+
4826 /*
4827- * Careful. We cannot increase nb_dev for a running array.
4828+ * Calculate the position of the superblock,
4829+ * it's at the end of the disk
4830 */
4831- i=md_dev[minor].nb_dev;
4832- realdev = &md_dev[minor].devices[i];
4833- realdev->dev=dev;
4834-
4835- /* Lock the device by inserting a dummy inode. This doesn't
4836- smell very good, but I need to be consistent with the
4837- mount stuff, specially with fs_may_mount. If someone have
4838- a better idea, please help ! */
4839-
4840- realdev->inode=get_empty_inode ();
4841- realdev->inode->i_dev=dev; /* don't care about other fields */
4842- insert_inode_hash (realdev->inode);
4843-
4844- /* Sizes are now rounded at run time */
4845-
4846-/* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/
4847-
4848- realdev->size=blk_size[MAJOR(dev)][MINOR(dev)];
4849+ sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
4850+ rdev->sb_offset = sb_offset;
4851+ printk("(read) %s's sb offset: %d", partition_name(dev),
4852+ sb_offset);
4853+ fsync_dev(dev);
4854+ set_blocksize (dev, MD_SB_BYTES);
4855+ bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
4856
4857- if (hot_add) {
4858+ if (bh) {
4859+ sb = (mdp_super_t *) bh->b_data;
4860+ memcpy (rdev->sb, sb, MD_SB_BYTES);
4861+ } else {
4862+ printk (NO_SB,partition_name(rdev->dev));
4863+ goto abort;
4864+ }
4865+ printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events));
4866+ ret = 0;
4867+abort:
4868+ if (bh)
4869+ brelse (bh);
4870+ return ret;
4871+}
4872+
4873+static unsigned int calc_sb_csum (mdp_super_t * sb)
4874+{
4875+ unsigned int disk_csum, csum;
4876+
4877+ disk_csum = sb->sb_csum;
4878+ sb->sb_csum = 0;
4879+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
4880+ sb->sb_csum = disk_csum;
4881+ return csum;
4882+}
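The helper clears sb_csum, checksums the whole MD_SB_BYTES image and then restores the field, so it serves both directions used further down (taken from check_disk_sb() and sync_sbs() below):

	if (calc_sb_csum(sb) != sb->sb_csum)		/* verify an on-disk sb */
		printk(BAD_CSUM, partition_name(rdev->dev));

	sb->sb_csum = calc_sb_csum(sb);			/* stamp before writing */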
4883+
4884+/*
4885+ * Check one RAID superblock for generic plausibility
4886+ */
4887+
4888+static int check_disk_sb (mdk_rdev_t * rdev)
4889+{
4890+ mdp_super_t *sb;
4891+ int ret = -EINVAL;
4892+
4893+ sb = rdev->sb;
4894+ if (!sb) {
4895+ MD_BUG();
4896+ goto abort;
4897+ }
4898+
4899+ if (sb->md_magic != MD_SB_MAGIC) {
4900+ printk (BAD_MAGIC, partition_name(rdev->dev));
4901+ goto abort;
4902+ }
4903+
4904+ if (sb->md_minor >= MAX_MD_DEVS) {
4905+ printk (BAD_MINOR, partition_name(rdev->dev),
4906+ sb->md_minor);
4907+ goto abort;
4908+ }
4909+
4910+ if (calc_sb_csum(sb) != sb->sb_csum)
4911+ printk(BAD_CSUM, partition_name(rdev->dev));
4912+ ret = 0;
4913+abort:
4914+ return ret;
4915+}
4916+
4917+static kdev_t dev_unit(kdev_t dev)
4918+{
4919+ unsigned int mask;
4920+ struct gendisk *hd = find_gendisk(dev);
4921+
4922+ if (!hd)
4923+ return 0;
4924+ mask = ~((1 << hd->minor_shift) - 1);
4925+
4926+ return MKDEV(MAJOR(dev), MINOR(dev) & mask);
4927+}
4928+
4929+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
4930+{
4931+ struct md_list_head *tmp;
4932+ mdk_rdev_t *rdev;
4933+
4934+ ITERATE_RDEV(mddev,rdev,tmp)
4935+ if (dev_unit(rdev->dev) == dev_unit(dev))
4936+ return rdev;
4937+
4938+ return NULL;
4939+}
4940+
4941+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
4942+{
4943+ struct md_list_head *tmp;
4944+ mdk_rdev_t *rdev;
4945+
4946+ ITERATE_RDEV(mddev1,rdev,tmp)
4947+ if (match_dev_unit(mddev2, rdev->dev))
4948+ return 1;
4949+
4950+ return 0;
4951+}
4952+
4953+static MD_LIST_HEAD(all_raid_disks);
4954+static MD_LIST_HEAD(pending_raid_disks);
4955+
4956+static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
4957+{
4958+ mdk_rdev_t *same_pdev;
4959+
4960+ if (rdev->mddev) {
4961+ MD_BUG();
4962+ return;
4963+ }
4964+ same_pdev = match_dev_unit(mddev, rdev->dev);
4965+ if (same_pdev)
4966+ printk( KERN_WARNING
4967+"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
4968+" protection against single-disk failure might be compromised.\n",
4969+ mdidx(mddev), partition_name(rdev->dev),
4970+ partition_name(same_pdev->dev));
4971+
4972+ md_list_add(&rdev->same_set, &mddev->disks);
4973+ rdev->mddev = mddev;
4974+ mddev->nb_dev++;
4975+ printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
4976+}
4977+
4978+static void unbind_rdev_from_array (mdk_rdev_t * rdev)
4979+{
4980+ if (!rdev->mddev) {
4981+ MD_BUG();
4982+ return;
4983+ }
4984+ md_list_del(&rdev->same_set);
4985+ MD_INIT_LIST_HEAD(&rdev->same_set);
4986+ rdev->mddev->nb_dev--;
4987+ printk("unbind<%s,%d>\n", partition_name(rdev->dev),
4988+ rdev->mddev->nb_dev);
4989+ rdev->mddev = NULL;
4990+}
4991+
4992+/*
4993+ * prevent the device from being mounted, repartitioned or
4994+ * otherwise reused by a RAID array (or any other kernel
4995+ * subsystem), by opening the device. [simply getting an
4996+ * inode is not enough, the SCSI module usage code needs
4997+ * an explicit open() on the device]
4998+ */
4999+static int lock_rdev (mdk_rdev_t *rdev)
5000+{
5001+ int err = 0;
5002+
5003+ /*
5004+ * First insert a dummy inode.
5005+ */
5006+ if (rdev->inode)
5007+ MD_BUG();
5008+ rdev->inode = get_empty_inode();
5009+ /*
5010+	 * we don't care about any other fields
5011+ */
5012+ rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev;
5013+ insert_inode_hash(rdev->inode);
5014+
5015+ memset(&rdev->filp, 0, sizeof(rdev->filp));
5016+ rdev->filp.f_mode = 3; /* read write */
5017+ err = blkdev_open(rdev->inode, &rdev->filp);
5018+ if (err) {
5019+ printk("blkdev_open() failed: %d\n", err);
5020+ clear_inode(rdev->inode);
5021+ rdev->inode = NULL;
5022+ }
5023+ return err;
5024+}
5025+
5026+static void unlock_rdev (mdk_rdev_t *rdev)
5027+{
5028+ blkdev_release(rdev->inode);
5029+ if (!rdev->inode)
5030+ MD_BUG();
5031+ clear_inode(rdev->inode);
5032+ rdev->inode = NULL;
5033+}
5034+
5035+static void export_rdev (mdk_rdev_t * rdev)
5036+{
5037+ printk("export_rdev(%s)\n",partition_name(rdev->dev));
5038+ if (rdev->mddev)
5039+ MD_BUG();
5040+ unlock_rdev(rdev);
5041+ free_disk_sb(rdev);
5042+ md_list_del(&rdev->all);
5043+ MD_INIT_LIST_HEAD(&rdev->all);
5044+ if (rdev->pending.next != &rdev->pending) {
5045+ printk("(%s was pending)\n",partition_name(rdev->dev));
5046+ md_list_del(&rdev->pending);
5047+ MD_INIT_LIST_HEAD(&rdev->pending);
5048+ }
5049+ rdev->dev = 0;
5050+ rdev->faulty = 0;
5051+ kfree(rdev);
5052+}
5053+
5054+static void kick_rdev_from_array (mdk_rdev_t * rdev)
5055+{
5056+ unbind_rdev_from_array(rdev);
5057+ export_rdev(rdev);
5058+}
5059+
5060+static void export_array (mddev_t *mddev)
5061+{
5062+ struct md_list_head *tmp;
5063+ mdk_rdev_t *rdev;
5064+ mdp_super_t *sb = mddev->sb;
5065+
5066+ if (mddev->sb) {
5067+ mddev->sb = NULL;
5068+ free_page((unsigned long) sb);
5069+ }
5070+
5071+ ITERATE_RDEV(mddev,rdev,tmp) {
5072+ if (!rdev->mddev) {
5073+ MD_BUG();
5074+ continue;
5075+ }
5076+ kick_rdev_from_array(rdev);
5077+ }
5078+ if (mddev->nb_dev)
5079+ MD_BUG();
5080+}
5081+
5082+#undef BAD_CSUM
5083+#undef BAD_MAGIC
5084+#undef OUT_OF_MEM
5085+#undef NO_SB
5086+
5087+static void print_desc(mdp_disk_t *desc)
5088+{
5089+ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
5090+ partition_name(MKDEV(desc->major,desc->minor)),
5091+ desc->major,desc->minor,desc->raid_disk,desc->state);
5092+}
5093+
5094+static void print_sb(mdp_super_t *sb)
5095+{
5096+ int i;
5097+
5098+ printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
5099+ sb->major_version, sb->minor_version, sb->patch_version,
5100+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
5101+ sb->ctime);
5102+ printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
5103+ sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
5104+ sb->layout, sb->chunk_size);
5105+ printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
5106+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
5107+ sb->failed_disks, sb->spare_disks,
5108+ sb->sb_csum, (unsigned long)get_unaligned(&sb->events));
5109+
5110+ for (i = 0; i < MD_SB_DISKS; i++) {
5111+ mdp_disk_t *desc;
5112+
5113+ desc = sb->disks + i;
5114+ printk(" D %2d: ", i);
5115+ print_desc(desc);
5116+ }
5117+ printk(" THIS: ");
5118+ print_desc(&sb->this_disk);
5119+
5120+}
5121+
5122+static void print_rdev(mdk_rdev_t *rdev)
5123+{
5124+ printk(" rdev %s: O:%s, SZ:%08d F:%d DN:%d ",
5125+ partition_name(rdev->dev), partition_name(rdev->old_dev),
5126+ rdev->size, rdev->faulty, rdev->desc_nr);
5127+ if (rdev->sb) {
5128+ printk("rdev superblock:\n");
5129+ print_sb(rdev->sb);
5130+ } else
5131+ printk("no rdev superblock!\n");
5132+}
5133+
5134+void md_print_devices (void)
5135+{
5136+ struct md_list_head *tmp, *tmp2;
5137+ mdk_rdev_t *rdev;
5138+ mddev_t *mddev;
5139+
5140+ printk("\n");
5141+ printk(" **********************************\n");
5142+ printk(" * <COMPLETE RAID STATE PRINTOUT> *\n");
5143+ printk(" **********************************\n");
5144+ ITERATE_MDDEV(mddev,tmp) {
5145+ printk("md%d: ", mdidx(mddev));
5146+
5147+ ITERATE_RDEV(mddev,rdev,tmp2)
5148+ printk("<%s>", partition_name(rdev->dev));
5149+
5150+ if (mddev->sb) {
5151+ printk(" array superblock:\n");
5152+ print_sb(mddev->sb);
5153+ } else
5154+ printk(" no array superblock.\n");
5155+
5156+ ITERATE_RDEV(mddev,rdev,tmp2)
5157+ print_rdev(rdev);
5158+ }
5159+ printk(" **********************************\n");
5160+ printk("\n");
5161+}
5162+
5163+static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
5164+{
5165+ int ret;
5166+ mdp_super_t *tmp1, *tmp2;
5167+
5168+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
5169+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
5170+
5171+ if (!tmp1 || !tmp2) {
5172+ ret = 0;
5173+ goto abort;
5174+ }
5175+
5176+ *tmp1 = *sb1;
5177+ *tmp2 = *sb2;
5178+
5179+ /*
5180+ * nr_disks is not constant
5181+ */
5182+ tmp1->nr_disks = 0;
5183+ tmp2->nr_disks = 0;
5184+
5185+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
5186+ ret = 0;
5187+ else
5188+ ret = 1;
5189+
5190+abort:
5191+ if (tmp1)
5192+ kfree(tmp1);
5193+ if (tmp2)
5194+ kfree(tmp2);
5195+
5196+ return ret;
5197+}
5198+
5199+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
5200+{
5201+ if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
5202+ (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
5203+ (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
5204+ (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
5205+
5206+ return 1;
5207+
5208+ return 0;
5209+}
5210+
5211+static mdk_rdev_t * find_rdev_all (kdev_t dev)
5212+{
5213+ struct md_list_head *tmp;
5214+ mdk_rdev_t *rdev;
5215+
5216+ tmp = all_raid_disks.next;
5217+ while (tmp != &all_raid_disks) {
5218+ rdev = md_list_entry(tmp, mdk_rdev_t, all);
5219+ if (rdev->dev == dev)
5220+ return rdev;
5221+ tmp = tmp->next;
5222+ }
5223+ return NULL;
5224+}
5225+
5226+#define GETBLK_FAILED KERN_ERR \
5227+"md: getblk failed for device %s\n"
5228+
5229+static int write_disk_sb(mdk_rdev_t * rdev)
5230+{
5231+ struct buffer_head *bh;
5232+ kdev_t dev;
5233+ u32 sb_offset, size;
5234+ mdp_super_t *sb;
5235+
5236+ if (!rdev->sb) {
5237+ MD_BUG();
5238+ return -1;
5239+ }
5240+ if (rdev->faulty) {
5241+ MD_BUG();
5242+ return -1;
5243+ }
5244+ if (rdev->sb->md_magic != MD_SB_MAGIC) {
5245+ MD_BUG();
5246+ return -1;
5247+ }
5248+
5249+ dev = rdev->dev;
5250+ sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
5251+ if (rdev->sb_offset != sb_offset) {
5252+ printk("%s's sb offset has changed from %d to %d, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
5253+ goto skip;
5254+ }
5255+ /*
5256+ * If the disk went offline meanwhile and it's just a spare, then
5257+	 * its size has changed to zero silently, and the MD code does
5258+ * not yet know that it's faulty.
5259+ */
5260+ size = calc_dev_size(dev, rdev->mddev, 1);
5261+ if (size != rdev->size) {
5262+ printk("%s's size has changed from %d to %d since import, skipping\n", partition_name(dev), rdev->size, size);
5263+ goto skip;
5264+ }
5265+
5266+ printk("(write) %s's sb offset: %d\n", partition_name(dev), sb_offset);
5267+ fsync_dev(dev);
5268+ set_blocksize(dev, MD_SB_BYTES);
5269+ bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
5270+ if (!bh) {
5271+ printk(GETBLK_FAILED, partition_name(dev));
5272+ return 1;
5273+ }
5274+ memset(bh->b_data,0,bh->b_size);
5275+ sb = (mdp_super_t *) bh->b_data;
5276+ memcpy(sb, rdev->sb, MD_SB_BYTES);
5277+
5278+ mark_buffer_uptodate(bh, 1);
5279+ mark_buffer_dirty(bh, 1);
5280+ ll_rw_block(WRITE, 1, &bh);
5281+ wait_on_buffer(bh);
5282+ brelse(bh);
5283+ fsync_dev(dev);
5284+skip:
5285+ return 0;
5286+}
5287+#undef GETBLK_FAILED
5288+
5289+static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
5290+{
5291+ int i, ok = 0;
5292+ mdp_disk_t *desc;
5293+
5294+ for (i = 0; i < MD_SB_DISKS; i++) {
5295+ desc = mddev->sb->disks + i;
5296+#if 0
5297+ if (disk_faulty(desc)) {
5298+ if (MKDEV(desc->major,desc->minor) == rdev->dev)
5299+ ok = 1;
5300+ continue;
5301+ }
5302+#endif
5303+ if (MKDEV(desc->major,desc->minor) == rdev->dev) {
5304+ rdev->sb->this_disk = *desc;
5305+ rdev->desc_nr = desc->number;
5306+ ok = 1;
5307+ break;
5308+ }
5309+ }
5310+
5311+ if (!ok) {
5312+ MD_BUG();
5313+ }
5314+}
5315+
5316+static int sync_sbs(mddev_t * mddev)
5317+{
5318+ mdk_rdev_t *rdev;
5319+ mdp_super_t *sb;
5320+ struct md_list_head *tmp;
5321+
5322+ ITERATE_RDEV(mddev,rdev,tmp) {
5323+ if (rdev->faulty)
5324+ continue;
5325+ sb = rdev->sb;
5326+ *sb = *mddev->sb;
5327+ set_this_disk(mddev, rdev);
5328+ sb->sb_csum = calc_sb_csum(sb);
5329+ }
5330+ return 0;
5331+}
5332+
5333+int md_update_sb(mddev_t * mddev)
5334+{
5335+ int first, err, count = 100;
5336+ struct md_list_head *tmp;
5337+ mdk_rdev_t *rdev;
5338+ __u64 ev;
5339+
5340+repeat:
5341+ mddev->sb->utime = CURRENT_TIME;
5342+ ev = get_unaligned(&mddev->sb->events);
5343+ ++ev;
5344+ put_unaligned(ev,&mddev->sb->events);
5345+ if (ev == (__u64)0) {
5346+ /*
5347+ * oops, this 64-bit counter should never wrap.
5348+ * Either we are in around ~1 trillion A.C., assuming
5349+ * 1 reboot per second, or we have a bug:
5350+ */
5351+ MD_BUG();
5352+ --ev;
5353+ put_unaligned(ev,&mddev->sb->events);
5354+ }
5355+ sync_sbs(mddev);
5356+
5357+ /*
5358+ * do not write anything to disk if using
5359+ * nonpersistent superblocks
5360+ */
5361+ if (mddev->sb->not_persistent)
5362+ return 0;
5363+
5364+ printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
5365+ mdidx(mddev));
5366+
5367+ first = 1;
5368+ err = 0;
5369+ ITERATE_RDEV(mddev,rdev,tmp) {
5370+		if (first)
5371+			first = 0;
5372+		else
5373+			printk(", ");
5374+ if (rdev->faulty)
5375+ printk("(skipping faulty ");
5376+ printk("%s ", partition_name(rdev->dev));
5377+ if (!rdev->faulty) {
5378+ printk("[events: %08lx]",
5379+ (unsigned long)get_unaligned(&rdev->sb->events));
5380+ err += write_disk_sb(rdev);
5381+ } else
5382+ printk(")\n");
5383+ }
5384+ printk(".\n");
5385+ if (err) {
5386+		printk("errors occurred during superblock update, repeating\n");
5387+ if (--count)
5388+ goto repeat;
5389+		printk("excessive errors occurred during superblock update, exiting\n");
5390+ }
5391+ return 0;
5392+}
5393+
5394+/*
5395+ * Import a device. If 'on_disk', then sanity check the superblock
5396+ *
5397+ * mark the device faulty if:
5398+ *
5399+ * - the device is nonexistent (zero size)
5400+ * - the device has no valid superblock
5401+ *
5402+ * a faulty rdev _never_ has rdev->sb set.
5403+ */
5404+static int md_import_device (kdev_t newdev, int on_disk)
5405+{
5406+ int err;
5407+ mdk_rdev_t *rdev;
5408+ unsigned int size;
5409+
5410+ if (find_rdev_all(newdev))
5411+ return -EEXIST;
5412+
5413+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
5414+ if (!rdev) {
5415+ printk("could not alloc mem for %s!\n", partition_name(newdev));
5416+ return -ENOMEM;
5417+ }
5418+ memset(rdev, 0, sizeof(*rdev));
5419+
5420+ if (!fs_may_mount(newdev)) {
5421+ printk("md: can not import %s, has active inodes!\n",
5422+ partition_name(newdev));
5423+ err = -EBUSY;
5424+ goto abort_free;
5425+ }
5426+
5427+ if ((err = alloc_disk_sb(rdev)))
5428+ goto abort_free;
5429+
5430+ rdev->dev = newdev;
5431+ if (lock_rdev(rdev)) {
5432+ printk("md: could not lock %s, zero-size? Marking faulty.\n",
5433+ partition_name(newdev));
5434+ err = -EINVAL;
5435+ goto abort_free;
5436+ }
5437+ rdev->desc_nr = -1;
5438+ rdev->faulty = 0;
5439+
5440+ size = 0;
5441+ if (blk_size[MAJOR(newdev)])
5442+ size = blk_size[MAJOR(newdev)][MINOR(newdev)];
5443+ if (!size) {
5444+ printk("md: %s has zero size, marking faulty!\n",
5445+ partition_name(newdev));
5446+ err = -EINVAL;
5447+ goto abort_free;
5448+ }
5449+
5450+ if (on_disk) {
5451+ if ((err = read_disk_sb(rdev))) {
5452+ printk("md: could not read %s's sb, not importing!\n",
5453+ partition_name(newdev));
5454+ goto abort_free;
5455+ }
5456+ if ((err = check_disk_sb(rdev))) {
5457+ printk("md: %s has invalid sb, not importing!\n",
5458+ partition_name(newdev));
5459+ goto abort_free;
5460+ }
5461+
5462+ rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
5463+ rdev->sb->this_disk.minor);
5464+ rdev->desc_nr = rdev->sb->this_disk.number;
5465+ }
5466+ md_list_add(&rdev->all, &all_raid_disks);
5467+ MD_INIT_LIST_HEAD(&rdev->pending);
5468+
5469+ if (rdev->faulty && rdev->sb)
5470+ free_disk_sb(rdev);
5471+ return 0;
5472+
5473+abort_free:
5474+ if (rdev->sb) {
5475+ if (rdev->inode)
5476+ unlock_rdev(rdev);
5477+ free_disk_sb(rdev);
5478+ }
5479+ kfree(rdev);
5480+ return err;
5481+}
5482+
5483+/*
5484+ * Check a full RAID array for plausibility
5485+ */
5486+
5487+#define INCONSISTENT KERN_ERR \
5488+"md: fatal superblock inconsistency in %s -- removing from array\n"
5489+
5490+#define OUT_OF_DATE KERN_ERR \
5491+"md: superblock update time inconsistency -- using the most recent one\n"
5492+
5493+#define OLD_VERSION KERN_ALERT \
5494+"md: md%d: unsupported raid array version %d.%d.%d\n"
5495+
5496+#define NOT_CLEAN_IGNORE KERN_ERR \
5497+"md: md%d: raid array is not clean -- starting background reconstruction\n"
5498+
5499+#define UNKNOWN_LEVEL KERN_ERR \
5500+"md: md%d: unsupported raid level %d\n"
5501+
5502+static int analyze_sbs (mddev_t * mddev)
5503+{
5504+ int out_of_date = 0, i;
5505+ struct md_list_head *tmp, *tmp2;
5506+ mdk_rdev_t *rdev, *rdev2, *freshest;
5507+ mdp_super_t *sb;
5508+
5509+ /*
5510+ * Verify the RAID superblock on each real device
5511+ */
5512+ ITERATE_RDEV(mddev,rdev,tmp) {
5513+ if (rdev->faulty) {
5514+ MD_BUG();
5515+ goto abort;
5516+ }
5517+ if (!rdev->sb) {
5518+ MD_BUG();
5519+ goto abort;
5520+ }
5521+ if (check_disk_sb(rdev))
5522+ goto abort;
5523+ }
5524+
5525+ /*
5526+ * The superblock constant part has to be the same
5527+ * for all disks in the array.
5528+ */
5529+ sb = NULL;
5530+
5531+ ITERATE_RDEV(mddev,rdev,tmp) {
5532+ if (!sb) {
5533+ sb = rdev->sb;
5534+ continue;
5535+ }
5536+ if (!sb_equal(sb, rdev->sb)) {
5537+ printk (INCONSISTENT, partition_name(rdev->dev));
5538+ kick_rdev_from_array(rdev);
5539+ continue;
5540+ }
5541+ }
5542+
5543+ /*
5544+ * OK, we have all disks and the array is ready to run. Let's
5545+ * find the freshest superblock, that one will be the superblock
5546+ * that represents the whole array.
5547+ */
5548+ if (!mddev->sb)
5549+ if (alloc_array_sb(mddev))
5550+ goto abort;
5551+ sb = mddev->sb;
5552+ freshest = NULL;
5553+
5554+ ITERATE_RDEV(mddev,rdev,tmp) {
5555+ __u64 ev1, ev2;
5556+ /*
5557+ * if the checksum is invalid, use the superblock
5558+		 * only as a last resort. (decrease its age by
5559+ * one event)
5560+ */
5561+ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
5562+ __u64 ev = get_unaligned(&rdev->sb->events);
5563+ if (ev != (__u64)0) {
5564+ --ev;
5565+ put_unaligned(ev,&rdev->sb->events);
5566+ }
5567+ }
5568+
5569+ printk("%s's event counter: %08lx\n", partition_name(rdev->dev),
5570+ (unsigned long)get_unaligned(&rdev->sb->events));
5571+ if (!freshest) {
5572+ freshest = rdev;
5573+ continue;
5574+ }
5575+ /*
5576+ * Find the newest superblock version
5577+ */
5578+ ev1 = get_unaligned(&rdev->sb->events);
5579+ ev2 = get_unaligned(&freshest->sb->events);
5580+ if (ev1 != ev2) {
5581+ out_of_date = 1;
5582+ if (ev1 > ev2)
5583+ freshest = rdev;
5584+ }
5585+ }
5586+ if (out_of_date) {
5587+ printk(OUT_OF_DATE);
5588+ printk("freshest: %s\n", partition_name(freshest->dev));
5589+ }
5590+ memcpy (sb, freshest->sb, sizeof(*sb));
5591+
5592+ /*
5593+ * at this point we have picked the 'best' superblock
5594+ * from all available superblocks.
5595+ * now we validate this superblock and kick out possibly
5596+ * failed disks.
5597+ */
5598+ ITERATE_RDEV(mddev,rdev,tmp) {
5599+ /*
5600+ * Kick all non-fresh devices faulty
5601+ */
5602+ __u64 ev1, ev2;
5603+ ev1 = get_unaligned(&rdev->sb->events);
5604+ ev2 = get_unaligned(&sb->events);
5605+ ++ev1;
5606+ if (ev1 < ev2) {
5607+ printk("md: kicking non-fresh %s from array!\n",
5608+ partition_name(rdev->dev));
5609+ kick_rdev_from_array(rdev);
5610+ continue;
5611+ }
5612+ }
5613+
5614+ /*
5615+ * Fix up changed device names ... but only if this disk has a
5616+ * recent update time. Use faulty checksum ones too.
5617+ */
5618+ ITERATE_RDEV(mddev,rdev,tmp) {
5619+ __u64 ev1, ev2, ev3;
5620+ if (rdev->faulty) { /* REMOVEME */
5621+ MD_BUG();
5622+ goto abort;
5623+ }
5624+ ev1 = get_unaligned(&rdev->sb->events);
5625+ ev2 = get_unaligned(&sb->events);
5626+ ev3 = ev2;
5627+ --ev3;
5628+ if ((rdev->dev != rdev->old_dev) &&
5629+ ((ev1 == ev2) || (ev1 == ev3))) {
5630+ mdp_disk_t *desc;
5631+
5632+ printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev));
5633+ if (rdev->desc_nr == -1) {
5634+ MD_BUG();
5635+ goto abort;
5636+ }
5637+ desc = &sb->disks[rdev->desc_nr];
5638+ if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
5639+ MD_BUG();
5640+ goto abort;
5641+ }
5642+ desc->major = MAJOR(rdev->dev);
5643+ desc->minor = MINOR(rdev->dev);
5644+ desc = &rdev->sb->this_disk;
5645+ desc->major = MAJOR(rdev->dev);
5646+ desc->minor = MINOR(rdev->dev);
5647+ }
5648+ }
5649+
5650+ /*
5651+ * Remove unavailable and faulty devices ...
5652+ *
5653+ * note that if an array becomes completely unrunnable due to
5654+ * missing devices, we do not write the superblock back, so the
5655+ * administrator has a chance to fix things up. The removal thus
5656+ * only happens if it's nonfatal to the contents of the array.
5657+ */
5658+ for (i = 0; i < MD_SB_DISKS; i++) {
5659+ int found;
5660+ mdp_disk_t *desc;
5661+ kdev_t dev;
5662+
5663+ desc = sb->disks + i;
5664+ dev = MKDEV(desc->major, desc->minor);
5665+
5666+ /*
5667+ * We kick faulty devices/descriptors immediately.
5668+ */
5669+ if (disk_faulty(desc)) {
5670+ found = 0;
5671+ ITERATE_RDEV(mddev,rdev,tmp) {
5672+ if (rdev->desc_nr != desc->number)
5673+ continue;
5674+ printk("md%d: kicking faulty %s!\n",
5675+ mdidx(mddev),partition_name(rdev->dev));
5676+ kick_rdev_from_array(rdev);
5677+ found = 1;
5678+ break;
5679+ }
5680+ if (!found) {
5681+ if (dev == MKDEV(0,0))
5682+ continue;
5683+ printk("md%d: removing former faulty %s!\n",
5684+ mdidx(mddev), partition_name(dev));
5685+ }
5686+ remove_descriptor(desc, sb);
5687+ continue;
5688+ }
5689+
5690+ if (dev == MKDEV(0,0))
5691+ continue;
5692+ /*
5693+ * Is this device present in the rdev ring?
5694+ */
5695+ found = 0;
5696+ ITERATE_RDEV(mddev,rdev,tmp) {
5697+ if (rdev->desc_nr == desc->number) {
5698+ found = 1;
5699+ break;
5700+ }
5701+ }
5702+ if (found)
5703+ continue;
5704+
5705+ printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev));
5706+ remove_descriptor(desc, sb);
5707+ }
5708+
5709+ /*
5710+	 * Double check whether all devices mentioned in the
5711+ * superblock are in the rdev ring.
5712+ */
5713+ for (i = 0; i < MD_SB_DISKS; i++) {
5714+ mdp_disk_t *desc;
5715+ kdev_t dev;
5716+
5717+ desc = sb->disks + i;
5718+ dev = MKDEV(desc->major, desc->minor);
5719+
5720+ if (dev == MKDEV(0,0))
5721+ continue;
5722+
5723+ if (disk_faulty(desc)) {
5724+ MD_BUG();
5725+ goto abort;
5726+ }
5727+
5728+ rdev = find_rdev(mddev, dev);
5729+ if (!rdev) {
5730+ MD_BUG();
5731+ goto abort;
5732+ }
5733+ }
5734+
5735+ /*
5736+ * Do a final reality check.
5737+ */
5738+ ITERATE_RDEV(mddev,rdev,tmp) {
5739+ if (rdev->desc_nr == -1) {
5740+ MD_BUG();
5741+ goto abort;
5742+ }
5743+ /*
5744+ * is the desc_nr unique?
5745+ */
5746+ ITERATE_RDEV(mddev,rdev2,tmp2) {
5747+ if ((rdev2 != rdev) &&
5748+ (rdev2->desc_nr == rdev->desc_nr)) {
5749+ MD_BUG();
5750+ goto abort;
5751+ }
5752+ }
5753+ /*
5754+ * is the device unique?
5755+ */
5756+ ITERATE_RDEV(mddev,rdev2,tmp2) {
5757+ if ((rdev2 != rdev) &&
5758+ (rdev2->dev == rdev->dev)) {
5759+ MD_BUG();
5760+ goto abort;
5761+ }
5762+ }
5763+ }
5764+
5765+ /*
5766+ * Check if we can support this RAID array
5767+ */
5768+ if (sb->major_version != MD_MAJOR_VERSION ||
5769+ sb->minor_version > MD_MINOR_VERSION) {
5770+
5771+ printk (OLD_VERSION, mdidx(mddev), sb->major_version,
5772+ sb->minor_version, sb->patch_version);
5773+ goto abort;
5774+ }
5775+
5776+ if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
5777+ (sb->level == 4) || (sb->level == 5)))
5778+ printk (NOT_CLEAN_IGNORE, mdidx(mddev));
5779+
5780+ return 0;
5781+abort:
5782+ return 1;
5783+}
5784+
5785+#undef INCONSISTENT
5786+#undef OUT_OF_DATE
5787+#undef OLD_VERSION
5788+#undef OLD_LEVEL
5789+
5790+static int device_size_calculation (mddev_t * mddev)
5791+{
5792+ int data_disks = 0, persistent;
5793+ unsigned int readahead;
5794+ mdp_super_t *sb = mddev->sb;
5795+ struct md_list_head *tmp;
5796+ mdk_rdev_t *rdev;
5797+
5798+ /*
5799+ * Do device size calculation. Bail out if too small.
5800+ * (we have to do this after having validated chunk_size,
5801+ * because device size has to be modulo chunk_size)
5802+ */
5803+ persistent = !mddev->sb->not_persistent;
5804+ ITERATE_RDEV(mddev,rdev,tmp) {
5805+ if (rdev->faulty)
5806+ continue;
5807+ if (rdev->size) {
5808+ MD_BUG();
5809+ continue;
5810+ }
5811+ rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
5812+ if (rdev->size < sb->chunk_size / 1024) {
5813+ printk (KERN_WARNING
5814+ "Dev %s smaller than chunk_size: %dk < %dk\n",
5815+ partition_name(rdev->dev),
5816+ rdev->size, sb->chunk_size / 1024);
5817+ return -EINVAL;
5818+ }
5819+ }
5820+
5821+ switch (sb->level) {
5822+ case -3:
5823+ data_disks = 1;
5824+ break;
5825+ case -2:
5826+ data_disks = 1;
5827+ break;
5828+ case -1:
5829+ zoned_raid_size(mddev);
5830+ data_disks = 1;
5831+ break;
5832+ case 0:
5833+ zoned_raid_size(mddev);
5834+ data_disks = sb->raid_disks;
5835+ break;
5836+ case 1:
5837+ data_disks = 1;
5838+ break;
5839+ case 4:
5840+ case 5:
5841+ data_disks = sb->raid_disks-1;
5842+ break;
5843+ default:
5844+ printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
5845+ goto abort;
5846+ }
5847+ if (!md_size[mdidx(mddev)])
5848+ md_size[mdidx(mddev)] = sb->size * data_disks;
5849+
5850+ readahead = MD_READAHEAD;
5851+ if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5))
5852+ readahead = mddev->sb->chunk_size * 4 * data_disks;
5853+ if (readahead < data_disks * MAX_SECTORS*512*2)
5854+ readahead = data_disks * MAX_SECTORS*512*2;
5855+ else {
5856+ if (sb->level == -3)
5857+ readahead = 0;
5858+ }
5859+ md_maxreadahead[mdidx(mddev)] = readahead;
5860+
5861+ printk(KERN_INFO "md%d: max total readahead window set to %dk\n",
5862+ mdidx(mddev), readahead/1024);
5863+
5864+ printk(KERN_INFO
5865+ "md%d: %d data-disks, max readahead per data-disk: %dk\n",
5866+ mdidx(mddev), data_disks, readahead/data_disks/1024);
5867+ return 0;
5868+abort:
5869+ return 1;
5870+}
5871+
5872+
5873+#define TOO_BIG_CHUNKSIZE KERN_ERR \
5874+"too big chunk_size: %d > %d\n"
5875+
5876+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
5877+"too small chunk_size: %d < %ld\n"
5878+
5879+#define BAD_CHUNKSIZE KERN_ERR \
5880+"no chunksize specified, see 'man raidtab'\n"
5881+
5882+static int do_md_run (mddev_t * mddev)
5883+{
5884+ int pnum, err;
5885+ int chunk_size;
5886+ struct md_list_head *tmp;
5887+ mdk_rdev_t *rdev;
5888+
5889+
5890+ if (!mddev->nb_dev) {
5891+ MD_BUG();
5892+ return -EINVAL;
5893+ }
5894+
5895+ if (mddev->pers)
5896+ return -EBUSY;
5897+
5898+ /*
5899+ * Resize disks to align partitions size on a given
5900+ * chunk size.
5901+ */
5902+ md_size[mdidx(mddev)] = 0;
5903+
5904+ /*
5905+ * Analyze all RAID superblock(s)
5906+ */
5907+ if (analyze_sbs(mddev)) {
5908+ MD_BUG();
5909+ return -EINVAL;
5910+ }
5911+
5912+ chunk_size = mddev->sb->chunk_size;
5913+ pnum = level_to_pers(mddev->sb->level);
5914+
5915+ mddev->param.chunk_size = chunk_size;
5916+ mddev->param.personality = pnum;
5917+
5918+ if (chunk_size > MAX_CHUNK_SIZE) {
5919+ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
5920+ return -EINVAL;
5921+ }
5922+ /*
5923+ * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
5924+	 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
5925+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
5926+ MD_BUG();
5927+ return -EINVAL;
5928+ }
5929+ if (chunk_size < PAGE_SIZE) {
5930+ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
5931+ return -EINVAL;
5932+ }
5933+
5934+ if (pnum >= MAX_PERSONALITY) {
5935+ MD_BUG();
5936+ return -EINVAL;
5937+ }
5938+
5939+ if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
5940+ /*
5941+ * 'default chunksize' in the old md code used to
5942+ * be PAGE_SIZE, baaad.
5943+		 * we abort here to be on the safe side. We don't
5944+ * want to continue the bad practice.
5945+ */
5946+ printk(BAD_CHUNKSIZE);
5947+ return -EINVAL;
5948+ }
5949+
5950+ if (!pers[pnum])
5951+ {
5952+#ifdef CONFIG_KMOD
5953+ char module_name[80];
5954+ sprintf (module_name, "md-personality-%d", pnum);
5955+ request_module (module_name);
5956+ if (!pers[pnum])
5957+#endif
5958+ return -EINVAL;
5959+ }
5960+
5961+ if (device_size_calculation(mddev))
5962+ return -EINVAL;
5963+
5964+ /*
5965+ * Drop all container device buffers, from now on
5966+ * the only valid external interface is through the md
5967+ * device.
5968+ */
5969+ ITERATE_RDEV(mddev,rdev,tmp) {
5970+ if (rdev->faulty)
5971+ continue;
5972+ fsync_dev(rdev->dev);
5973+ invalidate_buffers(rdev->dev);
5974+ }
5975+
5976+ mddev->pers = pers[pnum];
5977+
5978+ err = mddev->pers->run(mddev);
5979+ if (err) {
5980+ printk("pers->run() failed ...\n");
5981+ mddev->pers = NULL;
5982+ return -EINVAL;
5983+ }
5984+
5985+ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
5986+ md_update_sb(mddev);
5987+
5988+ /*
5989+ * md_size has units of 1K blocks, which are
5990+ * twice as large as sectors.
5991+ */
5992+ md_hd_struct[mdidx(mddev)].start_sect = 0;
5993+ md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1;
5994+
5995+ read_ahead[MD_MAJOR] = 1024;
5996+ return (0);
5997+}
5998+
5999+#undef TOO_BIG_CHUNKSIZE
6000+#undef BAD_CHUNKSIZE
6001+
6002+#define OUT(x) do { err = (x); goto out; } while (0)
6003+
6004+static int restart_array (mddev_t *mddev)
6005+{
6006+ int err = 0;
6007+
6008+ /*
6009+ * Complain if it has no devices
6010+ */
6011+ if (!mddev->nb_dev)
6012+ OUT(-ENXIO);
6013+
6014+ if (mddev->pers) {
6015+ if (!mddev->ro)
6016+ OUT(-EBUSY);
6017+
6018+ mddev->ro = 0;
6019+ set_device_ro(mddev_to_kdev(mddev), 0);
6020+
6021+ printk (KERN_INFO
6022+ "md%d switched to read-write mode.\n", mdidx(mddev));
6023+ /*
6024+ * Kick recovery or resync if necessary
6025+ */
6026+ md_recover_arrays();
6027+ if (mddev->pers->restart_resync)
6028+ mddev->pers->restart_resync(mddev);
6029+ } else
6030+ err = -EINVAL;
6031+
6032+out:
6033+ return err;
6034+}
6035+
6036+#define STILL_MOUNTED KERN_WARNING \
6037+"md: md%d still mounted.\n"
6038+
6039+static int do_md_stop (mddev_t * mddev, int ro)
6040+{
6041+ int err = 0, resync_interrupted = 0;
6042+ kdev_t dev = mddev_to_kdev(mddev);
6043+
6044+ if (!ro && !fs_may_mount (dev)) {
6045+ printk (STILL_MOUNTED, mdidx(mddev));
6046+ OUT(-EBUSY);
6047+ }
6048+
6049+ /*
6050+ * complain if it's already stopped
6051+ */
6052+ if (!mddev->nb_dev)
6053+ OUT(-ENXIO);
6054+
6055+ if (mddev->pers) {
6056+ /*
6057+ * It is safe to call stop here, it only frees private
6058+ * data. Also, it tells us if a device is unstoppable
6059+ * (eg. resyncing is in progress)
6060+ */
6061+ if (mddev->pers->stop_resync)
6062+ if (mddev->pers->stop_resync(mddev))
6063+ resync_interrupted = 1;
6064+
6065+ if (mddev->recovery_running)
6066+ md_interrupt_thread(md_recovery_thread);
6067+
6068+ /*
6069+ * This synchronizes with signal delivery to the
6070+ * resync or reconstruction thread. It also nicely
6071+ * hangs the process if some reconstruction has not
6072+ * finished.
6073+ */
6074+ down(&mddev->recovery_sem);
6075+ up(&mddev->recovery_sem);
6076+
6077+ /*
6078+ * sync and invalidate buffers because we cannot kill the
6079+ * main thread with valid IO transfers still around.
6080+ * the kernel lock protects us from new requests being
6081+ * added after invalidate_buffers().
6082+ */
6083+ fsync_dev (mddev_to_kdev(mddev));
6084+ fsync_dev (dev);
6085+ invalidate_buffers (dev);
6086+
6087+ if (ro) {
6088+ if (mddev->ro)
6089+ OUT(-ENXIO);
6090+ mddev->ro = 1;
6091+ } else {
6092+ if (mddev->ro)
6093+ set_device_ro(dev, 0);
6094+ if (mddev->pers->stop(mddev)) {
6095+ if (mddev->ro)
6096+ set_device_ro(dev, 1);
6097+ OUT(-EBUSY);
6098+ }
6099+ if (mddev->ro)
6100+ mddev->ro = 0;
6101+ }
6102+ if (mddev->sb) {
6103+ /*
6104+ * mark it clean only if there was no resync
6105+ * interrupted.
6106+ */
6107+ if (!mddev->recovery_running && !resync_interrupted) {
6108+ printk("marking sb clean...\n");
6109+ mddev->sb->state |= 1 << MD_SB_CLEAN;
6110+ }
6111+ md_update_sb(mddev);
6112+ }
6113+ if (ro)
6114+ set_device_ro(dev, 1);
6115+ }
6116+
6117+ /*
6118+ * Free resources if final stop
6119+ */
6120+ if (!ro) {
6121+ export_array(mddev);
6122+ md_size[mdidx(mddev)] = 0;
6123+ md_hd_struct[mdidx(mddev)].nr_sects = 0;
6124+ free_mddev(mddev);
6125+
6126+ printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
6127+ } else
6128+ printk (KERN_INFO
6129+ "md%d switched to read-only mode.\n", mdidx(mddev));
6130+out:
6131+ return err;
6132+}
6133+
6134+#undef OUT
6135+
6136+/*
6137+ * We have to safely support old arrays too.
6138+ */
6139+int detect_old_array (mdp_super_t *sb)
6140+{
6141+ if (sb->major_version > 0)
6142+ return 0;
6143+ if (sb->minor_version >= 90)
6144+ return 0;
6145+
6146+ return -EINVAL;
6147+}
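/*
 * Illustration (not part of the patch): a superblock reporting major_version 0
 * and minor_version 50 fails detect_old_array() with -EINVAL, while a 0.90 or
 * any 1.x superblock passes and may be autostarted.
 */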
6148+
6149+
6150+static void autorun_array (mddev_t *mddev)
6151+{
6152+ mdk_rdev_t *rdev;
6153+ struct md_list_head *tmp;
6154+ int err;
6155+
6156+ if (mddev->disks.prev == &mddev->disks) {
6157+ MD_BUG();
6158+ return;
6159+ }
6160+
6161+ printk("running: ");
6162+
6163+ ITERATE_RDEV(mddev,rdev,tmp) {
6164+ printk("<%s>", partition_name(rdev->dev));
6165+ }
6166+ printk("\nnow!\n");
6167+
6168+ err = do_md_run (mddev);
6169+ if (err) {
6170+ printk("do_md_run() returned %d\n", err);
6171+ /*
6172+ * prevent the writeback of an unrunnable array
6173+ */
6174+ mddev->sb_dirty = 0;
6175+ do_md_stop (mddev, 0);
6176+ }
6177+}
6178+
6179+/*
6180+ * let's try to run arrays based on all disks that have arrived
6181+ * until now. (those are in the ->pending list)
6182+ *
6183+ * the method: pick the first pending disk, collect all disks with
6184+ * the same UUID, remove all from the pending list and put them into
6185+ * the 'same_array' list. Then order this list based on superblock
6186+ * update time (freshest comes first), kick out 'old' disks and
6187+ * compare superblocks. If everything's fine then run it.
6188+ */
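/*
 * Concrete illustration (hypothetical device names, not part of the patch):
 * if the pending list holds sda1, sdb1 and sdc1, where sda1 and sdb1 share
 * one UUID with md_minor 0 and sdc1 carries another UUID with md_minor 1,
 * the first pass moves sda1 and sdb1 onto 'candidates', allocates md0,
 * binds both rdevs to it and calls autorun_array(); the next pass then
 * handles sdc1 and md1 on its own.
 */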
6189+static void autorun_devices (void)
6190+{
6191+ struct md_list_head candidates;
6192+ struct md_list_head *tmp;
6193+ mdk_rdev_t *rdev0, *rdev;
6194+ mddev_t *mddev;
6195+ kdev_t md_kdev;
6196+
6197+
6198+ printk("autorun ...\n");
6199+ while (pending_raid_disks.next != &pending_raid_disks) {
6200+ rdev0 = md_list_entry(pending_raid_disks.next,
6201+ mdk_rdev_t, pending);
6202+
6203+ printk("considering %s ...\n", partition_name(rdev0->dev));
6204+ MD_INIT_LIST_HEAD(&candidates);
6205+ ITERATE_RDEV_PENDING(rdev,tmp) {
6206+ if (uuid_equal(rdev0, rdev)) {
6207+ if (!sb_equal(rdev0->sb, rdev->sb)) {
6208+ printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev));
6209+ continue;
6210+ }
6211+ printk(" adding %s ...\n", partition_name(rdev->dev));
6212+ md_list_del(&rdev->pending);
6213+ md_list_add(&rdev->pending, &candidates);
6214+ }
6215+ }
6216 /*
6217- * Check the superblock for consistency.
6218- * The personality itself has to check whether it's getting
6219- * added with the proper flags. The personality has to be
6220- * checked too. ;)
6221+ * now we have a set of devices, with all of them having
6222+ * mostly sane superblocks. It's time to allocate the
6223+ * mddev.
6224 */
6225- if (analyze_one_sb (realdev))
6226+ md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
6227+ mddev = kdev_to_mddev(md_kdev);
6228+ if (mddev) {
6229+ printk("md%d already running, cannot run %s\n",
6230+ mdidx(mddev), partition_name(rdev0->dev));
6231+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
6232+ export_rdev(rdev);
6233+ continue;
6234+ }
6235+ mddev = alloc_mddev(md_kdev);
6236+ printk("created md%d\n", mdidx(mddev));
6237+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
6238+ bind_rdev_to_array(rdev, mddev);
6239+ md_list_del(&rdev->pending);
6240+ MD_INIT_LIST_HEAD(&rdev->pending);
6241+ }
6242+ autorun_array(mddev);
6243+ }
6244+ printk("... autorun DONE.\n");
6245+}
6246+
6247+/*
6248+ * import RAID devices based on one partition
6249+ * if possible, the array gets run as well.
6250+ */
6251+
6252+#define BAD_VERSION KERN_ERR \
6253+"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
6254+
6255+#define OUT_OF_MEM KERN_ALERT \
6256+"md: out of memory.\n"
6257+
6258+#define NO_DEVICE KERN_ERR \
6259+"md: disabled device %s\n"
6260+
6261+#define AUTOADD_FAILED KERN_ERR \
6262+"md: auto-adding devices to md%d FAILED (error %d).\n"
6263+
6264+#define AUTOADD_FAILED_USED KERN_ERR \
6265+"md: cannot auto-add device %s to md%d, already used.\n"
6266+
6267+#define AUTORUN_FAILED KERN_ERR \
6268+"md: auto-running md%d FAILED (error %d).\n"
6269+
6270+#define MDDEV_BUSY KERN_ERR \
6271+"md: cannot auto-add to md%d, already running.\n"
6272+
6273+#define AUTOADDING KERN_INFO \
6274+"md: auto-adding devices to md%d, based on %s's superblock.\n"
6275+
6276+#define AUTORUNNING KERN_INFO \
6277+"md: auto-running md%d.\n"
6278+
6279+static int autostart_array (kdev_t startdev)
6280+{
6281+ int err = -EINVAL, i;
6282+ mdp_super_t *sb = NULL;
6283+ mdk_rdev_t *start_rdev = NULL, *rdev;
6284+
6285+ if (md_import_device(startdev, 1)) {
6286+ printk("could not import %s!\n", partition_name(startdev));
6287+ goto abort;
6288+ }
6289+
6290+ start_rdev = find_rdev_all(startdev);
6291+ if (!start_rdev) {
6292+ MD_BUG();
6293+ goto abort;
6294+ }
6295+ if (start_rdev->faulty) {
6296+ printk("can not autostart based on faulty %s!\n",
6297+ partition_name(startdev));
6298+ goto abort;
6299+ }
6300+ md_list_add(&start_rdev->pending, &pending_raid_disks);
6301+
6302+ sb = start_rdev->sb;
6303+
6304+ err = detect_old_array(sb);
6305+ if (err) {
6306+ printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
6307+ goto abort;
6308+ }
6309+
6310+ for (i = 0; i < MD_SB_DISKS; i++) {
6311+ mdp_disk_t *desc;
6312+ kdev_t dev;
6313+
6314+ desc = sb->disks + i;
6315+ dev = MKDEV(desc->major, desc->minor);
6316+
6317+ if (dev == MKDEV(0,0))
6318+ continue;
6319+ if (dev == startdev)
6320+ continue;
6321+ if (md_import_device(dev, 1)) {
6322+ printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev));
6323+ continue;
6324+ }
6325+ rdev = find_rdev_all(dev);
6326+ if (!rdev) {
6327+ MD_BUG();
6328+ goto abort;
6329+ }
6330+ md_list_add(&rdev->pending, &pending_raid_disks);
6331+ }
6332+
6333+ /*
6334+	 * possibly return error codes from autorun_devices() here
6335+ */
6336+ autorun_devices();
6337+ return 0;
6338+
6339+abort:
6340+ if (start_rdev)
6341+ export_rdev(start_rdev);
6342+ return err;
6343+}
6344+
6345+#undef BAD_VERSION
6346+#undef OUT_OF_MEM
6347+#undef NO_DEVICE
6348+#undef AUTOADD_FAILED_USED
6349+#undef AUTOADD_FAILED
6350+#undef AUTORUN_FAILED
6351+#undef AUTOADDING
6352+#undef AUTORUNNING
6353+
6354+struct {
6355+ int set;
6356+ int noautodetect;
6357+
6358+} raid_setup_args md__initdata = { 0, 0 };
6359+
6360+/*
6361+ * Searches all registered partitions for autorun RAID arrays
6362+ * at boot time.
6363+ */
6364+md__initfunc(void autodetect_raid(void))
6365+{
6366+#ifdef CONFIG_AUTODETECT_RAID
6367+ struct gendisk *disk;
6368+ mdk_rdev_t *rdev;
6369+ int i;
6370+
6371+ if (raid_setup_args.noautodetect) {
6372+ printk(KERN_INFO "skipping autodetection of RAID arrays\n");
6373+ return;
6374+ }
6375+ printk(KERN_INFO "autodetecting RAID arrays\n");
6376+
6377+ for (disk = gendisk_head ; disk ; disk = disk->next) {
6378+ for (i = 0; i < disk->max_p*disk->max_nr; i++) {
6379+ kdev_t dev = MKDEV(disk->major,i);
6380+
6381+ if (disk->part[i].type == LINUX_OLD_RAID_PARTITION) {
6382+ printk(KERN_ALERT
6383+"md: %s's partition type has to be changed from type 0x86 to type 0xfd\n"
6384+" to maintain interoperability with other OSs! Autodetection support for\n"
6385+" type 0x86 will be deleted after some migration timeout. Sorry.\n",
6386+ partition_name(dev));
6387+ disk->part[i].type = LINUX_RAID_PARTITION;
6388+ }
6389+ if (disk->part[i].type != LINUX_RAID_PARTITION)
6390+ continue;
6391+
6392+ if (md_import_device(dev,1)) {
6393+ printk(KERN_ALERT "could not import %s!\n",
6394+ partition_name(dev));
6395+ continue;
6396+ }
6397+ /*
6398+ * Sanity checks:
6399+ */
6400+ rdev = find_rdev_all(dev);
6401+ if (!rdev) {
6402+ MD_BUG();
6403+ continue;
6404+ }
6405+ if (rdev->faulty) {
6406+ MD_BUG();
6407+ continue;
6408+ }
6409+ md_list_add(&rdev->pending, &pending_raid_disks);
6410+ }
6411+ }
6412+
6413+ autorun_devices();
6414+#endif
6415+}
6416+
6417+static int get_version (void * arg)
6418+{
6419+ mdu_version_t ver;
6420+
6421+ ver.major = MD_MAJOR_VERSION;
6422+ ver.minor = MD_MINOR_VERSION;
6423+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
6424+
6425+ if (md_copy_to_user(arg, &ver, sizeof(ver)))
6426+ return -EFAULT;
6427+
6428+ return 0;
6429+}
6430+
6431+#define SET_FROM_SB(x) info.x = mddev->sb->x
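/*
 * Illustration (not part of the patch): SET_FROM_SB(level) expands to
 *   info.level = mddev->sb->level;
 * so the calls below are just a field-by-field copy from the on-disk
 * superblock into the user-visible mdu_array_info_t.
 */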
6432+static int get_array_info (mddev_t * mddev, void * arg)
6433+{
6434+ mdu_array_info_t info;
6435+
6436+ if (!mddev->sb)
6437+ return -EINVAL;
6438+
6439+ SET_FROM_SB(major_version);
6440+ SET_FROM_SB(minor_version);
6441+ SET_FROM_SB(patch_version);
6442+ SET_FROM_SB(ctime);
6443+ SET_FROM_SB(level);
6444+ SET_FROM_SB(size);
6445+ SET_FROM_SB(nr_disks);
6446+ SET_FROM_SB(raid_disks);
6447+ SET_FROM_SB(md_minor);
6448+ SET_FROM_SB(not_persistent);
6449+
6450+ SET_FROM_SB(utime);
6451+ SET_FROM_SB(state);
6452+ SET_FROM_SB(active_disks);
6453+ SET_FROM_SB(working_disks);
6454+ SET_FROM_SB(failed_disks);
6455+ SET_FROM_SB(spare_disks);
6456+
6457+ SET_FROM_SB(layout);
6458+ SET_FROM_SB(chunk_size);
6459+
6460+ if (md_copy_to_user(arg, &info, sizeof(info)))
6461+ return -EFAULT;
6462+
6463+ return 0;
6464+}
6465+#undef SET_FROM_SB
6466+
6467+#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
6468+static int get_disk_info (mddev_t * mddev, void * arg)
6469+{
6470+ mdu_disk_info_t info;
6471+ unsigned int nr;
6472+
6473+ if (!mddev->sb)
6474+ return -EINVAL;
6475+
6476+ if (md_copy_from_user(&info, arg, sizeof(info)))
6477+ return -EFAULT;
6478+
6479+ nr = info.number;
6480+ if (nr >= mddev->sb->nr_disks)
6481+ return -EINVAL;
6482+
6483+ SET_FROM_SB(major);
6484+ SET_FROM_SB(minor);
6485+ SET_FROM_SB(raid_disk);
6486+ SET_FROM_SB(state);
6487+
6488+ if (md_copy_to_user(arg, &info, sizeof(info)))
6489+ return -EFAULT;
6490+
6491+ return 0;
6492+}
6493+#undef SET_FROM_SB
6494+
6495+#define SET_SB(x) mddev->sb->disks[nr].x = info.x
6496+
6497+static int add_new_disk (mddev_t * mddev, void * arg)
6498+{
6499+ int err, size, persistent;
6500+ mdu_disk_info_t info;
6501+ mdk_rdev_t *rdev;
6502+ unsigned int nr;
6503+ kdev_t dev;
6504+
6505+ if (!mddev->sb)
6506+ return -EINVAL;
6507+
6508+ if (md_copy_from_user(&info, arg, sizeof(info)))
6509+ return -EFAULT;
6510+
6511+ nr = info.number;
6512+ if (nr >= mddev->sb->nr_disks)
6513+ return -EINVAL;
6514+
6515+ dev = MKDEV(info.major,info.minor);
6516+
6517+ if (find_rdev_all(dev)) {
6518+ printk("device %s already used in a RAID array!\n",
6519+ partition_name(dev));
6520+ return -EBUSY;
6521+ }
6522+
6523+ SET_SB(number);
6524+ SET_SB(major);
6525+ SET_SB(minor);
6526+ SET_SB(raid_disk);
6527+ SET_SB(state);
6528+
6529+ if ((info.state & (1<<MD_DISK_FAULTY))==0) {
6530+ err = md_import_device (dev, 0);
6531+ if (err) {
6532+ printk("md: error, md_import_device() returned %d\n", err);
6533+ return -EINVAL;
6534+ }
6535+ rdev = find_rdev_all(dev);
6536+ if (!rdev) {
6537+ MD_BUG();
6538 return -EINVAL;
6539+ }
6540+
6541+ rdev->old_dev = dev;
6542+ rdev->desc_nr = info.number;
6543+
6544+ bind_rdev_to_array(rdev, mddev);
6545+
6546+ persistent = !mddev->sb->not_persistent;
6547+ if (!persistent)
6548+ printk("nonpersistent superblock ...\n");
6549+ if (!mddev->sb->chunk_size)
6550+ printk("no chunksize?\n");
6551+
6552+ size = calc_dev_size(dev, mddev, persistent);
6553+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
6554+
6555+ if (!mddev->sb->size || (mddev->sb->size > size))
6556+ mddev->sb->size = size;
6557+ }
6558+
6559+ /*
6560+ * sync all other superblocks with the main superblock
6561+ */
6562+ sync_sbs(mddev);
6563+
6564+ return 0;
6565+}
6566+#undef SET_SB
6567+
6568+static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
6569+{
6570+ int err;
6571+ mdk_rdev_t *rdev;
6572+ mdp_disk_t *disk;
6573+
6574+ if (!mddev->pers)
6575+ return -ENODEV;
6576+
6577+ printk("trying to remove %s from md%d ... \n",
6578+ partition_name(dev), mdidx(mddev));
6579+
6580+ if (!mddev->pers->diskop) {
6581+ printk("md%d: personality does not support diskops!\n",
6582+ mdidx(mddev));
6583+ return -EINVAL;
6584+ }
6585+
6586+ rdev = find_rdev(mddev, dev);
6587+ if (!rdev)
6588+ return -ENXIO;
6589+
6590+ if (rdev->desc_nr == -1) {
6591+ MD_BUG();
6592+ return -EINVAL;
6593+ }
6594+ disk = &mddev->sb->disks[rdev->desc_nr];
6595+ if (disk_active(disk))
6596+ goto busy;
6597+ if (disk_removed(disk)) {
6598+ MD_BUG();
6599+ return -EINVAL;
6600+ }
6601+
6602+ err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
6603+ if (err == -EBUSY)
6604+ goto busy;
6605+ if (err) {
6606+ MD_BUG();
6607+ return -EINVAL;
6608+ }
6609+
6610+ remove_descriptor(disk, mddev->sb);
6611+ kick_rdev_from_array(rdev);
6612+ mddev->sb_dirty = 1;
6613+ md_update_sb(mddev);
6614+
6615+ return 0;
6616+busy:
6617+ printk("cannot remove active disk %s from md%d ... \n",
6618+ partition_name(dev), mdidx(mddev));
6619+ return -EBUSY;
6620+}
6621+
6622+static int hot_add_disk (mddev_t * mddev, kdev_t dev)
6623+{
6624+ int i, err, persistent;
6625+ unsigned int size;
6626+ mdk_rdev_t *rdev;
6627+ mdp_disk_t *disk;
6628+
6629+ if (!mddev->pers)
6630+ return -ENODEV;
6631+
6632+ printk("trying to hot-add %s to md%d ... \n",
6633+ partition_name(dev), mdidx(mddev));
6634+
6635+ if (!mddev->pers->diskop) {
6636+ printk("md%d: personality does not support diskops!\n",
6637+ mdidx(mddev));
6638+ return -EINVAL;
6639+ }
6640+
6641+ persistent = !mddev->sb->not_persistent;
6642+ size = calc_dev_size(dev, mddev, persistent);
6643+
6644+ if (size < mddev->sb->size) {
6645+ printk("md%d: disk size %d blocks < array size %d\n",
6646+ mdidx(mddev), size, mddev->sb->size);
6647+ return -ENOSPC;
6648+ }
6649+
6650+ rdev = find_rdev(mddev, dev);
6651+ if (rdev)
6652+ return -EBUSY;
6653+
6654+ err = md_import_device (dev, 0);
6655+ if (err) {
6656+ printk("md: error, md_import_device() returned %d\n", err);
6657+ return -EINVAL;
6658+ }
6659+ rdev = find_rdev_all(dev);
6660+ if (!rdev) {
6661+ MD_BUG();
6662+ return -EINVAL;
6663+ }
6664+ if (rdev->faulty) {
6665+ printk("md: can not hot-add faulty %s disk to md%d!\n",
6666+ partition_name(dev), mdidx(mddev));
6667+ err = -EINVAL;
6668+ goto abort_export;
6669+ }
6670+ bind_rdev_to_array(rdev, mddev);
6671+
6672+ /*
6673+ * The rest should better be atomic, we can have disk failures
6674+ * noticed in interrupt contexts ...
6675+ */
6676+ cli();
6677+ rdev->old_dev = dev;
6678+ rdev->size = size;
6679+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
6680+
6681+ disk = mddev->sb->disks + mddev->sb->raid_disks;
6682+ for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
6683+ disk = mddev->sb->disks + i;
6684+
6685+ if (!disk->major && !disk->minor)
6686+ break;
6687+ if (disk_removed(disk))
6688+ break;
6689+ }
6690+ if (i == MD_SB_DISKS) {
6691+ sti();
6692+ printk("md%d: can not hot-add to full array!\n", mdidx(mddev));
6693+ err = -EBUSY;
6694+ goto abort_unbind_export;
6695+ }
6696+
6697+ if (disk_removed(disk)) {
6698 /*
6699- * hot_add has to bump up nb_dev itself
6700+ * reuse slot
6701 */
6702- if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) {
6703- /*
6704- * FIXME: here we should free up the inode and stuff
6705- */
6706- printk ("FIXME\n");
6707- return -EINVAL;
6708+ if (disk->number != i) {
6709+ sti();
6710+ MD_BUG();
6711+ err = -EINVAL;
6712+ goto abort_unbind_export;
6713 }
6714- } else
6715- md_dev[minor].nb_dev++;
6716+ } else {
6717+ disk->number = i;
6718+ }
6719
6720- printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
6721- return (0);
6722+ disk->raid_disk = disk->number;
6723+ disk->major = MAJOR(dev);
6724+ disk->minor = MINOR(dev);
6725+
6726+ if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
6727+ sti();
6728+ MD_BUG();
6729+ err = -EINVAL;
6730+ goto abort_unbind_export;
6731+ }
6732+
6733+ mark_disk_spare(disk);
6734+ mddev->sb->nr_disks++;
6735+ mddev->sb->spare_disks++;
6736+ mddev->sb->working_disks++;
6737+
6738+ mddev->sb_dirty = 1;
6739+
6740+ sti();
6741+ md_update_sb(mddev);
6742+
6743+ /*
6744+ * Kick recovery, maybe this spare has to be added to the
6745+ * array immediately.
6746+ */
6747+ md_recover_arrays();
6748+
6749+ return 0;
6750+
6751+abort_unbind_export:
6752+ unbind_rdev_from_array(rdev);
6753+
6754+abort_export:
6755+ export_rdev(rdev);
6756+ return err;
6757+}
6758+
6759+#define SET_SB(x) mddev->sb->x = info.x
6760+static int set_array_info (mddev_t * mddev, void * arg)
6761+{
6762+ mdu_array_info_t info;
6763+
6764+ if (mddev->sb) {
6765+ printk("array md%d already has a superblock!\n",
6766+ mdidx(mddev));
6767+ return -EBUSY;
6768+ }
6769+
6770+ if (md_copy_from_user(&info, arg, sizeof(info)))
6771+ return -EFAULT;
6772+
6773+ if (alloc_array_sb(mddev))
6774+ return -ENOMEM;
6775+
6776+ mddev->sb->major_version = MD_MAJOR_VERSION;
6777+ mddev->sb->minor_version = MD_MINOR_VERSION;
6778+ mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
6779+ mddev->sb->ctime = CURRENT_TIME;
6780+
6781+ SET_SB(level);
6782+ SET_SB(size);
6783+ SET_SB(nr_disks);
6784+ SET_SB(raid_disks);
6785+ SET_SB(md_minor);
6786+ SET_SB(not_persistent);
6787+
6788+ SET_SB(state);
6789+ SET_SB(active_disks);
6790+ SET_SB(working_disks);
6791+ SET_SB(failed_disks);
6792+ SET_SB(spare_disks);
6793+
6794+ SET_SB(layout);
6795+ SET_SB(chunk_size);
6796+
6797+ mddev->sb->md_magic = MD_SB_MAGIC;
6798+
6799+ /*
6800+ * Generate a 128 bit UUID
6801+ */
6802+ get_random_bytes(&mddev->sb->set_uuid0, 4);
6803+ get_random_bytes(&mddev->sb->set_uuid1, 4);
6804+ get_random_bytes(&mddev->sb->set_uuid2, 4);
6805+ get_random_bytes(&mddev->sb->set_uuid3, 4);
6806+
6807+ return 0;
6808+}
6809+#undef SET_SB
6810+
6811+static int set_disk_info (mddev_t * mddev, void * arg)
6812+{
6813+ printk("not yet");
6814+ return -EINVAL;
6815+}
6816+
6817+static int clear_array (mddev_t * mddev)
6818+{
6819+ printk("not yet");
6820+ return -EINVAL;
6821+}
6822+
6823+static int write_raid_info (mddev_t * mddev)
6824+{
6825+ printk("not yet");
6826+ return -EINVAL;
6827+}
6828+
6829+static int protect_array (mddev_t * mddev)
6830+{
6831+ printk("not yet");
6832+ return -EINVAL;
6833+}
6834+
6835+static int unprotect_array (mddev_t * mddev)
6836+{
6837+ printk("not yet");
6838+ return -EINVAL;
6839+}
6840+
6841+static int set_disk_faulty (mddev_t *mddev, kdev_t dev)
6842+{
6843+ int ret;
6844+
6845+ fsync_dev(mddev_to_kdev(mddev));
6846+ ret = md_error(mddev_to_kdev(mddev), dev);
6847+ return ret;
6848 }
6849
6850 static int md_ioctl (struct inode *inode, struct file *file,
6851 unsigned int cmd, unsigned long arg)
6852 {
6853- int minor, err;
6854- struct hd_geometry *loc = (struct hd_geometry *) arg;
6855+ unsigned int minor;
6856+ int err = 0;
6857+ struct hd_geometry *loc = (struct hd_geometry *) arg;
6858+ mddev_t *mddev = NULL;
6859+ kdev_t dev;
6860
6861- if (!capable(CAP_SYS_ADMIN))
6862- return -EACCES;
6863+ if (!md_capable_admin())
6864+ return -EACCES;
6865
6866- if (((minor=MINOR(inode->i_rdev)) & 0x80) &&
6867- (minor & 0x7f) < MAX_PERSONALITY &&
6868- pers[minor & 0x7f] &&
6869- pers[minor & 0x7f]->ioctl)
6870- return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg));
6871-
6872- if (minor >= MAX_MD_DEV)
6873- return -EINVAL;
6874+ dev = inode->i_rdev;
6875+ minor = MINOR(dev);
6876+ if (minor >= MAX_MD_DEVS)
6877+ return -EINVAL;
6878
6879- switch (cmd)
6880- {
6881- case REGISTER_DEV:
6882- return do_md_add (minor, to_kdev_t ((dev_t) arg));
6883+ /*
6884+ * Commands dealing with the RAID driver but not any
6885+ * particular array:
6886+ */
6887+ switch (cmd)
6888+ {
6889+ case RAID_VERSION:
6890+ err = get_version((void *)arg);
6891+ goto done;
6892+
6893+ case PRINT_RAID_DEBUG:
6894+ err = 0;
6895+ md_print_devices();
6896+ goto done_unlock;
6897+
6898+ case BLKGETSIZE: /* Return device size */
6899+ if (!arg) {
6900+ err = -EINVAL;
6901+ goto abort;
6902+ }
6903+ err = md_put_user(md_hd_struct[minor].nr_sects,
6904+ (long *) arg);
6905+ goto done;
6906
6907- case START_MD:
6908- return do_md_run (minor, (int) arg);
6909+ case BLKFLSBUF:
6910+ fsync_dev(dev);
6911+ invalidate_buffers(dev);
6912+ goto done;
6913
6914- case STOP_MD:
6915- return do_md_stop (minor, inode);
6916-
6917- case BLKGETSIZE: /* Return device size */
6918- if (!arg) return -EINVAL;
6919- err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg);
6920- if (err)
6921- return err;
6922- break;
6923-
6924- case BLKFLSBUF:
6925- fsync_dev (inode->i_rdev);
6926- invalidate_buffers (inode->i_rdev);
6927- break;
6928-
6929- case BLKRASET:
6930- if (arg > 0xff)
6931- return -EINVAL;
6932- read_ahead[MAJOR(inode->i_rdev)] = arg;
6933- return 0;
6934-
6935- case BLKRAGET:
6936- if (!arg) return -EINVAL;
6937- err = put_user (read_ahead[MAJOR(inode->i_rdev)], (long *) arg);
6938- if (err)
6939- return err;
6940- break;
6941-
6942- /* We have a problem here : there is no easy way to give a CHS
6943- virtual geometry. We currently pretend that we have a 2 heads
6944- 4 sectors (with a BIG number of cylinders...). This drives dosfs
6945- just mad... ;-) */
6946-
6947- case HDIO_GETGEO:
6948- if (!loc) return -EINVAL;
6949- err = put_user (2, (char *) &loc->heads);
6950- if (err)
6951- return err;
6952- err = put_user (4, (char *) &loc->sectors);
6953- if (err)
6954- return err;
6955- err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders);
6956- if (err)
6957- return err;
6958- err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect,
6959- (long *) &loc->start);
6960- if (err)
6961- return err;
6962- break;
6963-
6964- RO_IOCTLS(inode->i_rdev,arg);
6965+ case BLKRASET:
6966+ if (arg > 0xff) {
6967+ err = -EINVAL;
6968+ goto abort;
6969+ }
6970+ read_ahead[MAJOR(dev)] = arg;
6971+ goto done;
6972
6973- default:
6974- return -EINVAL;
6975- }
6976+ case BLKRAGET:
6977+ if (!arg) {
6978+ err = -EINVAL;
6979+ goto abort;
6980+ }
6981+ err = md_put_user (read_ahead[
6982+ MAJOR(dev)], (long *) arg);
6983+ goto done;
6984+ default:
6985+ }
6986+
6987+ /*
6988+ * Commands creating/starting a new array:
6989+ */
6990+
6991+ mddev = kdev_to_mddev(dev);
6992+
6993+ switch (cmd)
6994+ {
6995+ case SET_ARRAY_INFO:
6996+ case START_ARRAY:
6997+ if (mddev) {
6998+ printk("array md%d already exists!\n",
6999+ mdidx(mddev));
7000+ err = -EEXIST;
7001+ goto abort;
7002+ }
7003+ default:
7004+ }
7005+
7006+ switch (cmd)
7007+ {
7008+ case SET_ARRAY_INFO:
7009+ mddev = alloc_mddev(dev);
7010+ if (!mddev) {
7011+ err = -ENOMEM;
7012+ goto abort;
7013+ }
7014+ /*
7015+ * alloc_mddev() should possibly self-lock.
7016+ */
7017+ err = lock_mddev(mddev);
7018+ if (err) {
7019+ printk("ioctl, reason %d, cmd %d\n", err, cmd);
7020+ goto abort;
7021+ }
7022+ err = set_array_info(mddev, (void *)arg);
7023+ if (err) {
7024+ printk("couldnt set array info. %d\n", err);
7025+ goto abort;
7026+ }
7027+ goto done_unlock;
7028+
7029+ case START_ARRAY:
7030+ /*
7031+ * possibly make it lock the array ...
7032+ */
7033+ err = autostart_array((kdev_t)arg);
7034+ if (err) {
7035+ printk("autostart %s failed!\n",
7036+ partition_name((kdev_t)arg));
7037+ goto abort;
7038+ }
7039+ goto done;
7040+
7041+ default:
7042+ }
7043+
7044+ /*
7045+ * Commands querying/configuring an existing array:
7046+ */
7047+
7048+ if (!mddev) {
7049+ err = -ENODEV;
7050+ goto abort;
7051+ }
7052+ err = lock_mddev(mddev);
7053+ if (err) {
7054+ printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
7055+ goto abort;
7056+ }
7057+
7058+ /*
7059+ * Commands even a read-only array can execute:
7060+ */
7061+ switch (cmd)
7062+ {
7063+ case GET_ARRAY_INFO:
7064+ err = get_array_info(mddev, (void *)arg);
7065+ goto done_unlock;
7066+
7067+ case GET_DISK_INFO:
7068+ err = get_disk_info(mddev, (void *)arg);
7069+ goto done_unlock;
7070+
7071+ case RESTART_ARRAY_RW:
7072+ err = restart_array(mddev);
7073+ goto done_unlock;
7074+
7075+ case STOP_ARRAY:
7076+ err = do_md_stop (mddev, 0);
7077+ goto done_unlock;
7078+
7079+ case STOP_ARRAY_RO:
7080+ err = do_md_stop (mddev, 1);
7081+ goto done_unlock;
7082+
7083+ /*
7084+	 * We have a problem here: there is no easy way to give a CHS
7085+	 * virtual geometry. We currently pretend that we have 2 heads and
7086+	 * 4 sectors (with a BIG number of cylinders...). This drives
7087+ * dosfs just mad... ;-)
7088+ */
7089+ case HDIO_GETGEO:
7090+ if (!loc) {
7091+ err = -EINVAL;
7092+ goto abort_unlock;
7093+ }
7094+ err = md_put_user (2, (char *) &loc->heads);
7095+ if (err)
7096+ goto abort_unlock;
7097+ err = md_put_user (4, (char *) &loc->sectors);
7098+ if (err)
7099+ goto abort_unlock;
7100+ err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
7101+ (short *) &loc->cylinders);
7102+ if (err)
7103+ goto abort_unlock;
7104+ err = md_put_user (md_hd_struct[minor].start_sect,
7105+ (long *) &loc->start);
7106+ goto done_unlock;
7107+ }
7108
7109- return (0);
7110+ /*
7111+ * The remaining ioctls are changing the state of the
7112+ * superblock, so we do not allow read-only arrays
7113+ * here:
7114+ */
7115+ if (mddev->ro) {
7116+ err = -EROFS;
7117+ goto abort_unlock;
7118+ }
7119+
7120+ switch (cmd)
7121+ {
7122+ case CLEAR_ARRAY:
7123+ err = clear_array(mddev);
7124+ goto done_unlock;
7125+
7126+ case ADD_NEW_DISK:
7127+ err = add_new_disk(mddev, (void *)arg);
7128+ goto done_unlock;
7129+
7130+ case HOT_REMOVE_DISK:
7131+ err = hot_remove_disk(mddev, (kdev_t)arg);
7132+ goto done_unlock;
7133+
7134+ case HOT_ADD_DISK:
7135+ err = hot_add_disk(mddev, (kdev_t)arg);
7136+ goto done_unlock;
7137+
7138+ case SET_DISK_INFO:
7139+ err = set_disk_info(mddev, (void *)arg);
7140+ goto done_unlock;
7141+
7142+ case WRITE_RAID_INFO:
7143+ err = write_raid_info(mddev);
7144+ goto done_unlock;
7145+
7146+ case UNPROTECT_ARRAY:
7147+ err = unprotect_array(mddev);
7148+ goto done_unlock;
7149+
7150+ case PROTECT_ARRAY:
7151+ err = protect_array(mddev);
7152+ goto done_unlock;
7153+
7154+ case SET_DISK_FAULTY:
7155+ err = set_disk_faulty(mddev, (kdev_t)arg);
7156+ goto done_unlock;
7157+
7158+ case RUN_ARRAY:
7159+ {
7160+ mdu_param_t param;
7161+
7162+ err = md_copy_from_user(&param, (mdu_param_t *)arg,
7163+ sizeof(param));
7164+ if (err)
7165+ goto abort_unlock;
7166+
7167+ err = do_md_run (mddev);
7168+ /*
7169+ * we have to clean up the mess if
7170+ * the array cannot be run for some
7171+ * reason ...
7172+ */
7173+ if (err) {
7174+ mddev->sb_dirty = 0;
7175+ do_md_stop (mddev, 0);
7176+ }
7177+ goto done_unlock;
7178+ }
7179+
7180+ default:
7181+		printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ioctls.\n", current->comm, current->pid);
7182+ err = -EINVAL;
7183+ goto abort_unlock;
7184+ }
7185+
7186+done_unlock:
7187+abort_unlock:
7188+ if (mddev)
7189+ unlock_mddev(mddev);
7190+ else
7191+ printk("huh11?\n");
7192+
7193+ return err;
7194+done:
7195+ if (err)
7196+ printk("huh12?\n");
7197+abort:
7198+ return err;
7199 }
7200
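As an aside, a minimal user-space sketch of the new ioctl interface above (the
mdu_* types and ioctl numbers are assumed to come from the md_u.h header added
by this patch; the device path is hypothetical):

	#include <stdio.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/raid/md_u.h>

	int main(void)
	{
		mdu_version_t ver;
		mdu_array_info_t info;
		int fd = open("/dev/md0", O_RDONLY);	/* hypothetical array */

		if (fd < 0)
			return 1;
		if (ioctl(fd, RAID_VERSION, &ver) == 0)
			printf("md driver %d.%d.%d\n",
			       ver.major, ver.minor, ver.patchlevel);
		if (ioctl(fd, GET_ARRAY_INFO, &info) == 0)
			printf("level %d, %d raid disks, %d spare(s)\n",
			       info.level, info.raid_disks, info.spare_disks);
		close(fd);
		return 0;
	}

This matches the "obsolete MD ioctl" warning above: management now goes through
array-level ioctls on the md device node instead of the removed
REGISTER_DEV/START_MD/STOP_MD calls.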
7201+
7202+#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
7203+
7204 static int md_open (struct inode *inode, struct file *file)
7205 {
7206- int minor=MINOR(inode->i_rdev);
7207+ /*
7208+ * Always succeed
7209+ */
7210+ return (0);
7211+}
7212+
7213+static void md_release (struct inode *inode, struct file *file)
7214+{
7215+ sync_dev(inode->i_rdev);
7216+}
7217+
7218+
7219+static int md_read (struct inode *inode, struct file *file,
7220+ char *buf, int count)
7221+{
7222+ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7223
7224- md_dev[minor].busy++;
7225- return (0); /* Always succeed */
7226+ if (!mddev || !mddev->pers)
7227+ return -ENXIO;
7228+
7229+ return block_read (inode, file, buf, count);
7230 }
7231
7232+static int md_write (struct inode *inode, struct file *file,
7233+ const char *buf, int count)
7234+{
7235+ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7236+
7237+ if (!mddev || !mddev->pers)
7238+ return -ENXIO;
7239
7240-static int md_release (struct inode *inode, struct file *file)
7241+ return block_write (inode, file, buf, count);
7242+}
7243+
7244+static struct file_operations md_fops=
7245 {
7246- int minor=MINOR(inode->i_rdev);
7247+ NULL,
7248+ md_read,
7249+ md_write,
7250+ NULL,
7251+ NULL,
7252+ md_ioctl,
7253+ NULL,
7254+ md_open,
7255+ md_release,
7256+ block_fsync
7257+};
7258+
7259+#else
7260
7261- sync_dev (inode->i_rdev);
7262- md_dev[minor].busy--;
7263- return 0;
7264+static int md_open (struct inode *inode, struct file *file)
7265+{
7266+ /*
7267+ * Always succeed
7268+ */
7269+ return (0);
7270 }
7271
7272+static int md_release (struct inode *inode, struct file *file)
7273+{
7274+ sync_dev(inode->i_rdev);
7275+ return 0;
7276+}
7277
7278 static ssize_t md_read (struct file *file, char *buf, size_t count,
7279 loff_t *ppos)
7280 {
7281- int minor=MINOR(file->f_dentry->d_inode->i_rdev);
7282+ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7283
7284- if (!md_dev[minor].pers) /* Check if device is being run */
7285- return -ENXIO;
7286+ if (!mddev || !mddev->pers)
7287+ return -ENXIO;
7288
7289- return block_read(file, buf, count, ppos);
7290+ return block_read(file, buf, count, ppos);
7291 }
7292
7293 static ssize_t md_write (struct file *file, const char *buf,
7294 size_t count, loff_t *ppos)
7295 {
7296- int minor=MINOR(file->f_dentry->d_inode->i_rdev);
7297+ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7298
7299- if (!md_dev[minor].pers) /* Check if device is being run */
7300- return -ENXIO;
7301+ if (!mddev || !mddev->pers)
7302+ return -ENXIO;
7303
7304- return block_write(file, buf, count, ppos);
7305+ return block_write(file, buf, count, ppos);
7306 }
7307
7308 static struct file_operations md_fops=
7309 {
7310- NULL,
7311- md_read,
7312- md_write,
7313- NULL,
7314- NULL,
7315- md_ioctl,
7316- NULL,
7317- md_open,
7318- NULL,
7319- md_release,
7320- block_fsync
7321+ NULL,
7322+ md_read,
7323+ md_write,
7324+ NULL,
7325+ NULL,
7326+ md_ioctl,
7327+ NULL,
7328+ md_open,
7329+ NULL,
7330+ md_release,
7331+ block_fsync
7332 };
7333
7334-int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size)
7335+#endif
7336+
7337+int md_map (kdev_t dev, kdev_t *rdev,
7338+ unsigned long *rsector, unsigned long size)
7339 {
7340- if ((unsigned int) minor >= MAX_MD_DEV)
7341- {
7342- printk ("Bad md device %d\n", minor);
7343- return (-1);
7344- }
7345-
7346- if (!md_dev[minor].pers)
7347- {
7348- printk ("Oops ! md%d not running, giving up !\n", minor);
7349- return (-1);
7350- }
7351+ int err;
7352+ mddev_t *mddev = kdev_to_mddev(dev);
7353
7354- return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size));
7355+ if (!mddev || !mddev->pers) {
7356+ err = -ENXIO;
7357+ goto out;
7358+ }
7359+
7360+ err = mddev->pers->map(mddev, dev, rdev, rsector, size);
7361+out:
7362+ return err;
7363 }
7364
7365-int md_make_request (int minor, int rw, struct buffer_head * bh)
7366+int md_make_request (struct buffer_head * bh, int rw)
7367 {
7368- if (md_dev [minor].pers->make_request) {
7369- if (buffer_locked(bh))
7370- return 0;
7371+ int err;
7372+ mddev_t *mddev = kdev_to_mddev(bh->b_dev);
7373+
7374+ if (!mddev || !mddev->pers) {
7375+ err = -ENXIO;
7376+ goto out;
7377+ }
7378+
7379+ if (mddev->pers->make_request) {
7380+ if (buffer_locked(bh)) {
7381+ err = 0;
7382+ goto out;
7383+ }
7384 set_bit(BH_Lock, &bh->b_state);
7385 if (rw == WRITE || rw == WRITEA) {
7386 if (!buffer_dirty(bh)) {
7387- bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
7388- return 0;
7389+ bh->b_end_io(bh, buffer_uptodate(bh));
7390+ err = 0;
7391+ goto out;
7392 }
7393 }
7394 if (rw == READ || rw == READA) {
7395 if (buffer_uptodate(bh)) {
7396- bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
7397- return 0;
7398+ bh->b_end_io(bh, buffer_uptodate(bh));
7399+ err = 0;
7400+ goto out;
7401 }
7402 }
7403- return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh));
7404+ err = mddev->pers->make_request(mddev, rw, bh);
7405 } else {
7406 make_request (MAJOR(bh->b_rdev), rw, bh);
7407- return 0;
7408+ err = 0;
7409 }
7410+out:
7411+ return err;
7412 }
7413
7414 static void do_md_request (void)
7415 {
7416- printk ("Got md request, not good...");
7417- return;
7418+ printk(KERN_ALERT "Got md request, not good...");
7419+ return;
7420+}
7421+
7422+int md_thread(void * arg)
7423+{
7424+ mdk_thread_t *thread = arg;
7425+
7426+ md_lock_kernel();
7427+ exit_mm(current);
7428+ exit_files(current);
7429+ exit_fs(current);
7430+
7431+ /*
7432+ * Detach thread
7433+ */
7434+ sys_setsid();
7435+ sprintf(current->comm, thread->name);
7436+ md_init_signals();
7437+ md_flush_signals();
7438+ thread->tsk = current;
7439+
7440+ /*
7441+	 * md_thread is a 'system-thread', its priority should be very
7442+ * high. We avoid resource deadlocks individually in each
7443+ * raid personality. (RAID5 does preallocation) We also use RR and
7444+ * the very same RT priority as kswapd, thus we will never get
7445+ * into a priority inversion deadlock.
7446+ *
7447+ * we definitely have to have equal or higher priority than
7448+ * bdflush, otherwise bdflush will deadlock if there are too
7449+ * many dirty RAID5 blocks.
7450+ */
7451+ current->policy = SCHED_OTHER;
7452+ current->priority = 40;
7453+
7454+ up(thread->sem);
7455+
7456+ for (;;) {
7457+ cli();
7458+ if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
7459+ if (!thread->run)
7460+ break;
7461+ interruptible_sleep_on(&thread->wqueue);
7462+ }
7463+ sti();
7464+ clear_bit(THREAD_WAKEUP, &thread->flags);
7465+ if (thread->run) {
7466+ thread->run(thread->data);
7467+ run_task_queue(&tq_disk);
7468+ }
7469+ if (md_signal_pending(current)) {
7470+ printk("%8s(%d) flushing signals.\n", current->comm,
7471+ current->pid);
7472+ md_flush_signals();
7473+ }
7474+ }
7475+ sti();
7476+ up(thread->sem);
7477+ return 0;
7478 }
7479
7480-void md_wakeup_thread(struct md_thread *thread)
7481+void md_wakeup_thread(mdk_thread_t *thread)
7482 {
7483 set_bit(THREAD_WAKEUP, &thread->flags);
7484 wake_up(&thread->wqueue);
7485 }
7486
7487-struct md_thread *md_register_thread (void (*run) (void *), void *data)
7488+mdk_thread_t *md_register_thread (void (*run) (void *),
7489+ void *data, const char *name)
7490 {
7491- struct md_thread *thread = (struct md_thread *)
7492- kmalloc(sizeof(struct md_thread), GFP_KERNEL);
7493+ mdk_thread_t *thread;
7494 int ret;
7495 struct semaphore sem = MUTEX_LOCKED;
7496
7497- if (!thread) return NULL;
7498+ thread = (mdk_thread_t *) kmalloc
7499+ (sizeof(mdk_thread_t), GFP_KERNEL);
7500+ if (!thread)
7501+ return NULL;
7502
7503- memset(thread, 0, sizeof(struct md_thread));
7504+ memset(thread, 0, sizeof(mdk_thread_t));
7505 init_waitqueue(&thread->wqueue);
7506
7507 thread->sem = &sem;
7508 thread->run = run;
7509 thread->data = data;
7510+ thread->name = name;
7511 ret = kernel_thread(md_thread, thread, 0);
7512 if (ret < 0) {
7513 kfree(thread);
7514@@ -836,270 +3032,407 @@
7515 return thread;
7516 }
7517
7518-void md_unregister_thread (struct md_thread *thread)
7519+void md_interrupt_thread (mdk_thread_t *thread)
7520+{
7521+ if (!thread->tsk) {
7522+ MD_BUG();
7523+ return;
7524+ }
7525+ printk("interrupting MD-thread pid %d\n", thread->tsk->pid);
7526+ send_sig(SIGKILL, thread->tsk, 1);
7527+}
7528+
7529+void md_unregister_thread (mdk_thread_t *thread)
7530 {
7531 struct semaphore sem = MUTEX_LOCKED;
7532
7533 thread->sem = &sem;
7534 thread->run = NULL;
7535- if (thread->tsk)
7536- printk("Killing md_thread %d %p %s\n",
7537- thread->tsk->pid, thread->tsk, thread->tsk->comm);
7538- else
7539- printk("Aiee. md_thread has 0 tsk\n");
7540- send_sig(SIGKILL, thread->tsk, 1);
7541- printk("downing on %p\n", &sem);
7542+ thread->name = NULL;
7543+ if (!thread->tsk) {
7544+ MD_BUG();
7545+ return;
7546+ }
7547+ md_interrupt_thread(thread);
7548 down(&sem);
7549 }
7550
7551-#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))
7552-
7553-int md_thread(void * arg)
7554+void md_recover_arrays (void)
7555 {
7556- struct md_thread *thread = arg;
7557-
7558- lock_kernel();
7559- exit_mm(current);
7560- exit_files(current);
7561- exit_fs(current);
7562-
7563- current->session = 1;
7564- current->pgrp = 1;
7565- sprintf(current->comm, "md_thread");
7566- siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
7567- thread->tsk = current;
7568- up(thread->sem);
7569-
7570- for (;;) {
7571- cli();
7572- if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
7573- do {
7574- spin_lock(&current->sigmask_lock);
7575- flush_signals(current);
7576- spin_unlock(&current->sigmask_lock);
7577- interruptible_sleep_on(&thread->wqueue);
7578- cli();
7579- if (test_bit(THREAD_WAKEUP, &thread->flags))
7580- break;
7581- if (!thread->run) {
7582- sti();
7583- up(thread->sem);
7584- return 0;
7585- }
7586- } while (signal_pending(current));
7587- }
7588- sti();
7589- clear_bit(THREAD_WAKEUP, &thread->flags);
7590- if (thread->run) {
7591- thread->run(thread->data);
7592- run_task_queue(&tq_disk);
7593- }
7594+ if (!md_recovery_thread) {
7595+ MD_BUG();
7596+ return;
7597 }
7598+ md_wakeup_thread(md_recovery_thread);
7599 }
7600
7601-EXPORT_SYMBOL(md_size);
7602-EXPORT_SYMBOL(md_maxreadahead);
7603-EXPORT_SYMBOL(register_md_personality);
7604-EXPORT_SYMBOL(unregister_md_personality);
7605-EXPORT_SYMBOL(partition_name);
7606-EXPORT_SYMBOL(md_dev);
7607-EXPORT_SYMBOL(md_error);
7608-EXPORT_SYMBOL(md_register_thread);
7609-EXPORT_SYMBOL(md_unregister_thread);
7610-EXPORT_SYMBOL(md_update_sb);
7611-EXPORT_SYMBOL(md_map);
7612-EXPORT_SYMBOL(md_wakeup_thread);
7613-EXPORT_SYMBOL(md_do_sync);
7614
7615-#ifdef CONFIG_PROC_FS
7616-static struct proc_dir_entry proc_md = {
7617- PROC_MD, 6, "mdstat",
7618- S_IFREG | S_IRUGO, 1, 0, 0,
7619- 0, &proc_array_inode_operations,
7620-};
7621+int md_error (kdev_t dev, kdev_t rdev)
7622+{
7623+ mddev_t *mddev = kdev_to_mddev(dev);
7624+ mdk_rdev_t * rrdev;
7625+ int rc;
7626+
7627+ if (!mddev) {
7628+ MD_BUG();
7629+ return 0;
7630+ }
7631+ rrdev = find_rdev(mddev, rdev);
7632+ mark_rdev_faulty(rrdev);
7633+ /*
7634+ * if recovery was running, stop it now.
7635+ */
7636+ if (mddev->pers->stop_resync)
7637+ mddev->pers->stop_resync(mddev);
7638+ if (mddev->recovery_running)
7639+ md_interrupt_thread(md_recovery_thread);
7640+ if (mddev->pers->error_handler) {
7641+ rc = mddev->pers->error_handler(mddev, rdev);
7642+ md_recover_arrays();
7643+ return rc;
7644+ }
7645+#if 0
7646+ /*
7647+ * Drop all buffers in the failed array.
7648+ * _not_. This is called from IRQ handlers ...
7649+ */
7650+ invalidate_buffers(rdev);
7651 #endif
7652+ return 0;
7653+}
7654
7655-static void md_geninit (struct gendisk *gdisk)
7656+static int status_unused (char * page)
7657 {
7658- int i;
7659-
7660- for(i=0;i<MAX_MD_DEV;i++)
7661- {
7662- md_blocksizes[i] = 1024;
7663- md_maxreadahead[i] = MD_DEFAULT_DISK_READAHEAD;
7664- md_gendisk.part[i].start_sect=-1; /* avoid partition check */
7665- md_gendisk.part[i].nr_sects=0;
7666- md_dev[i].pers=NULL;
7667- }
7668+ int sz = 0, i = 0;
7669+ mdk_rdev_t *rdev;
7670+ struct md_list_head *tmp;
7671
7672- blksize_size[MD_MAJOR] = md_blocksizes;
7673- max_readahead[MD_MAJOR] = md_maxreadahead;
7674+ sz += sprintf(page + sz, "unused devices: ");
7675
7676-#ifdef CONFIG_PROC_FS
7677- proc_register(&proc_root, &proc_md);
7678-#endif
7679+ ITERATE_RDEV_ALL(rdev,tmp) {
7680+ if (!rdev->same_set.next && !rdev->same_set.prev) {
7681+ /*
7682+ * The device is not yet used by any array.
7683+ */
7684+ i++;
7685+ sz += sprintf(page + sz, "%s ",
7686+ partition_name(rdev->dev));
7687+ }
7688+ }
7689+ if (!i)
7690+ sz += sprintf(page + sz, "<none>");
7691+
7692+ sz += sprintf(page + sz, "\n");
7693+ return sz;
7694 }
7695
7696-int md_error (kdev_t mddev, kdev_t rdev)
7697+
7698+static int status_resync (char * page, mddev_t * mddev)
7699 {
7700- unsigned int minor = MINOR (mddev);
7701- int rc;
7702+ int sz = 0;
7703+ unsigned int blocksize, max_blocks, resync, res, dt, tt, et;
7704
7705- if (MAJOR(mddev) != MD_MAJOR || minor > MAX_MD_DEV)
7706- panic ("md_error gets unknown device\n");
7707- if (!md_dev [minor].pers)
7708- panic ("md_error gets an error for an unknown device\n");
7709- if (md_dev [minor].pers->error_handler) {
7710- rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev);
7711-#if SUPPORT_RECONSTRUCTION
7712- md_wakeup_thread(md_sync_thread);
7713-#endif /* SUPPORT_RECONSTRUCTION */
7714- return rc;
7715- }
7716- return 0;
7717+ resync = mddev->curr_resync;
7718+ blocksize = blksize_size[MD_MAJOR][mdidx(mddev)];
7719+ max_blocks = blk_size[MD_MAJOR][mdidx(mddev)] / (blocksize >> 10);
7720+
7721+ /*
7722+ * Should not happen.
7723+ */
7724+ if (!max_blocks) {
7725+ MD_BUG();
7726+ return 0;
7727+ }
7728+ res = resync*100/max_blocks;
7729+ if (!mddev->recovery_running)
7730+ /*
7731+ * true resync
7732+ */
7733+ sz += sprintf(page + sz, " resync=%u%%", res);
7734+ else
7735+ /*
7736+ * recovery ...
7737+ */
7738+ sz += sprintf(page + sz, " recovery=%u%%", res);
7739+
7740+ /*
7741+ * We do not want to overflow, so the order of operands and
7742+ * the * 100 / 100 trick are important. We do a +1 to be
7743+ * safe against division by zero. We only estimate anyway.
7744+ *
7745+ * dt: time until now
7746+ * tt: total time
7747+ * et: estimated finish time
7748+ */
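	/*
	 * Worked example (illustrative numbers): resync=20000 of
	 * max_blocks=100000 after dt=120s gives res=20%,
	 * tt = (120 * (100000/(20000/100 + 1)))/100 = 596s and
	 * et = 596 - 120 = 476s, printed below as "finish=7.9min".
	 */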
7749+ dt = ((jiffies - mddev->resync_start) / HZ);
7750+ tt = (dt * (max_blocks / (resync/100+1)))/100;
7751+ if (tt > dt)
7752+ et = tt - dt;
7753+ else
7754+ /*
7755+ * ignore rounding effects near finish time
7756+ */
7757+ et = 0;
7758+
7759+ sz += sprintf(page + sz, " finish=%u.%umin", et / 60, (et % 60)/6);
7760+
7761+ return sz;
7762 }
7763
7764 int get_md_status (char *page)
7765 {
7766- int sz=0, i, j, size;
7767-
7768- sz+=sprintf( page+sz, "Personalities : ");
7769- for (i=0; i<MAX_PERSONALITY; i++)
7770- if (pers[i])
7771- sz+=sprintf (page+sz, "[%d %s] ", i, pers[i]->name);
7772-
7773- page[sz-1]='\n';
7774-
7775- sz+=sprintf (page+sz, "read_ahead ");
7776- if (read_ahead[MD_MAJOR]==INT_MAX)
7777- sz+=sprintf (page+sz, "not set\n");
7778- else
7779- sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
7780+ int sz = 0, j, size;
7781+ struct md_list_head *tmp, *tmp2;
7782+ mdk_rdev_t *rdev;
7783+ mddev_t *mddev;
7784+
7785+ sz += sprintf(page + sz, "Personalities : ");
7786+ for (j = 0; j < MAX_PERSONALITY; j++)
7787+ if (pers[j])
7788+ sz += sprintf(page+sz, "[%s] ", pers[j]->name);
7789+
7790+ sz += sprintf(page+sz, "\n");
7791+
7792+
7793+ sz += sprintf(page+sz, "read_ahead ");
7794+ if (read_ahead[MD_MAJOR] == INT_MAX)
7795+ sz += sprintf(page+sz, "not set\n");
7796+ else
7797+ sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
7798
7799- for (i=0; i<MAX_MD_DEV; i++)
7800- {
7801- sz+=sprintf (page+sz, "md%d : %sactive", i, md_dev[i].pers ? "" : "in");
7802-
7803- if (md_dev[i].pers)
7804- sz+=sprintf (page+sz, " %s", md_dev[i].pers->name);
7805+ ITERATE_MDDEV(mddev,tmp) {
7806+ sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
7807+ mddev->pers ? "" : "in");
7808+ if (mddev->pers) {
7809+ if (mddev->ro)
7810+ sz += sprintf(page + sz, " (read-only)");
7811+ sz += sprintf(page + sz, " %s", mddev->pers->name);
7812+ }
7813
7814- size=0;
7815- for (j=0; j<md_dev[i].nb_dev; j++)
7816- {
7817- sz+=sprintf (page+sz, " %s",
7818- partition_name(md_dev[i].devices[j].dev));
7819- size+=md_dev[i].devices[j].size;
7820- }
7821+ size = 0;
7822+ ITERATE_RDEV(mddev,rdev,tmp2) {
7823+ sz += sprintf(page + sz, " %s[%d]",
7824+ partition_name(rdev->dev), rdev->desc_nr);
7825+ if (rdev->faulty) {
7826+ sz += sprintf(page + sz, "(F)");
7827+ continue;
7828+ }
7829+ size += rdev->size;
7830+ }
7831
7832- if (md_dev[i].nb_dev) {
7833- if (md_dev[i].pers)
7834- sz+=sprintf (page+sz, " %d blocks", md_size[i]);
7835- else
7836- sz+=sprintf (page+sz, " %d blocks", size);
7837- }
7838+ if (mddev->nb_dev) {
7839+ if (mddev->pers)
7840+ sz += sprintf(page + sz, " %d blocks",
7841+ md_size[mdidx(mddev)]);
7842+ else
7843+ sz += sprintf(page + sz, " %d blocks", size);
7844+ }
7845
7846- if (!md_dev[i].pers)
7847- {
7848- sz+=sprintf (page+sz, "\n");
7849- continue;
7850- }
7851+ if (!mddev->pers) {
7852+ sz += sprintf(page+sz, "\n");
7853+ continue;
7854+ }
7855
7856- if (md_dev[i].pers->max_invalid_dev)
7857- sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i));
7858+ sz += mddev->pers->status (page+sz, mddev);
7859
7860- sz+=md_dev[i].pers->status (page+sz, i, md_dev+i);
7861- sz+=sprintf (page+sz, "\n");
7862- }
7863+ if (mddev->curr_resync)
7864+ sz += status_resync (page+sz, mddev);
7865+ else {
7866+ if (md_atomic_read(&mddev->resync_sem.count) != 1)
7867+ sz += sprintf(page + sz, " resync=DELAYED");
7868+ }
7869+ sz += sprintf(page + sz, "\n");
7870+ }
7871+ sz += status_unused (page + sz);
7872
7873- return (sz);
7874+ return (sz);
7875 }
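/*
 * For illustration (hypothetical devices and numbers), the code above renders
 * /proc/mdstat roughly as:
 *
 *   Personalities : [raid1]
 *   read_ahead 1024 sectors
 *   md0 : active raid1 sdb1[1] sda1[0] 1024000 blocks ... resync=42% finish=3.5min
 *   unused devices: <none>
 *
 * where the "..." part comes from the personality's own status routine.
 */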
7876
7877-int register_md_personality (int p_num, struct md_personality *p)
7878+int register_md_personality (int pnum, mdk_personality_t *p)
7879 {
7880- int i=(p_num >> PERSONALITY_SHIFT);
7881-
7882- if (i >= MAX_PERSONALITY)
7883- return -EINVAL;
7884+ if (pnum >= MAX_PERSONALITY)
7885+ return -EINVAL;
7886
7887- if (pers[i])
7888- return -EBUSY;
7889+ if (pers[pnum])
7890+ return -EBUSY;
7891
7892- pers[i]=p;
7893- printk ("%s personality registered\n", p->name);
7894- return 0;
7895+ pers[pnum] = p;
7896+ printk(KERN_INFO "%s personality registered\n", p->name);
7897+ return 0;
7898 }
7899
7900-int unregister_md_personality (int p_num)
7901+int unregister_md_personality (int pnum)
7902 {
7903- int i=(p_num >> PERSONALITY_SHIFT);
7904-
7905- if (i >= MAX_PERSONALITY)
7906- return -EINVAL;
7907+ if (pnum >= MAX_PERSONALITY)
7908+ return -EINVAL;
7909
7910- printk ("%s personality unregistered\n", pers[i]->name);
7911- pers[i]=NULL;
7912- return 0;
7913+ printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
7914+ pers[pnum] = NULL;
7915+ return 0;
7916 }
7917
7918-static md_descriptor_t *get_spare(struct md_dev *mddev)
7919+static mdp_disk_t *get_spare(mddev_t *mddev)
7920 {
7921- int i;
7922- md_superblock_t *sb = mddev->sb;
7923- md_descriptor_t *descriptor;
7924- struct real_dev *realdev;
7925-
7926- for (i = 0; i < mddev->nb_dev; i++) {
7927- realdev = &mddev->devices[i];
7928- if (!realdev->sb)
7929+ mdp_super_t *sb = mddev->sb;
7930+ mdp_disk_t *disk;
7931+ mdk_rdev_t *rdev;
7932+ struct md_list_head *tmp;
7933+
7934+ ITERATE_RDEV(mddev,rdev,tmp) {
7935+ if (rdev->faulty)
7936+ continue;
7937+ if (!rdev->sb) {
7938+ MD_BUG();
7939 continue;
7940- descriptor = &sb->disks[realdev->sb->descriptor.number];
7941- if (descriptor->state & (1 << MD_FAULTY_DEVICE))
7942+ }
7943+ disk = &sb->disks[rdev->desc_nr];
7944+ if (disk_faulty(disk)) {
7945+ MD_BUG();
7946 continue;
7947- if (descriptor->state & (1 << MD_ACTIVE_DEVICE))
7948+ }
7949+ if (disk_active(disk))
7950 continue;
7951- return descriptor;
7952+ return disk;
7953 }
7954 return NULL;
7955 }
7956
7957+static int is_mddev_idle (mddev_t *mddev)
7958+{
7959+ mdk_rdev_t * rdev;
7960+ struct md_list_head *tmp;
7961+ int idle;
7962+ unsigned long curr_events;
7963+
7964+ idle = 1;
7965+ ITERATE_RDEV(mddev,rdev,tmp) {
7966+ curr_events = io_events[MAJOR(rdev->dev)];
7967+
7968+ if (curr_events != rdev->last_events) {
7969+// printk("!I(%d)", curr_events-rdev->last_events);
7970+ rdev->last_events = curr_events;
7971+ idle = 0;
7972+ }
7973+ }
7974+ return idle;
7975+}
7976+
7977 /*
7978 * parallel resyncing thread.
7979- *
7980- * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
7981- * - fix read error handing
7982 */
7983
7984-int md_do_sync(struct md_dev *mddev)
7985+/*
7986+ * Determine correct block size for this device.
7987+ */
7988+unsigned int device_bsize (kdev_t dev)
7989+{
7990+ unsigned int i, correct_size;
7991+
7992+ correct_size = BLOCK_SIZE;
7993+ if (blksize_size[MAJOR(dev)]) {
7994+ i = blksize_size[MAJOR(dev)][MINOR(dev)];
7995+ if (i)
7996+ correct_size = i;
7997+ }
7998+
7999+ return correct_size;
8000+}
8001+
8002+static struct wait_queue *resync_wait = (struct wait_queue *)NULL;
8003+
8004+#define RA_ORDER (1)
8005+#define RA_PAGE_SIZE (PAGE_SIZE*(1<<RA_ORDER))
8006+#define MAX_NR_BLOCKS (RA_PAGE_SIZE/sizeof(struct buffer_head *))
8007+
8008+int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
8009 {
8010- struct buffer_head *bh;
8011- int max_blocks, blocksize, curr_bsize, percent=1, j;
8012- kdev_t read_disk = MKDEV(MD_MAJOR, mddev - md_dev);
8013+ mddev_t *mddev2;
8014+ struct buffer_head **bh;
8015+ unsigned int max_blocks, blocksize, curr_bsize,
8016+ i, ii, j, k, chunk, window, nr_blocks, err, serialize;
8017+ kdev_t read_disk = mddev_to_kdev(mddev);
8018 int major = MAJOR(read_disk), minor = MINOR(read_disk);
8019 unsigned long starttime;
8020+ int max_read_errors = 2*MAX_NR_BLOCKS,
8021+ max_write_errors = 2*MAX_NR_BLOCKS;
8022+ struct md_list_head *tmp;
8023+
8024+retry_alloc:
8025+ bh = (struct buffer_head **) md__get_free_pages(GFP_KERNEL, RA_ORDER);
8026+ if (!bh) {
8027+ printk(KERN_ERR
8028+ "could not alloc bh array for reconstruction ... retrying!\n");
8029+ goto retry_alloc;
8030+ }
8031+
8032+ err = down_interruptible(&mddev->resync_sem);
8033+ if (err)
8034+ goto out_nolock;
8035+
8036+recheck:
8037+ serialize = 0;
8038+ ITERATE_MDDEV(mddev2,tmp) {
8039+ if (mddev2 == mddev)
8040+ continue;
8041+ if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
8042+ printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2));
8043+ serialize = 1;
8044+ break;
8045+ }
8046+ }
8047+ if (serialize) {
8048+ interruptible_sleep_on(&resync_wait);
8049+ if (md_signal_pending(current)) {
8050+ md_flush_signals();
8051+ err = -EINTR;
8052+ goto out;
8053+ }
8054+ goto recheck;
8055+ }
8056+
8057+ mddev->curr_resync = 1;
8058
8059- blocksize = blksize_size[major][minor];
8060+ blocksize = device_bsize(read_disk);
8061 max_blocks = blk_size[major][minor] / (blocksize >> 10);
8062
8063- printk("... resync log\n");
8064- printk(" .... mddev->nb_dev: %d\n", mddev->nb_dev);
8065- printk(" .... raid array: %s\n", kdevname(read_disk));
8066- printk(" .... max_blocks: %d blocksize: %d\n", max_blocks, blocksize);
8067- printk("md: syncing RAID array %s\n", kdevname(read_disk));
8068+ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
8069+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec.\n",
8070+ sysctl_speed_limit);
8071+ printk(KERN_INFO "md: using maximum available idle IO bandwith for reconstruction.\n");
8072+
8073+ /*
8074+ * Resync has low priority.
8075+ */
8076+ current->priority = 1;
8077+
8078+ is_mddev_idle(mddev); /* this also initializes IO event counters */
8079+ starttime = jiffies;
8080+ mddev->resync_start = starttime;
8081
8082- mddev->busy++;
8083+ /*
8084+ * Tune reconstruction:
8085+ */
8086+ window = md_maxreadahead[mdidx(mddev)]/1024;
8087+ nr_blocks = window / (blocksize >> 10);
8088+ if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
8089+ nr_blocks = MAX_NR_BLOCKS;
8090+ printk(KERN_INFO "md: using %dk window.\n",window);
8091
8092- starttime=jiffies;
8093- for (j = 0; j < max_blocks; j++) {
8094+ for (j = 0; j < max_blocks; j += nr_blocks) {
8095
8096+ if (j)
8097+ mddev->curr_resync = j;
8098 /*
8099 * B careful. When some1 mounts a non-'blocksize' filesystem
8100 * then we get the blocksize changed right under us. Go deal
8101 * with it transparently, recalculate 'blocksize', 'j' and
8102 * 'max_blocks':
8103 */
8104- curr_bsize = blksize_size[major][minor];
8105+ curr_bsize = device_bsize(read_disk);
8106 if (curr_bsize != blocksize) {
8107- diff_blocksize:
8108+ printk(KERN_INFO "md%d: blocksize changed\n",
8109+ mdidx(mddev));
8110+retry_read:
8111 if (curr_bsize > blocksize)
8112 /*
8113 * this is safe, rounds downwards.
8114@@ -1109,114 +3442,384 @@
8115 j *= blocksize/curr_bsize;
8116
8117 blocksize = curr_bsize;
8118+ nr_blocks = window / (blocksize >> 10);
8119+ if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
8120+ nr_blocks = MAX_NR_BLOCKS;
8121 max_blocks = blk_size[major][minor] / (blocksize >> 10);
8122- }
8123- if ((bh = breada (read_disk, j, blocksize, j * blocksize,
8124- max_blocks * blocksize)) != NULL) {
8125- mark_buffer_dirty(bh, 1);
8126- brelse(bh);
8127- } else {
8128+ printk("nr_blocks changed to %d (blocksize %d, j %d, max_blocks %d)\n",
8129+ nr_blocks, blocksize, j, max_blocks);
8130 /*
8131- * FIXME: Ugly, but set_blocksize() isnt safe ...
8132+ * We will retry the current block-group
8133 */
8134- curr_bsize = blksize_size[major][minor];
8135- if (curr_bsize != blocksize)
8136- goto diff_blocksize;
8137+ }
8138
8139- /*
8140- * It's a real read problem. FIXME, handle this
8141- * a better way.
8142- */
8143- printk ( KERN_ALERT
8144- "read error, stopping reconstruction.\n");
8145- mddev->busy--;
8146- return 1;
8147+ /*
8148+ * Cleanup routines expect this
8149+ */
8150+ for (k = 0; k < nr_blocks; k++)
8151+ bh[k] = NULL;
8152+
8153+ chunk = nr_blocks;
8154+ if (chunk > max_blocks-j)
8155+ chunk = max_blocks-j;
8156+
8157+ /*
8158+ * request buffer heads ...
8159+ */
8160+ for (i = 0; i < chunk; i++) {
8161+ bh[i] = getblk (read_disk, j+i, blocksize);
8162+ if (!bh[i])
8163+ goto read_error;
8164+ if (!buffer_dirty(bh[i]))
8165+ mark_buffer_lowprio(bh[i]);
8166 }
8167
8168 /*
8169- * Let's sleep some if we are faster than our speed limit:
8170+ * read buffer heads ...
8171 */
8172- while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT)
8173- {
8174- current->state = TASK_INTERRUPTIBLE;
8175- schedule_timeout(1);
8176+ ll_rw_block (READ, chunk, bh);
8177+ run_task_queue(&tq_disk);
8178+
8179+ /*
8180+ * verify that all of them are OK ...
8181+ */
8182+ for (i = 0; i < chunk; i++) {
8183+ ii = chunk-i-1;
8184+ wait_on_buffer(bh[ii]);
8185+ if (!buffer_uptodate(bh[ii]))
8186+ goto read_error;
8187+ }
8188+
8189+retry_write:
8190+ for (i = 0; i < chunk; i++)
8191+ mark_buffer_dirty_lowprio(bh[i]);
8192+
8193+ ll_rw_block(WRITE, chunk, bh);
8194+ run_task_queue(&tq_disk);
8195+
8196+ for (i = 0; i < chunk; i++) {
8197+ ii = chunk-i-1;
8198+ wait_on_buffer(bh[ii]);
8199+
8200+ if (spare && disk_faulty(spare)) {
8201+ for (k = 0; k < chunk; k++)
8202+ brelse(bh[k]);
8203+ printk(" <SPARE FAILED!>\n ");
8204+ err = -EIO;
8205+ goto out;
8206+ }
8207+
8208+ if (!buffer_uptodate(bh[ii])) {
8209+ curr_bsize = device_bsize(read_disk);
8210+ if (curr_bsize != blocksize) {
8211+ printk(KERN_INFO
8212+ "md%d: blocksize changed during write\n",
8213+ mdidx(mddev));
8214+ for (k = 0; k < chunk; k++)
8215+ if (bh[k]) {
8216+ if (buffer_lowprio(bh[k]))
8217+ mark_buffer_clean(bh[k]);
8218+ brelse(bh[k]);
8219+ }
8220+ goto retry_read;
8221+ }
8222+ printk(" BAD WRITE %8d>\n", j);
8223+ /*
8224+ * Ouch, write error, retry or bail out.
8225+ */
8226+ if (max_write_errors) {
8227+ max_write_errors--;
8228+ printk ( KERN_WARNING "md%d: write error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
8229+ goto retry_write;
8230+ }
8231+ printk ( KERN_ALERT
8232+ "too many write errors, stopping reconstruction.\n");
8233+ for (k = 0; k < chunk; k++)
8234+ if (bh[k]) {
8235+ if (buffer_lowprio(bh[k]))
8236+ mark_buffer_clean(bh[k]);
8237+ brelse(bh[k]);
8238+ }
8239+ err = -EIO;
8240+ goto out;
8241+ }
8242 }
8243
8244 /*
8245- * FIXME: put this status bar thing into /proc
8246+ * This is the normal 'everything went OK' case
8247+		 * do a 'free-behind' logic, we surely don't need
8248+ * this buffer if it was the only user.
8249 */
8250- if (!(j%(max_blocks/100))) {
8251- if (!(percent%10))
8252- printk (" %03d%% done.\n",percent);
8253+ for (i = 0; i < chunk; i++)
8254+ if (buffer_dirty(bh[i]))
8255+ brelse(bh[i]);
8256 else
8257- printk (".");
8258- percent++;
8259+ bforget(bh[i]);
8260+
8261+
8262+ if (md_signal_pending(current)) {
8263+ /*
8264+ * got a signal, exit.
8265+ */
8266+ mddev->curr_resync = 0;
8267+ printk("md_do_sync() got signal ... exiting\n");
8268+ md_flush_signals();
8269+ err = -EINTR;
8270+ goto out;
8271 }
8272+
8273+ /*
8274+		 * this loop exits only when we are slower than
8275+ * the 'hard' speed limit, or the system was IO-idle for
8276+ * a jiffy.
8277+ * the system might be non-idle CPU-wise, but we only care
8278+ * about not overloading the IO subsystem. (things like an
8279+ * e2fsck being done on the RAID array should execute fast)
8280+ */
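		/*
		 * Worked example (hypothetical numbers): with 1k blocks and
		 * j=30000 blocks done after 200 seconds, the rate is
		 * 30000/(200+1)+1 = 150 KB/sec; if sysctl_speed_limit is, say,
		 * 100 KB/sec we are above the guaranteed minimum, so we back
		 * off in HZ/2 steps whenever is_mddev_idle() reports other IO
		 * in flight, and only run at full priority when we drop below
		 * the limit.
		 */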
8281+repeat:
8282+ if (md_need_resched(current))
8283+ schedule();
8284+
8285+ if ((blocksize/1024)*j/((jiffies-starttime)/HZ + 1) + 1
8286+ > sysctl_speed_limit) {
8287+ current->priority = 1;
8288+
8289+ if (!is_mddev_idle(mddev)) {
8290+ current->state = TASK_INTERRUPTIBLE;
8291+ md_schedule_timeout(HZ/2);
8292+ if (!md_signal_pending(current))
8293+ goto repeat;
8294+ }
8295+ } else
8296+ current->priority = 40;
8297 }
8298 fsync_dev(read_disk);
8299- printk("md: %s: sync done.\n", kdevname(read_disk));
8300- mddev->busy--;
8301- return 0;
8302+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
8303+ err = 0;
8304+ /*
8305+ * this also signals 'finished resyncing' to md_stop
8306+ */
8307+out:
8308+ up(&mddev->resync_sem);
8309+out_nolock:
8310+ free_pages((unsigned long)bh, RA_ORDER);
8311+ mddev->curr_resync = 0;
8312+ wake_up(&resync_wait);
8313+ return err;
8314+
8315+read_error:
8316+ /*
8317+ * set_blocksize() might change the blocksize. This
8318+ * should not happen often, but it happens when, e.g.,
8319+ * someone mounts a filesystem that has a non-1k
8320+ * blocksize. set_blocksize() doesn't touch our
8321+ * buffer, but to avoid aliasing problems we change
8322+ * our internal blocksize too and retry the read.
8323+ */
8324+ curr_bsize = device_bsize(read_disk);
8325+ if (curr_bsize != blocksize) {
8326+ printk(KERN_INFO "md%d: blocksize changed during read\n",
8327+ mdidx(mddev));
8328+ for (k = 0; k < chunk; k++)
8329+ if (bh[k]) {
8330+ if (buffer_lowprio(bh[k]))
8331+ mark_buffer_clean(bh[k]);
8332+ brelse(bh[k]);
8333+ }
8334+ goto retry_read;
8335+ }
8336+
8337+ /*
8338+ * It's a real read problem. We retry and bail out
8339+ * only if it's excessive.
8340+ */
8341+ if (max_read_errors) {
8342+ max_read_errors--;
8343+ printk ( KERN_WARNING "md%d: read error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
8344+ for (k = 0; k < chunk; k++)
8345+ if (bh[k]) {
8346+ if (buffer_lowprio(bh[k]))
8347+ mark_buffer_clean(bh[k]);
8348+ brelse(bh[k]);
8349+ }
8350+ goto retry_read;
8351+ }
8352+ printk ( KERN_ALERT "too many read errors, stopping reconstruction.\n");
8353+ for (k = 0; k < chunk; k++)
8354+ if (bh[k]) {
8355+ if (buffer_lowprio(bh[k]))
8356+ mark_buffer_clean(bh[k]);
8357+ brelse(bh[k]);
8358+ }
8359+ err = -EIO;
8360+ goto out;
8361 }
8362
8363+#undef MAX_NR_BLOCKS
8364+
8365 /*
8366- * This is a kernel thread which: syncs a spare disk with the active array
8367+ * This is a kernel thread which syncs a spare disk with the active array
8368 *
8369 * the amount of foolproofing might seem to be a tad excessive, but an
8370 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
8371 * of my root partition with the first 0.5 gigs of my /home partition ... so
8372 * i'm a bit nervous ;)
8373 */
8374-void mdsyncd (void *data)
8375+void md_do_recovery (void *data)
8376 {
8377- int i;
8378- struct md_dev *mddev;
8379- md_superblock_t *sb;
8380- md_descriptor_t *spare;
8381+ int err;
8382+ mddev_t *mddev;
8383+ mdp_super_t *sb;
8384+ mdp_disk_t *spare;
8385 unsigned long flags;
8386+ struct md_list_head *tmp;
8387
8388- for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) {
8389- if ((sb = mddev->sb) == NULL)
8390+ printk(KERN_INFO "md: recovery thread got woken up ...\n");
8391+restart:
8392+ ITERATE_MDDEV(mddev,tmp) {
8393+ sb = mddev->sb;
8394+ if (!sb)
8395+ continue;
8396+ if (mddev->recovery_running)
8397 continue;
8398 if (sb->active_disks == sb->raid_disks)
8399 continue;
8400- if (!sb->spare_disks)
8401+ if (!sb->spare_disks) {
8402+ printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
8403 continue;
8404+ }
8405+ /*
8406+ * now here we get the spare and resync it.
8407+ */
8408 if ((spare = get_spare(mddev)) == NULL)
8409 continue;
8410- if (!mddev->pers->mark_spare)
8411+ printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
8412+ if (!mddev->pers->diskop)
8413 continue;
8414- if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE))
8415+ if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
8416 continue;
8417- if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) {
8418- mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE);
8419+ down(&mddev->recovery_sem);
8420+ mddev->recovery_running = 1;
8421+ err = md_do_sync(mddev, spare);
8422+ if (err == -EIO) {
8423+ printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
8424+ if (!disk_faulty(spare)) {
8425+ mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
8426+ mark_disk_faulty(spare);
8427+ mark_disk_nonsync(spare);
8428+ mark_disk_inactive(spare);
8429+ sb->spare_disks--;
8430+ sb->working_disks--;
8431+ sb->failed_disks++;
8432+ }
8433+ } else
8434+ if (disk_faulty(spare))
8435+ mddev->pers->diskop(mddev, &spare,
8436+ DISKOP_SPARE_INACTIVE);
8437+ if (err == -EINTR) {
8438+ /*
8439+ * Recovery got interrupted ...
8440+ * signal back that we have finished using the array.
8441+ */
8442+ mddev->pers->diskop(mddev, &spare,
8443+ DISKOP_SPARE_INACTIVE);
8444+ up(&mddev->recovery_sem);
8445+ mddev->recovery_running = 0;
8446 continue;
8447+ } else {
8448+ mddev->recovery_running = 0;
8449+ up(&mddev->recovery_sem);
8450 }
8451 save_flags(flags);
8452 cli();
8453- mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE);
8454- spare->state |= (1 << MD_SYNC_DEVICE);
8455- spare->state |= (1 << MD_ACTIVE_DEVICE);
8456- sb->spare_disks--;
8457- sb->active_disks++;
8458- mddev->sb_dirty = 1;
8459- md_update_sb(mddev - md_dev);
8460+ if (!disk_faulty(spare)) {
8461+ /*
8462+ * the SPARE_ACTIVE diskop possibly changes the
8463+ * pointer too
8464+ */
8465+ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
8466+ mark_disk_sync(spare);
8467+ mark_disk_active(spare);
8468+ sb->active_disks++;
8469+ sb->spare_disks--;
8470+ }
8471 restore_flags(flags);
8472+ mddev->sb_dirty = 1;
8473+ md_update_sb(mddev);
8474+ goto restart;
8475 }
8476+ printk(KERN_INFO "md: recovery thread finished ...\n");
8477
8478 }
8479
8480+int md_notify_reboot(struct notifier_block *this,
8481+ unsigned long code, void *x)
8482+{
8483+ struct md_list_head *tmp;
8484+ mddev_t *mddev;
8485+
8486+ if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
8487+ || (code == MD_SYS_POWER_OFF)) {
8488+
8489+ printk(KERN_INFO "stopping all md devices.\n");
8490+
8491+ ITERATE_MDDEV(mddev,tmp)
8492+ do_md_stop (mddev, 1);
8493+ /*
8494+ * certain more exotic SCSI devices are known to be
8495+ * volatile wrt too early system reboots. While the
8496+ * right place to handle this issue is the given
8497+ * driver, we do want to have a safe RAID driver ...
8498+ */
8499+ md_mdelay(1000*1);
8500+ }
8501+ return NOTIFY_DONE;
8502+}
8503+
8504+struct notifier_block md_notifier = {
8505+ md_notify_reboot,
8506+ NULL,
8507+ 0
8508+};
8509+
8510+md__initfunc(void raid_setup(char *str, int *ints))
8511+{
8512+ char tmpline[100];
8513+ int len, pos, nr, i;
8514+
8515+ len = strlen(str) + 1;
8516+ nr = 0;
8517+ pos = 0;
8518+
8519+ for (i = 0; i < len; i++) {
8520+ char c = str[i];
8521+
8522+ if (c == ',' || !c) {
8523+ tmpline[pos] = 0;
8524+ if (!strcmp(tmpline,"noautodetect"))
8525+ raid_setup_args.noautodetect = 1;
8526+ nr++;
8527+ pos = 0;
8528+ continue;
8529+ }
8530+ tmpline[pos] = c;
8531+ pos++;
8532+ }
8533+ raid_setup_args.set = 1;
8534+ return;
8535+}
8536+
8537 #ifdef CONFIG_MD_BOOT
8538 struct {
8539 int set;
8540 int ints[100];
8541 char str[100];
8542-} md_setup_args __initdata = {
8543+} md_setup_args md__initdata = {
8544 0,{0},{0}
8545 };
8546
8547 /* called from init/main.c */
8548-__initfunc(void md_setup(char *str,int *ints))
8549+md__initfunc(void md_setup(char *str,int *ints))
8550 {
8551 int i;
8552 for(i=0;i<=ints[0];i++) {
8553@@ -1228,21 +3831,24 @@
8554 return;
8555 }
8556
8557-__initfunc(void do_md_setup(char *str,int *ints))
8558+md__initfunc(void do_md_setup(char *str,int *ints))
8559 {
8560- int minor, pers, factor, fault;
8561+#if 0
8562+ int minor, pers, chunk_size, fault;
8563 kdev_t dev;
8564 int i=1;
8565
8566+ printk("i plan to phase this out --mingo\n");
8567+
8568 if(ints[0] < 4) {
8569- printk ("md: Too few Arguments (%d).\n", ints[0]);
8570+ printk (KERN_WARNING "md: Too few Arguments (%d).\n", ints[0]);
8571 return;
8572 }
8573
8574 minor=ints[i++];
8575
8576- if (minor >= MAX_MD_DEV) {
8577- printk ("md: Minor device number too high.\n");
8578+ if ((unsigned int)minor >= MAX_MD_DEVS) {
8579+ printk (KERN_WARNING "md: Minor device number too high.\n");
8580 return;
8581 }
8582
8583@@ -1252,18 +3858,20 @@
8584 case -1:
8585 #ifdef CONFIG_MD_LINEAR
8586 pers = LINEAR;
8587- printk ("md: Setting up md%d as linear device.\n",minor);
8588+ printk (KERN_INFO "md: Setting up md%d as linear device.\n",
8589+ minor);
8590 #else
8591- printk ("md: Linear mode not configured."
8592+ printk (KERN_WARNING "md: Linear mode not configured."
8593 "Recompile the kernel with linear mode enabled!\n");
8594 #endif
8595 break;
8596 case 0:
8597 pers = STRIPED;
8598 #ifdef CONFIG_MD_STRIPED
8599- printk ("md: Setting up md%d as a striped device.\n",minor);
8600+ printk (KERN_INFO "md: Setting up md%d as a striped device.\n",
8601+ minor);
8602 #else
8603- printk ("md: Striped mode not configured."
8604+ printk (KERN_WARNING "md: Striped mode not configured."
8605 "Recompile the kernel with striped mode enabled!\n");
8606 #endif
8607 break;
8608@@ -1278,79 +3886,145 @@
8609 break;
8610 */
8611 default:
8612- printk ("md: Unknown or not supported raid level %d.\n", ints[--i]);
8613+ printk (KERN_WARNING "md: Unknown or not supported raid level %d.\n", ints[--i]);
8614 return;
8615 }
8616
8617- if(pers) {
8618+ if (pers) {
8619
8620- factor=ints[i++]; /* Chunksize */
8621- fault =ints[i++]; /* Faultlevel */
8622+ chunk_size = ints[i++]; /* Chunksize */
8623+ fault = ints[i++]; /* Faultlevel */
8624
8625- pers=pers | factor | (fault << FAULT_SHIFT);
8626+ pers = pers | chunk_size | (fault << FAULT_SHIFT);
8627
8628- while( str && (dev = name_to_kdev_t(str))) {
8629- do_md_add (minor, dev);
8630- if((str = strchr (str, ',')) != NULL)
8631- str++;
8632- }
8633+ while( str && (dev = name_to_kdev_t(str))) {
8634+ do_md_add (minor, dev);
8635+ if((str = strchr (str, ',')) != NULL)
8636+ str++;
8637+ }
8638
8639- do_md_run (minor, pers);
8640- printk ("md: Loading md%d.\n",minor);
8641+ do_md_run (minor, pers);
8642+ printk (KERN_INFO "md: Loading md%d.\n",minor);
8643 }
8644-
8645+#endif
8646 }
8647 #endif
8648
8649+void hsm_init (void);
8650+void translucent_init (void);
8651 void linear_init (void);
8652 void raid0_init (void);
8653 void raid1_init (void);
8654 void raid5_init (void);
8655
8656-__initfunc(int md_init (void))
8657+md__initfunc(int md_init (void))
8658 {
8659- printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
8660- MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION,
8661- MAX_MD_DEV, MAX_REAL);
8662-
8663- if (register_blkdev (MD_MAJOR, "md", &md_fops))
8664- {
8665- printk ("Unable to get major %d for md\n", MD_MAJOR);
8666- return (-1);
8667- }
8668-
8669- blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST;
8670- blk_dev[MD_MAJOR].current_request=NULL;
8671- read_ahead[MD_MAJOR]=INT_MAX;
8672- memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev));
8673- md_gendisk.next=gendisk_head;
8674-
8675- gendisk_head=&md_gendisk;
8676-
8677-#if SUPPORT_RECONSTRUCTION
8678- if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL)
8679- printk("md: bug: md_sync_thread == NULL\n");
8680-#endif /* SUPPORT_RECONSTRUCTION */
8681+ static char * name = "mdrecoveryd";
8682+
8683+ printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n",
8684+ MD_MAJOR_VERSION, MD_MINOR_VERSION,
8685+ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL);
8686+
8687+ if (register_blkdev (MD_MAJOR, "md", &md_fops))
8688+ {
8689+ printk (KERN_ALERT "Unable to get major %d for md\n", MD_MAJOR);
8690+ return (-1);
8691+ }
8692+
8693+ blk_dev[MD_MAJOR].request_fn = DEVICE_REQUEST;
8694+ blk_dev[MD_MAJOR].current_request = NULL;
8695+ read_ahead[MD_MAJOR] = INT_MAX;
8696+ md_gendisk.next = gendisk_head;
8697+
8698+ gendisk_head = &md_gendisk;
8699+
8700+ md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
8701+ if (!md_recovery_thread)
8702+ printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n");
8703
8704+ md_register_reboot_notifier(&md_notifier);
8705+ md_register_sysctl();
8706+
8707+#ifdef CONFIG_MD_HSM
8708+ hsm_init ();
8709+#endif
8710+#ifdef CONFIG_MD_TRANSLUCENT
8711+ translucent_init ();
8712+#endif
8713 #ifdef CONFIG_MD_LINEAR
8714- linear_init ();
8715+ linear_init ();
8716 #endif
8717 #ifdef CONFIG_MD_STRIPED
8718- raid0_init ();
8719+ raid0_init ();
8720 #endif
8721 #ifdef CONFIG_MD_MIRRORING
8722- raid1_init ();
8723+ raid1_init ();
8724 #endif
8725 #ifdef CONFIG_MD_RAID5
8726- raid5_init ();
8727+ raid5_init ();
8728+#endif
8729+#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE)
8730+ /*
8731+ * pick a XOR routine, runtime.
8732+ */
8733+ calibrate_xor_block();
8734 #endif
8735- return (0);
8736+
8737+ return (0);
8738 }
8739
8740 #ifdef CONFIG_MD_BOOT
8741-__initfunc(void md_setup_drive(void))
8742+md__initfunc(void md_setup_drive(void))
8743 {
8744 if(md_setup_args.set)
8745 do_md_setup(md_setup_args.str, md_setup_args.ints);
8746 }
8747 #endif
8748+
8749+MD_EXPORT_SYMBOL(md_size);
8750+MD_EXPORT_SYMBOL(register_md_personality);
8751+MD_EXPORT_SYMBOL(unregister_md_personality);
8752+MD_EXPORT_SYMBOL(partition_name);
8753+MD_EXPORT_SYMBOL(md_error);
8754+MD_EXPORT_SYMBOL(md_recover_arrays);
8755+MD_EXPORT_SYMBOL(md_register_thread);
8756+MD_EXPORT_SYMBOL(md_unregister_thread);
8757+MD_EXPORT_SYMBOL(md_update_sb);
8758+MD_EXPORT_SYMBOL(md_map);
8759+MD_EXPORT_SYMBOL(md_wakeup_thread);
8760+MD_EXPORT_SYMBOL(md_do_sync);
8761+MD_EXPORT_SYMBOL(md_print_devices);
8762+MD_EXPORT_SYMBOL(find_rdev_nr);
8763+MD_EXPORT_SYMBOL(md_check_ordering);
8764+MD_EXPORT_SYMBOL(md_interrupt_thread);
8765+MD_EXPORT_SYMBOL(mddev_map);
8766+
8767+#ifdef CONFIG_PROC_FS
8768+static struct proc_dir_entry proc_md = {
8769+ PROC_MD, 6, "mdstat",
8770+ S_IFREG | S_IRUGO, 1, 0, 0,
8771+ 0, &proc_array_inode_operations,
8772+};
8773+#endif
8774+
8775+static void md_geninit (struct gendisk *gdisk)
8776+{
8777+ int i;
8778+
8779+ for(i = 0; i < MAX_MD_DEVS; i++) {
8780+ md_blocksizes[i] = 1024;
8781+ md_maxreadahead[i] = MD_READAHEAD;
8782+ md_gendisk.part[i].start_sect = -1; /* avoid partition check */
8783+ md_gendisk.part[i].nr_sects = 0;
8784+ }
8785+
8786+ printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
8787+
8788+ blksize_size[MD_MAJOR] = md_blocksizes;
8789+ md_set_global_readahead(md_maxreadahead);
8790+
8791+#ifdef CONFIG_PROC_FS
8792+ proc_register(&proc_root, &proc_md);
8793+#endif
8794+}
8795+
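
The repeat: loop in md_do_sync() above throttles reconstruction: it estimates the achieved rate as (blocksize/1024)*j kilobytes divided by the seconds elapsed since starttime, and keeps sleeping half a second at low priority while that estimate exceeds sysctl_speed_limit and the array is not IO-idle. A minimal standalone sketch of that rate check, with invented names and 'hz' passed in instead of the kernel's HZ constant:

	/*
	 * Sketch only, not part of the patch: the speed-limit test used by
	 * md_do_sync() above, as a standalone helper.  All names are invented.
	 */
	static int resync_above_limit(unsigned long blocksize, unsigned long blocks_done,
				      unsigned long jiffies_elapsed, unsigned long hz,
				      unsigned long speed_limit_kb)
	{
		unsigned long elapsed_sec = jiffies_elapsed / hz + 1;
		unsigned long rate_kb_per_sec =
			(blocksize / 1024) * blocks_done / elapsed_sec + 1;

		return rate_kb_per_sec > speed_limit_kb;	/* nonzero: throttle */
	}

When this returns nonzero the sync thread drops its priority and sleeps HZ/2 before re-checking, unless the array has gone idle in the meantime.
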
8796--- linux/drivers/block/raid0.c.orig Mon Sep 4 19:39:16 2000
8797+++ linux/drivers/block/raid0.c Tue Jan 16 13:42:04 2001
8798@@ -1,4 +1,3 @@
8799-
8800 /*
8801 raid0.c : Multiple Devices driver for Linux
8802 Copyright (C) 1994-96 Marc ZYNGIER
8803@@ -18,146 +17,201 @@
8804 */
8805
8806 #include <linux/module.h>
8807-#include <linux/md.h>
8808-#include <linux/raid0.h>
8809-#include <linux/vmalloc.h>
8810+#include <linux/raid/raid0.h>
8811
8812 #define MAJOR_NR MD_MAJOR
8813 #define MD_DRIVER
8814 #define MD_PERSONALITY
8815
8816-static int create_strip_zones (int minor, struct md_dev *mddev)
8817+static int create_strip_zones (mddev_t *mddev)
8818 {
8819- int i, j, c=0;
8820- int current_offset=0;
8821- struct real_dev *smallest_by_zone;
8822- struct raid0_data *data=(struct raid0_data *) mddev->private;
8823-
8824- data->nr_strip_zones=1;
8825-
8826- for (i=1; i<mddev->nb_dev; i++)
8827- {
8828- for (j=0; j<i; j++)
8829- if (mddev->devices[i].size==mddev->devices[j].size)
8830- {
8831- c=1;
8832- break;
8833- }
8834-
8835- if (!c)
8836- data->nr_strip_zones++;
8837-
8838- c=0;
8839- }
8840-
8841- if ((data->strip_zone=vmalloc(sizeof(struct strip_zone)*data->nr_strip_zones)) == NULL)
8842- return 1;
8843-
8844- data->smallest=NULL;
8845-
8846- for (i=0; i<data->nr_strip_zones; i++)
8847- {
8848- data->strip_zone[i].dev_offset=current_offset;
8849- smallest_by_zone=NULL;
8850- c=0;
8851-
8852- for (j=0; j<mddev->nb_dev; j++)
8853- if (mddev->devices[j].size>current_offset)
8854- {
8855- data->strip_zone[i].dev[c++]=mddev->devices+j;
8856- if (!smallest_by_zone ||
8857- smallest_by_zone->size > mddev->devices[j].size)
8858- smallest_by_zone=mddev->devices+j;
8859- }
8860-
8861- data->strip_zone[i].nb_dev=c;
8862- data->strip_zone[i].size=(smallest_by_zone->size-current_offset)*c;
8863-
8864- if (!data->smallest ||
8865- data->smallest->size > data->strip_zone[i].size)
8866- data->smallest=data->strip_zone+i;
8867-
8868- data->strip_zone[i].zone_offset=i ? (data->strip_zone[i-1].zone_offset+
8869- data->strip_zone[i-1].size) : 0;
8870- current_offset=smallest_by_zone->size;
8871- }
8872- return 0;
8873+ int i, c, j, j1, j2;
8874+ int current_offset, curr_zone_offset;
8875+ raid0_conf_t *conf = mddev_to_conf(mddev);
8876+ mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
8877+
8878+ /*
8879+ * The number of 'same size groups'
8880+ */
8881+ conf->nr_strip_zones = 0;
8882+
8883+ ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
8884+ printk("raid0: looking at %s\n", partition_name(rdev1->dev));
8885+ c = 0;
8886+ ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
8887+ printk("raid0: comparing %s(%d) with %s(%d)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size);
8888+ if (rdev2 == rdev1) {
8889+ printk("raid0: END\n");
8890+ break;
8891+ }
8892+ if (rdev2->size == rdev1->size)
8893+ {
8894+ /*
8895+ * Not unique, don't count it as a new
8896+ * group
8897+ */
8898+ printk("raid0: EQUAL\n");
8899+ c = 1;
8900+ break;
8901+ }
8902+ printk("raid0: NOT EQUAL\n");
8903+ }
8904+ if (!c) {
8905+ printk("raid0: ==> UNIQUE\n");
8906+ conf->nr_strip_zones++;
8907+ printk("raid0: %d zones\n", conf->nr_strip_zones);
8908+ }
8909+ }
8910+ printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
8911+
8912+ conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
8913+ conf->nr_strip_zones);
8914+ if (!conf->strip_zone)
8915+ return 1;
8916+
8917+
8918+ conf->smallest = NULL;
8919+ current_offset = 0;
8920+ curr_zone_offset = 0;
8921+
8922+ for (i = 0; i < conf->nr_strip_zones; i++)
8923+ {
8924+ struct strip_zone *zone = conf->strip_zone + i;
8925+
8926+ printk("zone %d\n", i);
8927+ zone->dev_offset = current_offset;
8928+ smallest = NULL;
8929+ c = 0;
8930+
8931+ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
8932+
8933+ printk(" checking %s ...", partition_name(rdev->dev));
8934+ if (rdev->size > current_offset)
8935+ {
8936+ printk(" contained as device %d\n", c);
8937+ zone->dev[c] = rdev;
8938+ c++;
8939+ if (!smallest || (rdev->size <smallest->size)) {
8940+ smallest = rdev;
8941+ printk(" (%d) is smallest!\n", rdev->size);
8942+ }
8943+ } else
8944+ printk(" nope.\n");
8945+ }
8946+
8947+ zone->nb_dev = c;
8948+ zone->size = (smallest->size - current_offset) * c;
8949+ printk(" zone->nb_dev: %d, size: %d\n",zone->nb_dev,zone->size);
8950+
8951+ if (!conf->smallest || (zone->size < conf->smallest->size))
8952+ conf->smallest = zone;
8953+
8954+ zone->zone_offset = curr_zone_offset;
8955+ curr_zone_offset += zone->size;
8956+
8957+ current_offset = smallest->size;
8958+ printk("current zone offset: %d\n", current_offset);
8959+ }
8960+ printk("done.\n");
8961+ return 0;
8962 }
8963
8964-static int raid0_run (int minor, struct md_dev *mddev)
8965+static int raid0_run (mddev_t *mddev)
8966 {
8967- int cur=0, i=0, size, zone0_size, nb_zone;
8968- struct raid0_data *data;
8969-
8970- MOD_INC_USE_COUNT;
8971+ int cur=0, i=0, size, zone0_size, nb_zone;
8972+ raid0_conf_t *conf;
8973
8974- if ((mddev->private=vmalloc (sizeof (struct raid0_data))) == NULL) return 1;
8975- data=(struct raid0_data *) mddev->private;
8976-
8977- if (create_strip_zones (minor, mddev))
8978- {
8979- vfree(data);
8980- return 1;
8981- }
8982-
8983- nb_zone=data->nr_zones=
8984- md_size[minor]/data->smallest->size +
8985- (md_size[minor]%data->smallest->size ? 1 : 0);
8986-
8987- printk ("raid0 : Allocating %ld bytes for hash.\n",(long)sizeof(struct raid0_hash)*nb_zone);
8988- if ((data->hash_table=vmalloc (sizeof (struct raid0_hash)*nb_zone)) == NULL)
8989- {
8990- vfree(data->strip_zone);
8991- vfree(data);
8992- return 1;
8993- }
8994- size=data->strip_zone[cur].size;
8995-
8996- i=0;
8997- while (cur<data->nr_strip_zones)
8998- {
8999- data->hash_table[i].zone0=data->strip_zone+cur;
9000-
9001- if (size>=data->smallest->size)/* If we completely fill the slot */
9002- {
9003- data->hash_table[i++].zone1=NULL;
9004- size-=data->smallest->size;
9005-
9006- if (!size)
9007- {
9008- if (++cur==data->nr_strip_zones) continue;
9009- size=data->strip_zone[cur].size;
9010- }
9011-
9012- continue;
9013- }
9014-
9015- if (++cur==data->nr_strip_zones) /* Last dev, set unit1 as NULL */
9016- {
9017- data->hash_table[i].zone1=NULL;
9018- continue;
9019- }
9020-
9021- zone0_size=size; /* Here, we use a 2nd dev to fill the slot */
9022- size=data->strip_zone[cur].size;
9023- data->hash_table[i++].zone1=data->strip_zone+cur;
9024- size-=(data->smallest->size - zone0_size);
9025- }
9026+ MOD_INC_USE_COUNT;
9027
9028- return (0);
9029+ conf = vmalloc(sizeof (raid0_conf_t));
9030+ if (!conf)
9031+ goto out;
9032+ mddev->private = (void *)conf;
9033+
9034+ if (md_check_ordering(mddev)) {
9035+ printk("raid0: disks are not ordered, aborting!\n");
9036+ goto out_free_conf;
9037+ }
9038+
9039+ if (create_strip_zones (mddev))
9040+ goto out_free_conf;
9041+
9042+ printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]);
9043+ printk("raid0 : conf->smallest->size is %d blocks.\n", conf->smallest->size);
9044+ nb_zone = md_size[mdidx(mddev)]/conf->smallest->size +
9045+ (md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
9046+ printk("raid0 : nb_zone is %d.\n", nb_zone);
9047+ conf->nr_zones = nb_zone;
9048+
9049+ printk("raid0 : Allocating %d bytes for hash.\n",
9050+ sizeof(struct raid0_hash)*nb_zone);
9051+
9052+ conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
9053+ if (!conf->hash_table)
9054+ goto out_free_zone_conf;
9055+ size = conf->strip_zone[cur].size;
9056+
9057+ i = 0;
9058+ while (cur < conf->nr_strip_zones) {
9059+ conf->hash_table[i].zone0 = conf->strip_zone + cur;
9060+
9061+ /*
9062+ * If we completely fill the slot
9063+ */
9064+ if (size >= conf->smallest->size) {
9065+ conf->hash_table[i++].zone1 = NULL;
9066+ size -= conf->smallest->size;
9067+
9068+ if (!size) {
9069+ if (++cur == conf->nr_strip_zones)
9070+ continue;
9071+ size = conf->strip_zone[cur].size;
9072+ }
9073+ continue;
9074+ }
9075+ if (++cur == conf->nr_strip_zones) {
9076+ /*
9077+ * Last dev, set unit1 as NULL
9078+ */
9079+ conf->hash_table[i].zone1=NULL;
9080+ continue;
9081+ }
9082+
9083+ /*
9084+ * Here we use a 2nd dev to fill the slot
9085+ */
9086+ zone0_size = size;
9087+ size = conf->strip_zone[cur].size;
9088+ conf->hash_table[i++].zone1 = conf->strip_zone + cur;
9089+ size -= (conf->smallest->size - zone0_size);
9090+ }
9091+ return 0;
9092+
9093+out_free_zone_conf:
9094+ vfree(conf->strip_zone);
9095+ conf->strip_zone = NULL;
9096+
9097+out_free_conf:
9098+ vfree(conf);
9099+ mddev->private = NULL;
9100+out:
9101+ MOD_DEC_USE_COUNT;
9102+ return 1;
9103 }
9104
9105-
9106-static int raid0_stop (int minor, struct md_dev *mddev)
9107+static int raid0_stop (mddev_t *mddev)
9108 {
9109- struct raid0_data *data=(struct raid0_data *) mddev->private;
9110+ raid0_conf_t *conf = mddev_to_conf(mddev);
9111
9112- vfree (data->hash_table);
9113- vfree (data->strip_zone);
9114- vfree (data);
9115+ vfree (conf->hash_table);
9116+ conf->hash_table = NULL;
9117+ vfree (conf->strip_zone);
9118+ conf->strip_zone = NULL;
9119+ vfree (conf);
9120+ mddev->private = NULL;
9121
9122- MOD_DEC_USE_COUNT;
9123- return 0;
9124+ MOD_DEC_USE_COUNT;
9125+ return 0;
9126 }
9127
9128 /*
9129@@ -167,135 +221,140 @@
9130 * Of course, those facts may not be valid anymore (and surely won't...)
9131 * Hey guys, there's some work out there ;-)
9132 */
9133-static int raid0_map (struct md_dev *mddev, kdev_t *rdev,
9134+static int raid0_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
9135 unsigned long *rsector, unsigned long size)
9136 {
9137- struct raid0_data *data=(struct raid0_data *) mddev->private;
9138- static struct raid0_hash *hash;
9139- struct strip_zone *zone;
9140- struct real_dev *tmp_dev;
9141- int blk_in_chunk, factor, chunk, chunk_size;
9142- long block, rblock;
9143-
9144- factor=FACTOR(mddev);
9145- chunk_size=(1UL << FACTOR_SHIFT(factor));
9146- block=*rsector >> 1;
9147- hash=data->hash_table+(block/data->smallest->size);
9148-
9149- if (hash - data->hash_table > data->nr_zones)
9150- {
9151- printk(KERN_DEBUG "raid0_map: invalid block %li\n", block);
9152- return -1;
9153- }
9154-
9155- /* Sanity check */
9156- if ((chunk_size*2)<(*rsector % (chunk_size*2))+size)
9157- {
9158- printk ("raid0_convert : can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
9159- return (-1);
9160- }
9161-
9162- if (block >= (hash->zone0->size +
9163- hash->zone0->zone_offset))
9164- {
9165- if (!hash->zone1)
9166- {
9167- printk ("raid0_convert : hash->zone1==NULL for block %ld\n", block);
9168- return (-1);
9169- }
9170-
9171- zone=hash->zone1;
9172- }
9173- else
9174- zone=hash->zone0;
9175+ raid0_conf_t *conf = mddev_to_conf(mddev);
9176+ struct raid0_hash *hash;
9177+ struct strip_zone *zone;
9178+ mdk_rdev_t *tmp_dev;
9179+ int blk_in_chunk, chunksize_bits, chunk, chunk_size;
9180+ long block, rblock;
9181+
9182+ chunk_size = mddev->param.chunk_size >> 10;
9183+ chunksize_bits = ffz(~chunk_size);
9184+ block = *rsector >> 1;
9185+ hash = conf->hash_table + block / conf->smallest->size;
9186+
9187+ if (hash - conf->hash_table > conf->nr_zones) {
9188+ printk(KERN_DEBUG "raid0_map: invalid block %lu\n", block);
9189+ return -1;
9190+ }
9191+
9192+ /* Sanity check */
9193+ if ((chunk_size * 2) < (*rsector % (chunk_size * 2)) + size)
9194+ goto bad_map;
9195+
9196+ if (!hash)
9197+ goto bad_hash;
9198+
9199+ if (!hash->zone0)
9200+ goto bad_zone0;
9201+
9202+ if (block >= (hash->zone0->size + hash->zone0->zone_offset)) {
9203+ if (!hash->zone1)
9204+ goto bad_zone1;
9205+ zone = hash->zone1;
9206+ } else
9207+ zone = hash->zone0;
9208
9209- blk_in_chunk=block & (chunk_size -1);
9210- chunk=(block - zone->zone_offset) / (zone->nb_dev<<FACTOR_SHIFT(factor));
9211- tmp_dev=zone->dev[(block >> FACTOR_SHIFT(factor)) % zone->nb_dev];
9212- rblock=(chunk << FACTOR_SHIFT(factor)) + blk_in_chunk + zone->dev_offset;
9213+ blk_in_chunk = block & (chunk_size -1);
9214+ chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
9215+ tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev];
9216+ rblock = (chunk << chunksize_bits) + blk_in_chunk + zone->dev_offset;
9217
9218- *rdev=tmp_dev->dev;
9219- *rsector=rblock<<1;
9220+ *rdev = tmp_dev->dev;
9221+ *rsector = rblock << 1;
9222
9223- return (0);
9224+ return 0;
9225+
9226+bad_map:
9227+ printk ("raid0_map bug: can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
9228+ return -1;
9229+bad_hash:
9230+ printk("raid0_map bug: hash==NULL for block %ld\n", block);
9231+ return -1;
9232+bad_zone0:
9233+ printk ("raid0_map bug: hash->zone0==NULL for block %ld\n", block);
9234+ return -1;
9235+bad_zone1:
9236+ printk ("raid0_map bug: hash->zone1==NULL for block %ld\n", block);
9237+ return -1;
9238 }
9239
9240
9241-static int raid0_status (char *page, int minor, struct md_dev *mddev)
9242+static int raid0_status (char *page, mddev_t *mddev)
9243 {
9244- int sz=0;
9245+ int sz = 0;
9246 #undef MD_DEBUG
9247 #ifdef MD_DEBUG
9248- int j, k;
9249- struct raid0_data *data=(struct raid0_data *) mddev->private;
9250+ int j, k;
9251+ raid0_conf_t *conf = mddev_to_conf(mddev);
9252
9253- sz+=sprintf (page+sz, " ");
9254- for (j=0; j<data->nr_zones; j++)
9255- {
9256- sz+=sprintf (page+sz, "[z%d",
9257- data->hash_table[j].zone0-data->strip_zone);
9258- if (data->hash_table[j].zone1)
9259- sz+=sprintf (page+sz, "/z%d] ",
9260- data->hash_table[j].zone1-data->strip_zone);
9261- else
9262- sz+=sprintf (page+sz, "] ");
9263- }
9264+ sz += sprintf(page + sz, " ");
9265+ for (j = 0; j < conf->nr_zones; j++) {
9266+ sz += sprintf(page + sz, "[z%d",
9267+ conf->hash_table[j].zone0 - conf->strip_zone);
9268+ if (conf->hash_table[j].zone1)
9269+ sz += sprintf(page+sz, "/z%d] ",
9270+ conf->hash_table[j].zone1 - conf->strip_zone);
9271+ else
9272+ sz += sprintf(page+sz, "] ");
9273+ }
9274
9275- sz+=sprintf (page+sz, "\n");
9276+ sz += sprintf(page + sz, "\n");
9277
9278- for (j=0; j<data->nr_strip_zones; j++)
9279- {
9280- sz+=sprintf (page+sz, " z%d=[", j);
9281- for (k=0; k<data->strip_zone[j].nb_dev; k++)
9282- sz+=sprintf (page+sz, "%s/",
9283- partition_name(data->strip_zone[j].dev[k]->dev));
9284- sz--;
9285- sz+=sprintf (page+sz, "] zo=%d do=%d s=%d\n",
9286- data->strip_zone[j].zone_offset,
9287- data->strip_zone[j].dev_offset,
9288- data->strip_zone[j].size);
9289- }
9290+ for (j = 0; j < conf->nr_strip_zones; j++) {
9291+ sz += sprintf(page + sz, " z%d=[", j);
9292+ for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
9293+ sz += sprintf (page+sz, "%s/", partition_name(
9294+ conf->strip_zone[j].dev[k]->dev));
9295+ sz--;
9296+ sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
9297+ conf->strip_zone[j].zone_offset,
9298+ conf->strip_zone[j].dev_offset,
9299+ conf->strip_zone[j].size);
9300+ }
9301 #endif
9302- sz+=sprintf (page+sz, " %dk chunks", 1<<FACTOR_SHIFT(FACTOR(mddev)));
9303- return sz;
9304+ sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
9305+ return sz;
9306 }
9307
9308-
9309-static struct md_personality raid0_personality=
9310+static mdk_personality_t raid0_personality=
9311 {
9312- "raid0",
9313- raid0_map,
9314- NULL, /* no special make_request */
9315- NULL, /* no special end_request */
9316- raid0_run,
9317- raid0_stop,
9318- raid0_status,
9319- NULL, /* no ioctls */
9320- 0,
9321- NULL, /* no error_handler */
9322- NULL, /* hot_add_disk */
9323- NULL, /* hot_remove_disk */
9324- NULL /* mark_spare */
9325+ "raid0",
9326+ raid0_map,
9327+ NULL, /* no special make_request */
9328+ NULL, /* no special end_request */
9329+ raid0_run,
9330+ raid0_stop,
9331+ raid0_status,
9332+ NULL, /* no ioctls */
9333+ 0,
9334+ NULL, /* no error_handler */
9335+ NULL, /* no diskop */
9336+ NULL, /* no stop resync */
9337+ NULL /* no restart resync */
9338 };
9339
9340-
9341 #ifndef MODULE
9342
9343 void raid0_init (void)
9344 {
9345- register_md_personality (RAID0, &raid0_personality);
9346+ register_md_personality (RAID0, &raid0_personality);
9347 }
9348
9349 #else
9350
9351 int init_module (void)
9352 {
9353- return (register_md_personality (RAID0, &raid0_personality));
9354+ return (register_md_personality (RAID0, &raid0_personality));
9355 }
9356
9357 void cleanup_module (void)
9358 {
9359- unregister_md_personality (RAID0);
9360+ unregister_md_personality (RAID0);
9361 }
9362
9363 #endif
9364+
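
raid0_map() above resolves a logical block in two steps: the hash table (one slot per 'smallest zone size' stretch of the array) picks a strip zone, and then simple chunk arithmetic picks the member device and the block on it. A sketch of that second step in plain C, assuming the zone has already been chosen; all names here are invented for the example:

	/*
	 * Sketch only, not part of the patch: the per-zone block arithmetic
	 * used by raid0_map() above.
	 */
	struct zone_example {
		unsigned long zone_offset;	/* first logical block of the zone */
		unsigned long dev_offset;	/* starting block on each member   */
		int nb_dev;			/* devices striped in this zone    */
	};

	static unsigned long raid0_block_on_device(const struct zone_example *z,
						   unsigned long block,
						   int chunksize_bits, int *dev_index)
	{
		unsigned long chunk_size = 1UL << chunksize_bits;
		unsigned long blk_in_chunk = block & (chunk_size - 1);
		unsigned long chunk = (block - z->zone_offset) /
				      (z->nb_dev << chunksize_bits);

		*dev_index = (block >> chunksize_bits) % z->nb_dev;
		return (chunk << chunksize_bits) + blk_in_chunk + z->dev_offset;
	}

The shift/mask form works because the chunk size is a power of two; the patch derives chunksize_bits with ffz(~chunk_size).
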
9365--- linux/drivers/block/raid1.c.orig Mon Dec 11 01:49:41 2000
9366+++ linux/drivers/block/raid1.c Tue Jan 16 13:42:04 2001
9367@@ -1,6 +1,6 @@
9368-/************************************************************************
9369+/*
9370 * raid1.c : Multiple Devices driver for Linux
9371- * Copyright (C) 1996 Ingo Molnar, Miguel de Icaza, Gadi Oxman
9372+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
9373 *
9374 * RAID-1 management functions.
9375 *
9376@@ -15,50 +15,55 @@
9377 */
9378
9379 #include <linux/module.h>
9380-#include <linux/locks.h>
9381 #include <linux/malloc.h>
9382-#include <linux/md.h>
9383-#include <linux/raid1.h>
9384-#include <asm/bitops.h>
9385+#include <linux/raid/raid1.h>
9386 #include <asm/atomic.h>
9387
9388 #define MAJOR_NR MD_MAJOR
9389 #define MD_DRIVER
9390 #define MD_PERSONALITY
9391
9392-/*
9393- * The following can be used to debug the driver
9394- */
9395-/*#define RAID1_DEBUG*/
9396-#ifdef RAID1_DEBUG
9397-#define PRINTK(x) do { printk x; } while (0);
9398-#else
9399-#define PRINTK(x) do { ; } while (0);
9400-#endif
9401+#define MAX_LINEAR_SECTORS 128
9402
9403 #define MAX(a,b) ((a) > (b) ? (a) : (b))
9404 #define MIN(a,b) ((a) < (b) ? (a) : (b))
9405
9406-static struct md_personality raid1_personality;
9407-static struct md_thread *raid1_thread = NULL;
9408+static mdk_personality_t raid1_personality;
9409 struct buffer_head *raid1_retry_list = NULL;
9410
9411-static int __raid1_map (struct md_dev *mddev, kdev_t *rdev,
9412+static void * raid1_kmalloc (int size)
9413+{
9414+ void * ptr;
9415+ /*
9416+ * here we would rather be fault tolerant than nice, but
9417+ * there are a couple of places in the RAID code where we
9418+ * simply cannot afford to fail an allocation because
9419+ * there is no failure return path (e.g. make_request())
9420+ */
9421+ while (!(ptr = kmalloc (size, GFP_BUFFER))) {
9422+ printk ("raid1: out of memory, retrying...\n");
9423+ current->state = TASK_UNINTERRUPTIBLE;
9424+ schedule_timeout(HZ/10);
9425+ }
9426+
9427+ memset(ptr, 0, size);
9428+ return ptr;
9429+}
9430+
9431+static int __raid1_map (mddev_t *mddev, kdev_t *rdev,
9432 unsigned long *rsector, unsigned long size)
9433 {
9434- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9435- int i, n = raid_conf->raid_disks;
9436+ raid1_conf_t *conf = mddev_to_conf(mddev);
9437+ int i, disks = MD_SB_DISKS;
9438
9439 /*
9440 * Later we do read balancing on the read side
9441 * now we use the first available disk.
9442 */
9443
9444- PRINTK(("raid1_map().\n"));
9445-
9446- for (i=0; i<n; i++) {
9447- if (raid_conf->mirrors[i].operational) {
9448- *rdev = raid_conf->mirrors[i].dev;
9449+ for (i = 0; i < disks; i++) {
9450+ if (conf->mirrors[i].operational) {
9451+ *rdev = conf->mirrors[i].dev;
9452 return (0);
9453 }
9454 }
9455@@ -67,29 +72,29 @@
9456 return (-1);
9457 }
9458
9459-static int raid1_map (struct md_dev *mddev, kdev_t *rdev,
9460+static int raid1_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
9461 unsigned long *rsector, unsigned long size)
9462 {
9463 return 0;
9464 }
9465
9466-void raid1_reschedule_retry (struct buffer_head *bh)
9467+static void raid1_reschedule_retry (struct buffer_head *bh)
9468 {
9469 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
9470-
9471- PRINTK(("raid1_reschedule_retry().\n"));
9472+ mddev_t *mddev = r1_bh->mddev;
9473+ raid1_conf_t *conf = mddev_to_conf(mddev);
9474
9475 r1_bh->next_retry = raid1_retry_list;
9476 raid1_retry_list = bh;
9477- md_wakeup_thread(raid1_thread);
9478+ md_wakeup_thread(conf->thread);
9479 }
9480
9481 /*
9482- * raid1_end_buffer_io() is called when we have finished servicing a mirrored
9483+ * raid1_end_bh_io() is called when we have finished servicing a mirrored
9484 * operation and are ready to return a success/failure code to the buffer
9485 * cache layer.
9486 */
9487-static inline void raid1_end_buffer_io(struct raid1_bh *r1_bh, int uptodate)
9488+static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
9489 {
9490 struct buffer_head *bh = r1_bh->master_bh;
9491
9492@@ -97,8 +102,6 @@
9493 kfree(r1_bh);
9494 }
9495
9496-int raid1_one_error=0;
9497-
9498 void raid1_end_request (struct buffer_head *bh, int uptodate)
9499 {
9500 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
9501@@ -106,12 +109,7 @@
9502
9503 save_flags(flags);
9504 cli();
9505- PRINTK(("raid1_end_request().\n"));
9506
9507- if (raid1_one_error) {
9508- raid1_one_error=0;
9509- uptodate=0;
9510- }
9511 /*
9512 * this branch is our 'one mirror IO has finished' event handler:
9513 */
9514@@ -136,15 +134,11 @@
9515 */
9516
9517 if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
9518-
9519- PRINTK(("raid1_end_request(), read branch.\n"));
9520-
9521 /*
9522 * we have only one buffer_head on the read side
9523 */
9524 if (uptodate) {
9525- PRINTK(("raid1_end_request(), read branch, uptodate.\n"));
9526- raid1_end_buffer_io(r1_bh, uptodate);
9527+ raid1_end_bh_io(r1_bh, uptodate);
9528 restore_flags(flags);
9529 return;
9530 }
9531@@ -152,71 +146,56 @@
9532 * oops, read error:
9533 */
9534 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
9535- kdevname(bh->b_dev), bh->b_blocknr);
9536- raid1_reschedule_retry (bh);
9537+ partition_name(bh->b_dev), bh->b_blocknr);
9538+ raid1_reschedule_retry(bh);
9539 restore_flags(flags);
9540 return;
9541 }
9542
9543 /*
9544- * WRITE or WRITEA.
9545- */
9546- PRINTK(("raid1_end_request(), write branch.\n"));
9547-
9548- /*
9549+ * WRITE:
9550+ *
9551 * Let's see if all mirrored write operations have finished
9552- * already [we have irqs off, so we can decrease]:
9553+ * already.
9554 */
9555
9556- if (!--r1_bh->remaining) {
9557- struct md_dev *mddev = r1_bh->mddev;
9558- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9559- int i, n = raid_conf->raid_disks;
9560-
9561- PRINTK(("raid1_end_request(), remaining == 0.\n"));
9562+ if (atomic_dec_and_test(&r1_bh->remaining)) {
9563+ int i, disks = MD_SB_DISKS;
9564
9565- for ( i=0; i<n; i++)
9566- if (r1_bh->mirror_bh[i]) kfree(r1_bh->mirror_bh[i]);
9567+ for ( i = 0; i < disks; i++)
9568+ if (r1_bh->mirror_bh[i])
9569+ kfree(r1_bh->mirror_bh[i]);
9570
9571- raid1_end_buffer_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
9572+ raid1_end_bh_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
9573 }
9574- else PRINTK(("raid1_end_request(), remaining == %u.\n", r1_bh->remaining));
9575 restore_flags(flags);
9576 }
9577
9578-/* This routine checks if the undelying device is an md device and in that
9579- * case it maps the blocks before putting the request on the queue
9580+/*
9581+ * This routine checks if the underlying device is an md device
9582+ * and in that case it maps the blocks before putting the
9583+ * request on the queue
9584 */
9585-static inline void
9586-map_and_make_request (int rw, struct buffer_head *bh)
9587+static void map_and_make_request (int rw, struct buffer_head *bh)
9588 {
9589 if (MAJOR (bh->b_rdev) == MD_MAJOR)
9590- md_map (MINOR (bh->b_rdev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
9591+ md_map (bh->b_rdev, &bh->b_rdev,
9592+ &bh->b_rsector, bh->b_size >> 9);
9593 clear_bit(BH_Lock, &bh->b_state);
9594 make_request (MAJOR (bh->b_rdev), rw, bh);
9595 }
9596
9597-static int
9598-raid1_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
9599+static int raid1_make_request (mddev_t *mddev, int rw,
9600+ struct buffer_head * bh)
9601 {
9602-
9603- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9604+ raid1_conf_t *conf = mddev_to_conf(mddev);
9605 struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req;
9606 struct raid1_bh * r1_bh;
9607- int n = raid_conf->raid_disks, i, sum_bhs = 0, switch_disks = 0, sectors;
9608+ int disks = MD_SB_DISKS;
9609+ int i, sum_bhs = 0, switch_disks = 0, sectors, lowprio = 0;
9610 struct mirror_info *mirror;
9611
9612- PRINTK(("raid1_make_request().\n"));
9613-
9614- while (!( /* FIXME: now we are rather fault tolerant than nice */
9615- r1_bh = kmalloc (sizeof (struct raid1_bh), GFP_BUFFER)
9616- ) )
9617- {
9618- printk ("raid1_make_request(#1): out of memory\n");
9619- current->policy |= SCHED_YIELD;
9620- schedule();
9621- }
9622- memset (r1_bh, 0, sizeof (struct raid1_bh));
9623+ r1_bh = raid1_kmalloc (sizeof (struct raid1_bh));
9624
9625 /*
9626 * make_request() can abort the operation when READA or WRITEA are being
9627@@ -227,43 +206,65 @@
9628 if (rw == READA) rw = READ;
9629 if (rw == WRITEA) rw = WRITE;
9630
9631- if (rw == WRITE || rw == WRITEA)
9632- mark_buffer_clean(bh); /* Too early ? */
9633+ if (rw == WRITE) {
9634+ /*
9635+ * Too early ?
9636+ */
9637+ mark_buffer_clean(bh);
9638+ /*
9639+ * not too early. we _first_ clean the bh, then we start
9640+ * the IO, then when the IO has finished, we unlock the
9641+ * bh and mark it uptodate. This way we do not miss the
9642+ * case when the bh got dirty again during the IO.
9643+ */
9644+ }
9645+
9646+ /*
9647+ * special flag for 'lowprio' reconstruction requests ...
9648+ */
9649+ if (buffer_lowprio(bh))
9650+ lowprio = 1;
9651
9652 /*
9653- * i think the read and write branch should be separated completely, since we want
9654- * to do read balancing on the read side for example. Comments? :) --mingo
9655+ * i think the read and write branch should be separated completely,
9656+ * since we want to do read balancing on the read side for example.
9657+ * Comments? :) --mingo
9658 */
9659
9660 r1_bh->master_bh=bh;
9661 r1_bh->mddev=mddev;
9662 r1_bh->cmd = rw;
9663
9664- if (rw==READ || rw==READA) {
9665- int last_used = raid_conf->last_used;
9666- PRINTK(("raid1_make_request(), read branch.\n"));
9667- mirror = raid_conf->mirrors + last_used;
9668+ if (rw==READ) {
9669+ int last_used = conf->last_used;
9670+
9671+ /*
9672+ * read balancing logic:
9673+ */
9674+ mirror = conf->mirrors + last_used;
9675 bh->b_rdev = mirror->dev;
9676 sectors = bh->b_size >> 9;
9677- if (bh->b_blocknr * sectors == raid_conf->next_sect) {
9678- raid_conf->sect_count += sectors;
9679- if (raid_conf->sect_count >= mirror->sect_limit)
9680+
9681+ if (bh->b_blocknr * sectors == conf->next_sect) {
9682+ conf->sect_count += sectors;
9683+ if (conf->sect_count >= mirror->sect_limit)
9684 switch_disks = 1;
9685 } else
9686 switch_disks = 1;
9687- raid_conf->next_sect = (bh->b_blocknr + 1) * sectors;
9688- if (switch_disks) {
9689- PRINTK(("read-balancing: switching %d -> %d (%d sectors)\n", last_used, mirror->next, raid_conf->sect_count));
9690- raid_conf->sect_count = 0;
9691- last_used = raid_conf->last_used = mirror->next;
9692+ conf->next_sect = (bh->b_blocknr + 1) * sectors;
9693+ /*
9694+ * Do not switch disks if full resync is in progress ...
9695+ */
9696+ if (switch_disks && !conf->resync_mirrors) {
9697+ conf->sect_count = 0;
9698+ last_used = conf->last_used = mirror->next;
9699 /*
9700- * Do not switch to write-only disks ... resyncing
9701- * is in progress
9702+ * Do not switch to write-only disks ...
9703+ * reconstruction is in progress
9704 */
9705- while (raid_conf->mirrors[last_used].write_only)
9706- raid_conf->last_used = raid_conf->mirrors[last_used].next;
9707+ while (conf->mirrors[last_used].write_only)
9708+ conf->last_used = conf->mirrors[last_used].next;
9709 }
9710- PRINTK (("raid1 read queue: %d %d\n", MAJOR (bh->b_rdev), MINOR (bh->b_rdev)));
9711 bh_req = &r1_bh->bh_req;
9712 memcpy(bh_req, bh, sizeof(*bh));
9713 bh_req->b_end_io = raid1_end_request;
9714@@ -273,13 +274,12 @@
9715 }
9716
9717 /*
9718- * WRITE or WRITEA.
9719+ * WRITE:
9720 */
9721- PRINTK(("raid1_make_request(n=%d), write branch.\n",n));
9722
9723- for (i = 0; i < n; i++) {
9724+ for (i = 0; i < disks; i++) {
9725
9726- if (!raid_conf->mirrors [i].operational) {
9727+ if (!conf->mirrors[i].operational) {
9728 /*
9729 * the r1_bh->mirror_bh[i] pointer remains NULL
9730 */
9731@@ -287,89 +287,91 @@
9732 continue;
9733 }
9734
9735+ /*
9736+ * special case for reconstruction ...
9737+ */
9738+ if (lowprio && (i == conf->last_used)) {
9739+ mirror_bh[i] = NULL;
9740+ continue;
9741+ }
9742+
9743+ /*
9744+ * We should use a private pool (size depending on NR_REQUEST),
9745+ * to avoid writes filling up the memory with bhs
9746+ *
9747+ * Such pools are much faster than kmalloc anyways (so we waste
9748+ * almost nothing by not using the master bh when writing and
9749+ * win alot of cleanness) but for now we are cool enough. --mingo
9750+ *
9751+ * It's safe to sleep here, buffer heads cannot be used in a shared
9752+ * manner in the write branch. Look how we lock the buffer at the
9753+ * beginning of this function to grok the difference ;)
9754+ */
9755+ mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head));
9756+ /*
9757+ * prepare mirrored bh (fields ordered for max mem throughput):
9758+ */
9759+ mirror_bh[i]->b_blocknr = bh->b_blocknr;
9760+ mirror_bh[i]->b_dev = bh->b_dev;
9761+ mirror_bh[i]->b_rdev = conf->mirrors[i].dev;
9762+ mirror_bh[i]->b_rsector = bh->b_rsector;
9763+ mirror_bh[i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
9764+ if (lowprio)
9765+ mirror_bh[i]->b_state |= (1<<BH_LowPrio);
9766+
9767+ mirror_bh[i]->b_count = 1;
9768+ mirror_bh[i]->b_size = bh->b_size;
9769+ mirror_bh[i]->b_data = bh->b_data;
9770+ mirror_bh[i]->b_list = BUF_LOCKED;
9771+ mirror_bh[i]->b_end_io = raid1_end_request;
9772+ mirror_bh[i]->b_dev_id = r1_bh;
9773+
9774+ r1_bh->mirror_bh[i] = mirror_bh[i];
9775+ sum_bhs++;
9776+ }
9777+
9778+ md_atomic_set(&r1_bh->remaining, sum_bhs);
9779+
9780 /*
9781- * We should use a private pool (size depending on NR_REQUEST),
9782- * to avoid writes filling up the memory with bhs
9783- *
9784- * Such pools are much faster than kmalloc anyways (so we waste almost
9785- * nothing by not using the master bh when writing and win alot of cleanness)
9786- *
9787- * but for now we are cool enough. --mingo
9788- *
9789- * It's safe to sleep here, buffer heads cannot be used in a shared
9790- * manner in the write branch. Look how we lock the buffer at the beginning
9791- * of this function to grok the difference ;)
9792- */
9793- while (!( /* FIXME: now we are rather fault tolerant than nice */
9794- mirror_bh[i] = kmalloc (sizeof (struct buffer_head), GFP_BUFFER)
9795- ) )
9796- {
9797- printk ("raid1_make_request(#2): out of memory\n");
9798- current->policy |= SCHED_YIELD;
9799- schedule();
9800- }
9801- memset (mirror_bh[i], 0, sizeof (struct buffer_head));
9802-
9803- /*
9804- * prepare mirrored bh (fields ordered for max mem throughput):
9805- */
9806- mirror_bh [i]->b_blocknr = bh->b_blocknr;
9807- mirror_bh [i]->b_dev = bh->b_dev;
9808- mirror_bh [i]->b_rdev = raid_conf->mirrors [i].dev;
9809- mirror_bh [i]->b_rsector = bh->b_rsector;
9810- mirror_bh [i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
9811- mirror_bh [i]->b_count = 1;
9812- mirror_bh [i]->b_size = bh->b_size;
9813- mirror_bh [i]->b_data = bh->b_data;
9814- mirror_bh [i]->b_list = BUF_LOCKED;
9815- mirror_bh [i]->b_end_io = raid1_end_request;
9816- mirror_bh [i]->b_dev_id = r1_bh;
9817-
9818- r1_bh->mirror_bh[i] = mirror_bh[i];
9819- sum_bhs++;
9820- }
9821-
9822- r1_bh->remaining = sum_bhs;
9823-
9824- PRINTK(("raid1_make_request(), write branch, sum_bhs=%d.\n",sum_bhs));
9825-
9826- /*
9827- * We have to be a bit careful about the semaphore above, thats why we
9828- * start the requests separately. Since kmalloc() could fail, sleep and
9829- * make_request() can sleep too, this is the safer solution. Imagine,
9830- * end_request decreasing the semaphore before we could have set it up ...
9831- * We could play tricks with the semaphore (presetting it and correcting
9832- * at the end if sum_bhs is not 'n' but we have to do end_request by hand
9833- * if all requests finish until we had a chance to set up the semaphore
9834- * correctly ... lots of races).
9835- */
9836- for (i = 0; i < n; i++)
9837- if (mirror_bh [i] != NULL)
9838- map_and_make_request (rw, mirror_bh [i]);
9839+ * We have to be a bit careful about the semaphore above, that's
9840+ * why we start the requests separately. Since kmalloc() could
9841+ * fail and sleep, and make_request() can sleep too, this is the
9842+ * safer solution. Imagine end_request decreasing the semaphore
9843+ * before we could have set it up ... We could play tricks with
9844+ * the semaphore (presetting it and correcting at the end if
9845+ * sum_bhs is not 'n'), but we would have to do end_request by hand
9846+ * if all requests finish before we had a chance to set up the
9847+ * semaphore correctly ... lots of races.
9848+ */
9849+ for (i = 0; i < disks; i++)
9850+ if (mirror_bh[i])
9851+ map_and_make_request(rw, mirror_bh[i]);
9852
9853 return (0);
9854 }
9855
9856-static int raid1_status (char *page, int minor, struct md_dev *mddev)
9857+static int raid1_status (char *page, mddev_t *mddev)
9858 {
9859- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9860+ raid1_conf_t *conf = mddev_to_conf(mddev);
9861 int sz = 0, i;
9862
9863- sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
9864- for (i = 0; i < raid_conf->raid_disks; i++)
9865- sz += sprintf (page+sz, "%s", raid_conf->mirrors [i].operational ? "U" : "_");
9866+ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
9867+ conf->working_disks);
9868+ for (i = 0; i < conf->raid_disks; i++)
9869+ sz += sprintf (page+sz, "%s",
9870+ conf->mirrors[i].operational ? "U" : "_");
9871 sz += sprintf (page+sz, "]");
9872 return sz;
9873 }
9874
9875-static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index)
9876+static void unlink_disk (raid1_conf_t *conf, int target)
9877 {
9878- int disks = raid_conf->raid_disks;
9879- int j;
9880+ int disks = MD_SB_DISKS;
9881+ int i;
9882
9883- for (j = 0; j < disks; j++)
9884- if (raid_conf->mirrors [j].next == failed_index)
9885- raid_conf->mirrors [j].next = raid_conf->mirrors [failed_index].next;
9886+ for (i = 0; i < disks; i++)
9887+ if (conf->mirrors[i].next == target)
9888+ conf->mirrors[i].next = conf->mirrors[target].next;
9889 }
9890
9891 #define LAST_DISK KERN_ALERT \
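
The write branch of raid1_make_request() above clones the master buffer head once per operational mirror, records the clone count in r1_bh->remaining, and ends the master request only when the last clone retires (the atomic_dec_and_test() in raid1_end_request()). A minimal sketch of that fan-out counter in standard C, with invented names and <stdatomic.h> standing in for the kernel's atomic type:

	#include <stdatomic.h>

	/* Sketch only, not part of the patch: one master request fanned out
	 * into 'copies' per-mirror requests. */
	struct fanout_example {
		atomic_int remaining;		/* copies still in flight */
	};

	static void fanout_start(struct fanout_example *f, int copies)
	{
		atomic_store(&f->remaining, copies);
	}

	/* Called from each copy's completion handler; returns 1 exactly once,
	 * for the copy that retires last, which is when the master request
	 * may be completed. */
	static int fanout_copy_done(struct fanout_example *f)
	{
		return atomic_fetch_sub(&f->remaining, 1) == 1;
	}
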
9892@@ -388,48 +390,53 @@
9893 #define ALREADY_SYNCING KERN_INFO \
9894 "raid1: syncing already in progress.\n"
9895
9896-static int raid1_error (struct md_dev *mddev, kdev_t dev)
9897+static void mark_disk_bad (mddev_t *mddev, int failed)
9898 {
9899- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9900- struct mirror_info *mirror;
9901- md_superblock_t *sb = mddev->sb;
9902- int disks = raid_conf->raid_disks;
9903- int i;
9904+ raid1_conf_t *conf = mddev_to_conf(mddev);
9905+ struct mirror_info *mirror = conf->mirrors+failed;
9906+ mdp_super_t *sb = mddev->sb;
9907+
9908+ mirror->operational = 0;
9909+ unlink_disk(conf, failed);
9910+ mark_disk_faulty(sb->disks+mirror->number);
9911+ mark_disk_nonsync(sb->disks+mirror->number);
9912+ mark_disk_inactive(sb->disks+mirror->number);
9913+ sb->active_disks--;
9914+ sb->working_disks--;
9915+ sb->failed_disks++;
9916+ mddev->sb_dirty = 1;
9917+ md_wakeup_thread(conf->thread);
9918+ conf->working_disks--;
9919+ printk (DISK_FAILED, partition_name (mirror->dev),
9920+ conf->working_disks);
9921+}
9922
9923- PRINTK(("raid1_error called\n"));
9924+static int raid1_error (mddev_t *mddev, kdev_t dev)
9925+{
9926+ raid1_conf_t *conf = mddev_to_conf(mddev);
9927+ struct mirror_info * mirrors = conf->mirrors;
9928+ int disks = MD_SB_DISKS;
9929+ int i;
9930
9931- if (raid_conf->working_disks == 1) {
9932+ if (conf->working_disks == 1) {
9933 /*
9934 * Uh oh, we can do nothing if this is our last disk, but
9935 * first check if this is a queued request for a device
9936 * which has just failed.
9937 */
9938- for (i = 0, mirror = raid_conf->mirrors; i < disks;
9939- i++, mirror++)
9940- if (mirror->dev == dev && !mirror->operational)
9941+ for (i = 0; i < disks; i++) {
9942+ if (mirrors[i].dev==dev && !mirrors[i].operational)
9943 return 0;
9944+ }
9945 printk (LAST_DISK);
9946 } else {
9947- /* Mark disk as unusable */
9948- for (i = 0, mirror = raid_conf->mirrors; i < disks;
9949- i++, mirror++) {
9950- if (mirror->dev == dev && mirror->operational){
9951- mirror->operational = 0;
9952- raid1_fix_links (raid_conf, i);
9953- sb->disks[mirror->number].state |=
9954- (1 << MD_FAULTY_DEVICE);
9955- sb->disks[mirror->number].state &=
9956- ~(1 << MD_SYNC_DEVICE);
9957- sb->disks[mirror->number].state &=
9958- ~(1 << MD_ACTIVE_DEVICE);
9959- sb->active_disks--;
9960- sb->working_disks--;
9961- sb->failed_disks++;
9962- mddev->sb_dirty = 1;
9963- md_wakeup_thread(raid1_thread);
9964- raid_conf->working_disks--;
9965- printk (DISK_FAILED, kdevname (dev),
9966- raid_conf->working_disks);
9967+ /*
9968+ * Mark disk as unusable
9969+ */
9970+ for (i = 0; i < disks; i++) {
9971+ if (mirrors[i].dev==dev && mirrors[i].operational) {
9972+ mark_disk_bad (mddev, i);
9973+ break;
9974 }
9975 }
9976 }
9977@@ -442,219 +449,396 @@
9978 #undef START_SYNCING
9979
9980 /*
9981- * This is the personality-specific hot-addition routine
9982+ * Insert the spare disk into the drive-ring
9983 */
9984+static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror)
9985+{
9986+ int j, next;
9987+ int disks = MD_SB_DISKS;
9988+ struct mirror_info *p = conf->mirrors;
9989
9990-#define NO_SUPERBLOCK KERN_ERR \
9991-"raid1: cannot hot-add disk to the array with no RAID superblock\n"
9992+ for (j = 0; j < disks; j++, p++)
9993+ if (p->operational && !p->write_only) {
9994+ next = p->next;
9995+ p->next = mirror->raid_disk;
9996+ mirror->next = next;
9997+ return;
9998+ }
9999
10000-#define WRONG_LEVEL KERN_ERR \
10001-"raid1: hot-add: level of disk is not RAID-1\n"
10002+ printk("raid1: bug: no read-operational devices\n");
10003+}
10004
10005-#define HOT_ADD_SUCCEEDED KERN_INFO \
10006-"raid1: device %s hot-added\n"
10007+static void print_raid1_conf (raid1_conf_t *conf)
10008+{
10009+ int i;
10010+ struct mirror_info *tmp;
10011
10012-static int raid1_hot_add_disk (struct md_dev *mddev, kdev_t dev)
10013+ printk("RAID1 conf printout:\n");
10014+ if (!conf) {
10015+ printk("(conf==NULL)\n");
10016+ return;
10017+ }
10018+ printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
10019+ conf->raid_disks, conf->nr_disks);
10020+
10021+ for (i = 0; i < MD_SB_DISKS; i++) {
10022+ tmp = conf->mirrors + i;
10023+ printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
10024+ i, tmp->spare,tmp->operational,
10025+ tmp->number,tmp->raid_disk,tmp->used_slot,
10026+ partition_name(tmp->dev));
10027+ }
10028+}
10029+
10030+static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
10031 {
10032+ int err = 0;
10033+ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
10034+ raid1_conf_t *conf = mddev->private;
10035+ struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
10036 unsigned long flags;
10037- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
10038- struct mirror_info *mirror;
10039- md_superblock_t *sb = mddev->sb;
10040- struct real_dev * realdev;
10041- int n;
10042+ mdp_super_t *sb = mddev->sb;
10043+ mdp_disk_t *failed_desc, *spare_desc, *added_desc;
10044+
10045+ save_flags(flags);
10046+ cli();
10047
10048+ print_raid1_conf(conf);
10049 /*
10050- * The device has its superblock already read and it was found
10051- * to be consistent for generic RAID usage. Now we check whether
10052- * it's usable for RAID-1 hot addition.
10053+ * find the disk ...
10054 */
10055+ switch (state) {
10056
10057- n = mddev->nb_dev++;
10058- realdev = &mddev->devices[n];
10059- if (!realdev->sb) {
10060- printk (NO_SUPERBLOCK);
10061- return -EINVAL;
10062- }
10063- if (realdev->sb->level != 1) {
10064- printk (WRONG_LEVEL);
10065- return -EINVAL;
10066+ case DISKOP_SPARE_ACTIVE:
10067+
10068+ /*
10069+ * Find the failed disk within the RAID1 configuration ...
10070+ * (this can only be in the first conf->working_disks part)
10071+ */
10072+ for (i = 0; i < conf->raid_disks; i++) {
10073+ tmp = conf->mirrors + i;
10074+ if ((!tmp->operational && !tmp->spare) ||
10075+ !tmp->used_slot) {
10076+ failed_disk = i;
10077+ break;
10078+ }
10079+ }
10080+ /*
10081+ * When we activate a spare disk we _must_ have a disk in
10082+ * the lower (active) part of the array to replace.
10083+ */
10084+ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
10085+ MD_BUG();
10086+ err = 1;
10087+ goto abort;
10088+ }
10089+ /* fall through */
10090+
10091+ case DISKOP_SPARE_WRITE:
10092+ case DISKOP_SPARE_INACTIVE:
10093+
10094+ /*
10095+ * Find the spare disk ... (can only be in the 'high'
10096+ * area of the array)
10097+ */
10098+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
10099+ tmp = conf->mirrors + i;
10100+ if (tmp->spare && tmp->number == (*d)->number) {
10101+ spare_disk = i;
10102+ break;
10103+ }
10104+ }
10105+ if (spare_disk == -1) {
10106+ MD_BUG();
10107+ err = 1;
10108+ goto abort;
10109+ }
10110+ break;
10111+
10112+ case DISKOP_HOT_REMOVE_DISK:
10113+
10114+ for (i = 0; i < MD_SB_DISKS; i++) {
10115+ tmp = conf->mirrors + i;
10116+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
10117+ if (tmp->operational) {
10118+ err = -EBUSY;
10119+ goto abort;
10120+ }
10121+ removed_disk = i;
10122+ break;
10123+ }
10124+ }
10125+ if (removed_disk == -1) {
10126+ MD_BUG();
10127+ err = 1;
10128+ goto abort;
10129+ }
10130+ break;
10131+
10132+ case DISKOP_HOT_ADD_DISK:
10133+
10134+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
10135+ tmp = conf->mirrors + i;
10136+ if (!tmp->used_slot) {
10137+ added_disk = i;
10138+ break;
10139+ }
10140+ }
10141+ if (added_disk == -1) {
10142+ MD_BUG();
10143+ err = 1;
10144+ goto abort;
10145+ }
10146+ break;
10147 }
10148- /* FIXME: are there other things left we could sanity-check? */
10149
10150+ switch (state) {
10151 /*
10152- * We have to disable interrupts, as our RAID-1 state is used
10153- * from irq handlers as well.
10154+ * Switch the spare disk to write-only mode:
10155 */
10156- save_flags(flags);
10157- cli();
10158+ case DISKOP_SPARE_WRITE:
10159+ sdisk = conf->mirrors + spare_disk;
10160+ sdisk->operational = 1;
10161+ sdisk->write_only = 1;
10162+ break;
10163+ /*
10164+ * Deactivate a spare disk:
10165+ */
10166+ case DISKOP_SPARE_INACTIVE:
10167+ sdisk = conf->mirrors + spare_disk;
10168+ sdisk->operational = 0;
10169+ sdisk->write_only = 0;
10170+ break;
10171+ /*
10172+ * Activate (mark read-write) the (now sync) spare disk,
10173+	 * which means we switch its 'raid position' (->raid_disk)
10174+ * with the failed disk. (only the first 'conf->nr_disks'
10175+ * slots are used for 'real' disks and we must preserve this
10176+ * property)
10177+ */
10178+ case DISKOP_SPARE_ACTIVE:
10179
10180- raid_conf->raid_disks++;
10181- mirror = raid_conf->mirrors+n;
10182+ sdisk = conf->mirrors + spare_disk;
10183+ fdisk = conf->mirrors + failed_disk;
10184
10185- mirror->number=n;
10186- mirror->raid_disk=n;
10187- mirror->dev=dev;
10188- mirror->next=0; /* FIXME */
10189- mirror->sect_limit=128;
10190-
10191- mirror->operational=0;
10192- mirror->spare=1;
10193- mirror->write_only=0;
10194-
10195- sb->disks[n].state |= (1 << MD_FAULTY_DEVICE);
10196- sb->disks[n].state &= ~(1 << MD_SYNC_DEVICE);
10197- sb->disks[n].state &= ~(1 << MD_ACTIVE_DEVICE);
10198- sb->nr_disks++;
10199- sb->spare_disks++;
10200+ spare_desc = &sb->disks[sdisk->number];
10201+ failed_desc = &sb->disks[fdisk->number];
10202
10203- restore_flags(flags);
10204+ if (spare_desc != *d) {
10205+ MD_BUG();
10206+ err = 1;
10207+ goto abort;
10208+ }
10209
10210- md_update_sb(MINOR(dev));
10211+ if (spare_desc->raid_disk != sdisk->raid_disk) {
10212+ MD_BUG();
10213+ err = 1;
10214+ goto abort;
10215+ }
10216+
10217+ if (sdisk->raid_disk != spare_disk) {
10218+ MD_BUG();
10219+ err = 1;
10220+ goto abort;
10221+ }
10222
10223- printk (HOT_ADD_SUCCEEDED, kdevname(realdev->dev));
10224+ if (failed_desc->raid_disk != fdisk->raid_disk) {
10225+ MD_BUG();
10226+ err = 1;
10227+ goto abort;
10228+ }
10229
10230- return 0;
10231-}
10232+ if (fdisk->raid_disk != failed_disk) {
10233+ MD_BUG();
10234+ err = 1;
10235+ goto abort;
10236+ }
10237
10238-#undef NO_SUPERBLOCK
10239-#undef WRONG_LEVEL
10240-#undef HOT_ADD_SUCCEEDED
10241+ /*
10242+ * do the switch finally
10243+ */
10244+ xchg_values(*spare_desc, *failed_desc);
10245+ xchg_values(*fdisk, *sdisk);
10246
10247-/*
10248- * Insert the spare disk into the drive-ring
10249- */
10250-static void add_ring(struct raid1_data *raid_conf, struct mirror_info *mirror)
10251-{
10252- int j, next;
10253- struct mirror_info *p = raid_conf->mirrors;
10254+ /*
10255+ * (careful, 'failed' and 'spare' are switched from now on)
10256+ *
10257+ * we want to preserve linear numbering and we want to
10258+ * give the proper raid_disk number to the now activated
10259+ * disk. (this means we switch back these values)
10260+ */
10261+
10262+ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
10263+ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
10264+ xchg_values(spare_desc->number, failed_desc->number);
10265+ xchg_values(sdisk->number, fdisk->number);
10266
10267- for (j = 0; j < raid_conf->raid_disks; j++, p++)
10268- if (p->operational && !p->write_only) {
10269- next = p->next;
10270- p->next = mirror->raid_disk;
10271- mirror->next = next;
10272- return;
10273- }
10274- printk("raid1: bug: no read-operational devices\n");
10275-}
10276+ *d = failed_desc;
10277
10278-static int raid1_mark_spare(struct md_dev *mddev, md_descriptor_t *spare,
10279- int state)
10280-{
10281- int i = 0, failed_disk = -1;
10282- struct raid1_data *raid_conf = mddev->private;
10283- struct mirror_info *mirror = raid_conf->mirrors;
10284- md_descriptor_t *descriptor;
10285- unsigned long flags;
10286+ if (sdisk->dev == MKDEV(0,0))
10287+ sdisk->used_slot = 0;
10288+ /*
10289+ * this really activates the spare.
10290+ */
10291+ fdisk->spare = 0;
10292+ fdisk->write_only = 0;
10293+ link_disk(conf, fdisk);
10294
10295- for (i = 0; i < MD_SB_DISKS; i++, mirror++) {
10296- if (mirror->spare && mirror->number == spare->number)
10297- goto found;
10298- }
10299- return 1;
10300-found:
10301- for (i = 0, mirror = raid_conf->mirrors; i < raid_conf->raid_disks;
10302- i++, mirror++)
10303- if (!mirror->operational)
10304- failed_disk = i;
10305+ /*
10306+ * if we activate a spare, we definitely replace a
10307+ * non-operational disk slot in the 'low' area of
10308+ * the disk array.
10309+ */
10310
10311- save_flags(flags);
10312- cli();
10313- switch (state) {
10314- case SPARE_WRITE:
10315- mirror->operational = 1;
10316- mirror->write_only = 1;
10317- raid_conf->raid_disks = MAX(raid_conf->raid_disks,
10318- mirror->raid_disk + 1);
10319- break;
10320- case SPARE_INACTIVE:
10321- mirror->operational = 0;
10322- mirror->write_only = 0;
10323- break;
10324- case SPARE_ACTIVE:
10325- mirror->spare = 0;
10326- mirror->write_only = 0;
10327- raid_conf->working_disks++;
10328- add_ring(raid_conf, mirror);
10329-
10330- if (failed_disk != -1) {
10331- descriptor = &mddev->sb->disks[raid_conf->mirrors[failed_disk].number];
10332- i = spare->raid_disk;
10333- spare->raid_disk = descriptor->raid_disk;
10334- descriptor->raid_disk = i;
10335- }
10336- break;
10337- default:
10338- printk("raid1_mark_spare: bug: state == %d\n", state);
10339- restore_flags(flags);
10340- return 1;
10341+ conf->working_disks++;
10342+
10343+ break;
10344+
10345+ case DISKOP_HOT_REMOVE_DISK:
10346+ rdisk = conf->mirrors + removed_disk;
10347+
10348+ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
10349+ MD_BUG();
10350+ err = 1;
10351+ goto abort;
10352+ }
10353+ rdisk->dev = MKDEV(0,0);
10354+ rdisk->used_slot = 0;
10355+ conf->nr_disks--;
10356+ break;
10357+
10358+ case DISKOP_HOT_ADD_DISK:
10359+ adisk = conf->mirrors + added_disk;
10360+ added_desc = *d;
10361+
10362+ if (added_disk != added_desc->number) {
10363+ MD_BUG();
10364+ err = 1;
10365+ goto abort;
10366+ }
10367+
10368+ adisk->number = added_desc->number;
10369+ adisk->raid_disk = added_desc->raid_disk;
10370+ adisk->dev = MKDEV(added_desc->major,added_desc->minor);
10371+
10372+ adisk->operational = 0;
10373+ adisk->write_only = 0;
10374+ adisk->spare = 1;
10375+ adisk->used_slot = 1;
10376+ conf->nr_disks++;
10377+
10378+ break;
10379+
10380+ default:
10381+ MD_BUG();
10382+ err = 1;
10383+ goto abort;
10384 }
10385+abort:
10386 restore_flags(flags);
10387- return 0;
10388+ print_raid1_conf(conf);
10389+ return err;
10390 }
10391
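The DISKOP_SPARE_ACTIVE branch above swaps the whole superblock descriptors and then swaps the number/raid_disk fields back, so the slot identity stays with the array position while the device state moves. A minimal sketch of that net effect, assuming xchg_values() is the usual three-assignment swap macro; "struct desc" and its fields are simplified stand-ins, not the driver's real mdp_disk_t:

/*
 * Illustrative only: shows why the double swap preserves linear numbering.
 */
#include <stdio.h>

struct desc { int number; int raid_disk; int active; };

#define xchg_values(x, y) do { __typeof__(x) __tmp = (x); (x) = (y); (y) = __tmp; } while (0)

int main(void)
{
	struct desc spare  = { .number = 3, .raid_disk = 3, .active = 1 };
	struct desc failed = { .number = 1, .raid_disk = 1, .active = 0 };

	xchg_values(spare, failed);			/* swap whole descriptors          */
	xchg_values(spare.raid_disk, failed.raid_disk);	/* ...then hand the slot numbers   */
	xchg_values(spare.number, failed.number);	/* back to their original positions */

	/* The low (failed) slot now carries the spare's contents but keeps its
	 * own number/raid_disk -- the "preserve linear numbering" property the
	 * comment above describes. */
	printf("slot 1: number=%d raid_disk=%d active=%d\n",
	       failed.number, failed.raid_disk, failed.active);
	return 0;
}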
10392+
10393+#define IO_ERROR KERN_ALERT \
10394+"raid1: %s: unrecoverable I/O read error for block %lu\n"
10395+
10396+#define REDIRECT_SECTOR KERN_ERR \
10397+"raid1: %s: redirecting sector %lu to another mirror\n"
10398+
10399 /*
10400 * This is a kernel thread which:
10401 *
10402 * 1. Retries failed read operations on working mirrors.
10403  * 2. Updates the raid superblock when problems are encountered.
10404 */
10405-void raid1d (void *data)
10406+static void raid1d (void *data)
10407 {
10408 struct buffer_head *bh;
10409 kdev_t dev;
10410 unsigned long flags;
10411- struct raid1_bh * r1_bh;
10412- struct md_dev *mddev;
10413+ struct raid1_bh *r1_bh;
10414+ mddev_t *mddev;
10415
10416- PRINTK(("raid1d() active\n"));
10417- save_flags(flags);
10418- cli();
10419 while (raid1_retry_list) {
10420+ save_flags(flags);
10421+ cli();
10422 bh = raid1_retry_list;
10423 r1_bh = (struct raid1_bh *)(bh->b_dev_id);
10424 raid1_retry_list = r1_bh->next_retry;
10425 restore_flags(flags);
10426
10427- mddev = md_dev + MINOR(bh->b_dev);
10428+ mddev = kdev_to_mddev(bh->b_dev);
10429 if (mddev->sb_dirty) {
10430- printk("dirty sb detected, updating.\n");
10431+ printk(KERN_INFO "dirty sb detected, updating.\n");
10432 mddev->sb_dirty = 0;
10433- md_update_sb(MINOR(bh->b_dev));
10434+ md_update_sb(mddev);
10435 }
10436 dev = bh->b_rdev;
10437- __raid1_map (md_dev + MINOR(bh->b_dev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
10438+ __raid1_map (mddev, &bh->b_rdev, &bh->b_rsector,
10439+ bh->b_size >> 9);
10440 if (bh->b_rdev == dev) {
10441- printk (KERN_ALERT
10442- "raid1: %s: unrecoverable I/O read error for block %lu\n",
10443- kdevname(bh->b_dev), bh->b_blocknr);
10444- raid1_end_buffer_io(r1_bh, 0);
10445+ printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
10446+ raid1_end_bh_io(r1_bh, 0);
10447 } else {
10448- printk (KERN_ERR "raid1: %s: redirecting sector %lu to another mirror\n",
10449- kdevname(bh->b_dev), bh->b_blocknr);
10450+ printk (REDIRECT_SECTOR,
10451+ partition_name(bh->b_dev), bh->b_blocknr);
10452 map_and_make_request (r1_bh->cmd, bh);
10453 }
10454- cli();
10455 }
10456- restore_flags(flags);
10457+}
10458+#undef IO_ERROR
10459+#undef REDIRECT_SECTOR
10460+
10461+/*
10462+ * Private kernel thread to reconstruct mirrors after an unclean
10463+ * shutdown.
10464+ */
10465+static void raid1syncd (void *data)
10466+{
10467+ raid1_conf_t *conf = data;
10468+ mddev_t *mddev = conf->mddev;
10469+
10470+ if (!conf->resync_mirrors)
10471+ return;
10472+ if (conf->resync_mirrors == 2)
10473+ return;
10474+ down(&mddev->recovery_sem);
10475+ if (md_do_sync(mddev, NULL)) {
10476+ up(&mddev->recovery_sem);
10477+ return;
10478+ }
10479+ /*
10480+ * Only if everything went Ok.
10481+ */
10482+ conf->resync_mirrors = 0;
10483+ up(&mddev->recovery_sem);
10484 }
10485
10486+
10487 /*
10488 * This will catch the scenario in which one of the mirrors was
10489 * mounted as a normal device rather than as a part of a raid set.
10490+ *
10491+ * check_consistency is very personality-dependent, e.g. RAID5 cannot
10492+ * do this check; it uses another method.
10493 */
10494-static int __check_consistency (struct md_dev *mddev, int row)
10495+static int __check_consistency (mddev_t *mddev, int row)
10496 {
10497- struct raid1_data *raid_conf = mddev->private;
10498+ raid1_conf_t *conf = mddev_to_conf(mddev);
10499+ int disks = MD_SB_DISKS;
10500 kdev_t dev;
10501 struct buffer_head *bh = NULL;
10502 int i, rc = 0;
10503 char *buffer = NULL;
10504
10505- for (i = 0; i < raid_conf->raid_disks; i++) {
10506- if (!raid_conf->mirrors[i].operational)
10507+ for (i = 0; i < disks; i++) {
10508+ printk("(checking disk %d)\n",i);
10509+ if (!conf->mirrors[i].operational)
10510 continue;
10511- dev = raid_conf->mirrors[i].dev;
10512+ printk("(really checking disk %d)\n",i);
10513+ dev = conf->mirrors[i].dev;
10514 set_blocksize(dev, 4096);
10515 if ((bh = bread(dev, row / 4, 4096)) == NULL)
10516 break;
10517@@ -683,167 +867,342 @@
10518 return rc;
10519 }
10520
10521-static int check_consistency (struct md_dev *mddev)
10522+static int check_consistency (mddev_t *mddev)
10523 {
10524- int size = mddev->sb->size;
10525- int row;
10526+ if (__check_consistency(mddev, 0))
10527+/*
10528+ * we do not do this currently, as it's perfectly possible to
10529+ * have an inconsistent array when it's freshly created. Only
10530+ * newly written data has to be consistent.
10531+ */
10532+ return 0;
10533
10534- for (row = 0; row < size; row += size / 8)
10535- if (__check_consistency(mddev, row))
10536- return 1;
10537 return 0;
10538 }
10539
10540-static int raid1_run (int minor, struct md_dev *mddev)
10541+#define INVALID_LEVEL KERN_WARNING \
10542+"raid1: md%d: raid level not set to mirroring (%d)\n"
10543+
10544+#define NO_SB KERN_ERR \
10545+"raid1: disabled mirror %s (couldn't access raid superblock)\n"
10546+
10547+#define ERRORS KERN_ERR \
10548+"raid1: disabled mirror %s (errors detected)\n"
10549+
10550+#define NOT_IN_SYNC KERN_ERR \
10551+"raid1: disabled mirror %s (not in sync)\n"
10552+
10553+#define INCONSISTENT KERN_ERR \
10554+"raid1: disabled mirror %s (inconsistent descriptor)\n"
10555+
10556+#define ALREADY_RUNNING KERN_ERR \
10557+"raid1: disabled mirror %s (mirror %d already operational)\n"
10558+
10559+#define OPERATIONAL KERN_INFO \
10560+"raid1: device %s operational as mirror %d\n"
10561+
10562+#define MEM_ERROR KERN_ERR \
10563+"raid1: couldn't allocate memory for md%d\n"
10564+
10565+#define SPARE KERN_INFO \
10566+"raid1: spare disk %s\n"
10567+
10568+#define NONE_OPERATIONAL KERN_ERR \
10569+"raid1: no operational mirrors for md%d\n"
10570+
10571+#define RUNNING_CKRAID KERN_ERR \
10572+"raid1: detected mirror differences -- running resync\n"
10573+
10574+#define ARRAY_IS_ACTIVE KERN_INFO \
10575+"raid1: raid set md%d active with %d out of %d mirrors\n"
10576+
10577+#define THREAD_ERROR KERN_ERR \
10578+"raid1: couldn't allocate thread for md%d\n"
10579+
10580+#define START_RESYNC KERN_WARNING \
10581+"raid1: raid set md%d not clean; reconstructing mirrors\n"
10582+
10583+static int raid1_run (mddev_t *mddev)
10584 {
10585- struct raid1_data *raid_conf;
10586- int i, j, raid_disk;
10587- md_superblock_t *sb = mddev->sb;
10588- md_descriptor_t *descriptor;
10589- struct real_dev *realdev;
10590+ raid1_conf_t *conf;
10591+ int i, j, disk_idx;
10592+ struct mirror_info *disk;
10593+ mdp_super_t *sb = mddev->sb;
10594+ mdp_disk_t *descriptor;
10595+ mdk_rdev_t *rdev;
10596+ struct md_list_head *tmp;
10597+ int start_recovery = 0;
10598
10599 MOD_INC_USE_COUNT;
10600
10601 if (sb->level != 1) {
10602- printk("raid1: %s: raid level not set to mirroring (%d)\n",
10603- kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
10604- MOD_DEC_USE_COUNT;
10605- return -EIO;
10606- }
10607- /****
10608- * copy the now verified devices into our private RAID1 bookkeeping
10609- * area. [whatever we allocate in raid1_run(), should be freed in
10610- * raid1_stop()]
10611+ printk(INVALID_LEVEL, mdidx(mddev), sb->level);
10612+ goto out;
10613+ }
10614+ /*
10615+ * copy the already verified devices into our private RAID1
10616+ * bookkeeping area. [whatever we allocate in raid1_run(),
10617+ * should be freed in raid1_stop()]
10618 */
10619
10620- while (!( /* FIXME: now we are rather fault tolerant than nice */
10621- mddev->private = kmalloc (sizeof (struct raid1_data), GFP_KERNEL)
10622- ) )
10623- {
10624- printk ("raid1_run(): out of memory\n");
10625- current->policy |= SCHED_YIELD;
10626- schedule();
10627- }
10628- raid_conf = mddev->private;
10629- memset(raid_conf, 0, sizeof(*raid_conf));
10630-
10631- PRINTK(("raid1_run(%d) called.\n", minor));
10632-
10633- for (i = 0; i < mddev->nb_dev; i++) {
10634- realdev = &mddev->devices[i];
10635- if (!realdev->sb) {
10636- printk(KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
10637+ conf = raid1_kmalloc(sizeof(raid1_conf_t));
10638+ mddev->private = conf;
10639+ if (!conf) {
10640+ printk(MEM_ERROR, mdidx(mddev));
10641+ goto out;
10642+ }
10643+
10644+ ITERATE_RDEV(mddev,rdev,tmp) {
10645+ if (rdev->faulty) {
10646+ printk(ERRORS, partition_name(rdev->dev));
10647+ } else {
10648+ if (!rdev->sb) {
10649+ MD_BUG();
10650+ continue;
10651+ }
10652+ }
10653+ if (rdev->desc_nr == -1) {
10654+ MD_BUG();
10655 continue;
10656 }
10657-
10658- /*
10659- * This is important -- we are using the descriptor on
10660- * the disk only to get a pointer to the descriptor on
10661- * the main superblock, which might be more recent.
10662- */
10663- descriptor = &sb->disks[realdev->sb->descriptor.number];
10664- if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
10665- printk(KERN_ERR "raid1: disabled mirror %s (errors detected)\n", kdevname(realdev->dev));
10666+ descriptor = &sb->disks[rdev->desc_nr];
10667+ disk_idx = descriptor->raid_disk;
10668+ disk = conf->mirrors + disk_idx;
10669+
10670+ if (disk_faulty(descriptor)) {
10671+ disk->number = descriptor->number;
10672+ disk->raid_disk = disk_idx;
10673+ disk->dev = rdev->dev;
10674+ disk->sect_limit = MAX_LINEAR_SECTORS;
10675+ disk->operational = 0;
10676+ disk->write_only = 0;
10677+ disk->spare = 0;
10678+ disk->used_slot = 1;
10679 continue;
10680 }
10681- if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
10682- if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
10683- printk(KERN_ERR "raid1: disabled mirror %s (not in sync)\n", kdevname(realdev->dev));
10684+ if (disk_active(descriptor)) {
10685+ if (!disk_sync(descriptor)) {
10686+ printk(NOT_IN_SYNC,
10687+ partition_name(rdev->dev));
10688 continue;
10689 }
10690- raid_disk = descriptor->raid_disk;
10691- if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
10692- printk(KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n", kdevname(realdev->dev));
10693+ if ((descriptor->number > MD_SB_DISKS) ||
10694+ (disk_idx > sb->raid_disks)) {
10695+
10696+ printk(INCONSISTENT,
10697+ partition_name(rdev->dev));
10698 continue;
10699 }
10700- if (raid_conf->mirrors[raid_disk].operational) {
10701- printk(KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n", kdevname(realdev->dev), raid_disk);
10702+ if (disk->operational) {
10703+ printk(ALREADY_RUNNING,
10704+ partition_name(rdev->dev),
10705+ disk_idx);
10706 continue;
10707 }
10708- printk(KERN_INFO "raid1: device %s operational as mirror %d\n", kdevname(realdev->dev), raid_disk);
10709- raid_conf->mirrors[raid_disk].number = descriptor->number;
10710- raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
10711- raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
10712- raid_conf->mirrors[raid_disk].operational = 1;
10713- raid_conf->mirrors[raid_disk].sect_limit = 128;
10714- raid_conf->working_disks++;
10715+ printk(OPERATIONAL, partition_name(rdev->dev),
10716+ disk_idx);
10717+ disk->number = descriptor->number;
10718+ disk->raid_disk = disk_idx;
10719+ disk->dev = rdev->dev;
10720+ disk->sect_limit = MAX_LINEAR_SECTORS;
10721+ disk->operational = 1;
10722+ disk->write_only = 0;
10723+ disk->spare = 0;
10724+ disk->used_slot = 1;
10725+ conf->working_disks++;
10726 } else {
10727 /*
10728 * Must be a spare disk ..
10729 */
10730- printk(KERN_INFO "raid1: spare disk %s\n", kdevname(realdev->dev));
10731- raid_disk = descriptor->raid_disk;
10732- raid_conf->mirrors[raid_disk].number = descriptor->number;
10733- raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
10734- raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
10735- raid_conf->mirrors[raid_disk].sect_limit = 128;
10736-
10737- raid_conf->mirrors[raid_disk].operational = 0;
10738- raid_conf->mirrors[raid_disk].write_only = 0;
10739- raid_conf->mirrors[raid_disk].spare = 1;
10740- }
10741- }
10742- if (!raid_conf->working_disks) {
10743- printk(KERN_ERR "raid1: no operational mirrors for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
10744- kfree(raid_conf);
10745- mddev->private = NULL;
10746- MOD_DEC_USE_COUNT;
10747- return -EIO;
10748- }
10749-
10750- raid_conf->raid_disks = sb->raid_disks;
10751- raid_conf->mddev = mddev;
10752-
10753- for (j = 0; !raid_conf->mirrors[j].operational; j++);
10754- raid_conf->last_used = j;
10755- for (i = raid_conf->raid_disks - 1; i >= 0; i--) {
10756- if (raid_conf->mirrors[i].operational) {
10757- PRINTK(("raid_conf->mirrors[%d].next == %d\n", i, j));
10758- raid_conf->mirrors[i].next = j;
10759+ printk(SPARE, partition_name(rdev->dev));
10760+ disk->number = descriptor->number;
10761+ disk->raid_disk = disk_idx;
10762+ disk->dev = rdev->dev;
10763+ disk->sect_limit = MAX_LINEAR_SECTORS;
10764+ disk->operational = 0;
10765+ disk->write_only = 0;
10766+ disk->spare = 1;
10767+ disk->used_slot = 1;
10768+ }
10769+ }
10770+ if (!conf->working_disks) {
10771+ printk(NONE_OPERATIONAL, mdidx(mddev));
10772+ goto out_free_conf;
10773+ }
10774+
10775+ conf->raid_disks = sb->raid_disks;
10776+ conf->nr_disks = sb->nr_disks;
10777+ conf->mddev = mddev;
10778+
10779+ for (i = 0; i < MD_SB_DISKS; i++) {
10780+
10781+ descriptor = sb->disks+i;
10782+ disk_idx = descriptor->raid_disk;
10783+ disk = conf->mirrors + disk_idx;
10784+
10785+ if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
10786+ !disk->used_slot) {
10787+
10788+ disk->number = descriptor->number;
10789+ disk->raid_disk = disk_idx;
10790+ disk->dev = MKDEV(0,0);
10791+
10792+ disk->operational = 0;
10793+ disk->write_only = 0;
10794+ disk->spare = 0;
10795+ disk->used_slot = 1;
10796+ }
10797+ }
10798+
10799+ /*
10800+ * find the first working one and use it as a starting point
10801+	 * for read balancing.
10802+ */
10803+ for (j = 0; !conf->mirrors[j].operational; j++)
10804+ /* nothing */;
10805+ conf->last_used = j;
10806+
10807+ /*
10808+ * initialize the 'working disks' list.
10809+ */
10810+ for (i = conf->raid_disks - 1; i >= 0; i--) {
10811+ if (conf->mirrors[i].operational) {
10812+ conf->mirrors[i].next = j;
10813 j = i;
10814 }
10815 }
10816
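The two loops above pick the first operational mirror as the read-balancing starting point and then chain the operational mirrors together through ->next. A stand-alone sketch of the same walk for a three-mirror array in which mirror 1 is not operational; all names below are hypothetical:

#include <stdio.h>

int main(void)
{
	int operational[3] = { 1, 0, 1 };
	int next[3] = { 0, 0, 0 };
	int i, j;

	for (j = 0; !operational[j]; j++)	/* first working mirror (last_used) */
		;
	for (i = 3 - 1; i >= 0; i--) {		/* chain the operational mirrors */
		if (operational[i]) {
			next[i] = j;
			j = i;
		}
	}
	/* Result: next[0] == 2 and next[2] == 0 -- the working mirrors form a
	 * ring that reads rotate through, skipping the failed mirror 1. */
	for (i = 0; i < 3; i++)
		if (operational[i])
			printf("mirror %d -> next %d\n", i, next[i]);
	return 0;
}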
10817- if (check_consistency(mddev)) {
10818- printk(KERN_ERR "raid1: detected mirror differences -- run ckraid\n");
10819- sb->state |= 1 << MD_SB_ERRORS;
10820- kfree(raid_conf);
10821- mddev->private = NULL;
10822- MOD_DEC_USE_COUNT;
10823- return -EIO;
10824+ if (conf->working_disks != sb->raid_disks) {
10825+ printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
10826+ start_recovery = 1;
10827 }
10828
10829+ if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
10830+ /*
10831+ * we do sanity checks even if the device says
10832+ * it's clean ...
10833+ */
10834+ if (check_consistency(mddev)) {
10835+ printk(RUNNING_CKRAID);
10836+ sb->state &= ~(1 << MD_SB_CLEAN);
10837+ }
10838+ }
10839+
10840+ {
10841+ const char * name = "raid1d";
10842+
10843+ conf->thread = md_register_thread(raid1d, conf, name);
10844+ if (!conf->thread) {
10845+ printk(THREAD_ERROR, mdidx(mddev));
10846+ goto out_free_conf;
10847+ }
10848+ }
10849+
10850+ if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
10851+ const char * name = "raid1syncd";
10852+
10853+ conf->resync_thread = md_register_thread(raid1syncd, conf,name);
10854+ if (!conf->resync_thread) {
10855+ printk(THREAD_ERROR, mdidx(mddev));
10856+ goto out_free_conf;
10857+ }
10858+
10859+ printk(START_RESYNC, mdidx(mddev));
10860+ conf->resync_mirrors = 1;
10861+ md_wakeup_thread(conf->resync_thread);
10862+ }
10863+
10864 /*
10865 * Regenerate the "device is in sync with the raid set" bit for
10866 * each device.
10867 */
10868- for (i = 0; i < sb->nr_disks ; i++) {
10869- sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
10870+ for (i = 0; i < MD_SB_DISKS; i++) {
10871+ mark_disk_nonsync(sb->disks+i);
10872 for (j = 0; j < sb->raid_disks; j++) {
10873- if (!raid_conf->mirrors[j].operational)
10874+ if (!conf->mirrors[j].operational)
10875 continue;
10876- if (sb->disks[i].number == raid_conf->mirrors[j].number)
10877- sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
10878+ if (sb->disks[i].number == conf->mirrors[j].number)
10879+ mark_disk_sync(sb->disks+i);
10880 }
10881 }
10882- sb->active_disks = raid_conf->working_disks;
10883+ sb->active_disks = conf->working_disks;
10884
10885- printk("raid1: raid set %s active with %d out of %d mirrors\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks);
10886- /* Ok, everything is just fine now */
10887- return (0);
10888+ if (start_recovery)
10889+ md_recover_arrays();
10890+
10891+
10892+ printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
10893+ /*
10894+ * Ok, everything is just fine now
10895+ */
10896+ return 0;
10897+
10898+out_free_conf:
10899+ kfree(conf);
10900+ mddev->private = NULL;
10901+out:
10902+ MOD_DEC_USE_COUNT;
10903+ return -EIO;
10904+}
10905+
10906+#undef INVALID_LEVEL
10907+#undef NO_SB
10908+#undef ERRORS
10909+#undef NOT_IN_SYNC
10910+#undef INCONSISTENT
10911+#undef ALREADY_RUNNING
10912+#undef OPERATIONAL
10913+#undef SPARE
10914+#undef NONE_OPERATIONAL
10915+#undef RUNNING_CKRAID
10916+#undef ARRAY_IS_ACTIVE
10917+
10918+static int raid1_stop_resync (mddev_t *mddev)
10919+{
10920+ raid1_conf_t *conf = mddev_to_conf(mddev);
10921+
10922+ if (conf->resync_thread) {
10923+ if (conf->resync_mirrors) {
10924+ conf->resync_mirrors = 2;
10925+ md_interrupt_thread(conf->resync_thread);
10926+ printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
10927+ return 1;
10928+ }
10929+ return 0;
10930+ }
10931+ return 0;
10932+}
10933+
10934+static int raid1_restart_resync (mddev_t *mddev)
10935+{
10936+ raid1_conf_t *conf = mddev_to_conf(mddev);
10937+
10938+ if (conf->resync_mirrors) {
10939+ if (!conf->resync_thread) {
10940+ MD_BUG();
10941+ return 0;
10942+ }
10943+ conf->resync_mirrors = 1;
10944+ md_wakeup_thread(conf->resync_thread);
10945+ return 1;
10946+ }
10947+ return 0;
10948 }
10949
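raid1syncd, raid1_stop_resync and raid1_restart_resync above coordinate through conf->resync_mirrors, which appears to act as a small state machine. A hedged reading of those values, written as an enum purely for illustration (these symbolic names do not exist in the patch):

enum resync_mirrors_state {
	RESYNC_NONE        = 0,	/* mirrors in sync, raid1syncd has nothing to do        */
	RESYNC_NEEDED      = 1,	/* raid1syncd should run md_do_sync()                   */
	RESYNC_INTERRUPTED = 2	/* stop requested; resumed later via raid1_restart_resync */
};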
10950-static int raid1_stop (int minor, struct md_dev *mddev)
10951+static int raid1_stop (mddev_t *mddev)
10952 {
10953- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
10954+ raid1_conf_t *conf = mddev_to_conf(mddev);
10955
10956- kfree (raid_conf);
10957+ md_unregister_thread(conf->thread);
10958+ if (conf->resync_thread)
10959+ md_unregister_thread(conf->resync_thread);
10960+ kfree(conf);
10961 mddev->private = NULL;
10962 MOD_DEC_USE_COUNT;
10963 return 0;
10964 }
10965
10966-static struct md_personality raid1_personality=
10967+static mdk_personality_t raid1_personality=
10968 {
10969 "raid1",
10970 raid1_map,
10971@@ -855,15 +1214,13 @@
10972 NULL, /* no ioctls */
10973 0,
10974 raid1_error,
10975- raid1_hot_add_disk,
10976- /* raid1_hot_remove_drive */ NULL,
10977- raid1_mark_spare
10978+ raid1_diskop,
10979+ raid1_stop_resync,
10980+ raid1_restart_resync
10981 };
10982
10983 int raid1_init (void)
10984 {
10985- if ((raid1_thread = md_register_thread(raid1d, NULL)) == NULL)
10986- return -EBUSY;
10987 return register_md_personality (RAID1, &raid1_personality);
10988 }
10989
10990@@ -875,7 +1232,6 @@
10991
10992 void cleanup_module (void)
10993 {
10994- md_unregister_thread (raid1_thread);
10995 unregister_md_personality (RAID1);
10996 }
10997 #endif
10998--- linux/drivers/block/raid5.c.orig Fri May 8 09:17:13 1998
10999+++ linux/drivers/block/raid5.c Tue Jan 16 13:42:04 2001
11000@@ -1,4 +1,4 @@
11001-/*****************************************************************************
11002+/*
11003 * raid5.c : Multiple Devices driver for Linux
11004 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
11005 *
11006@@ -14,16 +14,15 @@
11007 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
11008 */
11009
11010+
11011 #include <linux/module.h>
11012 #include <linux/locks.h>
11013 #include <linux/malloc.h>
11014-#include <linux/md.h>
11015-#include <linux/raid5.h>
11016+#include <linux/raid/raid5.h>
11017 #include <asm/bitops.h>
11018 #include <asm/atomic.h>
11019-#include <asm/md.h>
11020
11021-static struct md_personality raid5_personality;
11022+static mdk_personality_t raid5_personality;
11023
11024 /*
11025 * Stripe cache
11026@@ -33,7 +32,7 @@
11027 #define HASH_PAGES_ORDER 0
11028 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
11029 #define HASH_MASK (NR_HASH - 1)
11030-#define stripe_hash(raid_conf, sect, size) ((raid_conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
11031+#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
11032
11033 /*
11034 * The following can be used to debug the driver
11035@@ -46,6 +45,8 @@
11036 #define PRINTK(x) do { ; } while (0)
11037 #endif
11038
11039+static void print_raid5_conf (raid5_conf_t *conf);
11040+
11041 static inline int stripe_locked(struct stripe_head *sh)
11042 {
11043 return test_bit(STRIPE_LOCKED, &sh->state);
11044@@ -61,32 +62,32 @@
11045 */
11046 static inline void lock_stripe(struct stripe_head *sh)
11047 {
11048- struct raid5_data *raid_conf = sh->raid_conf;
11049- if (!test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
11050+ raid5_conf_t *conf = sh->raid_conf;
11051+ if (!md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
11052 PRINTK(("locking stripe %lu\n", sh->sector));
11053- raid_conf->nr_locked_stripes++;
11054+ conf->nr_locked_stripes++;
11055 }
11056 }
11057
11058 static inline void unlock_stripe(struct stripe_head *sh)
11059 {
11060- struct raid5_data *raid_conf = sh->raid_conf;
11061- if (test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
11062+ raid5_conf_t *conf = sh->raid_conf;
11063+ if (md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
11064 PRINTK(("unlocking stripe %lu\n", sh->sector));
11065- raid_conf->nr_locked_stripes--;
11066+ conf->nr_locked_stripes--;
11067 wake_up(&sh->wait);
11068 }
11069 }
11070
11071 static inline void finish_stripe(struct stripe_head *sh)
11072 {
11073- struct raid5_data *raid_conf = sh->raid_conf;
11074+ raid5_conf_t *conf = sh->raid_conf;
11075 unlock_stripe(sh);
11076 sh->cmd = STRIPE_NONE;
11077 sh->phase = PHASE_COMPLETE;
11078- raid_conf->nr_pending_stripes--;
11079- raid_conf->nr_cached_stripes++;
11080- wake_up(&raid_conf->wait_for_stripe);
11081+ conf->nr_pending_stripes--;
11082+ conf->nr_cached_stripes++;
11083+ wake_up(&conf->wait_for_stripe);
11084 }
11085
11086 void __wait_on_stripe(struct stripe_head *sh)
11087@@ -114,7 +115,7 @@
11088 __wait_on_stripe(sh);
11089 }
11090
11091-static inline void remove_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
11092+static inline void remove_hash(raid5_conf_t *conf, struct stripe_head *sh)
11093 {
11094 PRINTK(("remove_hash(), stripe %lu\n", sh->sector));
11095
11096@@ -123,21 +124,22 @@
11097 sh->hash_next->hash_pprev = sh->hash_pprev;
11098 *sh->hash_pprev = sh->hash_next;
11099 sh->hash_pprev = NULL;
11100- raid_conf->nr_hashed_stripes--;
11101+ conf->nr_hashed_stripes--;
11102 }
11103 }
11104
11105-static inline void insert_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
11106+static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
11107 {
11108- struct stripe_head **shp = &stripe_hash(raid_conf, sh->sector, sh->size);
11109+ struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size);
11110
11111- PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", sh->sector, raid_conf->nr_hashed_stripes));
11112+ PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n",
11113+ sh->sector, conf->nr_hashed_stripes));
11114
11115 if ((sh->hash_next = *shp) != NULL)
11116 (*shp)->hash_pprev = &sh->hash_next;
11117 *shp = sh;
11118 sh->hash_pprev = shp;
11119- raid_conf->nr_hashed_stripes++;
11120+ conf->nr_hashed_stripes++;
11121 }
11122
11123 static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size)
11124@@ -145,13 +147,15 @@
11125 struct buffer_head *bh;
11126 unsigned long flags;
11127
11128- save_flags(flags);
11129- cli();
11130- if ((bh = sh->buffer_pool) == NULL)
11131- return NULL;
11132+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
11133+ bh = sh->buffer_pool;
11134+ if (!bh)
11135+ goto out_unlock;
11136 sh->buffer_pool = bh->b_next;
11137 bh->b_size = b_size;
11138- restore_flags(flags);
11139+out_unlock:
11140+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11141+
11142 return bh;
11143 }
11144
11145@@ -160,12 +164,14 @@
11146 struct buffer_head *bh;
11147 unsigned long flags;
11148
11149- save_flags(flags);
11150- cli();
11151- if ((bh = sh->bh_pool) == NULL)
11152- return NULL;
11153+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
11154+ bh = sh->bh_pool;
11155+ if (!bh)
11156+ goto out_unlock;
11157 sh->bh_pool = bh->b_next;
11158- restore_flags(flags);
11159+out_unlock:
11160+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11161+
11162 return bh;
11163 }
11164
11165@@ -173,54 +179,52 @@
11166 {
11167 unsigned long flags;
11168
11169- save_flags(flags);
11170- cli();
11171+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
11172 bh->b_next = sh->buffer_pool;
11173 sh->buffer_pool = bh;
11174- restore_flags(flags);
11175+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11176 }
11177
11178 static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh)
11179 {
11180 unsigned long flags;
11181
11182- save_flags(flags);
11183- cli();
11184+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
11185 bh->b_next = sh->bh_pool;
11186 sh->bh_pool = bh;
11187- restore_flags(flags);
11188+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11189 }
11190
11191-static struct stripe_head *get_free_stripe(struct raid5_data *raid_conf)
11192+static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
11193 {
11194 struct stripe_head *sh;
11195 unsigned long flags;
11196
11197 save_flags(flags);
11198 cli();
11199- if ((sh = raid_conf->free_sh_list) == NULL) {
11200+ if ((sh = conf->free_sh_list) == NULL) {
11201 restore_flags(flags);
11202 return NULL;
11203 }
11204- raid_conf->free_sh_list = sh->free_next;
11205- raid_conf->nr_free_sh--;
11206- if (!raid_conf->nr_free_sh && raid_conf->free_sh_list)
11207+ conf->free_sh_list = sh->free_next;
11208+ conf->nr_free_sh--;
11209+ if (!conf->nr_free_sh && conf->free_sh_list)
11210 printk ("raid5: bug: free_sh_list != NULL, nr_free_sh == 0\n");
11211 restore_flags(flags);
11212- if (sh->hash_pprev || sh->nr_pending || sh->count)
11213+ if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || sh->count)
11214 printk("get_free_stripe(): bug\n");
11215 return sh;
11216 }
11217
11218-static void put_free_stripe(struct raid5_data *raid_conf, struct stripe_head *sh)
11219+static void put_free_stripe(raid5_conf_t *conf, struct stripe_head *sh)
11220 {
11221 unsigned long flags;
11222
11223 save_flags(flags);
11224 cli();
11225- sh->free_next = raid_conf->free_sh_list;
11226- raid_conf->free_sh_list = sh;
11227- raid_conf->nr_free_sh++;
11228+ sh->free_next = conf->free_sh_list;
11229+ conf->free_sh_list = sh;
11230+ conf->nr_free_sh++;
11231 restore_flags(flags);
11232 }
11233
11234@@ -324,8 +328,8 @@
11235
11236 static void kfree_stripe(struct stripe_head *sh)
11237 {
11238- struct raid5_data *raid_conf = sh->raid_conf;
11239- int disks = raid_conf->raid_disks, j;
11240+ raid5_conf_t *conf = sh->raid_conf;
11241+ int disks = conf->raid_disks, j;
11242
11243 PRINTK(("kfree_stripe called, stripe %lu\n", sh->sector));
11244 if (sh->phase != PHASE_COMPLETE || stripe_locked(sh) || sh->count) {
11245@@ -338,19 +342,19 @@
11246 if (sh->bh_new[j] || sh->bh_copy[j])
11247 printk("raid5: bug: sector %lu, new %p, copy %p\n", sh->sector, sh->bh_new[j], sh->bh_copy[j]);
11248 }
11249- remove_hash(raid_conf, sh);
11250- put_free_stripe(raid_conf, sh);
11251+ remove_hash(conf, sh);
11252+ put_free_stripe(conf, sh);
11253 }
11254
11255-static int shrink_stripe_cache(struct raid5_data *raid_conf, int nr)
11256+static int shrink_stripe_cache(raid5_conf_t *conf, int nr)
11257 {
11258 struct stripe_head *sh;
11259 int i, count = 0;
11260
11261- PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, raid_conf->nr_hashed_stripes, raid_conf->clock));
11262+ PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, conf->nr_hashed_stripes, conf->clock));
11263 for (i = 0; i < NR_HASH; i++) {
11264 repeat:
11265- sh = raid_conf->stripe_hashtbl[(i + raid_conf->clock) & HASH_MASK];
11266+ sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK];
11267 for (; sh; sh = sh->hash_next) {
11268 if (sh->phase != PHASE_COMPLETE)
11269 continue;
11270@@ -360,30 +364,30 @@
11271 continue;
11272 kfree_stripe(sh);
11273 if (++count == nr) {
11274- PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
11275- raid_conf->clock = (i + raid_conf->clock) & HASH_MASK;
11276+ PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
11277+ conf->clock = (i + conf->clock) & HASH_MASK;
11278 return nr;
11279 }
11280 goto repeat;
11281 }
11282 }
11283- PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
11284+ PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
11285 return count;
11286 }
11287
11288-static struct stripe_head *find_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
11289+static struct stripe_head *find_stripe(raid5_conf_t *conf, unsigned long sector, int size)
11290 {
11291 struct stripe_head *sh;
11292
11293- if (raid_conf->buffer_size != size) {
11294- PRINTK(("switching size, %d --> %d\n", raid_conf->buffer_size, size));
11295- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
11296- raid_conf->buffer_size = size;
11297+ if (conf->buffer_size != size) {
11298+ PRINTK(("switching size, %d --> %d\n", conf->buffer_size, size));
11299+ shrink_stripe_cache(conf, conf->max_nr_stripes);
11300+ conf->buffer_size = size;
11301 }
11302
11303 PRINTK(("find_stripe, sector %lu\n", sector));
11304- for (sh = stripe_hash(raid_conf, sector, size); sh; sh = sh->hash_next)
11305- if (sh->sector == sector && sh->raid_conf == raid_conf) {
11306+ for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next)
11307+ if (sh->sector == sector && sh->raid_conf == conf) {
11308 if (sh->size == size) {
11309 PRINTK(("found stripe %lu\n", sector));
11310 return sh;
11311@@ -397,7 +401,7 @@
11312 return NULL;
11313 }
11314
11315-static int grow_stripes(struct raid5_data *raid_conf, int num, int priority)
11316+static int grow_stripes(raid5_conf_t *conf, int num, int priority)
11317 {
11318 struct stripe_head *sh;
11319
11320@@ -405,62 +409,64 @@
11321 if ((sh = kmalloc(sizeof(struct stripe_head), priority)) == NULL)
11322 return 1;
11323 memset(sh, 0, sizeof(*sh));
11324- if (grow_buffers(sh, 2 * raid_conf->raid_disks, PAGE_SIZE, priority)) {
11325- shrink_buffers(sh, 2 * raid_conf->raid_disks);
11326+ sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED;
11327+
11328+ if (grow_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) {
11329+ shrink_buffers(sh, 2 * conf->raid_disks);
11330 kfree(sh);
11331 return 1;
11332 }
11333- if (grow_bh(sh, raid_conf->raid_disks, priority)) {
11334- shrink_buffers(sh, 2 * raid_conf->raid_disks);
11335- shrink_bh(sh, raid_conf->raid_disks);
11336+ if (grow_bh(sh, conf->raid_disks, priority)) {
11337+ shrink_buffers(sh, 2 * conf->raid_disks);
11338+ shrink_bh(sh, conf->raid_disks);
11339 kfree(sh);
11340 return 1;
11341 }
11342- put_free_stripe(raid_conf, sh);
11343- raid_conf->nr_stripes++;
11344+ put_free_stripe(conf, sh);
11345+ conf->nr_stripes++;
11346 }
11347 return 0;
11348 }
11349
11350-static void shrink_stripes(struct raid5_data *raid_conf, int num)
11351+static void shrink_stripes(raid5_conf_t *conf, int num)
11352 {
11353 struct stripe_head *sh;
11354
11355 while (num--) {
11356- sh = get_free_stripe(raid_conf);
11357+ sh = get_free_stripe(conf);
11358 if (!sh)
11359 break;
11360- shrink_buffers(sh, raid_conf->raid_disks * 2);
11361- shrink_bh(sh, raid_conf->raid_disks);
11362+ shrink_buffers(sh, conf->raid_disks * 2);
11363+ shrink_bh(sh, conf->raid_disks);
11364 kfree(sh);
11365- raid_conf->nr_stripes--;
11366+ conf->nr_stripes--;
11367 }
11368 }
11369
11370-static struct stripe_head *kmalloc_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
11371+static struct stripe_head *kmalloc_stripe(raid5_conf_t *conf, unsigned long sector, int size)
11372 {
11373 struct stripe_head *sh = NULL, *tmp;
11374 struct buffer_head *buffer_pool, *bh_pool;
11375
11376 PRINTK(("kmalloc_stripe called\n"));
11377
11378- while ((sh = get_free_stripe(raid_conf)) == NULL) {
11379- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes / 8);
11380- if ((sh = get_free_stripe(raid_conf)) != NULL)
11381+ while ((sh = get_free_stripe(conf)) == NULL) {
11382+ shrink_stripe_cache(conf, conf->max_nr_stripes / 8);
11383+ if ((sh = get_free_stripe(conf)) != NULL)
11384 break;
11385- if (!raid_conf->nr_pending_stripes)
11386+ if (!conf->nr_pending_stripes)
11387 printk("raid5: bug: nr_free_sh == 0, nr_pending_stripes == 0\n");
11388- md_wakeup_thread(raid_conf->thread);
11389+ md_wakeup_thread(conf->thread);
11390 PRINTK(("waiting for some stripes to complete\n"));
11391- sleep_on(&raid_conf->wait_for_stripe);
11392+ sleep_on(&conf->wait_for_stripe);
11393 }
11394
11395 /*
11396 * The above might have slept, so perhaps another process
11397 * already created the stripe for us..
11398 */
11399- if ((tmp = find_stripe(raid_conf, sector, size)) != NULL) {
11400- put_free_stripe(raid_conf, sh);
11401+ if ((tmp = find_stripe(conf, sector, size)) != NULL) {
11402+ put_free_stripe(conf, sh);
11403 wait_on_stripe(tmp);
11404 return tmp;
11405 }
11406@@ -472,25 +478,25 @@
11407 sh->bh_pool = bh_pool;
11408 sh->phase = PHASE_COMPLETE;
11409 sh->cmd = STRIPE_NONE;
11410- sh->raid_conf = raid_conf;
11411+ sh->raid_conf = conf;
11412 sh->sector = sector;
11413 sh->size = size;
11414- raid_conf->nr_cached_stripes++;
11415- insert_hash(raid_conf, sh);
11416+ conf->nr_cached_stripes++;
11417+ insert_hash(conf, sh);
11418 } else printk("raid5: bug: kmalloc_stripe() == NULL\n");
11419 return sh;
11420 }
11421
11422-static struct stripe_head *get_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
11423+static struct stripe_head *get_stripe(raid5_conf_t *conf, unsigned long sector, int size)
11424 {
11425 struct stripe_head *sh;
11426
11427 PRINTK(("get_stripe, sector %lu\n", sector));
11428- sh = find_stripe(raid_conf, sector, size);
11429+ sh = find_stripe(conf, sector, size);
11430 if (sh)
11431 wait_on_stripe(sh);
11432 else
11433- sh = kmalloc_stripe(raid_conf, sector, size);
11434+ sh = kmalloc_stripe(conf, sector, size);
11435 return sh;
11436 }
11437
11438@@ -523,7 +529,7 @@
11439 bh->b_end_io(bh, uptodate);
11440 if (!uptodate)
11441 printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for "
11442- "block %lu\n", kdevname(bh->b_dev), bh->b_blocknr);
11443+ "block %lu\n", partition_name(bh->b_dev), bh->b_blocknr);
11444 }
11445
11446 static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate)
11447@@ -537,36 +543,35 @@
11448 static void raid5_end_request (struct buffer_head * bh, int uptodate)
11449 {
11450 struct stripe_head *sh = bh->b_dev_id;
11451- struct raid5_data *raid_conf = sh->raid_conf;
11452- int disks = raid_conf->raid_disks, i;
11453+ raid5_conf_t *conf = sh->raid_conf;
11454+ int disks = conf->raid_disks, i;
11455 unsigned long flags;
11456
11457 PRINTK(("end_request %lu, nr_pending %d\n", sh->sector, sh->nr_pending));
11458- save_flags(flags);
11459- cli();
11460+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
11461 raid5_mark_buffer_uptodate(bh, uptodate);
11462- --sh->nr_pending;
11463- if (!sh->nr_pending) {
11464- md_wakeup_thread(raid_conf->thread);
11465- atomic_inc(&raid_conf->nr_handle);
11466+ if (atomic_dec_and_test(&sh->nr_pending)) {
11467+ md_wakeup_thread(conf->thread);
11468+ atomic_inc(&conf->nr_handle);
11469 }
11470- if (!uptodate)
11471+ if (!uptodate) {
11472 md_error(bh->b_dev, bh->b_rdev);
11473- if (raid_conf->failed_disks) {
11474+ }
11475+ if (conf->failed_disks) {
11476 for (i = 0; i < disks; i++) {
11477- if (raid_conf->disks[i].operational)
11478+ if (conf->disks[i].operational)
11479 continue;
11480 if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i])
11481 continue;
11482- if (bh->b_rdev != raid_conf->disks[i].dev)
11483+ if (bh->b_rdev != conf->disks[i].dev)
11484 continue;
11485 set_bit(STRIPE_ERROR, &sh->state);
11486 }
11487 }
11488- restore_flags(flags);
11489+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11490 }
11491
11492-static int raid5_map (struct md_dev *mddev, kdev_t *rdev,
11493+static int raid5_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
11494 unsigned long *rsector, unsigned long size)
11495 {
11496 /* No complex mapping used: the core of the work is done in the
11497@@ -577,11 +582,10 @@
11498
11499 static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i)
11500 {
11501- struct raid5_data *raid_conf = sh->raid_conf;
11502- struct md_dev *mddev = raid_conf->mddev;
11503- int minor = (int) (mddev - md_dev);
11504+ raid5_conf_t *conf = sh->raid_conf;
11505+ mddev_t *mddev = conf->mddev;
11506 char *b_data;
11507- kdev_t dev = MKDEV(MD_MAJOR, minor);
11508+ kdev_t dev = mddev_to_kdev(mddev);
11509 int block = sh->sector / (sh->size >> 9);
11510
11511 b_data = ((volatile struct buffer_head *) bh)->b_data;
11512@@ -589,7 +593,7 @@
11513 init_buffer(bh, dev, block, raid5_end_request, sh);
11514 ((volatile struct buffer_head *) bh)->b_data = b_data;
11515
11516- bh->b_rdev = raid_conf->disks[i].dev;
11517+ bh->b_rdev = conf->disks[i].dev;
11518 bh->b_rsector = sh->sector;
11519
11520 bh->b_state = (1 << BH_Req);
11521@@ -597,33 +601,62 @@
11522 bh->b_list = BUF_LOCKED;
11523 }
11524
11525-static int raid5_error (struct md_dev *mddev, kdev_t dev)
11526+static int raid5_error (mddev_t *mddev, kdev_t dev)
11527 {
11528- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
11529- md_superblock_t *sb = mddev->sb;
11530+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
11531+ mdp_super_t *sb = mddev->sb;
11532 struct disk_info *disk;
11533 int i;
11534
11535 PRINTK(("raid5_error called\n"));
11536- raid_conf->resync_parity = 0;
11537- for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
11538+ conf->resync_parity = 0;
11539+ for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
11540 if (disk->dev == dev && disk->operational) {
11541 disk->operational = 0;
11542- sb->disks[disk->number].state |= (1 << MD_FAULTY_DEVICE);
11543- sb->disks[disk->number].state &= ~(1 << MD_SYNC_DEVICE);
11544- sb->disks[disk->number].state &= ~(1 << MD_ACTIVE_DEVICE);
11545+ mark_disk_faulty(sb->disks+disk->number);
11546+ mark_disk_nonsync(sb->disks+disk->number);
11547+ mark_disk_inactive(sb->disks+disk->number);
11548 sb->active_disks--;
11549 sb->working_disks--;
11550 sb->failed_disks++;
11551 mddev->sb_dirty = 1;
11552- raid_conf->working_disks--;
11553- raid_conf->failed_disks++;
11554- md_wakeup_thread(raid_conf->thread);
11555+ conf->working_disks--;
11556+ conf->failed_disks++;
11557+ md_wakeup_thread(conf->thread);
11558 printk (KERN_ALERT
11559- "RAID5: Disk failure on %s, disabling device."
11560- "Operation continuing on %d devices\n",
11561- kdevname (dev), raid_conf->working_disks);
11562+ "raid5: Disk failure on %s, disabling device."
11563+ " Operation continuing on %d devices\n",
11564+ partition_name (dev), conf->working_disks);
11565+ return -EIO;
11566 }
11567+ }
11568+ /*
11569+ * handle errors in spares (during reconstruction)
11570+ */
11571+ if (conf->spare) {
11572+ disk = conf->spare;
11573+ if (disk->dev == dev) {
11574+ printk (KERN_ALERT
11575+ "raid5: Disk failure on spare %s\n",
11576+ partition_name (dev));
11577+ if (!conf->spare->operational) {
11578+ MD_BUG();
11579+ return -EIO;
11580+ }
11581+ disk->operational = 0;
11582+ disk->write_only = 0;
11583+ conf->spare = NULL;
11584+ mark_disk_faulty(sb->disks+disk->number);
11585+ mark_disk_nonsync(sb->disks+disk->number);
11586+ mark_disk_inactive(sb->disks+disk->number);
11587+ sb->spare_disks--;
11588+ sb->working_disks--;
11589+ sb->failed_disks++;
11590+
11591+ return -EIO;
11592+ }
11593+ }
11594+ MD_BUG();
11595 return 0;
11596 }
11597
11598@@ -634,12 +667,12 @@
11599 static inline unsigned long
11600 raid5_compute_sector (int r_sector, unsigned int raid_disks, unsigned int data_disks,
11601 unsigned int * dd_idx, unsigned int * pd_idx,
11602- struct raid5_data *raid_conf)
11603+ raid5_conf_t *conf)
11604 {
11605 unsigned int stripe;
11606 int chunk_number, chunk_offset;
11607 unsigned long new_sector;
11608- int sectors_per_chunk = raid_conf->chunk_size >> 9;
11609+ int sectors_per_chunk = conf->chunk_size >> 9;
11610
11611 /* First compute the information on this sector */
11612
11613@@ -662,9 +695,9 @@
11614 /*
11615 * Select the parity disk based on the user selected algorithm.
11616 */
11617- if (raid_conf->level == 4)
11618+ if (conf->level == 4)
11619 *pd_idx = data_disks;
11620- else switch (raid_conf->algorithm) {
11621+ else switch (conf->algorithm) {
11622 case ALGORITHM_LEFT_ASYMMETRIC:
11623 *pd_idx = data_disks - stripe % raid_disks;
11624 if (*dd_idx >= *pd_idx)
11625@@ -684,7 +717,7 @@
11626 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
11627 break;
11628 default:
11629- printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
11630+ printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
11631 }
11632
11633 /*
11634@@ -705,16 +738,16 @@
11635
11636 static unsigned long compute_blocknr(struct stripe_head *sh, int i)
11637 {
11638- struct raid5_data *raid_conf = sh->raid_conf;
11639- int raid_disks = raid_conf->raid_disks, data_disks = raid_disks - 1;
11640+ raid5_conf_t *conf = sh->raid_conf;
11641+ int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
11642 unsigned long new_sector = sh->sector, check;
11643- int sectors_per_chunk = raid_conf->chunk_size >> 9;
11644+ int sectors_per_chunk = conf->chunk_size >> 9;
11645 unsigned long stripe = new_sector / sectors_per_chunk;
11646 int chunk_offset = new_sector % sectors_per_chunk;
11647 int chunk_number, dummy1, dummy2, dd_idx = i;
11648 unsigned long r_sector, blocknr;
11649
11650- switch (raid_conf->algorithm) {
11651+ switch (conf->algorithm) {
11652 case ALGORITHM_LEFT_ASYMMETRIC:
11653 case ALGORITHM_RIGHT_ASYMMETRIC:
11654 if (i > sh->pd_idx)
11655@@ -727,14 +760,14 @@
11656 i -= (sh->pd_idx + 1);
11657 break;
11658 default:
11659- printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
11660+ printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
11661 }
11662
11663 chunk_number = stripe * data_disks + i;
11664 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
11665 blocknr = r_sector / (sh->size >> 9);
11666
11667- check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, raid_conf);
11668+ check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
11669 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
11670 printk("compute_blocknr: map not correct\n");
11671 return 0;
11672@@ -742,36 +775,11 @@
11673 return blocknr;
11674 }
11675
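raid5_compute_sector() above places the parity block of each stripe according to the selected algorithm. A worked example of the ALGORITHM_LEFT_ASYMMETRIC case for a four-disk set, as a small stand-alone program; only the formula is taken from the patch, everything else is hypothetical:

#include <stdio.h>

int main(void)
{
	int raid_disks = 4, data_disks = 3, stripe;

	for (stripe = 0; stripe < 4; stripe++)
		printf("stripe %d -> parity on disk %d\n",
		       stripe, data_disks - stripe % raid_disks);
	/* Prints 3, 2, 1, 0: the parity block moves to a lower-numbered disk
	 * on each successive stripe, then wraps around. */
	return 0;
}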
11676-#ifdef HAVE_ARCH_XORBLOCK
11677-static void xor_block(struct buffer_head *dest, struct buffer_head *source)
11678-{
11679- __xor_block((char *) dest->b_data, (char *) source->b_data, dest->b_size);
11680-}
11681-#else
11682-static void xor_block(struct buffer_head *dest, struct buffer_head *source)
11683-{
11684- long lines = dest->b_size / (sizeof (long)) / 8, i;
11685- long *destp = (long *) dest->b_data, *sourcep = (long *) source->b_data;
11686-
11687- for (i = lines; i > 0; i--) {
11688- *(destp + 0) ^= *(sourcep + 0);
11689- *(destp + 1) ^= *(sourcep + 1);
11690- *(destp + 2) ^= *(sourcep + 2);
11691- *(destp + 3) ^= *(sourcep + 3);
11692- *(destp + 4) ^= *(sourcep + 4);
11693- *(destp + 5) ^= *(sourcep + 5);
11694- *(destp + 6) ^= *(sourcep + 6);
11695- *(destp + 7) ^= *(sourcep + 7);
11696- destp += 8;
11697- sourcep += 8;
11698- }
11699-}
11700-#endif
11701-
11702 static void compute_block(struct stripe_head *sh, int dd_idx)
11703 {
11704- struct raid5_data *raid_conf = sh->raid_conf;
11705- int i, disks = raid_conf->raid_disks;
11706+ raid5_conf_t *conf = sh->raid_conf;
11707+ int i, count, disks = conf->raid_disks;
11708+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
11709
11710 PRINTK(("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx));
11711
11712@@ -780,69 +788,100 @@
11713 raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx);
11714
11715 memset(sh->bh_old[dd_idx]->b_data, 0, sh->size);
11716+ bh_ptr[0] = sh->bh_old[dd_idx];
11717+ count = 1;
11718 for (i = 0; i < disks; i++) {
11719 if (i == dd_idx)
11720 continue;
11721 if (sh->bh_old[i]) {
11722- xor_block(sh->bh_old[dd_idx], sh->bh_old[i]);
11723- continue;
11724- } else
11725+ bh_ptr[count++] = sh->bh_old[i];
11726+ } else {
11727 printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
11728+ }
11729+ if (count == MAX_XOR_BLOCKS) {
11730+ xor_block(count, &bh_ptr[0]);
11731+ count = 1;
11732+ }
11733+ }
11734+ if(count != 1) {
11735+ xor_block(count, &bh_ptr[0]);
11736 }
11737 raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1);
11738 }
11739
11740 static void compute_parity(struct stripe_head *sh, int method)
11741 {
11742- struct raid5_data *raid_conf = sh->raid_conf;
11743- int i, pd_idx = sh->pd_idx, disks = raid_conf->raid_disks;
11744+ raid5_conf_t *conf = sh->raid_conf;
11745+ int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, lowprio, count;
11746+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
11747
11748 PRINTK(("compute_parity, stripe %lu, method %d\n", sh->sector, method));
11749+ lowprio = 1;
11750 for (i = 0; i < disks; i++) {
11751 if (i == pd_idx || !sh->bh_new[i])
11752 continue;
11753 if (!sh->bh_copy[i])
11754 sh->bh_copy[i] = raid5_kmalloc_buffer(sh, sh->size);
11755 raid5_build_block(sh, sh->bh_copy[i], i);
11756+ if (!buffer_lowprio(sh->bh_new[i]))
11757+ lowprio = 0;
11758+ else
11759+ mark_buffer_lowprio(sh->bh_copy[i]);
11760 mark_buffer_clean(sh->bh_new[i]);
11761 memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size);
11762 }
11763 if (sh->bh_copy[pd_idx] == NULL)
11764 sh->bh_copy[pd_idx] = raid5_kmalloc_buffer(sh, sh->size);
11765 raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx);
11766+ if (lowprio)
11767+ mark_buffer_lowprio(sh->bh_copy[pd_idx]);
11768
11769 if (method == RECONSTRUCT_WRITE) {
11770 memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size);
11771+ bh_ptr[0] = sh->bh_copy[pd_idx];
11772+ count = 1;
11773 for (i = 0; i < disks; i++) {
11774 if (i == sh->pd_idx)
11775 continue;
11776 if (sh->bh_new[i]) {
11777- xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
11778- continue;
11779+ bh_ptr[count++] = sh->bh_copy[i];
11780+ } else if (sh->bh_old[i]) {
11781+ bh_ptr[count++] = sh->bh_old[i];
11782 }
11783- if (sh->bh_old[i]) {
11784- xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
11785- continue;
11786+ if (count == MAX_XOR_BLOCKS) {
11787+ xor_block(count, &bh_ptr[0]);
11788+ count = 1;
11789 }
11790 }
11791+ if (count != 1) {
11792+ xor_block(count, &bh_ptr[0]);
11793+ }
11794 } else if (method == READ_MODIFY_WRITE) {
11795 memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size);
11796+ bh_ptr[0] = sh->bh_copy[pd_idx];
11797+ count = 1;
11798 for (i = 0; i < disks; i++) {
11799 if (i == sh->pd_idx)
11800 continue;
11801 if (sh->bh_new[i] && sh->bh_old[i]) {
11802- xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
11803- xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
11804- continue;
11805+ bh_ptr[count++] = sh->bh_copy[i];
11806+ bh_ptr[count++] = sh->bh_old[i];
11807+ }
11808+ if (count >= (MAX_XOR_BLOCKS - 1)) {
11809+ xor_block(count, &bh_ptr[0]);
11810+ count = 1;
11811 }
11812 }
11813+ if (count != 1) {
11814+ xor_block(count, &bh_ptr[0]);
11815+ }
11816 }
11817 raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1);
11818 }
11819
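compute_block() and compute_parity() above now collect buffers into bh_ptr[] and call xor_block(count, bh_ptr) in batches of up to MAX_XOR_BLOCKS. The patch's xor_block() itself is defined elsewhere; the following is a plain-C sketch of the assumed calling convention (first buffer is the destination, the rest are XOR sources), with hypothetical stand-in types:

struct xbuf {
	unsigned long	*b_data;
	unsigned long	 b_size;	/* bytes, multiple of sizeof(long) */
};

static void xor_block_sketch(int count, struct xbuf **bh_ptr)
{
	unsigned long words = bh_ptr[0]->b_size / sizeof(unsigned long);
	unsigned long w;
	int i;

	/* XOR every source buffer into the destination, one word at a time. */
	for (i = 1; i < count; i++)
		for (w = 0; w < words; w++)
			bh_ptr[0]->b_data[w] ^= bh_ptr[i]->b_data[w];
}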
11820 static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
11821 {
11822- struct raid5_data *raid_conf = sh->raid_conf;
11823+ raid5_conf_t *conf = sh->raid_conf;
11824 struct buffer_head *bh_req;
11825
11826 if (sh->bh_new[dd_idx]) {
11827@@ -860,19 +899,22 @@
11828 if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) {
11829 sh->phase = PHASE_BEGIN;
11830 sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE;
11831- raid_conf->nr_pending_stripes++;
11832- atomic_inc(&raid_conf->nr_handle);
11833+ conf->nr_pending_stripes++;
11834+ atomic_inc(&conf->nr_handle);
11835 }
11836 sh->bh_new[dd_idx] = bh;
11837 sh->bh_req[dd_idx] = bh_req;
11838 sh->cmd_new[dd_idx] = rw;
11839 sh->new[dd_idx] = 1;
11840+
11841+ if (buffer_lowprio(bh))
11842+ mark_buffer_lowprio(bh_req);
11843 }
11844
11845 static void complete_stripe(struct stripe_head *sh)
11846 {
11847- struct raid5_data *raid_conf = sh->raid_conf;
11848- int disks = raid_conf->raid_disks;
11849+ raid5_conf_t *conf = sh->raid_conf;
11850+ int disks = conf->raid_disks;
11851 int i, new = 0;
11852
11853 PRINTK(("complete_stripe %lu\n", sh->sector));
11854@@ -909,6 +951,22 @@
11855 }
11856 }
11857
11858+
11859+static int is_stripe_lowprio(struct stripe_head *sh, int disks)
11860+{
11861+ int i, lowprio = 1;
11862+
11863+ for (i = 0; i < disks; i++) {
11864+ if (sh->bh_new[i])
11865+ if (!buffer_lowprio(sh->bh_new[i]))
11866+ lowprio = 0;
11867+ if (sh->bh_old[i])
11868+ if (!buffer_lowprio(sh->bh_old[i]))
11869+ lowprio = 0;
11870+ }
11871+ return lowprio;
11872+}
11873+
11874 /*
11875 * handle_stripe() is our main logic routine. Note that:
11876 *
11877@@ -919,28 +977,27 @@
11878 * 2. We should be careful to set sh->nr_pending whenever we sleep,
11879 * to prevent re-entry of handle_stripe() for the same sh.
11880 *
11881- * 3. raid_conf->failed_disks and disk->operational can be changed
11882+ * 3. conf->failed_disks and disk->operational can be changed
11883 * from an interrupt. This complicates things a bit, but it allows
11884 * us to stop issuing requests for a failed drive as soon as possible.
11885 */
11886 static void handle_stripe(struct stripe_head *sh)
11887 {
11888- struct raid5_data *raid_conf = sh->raid_conf;
11889- struct md_dev *mddev = raid_conf->mddev;
11890- int minor = (int) (mddev - md_dev);
11891+ raid5_conf_t *conf = sh->raid_conf;
11892+ mddev_t *mddev = conf->mddev;
11893 struct buffer_head *bh;
11894- int disks = raid_conf->raid_disks;
11895- int i, nr = 0, nr_read = 0, nr_write = 0;
11896+ int disks = conf->raid_disks;
11897+ int i, nr = 0, nr_read = 0, nr_write = 0, lowprio;
11898 int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0, parity = 0;
11899 int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
11900 int reading = 0, nr_writing = 0;
11901 int method1 = INT_MAX, method2 = INT_MAX;
11902 int block;
11903 unsigned long flags;
11904- int operational[MD_SB_DISKS], failed_disks = raid_conf->failed_disks;
11905+ int operational[MD_SB_DISKS], failed_disks = conf->failed_disks;
11906
11907 PRINTK(("handle_stripe(), stripe %lu\n", sh->sector));
11908- if (sh->nr_pending) {
11909+ if (md_atomic_read(&sh->nr_pending)) {
11910 printk("handle_stripe(), stripe %lu, io still pending\n", sh->sector);
11911 return;
11912 }
11913@@ -949,9 +1006,9 @@
11914 return;
11915 }
11916
11917- atomic_dec(&raid_conf->nr_handle);
11918+ atomic_dec(&conf->nr_handle);
11919
11920- if (test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
11921+ if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
11922 printk("raid5: restarting stripe %lu\n", sh->sector);
11923 sh->phase = PHASE_BEGIN;
11924 }
11925@@ -969,11 +1026,11 @@
11926 save_flags(flags);
11927 cli();
11928 for (i = 0; i < disks; i++) {
11929- operational[i] = raid_conf->disks[i].operational;
11930- if (i == sh->pd_idx && raid_conf->resync_parity)
11931+ operational[i] = conf->disks[i].operational;
11932+ if (i == sh->pd_idx && conf->resync_parity)
11933 operational[i] = 0;
11934 }
11935- failed_disks = raid_conf->failed_disks;
11936+ failed_disks = conf->failed_disks;
11937 restore_flags(flags);
11938
11939 if (failed_disks > 1) {
11940@@ -1017,7 +1074,7 @@
11941 }
11942
11943 if (nr_write && nr_read)
11944- printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
11945+ printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
11946
11947 if (nr_write) {
11948 /*
11949@@ -1030,7 +1087,7 @@
11950 if (sh->bh_new[i])
11951 continue;
11952 block = (int) compute_blocknr(sh, i);
11953- bh = find_buffer(MKDEV(MD_MAJOR, minor), block, sh->size);
11954+ bh = find_buffer(mddev_to_kdev(mddev), block, sh->size);
11955 if (bh && bh->b_count == 0 && buffer_dirty(bh) && !buffer_locked(bh)) {
11956 PRINTK(("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block));
11957 add_stripe_bh(sh, bh, i, WRITE);
11958@@ -1064,21 +1121,22 @@
11959
11960 if (!method1 || !method2) {
11961 lock_stripe(sh);
11962- sh->nr_pending++;
11963+ lowprio = is_stripe_lowprio(sh, disks);
11964+ atomic_inc(&sh->nr_pending);
11965 sh->phase = PHASE_WRITE;
11966 compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
11967 for (i = 0; i < disks; i++) {
11968- if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
11969+ if (!operational[i] && !conf->spare && !conf->resync_parity)
11970 continue;
11971 if (i == sh->pd_idx || sh->bh_new[i])
11972 nr_writing++;
11973 }
11974
11975- sh->nr_pending = nr_writing;
11976- PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, sh->nr_pending));
11977+ md_atomic_set(&sh->nr_pending, nr_writing);
11978+ PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
11979
11980 for (i = 0; i < disks; i++) {
11981- if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
11982+ if (!operational[i] && !conf->spare && !conf->resync_parity)
11983 continue;
11984 bh = sh->bh_copy[i];
11985 if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL)))
11986@@ -1089,18 +1147,30 @@
11987 bh->b_state |= (1<<BH_Dirty);
11988 PRINTK(("making request for buffer %d\n", i));
11989 clear_bit(BH_Lock, &bh->b_state);
11990- if (!operational[i] && !raid_conf->resync_parity) {
11991- bh->b_rdev = raid_conf->spare->dev;
11992- make_request(MAJOR(raid_conf->spare->dev), WRITE, bh);
11993- } else
11994- make_request(MAJOR(raid_conf->disks[i].dev), WRITE, bh);
11995+ if (!operational[i] && !conf->resync_parity) {
11996+ bh->b_rdev = conf->spare->dev;
11997+ make_request(MAJOR(conf->spare->dev), WRITE, bh);
11998+ } else {
11999+#if 0
12000+ make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
12001+#else
12002+ if (!lowprio || (i==sh->pd_idx))
12003+ make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
12004+ else {
12005+ mark_buffer_clean(bh);
12006+ raid5_end_request(bh,1);
12007+ sh->new[i] = 0;
12008+ }
12009+#endif
12010+ }
12011 }
12012 }
12013 return;
12014 }
12015
12016 lock_stripe(sh);
12017- sh->nr_pending++;
12018+ lowprio = is_stripe_lowprio(sh, disks);
12019+ atomic_inc(&sh->nr_pending);
12020 if (method1 < method2) {
12021 sh->write_method = RECONSTRUCT_WRITE;
12022 for (i = 0; i < disks; i++) {
12023@@ -1110,6 +1180,8 @@
12024 continue;
12025 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
12026 raid5_build_block(sh, sh->bh_old[i], i);
12027+ if (lowprio)
12028+ mark_buffer_lowprio(sh->bh_old[i]);
12029 reading++;
12030 }
12031 } else {
12032@@ -1121,19 +1193,21 @@
12033 continue;
12034 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
12035 raid5_build_block(sh, sh->bh_old[i], i);
12036+ if (lowprio)
12037+ mark_buffer_lowprio(sh->bh_old[i]);
12038 reading++;
12039 }
12040 }
12041 sh->phase = PHASE_READ_OLD;
12042- sh->nr_pending = reading;
12043- PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, sh->nr_pending));
12044+ md_atomic_set(&sh->nr_pending, reading);
12045+ PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)));
12046 for (i = 0; i < disks; i++) {
12047 if (!sh->bh_old[i])
12048 continue;
12049 if (buffer_uptodate(sh->bh_old[i]))
12050 continue;
12051 clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
12052- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
12053+ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
12054 }
12055 } else {
12056 /*
12057@@ -1141,7 +1215,8 @@
12058 */
12059 method1 = nr_read - nr_cache_overwrite;
12060 lock_stripe(sh);
12061- sh->nr_pending++;
12062+ lowprio = is_stripe_lowprio(sh,disks);
12063+ atomic_inc(&sh->nr_pending);
12064
12065 PRINTK(("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1));
12066 if (!method1 || (method1 == 1 && nr_cache == disks - 1)) {
12067@@ -1149,18 +1224,22 @@
12068 for (i = 0; i < disks; i++) {
12069 if (!sh->bh_new[i])
12070 continue;
12071- if (!sh->bh_old[i])
12072+ if (!sh->bh_old[i]) {
12073 compute_block(sh, i);
12074+ if (lowprio)
12075+ mark_buffer_lowprio
12076+ (sh->bh_old[i]);
12077+ }
12078 memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
12079 }
12080- sh->nr_pending--;
12081+ atomic_dec(&sh->nr_pending);
12082 complete_stripe(sh);
12083 return;
12084 }
12085 if (nr_failed_overwrite) {
12086 sh->phase = PHASE_READ_OLD;
12087- sh->nr_pending = (disks - 1) - nr_cache;
12088- PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, sh->nr_pending));
12089+ md_atomic_set(&sh->nr_pending, (disks - 1) - nr_cache);
12090+ PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
12091 for (i = 0; i < disks; i++) {
12092 if (sh->bh_old[i])
12093 continue;
12094@@ -1168,13 +1247,16 @@
12095 continue;
12096 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
12097 raid5_build_block(sh, sh->bh_old[i], i);
12098+ if (lowprio)
12099+ mark_buffer_lowprio(sh->bh_old[i]);
12100 clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
12101- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
12102+ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
12103 }
12104 } else {
12105 sh->phase = PHASE_READ;
12106- sh->nr_pending = nr_read - nr_cache_overwrite;
12107- PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, sh->nr_pending));
12108+ md_atomic_set(&sh->nr_pending,
12109+ nr_read - nr_cache_overwrite);
12110+ PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
12111 for (i = 0; i < disks; i++) {
12112 if (!sh->bh_new[i])
12113 continue;
12114@@ -1182,16 +1264,16 @@
12115 memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
12116 continue;
12117 }
12118- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_req[i]);
12119+ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_req[i]);
12120 }
12121 }
12122 }
12123 }
12124
12125-static int raid5_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
12126+static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
12127 {
12128- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
12129- const unsigned int raid_disks = raid_conf->raid_disks;
12130+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
12131+ const unsigned int raid_disks = conf->raid_disks;
12132 const unsigned int data_disks = raid_disks - 1;
12133 unsigned int dd_idx, pd_idx;
12134 unsigned long new_sector;
12135@@ -1202,15 +1284,15 @@
12136 if (rw == WRITEA) rw = WRITE;
12137
12138 new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks,
12139- &dd_idx, &pd_idx, raid_conf);
12140+ &dd_idx, &pd_idx, conf);
12141
12142 PRINTK(("raid5_make_request, sector %lu\n", new_sector));
12143 repeat:
12144- sh = get_stripe(raid_conf, new_sector, bh->b_size);
12145+ sh = get_stripe(conf, new_sector, bh->b_size);
12146 if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) {
12147 PRINTK(("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd));
12148 lock_stripe(sh);
12149- if (!sh->nr_pending)
12150+ if (!md_atomic_read(&sh->nr_pending))
12151 handle_stripe(sh);
12152 goto repeat;
12153 }
12154@@ -1221,24 +1303,24 @@
12155 printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector);
12156 printk("raid5: bh %p, bh_new %p\n", bh, sh->bh_new[dd_idx]);
12157 lock_stripe(sh);
12158- md_wakeup_thread(raid_conf->thread);
12159+ md_wakeup_thread(conf->thread);
12160 wait_on_stripe(sh);
12161 goto repeat;
12162 }
12163 add_stripe_bh(sh, bh, dd_idx, rw);
12164
12165- md_wakeup_thread(raid_conf->thread);
12166+ md_wakeup_thread(conf->thread);
12167 return 0;
12168 }
12169
12170 static void unplug_devices(struct stripe_head *sh)
12171 {
12172 #if 0
12173- struct raid5_data *raid_conf = sh->raid_conf;
12174+ raid5_conf_t *conf = sh->raid_conf;
12175 int i;
12176
12177- for (i = 0; i < raid_conf->raid_disks; i++)
12178- unplug_device(blk_dev + MAJOR(raid_conf->disks[i].dev));
12179+ for (i = 0; i < conf->raid_disks; i++)
12180+ unplug_device(blk_dev + MAJOR(conf->disks[i].dev));
12181 #endif
12182 }
12183
12184@@ -1252,8 +1334,8 @@
12185 static void raid5d (void *data)
12186 {
12187 struct stripe_head *sh;
12188- struct raid5_data *raid_conf = data;
12189- struct md_dev *mddev = raid_conf->mddev;
12190+ raid5_conf_t *conf = data;
12191+ mddev_t *mddev = conf->mddev;
12192 int i, handled = 0, unplug = 0;
12193 unsigned long flags;
12194
12195@@ -1261,47 +1343,47 @@
12196
12197 if (mddev->sb_dirty) {
12198 mddev->sb_dirty = 0;
12199- md_update_sb((int) (mddev - md_dev));
12200+ md_update_sb(mddev);
12201 }
12202 for (i = 0; i < NR_HASH; i++) {
12203 repeat:
12204- sh = raid_conf->stripe_hashtbl[i];
12205+ sh = conf->stripe_hashtbl[i];
12206 for (; sh; sh = sh->hash_next) {
12207- if (sh->raid_conf != raid_conf)
12208+ if (sh->raid_conf != conf)
12209 continue;
12210 if (sh->phase == PHASE_COMPLETE)
12211 continue;
12212- if (sh->nr_pending)
12213+ if (md_atomic_read(&sh->nr_pending))
12214 continue;
12215- if (sh->sector == raid_conf->next_sector) {
12216- raid_conf->sector_count += (sh->size >> 9);
12217- if (raid_conf->sector_count >= 128)
12218+ if (sh->sector == conf->next_sector) {
12219+ conf->sector_count += (sh->size >> 9);
12220+ if (conf->sector_count >= 128)
12221 unplug = 1;
12222 } else
12223 unplug = 1;
12224 if (unplug) {
12225- PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, raid_conf->sector_count));
12226+ PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, conf->sector_count));
12227 unplug_devices(sh);
12228 unplug = 0;
12229- raid_conf->sector_count = 0;
12230+ conf->sector_count = 0;
12231 }
12232- raid_conf->next_sector = sh->sector + (sh->size >> 9);
12233+ conf->next_sector = sh->sector + (sh->size >> 9);
12234 handled++;
12235 handle_stripe(sh);
12236 goto repeat;
12237 }
12238 }
12239- if (raid_conf) {
12240- PRINTK(("%d stripes handled, nr_handle %d\n", handled, atomic_read(&raid_conf->nr_handle)));
12241+ if (conf) {
12242+ PRINTK(("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle)));
12243 save_flags(flags);
12244 cli();
12245- if (!atomic_read(&raid_conf->nr_handle))
12246- clear_bit(THREAD_WAKEUP, &raid_conf->thread->flags);
12247+ if (!md_atomic_read(&conf->nr_handle))
12248+ clear_bit(THREAD_WAKEUP, &conf->thread->flags);
12249+ restore_flags(flags);
12250 }
12251 PRINTK(("--- raid5d inactive\n"));
12252 }
12253
12254-#if SUPPORT_RECONSTRUCTION
12255 /*
12256 * Private kernel thread for parity reconstruction after an unclean
12257 * shutdown. Reconstruction on spare drives in case of a failed drive
12258@@ -1309,44 +1391,64 @@
12259 */
12260 static void raid5syncd (void *data)
12261 {
12262- struct raid5_data *raid_conf = data;
12263- struct md_dev *mddev = raid_conf->mddev;
12264+ raid5_conf_t *conf = data;
12265+ mddev_t *mddev = conf->mddev;
12266
12267- if (!raid_conf->resync_parity)
12268+ if (!conf->resync_parity)
12269+ return;
12270+ if (conf->resync_parity == 2)
12271+ return;
12272+ down(&mddev->recovery_sem);
12273+ if (md_do_sync(mddev,NULL)) {
12274+ up(&mddev->recovery_sem);
12275+ printk("raid5: resync aborted!\n");
12276 return;
12277- md_do_sync(mddev);
12278- raid_conf->resync_parity = 0;
12279+ }
12280+ conf->resync_parity = 0;
12281+ up(&mddev->recovery_sem);
12282+ printk("raid5: resync finished.\n");
12283 }
12284-#endif /* SUPPORT_RECONSTRUCTION */
12285
12286-static int __check_consistency (struct md_dev *mddev, int row)
12287+static int __check_consistency (mddev_t *mddev, int row)
12288 {
12289- struct raid5_data *raid_conf = mddev->private;
12290+ raid5_conf_t *conf = mddev->private;
12291 kdev_t dev;
12292 struct buffer_head *bh[MD_SB_DISKS], tmp;
12293- int i, rc = 0, nr = 0;
12294+ int i, rc = 0, nr = 0, count;
12295+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
12296
12297- if (raid_conf->working_disks != raid_conf->raid_disks)
12298+ if (conf->working_disks != conf->raid_disks)
12299 return 0;
12300 tmp.b_size = 4096;
12301 if ((tmp.b_data = (char *) get_free_page(GFP_KERNEL)) == NULL)
12302 return 0;
12303+ md_clear_page((unsigned long)tmp.b_data);
12304 memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
12305- for (i = 0; i < raid_conf->raid_disks; i++) {
12306- dev = raid_conf->disks[i].dev;
12307+ for (i = 0; i < conf->raid_disks; i++) {
12308+ dev = conf->disks[i].dev;
12309 set_blocksize(dev, 4096);
12310 if ((bh[i] = bread(dev, row / 4, 4096)) == NULL)
12311 break;
12312 nr++;
12313 }
12314- if (nr == raid_conf->raid_disks) {
12315- for (i = 1; i < nr; i++)
12316- xor_block(&tmp, bh[i]);
12317+ if (nr == conf->raid_disks) {
12318+ bh_ptr[0] = &tmp;
12319+ count = 1;
12320+ for (i = 1; i < nr; i++) {
12321+ bh_ptr[count++] = bh[i];
12322+ if (count == MAX_XOR_BLOCKS) {
12323+ xor_block(count, &bh_ptr[0]);
12324+ count = 1;
12325+ }
12326+ }
12327+ if (count != 1) {
12328+ xor_block(count, &bh_ptr[0]);
12329+ }
12330 if (memcmp(tmp.b_data, bh[0]->b_data, 4096))
12331 rc = 1;
12332 }
12333- for (i = 0; i < raid_conf->raid_disks; i++) {
12334- dev = raid_conf->disks[i].dev;
12335+ for (i = 0; i < conf->raid_disks; i++) {
12336+ dev = conf->disks[i].dev;
12337 if (bh[i]) {
12338 bforget(bh[i]);
12339 bh[i] = NULL;
12340@@ -1358,285 +1460,607 @@
12341 return rc;
12342 }
12343
12344-static int check_consistency (struct md_dev *mddev)
12345+static int check_consistency (mddev_t *mddev)
12346 {
12347- int size = mddev->sb->size;
12348- int row;
12349+ if (__check_consistency(mddev, 0))
12350+/*
12351+ * We are not checking this currently, as it's legitimate to have
12352+ * an inconsistent array at creation time.
12353+ */
12354+ return 0;
12355
12356- for (row = 0; row < size; row += size / 8)
12357- if (__check_consistency(mddev, row))
12358- return 1;
12359 return 0;
12360 }
12361
12362-static int raid5_run (int minor, struct md_dev *mddev)
12363+static int raid5_run (mddev_t *mddev)
12364 {
12365- struct raid5_data *raid_conf;
12366+ raid5_conf_t *conf;
12367 int i, j, raid_disk, memory;
12368- md_superblock_t *sb = mddev->sb;
12369- md_descriptor_t *descriptor;
12370- struct real_dev *realdev;
12371+ mdp_super_t *sb = mddev->sb;
12372+ mdp_disk_t *desc;
12373+ mdk_rdev_t *rdev;
12374+ struct disk_info *disk;
12375+ struct md_list_head *tmp;
12376+ int start_recovery = 0;
12377
12378 MOD_INC_USE_COUNT;
12379
12380 if (sb->level != 5 && sb->level != 4) {
12381- printk("raid5: %s: raid level not set to 4/5 (%d)\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
12382+ printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
12383 MOD_DEC_USE_COUNT;
12384 return -EIO;
12385 }
12386
12387- mddev->private = kmalloc (sizeof (struct raid5_data), GFP_KERNEL);
12388- if ((raid_conf = mddev->private) == NULL)
12389+ mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
12390+ if ((conf = mddev->private) == NULL)
12391 goto abort;
12392- memset (raid_conf, 0, sizeof (*raid_conf));
12393- raid_conf->mddev = mddev;
12394+ memset (conf, 0, sizeof (*conf));
12395+ conf->mddev = mddev;
12396
12397- if ((raid_conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
12398+ if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
12399 goto abort;
12400- memset(raid_conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
12401+ memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
12402
12403- init_waitqueue(&raid_conf->wait_for_stripe);
12404- PRINTK(("raid5_run(%d) called.\n", minor));
12405-
12406- for (i = 0; i < mddev->nb_dev; i++) {
12407- realdev = &mddev->devices[i];
12408- if (!realdev->sb) {
12409- printk(KERN_ERR "raid5: disabled device %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
12410- continue;
12411- }
12412+ init_waitqueue(&conf->wait_for_stripe);
12413+ PRINTK(("raid5_run(md%d) called.\n", mdidx(mddev)));
12414
12415+ ITERATE_RDEV(mddev,rdev,tmp) {
12416 /*
12417 * This is important -- we are using the descriptor on
12418 * the disk only to get a pointer to the descriptor on
12419 * the main superblock, which might be more recent.
12420 */
12421- descriptor = &sb->disks[realdev->sb->descriptor.number];
12422- if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
12423- printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", kdevname(realdev->dev));
12424+ desc = sb->disks + rdev->desc_nr;
12425+ raid_disk = desc->raid_disk;
12426+ disk = conf->disks + raid_disk;
12427+
12428+ if (disk_faulty(desc)) {
12429+ printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
12430+ if (!rdev->faulty) {
12431+ MD_BUG();
12432+ goto abort;
12433+ }
12434+ disk->number = desc->number;
12435+ disk->raid_disk = raid_disk;
12436+ disk->dev = rdev->dev;
12437+
12438+ disk->operational = 0;
12439+ disk->write_only = 0;
12440+ disk->spare = 0;
12441+ disk->used_slot = 1;
12442 continue;
12443 }
12444- if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
12445- if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
12446- printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", kdevname(realdev->dev));
12447- continue;
12448+ if (disk_active(desc)) {
12449+ if (!disk_sync(desc)) {
12450+ printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
12451+ MD_BUG();
12452+ goto abort;
12453 }
12454- raid_disk = descriptor->raid_disk;
12455- if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
12456- printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", kdevname(realdev->dev));
12457+ if (raid_disk > sb->raid_disks) {
12458+ printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
12459 continue;
12460 }
12461- if (raid_conf->disks[raid_disk].operational) {
12462- printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", kdevname(realdev->dev), raid_disk);
12463+ if (disk->operational) {
12464+ printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
12465 continue;
12466 }
12467- printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", kdevname(realdev->dev), raid_disk);
12468+ printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
12469
12470- raid_conf->disks[raid_disk].number = descriptor->number;
12471- raid_conf->disks[raid_disk].raid_disk = raid_disk;
12472- raid_conf->disks[raid_disk].dev = mddev->devices[i].dev;
12473- raid_conf->disks[raid_disk].operational = 1;
12474+ disk->number = desc->number;
12475+ disk->raid_disk = raid_disk;
12476+ disk->dev = rdev->dev;
12477+ disk->operational = 1;
12478+ disk->used_slot = 1;
12479
12480- raid_conf->working_disks++;
12481+ conf->working_disks++;
12482 } else {
12483 /*
12484 * Must be a spare disk ..
12485 */
12486- printk(KERN_INFO "raid5: spare disk %s\n", kdevname(realdev->dev));
12487- raid_disk = descriptor->raid_disk;
12488- raid_conf->disks[raid_disk].number = descriptor->number;
12489- raid_conf->disks[raid_disk].raid_disk = raid_disk;
12490- raid_conf->disks[raid_disk].dev = mddev->devices [i].dev;
12491-
12492- raid_conf->disks[raid_disk].operational = 0;
12493- raid_conf->disks[raid_disk].write_only = 0;
12494- raid_conf->disks[raid_disk].spare = 1;
12495- }
12496- }
12497- raid_conf->raid_disks = sb->raid_disks;
12498- raid_conf->failed_disks = raid_conf->raid_disks - raid_conf->working_disks;
12499- raid_conf->mddev = mddev;
12500- raid_conf->chunk_size = sb->chunk_size;
12501- raid_conf->level = sb->level;
12502- raid_conf->algorithm = sb->parity_algorithm;
12503- raid_conf->max_nr_stripes = NR_STRIPES;
12504+ printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
12505+ disk->number = desc->number;
12506+ disk->raid_disk = raid_disk;
12507+ disk->dev = rdev->dev;
12508
12509- if (raid_conf->working_disks != sb->raid_disks && sb->state != (1 << MD_SB_CLEAN)) {
12510- printk(KERN_ALERT "raid5: raid set %s not clean and not all disks are operational -- run ckraid\n", kdevname(MKDEV(MD_MAJOR, minor)));
12511- goto abort;
12512+ disk->operational = 0;
12513+ disk->write_only = 0;
12514+ disk->spare = 1;
12515+ disk->used_slot = 1;
12516+ }
12517 }
12518- if (!raid_conf->chunk_size || raid_conf->chunk_size % 4) {
12519- printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", raid_conf->chunk_size, kdevname(MKDEV(MD_MAJOR, minor)));
12520+
12521+ for (i = 0; i < MD_SB_DISKS; i++) {
12522+ desc = sb->disks + i;
12523+ raid_disk = desc->raid_disk;
12524+ disk = conf->disks + raid_disk;
12525+
12526+ if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
12527+ !conf->disks[raid_disk].used_slot) {
12528+
12529+ disk->number = desc->number;
12530+ disk->raid_disk = raid_disk;
12531+ disk->dev = MKDEV(0,0);
12532+
12533+ disk->operational = 0;
12534+ disk->write_only = 0;
12535+ disk->spare = 0;
12536+ disk->used_slot = 1;
12537+ }
12538+ }
12539+
12540+ conf->raid_disks = sb->raid_disks;
12541+ /*
12542+ * 0 for a fully functional array, 1 for a degraded array.
12543+ */
12544+ conf->failed_disks = conf->raid_disks - conf->working_disks;
12545+ conf->mddev = mddev;
12546+ conf->chunk_size = sb->chunk_size;
12547+ conf->level = sb->level;
12548+ conf->algorithm = sb->layout;
12549+ conf->max_nr_stripes = NR_STRIPES;
12550+
12551+#if 0
12552+ for (i = 0; i < conf->raid_disks; i++) {
12553+ if (!conf->disks[i].used_slot) {
12554+ MD_BUG();
12555+ goto abort;
12556+ }
12557+ }
12558+#endif
12559+ if (!conf->chunk_size || conf->chunk_size % 4) {
12560+ printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
12561 goto abort;
12562 }
12563- if (raid_conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
12564- printk(KERN_ERR "raid5: unsupported parity algorithm %d for %s\n", raid_conf->algorithm, kdevname(MKDEV(MD_MAJOR, minor)));
12565+ if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
12566+ printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
12567 goto abort;
12568 }
12569- if (raid_conf->failed_disks > 1) {
12570- printk(KERN_ERR "raid5: not enough operational devices for %s (%d/%d failed)\n", kdevname(MKDEV(MD_MAJOR, minor)), raid_conf->failed_disks, raid_conf->raid_disks);
12571+ if (conf->failed_disks > 1) {
12572+ printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
12573 goto abort;
12574 }
12575
12576- if ((sb->state & (1 << MD_SB_CLEAN)) && check_consistency(mddev)) {
12577- printk(KERN_ERR "raid5: detected raid-5 xor inconsistenty -- run ckraid\n");
12578- sb->state |= 1 << MD_SB_ERRORS;
12579- goto abort;
12580+ if (conf->working_disks != sb->raid_disks) {
12581+ printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
12582+ start_recovery = 1;
12583 }
12584
12585- if ((raid_conf->thread = md_register_thread(raid5d, raid_conf)) == NULL) {
12586- printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
12587- goto abort;
12588+ if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) &&
12589+ check_consistency(mddev)) {
12590+ printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n");
12591+ sb->state &= ~(1 << MD_SB_CLEAN);
12592 }
12593
12594-#if SUPPORT_RECONSTRUCTION
12595- if ((raid_conf->resync_thread = md_register_thread(raid5syncd, raid_conf)) == NULL) {
12596- printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
12597- goto abort;
12598+ {
12599+ const char * name = "raid5d";
12600+
12601+ conf->thread = md_register_thread(raid5d, conf, name);
12602+ if (!conf->thread) {
12603+ printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
12604+ goto abort;
12605+ }
12606 }
12607-#endif /* SUPPORT_RECONSTRUCTION */
12608
12609- memory = raid_conf->max_nr_stripes * (sizeof(struct stripe_head) +
12610- raid_conf->raid_disks * (sizeof(struct buffer_head) +
12611+ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
12612+ conf->raid_disks * (sizeof(struct buffer_head) +
12613 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
12614- if (grow_stripes(raid_conf, raid_conf->max_nr_stripes, GFP_KERNEL)) {
12615+ if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
12616 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
12617- shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
12618+ shrink_stripes(conf, conf->max_nr_stripes);
12619 goto abort;
12620 } else
12621- printk(KERN_INFO "raid5: allocated %dkB for %s\n", memory, kdevname(MKDEV(MD_MAJOR, minor)));
12622+ printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
12623
12624 /*
12625 * Regenerate the "device is in sync with the raid set" bit for
12626 * each device.
12627 */
12628- for (i = 0; i < sb->nr_disks ; i++) {
12629- sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
12630+ for (i = 0; i < MD_SB_DISKS ; i++) {
12631+ mark_disk_nonsync(sb->disks + i);
12632 for (j = 0; j < sb->raid_disks; j++) {
12633- if (!raid_conf->disks[j].operational)
12634+ if (!conf->disks[j].operational)
12635 continue;
12636- if (sb->disks[i].number == raid_conf->disks[j].number)
12637- sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
12638+ if (sb->disks[i].number == conf->disks[j].number)
12639+ mark_disk_sync(sb->disks + i);
12640 }
12641 }
12642- sb->active_disks = raid_conf->working_disks;
12643+ sb->active_disks = conf->working_disks;
12644
12645 if (sb->active_disks == sb->raid_disks)
12646- printk("raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
12647+ printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
12648 else
12649- printk(KERN_ALERT "raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
12650+ printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
12651+
12652+ if (!start_recovery && ((sb->state & (1 << MD_SB_CLEAN))==0)) {
12653+ const char * name = "raid5syncd";
12654+
12655+ conf->resync_thread = md_register_thread(raid5syncd, conf,name);
12656+ if (!conf->resync_thread) {
12657+ printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
12658+ goto abort;
12659+ }
12660
12661- if ((sb->state & (1 << MD_SB_CLEAN)) == 0) {
12662- printk("raid5: raid set %s not clean; re-constructing parity\n", kdevname(MKDEV(MD_MAJOR, minor)));
12663- raid_conf->resync_parity = 1;
12664-#if SUPPORT_RECONSTRUCTION
12665- md_wakeup_thread(raid_conf->resync_thread);
12666-#endif /* SUPPORT_RECONSTRUCTION */
12667+ printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
12668+ conf->resync_parity = 1;
12669+ md_wakeup_thread(conf->resync_thread);
12670 }
12671
12672+ print_raid5_conf(conf);
12673+ if (start_recovery)
12674+ md_recover_arrays();
12675+ print_raid5_conf(conf);
12676+
12677 /* Ok, everything is just fine now */
12678 return (0);
12679 abort:
12680- if (raid_conf) {
12681- if (raid_conf->stripe_hashtbl)
12682- free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
12683- kfree(raid_conf);
12684+ if (conf) {
12685+ print_raid5_conf(conf);
12686+ if (conf->stripe_hashtbl)
12687+ free_pages((unsigned long) conf->stripe_hashtbl,
12688+ HASH_PAGES_ORDER);
12689+ kfree(conf);
12690 }
12691 mddev->private = NULL;
12692- printk(KERN_ALERT "raid5: failed to run raid set %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
12693+ printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
12694 MOD_DEC_USE_COUNT;
12695 return -EIO;
12696 }
12697
12698-static int raid5_stop (int minor, struct md_dev *mddev)
12699+static int raid5_stop_resync (mddev_t *mddev)
12700+{
12701+ raid5_conf_t *conf = mddev_to_conf(mddev);
12702+ mdk_thread_t *thread = conf->resync_thread;
12703+
12704+ if (thread) {
12705+ if (conf->resync_parity) {
12706+ conf->resync_parity = 2;
12707+ md_interrupt_thread(thread);
12708+ printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
12709+ return 1;
12710+ }
12711+ return 0;
12712+ }
12713+ return 0;
12714+}
12715+
12716+static int raid5_restart_resync (mddev_t *mddev)
12717 {
12718- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
12719+ raid5_conf_t *conf = mddev_to_conf(mddev);
12720
12721- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
12722- shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
12723- md_unregister_thread(raid_conf->thread);
12724-#if SUPPORT_RECONSTRUCTION
12725- md_unregister_thread(raid_conf->resync_thread);
12726-#endif /* SUPPORT_RECONSTRUCTION */
12727- free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
12728- kfree(raid_conf);
12729+ if (conf->resync_parity) {
12730+ if (!conf->resync_thread) {
12731+ MD_BUG();
12732+ return 0;
12733+ }
12734+ printk("raid5: waking up raid5resync.\n");
12735+ conf->resync_parity = 1;
12736+ md_wakeup_thread(conf->resync_thread);
12737+ return 1;
12738+ } else
12739+ printk("raid5: no restart-resync needed.\n");
12740+ return 0;
12741+}
12742+
12743+
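[editor's note -- not part of the patch] conf->resync_parity acts as a small three-state flag shared by raid5syncd(), raid5_stop_resync() and raid5_restart_resync() above: 0 means no parity resync is pending, 1 means a resync is running or queued (raid5syncd() may call md_do_sync()), and 2 means raid5_stop_resync() has asked the sync thread to abort; raid5_restart_resync() later sets the flag back to 1 and wakes the thread. A hedged sketch with hypothetical names (the patch itself uses bare integers):

enum resync_parity_state {
	RESYNC_IDLE      = 0,	/* nothing to do                                */
	RESYNC_RUNNING   = 1,	/* raid5syncd() is allowed to run md_do_sync()  */
	RESYNC_INTERRUPT = 2	/* stop requested; a restart sets this back to 1 */
};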
12744+static int raid5_stop (mddev_t *mddev)
12745+{
12746+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
12747+
12748+ shrink_stripe_cache(conf, conf->max_nr_stripes);
12749+ shrink_stripes(conf, conf->max_nr_stripes);
12750+ md_unregister_thread(conf->thread);
12751+ if (conf->resync_thread)
12752+ md_unregister_thread(conf->resync_thread);
12753+ free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
12754+ kfree(conf);
12755 mddev->private = NULL;
12756 MOD_DEC_USE_COUNT;
12757 return 0;
12758 }
12759
12760-static int raid5_status (char *page, int minor, struct md_dev *mddev)
12761+static int raid5_status (char *page, mddev_t *mddev)
12762 {
12763- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
12764- md_superblock_t *sb = mddev->sb;
12765+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
12766+ mdp_super_t *sb = mddev->sb;
12767 int sz = 0, i;
12768
12769- sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->parity_algorithm);
12770- sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
12771- for (i = 0; i < raid_conf->raid_disks; i++)
12772- sz += sprintf (page+sz, "%s", raid_conf->disks[i].operational ? "U" : "_");
12773+ sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
12774+ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
12775+ for (i = 0; i < conf->raid_disks; i++)
12776+ sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
12777 sz += sprintf (page+sz, "]");
12778 return sz;
12779 }
12780
12781-static int raid5_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, int state)
12782+static void print_raid5_conf (raid5_conf_t *conf)
12783+{
12784+ int i;
12785+ struct disk_info *tmp;
12786+
12787+ printk("RAID5 conf printout:\n");
12788+ if (!conf) {
12789+ printk("(conf==NULL)\n");
12790+ return;
12791+ }
12792+ printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
12793+ conf->working_disks, conf->failed_disks);
12794+
12795+ for (i = 0; i < MD_SB_DISKS; i++) {
12796+ tmp = conf->disks + i;
12797+ printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
12798+ i, tmp->spare,tmp->operational,
12799+ tmp->number,tmp->raid_disk,tmp->used_slot,
12800+ partition_name(tmp->dev));
12801+ }
12802+}
12803+
12804+static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
12805 {
12806- int i = 0, failed_disk = -1;
12807- struct raid5_data *raid_conf = mddev->private;
12808- struct disk_info *disk = raid_conf->disks;
12809+ int err = 0;
12810+ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
12811+ raid5_conf_t *conf = mddev->private;
12812+ struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
12813 unsigned long flags;
12814- md_superblock_t *sb = mddev->sb;
12815- md_descriptor_t *descriptor;
12816+ mdp_super_t *sb = mddev->sb;
12817+ mdp_disk_t *failed_desc, *spare_desc, *added_desc;
12818
12819- for (i = 0; i < MD_SB_DISKS; i++, disk++) {
12820- if (disk->spare && disk->number == spare->number)
12821- goto found;
12822- }
12823- return 1;
12824-found:
12825- for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
12826- if (!disk->operational)
12827- failed_disk = i;
12828- if (failed_disk == -1)
12829- return 1;
12830 save_flags(flags);
12831 cli();
12832+
12833+ print_raid5_conf(conf);
12834+ /*
12835+ * find the disk ...
12836+ */
12837 switch (state) {
12838- case SPARE_WRITE:
12839- disk->operational = 1;
12840- disk->write_only = 1;
12841- raid_conf->spare = disk;
12842- break;
12843- case SPARE_INACTIVE:
12844- disk->operational = 0;
12845- disk->write_only = 0;
12846- raid_conf->spare = NULL;
12847- break;
12848- case SPARE_ACTIVE:
12849- disk->spare = 0;
12850- disk->write_only = 0;
12851
12852- descriptor = &sb->disks[raid_conf->disks[failed_disk].number];
12853- i = spare->raid_disk;
12854- disk->raid_disk = spare->raid_disk = descriptor->raid_disk;
12855- if (disk->raid_disk != failed_disk)
12856- printk("raid5: disk->raid_disk != failed_disk");
12857- descriptor->raid_disk = i;
12858-
12859- raid_conf->spare = NULL;
12860- raid_conf->working_disks++;
12861- raid_conf->failed_disks--;
12862- raid_conf->disks[failed_disk] = *disk;
12863- break;
12864- default:
12865- printk("raid5_mark_spare: bug: state == %d\n", state);
12866- restore_flags(flags);
12867- return 1;
12868+ case DISKOP_SPARE_ACTIVE:
12869+
12870+ /*
12871+ * Find the failed disk within the RAID5 configuration ...
12872+ * (this can only be in the first conf->raid_disks part)
12873+ */
12874+ for (i = 0; i < conf->raid_disks; i++) {
12875+ tmp = conf->disks + i;
12876+ if ((!tmp->operational && !tmp->spare) ||
12877+ !tmp->used_slot) {
12878+ failed_disk = i;
12879+ break;
12880+ }
12881+ }
12882+ /*
12883+ * When we activate a spare disk we _must_ have a disk in
12884+ * the lower (active) part of the array to replace.
12885+ */
12886+ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
12887+ MD_BUG();
12888+ err = 1;
12889+ goto abort;
12890+ }
12891+ /* fall through */
12892+
12893+ case DISKOP_SPARE_WRITE:
12894+ case DISKOP_SPARE_INACTIVE:
12895+
12896+ /*
12897+ * Find the spare disk ... (can only be in the 'high'
12898+ * area of the array)
12899+ */
12900+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
12901+ tmp = conf->disks + i;
12902+ if (tmp->spare && tmp->number == (*d)->number) {
12903+ spare_disk = i;
12904+ break;
12905+ }
12906+ }
12907+ if (spare_disk == -1) {
12908+ MD_BUG();
12909+ err = 1;
12910+ goto abort;
12911+ }
12912+ break;
12913+
12914+ case DISKOP_HOT_REMOVE_DISK:
12915+
12916+ for (i = 0; i < MD_SB_DISKS; i++) {
12917+ tmp = conf->disks + i;
12918+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
12919+ if (tmp->operational) {
12920+ err = -EBUSY;
12921+ goto abort;
12922+ }
12923+ removed_disk = i;
12924+ break;
12925+ }
12926+ }
12927+ if (removed_disk == -1) {
12928+ MD_BUG();
12929+ err = 1;
12930+ goto abort;
12931+ }
12932+ break;
12933+
12934+ case DISKOP_HOT_ADD_DISK:
12935+
12936+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
12937+ tmp = conf->disks + i;
12938+ if (!tmp->used_slot) {
12939+ added_disk = i;
12940+ break;
12941+ }
12942+ }
12943+ if (added_disk == -1) {
12944+ MD_BUG();
12945+ err = 1;
12946+ goto abort;
12947+ }
12948+ break;
12949+ }
12950+
12951+ switch (state) {
12952+ /*
12953+ * Switch the spare disk to write-only mode:
12954+ */
12955+ case DISKOP_SPARE_WRITE:
12956+ if (conf->spare) {
12957+ MD_BUG();
12958+ err = 1;
12959+ goto abort;
12960+ }
12961+ sdisk = conf->disks + spare_disk;
12962+ sdisk->operational = 1;
12963+ sdisk->write_only = 1;
12964+ conf->spare = sdisk;
12965+ break;
12966+ /*
12967+ * Deactivate a spare disk:
12968+ */
12969+ case DISKOP_SPARE_INACTIVE:
12970+ sdisk = conf->disks + spare_disk;
12971+ sdisk->operational = 0;
12972+ sdisk->write_only = 0;
12973+ /*
12974+ * Was the spare being resynced?
12975+ */
12976+ if (conf->spare == sdisk)
12977+ conf->spare = NULL;
12978+ break;
12979+ /*
12980+ * Activate (mark read-write) the (now sync) spare disk,
12981+ * which means we switch its 'raid position' (->raid_disk)
12982+ * with the failed disk. (only the first 'conf->raid_disks'
12983+ * slots are used for 'real' disks and we must preserve this
12984+ * property)
12985+ */
12986+ case DISKOP_SPARE_ACTIVE:
12987+ if (!conf->spare) {
12988+ MD_BUG();
12989+ err = 1;
12990+ goto abort;
12991+ }
12992+ sdisk = conf->disks + spare_disk;
12993+ fdisk = conf->disks + failed_disk;
12994+
12995+ spare_desc = &sb->disks[sdisk->number];
12996+ failed_desc = &sb->disks[fdisk->number];
12997+
12998+ if (spare_desc != *d) {
12999+ MD_BUG();
13000+ err = 1;
13001+ goto abort;
13002+ }
13003+
13004+ if (spare_desc->raid_disk != sdisk->raid_disk) {
13005+ MD_BUG();
13006+ err = 1;
13007+ goto abort;
13008+ }
13009+
13010+ if (sdisk->raid_disk != spare_disk) {
13011+ MD_BUG();
13012+ err = 1;
13013+ goto abort;
13014+ }
13015+
13016+ if (failed_desc->raid_disk != fdisk->raid_disk) {
13017+ MD_BUG();
13018+ err = 1;
13019+ goto abort;
13020+ }
13021+
13022+ if (fdisk->raid_disk != failed_disk) {
13023+ MD_BUG();
13024+ err = 1;
13025+ goto abort;
13026+ }
13027+
13028+ /*
13029+ * do the switch finally
13030+ */
13031+ xchg_values(*spare_desc, *failed_desc);
13032+ xchg_values(*fdisk, *sdisk);
13033+
13034+ /*
13035+ * (careful, 'failed' and 'spare' are switched from now on)
13036+ *
13037+ * we want to preserve linear numbering and we want to
13038+ * give the proper raid_disk number to the now activated
13039+ * disk. (this means we switch back these values)
13040+ */
13041+
13042+ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
13043+ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
13044+ xchg_values(spare_desc->number, failed_desc->number);
13045+ xchg_values(sdisk->number, fdisk->number);
13046+
13047+ *d = failed_desc;
13048+
13049+ if (sdisk->dev == MKDEV(0,0))
13050+ sdisk->used_slot = 0;
13051+
13052+ /*
13053+ * this really activates the spare.
13054+ */
13055+ fdisk->spare = 0;
13056+ fdisk->write_only = 0;
13057+
13058+ /*
13059+ * if we activate a spare, we definitely replace a
13060+ * non-operational disk slot in the 'low' area of
13061+ * the disk array.
13062+ */
13063+ conf->failed_disks--;
13064+ conf->working_disks++;
13065+ conf->spare = NULL;
13066+
13067+ break;
13068+
13069+ case DISKOP_HOT_REMOVE_DISK:
13070+ rdisk = conf->disks + removed_disk;
13071+
13072+ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
13073+ MD_BUG();
13074+ err = 1;
13075+ goto abort;
13076+ }
13077+ rdisk->dev = MKDEV(0,0);
13078+ rdisk->used_slot = 0;
13079+
13080+ break;
13081+
13082+ case DISKOP_HOT_ADD_DISK:
13083+ adisk = conf->disks + added_disk;
13084+ added_desc = *d;
13085+
13086+ if (added_disk != added_desc->number) {
13087+ MD_BUG();
13088+ err = 1;
13089+ goto abort;
13090+ }
13091+
13092+ adisk->number = added_desc->number;
13093+ adisk->raid_disk = added_desc->raid_disk;
13094+ adisk->dev = MKDEV(added_desc->major,added_desc->minor);
13095+
13096+ adisk->operational = 0;
13097+ adisk->write_only = 0;
13098+ adisk->spare = 1;
13099+ adisk->used_slot = 1;
13100+
13101+
13102+ break;
13103+
13104+ default:
13105+ MD_BUG();
13106+ err = 1;
13107+ goto abort;
13108 }
13109+abort:
13110 restore_flags(flags);
13111- return 0;
13112+ print_raid5_conf(conf);
13113+ return err;
13114 }
13115
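[editor's note -- not part of the patch] In the DISKOP_SPARE_ACTIVE branch above, the whole spare and failed records are swapped with xchg_values() and then the positional fields (raid_disk, number) are swapped back, so slot numbering stays linear while the disk payload moves into the active part of the array. A self-contained user-space illustration of that swap-then-restore-identity trick (the struct, its values and the local xchg_values() macro are invented for the example; the patch uses its own helper):

#include <stdio.h>

#define xchg_values(a, b) do { typeof(a) __t = (a); (a) = (b); (b) = __t; } while (0)

struct slot {
	int number;		/* positional identity -- must stay with the slot */
	int raid_disk;		/* positional identity -- must stay with the slot */
	int operational;	/* payload -- should move with the physical disk  */
};

int main(void)
{
	struct slot failed = { 2, 2, 0 };	/* dead disk in the active area  */
	struct slot spare  = { 5, 5, 1 };	/* synced spare in the high area */

	xchg_values(failed, spare);			/* move the payload        */
	xchg_values(failed.raid_disk, spare.raid_disk);	/* restore slot identities */
	xchg_values(failed.number, spare.number);

	printf("slot 2 now operational=%d (the former spare)\n", failed.operational);
	printf("slot 5 now operational=%d (the former failed disk)\n", spare.operational);
	return 0;
}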
13116-static struct md_personality raid5_personality=
13117+static mdk_personality_t raid5_personality=
13118 {
13119 "raid5",
13120 raid5_map,
13121@@ -1648,14 +2072,19 @@
13122 NULL, /* no ioctls */
13123 0,
13124 raid5_error,
13125- /* raid5_hot_add_disk, */ NULL,
13126- /* raid1_hot_remove_drive */ NULL,
13127- raid5_mark_spare
13128+ raid5_diskop,
13129+ raid5_stop_resync,
13130+ raid5_restart_resync
13131 };
13132
13133 int raid5_init (void)
13134 {
13135- return register_md_personality (RAID5, &raid5_personality);
13136+ int err;
13137+
13138+ err = register_md_personality (RAID5, &raid5_personality);
13139+ if (err)
13140+ return err;
13141+ return 0;
13142 }
13143
13144 #ifdef MODULE
13145--- linux/drivers/block/translucent.c.orig Tue Jan 16 13:42:04 2001
13146+++ linux/drivers/block/translucent.c Tue Jan 16 13:42:04 2001
13147@@ -0,0 +1,136 @@
13148+/*
13149+ translucent.c : Translucent RAID driver for Linux
13150+ Copyright (C) 1998 Ingo Molnar
13151+
13152+ Translucent mode management functions.
13153+
13154+ This program is free software; you can redistribute it and/or modify
13155+ it under the terms of the GNU General Public License as published by
13156+ the Free Software Foundation; either version 2, or (at your option)
13157+ any later version.
13158+
13159+ You should have received a copy of the GNU General Public License
13160+ (for example /usr/src/linux/COPYING); if not, write to the Free
13161+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13162+*/
13163+
13164+#include <linux/module.h>
13165+
13166+#include <linux/raid/md.h>
13167+#include <linux/malloc.h>
13168+
13169+#include <linux/raid/translucent.h>
13170+
13171+#define MAJOR_NR MD_MAJOR
13172+#define MD_DRIVER
13173+#define MD_PERSONALITY
13174+
13175+static int translucent_run (mddev_t *mddev)
13176+{
13177+ translucent_conf_t *conf;
13178+ mdk_rdev_t *rdev;
13179+ int i;
13180+
13181+ MOD_INC_USE_COUNT;
13182+
13183+ conf = kmalloc (sizeof (*conf), GFP_KERNEL);
13184+ if (!conf)
13185+ goto out;
13186+ mddev->private = conf;
13187+
13188+ if (mddev->nb_dev != 2) {
13189+ printk("translucent: this mode needs 2 disks, aborting!\n");
13190+ goto out;
13191+ }
13192+
13193+ if (md_check_ordering(mddev)) {
13194+ printk("translucent: disks are not ordered, aborting!\n");
13195+ goto out;
13196+ }
13197+
13198+ ITERATE_RDEV_ORDERED(mddev,rdev,i) {
13199+ dev_info_t *disk = conf->disks + i;
13200+
13201+ disk->dev = rdev->dev;
13202+ disk->size = rdev->size;
13203+ }
13204+
13205+ return 0;
13206+
13207+out:
13208+ if (conf)
13209+ kfree(conf);
13210+
13211+ MOD_DEC_USE_COUNT;
13212+ return 1;
13213+}
13214+
13215+static int translucent_stop (mddev_t *mddev)
13216+{
13217+ translucent_conf_t *conf = mddev_to_conf(mddev);
13218+
13219+ kfree(conf);
13220+
13221+ MOD_DEC_USE_COUNT;
13222+
13223+ return 0;
13224+}
13225+
13226+
13227+static int translucent_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
13228+ unsigned long *rsector, unsigned long size)
13229+{
13230+ translucent_conf_t *conf = mddev_to_conf(mddev);
13231+
13232+ *rdev = conf->disks[0].dev;
13233+
13234+ return 0;
13235+}
13236+
13237+static int translucent_status (char *page, mddev_t *mddev)
13238+{
13239+ int sz = 0;
13240+
13241+ sz += sprintf(page+sz, " %d%% full", 10);
13242+ return sz;
13243+}
13244+
13245+
13246+static mdk_personality_t translucent_personality=
13247+{
13248+ "translucent",
13249+ translucent_map,
13250+ NULL,
13251+ NULL,
13252+ translucent_run,
13253+ translucent_stop,
13254+ translucent_status,
13255+ NULL,
13256+ 0,
13257+ NULL,
13258+ NULL,
13259+ NULL,
13260+ NULL
13261+};
13262+
13263+#ifndef MODULE
13264+
13265+md__initfunc(void translucent_init (void))
13266+{
13267+ register_md_personality (TRANSLUCENT, &translucent_personality);
13268+}
13269+
13270+#else
13271+
13272+int init_module (void)
13273+{
13274+ return (register_md_personality (TRANSLUCENT, &translucent_personality));
13275+}
13276+
13277+void cleanup_module (void)
13278+{
13279+ unregister_md_personality (TRANSLUCENT);
13280+}
13281+
13282+#endif
13283+
13284--- linux/drivers/block/xor.c.orig Tue Jan 16 13:42:04 2001
13285+++ linux/drivers/block/xor.c Tue Jan 16 13:42:04 2001
13286@@ -0,0 +1,1894 @@
13287+/*
13288+ * xor.c : Multiple Devices driver for Linux
13289+ *
13290+ * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek
13291+ *
13292+ *
13293+ * optimized RAID-5 checksumming functions.
13294+ *
13295+ * This program is free software; you can redistribute it and/or modify
13296+ * it under the terms of the GNU General Public License as published by
13297+ * the Free Software Foundation; either version 2, or (at your option)
13298+ * any later version.
13299+ *
13300+ * You should have received a copy of the GNU General Public License
13301+ * (for example /usr/src/linux/COPYING); if not, write to the Free
13302+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13303+ */
13304+#include <linux/module.h>
13305+#include <linux/raid/md.h>
13306+#ifdef __sparc_v9__
13307+#include <asm/head.h>
13308+#include <asm/asi.h>
13309+#include <asm/visasm.h>
13310+#endif
13311+
13312+/*
13313+ * we use the 'XOR function template' to register multiple xor
13314+ * functions at runtime. The kernel measures their speed at bootup
13315+ * and decides which one to use. (compile-time registration is
13316+ * not enough, as certain CPU features like MMX can only be detected
13317+ * at runtime)
13318+ *
13319+ * this architecture makes it pretty easy to add new routines
13320+ * that are faster on certain CPUs, without killing other CPUs'
13321+ * 'native' routines. The current routines are believed to be the
13322+ * physically fastest ones on all CPUs tested so far, but
13323+ * feel free to prove me wrong and add yet another routine =B-)
13324+ * --mingo
13325+ */
13326+
13327+#define MAX_XOR_BLOCKS 5
13328+
13329+#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr)
13330+
13331+typedef void (*xor_block_t) XOR_ARGS;
13332+xor_block_t xor_block = NULL;
13333+
13334+#ifndef __sparc_v9__
13335+
13336+struct xor_block_template;
13337+
13338+struct xor_block_template {
13339+ char * name;
13340+ xor_block_t xor_block;
13341+ int speed;
13342+ struct xor_block_template * next;
13343+};
13344+
13345+struct xor_block_template * xor_functions = NULL;
13346+
13347+#define XORBLOCK_TEMPLATE(x) \
13348+static void xor_block_##x XOR_ARGS; \
13349+static struct xor_block_template t_xor_block_##x = \
13350+ { #x, xor_block_##x, 0, NULL }; \
13351+static void xor_block_##x XOR_ARGS
13352+
13353+#ifdef __i386__
13354+
13355+#ifdef CONFIG_X86_XMM
13356+/*
13357+ * Cache avoiding checksumming functions utilizing KNI instructions
13358+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
13359+ */
13360+
13361+XORBLOCK_TEMPLATE(pIII_kni)
13362+{
13363+ char xmm_save[16*4];
13364+ int cr0;
13365+ int lines = (bh_ptr[0]->b_size>>8);
13366+
13367+ __asm__ __volatile__ (
13368+ "movl %%cr0,%0 ;\n\t"
13369+ "clts ;\n\t"
13370+ "movups %%xmm0,(%1) ;\n\t"
13371+ "movups %%xmm1,0x10(%1) ;\n\t"
13372+ "movups %%xmm2,0x20(%1) ;\n\t"
13373+ "movups %%xmm3,0x30(%1) ;\n\t"
13374+ : "=r" (cr0)
13375+ : "r" (xmm_save)
13376+ : "memory" );
13377+
13378+#define OFFS(x) "8*("#x"*2)"
13379+#define PF0(x) \
13380+ " prefetcht0 "OFFS(x)"(%1) ;\n"
13381+#define LD(x,y) \
13382+ " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
13383+#define ST(x,y) \
13384+ " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
13385+#define PF1(x) \
13386+ " prefetchnta "OFFS(x)"(%2) ;\n"
13387+#define PF2(x) \
13388+ " prefetchnta "OFFS(x)"(%3) ;\n"
13389+#define PF3(x) \
13390+ " prefetchnta "OFFS(x)"(%4) ;\n"
13391+#define PF4(x) \
13392+ " prefetchnta "OFFS(x)"(%5) ;\n"
13393+#define PF5(x) \
13394+ " prefetchnta "OFFS(x)"(%6) ;\n"
13395+#define XO1(x,y) \
13396+ " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
13397+#define XO2(x,y) \
13398+ " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
13399+#define XO3(x,y) \
13400+ " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
13401+#define XO4(x,y) \
13402+ " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
13403+#define XO5(x,y) \
13404+ " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
13405+
13406+ switch(count) {
13407+ case 2:
13408+ __asm__ __volatile__ (
13409+#undef BLOCK
13410+#define BLOCK(i) \
13411+ LD(i,0) \
13412+ LD(i+1,1) \
13413+ PF1(i) \
13414+ PF1(i+2) \
13415+ LD(i+2,2) \
13416+ LD(i+3,3) \
13417+ PF0(i+4) \
13418+ PF0(i+6) \
13419+ XO1(i,0) \
13420+ XO1(i+1,1) \
13421+ XO1(i+2,2) \
13422+ XO1(i+3,3) \
13423+ ST(i,0) \
13424+ ST(i+1,1) \
13425+ ST(i+2,2) \
13426+ ST(i+3,3) \
13427+
13428+
13429+ PF0(0)
13430+ PF0(2)
13431+
13432+ " .align 32,0x90 ;\n"
13433+ " 1: ;\n"
13434+
13435+ BLOCK(0)
13436+ BLOCK(4)
13437+ BLOCK(8)
13438+ BLOCK(12)
13439+
13440+ " addl $256, %1 ;\n"
13441+ " addl $256, %2 ;\n"
13442+ " decl %0 ;\n"
13443+ " jnz 1b ;\n"
13444+
13445+ :
13446+ : "r" (lines),
13447+ "r" (bh_ptr[0]->b_data),
13448+ "r" (bh_ptr[1]->b_data)
13449+ : "memory" );
13450+ break;
13451+ case 3:
13452+ __asm__ __volatile__ (
13453+#undef BLOCK
13454+#define BLOCK(i) \
13455+ PF1(i) \
13456+ PF1(i+2) \
13457+ LD(i,0) \
13458+ LD(i+1,1) \
13459+ LD(i+2,2) \
13460+ LD(i+3,3) \
13461+ PF2(i) \
13462+ PF2(i+2) \
13463+ PF0(i+4) \
13464+ PF0(i+6) \
13465+ XO1(i,0) \
13466+ XO1(i+1,1) \
13467+ XO1(i+2,2) \
13468+ XO1(i+3,3) \
13469+ XO2(i,0) \
13470+ XO2(i+1,1) \
13471+ XO2(i+2,2) \
13472+ XO2(i+3,3) \
13473+ ST(i,0) \
13474+ ST(i+1,1) \
13475+ ST(i+2,2) \
13476+ ST(i+3,3) \
13477+
13478+
13479+ PF0(0)
13480+ PF0(2)
13481+
13482+ " .align 32,0x90 ;\n"
13483+ " 1: ;\n"
13484+
13485+ BLOCK(0)
13486+ BLOCK(4)
13487+ BLOCK(8)
13488+ BLOCK(12)
13489+
13490+ " addl $256, %1 ;\n"
13491+ " addl $256, %2 ;\n"
13492+ " addl $256, %3 ;\n"
13493+ " decl %0 ;\n"
13494+ " jnz 1b ;\n"
13495+ :
13496+ : "r" (lines),
13497+ "r" (bh_ptr[0]->b_data),
13498+ "r" (bh_ptr[1]->b_data),
13499+ "r" (bh_ptr[2]->b_data)
13500+ : "memory" );
13501+ break;
13502+ case 4:
13503+ __asm__ __volatile__ (
13504+#undef BLOCK
13505+#define BLOCK(i) \
13506+ PF1(i) \
13507+ PF1(i+2) \
13508+ LD(i,0) \
13509+ LD(i+1,1) \
13510+ LD(i+2,2) \
13511+ LD(i+3,3) \
13512+ PF2(i) \
13513+ PF2(i+2) \
13514+ XO1(i,0) \
13515+ XO1(i+1,1) \
13516+ XO1(i+2,2) \
13517+ XO1(i+3,3) \
13518+ PF3(i) \
13519+ PF3(i+2) \
13520+ PF0(i+4) \
13521+ PF0(i+6) \
13522+ XO2(i,0) \
13523+ XO2(i+1,1) \
13524+ XO2(i+2,2) \
13525+ XO2(i+3,3) \
13526+ XO3(i,0) \
13527+ XO3(i+1,1) \
13528+ XO3(i+2,2) \
13529+ XO3(i+3,3) \
13530+ ST(i,0) \
13531+ ST(i+1,1) \
13532+ ST(i+2,2) \
13533+ ST(i+3,3) \
13534+
13535+
13536+ PF0(0)
13537+ PF0(2)
13538+
13539+ " .align 32,0x90 ;\n"
13540+ " 1: ;\n"
13541+
13542+ BLOCK(0)
13543+ BLOCK(4)
13544+ BLOCK(8)
13545+ BLOCK(12)
13546+
13547+ " addl $256, %1 ;\n"
13548+ " addl $256, %2 ;\n"
13549+ " addl $256, %3 ;\n"
13550+ " addl $256, %4 ;\n"
13551+ " decl %0 ;\n"
13552+ " jnz 1b ;\n"
13553+
13554+ :
13555+ : "r" (lines),
13556+ "r" (bh_ptr[0]->b_data),
13557+ "r" (bh_ptr[1]->b_data),
13558+ "r" (bh_ptr[2]->b_data),
13559+ "r" (bh_ptr[3]->b_data)
13560+ : "memory" );
13561+ break;
13562+ case 5:
13563+ __asm__ __volatile__ (
13564+#undef BLOCK
13565+#define BLOCK(i) \
13566+ PF1(i) \
13567+ PF1(i+2) \
13568+ LD(i,0) \
13569+ LD(i+1,1) \
13570+ LD(i+2,2) \
13571+ LD(i+3,3) \
13572+ PF2(i) \
13573+ PF2(i+2) \
13574+ XO1(i,0) \
13575+ XO1(i+1,1) \
13576+ XO1(i+2,2) \
13577+ XO1(i+3,3) \
13578+ PF3(i) \
13579+ PF3(i+2) \
13580+ XO2(i,0) \
13581+ XO2(i+1,1) \
13582+ XO2(i+2,2) \
13583+ XO2(i+3,3) \
13584+ PF4(i) \
13585+ PF4(i+2) \
13586+ PF0(i+4) \
13587+ PF0(i+6) \
13588+ XO3(i,0) \
13589+ XO3(i+1,1) \
13590+ XO3(i+2,2) \
13591+ XO3(i+3,3) \
13592+ XO4(i,0) \
13593+ XO4(i+1,1) \
13594+ XO4(i+2,2) \
13595+ XO4(i+3,3) \
13596+ ST(i,0) \
13597+ ST(i+1,1) \
13598+ ST(i+2,2) \
13599+ ST(i+3,3) \
13600+
13601+
13602+ PF0(0)
13603+ PF0(2)
13604+
13605+ " .align 32,0x90 ;\n"
13606+ " 1: ;\n"
13607+
13608+ BLOCK(0)
13609+ BLOCK(4)
13610+ BLOCK(8)
13611+ BLOCK(12)
13612+
13613+ " addl $256, %1 ;\n"
13614+ " addl $256, %2 ;\n"
13615+ " addl $256, %3 ;\n"
13616+ " addl $256, %4 ;\n"
13617+ " addl $256, %5 ;\n"
13618+ " decl %0 ;\n"
13619+ " jnz 1b ;\n"
13620+
13621+ :
13622+ : "r" (lines),
13623+ "r" (bh_ptr[0]->b_data),
13624+ "r" (bh_ptr[1]->b_data),
13625+ "r" (bh_ptr[2]->b_data),
13626+ "r" (bh_ptr[3]->b_data),
13627+ "r" (bh_ptr[4]->b_data)
13628+ : "memory");
13629+ break;
13630+ }
13631+
13632+ __asm__ __volatile__ (
13633+ "sfence ;\n\t"
13634+ "movups (%1),%%xmm0 ;\n\t"
13635+ "movups 0x10(%1),%%xmm1 ;\n\t"
13636+ "movups 0x20(%1),%%xmm2 ;\n\t"
13637+ "movups 0x30(%1),%%xmm3 ;\n\t"
13638+ "movl %0,%%cr0 ;\n\t"
13639+ :
13640+ : "r" (cr0), "r" (xmm_save)
13641+ : "memory" );
13642+}
13643+
13644+#undef OFFS
13645+#undef LD
13646+#undef ST
13647+#undef PF0
13648+#undef PF1
13649+#undef PF2
13650+#undef PF3
13651+#undef PF4
13652+#undef PF5
13653+#undef XO1
13654+#undef XO2
13655+#undef XO3
13656+#undef XO4
13657+#undef XO5
13658+#undef BLOCK
13659+
13660+#endif /* CONFIG_X86_XMM */
13661+
13662+/*
13663+ * high-speed RAID5 checksumming functions utilizing MMX instructions
13664+ * Copyright (C) 1998 Ingo Molnar
13665+ */
13666+XORBLOCK_TEMPLATE(pII_mmx)
13667+{
13668+ char fpu_save[108];
13669+ int lines = (bh_ptr[0]->b_size>>7);
13670+
13671+ if (!(current->flags & PF_USEDFPU))
13672+ __asm__ __volatile__ ( " clts;\n");
13673+
13674+ __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
13675+
13676+#define LD(x,y) \
13677+ " movq 8*("#x")(%1), %%mm"#y" ;\n"
13678+#define ST(x,y) \
13679+ " movq %%mm"#y", 8*("#x")(%1) ;\n"
13680+#define XO1(x,y) \
13681+ " pxor 8*("#x")(%2), %%mm"#y" ;\n"
13682+#define XO2(x,y) \
13683+ " pxor 8*("#x")(%3), %%mm"#y" ;\n"
13684+#define XO3(x,y) \
13685+ " pxor 8*("#x")(%4), %%mm"#y" ;\n"
13686+#define XO4(x,y) \
13687+ " pxor 8*("#x")(%5), %%mm"#y" ;\n"
13688+
13689+ switch(count) {
13690+ case 2:
13691+ __asm__ __volatile__ (
13692+#undef BLOCK
13693+#define BLOCK(i) \
13694+ LD(i,0) \
13695+ LD(i+1,1) \
13696+ LD(i+2,2) \
13697+ LD(i+3,3) \
13698+ XO1(i,0) \
13699+ ST(i,0) \
13700+ XO1(i+1,1) \
13701+ ST(i+1,1) \
13702+ XO1(i+2,2) \
13703+ ST(i+2,2) \
13704+ XO1(i+3,3) \
13705+ ST(i+3,3)
13706+
13707+ " .align 32,0x90 ;\n"
13708+ " 1: ;\n"
13709+
13710+ BLOCK(0)
13711+ BLOCK(4)
13712+ BLOCK(8)
13713+ BLOCK(12)
13714+
13715+ " addl $128, %1 ;\n"
13716+ " addl $128, %2 ;\n"
13717+ " decl %0 ;\n"
13718+ " jnz 1b ;\n"
13719+ :
13720+ : "r" (lines),
13721+ "r" (bh_ptr[0]->b_data),
13722+ "r" (bh_ptr[1]->b_data)
13723+ : "memory");
13724+ break;
13725+ case 3:
13726+ __asm__ __volatile__ (
13727+#undef BLOCK
13728+#define BLOCK(i) \
13729+ LD(i,0) \
13730+ LD(i+1,1) \
13731+ LD(i+2,2) \
13732+ LD(i+3,3) \
13733+ XO1(i,0) \
13734+ XO1(i+1,1) \
13735+ XO1(i+2,2) \
13736+ XO1(i+3,3) \
13737+ XO2(i,0) \
13738+ ST(i,0) \
13739+ XO2(i+1,1) \
13740+ ST(i+1,1) \
13741+ XO2(i+2,2) \
13742+ ST(i+2,2) \
13743+ XO2(i+3,3) \
13744+ ST(i+3,3)
13745+
13746+ " .align 32,0x90 ;\n"
13747+ " 1: ;\n"
13748+
13749+ BLOCK(0)
13750+ BLOCK(4)
13751+ BLOCK(8)
13752+ BLOCK(12)
13753+
13754+ " addl $128, %1 ;\n"
13755+ " addl $128, %2 ;\n"
13756+ " addl $128, %3 ;\n"
13757+ " decl %0 ;\n"
13758+ " jnz 1b ;\n"
13759+ :
13760+ : "r" (lines),
13761+ "r" (bh_ptr[0]->b_data),
13762+ "r" (bh_ptr[1]->b_data),
13763+ "r" (bh_ptr[2]->b_data)
13764+ : "memory");
13765+ break;
13766+ case 4:
13767+ __asm__ __volatile__ (
13768+#undef BLOCK
13769+#define BLOCK(i) \
13770+ LD(i,0) \
13771+ LD(i+1,1) \
13772+ LD(i+2,2) \
13773+ LD(i+3,3) \
13774+ XO1(i,0) \
13775+ XO1(i+1,1) \
13776+ XO1(i+2,2) \
13777+ XO1(i+3,3) \
13778+ XO2(i,0) \
13779+ XO2(i+1,1) \
13780+ XO2(i+2,2) \
13781+ XO2(i+3,3) \
13782+ XO3(i,0) \
13783+ ST(i,0) \
13784+ XO3(i+1,1) \
13785+ ST(i+1,1) \
13786+ XO3(i+2,2) \
13787+ ST(i+2,2) \
13788+ XO3(i+3,3) \
13789+ ST(i+3,3)
13790+
13791+ " .align 32,0x90 ;\n"
13792+ " 1: ;\n"
13793+
13794+ BLOCK(0)
13795+ BLOCK(4)
13796+ BLOCK(8)
13797+ BLOCK(12)
13798+
13799+ " addl $128, %1 ;\n"
13800+ " addl $128, %2 ;\n"
13801+ " addl $128, %3 ;\n"
13802+ " addl $128, %4 ;\n"
13803+ " decl %0 ;\n"
13804+ " jnz 1b ;\n"
13805+ :
13806+ : "r" (lines),
13807+ "r" (bh_ptr[0]->b_data),
13808+ "r" (bh_ptr[1]->b_data),
13809+ "r" (bh_ptr[2]->b_data),
13810+ "r" (bh_ptr[3]->b_data)
13811+ : "memory");
13812+ break;
13813+ case 5:
13814+ __asm__ __volatile__ (
13815+#undef BLOCK
13816+#define BLOCK(i) \
13817+ LD(i,0) \
13818+ LD(i+1,1) \
13819+ LD(i+2,2) \
13820+ LD(i+3,3) \
13821+ XO1(i,0) \
13822+ XO1(i+1,1) \
13823+ XO1(i+2,2) \
13824+ XO1(i+3,3) \
13825+ XO2(i,0) \
13826+ XO2(i+1,1) \
13827+ XO2(i+2,2) \
13828+ XO2(i+3,3) \
13829+ XO3(i,0) \
13830+ XO3(i+1,1) \
13831+ XO3(i+2,2) \
13832+ XO3(i+3,3) \
13833+ XO4(i,0) \
13834+ ST(i,0) \
13835+ XO4(i+1,1) \
13836+ ST(i+1,1) \
13837+ XO4(i+2,2) \
13838+ ST(i+2,2) \
13839+ XO4(i+3,3) \
13840+ ST(i+3,3)
13841+
13842+ " .align 32,0x90 ;\n"
13843+ " 1: ;\n"
13844+
13845+ BLOCK(0)
13846+ BLOCK(4)
13847+ BLOCK(8)
13848+ BLOCK(12)
13849+
13850+ " addl $128, %1 ;\n"
13851+ " addl $128, %2 ;\n"
13852+ " addl $128, %3 ;\n"
13853+ " addl $128, %4 ;\n"
13854+ " addl $128, %5 ;\n"
13855+ " decl %0 ;\n"
13856+ " jnz 1b ;\n"
13857+ :
13858+ : "r" (lines),
13859+ "r" (bh_ptr[0]->b_data),
13860+ "r" (bh_ptr[1]->b_data),
13861+ "r" (bh_ptr[2]->b_data),
13862+ "r" (bh_ptr[3]->b_data),
13863+ "r" (bh_ptr[4]->b_data)
13864+ : "memory");
13865+ break;
13866+ }
13867+
13868+ __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
13869+
13870+ if (!(current->flags & PF_USEDFPU))
13871+ stts();
13872+}
13873+
13874+#undef LD
13875+#undef XO1
13876+#undef XO2
13877+#undef XO3
13878+#undef XO4
13879+#undef ST
13880+#undef BLOCK
13881+
13882+XORBLOCK_TEMPLATE(p5_mmx)
13883+{
13884+ char fpu_save[108];
13885+ int lines = (bh_ptr[0]->b_size>>6);
13886+
13887+ if (!(current->flags & PF_USEDFPU))
13888+ __asm__ __volatile__ ( " clts;\n");
13889+
13890+ __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
13891+
13892+ switch(count) {
13893+ case 2:
13894+ __asm__ __volatile__ (
13895+
13896+ " .align 32,0x90 ;\n"
13897+ " 1: ;\n"
13898+ " movq (%1), %%mm0 ;\n"
13899+ " movq 8(%1), %%mm1 ;\n"
13900+ " pxor (%2), %%mm0 ;\n"
13901+ " movq 16(%1), %%mm2 ;\n"
13902+ " movq %%mm0, (%1) ;\n"
13903+ " pxor 8(%2), %%mm1 ;\n"
13904+ " movq 24(%1), %%mm3 ;\n"
13905+ " movq %%mm1, 8(%1) ;\n"
13906+ " pxor 16(%2), %%mm2 ;\n"
13907+ " movq 32(%1), %%mm4 ;\n"
13908+ " movq %%mm2, 16(%1) ;\n"
13909+ " pxor 24(%2), %%mm3 ;\n"
13910+ " movq 40(%1), %%mm5 ;\n"
13911+ " movq %%mm3, 24(%1) ;\n"
13912+ " pxor 32(%2), %%mm4 ;\n"
13913+ " movq 48(%1), %%mm6 ;\n"
13914+ " movq %%mm4, 32(%1) ;\n"
13915+ " pxor 40(%2), %%mm5 ;\n"
13916+ " movq 56(%1), %%mm7 ;\n"
13917+ " movq %%mm5, 40(%1) ;\n"
13918+ " pxor 48(%2), %%mm6 ;\n"
13919+ " pxor 56(%2), %%mm7 ;\n"
13920+ " movq %%mm6, 48(%1) ;\n"
13921+ " movq %%mm7, 56(%1) ;\n"
13922+
13923+ " addl $64, %1 ;\n"
13924+ " addl $64, %2 ;\n"
13925+ " decl %0 ;\n"
13926+ " jnz 1b ;\n"
13927+
13928+ :
13929+ : "r" (lines),
13930+ "r" (bh_ptr[0]->b_data),
13931+ "r" (bh_ptr[1]->b_data)
13932+ : "memory" );
13933+ break;
13934+ case 3:
13935+ __asm__ __volatile__ (
13936+
13937+ " .align 32,0x90 ;\n"
13938+ " 1: ;\n"
13939+ " movq (%1), %%mm0 ;\n"
13940+ " movq 8(%1), %%mm1 ;\n"
13941+ " pxor (%2), %%mm0 ;\n"
13942+ " movq 16(%1), %%mm2 ;\n"
13943+ " pxor 8(%2), %%mm1 ;\n"
13944+ " pxor (%3), %%mm0 ;\n"
13945+ " pxor 16(%2), %%mm2 ;\n"
13946+ " movq %%mm0, (%1) ;\n"
13947+ " pxor 8(%3), %%mm1 ;\n"
13948+ " pxor 16(%3), %%mm2 ;\n"
13949+ " movq 24(%1), %%mm3 ;\n"
13950+ " movq %%mm1, 8(%1) ;\n"
13951+ " movq 32(%1), %%mm4 ;\n"
13952+ " movq 40(%1), %%mm5 ;\n"
13953+ " pxor 24(%2), %%mm3 ;\n"
13954+ " movq %%mm2, 16(%1) ;\n"
13955+ " pxor 32(%2), %%mm4 ;\n"
13956+ " pxor 24(%3), %%mm3 ;\n"
13957+ " pxor 40(%2), %%mm5 ;\n"
13958+ " movq %%mm3, 24(%1) ;\n"
13959+ " pxor 32(%3), %%mm4 ;\n"
13960+ " pxor 40(%3), %%mm5 ;\n"
13961+ " movq 48(%1), %%mm6 ;\n"
13962+ " movq %%mm4, 32(%1) ;\n"
13963+ " movq 56(%1), %%mm7 ;\n"
13964+ " pxor 48(%2), %%mm6 ;\n"
13965+ " movq %%mm5, 40(%1) ;\n"
13966+ " pxor 56(%2), %%mm7 ;\n"
13967+ " pxor 48(%3), %%mm6 ;\n"
13968+ " pxor 56(%3), %%mm7 ;\n"
13969+ " movq %%mm6, 48(%1) ;\n"
13970+ " movq %%mm7, 56(%1) ;\n"
13971+
13972+ " addl $64, %1 ;\n"
13973+ " addl $64, %2 ;\n"
13974+ " addl $64, %3 ;\n"
13975+ " decl %0 ;\n"
13976+ " jnz 1b ;\n"
13977+
13978+ :
13979+ : "r" (lines),
13980+ "r" (bh_ptr[0]->b_data),
13981+ "r" (bh_ptr[1]->b_data),
13982+ "r" (bh_ptr[2]->b_data)
13983+ : "memory" );
13984+ break;
13985+ case 4:
13986+ __asm__ __volatile__ (
13987+
13988+ " .align 32,0x90 ;\n"
13989+ " 1: ;\n"
13990+ " movq (%1), %%mm0 ;\n"
13991+ " movq 8(%1), %%mm1 ;\n"
13992+ " pxor (%2), %%mm0 ;\n"
13993+ " movq 16(%1), %%mm2 ;\n"
13994+ " pxor 8(%2), %%mm1 ;\n"
13995+ " pxor (%3), %%mm0 ;\n"
13996+ " pxor 16(%2), %%mm2 ;\n"
13997+ " pxor 8(%3), %%mm1 ;\n"
13998+ " pxor (%4), %%mm0 ;\n"
13999+ " movq 24(%1), %%mm3 ;\n"
14000+ " pxor 16(%3), %%mm2 ;\n"
14001+ " pxor 8(%4), %%mm1 ;\n"
14002+ " movq %%mm0, (%1) ;\n"
14003+ " movq 32(%1), %%mm4 ;\n"
14004+ " pxor 24(%2), %%mm3 ;\n"
14005+ " pxor 16(%4), %%mm2 ;\n"
14006+ " movq %%mm1, 8(%1) ;\n"
14007+ " movq 40(%1), %%mm5 ;\n"
14008+ " pxor 32(%2), %%mm4 ;\n"
14009+ " pxor 24(%3), %%mm3 ;\n"
14010+ " movq %%mm2, 16(%1) ;\n"
14011+ " pxor 40(%2), %%mm5 ;\n"
14012+ " pxor 32(%3), %%mm4 ;\n"
14013+ " pxor 24(%4), %%mm3 ;\n"
14014+ " movq %%mm3, 24(%1) ;\n"
14015+ " movq 56(%1), %%mm7 ;\n"
14016+ " movq 48(%1), %%mm6 ;\n"
14017+ " pxor 40(%3), %%mm5 ;\n"
14018+ " pxor 32(%4), %%mm4 ;\n"
14019+ " pxor 48(%2), %%mm6 ;\n"
14020+ " movq %%mm4, 32(%1) ;\n"
14021+ " pxor 56(%2), %%mm7 ;\n"
14022+ " pxor 40(%4), %%mm5 ;\n"
14023+ " pxor 48(%3), %%mm6 ;\n"
14024+ " pxor 56(%3), %%mm7 ;\n"
14025+ " movq %%mm5, 40(%1) ;\n"
14026+ " pxor 48(%4), %%mm6 ;\n"
14027+ " pxor 56(%4), %%mm7 ;\n"
14028+ " movq %%mm6, 48(%1) ;\n"
14029+ " movq %%mm7, 56(%1) ;\n"
14030+
14031+ " addl $64, %1 ;\n"
14032+ " addl $64, %2 ;\n"
14033+ " addl $64, %3 ;\n"
14034+ " addl $64, %4 ;\n"
14035+ " decl %0 ;\n"
14036+ " jnz 1b ;\n"
14037+
14038+ :
14039+ : "r" (lines),
14040+ "r" (bh_ptr[0]->b_data),
14041+ "r" (bh_ptr[1]->b_data),
14042+ "r" (bh_ptr[2]->b_data),
14043+ "r" (bh_ptr[3]->b_data)
14044+ : "memory" );
14045+ break;
14046+ case 5:
14047+ __asm__ __volatile__ (
14048+
14049+ " .align 32,0x90 ;\n"
14050+ " 1: ;\n"
14051+ " movq (%1), %%mm0 ;\n"
14052+ " movq 8(%1), %%mm1 ;\n"
14053+ " pxor (%2), %%mm0 ;\n"
14054+ " pxor 8(%2), %%mm1 ;\n"
14055+ " movq 16(%1), %%mm2 ;\n"
14056+ " pxor (%3), %%mm0 ;\n"
14057+ " pxor 8(%3), %%mm1 ;\n"
14058+ " pxor 16(%2), %%mm2 ;\n"
14059+ " pxor (%4), %%mm0 ;\n"
14060+ " pxor 8(%4), %%mm1 ;\n"
14061+ " pxor 16(%3), %%mm2 ;\n"
14062+ " movq 24(%1), %%mm3 ;\n"
14063+ " pxor (%5), %%mm0 ;\n"
14064+ " pxor 8(%5), %%mm1 ;\n"
14065+ " movq %%mm0, (%1) ;\n"
14066+ " pxor 16(%4), %%mm2 ;\n"
14067+ " pxor 24(%2), %%mm3 ;\n"
14068+ " movq %%mm1, 8(%1) ;\n"
14069+ " pxor 16(%5), %%mm2 ;\n"
14070+ " pxor 24(%3), %%mm3 ;\n"
14071+ " movq 32(%1), %%mm4 ;\n"
14072+ " movq %%mm2, 16(%1) ;\n"
14073+ " pxor 24(%4), %%mm3 ;\n"
14074+ " pxor 32(%2), %%mm4 ;\n"
14075+ " movq 40(%1), %%mm5 ;\n"
14076+ " pxor 24(%5), %%mm3 ;\n"
14077+ " pxor 32(%3), %%mm4 ;\n"
14078+ " pxor 40(%2), %%mm5 ;\n"
14079+ " movq %%mm3, 24(%1) ;\n"
14080+ " pxor 32(%4), %%mm4 ;\n"
14081+ " pxor 40(%3), %%mm5 ;\n"
14082+ " movq 48(%1), %%mm6 ;\n"
14083+ " movq 56(%1), %%mm7 ;\n"
14084+ " pxor 32(%5), %%mm4 ;\n"
14085+ " pxor 40(%4), %%mm5 ;\n"
14086+ " pxor 48(%2), %%mm6 ;\n"
14087+ " pxor 56(%2), %%mm7 ;\n"
14088+ " movq %%mm4, 32(%1) ;\n"
14089+ " pxor 48(%3), %%mm6 ;\n"
14090+ " pxor 56(%3), %%mm7 ;\n"
14091+ " pxor 40(%5), %%mm5 ;\n"
14092+ " pxor 48(%4), %%mm6 ;\n"
14093+ " pxor 56(%4), %%mm7 ;\n"
14094+ " movq %%mm5, 40(%1) ;\n"
14095+ " pxor 48(%5), %%mm6 ;\n"
14096+ " pxor 56(%5), %%mm7 ;\n"
14097+ " movq %%mm6, 48(%1) ;\n"
14098+ " movq %%mm7, 56(%1) ;\n"
14099+
14100+ " addl $64, %1 ;\n"
14101+ " addl $64, %2 ;\n"
14102+ " addl $64, %3 ;\n"
14103+ " addl $64, %4 ;\n"
14104+ " addl $64, %5 ;\n"
14105+ " decl %0 ;\n"
14106+ " jnz 1b ;\n"
14107+
14108+ :
14109+ : "r" (lines),
14110+ "r" (bh_ptr[0]->b_data),
14111+ "r" (bh_ptr[1]->b_data),
14112+ "r" (bh_ptr[2]->b_data),
14113+ "r" (bh_ptr[3]->b_data),
14114+ "r" (bh_ptr[4]->b_data)
14115+ : "memory" );
14116+ break;
14117+ }
14118+
14119+ __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
14120+
14121+ if (!(current->flags & PF_USEDFPU))
14122+ stts();
14123+}
14124+#endif /* __i386__ */
14125+#endif /* !__sparc_v9__ */
14126+
14127+#ifdef __sparc_v9__
14128+/*
14129+ * High speed xor_block operation for RAID4/5 utilizing the
14130+ * UltraSparc Visual Instruction Set.
14131+ *
14132+ * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
14133+ *
14134+ * Requirements:
14135+ * !(((long)dest | (long)sourceN) & (64 - 1)) &&
14136+ * !(len & 127) && len >= 256
14137+ *
14138+ * It is done in pure assembly, as otherwise gcc makes it
14139+ * a non-leaf function, which is not what we want.
14140+ * Also, we don't measure the speeds as on other architectures,
14141+ * as the measuring routine does not take into account cold caches
14142+ * and the fact that xor_block_VIS bypasses the caches.
14143+ * xor_block_32regs might be 5% faster for count 2 if caches are hot
14144+ * and things are just right (for count 3 VIS is about as fast as 32regs for
14145+ * hot caches, and for count 4 and 5 VIS is always faster by a good margin),
14146+ * but I think it is better not to pollute the caches.
14147+ * Actually, if I'd just fight for speed for hot caches, I could
14148+ * write a hybrid VIS/integer routine, which would always do two
14149+ * 64B blocks in VIS and two in IEUs, but I really care more about
14150+ * caches.
14151+ */
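(Editorial sketch, not part of the original patch: the requirements listed in the
comment above can be restated as a plain-C predicate. The helper name vis_xor_ok
is hypothetical; a caller would have to satisfy exactly this check before
dispatching to xor_block_VIS.)

	static int vis_xor_ok(unsigned int count, struct buffer_head **bh_ptr)
	{
		unsigned long addrs = 0;	/* OR of all buffer addresses */
		int len = bh_ptr[0]->b_size;
		unsigned int i;

		for (i = 0; i < count; i++)
			addrs |= (unsigned long) bh_ptr[i]->b_data;
		/* every buffer 64-byte aligned, length a multiple of 128, at least 256 bytes */
		return !(addrs & (64 - 1)) && !(len & 127) && len >= 256;
	}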
14152+extern void *VISenter(void);
14153+extern void xor_block_VIS XOR_ARGS;
14154+
14155+void __xor_block_VIS(void)
14156+{
14157+__asm__ ("
14158+ .globl xor_block_VIS
14159+xor_block_VIS:
14160+ ldx [%%o1 + 0], %%o4
14161+ ldx [%%o1 + 8], %%o3
14162+ ldx [%%o4 + %1], %%g5
14163+ ldx [%%o4 + %0], %%o4
14164+ ldx [%%o3 + %0], %%o3
14165+ rd %%fprs, %%o5
14166+ andcc %%o5, %2, %%g0
14167+ be,pt %%icc, 297f
14168+ sethi %%hi(%5), %%g1
14169+ jmpl %%g1 + %%lo(%5), %%g7
14170+ add %%g7, 8, %%g7
14171+297: wr %%g0, %4, %%fprs
14172+ membar #LoadStore|#StoreLoad|#StoreStore
14173+ sub %%g5, 64, %%g5
14174+ ldda [%%o4] %3, %%f0
14175+ ldda [%%o3] %3, %%f16
14176+ cmp %%o0, 4
14177+ bgeu,pt %%xcc, 10f
14178+ cmp %%o0, 3
14179+ be,pn %%xcc, 13f
14180+ mov -64, %%g1
14181+ sub %%g5, 64, %%g5
14182+ rd %%asi, %%g1
14183+ wr %%g0, %3, %%asi
14184+
14185+2: ldda [%%o4 + 64] %%asi, %%f32
14186+ fxor %%f0, %%f16, %%f16
14187+ fxor %%f2, %%f18, %%f18
14188+ fxor %%f4, %%f20, %%f20
14189+ fxor %%f6, %%f22, %%f22
14190+ fxor %%f8, %%f24, %%f24
14191+ fxor %%f10, %%f26, %%f26
14192+ fxor %%f12, %%f28, %%f28
14193+ fxor %%f14, %%f30, %%f30
14194+ stda %%f16, [%%o4] %3
14195+ ldda [%%o3 + 64] %%asi, %%f48
14196+ ldda [%%o4 + 128] %%asi, %%f0
14197+ fxor %%f32, %%f48, %%f48
14198+ fxor %%f34, %%f50, %%f50
14199+ add %%o4, 128, %%o4
14200+ fxor %%f36, %%f52, %%f52
14201+ add %%o3, 128, %%o3
14202+ fxor %%f38, %%f54, %%f54
14203+ subcc %%g5, 128, %%g5
14204+ fxor %%f40, %%f56, %%f56
14205+ fxor %%f42, %%f58, %%f58
14206+ fxor %%f44, %%f60, %%f60
14207+ fxor %%f46, %%f62, %%f62
14208+ stda %%f48, [%%o4 - 64] %%asi
14209+ bne,pt %%xcc, 2b
14210+ ldda [%%o3] %3, %%f16
14211+
14212+ ldda [%%o4 + 64] %%asi, %%f32
14213+ fxor %%f0, %%f16, %%f16
14214+ fxor %%f2, %%f18, %%f18
14215+ fxor %%f4, %%f20, %%f20
14216+ fxor %%f6, %%f22, %%f22
14217+ fxor %%f8, %%f24, %%f24
14218+ fxor %%f10, %%f26, %%f26
14219+ fxor %%f12, %%f28, %%f28
14220+ fxor %%f14, %%f30, %%f30
14221+ stda %%f16, [%%o4] %3
14222+ ldda [%%o3 + 64] %%asi, %%f48
14223+ membar #Sync
14224+ fxor %%f32, %%f48, %%f48
14225+ fxor %%f34, %%f50, %%f50
14226+ fxor %%f36, %%f52, %%f52
14227+ fxor %%f38, %%f54, %%f54
14228+ fxor %%f40, %%f56, %%f56
14229+ fxor %%f42, %%f58, %%f58
14230+ fxor %%f44, %%f60, %%f60
14231+ fxor %%f46, %%f62, %%f62
14232+ stda %%f48, [%%o4 + 64] %%asi
14233+ membar #Sync|#StoreStore|#StoreLoad
14234+ wr %%g0, 0, %%fprs
14235+ retl
14236+ wr %%g1, %%g0, %%asi
14237+
14238+13: ldx [%%o1 + 16], %%o2
14239+ ldx [%%o2 + %0], %%o2
14240+
14241+3: ldda [%%o2] %3, %%f32
14242+ fxor %%f0, %%f16, %%f48
14243+ fxor %%f2, %%f18, %%f50
14244+ add %%o4, 64, %%o4
14245+ fxor %%f4, %%f20, %%f52
14246+ fxor %%f6, %%f22, %%f54
14247+ add %%o3, 64, %%o3
14248+ fxor %%f8, %%f24, %%f56
14249+ fxor %%f10, %%f26, %%f58
14250+ fxor %%f12, %%f28, %%f60
14251+ fxor %%f14, %%f30, %%f62
14252+ ldda [%%o4] %3, %%f0
14253+ fxor %%f48, %%f32, %%f48
14254+ fxor %%f50, %%f34, %%f50
14255+ fxor %%f52, %%f36, %%f52
14256+ fxor %%f54, %%f38, %%f54
14257+ add %%o2, 64, %%o2
14258+ fxor %%f56, %%f40, %%f56
14259+ fxor %%f58, %%f42, %%f58
14260+ subcc %%g5, 64, %%g5
14261+ fxor %%f60, %%f44, %%f60
14262+ fxor %%f62, %%f46, %%f62
14263+ stda %%f48, [%%o4 + %%g1] %3
14264+ bne,pt %%xcc, 3b
14265+ ldda [%%o3] %3, %%f16
14266+
14267+ ldda [%%o2] %3, %%f32
14268+ fxor %%f0, %%f16, %%f48
14269+ fxor %%f2, %%f18, %%f50
14270+ fxor %%f4, %%f20, %%f52
14271+ fxor %%f6, %%f22, %%f54
14272+ fxor %%f8, %%f24, %%f56
14273+ fxor %%f10, %%f26, %%f58
14274+ fxor %%f12, %%f28, %%f60
14275+ fxor %%f14, %%f30, %%f62
14276+ membar #Sync
14277+ fxor %%f48, %%f32, %%f48
14278+ fxor %%f50, %%f34, %%f50
14279+ fxor %%f52, %%f36, %%f52
14280+ fxor %%f54, %%f38, %%f54
14281+ fxor %%f56, %%f40, %%f56
14282+ fxor %%f58, %%f42, %%f58
14283+ fxor %%f60, %%f44, %%f60
14284+ fxor %%f62, %%f46, %%f62
14285+ stda %%f48, [%%o4] %3
14286+ membar #Sync|#StoreStore|#StoreLoad
14287+ retl
14288+ wr %%g0, 0, %%fprs
14289+
14290+10: cmp %%o0, 5
14291+ be,pt %%xcc, 15f
14292+ mov -64, %%g1
14293+
14294+14: ldx [%%o1 + 16], %%o2
14295+ ldx [%%o1 + 24], %%o0
14296+ ldx [%%o2 + %0], %%o2
14297+ ldx [%%o0 + %0], %%o0
14298+
14299+4: ldda [%%o2] %3, %%f32
14300+ fxor %%f0, %%f16, %%f16
14301+ fxor %%f2, %%f18, %%f18
14302+ add %%o4, 64, %%o4
14303+ fxor %%f4, %%f20, %%f20
14304+ fxor %%f6, %%f22, %%f22
14305+ add %%o3, 64, %%o3
14306+ fxor %%f8, %%f24, %%f24
14307+ fxor %%f10, %%f26, %%f26
14308+ fxor %%f12, %%f28, %%f28
14309+ fxor %%f14, %%f30, %%f30
14310+ ldda [%%o0] %3, %%f48
14311+ fxor %%f16, %%f32, %%f32
14312+ fxor %%f18, %%f34, %%f34
14313+ fxor %%f20, %%f36, %%f36
14314+ fxor %%f22, %%f38, %%f38
14315+ add %%o2, 64, %%o2
14316+ fxor %%f24, %%f40, %%f40
14317+ fxor %%f26, %%f42, %%f42
14318+ fxor %%f28, %%f44, %%f44
14319+ fxor %%f30, %%f46, %%f46
14320+ ldda [%%o4] %3, %%f0
14321+ fxor %%f32, %%f48, %%f48
14322+ fxor %%f34, %%f50, %%f50
14323+ fxor %%f36, %%f52, %%f52
14324+ add %%o0, 64, %%o0
14325+ fxor %%f38, %%f54, %%f54
14326+ fxor %%f40, %%f56, %%f56
14327+ fxor %%f42, %%f58, %%f58
14328+ subcc %%g5, 64, %%g5
14329+ fxor %%f44, %%f60, %%f60
14330+ fxor %%f46, %%f62, %%f62
14331+ stda %%f48, [%%o4 + %%g1] %3
14332+ bne,pt %%xcc, 4b
14333+ ldda [%%o3] %3, %%f16
14334+
14335+ ldda [%%o2] %3, %%f32
14336+ fxor %%f0, %%f16, %%f16
14337+ fxor %%f2, %%f18, %%f18
14338+ fxor %%f4, %%f20, %%f20
14339+ fxor %%f6, %%f22, %%f22
14340+ fxor %%f8, %%f24, %%f24
14341+ fxor %%f10, %%f26, %%f26
14342+ fxor %%f12, %%f28, %%f28
14343+ fxor %%f14, %%f30, %%f30
14344+ ldda [%%o0] %3, %%f48
14345+ fxor %%f16, %%f32, %%f32
14346+ fxor %%f18, %%f34, %%f34
14347+ fxor %%f20, %%f36, %%f36
14348+ fxor %%f22, %%f38, %%f38
14349+ fxor %%f24, %%f40, %%f40
14350+ fxor %%f26, %%f42, %%f42
14351+ fxor %%f28, %%f44, %%f44
14352+ fxor %%f30, %%f46, %%f46
14353+ membar #Sync
14354+ fxor %%f32, %%f48, %%f48
14355+ fxor %%f34, %%f50, %%f50
14356+ fxor %%f36, %%f52, %%f52
14357+ fxor %%f38, %%f54, %%f54
14358+ fxor %%f40, %%f56, %%f56
14359+ fxor %%f42, %%f58, %%f58
14360+ fxor %%f44, %%f60, %%f60
14361+ fxor %%f46, %%f62, %%f62
14362+ stda %%f48, [%%o4] %3
14363+ membar #Sync|#StoreStore|#StoreLoad
14364+ retl
14365+ wr %%g0, 0, %%fprs
14366+
14367+15: ldx [%%o1 + 16], %%o2
14368+ ldx [%%o1 + 24], %%o0
14369+ ldx [%%o1 + 32], %%o1
14370+ ldx [%%o2 + %0], %%o2
14371+ ldx [%%o0 + %0], %%o0
14372+ ldx [%%o1 + %0], %%o1
14373+
14374+5: ldda [%%o2] %3, %%f32
14375+ fxor %%f0, %%f16, %%f48
14376+ fxor %%f2, %%f18, %%f50
14377+ add %%o4, 64, %%o4
14378+ fxor %%f4, %%f20, %%f52
14379+ fxor %%f6, %%f22, %%f54
14380+ add %%o3, 64, %%o3
14381+ fxor %%f8, %%f24, %%f56
14382+ fxor %%f10, %%f26, %%f58
14383+ fxor %%f12, %%f28, %%f60
14384+ fxor %%f14, %%f30, %%f62
14385+ ldda [%%o0] %3, %%f16
14386+ fxor %%f48, %%f32, %%f48
14387+ fxor %%f50, %%f34, %%f50
14388+ fxor %%f52, %%f36, %%f52
14389+ fxor %%f54, %%f38, %%f54
14390+ add %%o2, 64, %%o2
14391+ fxor %%f56, %%f40, %%f56
14392+ fxor %%f58, %%f42, %%f58
14393+ fxor %%f60, %%f44, %%f60
14394+ fxor %%f62, %%f46, %%f62
14395+ ldda [%%o1] %3, %%f32
14396+ fxor %%f48, %%f16, %%f48
14397+ fxor %%f50, %%f18, %%f50
14398+ add %%o0, 64, %%o0
14399+ fxor %%f52, %%f20, %%f52
14400+ fxor %%f54, %%f22, %%f54
14401+ add %%o1, 64, %%o1
14402+ fxor %%f56, %%f24, %%f56
14403+ fxor %%f58, %%f26, %%f58
14404+ fxor %%f60, %%f28, %%f60
14405+ fxor %%f62, %%f30, %%f62
14406+ ldda [%%o4] %3, %%f0
14407+ fxor %%f48, %%f32, %%f48
14408+ fxor %%f50, %%f34, %%f50
14409+ fxor %%f52, %%f36, %%f52
14410+ fxor %%f54, %%f38, %%f54
14411+ fxor %%f56, %%f40, %%f56
14412+ fxor %%f58, %%f42, %%f58
14413+ subcc %%g5, 64, %%g5
14414+ fxor %%f60, %%f44, %%f60
14415+ fxor %%f62, %%f46, %%f62
14416+ stda %%f48, [%%o4 + %%g1] %3
14417+ bne,pt %%xcc, 5b
14418+ ldda [%%o3] %3, %%f16
14419+
14420+ ldda [%%o2] %3, %%f32
14421+ fxor %%f0, %%f16, %%f48
14422+ fxor %%f2, %%f18, %%f50
14423+ fxor %%f4, %%f20, %%f52
14424+ fxor %%f6, %%f22, %%f54
14425+ fxor %%f8, %%f24, %%f56
14426+ fxor %%f10, %%f26, %%f58
14427+ fxor %%f12, %%f28, %%f60
14428+ fxor %%f14, %%f30, %%f62
14429+ ldda [%%o0] %3, %%f16
14430+ fxor %%f48, %%f32, %%f48
14431+ fxor %%f50, %%f34, %%f50
14432+ fxor %%f52, %%f36, %%f52
14433+ fxor %%f54, %%f38, %%f54
14434+ fxor %%f56, %%f40, %%f56
14435+ fxor %%f58, %%f42, %%f58
14436+ fxor %%f60, %%f44, %%f60
14437+ fxor %%f62, %%f46, %%f62
14438+ ldda [%%o1] %3, %%f32
14439+ fxor %%f48, %%f16, %%f48
14440+ fxor %%f50, %%f18, %%f50
14441+ fxor %%f52, %%f20, %%f52
14442+ fxor %%f54, %%f22, %%f54
14443+ fxor %%f56, %%f24, %%f56
14444+ fxor %%f58, %%f26, %%f58
14445+ fxor %%f60, %%f28, %%f60
14446+ fxor %%f62, %%f30, %%f62
14447+ membar #Sync
14448+ fxor %%f48, %%f32, %%f48
14449+ fxor %%f50, %%f34, %%f50
14450+ fxor %%f52, %%f36, %%f52
14451+ fxor %%f54, %%f38, %%f54
14452+ fxor %%f56, %%f40, %%f56
14453+ fxor %%f58, %%f42, %%f58
14454+ fxor %%f60, %%f44, %%f60
14455+ fxor %%f62, %%f46, %%f62
14456+ stda %%f48, [%%o4] %3
14457+ membar #Sync|#StoreStore|#StoreLoad
14458+ retl
14459+ wr %%g0, 0, %%fprs
14460+ " : :
14461+ "i" (&((struct buffer_head *)0)->b_data),
14462+ "i" (&((struct buffer_head *)0)->b_size),
14463+ "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P),
14464+ "i" (FPRS_FEF), "i" (VISenter));
14465+}
14466+#endif /* __sparc_v9__ */
14467+
14468+#if defined(__sparc__) && !defined(__sparc_v9__)
14469+/*
14470+ * High speed xor_block operation for RAID4/5 utilizing the
14471+ * ldd/std SPARC instructions.
14472+ *
14473+ * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
14474+ *
14475+ */
14476+
14477+XORBLOCK_TEMPLATE(SPARC)
14478+{
14479+ int size = bh_ptr[0]->b_size;
14480+ int lines = size / (sizeof (long)) / 8, i;
14481+ long *destp = (long *) bh_ptr[0]->b_data;
14482+ long *source1 = (long *) bh_ptr[1]->b_data;
14483+ long *source2, *source3, *source4;
14484+
14485+ switch (count) {
14486+ case 2:
14487+ for (i = lines; i > 0; i--) {
14488+ __asm__ __volatile__("
14489+ ldd [%0 + 0x00], %%g2
14490+ ldd [%0 + 0x08], %%g4
14491+ ldd [%0 + 0x10], %%o0
14492+ ldd [%0 + 0x18], %%o2
14493+ ldd [%1 + 0x00], %%o4
14494+ ldd [%1 + 0x08], %%l0
14495+ ldd [%1 + 0x10], %%l2
14496+ ldd [%1 + 0x18], %%l4
14497+ xor %%g2, %%o4, %%g2
14498+ xor %%g3, %%o5, %%g3
14499+ xor %%g4, %%l0, %%g4
14500+ xor %%g5, %%l1, %%g5
14501+ xor %%o0, %%l2, %%o0
14502+ xor %%o1, %%l3, %%o1
14503+ xor %%o2, %%l4, %%o2
14504+ xor %%o3, %%l5, %%o3
14505+ std %%g2, [%0 + 0x00]
14506+ std %%g4, [%0 + 0x08]
14507+ std %%o0, [%0 + 0x10]
14508+ std %%o2, [%0 + 0x18]
14509+ " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0",
14510+ "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5");
14511+ destp += 8;
14512+ source1 += 8;
14513+ }
14514+ break;
14515+ case 3:
14516+ source2 = (long *) bh_ptr[2]->b_data;
14517+ for (i = lines; i > 0; i--) {
14518+ __asm__ __volatile__("
14519+ ldd [%0 + 0x00], %%g2
14520+ ldd [%0 + 0x08], %%g4
14521+ ldd [%0 + 0x10], %%o0
14522+ ldd [%0 + 0x18], %%o2
14523+ ldd [%1 + 0x00], %%o4
14524+ ldd [%1 + 0x08], %%l0
14525+ ldd [%1 + 0x10], %%l2
14526+ ldd [%1 + 0x18], %%l4
14527+ xor %%g2, %%o4, %%g2
14528+ xor %%g3, %%o5, %%g3
14529+ ldd [%2 + 0x00], %%o4
14530+ xor %%g4, %%l0, %%g4
14531+ xor %%g5, %%l1, %%g5
14532+ ldd [%2 + 0x08], %%l0
14533+ xor %%o0, %%l2, %%o0
14534+ xor %%o1, %%l3, %%o1
14535+ ldd [%2 + 0x10], %%l2
14536+ xor %%o2, %%l4, %%o2
14537+ xor %%o3, %%l5, %%o3
14538+ ldd [%2 + 0x18], %%l4
14539+ xor %%g2, %%o4, %%g2
14540+ xor %%g3, %%o5, %%g3
14541+ xor %%g4, %%l0, %%g4
14542+ xor %%g5, %%l1, %%g5
14543+ xor %%o0, %%l2, %%o0
14544+ xor %%o1, %%l3, %%o1
14545+ xor %%o2, %%l4, %%o2
14546+ xor %%o3, %%l5, %%o3
14547+ std %%g2, [%0 + 0x00]
14548+ std %%g4, [%0 + 0x08]
14549+ std %%o0, [%0 + 0x10]
14550+ std %%o2, [%0 + 0x18]
14551+ " : : "r" (destp), "r" (source1), "r" (source2)
14552+ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
14553+ "l0", "l1", "l2", "l3", "l4", "l5");
14554+ destp += 8;
14555+ source1 += 8;
14556+ source2 += 8;
14557+ }
14558+ break;
14559+ case 4:
14560+ source2 = (long *) bh_ptr[2]->b_data;
14561+ source3 = (long *) bh_ptr[3]->b_data;
14562+ for (i = lines; i > 0; i--) {
14563+ __asm__ __volatile__("
14564+ ldd [%0 + 0x00], %%g2
14565+ ldd [%0 + 0x08], %%g4
14566+ ldd [%0 + 0x10], %%o0
14567+ ldd [%0 + 0x18], %%o2
14568+ ldd [%1 + 0x00], %%o4
14569+ ldd [%1 + 0x08], %%l0
14570+ ldd [%1 + 0x10], %%l2
14571+ ldd [%1 + 0x18], %%l4
14572+ xor %%g2, %%o4, %%g2
14573+ xor %%g3, %%o5, %%g3
14574+ ldd [%2 + 0x00], %%o4
14575+ xor %%g4, %%l0, %%g4
14576+ xor %%g5, %%l1, %%g5
14577+ ldd [%2 + 0x08], %%l0
14578+ xor %%o0, %%l2, %%o0
14579+ xor %%o1, %%l3, %%o1
14580+ ldd [%2 + 0x10], %%l2
14581+ xor %%o2, %%l4, %%o2
14582+ xor %%o3, %%l5, %%o3
14583+ ldd [%2 + 0x18], %%l4
14584+ xor %%g2, %%o4, %%g2
14585+ xor %%g3, %%o5, %%g3
14586+ ldd [%3 + 0x00], %%o4
14587+ xor %%g4, %%l0, %%g4
14588+ xor %%g5, %%l1, %%g5
14589+ ldd [%3 + 0x08], %%l0
14590+ xor %%o0, %%l2, %%o0
14591+ xor %%o1, %%l3, %%o1
14592+ ldd [%3 + 0x10], %%l2
14593+ xor %%o2, %%l4, %%o2
14594+ xor %%o3, %%l5, %%o3
14595+ ldd [%3 + 0x18], %%l4
14596+ xor %%g2, %%o4, %%g2
14597+ xor %%g3, %%o5, %%g3
14598+ xor %%g4, %%l0, %%g4
14599+ xor %%g5, %%l1, %%g5
14600+ xor %%o0, %%l2, %%o0
14601+ xor %%o1, %%l3, %%o1
14602+ xor %%o2, %%l4, %%o2
14603+ xor %%o3, %%l5, %%o3
14604+ std %%g2, [%0 + 0x00]
14605+ std %%g4, [%0 + 0x08]
14606+ std %%o0, [%0 + 0x10]
14607+ std %%o2, [%0 + 0x18]
14608+ " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3)
14609+ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
14610+ "l0", "l1", "l2", "l3", "l4", "l5");
14611+ destp += 8;
14612+ source1 += 8;
14613+ source2 += 8;
14614+ source3 += 8;
14615+ }
14616+ break;
14617+ case 5:
14618+ source2 = (long *) bh_ptr[2]->b_data;
14619+ source3 = (long *) bh_ptr[3]->b_data;
14620+ source4 = (long *) bh_ptr[4]->b_data;
14621+ for (i = lines; i > 0; i--) {
14622+ __asm__ __volatile__("
14623+ ldd [%0 + 0x00], %%g2
14624+ ldd [%0 + 0x08], %%g4
14625+ ldd [%0 + 0x10], %%o0
14626+ ldd [%0 + 0x18], %%o2
14627+ ldd [%1 + 0x00], %%o4
14628+ ldd [%1 + 0x08], %%l0
14629+ ldd [%1 + 0x10], %%l2
14630+ ldd [%1 + 0x18], %%l4
14631+ xor %%g2, %%o4, %%g2
14632+ xor %%g3, %%o5, %%g3
14633+ ldd [%2 + 0x00], %%o4
14634+ xor %%g4, %%l0, %%g4
14635+ xor %%g5, %%l1, %%g5
14636+ ldd [%2 + 0x08], %%l0
14637+ xor %%o0, %%l2, %%o0
14638+ xor %%o1, %%l3, %%o1
14639+ ldd [%2 + 0x10], %%l2
14640+ xor %%o2, %%l4, %%o2
14641+ xor %%o3, %%l5, %%o3
14642+ ldd [%2 + 0x18], %%l4
14643+ xor %%g2, %%o4, %%g2
14644+ xor %%g3, %%o5, %%g3
14645+ ldd [%3 + 0x00], %%o4
14646+ xor %%g4, %%l0, %%g4
14647+ xor %%g5, %%l1, %%g5
14648+ ldd [%3 + 0x08], %%l0
14649+ xor %%o0, %%l2, %%o0
14650+ xor %%o1, %%l3, %%o1
14651+ ldd [%3 + 0x10], %%l2
14652+ xor %%o2, %%l4, %%o2
14653+ xor %%o3, %%l5, %%o3
14654+ ldd [%3 + 0x18], %%l4
14655+ xor %%g2, %%o4, %%g2
14656+ xor %%g3, %%o5, %%g3
14657+ ldd [%4 + 0x00], %%o4
14658+ xor %%g4, %%l0, %%g4
14659+ xor %%g5, %%l1, %%g5
14660+ ldd [%4 + 0x08], %%l0
14661+ xor %%o0, %%l2, %%o0
14662+ xor %%o1, %%l3, %%o1
14663+ ldd [%4 + 0x10], %%l2
14664+ xor %%o2, %%l4, %%o2
14665+ xor %%o3, %%l5, %%o3
14666+ ldd [%4 + 0x18], %%l4
14667+ xor %%g2, %%o4, %%g2
14668+ xor %%g3, %%o5, %%g3
14669+ xor %%g4, %%l0, %%g4
14670+ xor %%g5, %%l1, %%g5
14671+ xor %%o0, %%l2, %%o0
14672+ xor %%o1, %%l3, %%o1
14673+ xor %%o2, %%l4, %%o2
14674+ xor %%o3, %%l5, %%o3
14675+ std %%g2, [%0 + 0x00]
14676+ std %%g4, [%0 + 0x08]
14677+ std %%o0, [%0 + 0x10]
14678+ std %%o2, [%0 + 0x18]
14679+ " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4)
14680+ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
14681+ "l0", "l1", "l2", "l3", "l4", "l5");
14682+ destp += 8;
14683+ source1 += 8;
14684+ source2 += 8;
14685+ source3 += 8;
14686+ source4 += 8;
14687+ }
14688+ break;
14689+ }
14690+}
14691+#endif /* __sparc_v[78]__ */
14692+
14693+#ifndef __sparc_v9__
14694+
14695+/*
14696+ * this one works reasonably on any x86 CPU
14697+ * (send me an assembly version for inclusion if you can make it faster)
14698+ *
14699+ * this one is just as fast as written in pure assembly on x86.
14700+ * the reason for this separate version is that the
14701+ * fast open-coded xor routine "32regs" produces suboptimal code
14702+ * on x86, due to lack of registers.
14703+ */
14704+XORBLOCK_TEMPLATE(8regs)
14705+{
14706+ int len = bh_ptr[0]->b_size;
14707+ long *destp = (long *) bh_ptr[0]->b_data;
14708+ long *source1, *source2, *source3, *source4;
14709+ long lines = len / (sizeof (long)) / 8, i;
14710+
14711+ switch(count) {
14712+ case 2:
14713+ source1 = (long *) bh_ptr[1]->b_data;
14714+ for (i = lines; i > 0; i--) {
14715+ *(destp + 0) ^= *(source1 + 0);
14716+ *(destp + 1) ^= *(source1 + 1);
14717+ *(destp + 2) ^= *(source1 + 2);
14718+ *(destp + 3) ^= *(source1 + 3);
14719+ *(destp + 4) ^= *(source1 + 4);
14720+ *(destp + 5) ^= *(source1 + 5);
14721+ *(destp + 6) ^= *(source1 + 6);
14722+ *(destp + 7) ^= *(source1 + 7);
14723+ source1 += 8;
14724+ destp += 8;
14725+ }
14726+ break;
14727+ case 3:
14728+ source2 = (long *) bh_ptr[2]->b_data;
14729+ source1 = (long *) bh_ptr[1]->b_data;
14730+ for (i = lines; i > 0; i--) {
14731+ *(destp + 0) ^= *(source1 + 0);
14732+ *(destp + 0) ^= *(source2 + 0);
14733+ *(destp + 1) ^= *(source1 + 1);
14734+ *(destp + 1) ^= *(source2 + 1);
14735+ *(destp + 2) ^= *(source1 + 2);
14736+ *(destp + 2) ^= *(source2 + 2);
14737+ *(destp + 3) ^= *(source1 + 3);
14738+ *(destp + 3) ^= *(source2 + 3);
14739+ *(destp + 4) ^= *(source1 + 4);
14740+ *(destp + 4) ^= *(source2 + 4);
14741+ *(destp + 5) ^= *(source1 + 5);
14742+ *(destp + 5) ^= *(source2 + 5);
14743+ *(destp + 6) ^= *(source1 + 6);
14744+ *(destp + 6) ^= *(source2 + 6);
14745+ *(destp + 7) ^= *(source1 + 7);
14746+ *(destp + 7) ^= *(source2 + 7);
14747+ source1 += 8;
14748+ source2 += 8;
14749+ destp += 8;
14750+ }
14751+ break;
14752+ case 4:
14753+ source3 = (long *) bh_ptr[3]->b_data;
14754+ source2 = (long *) bh_ptr[2]->b_data;
14755+ source1 = (long *) bh_ptr[1]->b_data;
14756+ for (i = lines; i > 0; i--) {
14757+ *(destp + 0) ^= *(source1 + 0);
14758+ *(destp + 0) ^= *(source2 + 0);
14759+ *(destp + 0) ^= *(source3 + 0);
14760+ *(destp + 1) ^= *(source1 + 1);
14761+ *(destp + 1) ^= *(source2 + 1);
14762+ *(destp + 1) ^= *(source3 + 1);
14763+ *(destp + 2) ^= *(source1 + 2);
14764+ *(destp + 2) ^= *(source2 + 2);
14765+ *(destp + 2) ^= *(source3 + 2);
14766+ *(destp + 3) ^= *(source1 + 3);
14767+ *(destp + 3) ^= *(source2 + 3);
14768+ *(destp + 3) ^= *(source3 + 3);
14769+ *(destp + 4) ^= *(source1 + 4);
14770+ *(destp + 4) ^= *(source2 + 4);
14771+ *(destp + 4) ^= *(source3 + 4);
14772+ *(destp + 5) ^= *(source1 + 5);
14773+ *(destp + 5) ^= *(source2 + 5);
14774+ *(destp + 5) ^= *(source3 + 5);
14775+ *(destp + 6) ^= *(source1 + 6);
14776+ *(destp + 6) ^= *(source2 + 6);
14777+ *(destp + 6) ^= *(source3 + 6);
14778+ *(destp + 7) ^= *(source1 + 7);
14779+ *(destp + 7) ^= *(source2 + 7);
14780+ *(destp + 7) ^= *(source3 + 7);
14781+ source1 += 8;
14782+ source2 += 8;
14783+ source3 += 8;
14784+ destp += 8;
14785+ }
14786+ break;
14787+ case 5:
14788+ source4 = (long *) bh_ptr[4]->b_data;
14789+ source3 = (long *) bh_ptr[3]->b_data;
14790+ source2 = (long *) bh_ptr[2]->b_data;
14791+ source1 = (long *) bh_ptr[1]->b_data;
14792+ for (i = lines; i > 0; i--) {
14793+ *(destp + 0) ^= *(source1 + 0);
14794+ *(destp + 0) ^= *(source2 + 0);
14795+ *(destp + 0) ^= *(source3 + 0);
14796+ *(destp + 0) ^= *(source4 + 0);
14797+ *(destp + 1) ^= *(source1 + 1);
14798+ *(destp + 1) ^= *(source2 + 1);
14799+ *(destp + 1) ^= *(source3 + 1);
14800+ *(destp + 1) ^= *(source4 + 1);
14801+ *(destp + 2) ^= *(source1 + 2);
14802+ *(destp + 2) ^= *(source2 + 2);
14803+ *(destp + 2) ^= *(source3 + 2);
14804+ *(destp + 2) ^= *(source4 + 2);
14805+ *(destp + 3) ^= *(source1 + 3);
14806+ *(destp + 3) ^= *(source2 + 3);
14807+ *(destp + 3) ^= *(source3 + 3);
14808+ *(destp + 3) ^= *(source4 + 3);
14809+ *(destp + 4) ^= *(source1 + 4);
14810+ *(destp + 4) ^= *(source2 + 4);
14811+ *(destp + 4) ^= *(source3 + 4);
14812+ *(destp + 4) ^= *(source4 + 4);
14813+ *(destp + 5) ^= *(source1 + 5);
14814+ *(destp + 5) ^= *(source2 + 5);
14815+ *(destp + 5) ^= *(source3 + 5);
14816+ *(destp + 5) ^= *(source4 + 5);
14817+ *(destp + 6) ^= *(source1 + 6);
14818+ *(destp + 6) ^= *(source2 + 6);
14819+ *(destp + 6) ^= *(source3 + 6);
14820+ *(destp + 6) ^= *(source4 + 6);
14821+ *(destp + 7) ^= *(source1 + 7);
14822+ *(destp + 7) ^= *(source2 + 7);
14823+ *(destp + 7) ^= *(source3 + 7);
14824+ *(destp + 7) ^= *(source4 + 7);
14825+ source1 += 8;
14826+ source2 += 8;
14827+ source3 += 8;
14828+ source4 += 8;
14829+ destp += 8;
14830+ }
14831+ break;
14832+ }
14833+}
14834+
14835+/*
14836+ * platform independent RAID5 checksum calculation, this should
14837+ * be very fast on any platform that has a decent amount of
14838+ * registers. (32 or more)
14839+ */
14840+XORBLOCK_TEMPLATE(32regs)
14841+{
14842+ int size = bh_ptr[0]->b_size;
14843+ int lines = size / (sizeof (long)) / 8, i;
14844+ long *destp = (long *) bh_ptr[0]->b_data;
14845+ long *source1, *source2, *source3, *source4;
14846+
14847+ /* LOTS of registers available...
14848+	   We do explicit loop-unrolling here for code which
14849+	   favours RISC machines. In fact this is almost direct
14850+ RISC assembly on Alpha and SPARC :-) */
14851+
14852+
14853+ switch(count) {
14854+ case 2:
14855+ source1 = (long *) bh_ptr[1]->b_data;
14856+ for (i = lines; i > 0; i--) {
14857+ register long d0, d1, d2, d3, d4, d5, d6, d7;
14858+ d0 = destp[0]; /* Pull the stuff into registers */
14859+ d1 = destp[1]; /* ... in bursts, if possible. */
14860+ d2 = destp[2];
14861+ d3 = destp[3];
14862+ d4 = destp[4];
14863+ d5 = destp[5];
14864+ d6 = destp[6];
14865+ d7 = destp[7];
14866+ d0 ^= source1[0];
14867+ d1 ^= source1[1];
14868+ d2 ^= source1[2];
14869+ d3 ^= source1[3];
14870+ d4 ^= source1[4];
14871+ d5 ^= source1[5];
14872+ d6 ^= source1[6];
14873+ d7 ^= source1[7];
14874+ destp[0] = d0; /* Store the result (in bursts) */
14875+ destp[1] = d1;
14876+ destp[2] = d2;
14877+ destp[3] = d3;
14878+ destp[4] = d4; /* Store the result (in bursts) */
14879+ destp[5] = d5;
14880+ destp[6] = d6;
14881+ destp[7] = d7;
14882+ source1 += 8;
14883+ destp += 8;
14884+ }
14885+ break;
14886+ case 3:
14887+ source2 = (long *) bh_ptr[2]->b_data;
14888+ source1 = (long *) bh_ptr[1]->b_data;
14889+ for (i = lines; i > 0; i--) {
14890+ register long d0, d1, d2, d3, d4, d5, d6, d7;
14891+ d0 = destp[0]; /* Pull the stuff into registers */
14892+ d1 = destp[1]; /* ... in bursts, if possible. */
14893+ d2 = destp[2];
14894+ d3 = destp[3];
14895+ d4 = destp[4];
14896+ d5 = destp[5];
14897+ d6 = destp[6];
14898+ d7 = destp[7];
14899+ d0 ^= source1[0];
14900+ d1 ^= source1[1];
14901+ d2 ^= source1[2];
14902+ d3 ^= source1[3];
14903+ d4 ^= source1[4];
14904+ d5 ^= source1[5];
14905+ d6 ^= source1[6];
14906+ d7 ^= source1[7];
14907+ d0 ^= source2[0];
14908+ d1 ^= source2[1];
14909+ d2 ^= source2[2];
14910+ d3 ^= source2[3];
14911+ d4 ^= source2[4];
14912+ d5 ^= source2[5];
14913+ d6 ^= source2[6];
14914+ d7 ^= source2[7];
14915+ destp[0] = d0; /* Store the result (in bursts) */
14916+ destp[1] = d1;
14917+ destp[2] = d2;
14918+ destp[3] = d3;
14919+ destp[4] = d4; /* Store the result (in bursts) */
14920+ destp[5] = d5;
14921+ destp[6] = d6;
14922+ destp[7] = d7;
14923+ source1 += 8;
14924+ source2 += 8;
14925+ destp += 8;
14926+ }
14927+ break;
14928+ case 4:
14929+ source3 = (long *) bh_ptr[3]->b_data;
14930+ source2 = (long *) bh_ptr[2]->b_data;
14931+ source1 = (long *) bh_ptr[1]->b_data;
14932+ for (i = lines; i > 0; i--) {
14933+ register long d0, d1, d2, d3, d4, d5, d6, d7;
14934+ d0 = destp[0]; /* Pull the stuff into registers */
14935+ d1 = destp[1]; /* ... in bursts, if possible. */
14936+ d2 = destp[2];
14937+ d3 = destp[3];
14938+ d4 = destp[4];
14939+ d5 = destp[5];
14940+ d6 = destp[6];
14941+ d7 = destp[7];
14942+ d0 ^= source1[0];
14943+ d1 ^= source1[1];
14944+ d2 ^= source1[2];
14945+ d3 ^= source1[3];
14946+ d4 ^= source1[4];
14947+ d5 ^= source1[5];
14948+ d6 ^= source1[6];
14949+ d7 ^= source1[7];
14950+ d0 ^= source2[0];
14951+ d1 ^= source2[1];
14952+ d2 ^= source2[2];
14953+ d3 ^= source2[3];
14954+ d4 ^= source2[4];
14955+ d5 ^= source2[5];
14956+ d6 ^= source2[6];
14957+ d7 ^= source2[7];
14958+ d0 ^= source3[0];
14959+ d1 ^= source3[1];
14960+ d2 ^= source3[2];
14961+ d3 ^= source3[3];
14962+ d4 ^= source3[4];
14963+ d5 ^= source3[5];
14964+ d6 ^= source3[6];
14965+ d7 ^= source3[7];
14966+ destp[0] = d0; /* Store the result (in bursts) */
14967+ destp[1] = d1;
14968+ destp[2] = d2;
14969+ destp[3] = d3;
14970+ destp[4] = d4; /* Store the result (in bursts) */
14971+ destp[5] = d5;
14972+ destp[6] = d6;
14973+ destp[7] = d7;
14974+ source1 += 8;
14975+ source2 += 8;
14976+ source3 += 8;
14977+ destp += 8;
14978+ }
14979+ break;
14980+ case 5:
14981+ source4 = (long *) bh_ptr[4]->b_data;
14982+ source3 = (long *) bh_ptr[3]->b_data;
14983+ source2 = (long *) bh_ptr[2]->b_data;
14984+ source1 = (long *) bh_ptr[1]->b_data;
14985+ for (i = lines; i > 0; i--) {
14986+ register long d0, d1, d2, d3, d4, d5, d6, d7;
14987+ d0 = destp[0]; /* Pull the stuff into registers */
14988+ d1 = destp[1]; /* ... in bursts, if possible. */
14989+ d2 = destp[2];
14990+ d3 = destp[3];
14991+ d4 = destp[4];
14992+ d5 = destp[5];
14993+ d6 = destp[6];
14994+ d7 = destp[7];
14995+ d0 ^= source1[0];
14996+ d1 ^= source1[1];
14997+ d2 ^= source1[2];
14998+ d3 ^= source1[3];
14999+ d4 ^= source1[4];
15000+ d5 ^= source1[5];
15001+ d6 ^= source1[6];
15002+ d7 ^= source1[7];
15003+ d0 ^= source2[0];
15004+ d1 ^= source2[1];
15005+ d2 ^= source2[2];
15006+ d3 ^= source2[3];
15007+ d4 ^= source2[4];
15008+ d5 ^= source2[5];
15009+ d6 ^= source2[6];
15010+ d7 ^= source2[7];
15011+ d0 ^= source3[0];
15012+ d1 ^= source3[1];
15013+ d2 ^= source3[2];
15014+ d3 ^= source3[3];
15015+ d4 ^= source3[4];
15016+ d5 ^= source3[5];
15017+ d6 ^= source3[6];
15018+ d7 ^= source3[7];
15019+ d0 ^= source4[0];
15020+ d1 ^= source4[1];
15021+ d2 ^= source4[2];
15022+ d3 ^= source4[3];
15023+ d4 ^= source4[4];
15024+ d5 ^= source4[5];
15025+ d6 ^= source4[6];
15026+ d7 ^= source4[7];
15027+ destp[0] = d0; /* Store the result (in bursts) */
15028+ destp[1] = d1;
15029+ destp[2] = d2;
15030+ destp[3] = d3;
15031+ destp[4] = d4; /* Store the result (in bursts) */
15032+ destp[5] = d5;
15033+ destp[6] = d6;
15034+ destp[7] = d7;
15035+ source1 += 8;
15036+ source2 += 8;
15037+ source3 += 8;
15038+ source4 += 8;
15039+ destp += 8;
15040+ }
15041+ break;
15042+ }
15043+}
15044+
15045+/*
15046+ * (the -6*32 shift factor colors the cache)
15047+ */
15048+#define SIZE (PAGE_SIZE-6*32)
15049+
15050+static void xor_speed ( struct xor_block_template * func,
15051+ struct buffer_head *b1, struct buffer_head *b2)
15052+{
15053+ int speed;
15054+ unsigned long now;
15055+ int i, count, max;
15056+ struct buffer_head *bh_ptr[6];
15057+
15058+ func->next = xor_functions;
15059+ xor_functions = func;
15060+ bh_ptr[0] = b1;
15061+ bh_ptr[1] = b2;
15062+
15063+ /*
15064+ * Count the number of XORs done during a whole jiffy and
15065+ * calculate the checksumming speed from this.
15066+ * (We use an order-2 page allocation to get a guaranteed
15067+ * L1-cache color layout.)
15068+ */
15069+ max = 0;
15070+ for (i = 0; i < 5; i++) {
15071+ now = jiffies;
15072+ count = 0;
15073+ while (jiffies == now) {
15074+ mb();
15075+ func->xor_block(2,bh_ptr);
15076+ mb();
15077+ count++;
15078+ mb();
15079+ }
15080+ if (count > max)
15081+ max = count;
15082+ }
15083+
15084+ speed = max * (HZ*SIZE/1024);
15085+ func->speed = speed;
15086+
15087+ printk( " %-10s: %5d.%03d MB/sec\n", func->name,
15088+ speed / 1000, speed % 1000);
15089+}
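(Editorial note, not part of the original patch: a worked example of the
benchmark arithmetic above, assuming a 4 KB PAGE_SIZE and HZ = 100.
SIZE is PAGE_SIZE - 6*32 = 3904 bytes, so every xor_block() call checksums
3904 bytes. If the best of the five trials completes max = 1500 calls within
one jiffy:

	speed = max * (HZ*SIZE/1024)
	      = 1500 * (100*3904/1024)
	      = 1500 * 381
	      = 571500

which the printk above reports as " 571.500 MB/sec".)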
15090+
15091+static inline void pick_fastest_function(void)
15092+{
15093+ struct xor_block_template *f, *fastest;
15094+
15095+ fastest = xor_functions;
15096+ for (f = fastest; f; f = f->next) {
15097+ if (f->speed > fastest->speed)
15098+ fastest = f;
15099+ }
15100+#ifdef CONFIG_X86_XMM
15101+ if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
15102+ fastest = &t_xor_block_pIII_kni;
15103+ }
15104+#endif
15105+ xor_block = fastest->xor_block;
15106+ printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
15107+ fastest->speed / 1000, fastest->speed % 1000);
15108+}
15109+
15110+
15111+void calibrate_xor_block(void)
15112+{
15113+ struct buffer_head b1, b2;
15114+
15115+ memset(&b1,0,sizeof(b1));
15116+ b2 = b1;
15117+
15118+ b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2);
15119+ if (!b1.b_data) {
15120+ pick_fastest_function();
15121+ return;
15122+ }
15123+ b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE;
15124+
15125+ b1.b_size = SIZE;
15126+
15127+ printk(KERN_INFO "raid5: measuring checksumming speed\n");
15128+
15129+ sti(); /* should be safe */
15130+
15131+#if defined(__sparc__) && !defined(__sparc_v9__)
15132+ printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n");
15133+ xor_speed(&t_xor_block_SPARC,&b1,&b2);
15134+#endif
15135+
15136+#ifdef CONFIG_X86_XMM
15137+ if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
15138+ printk(KERN_INFO
15139+ "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n");
15140+ /* We force the use of the KNI xor block because it
15141+ can write around the L2 cache. We may also be able
15142+ to load into the L1 only, depending on how
15143+ the CPU deals with a load to a line that is
15144+ being prefetched.
15145+ */
15146+ xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
15147+ }
15148+#endif /* CONFIG_X86_XMM */
15149+
15150+#ifdef __i386__
15151+
15152+ if (md_cpu_has_mmx()) {
15153+ printk(KERN_INFO
15154+ "raid5: MMX detected, trying high-speed MMX checksum routines\n");
15155+ xor_speed(&t_xor_block_pII_mmx,&b1,&b2);
15156+ xor_speed(&t_xor_block_p5_mmx,&b1,&b2);
15157+ }
15158+
15159+#endif /* __i386__ */
15160+
15161+
15162+ xor_speed(&t_xor_block_8regs,&b1,&b2);
15163+ xor_speed(&t_xor_block_32regs,&b1,&b2);
15164+
15165+ free_pages((unsigned long)b1.b_data,2);
15166+ pick_fastest_function();
15167+}
15168+
15169+#else /* __sparc_v9__ */
15170+
15171+void calibrate_xor_block(void)
15172+{
15173+ printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n");
15174+ xor_block = xor_block_VIS;
15175+}
15176+
15177+#endif /* __sparc_v9__ */
15178+
15179+MD_EXPORT_SYMBOL(xor_block);
15180+
15181--- linux/arch/i386/defconfig.orig Mon Dec 11 01:49:41 2000
15182+++ linux/arch/i386/defconfig Tue Jan 16 13:42:04 2001
15183@@ -93,7 +93,15 @@
15184 #
15185 # CONFIG_BLK_DEV_LOOP is not set
15186 # CONFIG_BLK_DEV_NBD is not set
15187-# CONFIG_BLK_DEV_MD is not set
15188+CONFIG_BLK_DEV_MD=y
15189+CONFIG_AUTODETECT_RAID=y
15190+CONFIG_MD_TRANSLUCENT=y
15191+CONFIG_MD_LINEAR=y
15192+CONFIG_MD_STRIPED=y
15193+CONFIG_MD_MIRRORING=y
15194+CONFIG_MD_RAID5=y
15195+CONFIG_MD_BOOT=y
15196+CONFIG_BLK_DEV_HSM=y
15197 # CONFIG_BLK_DEV_RAM is not set
15198 # CONFIG_BLK_DEV_XD is not set
15199 # CONFIG_BLK_DEV_DAC960 is not set
15200--- linux/arch/sparc/config.in.orig Mon Dec 11 01:49:41 2000
15201+++ linux/arch/sparc/config.in Tue Jan 16 13:42:04 2001
15202@@ -88,10 +88,16 @@
15203
15204 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
15205 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
15206+ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
15207 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
15208 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
15209 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
15210 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
15211+ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
15212+ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
15213+fi
15214+if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
15215+ bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
15216 fi
15217
15218 tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
15219--- linux/arch/sparc/defconfig.orig Tue Jan 16 13:30:06 2001
15220+++ linux/arch/sparc/defconfig Tue Jan 16 13:42:04 2001
15221@@ -89,10 +89,13 @@
15222 #
15223 CONFIG_BLK_DEV_FD=y
15224 CONFIG_BLK_DEV_MD=y
15225+# CONFIG_AUTODETECT_RAID is not set
15226 CONFIG_MD_LINEAR=m
15227 CONFIG_MD_STRIPED=m
15228 CONFIG_MD_MIRRORING=m
15229 CONFIG_MD_RAID5=m
15230+# CONFIG_MD_TRANSLUCENT is not set
15231+# CONFIG_MD_HSM is not set
15232 CONFIG_BLK_DEV_RAM=y
15233 CONFIG_BLK_DEV_RAM_SIZE=4096
15234 CONFIG_BLK_DEV_INITRD=y
15235--- linux/arch/sparc64/config.in.orig Mon Dec 11 01:49:41 2000
15236+++ linux/arch/sparc64/config.in Tue Jan 16 13:42:04 2001
15237@@ -102,10 +102,16 @@
15238
15239 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
15240 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
15241+ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
15242 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
15243 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
15244 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
15245 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
15246+ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
15247+ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
15248+fi
15249+if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
15250+ bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
15251 fi
15252
15253 tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
15254--- linux/arch/sparc64/defconfig.orig Tue Jan 16 13:30:06 2001
15255+++ linux/arch/sparc64/defconfig Tue Jan 16 13:42:04 2001
15256@@ -106,10 +106,13 @@
15257 #
15258 CONFIG_BLK_DEV_FD=y
15259 CONFIG_BLK_DEV_MD=y
15260+# CONFIG_AUTODETECT_RAID is not set
15261 CONFIG_MD_LINEAR=m
15262 CONFIG_MD_STRIPED=m
15263 CONFIG_MD_MIRRORING=m
15264 CONFIG_MD_RAID5=m
15265+# CONFIG_MD_TRANSLUCENT is not set
15266+# CONFIG_MD_HSM is not set
15267 CONFIG_BLK_DEV_RAM=y
15268 CONFIG_BLK_DEV_RAM_SIZE=4096
15269 CONFIG_BLK_DEV_INITRD=y
15270--- linux/Documentation/Configure.help.orig Tue Jan 16 13:30:06 2001
15271+++ linux/Documentation/Configure.help Tue Jan 16 13:42:04 2001
15272@@ -1015,6 +1015,13 @@
15273
15274 If unsure, say N.
15275
15276+Autodetect RAID partitions
15277+CONFIG_AUTODETECT_RAID
15278+ This feature lets the kernel detect RAID partitions on bootup.
15279+ An autodetect RAID partition is a normal partition with partition
15280+ type 0xfd. Use this if you want to boot RAID devices, or want to
15281+ run them automatically.
15282+
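  (Editorial example, not part of the original help text: a partition is
  marked for autodetection by setting its type to 0xfd with an ordinary
  partitioning tool, e.g. in fdisk use the 't' command, enter the hex code
  'fd' ("Linux raid autodetect"), and write the table with 'w'. With this
  option enabled, the kernel will then pick up all such partitions at boot
  and start the arrays they belong to.)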
15283 Linear (append) mode
15284 CONFIG_MD_LINEAR
15285 If you say Y here, then your multiple devices driver will be able to
15286@@ -1093,6 +1100,21 @@
15287 Documentation/modules.txt.
15288
15289 If unsure, say Y.
15290+
15291+Translucent Block Device Support (EXPERIMENTAL)
15292+CONFIG_MD_TRANSLUCENT
15293+ DO NOT USE THIS STUFF YET!
15294+
15295+  Currently there is only a placeholder, as the implementation
15296+  is not yet usable.
15297+
15298+Hierarchical Storage Management support (EXPERIMENTAL)
15299+CONFIG_MD_HSM
15300+ DO NOT USE THIS STUFF YET!
15301+
15302+  I have released this so people can comment on the architecture,
15303+  but the user-space tools are still unusable, so there is not much
15304+  you can do with this yet.
15305
15306 Boot support (linear, striped)
15307 CONFIG_MD_BOOT