diff -ruN linux.orig/Documentation/Configure.help linux-2.2.16/Documentation/Configure.help
--- linux.orig/Documentation/Configure.help Wed Jun 7 23:26:42 2000
+++ linux-2.2.16/Documentation/Configure.help Fri Jun 9 11:37:48 2000
@@ -961,6 +961,13 @@
 
 If unsure, say N.
 
+Autodetect RAID partitions
+CONFIG_AUTODETECT_RAID
+ This feature lets the kernel detect RAID partitions on bootup.
+ An autodetect RAID partition is a normal partition with partition
+ type 0xfd. Use this if you want to boot RAID devices, or want to
+ run them automatically.
+
 Linear (append) mode
 CONFIG_MD_LINEAR
 If you say Y here, then your multiple devices driver will be able to
@@ -1039,6 +1046,21 @@
 Documentation/modules.txt.
 
 If unsure, say Y.
+
+Translucent Block Device Support (EXPERIMENTAL)
+CONFIG_MD_TRANSLUCENT
+ DO NOT USE THIS STUFF YET!
+
+ Currently there is only a placeholder here, as the implementation
+ is not yet usable.
+
+Hierarchical Storage Management support (EXPERIMENTAL)
+CONFIG_MD_HSM
+ DO NOT USE THIS STUFF YET!
+
+ This has been released so that people can comment on the
+ architecture, but the user-space tools are still unusable, so there
+ is not much you can do with it yet.
 
 Boot support (linear, striped)
 CONFIG_MD_BOOT
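
The autodetection described in the help text above keys off the
partition type byte: a partition of type 0xfd (LINUX_RAID_PARTITION in
<linux/genhd.h>) is picked up during the boot-time partition scan and
later assembled from autodetect_raid(). A minimal sketch of the check
(the helper name is illustrative, not code from this patch):

	static int wants_md_autostart(unsigned char sys_ind)
	{
		/* 0xfd partitions are queued for md to auto-start */
		return sys_ind == 0xfd;
	}
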
diff -ruN linux.orig/arch/i386/defconfig linux-2.2.16/arch/i386/defconfig
--- linux.orig/arch/i386/defconfig Thu May 4 02:16:30 2000
+++ linux-2.2.16/arch/i386/defconfig Fri Jun 9 11:37:45 2000
@@ -93,7 +93,15 @@
 #
 # CONFIG_BLK_DEV_LOOP is not set
 # CONFIG_BLK_DEV_NBD is not set
-# CONFIG_BLK_DEV_MD is not set
+CONFIG_BLK_DEV_MD=y
+CONFIG_AUTODETECT_RAID=y
+CONFIG_MD_TRANSLUCENT=y
+CONFIG_MD_LINEAR=y
+CONFIG_MD_STRIPED=y
+CONFIG_MD_MIRRORING=y
+CONFIG_MD_RAID5=y
+CONFIG_MD_BOOT=y
+CONFIG_BLK_DEV_HSM=y
 # CONFIG_BLK_DEV_RAM is not set
 # CONFIG_BLK_DEV_XD is not set
 # CONFIG_BLK_DEV_DAC960 is not set
diff -ruN linux.orig/arch/sparc/config.in linux-2.2.16/arch/sparc/config.in
--- linux.orig/arch/sparc/config.in Wed Jun 7 23:26:42 2000
+++ linux-2.2.16/arch/sparc/config.in Fri Jun 9 11:37:45 2000
@@ -1,4 +1,4 @@
-# $Id$
+# $Id$
 # For a description of the syntax of this configuration file,
 # see Documentation/kbuild/config-language.txt.
 #
@@ -85,10 +85,16 @@
 
 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
+ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
+ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
+ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
+fi
+if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
+ bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
 fi
 
 tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
diff -ruN linux.orig/arch/sparc/defconfig linux-2.2.16/arch/sparc/defconfig
--- linux.orig/arch/sparc/defconfig Thu May 4 02:16:32 2000
+++ linux-2.2.16/arch/sparc/defconfig Fri Jun 9 11:37:45 2000
@@ -88,10 +88,13 @@
 #
 CONFIG_BLK_DEV_FD=y
 CONFIG_BLK_DEV_MD=y
+# CONFIG_AUTODETECT_RAID is not set
 CONFIG_MD_LINEAR=m
 CONFIG_MD_STRIPED=m
 CONFIG_MD_MIRRORING=m
 CONFIG_MD_RAID5=m
+# CONFIG_MD_TRANSLUCENT is not set
+# CONFIG_MD_HSM is not set
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_BLK_DEV_LOOP=m
diff -ruN linux.orig/arch/sparc64/config.in linux-2.2.16/arch/sparc64/config.in
--- linux.orig/arch/sparc64/config.in Wed Jun 7 23:26:42 2000
+++ linux-2.2.16/arch/sparc64/config.in Fri Jun 9 11:37:48 2000
@@ -97,10 +97,16 @@
 
 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
+ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
+ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
+ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
+fi
+if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
+ bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
 fi
 
 tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
diff -ruN linux.orig/arch/sparc64/defconfig linux-2.2.16/arch/sparc64/defconfig
--- linux.orig/arch/sparc64/defconfig Wed Jun 7 23:26:42 2000
+++ linux-2.2.16/arch/sparc64/defconfig Fri Jun 9 11:37:48 2000
@@ -107,10 +107,13 @@
 #
 CONFIG_BLK_DEV_FD=y
 CONFIG_BLK_DEV_MD=y
+# CONFIG_AUTODETECT_RAID is not set
 CONFIG_MD_LINEAR=m
 CONFIG_MD_STRIPED=m
 CONFIG_MD_MIRRORING=m
 CONFIG_MD_RAID5=m
+# CONFIG_MD_TRANSLUCENT is not set
+# CONFIG_MD_HSM is not set
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_BLK_DEV_LOOP=m
diff -ruN linux.orig/drivers/block/Config.in linux-2.2.16/drivers/block/Config.in
--- linux.orig/drivers/block/Config.in Wed Jun 7 23:26:42 2000
+++ linux-2.2.16/drivers/block/Config.in Fri Jun 9 11:37:45 2000
@@ -102,10 +102,13 @@
 fi
 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
+ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
+ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
+ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
 fi
 if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
 bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
diff -ruN linux.orig/drivers/block/Makefile linux-2.2.16/drivers/block/Makefile
--- linux.orig/drivers/block/Makefile Thu May 4 02:16:32 2000
+++ linux-2.2.16/drivers/block/Makefile Fri Jun 9 11:37:45 2000
@@ -282,10 +282,28 @@
 endif
 
 ifeq ($(CONFIG_MD_RAID5),y)
+LX_OBJS += xor.o
 L_OBJS += raid5.o
 else
 ifeq ($(CONFIG_MD_RAID5),m)
+ LX_OBJS += xor.o
 M_OBJS += raid5.o
+ endif
+endif
+
+ifeq ($(CONFIG_MD_TRANSLUCENT),y)
+L_OBJS += translucent.o
+else
+ ifeq ($(CONFIG_MD_TRANSLUCENT),m)
+ M_OBJS += translucent.o
+ endif
+endif
+
+ifeq ($(CONFIG_MD_HSM),y)
+L_OBJS += hsm.o
+else
+ ifeq ($(CONFIG_MD_HSM),m)
+ M_OBJS += hsm.o
 endif
 endif
 
diff -ruN linux.orig/drivers/block/genhd.c linux-2.2.16/drivers/block/genhd.c
--- linux.orig/drivers/block/genhd.c Wed Jun 7 23:26:42 2000
+++ linux-2.2.16/drivers/block/genhd.c Fri Jun 9 11:37:45 2000
@@ -28,6 +28,7 @@
 #include <linux/string.h>
 #include <linux/blk.h>
 #include <linux/init.h>
+#include <linux/raid/md.h>
 
 #include <asm/system.h>
 #include <asm/byteorder.h>
@@ -1649,6 +1650,9 @@
 else
 #endif
 rd_load();
+#endif
+#ifdef CONFIG_BLK_DEV_MD
+ autodetect_raid();
 #endif
 #ifdef CONFIG_MD_BOOT
 md_setup_drive();
diff -ruN linux.orig/drivers/block/hsm.c linux-2.2.16/drivers/block/hsm.c
--- linux.orig/drivers/block/hsm.c Thu Jan 1 01:00:00 1970
+++ linux-2.2.16/drivers/block/hsm.c Fri Jun 9 11:37:45 2000
@@ -0,0 +1,840 @@
+/*
+ hsm.c : HSM RAID driver for Linux
+ Copyright (C) 1998 Ingo Molnar
+
+ HSM mode management functions.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+
+#include <linux/raid/md.h>
+#include <linux/malloc.h>
+
+#include <linux/raid/hsm.h>
+#include <linux/blk.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+#define MD_PERSONALITY
+
+
+#define DEBUG_HSM 1
+
+#if DEBUG_HSM
+#define dprintk(x,y...) printk(x,##y)
+#else
+#define dprintk(x,y...) do { } while (0)
+#endif
+
+void print_bh(struct buffer_head *bh)
+{
+ dprintk("bh %p: %lx %lx %x %x %lx %p %lx %p %x %p %x %lx\n", bh,
+ bh->b_blocknr, bh->b_size, bh->b_dev, bh->b_rdev,
+ bh->b_rsector, bh->b_this_page, bh->b_state,
+ bh->b_next_free, bh->b_count, bh->b_data,
+ bh->b_list, bh->b_flushtime
+ );
+}
+
+static int check_bg (pv_t *pv, pv_block_group_t * bg)
+{
+ int i, free = 0;
+
+ dprintk("checking bg ...\n");
+
+ for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
+ if (pv_pptr_free(bg->blocks + i)) {
+ free++;
+ if (test_bit(i, bg->used_bitmap)) {
+ printk("hm, bit %d set?\n", i);
+ }
+ } else {
+ if (!test_bit(i, bg->used_bitmap)) {
+ printk("hm, bit %d not set?\n", i);
+ }
+ }
+ }
+ dprintk("%d free blocks in bg ...\n", free);
+ return free;
+}
+
+static void get_bg (pv_t *pv, pv_bg_desc_t *desc, int nr)
+{
+ unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
+ struct buffer_head *bh;
+
+ dprintk("... getting BG at %u ...\n", bg_pos);
+
+ bh = bread (pv->dev, bg_pos, HSM_BLOCKSIZE);
+ if (!bh) {
+ MD_BUG();
+ return;
+ }
+ desc->bg = (pv_block_group_t *) bh->b_data;
+ desc->free_blocks = check_bg(pv, desc->bg);
+}
+
+static int find_free_block (lv_t *lv, pv_t *pv, pv_bg_desc_t *desc, int nr,
+ unsigned int lblock, lv_lptr_t * index)
+{
+ int i;
+
+ for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
+ pv_pptr_t * bptr = desc->bg->blocks + i;
+ if (pv_pptr_free(bptr)) {
+ unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
+
+ if (test_bit(i, desc->bg->used_bitmap)) {
+ MD_BUG();
+ continue;
+ }
+ bptr->u.used.owner.log_id = lv->log_id;
+ bptr->u.used.owner.log_index = lblock;
+ index->data.phys_nr = pv->phys_nr;
+ index->data.phys_block = bg_pos + i + 1;
+ set_bit(i, desc->bg->used_bitmap);
+ desc->free_blocks--;
+ dprintk(".....free blocks left in bg %p: %d\n",
+ desc->bg, desc->free_blocks);
+ return 0;
+ }
+ }
+ return -ENOSPC;
+}
+
+static int __get_free_block (lv_t *lv, pv_t *pv,
+ unsigned int lblock, lv_lptr_t * index)
+{
+ int i;
+
+ dprintk("trying to get free block for lblock %d ...\n", lblock);
+
+ for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
+ pv_bg_desc_t *desc = pv->bg_array + i;
+
+ dprintk("looking at desc #%d (%p)...\n", i, desc->bg);
+ if (!desc->bg)
+ get_bg(pv, desc, i);
+
+ if (desc->bg && desc->free_blocks)
+ return find_free_block(lv, pv, desc, i,
+ lblock, index);
+ }
+ dprintk("hsm: pv %s full!\n", partition_name(pv->dev));
+ return -ENOSPC;
+}
+
+static int get_free_block (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
+{
+ int err;
+
+ if (!lv->free_indices)
+ return -ENOSPC;
+
+ /* fix me */
+ err = __get_free_block(lv, lv->vg->pv_array + 0, lblock, index);
+
+ if (err || !index->data.phys_block) {
+ MD_BUG();
+ return -ENOSPC;
+ }
+
+ lv->free_indices--;
+
+ return 0;
+}
+
+/*
+ * fix me: wordsize assumptions ...
+ */
+#define INDEX_BITS 8
+#define INDEX_DEPTH (32/INDEX_BITS)
+#define INDEX_MASK ((1<<INDEX_BITS) - 1)
+
+static void print_index_list (lv_t *lv, lv_lptr_t *index)
+{
+ lv_lptr_t *tmp;
+ int i;
+
+ dprintk("... block <%u,%u,%x> [.", index->data.phys_nr,
+ index->data.phys_block, index->cpu_addr);
+
+ tmp = index_child(index);
+ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
+ if (index_block(lv, tmp))
+ dprintk("(%d->%d)", i, index_block(lv, tmp));
+ tmp++;
+ }
+ dprintk(".]\n");
+}
+
+static int read_index_group (lv_t *lv, lv_lptr_t *index)
+{
+ lv_lptr_t *index_group, *tmp;
+ struct buffer_head *bh;
+ int i;
+
+ dprintk("reading index group <%s:%d>\n",
+ partition_name(index_dev(lv, index)), index_block(lv, index));
+
+ bh = bread(index_dev(lv, index), index_block(lv, index), HSM_BLOCKSIZE);
+ if (!bh) {
+ MD_BUG();
+ return -EIO;
+ }
+ if (!buffer_uptodate(bh))
+ MD_BUG();
+
+ index_group = (lv_lptr_t *) bh->b_data;
+ tmp = index_group;
+ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
+ if (index_block(lv, tmp)) {
+ dprintk("index group has BLOCK %d, non-present.\n", i);
+ tmp->cpu_addr = 0;
+ }
+ tmp++;
+ }
+ index->cpu_addr = ptr_to_cpuaddr(index_group);
+
+ dprintk("have read index group %p at block %d.\n",
+ index_group, index_block(lv, index));
+ print_index_list(lv, index);
+
+ return 0;
+}
+
+static int alloc_index_group (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
+{
+ struct buffer_head *bh;
+ lv_lptr_t * index_group;
+
+ if (get_free_block(lv, lblock, index))
+ return -ENOSPC;
+
+ dprintk("creating block for index group <%s:%d>\n",
+ partition_name(index_dev(lv, index)), index_block(lv, index));
+
+ bh = getblk(index_dev(lv, index),
+ index_block(lv, index), HSM_BLOCKSIZE);
+
+ index_group = (lv_lptr_t *) bh->b_data;
+ md_clear_page(index_group);
+ mark_buffer_uptodate(bh, 1);
+
+ index->cpu_addr = ptr_to_cpuaddr(index_group);
+
+ dprintk("allocated index group %p at block %d.\n",
+ index_group, index_block(lv, index));
+ return 0;
+}
+
+static lv_lptr_t * alloc_fixed_index (lv_t *lv, unsigned int lblock)
+{
+ lv_lptr_t * index = index_child(&lv->root_index);
+ int idx, l;
+
+ for (l = INDEX_DEPTH-1; l >= 0; l--) {
+ idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
+ index += idx;
+ if (!l)
+ break;
+ if (!index_present(index)) {
+ dprintk("no group, level %u, pos %u\n", l, idx);
+ if (alloc_index_group(lv, lblock, index))
+ return NULL;
+ }
+ index = index_child(index);
+ }
+ if (!index_block(lv,index)) {
+ dprintk("no data, pos %u\n", idx);
+ if (get_free_block(lv, lblock, index))
+ return NULL;
+ return index;
+ }
+ MD_BUG();
+ return index;
+}
+
+static lv_lptr_t * find_index (lv_t *lv, unsigned int lblock)
+{
+ lv_lptr_t * index = index_child(&lv->root_index);
+ int idx, l;
+
+ for (l = INDEX_DEPTH-1; l >= 0; l--) {
+ idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
+ index += idx;
+ if (!l)
+ break;
+ if (index_free(index))
+ return NULL;
+ if (!index_present(index))
+ read_index_group(lv, index);
+ if (!index_present(index)) {
+ MD_BUG();
+ return NULL;
+ }
+ index = index_child(index);
+ }
+ if (!index_block(lv,index))
+ return NULL;
+ return index;
+}
+
+static int read_root_index(lv_t *lv)
+{
+ int err;
+ lv_lptr_t *index = &lv->root_index;
+
+ if (!index_block(lv, index)) {
+ printk("LV has no root index yet, creating.\n");
+
+ err = alloc_index_group (lv, 0, index);
+ if (err) {
+ printk("could not create index group, err:%d\n", err);
+ return err;
+ }
+ lv->vg->vg_sb->lv_array[lv->log_id].lv_root_idx =
+ lv->root_index.data;
+ } else {
+ printk("LV already has a root index.\n");
+ printk("... at <%s:%d>.\n",
+ partition_name(index_dev(lv, index)),
+ index_block(lv, index));
+
+ read_index_group(lv, index);
+ }
+ return 0;
+}
+
+static int init_pv(pv_t *pv)
+{
+ struct buffer_head *bh;
+ pv_sb_t *pv_sb;
+
+ bh = bread (pv->dev, 0, HSM_BLOCKSIZE);
+ if (!bh) {
+ MD_BUG();
+ return -1;
+ }
+
+ pv_sb = (pv_sb_t *) bh->b_data;
+ pv->pv_sb = pv_sb;
+
+ if (pv_sb->pv_magic != HSM_PV_SB_MAGIC) {
+ printk("%s is not a PV, has magic %x instead of %x!\n",
+ partition_name(pv->dev), pv_sb->pv_magic,
+ HSM_PV_SB_MAGIC);
+ return -1;
+ }
+ printk("%s detected as a valid PV (#%d).\n", partition_name(pv->dev),
+ pv->phys_nr);
+ printk("... created under HSM version %d.%d.%d, at %x.\n",
+ pv_sb->pv_major, pv_sb->pv_minor, pv_sb->pv_patch, pv_sb->pv_ctime);
+ printk("... total # of blocks: %d (%d left unallocated).\n",
+ pv_sb->pv_total_size, pv_sb->pv_blocks_left);
+
+ printk("... block size: %d bytes.\n", pv_sb->pv_block_size);
+ printk("... block descriptor size: %d bytes.\n", pv_sb->pv_pptr_size);
+ printk("... block group size: %d blocks.\n", pv_sb->pv_bg_size);
+ printk("... # of block groups: %d.\n", pv_sb->pv_block_groups);
+
+ if (pv_sb->pv_block_groups*sizeof(pv_bg_desc_t) > PAGE_SIZE) {
+ MD_BUG();
+ return 1;
+ }
+ pv->bg_array = (pv_bg_desc_t *)__get_free_page(GFP_KERNEL);
+ if (!pv->bg_array) {
+ MD_BUG();
+ return 1;
+ }
+ memset(pv->bg_array, 0, PAGE_SIZE);
+
+ return 0;
+}
+
+static int free_pv(pv_t *pv)
+{
+ struct buffer_head *bh;
+
+ dprintk("freeing PV %d ...\n", pv->phys_nr);
+
+ if (pv->bg_array) {
+ int i;
+
+ dprintk(".... freeing BGs ...\n");
+ for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
+ unsigned int bg_pos = i * pv->pv_sb->pv_bg_size + 2;
+ pv_bg_desc_t *desc = pv->bg_array + i;
+
+ if (desc->bg) {
+ dprintk(".... freeing BG %d ...\n", i);
+ bh = getblk (pv->dev, bg_pos, HSM_BLOCKSIZE);
+ mark_buffer_dirty(bh, 1);
+ brelse(bh);
+ brelse(bh);
+ }
+ }
+ free_page((unsigned long)pv->bg_array);
+ } else
+ MD_BUG();
+
+ bh = getblk (pv->dev, 0, HSM_BLOCKSIZE);
+ if (!bh) {
+ MD_BUG();
+ return -1;
+ }
+ mark_buffer_dirty(bh, 1);
+ brelse(bh);
+ brelse(bh);
+
+ return 0;
+}
+
+struct semaphore hsm_sem = MUTEX;
+
+#define HSM_SECTORS (HSM_BLOCKSIZE/512)
+
+static int hsm_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
+ unsigned long *rsector, unsigned long bsectors)
+{
+ lv_t *lv = kdev_to_lv(dev);
+ lv_lptr_t *index;
+ unsigned int lblock = *rsector / HSM_SECTORS;
+ unsigned int offset = *rsector % HSM_SECTORS;
+ int err = -EIO;
+
+ if (!lv) {
+ printk("HSM: md%d not a Logical Volume!\n", mdidx(mddev));
+ goto out;
+ }
+ if (offset + bsectors > HSM_SECTORS) {
+ MD_BUG();
+ goto out;
+ }
+ down(&hsm_sem);
+ index = find_index(lv, lblock);
+ if (!index) {
+ printk("no block %u yet ... allocating\n", lblock);
+ index = alloc_fixed_index(lv, lblock);
+ }
+
+ err = 0;
+
+ printk(" %u <%s : %ld(%ld)> -> ", lblock,
+ partition_name(*rdev), *rsector, bsectors);
+
+ *rdev = index_dev(lv, index);
+ *rsector = index_block(lv, index) * HSM_SECTORS + offset;
+
+ printk(" <%s : %ld> %u\n",
+ partition_name(*rdev), *rsector, index_block(lv, index));
+
+ up(&hsm_sem);
+out:
+ return err;
+}
+
+static void free_index (lv_t *lv, lv_lptr_t * index)
+{
+ struct buffer_head *bh;
+
663+ printk("tryin to get cached block for index group <%s:%d>\n",
+ partition_name(index_dev(lv, index)), index_block(lv, index));
+
+ bh = getblk(index_dev(lv, index), index_block(lv, index),HSM_BLOCKSIZE);
+
+ printk("....FREEING ");
+ print_index_list(lv, index);
+
+ if (bh) {
+ if (!buffer_uptodate(bh))
+ MD_BUG();
+ if ((lv_lptr_t *)bh->b_data != index_child(index)) {
+ printk("huh? b_data is %p, index content is %p.\n",
+ bh->b_data, index_child(index));
+ } else
+ printk("good, b_data == index content == %p.\n",
+ index_child(index));
+ printk("b_count == %d, writing.\n", bh->b_count);
+ mark_buffer_dirty(bh, 1);
+ brelse(bh);
+ brelse(bh);
+ printk("done.\n");
+ } else {
+ printk("FAILED!\n");
+ }
+ print_index_list(lv, index);
+ index_child(index) = NULL;
+}
+
+static void free_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
+{
+ char dots [3*8];
+ lv_lptr_t * index;
+ int i, nr_dots;
+
+ nr_dots = (INDEX_DEPTH-level)*3;
+ memcpy(dots,"...............",nr_dots);
+ dots[nr_dots] = 0;
+
+ dprintk("%s level %d index group block:\n", dots, level);
+
+
+ index = index_0;
+ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
+ if (index->data.phys_block) {
+ dprintk("%s block <%u,%u,%x>\n", dots,
+ index->data.phys_nr,
+ index->data.phys_block,
+ index->cpu_addr);
+ if (level && index_present(index)) {
+ dprintk("%s==> deeper one level\n", dots);
+ free_index_group(lv, level-1,
+ index_child(index));
+ dprintk("%s freeing index group block %p ...",
+ dots, index_child(index));
+ free_index(lv, index);
+ }
+ }
+ index++;
+ }
+ dprintk("%s DONE: level %d index group block.\n", dots, level);
+}
+
+static void free_lv_indextree (lv_t *lv)
+{
+ dprintk("freeing LV %d ...\n", lv->log_id);
+ dprintk("..root index: %p\n", index_child(&lv->root_index));
+ dprintk("..INDEX TREE:\n");
+ free_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
+ dprintk("..freeing root index %p ...", index_child(&lv->root_index));
+ dprintk("root block <%u,%u,%x>\n", lv->root_index.data.phys_nr,
+ lv->root_index.data.phys_block, lv->root_index.cpu_addr);
+ free_index(lv, &lv->root_index);
+ dprintk("..INDEX TREE done.\n");
+ fsync_dev(lv->vg->pv_array[0].dev); /* fix me */
+ lv->vg->vg_sb->lv_array[lv->log_id].lv_free_indices = lv->free_indices;
+}
+
+static void print_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
+{
+ char dots [3*5];
+ lv_lptr_t * index;
+ int i, nr_dots;
+
+ nr_dots = (INDEX_DEPTH-level)*3;
+ memcpy(dots,"...............",nr_dots);
+ dots[nr_dots] = 0;
+
+ dprintk("%s level %d index group block:\n", dots, level);
+
+
+ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
+ index = index_0 + i;
+ if (index->data.phys_block) {
+ dprintk("%s block <%u,%u,%x>\n", dots,
+ index->data.phys_nr,
+ index->data.phys_block,
+ index->cpu_addr);
+ if (level && index_present(index)) {
+ dprintk("%s==> deeper one level\n", dots);
+ print_index_group(lv, level-1,
+ index_child(index));
+ }
+ }
+ }
+ dprintk("%s DONE: level %d index group block.\n", dots, level);
+}
+
+static void print_lv (lv_t *lv)
+{
+ dprintk("printing LV %d ...\n", lv->log_id);
+ dprintk("..root index: %p\n", index_child(&lv->root_index));
+ dprintk("..INDEX TREE:\n");
+ print_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
+ dprintk("..INDEX TREE done.\n");
+}
+
+static int map_lv (lv_t *lv)
+{
+ kdev_t dev = lv->dev;
+ unsigned int nr = MINOR(dev);
+ mddev_t *mddev = lv->vg->mddev;
+
+ if (MAJOR(dev) != MD_MAJOR) {
+ MD_BUG();
+ return -1;
+ }
+ if (kdev_to_mddev(dev)) {
+ MD_BUG();
+ return -1;
+ }
+ md_hd_struct[nr].start_sect = 0;
+ md_hd_struct[nr].nr_sects = md_size[mdidx(mddev)] << 1;
+ md_size[nr] = md_size[mdidx(mddev)];
+ add_mddev_mapping(mddev, dev, lv);
+
+ return 0;
+}
+
+static int unmap_lv (lv_t *lv)
+{
+ kdev_t dev = lv->dev;
+ unsigned int nr = MINOR(dev);
+
+ if (MAJOR(dev) != MD_MAJOR) {
+ MD_BUG();
+ return -1;
+ }
+ md_hd_struct[nr].start_sect = 0;
+ md_hd_struct[nr].nr_sects = 0;
+ md_size[nr] = 0;
+ del_mddev_mapping(lv->vg->mddev, dev);
+
+ return 0;
+}
+
+static int init_vg (vg_t *vg)
+{
+ int i;
+ lv_t *lv;
+ kdev_t dev;
+ vg_sb_t *vg_sb;
+ struct buffer_head *bh;
+ lv_descriptor_t *lv_desc;
+
+ /*
+ * fix me: read all PVs and compare the SB
+ */
+ dev = vg->pv_array[0].dev;
+ bh = bread (dev, 1, HSM_BLOCKSIZE);
+ if (!bh) {
+ MD_BUG();
+ return -1;
+ }
+
+ vg_sb = (vg_sb_t *) bh->b_data;
+ vg->vg_sb = vg_sb;
+
+ if (vg_sb->vg_magic != HSM_VG_SB_MAGIC) {
+ printk("%s is not a valid VG, has magic %x instead of %x!\n",
+ partition_name(dev), vg_sb->vg_magic,
+ HSM_VG_SB_MAGIC);
+ return -1;
+ }
+
+ vg->nr_lv = 0;
+ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
+ unsigned int id;
+ lv_desc = vg->vg_sb->lv_array + i;
+
+ id = lv_desc->lv_id;
+ if (!id) {
+ printk("... LV desc %d empty\n", i);
+ continue;
+ }
+ if (id >= HSM_MAX_LVS_PER_VG) {
+ MD_BUG();
+ continue;
+ }
+
+ lv = vg->lv_array + id;
+ if (lv->vg) {
+ MD_BUG();
+ continue;
+ }
+ lv->log_id = id;
+ lv->vg = vg;
+ lv->max_indices = lv_desc->lv_max_indices;
+ lv->free_indices = lv_desc->lv_free_indices;
+ lv->root_index.data = lv_desc->lv_root_idx;
+ lv->dev = MKDEV(MD_MAJOR, lv_desc->md_id);
+
+ vg->nr_lv++;
+
+ map_lv(lv);
+ if (read_root_index(lv)) {
+ vg->nr_lv--;
+ unmap_lv(lv);
+ memset(lv, 0, sizeof(*lv));
+ }
+ }
+ if (vg->nr_lv != vg_sb->nr_lvs)
+ MD_BUG();
+
+ return 0;
+}
+
+static int hsm_run (mddev_t *mddev)
+{
+ int i;
+ vg_t *vg;
+ mdk_rdev_t *rdev;
+
+ MOD_INC_USE_COUNT;
+
+ vg = kmalloc (sizeof (*vg), GFP_KERNEL);
+ if (!vg)
+ goto out;
+ memset(vg, 0, sizeof(*vg));
+ mddev->private = vg;
+ vg->mddev = mddev;
+
+ if (md_check_ordering(mddev)) {
+ printk("hsm: disks are not ordered, aborting!\n");
+ goto out;
+ }
+
+ set_blocksize (mddev_to_kdev(mddev), HSM_BLOCKSIZE);
+
+ vg->nr_pv = mddev->nb_dev;
+ ITERATE_RDEV_ORDERED(mddev,rdev,i) {
+ pv_t *pv = vg->pv_array + i;
+
+ pv->dev = rdev->dev;
+ fsync_dev (pv->dev);
+ set_blocksize (pv->dev, HSM_BLOCKSIZE);
+ pv->phys_nr = i;
+ if (init_pv(pv))
+ goto out;
+ }
+
+ init_vg(vg);
+
+ return 0;
+
+out:
+ if (vg) {
+ kfree(vg);
+ mddev->private = NULL;
+ }
+ MOD_DEC_USE_COUNT;
+
+ return 1;
+}
+
+static int hsm_stop (mddev_t *mddev)
+{
+ lv_t *lv;
+ vg_t *vg;
+ int i;
+
+ vg = mddev_to_vg(mddev);
+
+ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
+ lv = vg->lv_array + i;
+ if (!lv->log_id)
+ continue;
+ print_lv(lv);
+ free_lv_indextree(lv);
+ unmap_lv(lv);
+ }
+ for (i = 0; i < vg->nr_pv; i++)
+ free_pv(vg->pv_array + i);
+
+ kfree(vg);
+
+ MOD_DEC_USE_COUNT;
+
+ return 0;
+}
+
+
+static int hsm_status (char *page, mddev_t *mddev)
+{
+ int sz = 0, i;
+ lv_t *lv;
+ vg_t *vg;
+
+ vg = mddev_to_vg(mddev);
+
+ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
+ lv = vg->lv_array + i;
+ if (!lv->log_id)
+ continue;
+ sz += sprintf(page+sz, "<LV%d %d/%d blocks used> ", lv->log_id,
+ lv->max_indices - lv->free_indices, lv->max_indices);
+ }
+ return sz;
+}
+
+
+static mdk_personality_t hsm_personality=
+{
+ "hsm",
+ hsm_map,
+ NULL,
+ NULL,
+ hsm_run,
+ hsm_stop,
+ hsm_status,
+ NULL,
+ 0,
+ NULL,
+ NULL,
+ NULL,
+ NULL
+};
+
+#ifndef MODULE
+
+md__initfunc(void hsm_init (void))
+{
+ register_md_personality (HSM, &hsm_personality);
+}
+
+#else
+
+int init_module (void)
+{
+ return (register_md_personality (HSM, &hsm_personality));
+}
+
+void cleanup_module (void)
+{
+ unregister_md_personality (HSM);
+}
+
+#endif
+
+/*
+ * This Linus-trick catches bugs via the linker.
+ */
+
+extern void __BUG__in__hsm_dot_c_1(void);
+extern void __BUG__in__hsm_dot_c_2(void);
+extern void __BUG__in__hsm_dot_c_3(void);
+extern void __BUG__in__hsm_dot_c_4(void);
+extern void __BUG__in__hsm_dot_c_5(void);
+extern void __BUG__in__hsm_dot_c_6(void);
+extern void __BUG__in__hsm_dot_c_7(void);
+
+void bugcatcher (void)
+{
+ if (sizeof(pv_block_group_t) != HSM_BLOCKSIZE)
+ __BUG__in__hsm_dot_c_1();
+ if (sizeof(lv_index_block_t) != HSM_BLOCKSIZE)
+ __BUG__in__hsm_dot_c_2();
+
+ if (sizeof(pv_sb_t) != HSM_BLOCKSIZE)
+ __BUG__in__hsm_dot_c_4();
+ if (sizeof(lv_sb_t) != HSM_BLOCKSIZE)
+ __BUG__in__hsm_dot_c_3();
+ if (sizeof(vg_sb_t) != HSM_BLOCKSIZE)
+ __BUG__in__hsm_dot_c_6();
+
+ if (sizeof(lv_lptr_t) != 16)
+ __BUG__in__hsm_dot_c_5();
+ if (sizeof(pv_pptr_t) != 16)
+ __BUG__in__hsm_dot_c_6();
+}
+
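
For orientation: the hsm.c index tree above resolves a 32-bit logical
block number through INDEX_DEPTH (4) levels of INDEX_BITS (8) bits,
most significant group first, so every index group holds 256 lv_lptr_t
slots. A minimal sketch of the per-level slot arithmetic that
find_index() and alloc_fixed_index() perform (the helper is
illustrative, not code from this patch):

	static unsigned int hsm_slot(unsigned int lblock, int level)
	{
		/* level INDEX_DEPTH-1 is nearest the root index,
		 * level 0 selects the data block itself */
		return (lblock >> (INDEX_BITS * level)) & INDEX_MASK;
	}
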
diff -ruN linux.orig/drivers/block/linear.c linux-2.2.16/drivers/block/linear.c
--- linux.orig/drivers/block/linear.c Sat Nov 8 20:39:12 1997
+++ linux-2.2.16/drivers/block/linear.c Fri Jun 9 11:37:45 2000
@@ -1,4 +1,3 @@
-
 /*
 linear.c : Multiple Devices driver for Linux
 Copyright (C) 1994-96 Marc ZYNGIER
@@ -19,186 +18,207 @@
 
 #include <linux/module.h>
 
-#include <linux/md.h>
+#include <linux/raid/md.h>
 #include <linux/malloc.h>
-#include <linux/init.h>
 
-#include "linear.h"
+#include <linux/raid/linear.h>
 
 #define MAJOR_NR MD_MAJOR
 #define MD_DRIVER
 #define MD_PERSONALITY
 
-static int linear_run (int minor, struct md_dev *mddev)
+static int linear_run (mddev_t *mddev)
 {
- int cur=0, i, size, dev0_size, nb_zone;
- struct linear_data *data;
-
- MOD_INC_USE_COUNT;
-
- mddev->private=kmalloc (sizeof (struct linear_data), GFP_KERNEL);
- data=(struct linear_data *) mddev->private;
-
- /*
- Find out the smallest device. This was previously done
- at registry time, but since it violates modularity,
- I moved it here... Any comment ? ;-)
- */
-
- data->smallest=mddev->devices;
- for (i=1; i<mddev->nb_dev; i++)
- if (data->smallest->size > mddev->devices[i].size)
- data->smallest=mddev->devices+i;
-
- nb_zone=data->nr_zones=
- md_size[minor]/data->smallest->size +
- (md_size[minor]%data->smallest->size ? 1 : 0);
-
- data->hash_table=kmalloc (sizeof (struct linear_hash)*nb_zone, GFP_KERNEL);
-
- size=mddev->devices[cur].size;
+ linear_conf_t *conf;
+ struct linear_hash *table;
+ mdk_rdev_t *rdev;
+ int size, i, j, nb_zone;
+ unsigned int curr_offset;
+
+ MOD_INC_USE_COUNT;
+
+ conf = kmalloc (sizeof (*conf), GFP_KERNEL);
+ if (!conf)
+ goto out;
+ mddev->private = conf;
+
+ if (md_check_ordering(mddev)) {
+ printk("linear: disks are not ordered, aborting!\n");
+ goto out;
+ }
+ /*
+ * Find the smallest device.
+ */
+
+ conf->smallest = NULL;
+ curr_offset = 0;
+ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
+ dev_info_t *disk = conf->disks + j;
+
+ disk->dev = rdev->dev;
+ disk->size = rdev->size;
+ disk->offset = curr_offset;
+
+ curr_offset += disk->size;
+
+ if (!conf->smallest || (disk->size < conf->smallest->size))
+ conf->smallest = disk;
+ }
+
+ nb_zone = conf->nr_zones =
+ md_size[mdidx(mddev)] / conf->smallest->size +
+ ((md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
+
+ conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
+ GFP_KERNEL);
+ if (!conf->hash_table)
+ goto out;
+
+ /*
+ * Here we generate the linear hash table
+ */
+ table = conf->hash_table;
+ i = 0;
+ size = 0;
+ for (j = 0; j < mddev->nb_dev; j++) {
+ dev_info_t *disk = conf->disks + j;
+
+ if (size < 0) {
+ table->dev1 = disk;
+ table++;
+ }
+ size += disk->size;
+
+ while (size) {
+ table->dev0 = disk;
+ size -= conf->smallest->size;
+ if (size < 0)
+ break;
+ table->dev1 = NULL;
+ table++;
+ }
+ }
+ table->dev1 = NULL;
+
+ return 0;
+
+out:
+ if (conf)
+ kfree(conf);
+ MOD_DEC_USE_COUNT;
+ return 1;
+}
+
+static int linear_stop (mddev_t *mddev)
+{
+ linear_conf_t *conf = mddev_to_conf(mddev);
+
+ kfree(conf->hash_table);
+ kfree(conf);
 
- i=0;
- while (cur<mddev->nb_dev)
- {
- data->hash_table[i].dev0=mddev->devices+cur;
+ MOD_DEC_USE_COUNT;
 
- if (size>=data->smallest->size) /* If we completely fill the slot */
- {
- data->hash_table[i++].dev1=NULL;
- size-=data->smallest->size;
-
- if (!size)
- {
- if (++cur==mddev->nb_dev) continue;
- size=mddev->devices[cur].size;
- }
-
- continue;
- }
-
- if (++cur==mddev->nb_dev) /* Last dev, set dev1 as NULL */
- {
- data->hash_table[i].dev1=NULL;
- continue;
- }
-
- dev0_size=size; /* Here, we use a 2nd dev to fill the slot */
- size=mddev->devices[cur].size;
- data->hash_table[i++].dev1=mddev->devices+cur;
- size-=(data->smallest->size - dev0_size);
- }
-
- return 0;
-}
-
-static int linear_stop (int minor, struct md_dev *mddev)
-{
- struct linear_data *data=(struct linear_data *) mddev->private;
-
- kfree (data->hash_table);
- kfree (data);
-
- MOD_DEC_USE_COUNT;
-
- return 0;
+ return 0;
 }
 
 
-static int linear_map (struct md_dev *mddev, kdev_t *rdev,
+static int linear_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
 unsigned long *rsector, unsigned long size)
 {
- struct linear_data *data=(struct linear_data *) mddev->private;
- struct linear_hash *hash;
- struct real_dev *tmp_dev;
- long block;
-
- block=*rsector >> 1;
- hash=data->hash_table+(block/data->smallest->size);
-
- if (block >= (hash->dev0->size + hash->dev0->offset))
- {
- if (!hash->dev1)
- {
- printk ("linear_map : hash->dev1==NULL for block %ld\n", block);
- return (-1);
- }
-
- tmp_dev=hash->dev1;
- }
- else
- tmp_dev=hash->dev0;
+ linear_conf_t *conf = mddev_to_conf(mddev);
+ struct linear_hash *hash;
+ dev_info_t *tmp_dev;
+ long block;
+
+ block = *rsector >> 1;
+ hash = conf->hash_table + (block / conf->smallest->size);
+
+ if (block >= (hash->dev0->size + hash->dev0->offset))
+ {
+ if (!hash->dev1)
+ {
+ printk ("linear_map : hash->dev1==NULL for block %ld\n",
+ block);
+ return -1;
+ }
+ tmp_dev = hash->dev1;
+ } else
+ tmp_dev = hash->dev0;
 
- if (block >= (tmp_dev->size + tmp_dev->offset) || block < tmp_dev->offset)
- printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
- block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
+ if (block >= (tmp_dev->size + tmp_dev->offset)
+ || block < tmp_dev->offset)
+ printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
+ block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
 
- *rdev=tmp_dev->dev;
- *rsector=(block-(tmp_dev->offset)) << 1;
+ *rdev = tmp_dev->dev;
+ *rsector = (block - tmp_dev->offset) << 1;
 
- return (0);
+ return 0;
 }
 
-static int linear_status (char *page, int minor, struct md_dev *mddev)
+static int linear_status (char *page, mddev_t *mddev)
 {
- int sz=0;
+ int sz=0;
 
 #undef MD_DEBUG
 #ifdef MD_DEBUG
- int j;
- struct linear_data *data=(struct linear_data *) mddev->private;
+ int j;
+ linear_conf_t *conf = mddev_to_conf(mddev);
 
- sz+=sprintf (page+sz, " ");
- for (j=0; j<data->nr_zones; j++)
- {
- sz+=sprintf (page+sz, "[%s",
- partition_name (data->hash_table[j].dev0->dev));
-
- if (data->hash_table[j].dev1)
- sz+=sprintf (page+sz, "/%s] ",
- partition_name(data->hash_table[j].dev1->dev));
- else
- sz+=sprintf (page+sz, "] ");
- }
-
- sz+=sprintf (page+sz, "\n");
+ sz += sprintf(page+sz, " ");
+ for (j = 0; j < conf->nr_zones; j++)
+ {
+ sz += sprintf(page+sz, "[%s",
+ partition_name(conf->hash_table[j].dev0->dev));
+
+ if (conf->hash_table[j].dev1)
+ sz += sprintf(page+sz, "/%s] ",
+ partition_name(conf->hash_table[j].dev1->dev));
+ else
+ sz += sprintf(page+sz, "] ");
+ }
+ sz += sprintf(page+sz, "\n");
 #endif
- sz+=sprintf (page+sz, " %dk rounding", 1<<FACTOR_SHIFT(FACTOR(mddev)));
- return sz;
+ sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
+ return sz;
 }
 
 
-static struct md_personality linear_personality=
+static mdk_personality_t linear_personality=
 {
- "linear",
- linear_map,
- NULL,
- NULL,
- linear_run,
- linear_stop,
- linear_status,
- NULL, /* no ioctls */
- 0
+ "linear",
+ linear_map,
+ NULL,
+ NULL,
+ linear_run,
+ linear_stop,
+ linear_status,
+ NULL,
+ 0,
+ NULL,
+ NULL,
+ NULL,
+ NULL
 };
 
-
 #ifndef MODULE
 
-__initfunc(void linear_init (void))
+md__initfunc(void linear_init (void))
 {
- register_md_personality (LINEAR, &linear_personality);
+ register_md_personality (LINEAR, &linear_personality);
 }
 
 #else
 
 int init_module (void)
 {
- return (register_md_personality (LINEAR, &linear_personality));
+ return (register_md_personality (LINEAR, &linear_personality));
 }
 
 void cleanup_module (void)
 {
- unregister_md_personality (LINEAR);
+ unregister_md_personality (LINEAR);
 }
 
 #endif
+
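
For orientation: linear_map() above cuts the array into zones the size
of the smallest member disk, so a block number finds its hash slot by
one integer division and then lands on dev0 or dev1 of that slot. A
minimal sketch of the lookup, reusing the field names from the code
above (this is not a separate function in the patch):

	/* block -> (device, sector), as done inside linear_map() */
	hash = conf->hash_table + (block / conf->smallest->size);
	tmp_dev = (block < hash->dev0->offset + hash->dev0->size)
			? hash->dev0 : hash->dev1;
	*rdev = tmp_dev->dev;
	*rsector = (block - tmp_dev->offset) << 1;
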
diff -ruN linux.orig/drivers/block/linear.h linux-2.2.16/drivers/block/linear.h
--- linux.orig/drivers/block/linear.h Fri Nov 22 15:07:23 1996
+++ linux-2.2.16/drivers/block/linear.h Thu Jan 1 01:00:00 1970
@@ -1,16 +0,0 @@
-#ifndef _LINEAR_H
-#define _LINEAR_H
-
-struct linear_hash
-{
- struct real_dev *dev0, *dev1;
-};
-
-struct linear_data
-{
- struct linear_hash *hash_table; /* Dynamically allocated */
- struct real_dev *smallest;
- int nr_zones;
-};
-
-#endif
diff -ruN linux.orig/drivers/block/ll_rw_blk.c linux-2.2.16/drivers/block/ll_rw_blk.c
--- linux.orig/drivers/block/ll_rw_blk.c Wed Jun 7 23:26:42 2000
+++ linux-2.2.16/drivers/block/ll_rw_blk.c Fri Jun 9 11:37:45 2000
@@ -23,6 +23,7 @@
 #include <asm/io.h>
 #include <asm/uaccess.h>
 #include <linux/blk.h>
+#include <linux/raid/md.h>
 
 #include <linux/module.h>
 
@@ -53,6 +54,11 @@
 spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
 
 /*
+ * per-major idle-IO detection
+ */
+unsigned long io_events[MAX_BLKDEV] = {0, };
+
+/*
 * used to wait on when there are no free requests
 */
 struct wait_queue * wait_for_request;
@@ -641,6 +647,8 @@
 return;
 /* Maybe the above fixes it, and maybe it doesn't boot. Life is interesting */
 lock_buffer(bh);
+ if (!buffer_lowprio(bh))
+ io_events[major]++;
 
 if (blk_size[major]) {
 unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
@@ -892,7 +900,7 @@
 bh[i]->b_rsector=bh[i]->b_blocknr*(bh[i]->b_size >> 9);
 #ifdef CONFIG_BLK_DEV_MD
 if (major==MD_MAJOR &&
- md_map (MINOR(bh[i]->b_dev), &bh[i]->b_rdev,
+ md_map (bh[i]->b_dev, &bh[i]->b_rdev,
 &bh[i]->b_rsector, bh[i]->b_size >> 9)) {
 printk (KERN_ERR
 "Bad md_map in ll_rw_block\n");
@@ -912,7 +920,7 @@
 set_bit(BH_Req, &bh[i]->b_state);
 #ifdef CONFIG_BLK_DEV_MD
 if (MAJOR(bh[i]->b_dev) == MD_MAJOR) {
- md_make_request(MINOR (bh[i]->b_dev), rw, bh[i]);
+ md_make_request(bh[i], rw);
 continue;
 }
 #endif
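
The io_events[] counters added above give md a cheap per-major idle-IO
signal: every request that is not marked low-priority bumps the counter
for its major. A resync thread can then sample the counters to decide
whether the disks are otherwise idle and the full bandwidth may be used
instead of the guaranteed minimum. A minimal sketch of such a test (the
consuming code is not part of this excerpt; the helper is illustrative):

	static unsigned long last_events;

	static int disk_was_idle(int major)
	{
		/* idle iff no foreground IO since the last sample */
		int idle = (io_events[major] == last_events);

		last_events = io_events[major];
		return idle;
	}
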
diff -ruN linux.orig/drivers/block/md.c linux-2.2.16/drivers/block/md.c
--- linux.orig/drivers/block/md.c Wed Jun 7 23:26:42 2000
+++ linux-2.2.16/drivers/block/md.c Fri Jun 9 11:43:43 2000
@@ -1,21 +1,17 @@
-
 /*
 md.c : Multiple Devices driver for Linux
- Copyright (C) 1994-96 Marc ZYNGIER
- <zyngier@ufr-info-p7.ibp.fr> or
- <maz@gloups.fdn.fr>
+ Copyright (C) 1998, 1999 Ingo Molnar
 
- A lot of inspiration came from hd.c ...
+ completely rewritten, based on the MD driver code from Marc Zyngier
 
- kerneld support by Boris Tobotras <boris@xtalk.msk.su>
- boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+ Changes:
 
- RAID-1/RAID-5 extensions by:
- Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+ - kmod support by: Cyrus Durgin
+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
 
- Changes for kmod by:
- Cyrus Durgin
-
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2, or (at your option)
@@ -26,809 +22,3007 @@
 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
-/*
- * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
- * the extra system load does not show up that much. Increase it if your
- * system can take more.
- */
-#define SPEED_LIMIT 1024
+#include <linux/raid/md.h>
+#include <linux/raid/xor.h>
 
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/version.h>
-#include <linux/malloc.h>
-#include <linux/mm.h>
-#include <linux/md.h>
-#include <linux/hdreg.h>
-#include <linux/stat.h>
-#include <linux/fs.h>
-#include <linux/proc_fs.h>
-#include <linux/blkdev.h>
-#include <linux/genhd.h>
-#include <linux/smp_lock.h>
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
 #endif
-#include <linux/errno.h>
-#include <linux/init.h>
 
 #define __KERNEL_SYSCALLS__
 #include <linux/unistd.h>
 
+#include <asm/unaligned.h>
+
+extern asmlinkage int sys_sched_yield(void);
+extern asmlinkage int sys_setsid(void);
+
+extern unsigned long io_events[MAX_BLKDEV];
+
 #define MAJOR_NR MD_MAJOR
 #define MD_DRIVER
 
 #include <linux/blk.h>
-#include <asm/uaccess.h>
-#include <asm/bitops.h>
-#include <asm/atomic.h>
 
 #ifdef CONFIG_MD_BOOT
-extern kdev_t name_to_kdev_t(char *line) __init;
+extern kdev_t name_to_kdev_t(char *line) md__init;
 #endif
 
-static struct hd_struct md_hd_struct[MAX_MD_DEV];
-static int md_blocksizes[MAX_MD_DEV];
-int md_maxreadahead[MAX_MD_DEV];
-#if SUPPORT_RECONSTRUCTION
-static struct md_thread *md_sync_thread = NULL;
-#endif /* SUPPORT_RECONSTRUCTION */
+static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, };
+
+/*
+ * these have to be allocated separately because external
+ * subsystems want to have a pre-defined structure
+ */
+struct hd_struct md_hd_struct[MAX_MD_DEVS];
+static int md_blocksizes[MAX_MD_DEVS];
+static int md_maxreadahead[MAX_MD_DEVS];
+static mdk_thread_t *md_recovery_thread = NULL;
 
-int md_size[MAX_MD_DEV]={0, };
+int md_size[MAX_MD_DEVS] = {0, };
 
 static void md_geninit (struct gendisk *);
 
 static struct gendisk md_gendisk=
 {
- MD_MAJOR,
- "md",
- 0,
- 1,
- MAX_MD_DEV,
- md_geninit,
- md_hd_struct,
- md_size,
- MAX_MD_DEV,
- NULL,
- NULL
+ MD_MAJOR,
+ "md",
+ 0,
+ 1,
+ MAX_MD_DEVS,
+ md_geninit,
+ md_hd_struct,
+ md_size,
+ MAX_MD_DEVS,
+ NULL,
+ NULL
 };
 
-static struct md_personality *pers[MAX_PERSONALITY]={NULL, };
-struct md_dev md_dev[MAX_MD_DEV];
-
-int md_thread(void * arg);
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 100 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle.
+ *
+ * you can change it via /proc/sys/dev/speed-limit
+ */
 
-static struct gendisk *find_gendisk (kdev_t dev)
-{
- struct gendisk *tmp=gendisk_head;
+static int sysctl_speed_limit = 100;
 
- while (tmp != NULL)
- {
- if (tmp->major==MAJOR(dev))
- return (tmp);
-
- tmp=tmp->next;
- }
+static struct ctl_table_header *md_table_header;
 
- return (NULL);
-}
+static ctl_table md_table[] = {
+ {DEV_MD_SPEED_LIMIT, "speed-limit",
+ &sysctl_speed_limit, sizeof(int), 0644, NULL, &proc_dointvec},
+ {0}
+};
 
-char *partition_name (kdev_t dev)
-{
- static char name[40]; /* This should be long
- enough for a device name ! */
- struct gendisk *hd = find_gendisk (dev);
+static ctl_table md_dir_table[] = {
+ {DEV_MD, "md", NULL, 0, 0555, md_table},
+ {0}
+};
 
- if (!hd)
- {
- sprintf (name, "[dev %s]", kdevname(dev));
- return (name);
- }
+static ctl_table md_root_table[] = {
+ {CTL_DEV, "dev", NULL, 0, 0555, md_dir_table},
+ {0}
+};
 
- return disk_name (hd, MINOR(dev), name); /* routine in genhd.c */
+static void md_register_sysctl(void)
+{
+ md_table_header = register_sysctl_table(md_root_table, 1);
 }
 
-static int legacy_raid_sb (int minor, int pnum)
+void md_unregister_sysctl(void)
 {
- int i, factor;
+ unregister_sysctl_table(md_table_header);
+}
+
+/*
+ * The mapping between kdev and mddev is not necessarily a simple
+ * one! Eg. HSM uses several sub-devices to implement Logical
+ * Volumes. All these sub-devices map to the same mddev.
+ */
+dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, };
 
- factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
+void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
+{
+ unsigned int minor = MINOR(dev);
 
- /*****
- * do size and offset calculations.
- */
- for (i=0; i<md_dev[minor].nb_dev; i++) {
- md_dev[minor].devices[i].size &= ~(factor - 1);
- md_size[minor] += md_dev[minor].devices[i].size;
- md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset +
- md_dev[minor].devices[i-1].size) : 0;
+ if (MAJOR(dev) != MD_MAJOR) {
+ MD_BUG();
+ return;
 }
- if (pnum == RAID0 >> PERSONALITY_SHIFT)
- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev;
- return 0;
+ if (mddev_map[minor].mddev != NULL) {
+ MD_BUG();
+ return;
+ }
+ mddev_map[minor].mddev = mddev;
+ mddev_map[minor].data = data;
 }
 
-static void free_sb (struct md_dev *mddev)
+void del_mddev_mapping (mddev_t * mddev, kdev_t dev)
 {
- int i;
- struct real_dev *realdev;
+ unsigned int minor = MINOR(dev);
 
- if (mddev->sb) {
- free_page((unsigned long) mddev->sb);
- mddev->sb = NULL;
+ if (MAJOR(dev) != MD_MAJOR) {
+ MD_BUG();
+ return;
 }
- for (i = 0; i <mddev->nb_dev; i++) {
- realdev = mddev->devices + i;
- if (realdev->sb) {
- free_page((unsigned long) realdev->sb);
- realdev->sb = NULL;
- }
+ if (mddev_map[minor].mddev != mddev) {
+ MD_BUG();
+ return;
 }
+ mddev_map[minor].mddev = NULL;
+ mddev_map[minor].data = NULL;
 }
 
 /*
- * Check one RAID superblock for generic plausibility
+ * Enables iterating over all existing md arrays
 */
+static MD_LIST_HEAD(all_mddevs);
 
-#define BAD_MAGIC KERN_ERR \
-"md: %s: invalid raid superblock magic (%x) on block %u\n"
+static mddev_t * alloc_mddev (kdev_t dev)
+{
+ mddev_t * mddev;
 
-#define OUT_OF_MEM KERN_ALERT \
-"md: out of memory.\n"
+ if (MAJOR(dev) != MD_MAJOR) {
+ MD_BUG();
+ return 0;
+ }
+ mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
+ if (!mddev)
+ return NULL;
+
+ memset(mddev, 0, sizeof(*mddev));
 
-#define NO_DEVICE KERN_ERR \
-"md: disabled device %s\n"
+ mddev->__minor = MINOR(dev);
+ mddev->reconfig_sem = MUTEX;
+ mddev->recovery_sem = MUTEX;
+ mddev->resync_sem = MUTEX;
+ MD_INIT_LIST_HEAD(&mddev->disks);
+ /*
+ * The 'base' mddev is the one with data NULL.
+ * personalities can create additional mddevs
+ * if necessary.
+ */
+ add_mddev_mapping(mddev, dev, 0);
+ md_list_add(&mddev->all_mddevs, &all_mddevs);
 
-#define SUCCESS 0
-#define FAILURE -1
+ return mddev;
+}
 
-static int analyze_one_sb (struct real_dev * rdev)
+static void free_mddev (mddev_t *mddev)
 {
- int ret = FAILURE;
- struct buffer_head *bh;
- kdev_t dev = rdev->dev;
- md_superblock_t *sb;
+ if (!mddev) {
+ MD_BUG();
+ return;
+ }
 
 /*
- * Read the superblock, it's at the end of the disk
+ * Make sure nobody else is using this mddev
+ * (careful, we rely on the global kernel lock here)
 */
- rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]);
- set_blocksize (dev, MD_SB_BYTES);
- bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
-
- if (bh) {
- sb = (md_superblock_t *) bh->b_data;
- if (sb->md_magic != MD_SB_MAGIC) {
- printk (BAD_MAGIC, kdevname(dev),
- sb->md_magic, rdev->sb_offset);
- goto abort;
- }
- rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
- if (!rdev->sb) {
- printk (OUT_OF_MEM);
- goto abort;
- }
- memcpy (rdev->sb, bh->b_data, MD_SB_BYTES);
+ while (md_atomic_read(&mddev->resync_sem.count) != 1)
+ schedule();
+ while (md_atomic_read(&mddev->recovery_sem.count) != 1)
+ schedule();
 
- rdev->size = sb->size;
- } else
- printk (NO_DEVICE,kdevname(rdev->dev));
- ret = SUCCESS;
-abort:
- if (bh)
- brelse (bh);
- return ret;
+ del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
+ md_list_del(&mddev->all_mddevs);
+ MD_INIT_LIST_HEAD(&mddev->all_mddevs);
+ kfree(mddev);
 }
 
-#undef SUCCESS
-#undef FAILURE
-
-#undef BAD_MAGIC
-#undef OUT_OF_MEM
-#undef NO_DEVICE
 
-/*
- * Check a full RAID array for plausibility
- */
+struct gendisk * find_gendisk (kdev_t dev)
+{
+ struct gendisk *tmp = gendisk_head;
 
-#define INCONSISTENT KERN_ERR \
-"md: superblock inconsistency -- run ckraid\n"
+ while (tmp != NULL) {
+ if (tmp->major == MAJOR(dev))
+ return (tmp);
+ tmp = tmp->next;
+ }
+ return (NULL);
+}
 
-#define OUT_OF_DATE KERN_ERR \
-"md: superblock update time inconsistenty -- using the most recent one\n"
+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
 
-#define OLD_VERSION KERN_ALERT \
-"md: %s: unsupported raid array version %d.%d.%d\n"
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == nr)
+ return rdev;
+ }
+ return NULL;
+}
 
-#define NOT_CLEAN KERN_ERR \
-"md: %s: raid array is not clean -- run ckraid\n"
+mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
 
-#define NOT_CLEAN_IGNORE KERN_ERR \
-"md: %s: raid array is not clean -- reconstructing parity\n"
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
 
-#define UNKNOWN_LEVEL KERN_ERR \
-"md: %s: unsupported raid level %d\n"
+static MD_LIST_HEAD(device_names);
 
-static int analyze_sbs (int minor, int pnum)
+char * partition_name (kdev_t dev)
 {
- struct md_dev *mddev = md_dev + minor;
- int i, N = mddev->nb_dev, out_of_date = 0;
- struct real_dev * disks = mddev->devices;
- md_superblock_t *sb, *freshest = NULL;
+ struct gendisk *hd;
+ static char nomem [] = "<nomem>";
+ dev_name_t *dname;
+ struct md_list_head *tmp = device_names.next;
 
- /*
- * RAID-0 and linear don't use a RAID superblock
- */
- if (pnum == RAID0 >> PERSONALITY_SHIFT ||
- pnum == LINEAR >> PERSONALITY_SHIFT)
- return legacy_raid_sb (minor, pnum);
+ while (tmp != &device_names) {
+ dname = md_list_entry(tmp, dev_name_t, list);
+ if (dname->dev == dev)
+ return dname->name;
+ tmp = tmp->next;
+ }
+
+ dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
 
+ if (!dname)
+ return nomem;
 /*
- * Verify the RAID superblock on each real device
+ * ok, add this new device name to the list
 */
- for (i = 0; i < N; i++)
- if (analyze_one_sb(disks+i))
- goto abort;
+ hd = find_gendisk (dev);
+
+ if (!hd)
+ sprintf (dname->name, "[dev %s]", kdevname(dev));
+ else
+ disk_name (hd, MINOR(dev), dname->name);
+
+ dname->dev = dev;
+ md_list_add(&dname->list, &device_names);
+
+ return dname->name;
+}
+
+static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
+ int persistent)
+{
+ unsigned int size = 0;
+
+ if (blk_size[MAJOR(dev)])
+ size = blk_size[MAJOR(dev)][MINOR(dev)];
+ if (persistent)
+ size = MD_NEW_SIZE_BLOCKS(size);
+ return size;
+}
+
+static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
+{
+ unsigned int size;
+
+ size = calc_dev_sboffset(dev, mddev, persistent);
+ if (!mddev->sb) {
+ MD_BUG();
+ return size;
+ }
+ if (mddev->sb->chunk_size)
+ size &= ~(mddev->sb->chunk_size/1024 - 1);
+ return size;
+}
+
+/*
+ * We check whether all devices are numbered from 0 to nb_dev-1. The
1977+ * order is guaranteed even after device name changes.
1978+ *
1979+ * Some personalities (raid0, linear) use this. Personalities that
1980+ * provide data have to be able to deal with loss of individual
1981+ * disks, so they do their checking themselves.
1982+ */
1983+int md_check_ordering (mddev_t *mddev)
1984+{
1985+ int i, c;
1986+ mdk_rdev_t *rdev;
1987+ struct md_list_head *tmp;
1988
1989 /*
1990- * The superblock constant part has to be the same
1991- * for all disks in the array.
1992+ * First, all devices must be fully functional
1993 */
1994- sb = NULL;
1995- for (i = 0; i < N; i++) {
1996- if (!disks[i].sb)
1997- continue;
1998- if (!sb) {
1999- sb = disks[i].sb;
2000- continue;
2001- }
2002- if (memcmp(sb,
2003- disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) {
2004- printk (INCONSISTENT);
2005+ ITERATE_RDEV(mddev,rdev,tmp) {
2006+ if (rdev->faulty) {
2007+ printk("md: md%d's device %s faulty, aborting.\n",
2008+ mdidx(mddev), partition_name(rdev->dev));
2009 goto abort;
2010 }
2011 }
2012
2013- /*
2014- * OK, we have all disks and the array is ready to run. Let's
2015- * find the freshest superblock, that one will be the superblock
2016- * that represents the whole array.
2017- */
2018- if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL)
2019+ c = 0;
2020+ ITERATE_RDEV(mddev,rdev,tmp) {
2021+ c++;
2022+ }
2023+ if (c != mddev->nb_dev) {
2024+ MD_BUG();
2025 goto abort;
2026- freshest = NULL;
2027- for (i = 0; i < N; i++) {
2028- if (!disks[i].sb)
2029- continue;
2030- if (!freshest) {
2031- freshest = disks[i].sb;
2032- continue;
2033- }
2034- /*
2035- * Find the newest superblock version
2036- */
2037- if (disks[i].sb->utime != freshest->utime) {
2038- out_of_date = 1;
2039- if (disks[i].sb->utime > freshest->utime)
2040- freshest = disks[i].sb;
2041- }
2042 }
2043- if (out_of_date)
2044- printk(OUT_OF_DATE);
2045- memcpy (sb, freshest, sizeof(*freshest));
2046-
2047- /*
2048- * Check if we can support this RAID array
2049- */
2050- if (sb->major_version != MD_MAJOR_VERSION ||
2051- sb->minor_version > MD_MINOR_VERSION) {
2052-
2053- printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)),
2054- sb->major_version, sb->minor_version,
2055- sb->patch_version);
2056+ if (mddev->nb_dev != mddev->sb->raid_disks) {
2057+ printk("md: md%d, array needs %d disks, has %d, aborting.\n",
2058+ mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
2059 goto abort;
2060 }
2061-
2062 /*
2063- * We need to add this as a superblock option.
2064+ * Now the numbering check
2065 */
2066-#if SUPPORT_RECONSTRUCTION
2067- if (sb->state != (1 << MD_SB_CLEAN)) {
2068- if (sb->level == 1) {
2069- printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
2070+ for (i = 0; i < mddev->nb_dev; i++) {
2071+ c = 0;
2072+ ITERATE_RDEV(mddev,rdev,tmp) {
2073+ if (rdev->desc_nr == i)
2074+ c++;
2075+ }
2076+ if (c == 0) {
2077+ printk("md: md%d, missing disk #%d, aborting.\n",
2078+ mdidx(mddev), i);
2079 goto abort;
2080- } else
2081- printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor)));
2082- }
2083-#else
2084- if (sb->state != (1 << MD_SB_CLEAN)) {
2085- printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
2086- goto abort;
2087- }
2088-#endif /* SUPPORT_RECONSTRUCTION */
2089-
2090- switch (sb->level) {
2091- case 1:
2092- md_size[minor] = sb->size;
2093- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD;
2094- break;
2095- case 4:
2096- case 5:
2097- md_size[minor] = sb->size * (sb->raid_disks - 1);
2098- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1);
2099- break;
2100- default:
2101- printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)),
2102- sb->level);
2103+ }
2104+ if (c > 1) {
2105+ printk("md: md%d, too many disks #%d, aborting.\n",
2106+ mdidx(mddev), i);
2107 goto abort;
2108+ }
2109 }
2110 return 0;
2111 abort:
2112- free_sb(mddev);
2113 return 1;
2114 }
2115
2116-#undef INCONSISTENT
2117-#undef OUT_OF_DATE
2118-#undef OLD_VERSION
2119-#undef NOT_CLEAN
2120-#undef OLD_LEVEL
2121-
2122-int md_update_sb(int minor)
2123+static unsigned int zoned_raid_size (mddev_t *mddev)
2124 {
2125- struct md_dev *mddev = md_dev + minor;
2126- struct buffer_head *bh;
2127- md_superblock_t *sb = mddev->sb;
2128- struct real_dev *realdev;
2129- kdev_t dev;
2130- int i;
2131- u32 sb_offset;
2132+ unsigned int mask;
2133+ mdk_rdev_t * rdev;
2134+ struct md_list_head *tmp;
2135
2136- sb->utime = CURRENT_TIME;
2137- for (i = 0; i < mddev->nb_dev; i++) {
2138- realdev = mddev->devices + i;
2139- if (!realdev->sb)
2140- continue;
2141- dev = realdev->dev;
2142- sb_offset = realdev->sb_offset;
2143- set_blocksize(dev, MD_SB_BYTES);
2144- printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset);
2145- bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
2146- if (bh) {
2147- sb = (md_superblock_t *) bh->b_data;
2148- memcpy(sb, mddev->sb, MD_SB_BYTES);
2149- memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4);
2150- mark_buffer_uptodate(bh, 1);
2151- mark_buffer_dirty(bh, 1);
2152- ll_rw_block(WRITE, 1, &bh);
2153- wait_on_buffer(bh);
2154- bforget(bh);
2155- fsync_dev(dev);
2156- invalidate_buffers(dev);
2157- } else
2158- printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
2159+ if (!mddev->sb) {
2160+ MD_BUG();
2161+ return -EINVAL;
2162+ }
2163+ /*
2164+ * do size and offset calculations.
2165+ */
2166+ mask = ~(mddev->sb->chunk_size/1024 - 1);
2167+printk("mask %08x\n", mask);
2168+
2169+ ITERATE_RDEV(mddev,rdev,tmp) {
2170+printk(" rdev->size: %d\n", rdev->size);
2171+ rdev->size &= mask;
2172+printk(" masked rdev->size: %d\n", rdev->size);
2173+ md_size[mdidx(mddev)] += rdev->size;
2174+printk(" new md_size: %d\n", md_size[mdidx(mddev)]);
2175 }
2176 return 0;
2177 }
2178
2179-static int do_md_run (int minor, int repart)
2180+static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
2181 {
2182- int pnum, i, min, factor, err;
2183+ if (disk_active(disk)) {
2184+ sb->working_disks--;
2185+ } else {
2186+ if (disk_spare(disk)) {
2187+ sb->spare_disks--;
2188+ sb->working_disks--;
2189+ } else {
2190+ sb->failed_disks--;
2191+ }
2192+ }
2193+ sb->nr_disks--;
2194+ disk->major = 0;
2195+ disk->minor = 0;
2196+ mark_disk_removed(disk);
2197+}
2198
2199- if (!md_dev[minor].nb_dev)
2200- return -EINVAL;
2201-
2202- if (md_dev[minor].pers)
2203- return -EBUSY;
2204+#define BAD_MAGIC KERN_ERR \
2205+"md: invalid raid superblock magic on %s\n"
2206
2207- md_dev[minor].repartition=repart;
2208-
2209- if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT))
2210- >= MAX_PERSONALITY)
2211- return -EINVAL;
2212-
2213- /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
2214- if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){
2215- for (i = 0; i < md_dev [minor].nb_dev; i++)
2216- if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR)
2217- return -EINVAL;
2218- }
2219- if (!pers[pnum])
2220- {
2221-#ifdef CONFIG_KMOD
2222- char module_name[80];
2223- sprintf (module_name, "md-personality-%d", pnum);
2224- request_module (module_name);
2225- if (!pers[pnum])
2226-#endif
2227- return -EINVAL;
2228- }
2229-
2230- factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
2231-
2232- md_blocksizes[minor] <<= FACTOR_SHIFT(FACTOR((md_dev+minor)));
2233+#define BAD_MINOR KERN_ERR \
2234+"md: %s: invalid raid minor (%x)\n"
2235
2236- for (i=0; i<md_dev[minor].nb_dev; i++)
2237- if (md_dev[minor].devices[i].size<min)
2238- {
2239- printk ("Dev %s smaller than %dk, cannot shrink\n",
2240- partition_name (md_dev[minor].devices[i].dev), min);
2241- return -EINVAL;
2242- }
2243-
2244- for (i=0; i<md_dev[minor].nb_dev; i++) {
2245- fsync_dev(md_dev[minor].devices[i].dev);
2246- invalidate_buffers(md_dev[minor].devices[i].dev);
2247- }
2248-
2249- /* Resize devices according to the factor. It is used to align
2250- partitions size on a given chunk size. */
2251- md_size[minor]=0;
2252-
2253- /*
2254- * Analyze the raid superblock
2255- */
2256- if (analyze_sbs(minor, pnum))
2257- return -EINVAL;
2258+#define OUT_OF_MEM KERN_ALERT \
2259+"md: out of memory.\n"
2260
2261- md_dev[minor].pers=pers[pnum];
2262-
2263- if ((err=md_dev[minor].pers->run (minor, md_dev+minor)))
2264- {
2265- md_dev[minor].pers=NULL;
2266- free_sb(md_dev + minor);
2267- return (err);
2268- }
2269-
2270- if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT)
2271- {
2272- md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN);
2273- md_update_sb(minor);
2274- }
2275-
2276- /* FIXME : We assume here we have blocks
2277- that are twice as large as sectors.
2278- THIS MAY NOT BE TRUE !!! */
2279- md_hd_struct[minor].start_sect=0;
2280- md_hd_struct[minor].nr_sects=md_size[minor]<<1;
2281-
2282- read_ahead[MD_MAJOR] = 128;
2283- return (0);
2284-}
2285+#define NO_SB KERN_ERR \
2286+"md: disabled device %s, could not read superblock.\n"
2287+
2288+#define BAD_CSUM KERN_WARNING \
2289+"md: invalid superblock checksum on %s\n"
2290
2291-static int do_md_stop (int minor, struct inode *inode)
2292+static int alloc_array_sb (mddev_t * mddev)
2293 {
2294- int i;
2295-
2296- if (inode->i_count>1 || md_dev[minor].busy>1) {
2297- /*
2298- * ioctl : one open channel
2299- */
2300- printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
2301- minor, inode->i_count, md_dev[minor].busy);
2302- return -EBUSY;
2303- }
2304-
2305- if (md_dev[minor].pers) {
2306- /*
2307- * It is safe to call stop here, it only frees private
2308- * data. Also, it tells us if a device is unstoppable
2309- * (eg. resyncing is in progress)
2310- */
2311- if (md_dev[minor].pers->stop (minor, md_dev+minor))
2312- return -EBUSY;
2313- /*
2314- * The device won't exist anymore -> flush it now
2315- */
2316- fsync_dev (inode->i_rdev);
2317- invalidate_buffers (inode->i_rdev);
2318- if (md_dev[minor].sb) {
2319- md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
2320- md_update_sb(minor);
2321- }
2322+ if (mddev->sb) {
2323+ MD_BUG();
2324+ return 0;
2325 }
2326-
2327- /* Remove locks. */
2328- if (md_dev[minor].sb)
2329- free_sb(md_dev + minor);
2330- for (i=0; i<md_dev[minor].nb_dev; i++)
2331- clear_inode (md_dev[minor].devices[i].inode);
2332-
2333- md_dev[minor].nb_dev=md_size[minor]=0;
2334- md_hd_struct[minor].nr_sects=0;
2335- md_dev[minor].pers=NULL;
2336-
2337- read_ahead[MD_MAJOR] = 128;
2338-
2339- return (0);
2340+
2341+ mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
2342+ if (!mddev->sb)
2343+ return -ENOMEM;
2344+ md_clear_page((unsigned long)mddev->sb);
2345+ return 0;
2346 }
2347
2348-static int do_md_add (int minor, kdev_t dev)
2349+static int alloc_disk_sb (mdk_rdev_t * rdev)
2350 {
2351- int i;
2352- int hot_add=0;
2353- struct real_dev *realdev;
2354+ if (rdev->sb)
2355+ MD_BUG();
2356
2357- if (md_dev[minor].nb_dev==MAX_REAL)
2358+ rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
2359+ if (!rdev->sb) {
2360+ printk (OUT_OF_MEM);
2361 return -EINVAL;
2362+ }
2363+ md_clear_page((unsigned long)rdev->sb);
2364
2365- if (!fs_may_mount (dev))
2366- return -EBUSY;
2367+ return 0;
2368+}
2369
2370- if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) {
2371- printk("md_add(): zero device size, huh, bailing out.\n");
2372- return -EINVAL;
2373+static void free_disk_sb (mdk_rdev_t * rdev)
2374+{
2375+ if (rdev->sb) {
2376+ free_page((unsigned long) rdev->sb);
2377+ rdev->sb = NULL;
2378+ rdev->sb_offset = 0;
2379+ rdev->size = 0;
2380+ } else {
2381+ if (!rdev->faulty)
2382+ MD_BUG();
2383 }
2384+}
2385
2386- if (md_dev[minor].pers) {
2387- /*
2388- * The array is already running, hot-add the drive, or
2389- * bail out:
2390- */
2391- if (!md_dev[minor].pers->hot_add_disk)
2392- return -EBUSY;
2393- else
2394- hot_add=1;
2395+static void mark_rdev_faulty (mdk_rdev_t * rdev)
2396+{
2397+ unsigned long flags;
2398+
2399+ if (!rdev) {
2400+ MD_BUG();
2401+ return;
2402 }
2403+ save_flags(flags);
2404+ cli();
2405+ free_disk_sb(rdev);
2406+ rdev->faulty = 1;
2407+ restore_flags(flags);
2408+}
2409+
2410+static int read_disk_sb (mdk_rdev_t * rdev)
2411+{
2412+ int ret = -EINVAL;
2413+ struct buffer_head *bh = NULL;
2414+ kdev_t dev = rdev->dev;
2415+ mdp_super_t *sb;
2416+ u32 sb_offset;
2417
2418+ if (!rdev->sb) {
2419+ MD_BUG();
2420+ goto abort;
2421+ }
2422+
2423 /*
2424- * Careful. We cannot increase nb_dev for a running array.
2425+ * Calculate the position of the superblock,
2426+ * it's at the end of the disk
2427 */
2428- i=md_dev[minor].nb_dev;
2429- realdev = &md_dev[minor].devices[i];
2430- realdev->dev=dev;
2431-
2432- /* Lock the device by inserting a dummy inode. This doesn't
2433- smell very good, but I need to be consistent with the
2434- mount stuff, specially with fs_may_mount. If someone have
2435- a better idea, please help ! */
2436-
2437- realdev->inode=get_empty_inode ();
2438- realdev->inode->i_dev=dev; /* don't care about other fields */
2439- insert_inode_hash (realdev->inode);
2440-
2441- /* Sizes are now rounded at run time */
2442-
2443-/* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/
2444-
2445- realdev->size=blk_size[MAJOR(dev)][MINOR(dev)];
2446+ sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
2447+ rdev->sb_offset = sb_offset;
2448+ printk("(read) %s's sb offset: %d", partition_name(dev),
2449+ sb_offset);
2450+ fsync_dev(dev);
2451+ set_blocksize (dev, MD_SB_BYTES);
2452+ bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
2453
2454- if (hot_add) {
2455- /*
2456- * Check the superblock for consistency.
2457- * The personality itself has to check whether it's getting
2458- * added with the proper flags. The personality has to be
2459- * checked too. ;)
2460+ if (bh) {
2461+ sb = (mdp_super_t *) bh->b_data;
2462+ memcpy (rdev->sb, sb, MD_SB_BYTES);
2463+ } else {
2464+ printk (NO_SB,partition_name(rdev->dev));
2465+ goto abort;
2466+ }
2467+ printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events));
2468+ ret = 0;
2469+abort:
2470+ if (bh)
2471+ brelse (bh);
2472+ return ret;
2473+}
2474+
2475+static unsigned int calc_sb_csum (mdp_super_t * sb)
2476+{
2477+ unsigned int disk_csum, csum;
2478+
2479+ disk_csum = sb->sb_csum;
2480+ sb->sb_csum = 0;
2481+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
2482+ sb->sb_csum = disk_csum;
2483+ return csum;
2484+}
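calc_sb_csum() follows the usual self-checksumming discipline: zero the checksum field while summing, then restore the on-disk value, so the stored checksum never feeds into its own computation. A simplified sketch with a byte-wise sum standing in for csum_partial() (the struct is hypothetical):

#include <stdio.h>
#include <string.h>

struct sb_like {
        unsigned int data[31];
        unsigned int csum;              /* must not checksum itself */
};

static unsigned int calc_csum(struct sb_like *sb)
{
        unsigned int saved = sb->csum, sum = 0;
        const unsigned char *p = (const unsigned char *) sb;
        unsigned int i;

        sb->csum = 0;                   /* exclude the field from the sum */
        for (i = 0; i < sizeof(*sb); i++)
                sum += p[i];
        sb->csum = saved;               /* put back what was on disk */
        return sum;
}

int main(void)
{
        struct sb_like sb;

        memset(&sb, 0x5a, sizeof(sb));
        sb.csum = calc_csum(&sb);
        /* verification recomputes with the field zeroed, so it matches */
        printf("%s\n", calc_csum(&sb) == sb.csum ? "ok" : "bad");
        return 0;
}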
2485+
2486+/*
2487+ * Check one RAID superblock for generic plausibility
2488+ */
2489+
2490+static int check_disk_sb (mdk_rdev_t * rdev)
2491+{
2492+ mdp_super_t *sb;
2493+ int ret = -EINVAL;
2494+
2495+ sb = rdev->sb;
2496+ if (!sb) {
2497+ MD_BUG();
2498+ goto abort;
2499+ }
2500+
2501+ if (sb->md_magic != MD_SB_MAGIC) {
2502+ printk (BAD_MAGIC, partition_name(rdev->dev));
2503+ goto abort;
2504+ }
2505+
2506+ if (sb->md_minor >= MAX_MD_DEVS) {
2507+ printk (BAD_MINOR, partition_name(rdev->dev),
2508+ sb->md_minor);
2509+ goto abort;
2510+ }
2511+
2512+ if (calc_sb_csum(sb) != sb->sb_csum)
2513+ printk(BAD_CSUM, partition_name(rdev->dev));
2514+ ret = 0;
2515+abort:
2516+ return ret;
2517+}
2518+
2519+static kdev_t dev_unit(kdev_t dev)
2520+{
2521+ unsigned int mask;
2522+ struct gendisk *hd = find_gendisk(dev);
2523+
2524+ if (!hd)
2525+ return 0;
2526+ mask = ~((1 << hd->minor_shift) - 1);
2527+
2528+ return MKDEV(MAJOR(dev), MINOR(dev) & mask);
2529+}
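dev_unit() collapses a partition's device number to its whole-disk unit by masking off the low minor bits; match_dev_unit() then merely compares units to spot two members sharing one spindle. In miniature, assuming a minor_shift of 4 (16 minors per disk, the usual 2.2-era disk layout):

#include <stdio.h>

#define MINOR_SHIFT 4   /* assumed: 16 minors per physical disk */

static unsigned int unit_minor(unsigned int minor)
{
        return minor & ~((1u << MINOR_SHIFT) - 1);
}

int main(void)
{
        /* minors 1 and 5 share unit 0; minor 17 is the next disk (16) */
        printf("%u %u %u\n", unit_minor(1), unit_minor(5), unit_minor(17));
        return 0;
}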
2530+
2531+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
2532+{
2533+ struct md_list_head *tmp;
2534+ mdk_rdev_t *rdev;
2535+
2536+ ITERATE_RDEV(mddev,rdev,tmp)
2537+ if (dev_unit(rdev->dev) == dev_unit(dev))
2538+ return rdev;
2539+
2540+ return NULL;
2541+}
2542+
2543+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
2544+{
2545+ struct md_list_head *tmp;
2546+ mdk_rdev_t *rdev;
2547+
2548+ ITERATE_RDEV(mddev1,rdev,tmp)
2549+ if (match_dev_unit(mddev2, rdev->dev))
2550+ return 1;
2551+
2552+ return 0;
2553+}
2554+
2555+static MD_LIST_HEAD(all_raid_disks);
2556+static MD_LIST_HEAD(pending_raid_disks);
2557+
2558+static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
2559+{
2560+ mdk_rdev_t *same_pdev;
2561+
2562+ if (rdev->mddev) {
2563+ MD_BUG();
2564+ return;
2565+ }
2566+ same_pdev = match_dev_unit(mddev, rdev->dev);
2567+ if (same_pdev)
2568+ printk( KERN_WARNING
2569+"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
2570+" protection against single-disk failure might be compromised.\n",
2571+ mdidx(mddev), partition_name(rdev->dev),
2572+ partition_name(same_pdev->dev));
2573+
2574+ md_list_add(&rdev->same_set, &mddev->disks);
2575+ rdev->mddev = mddev;
2576+ mddev->nb_dev++;
2577+ printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
2578+}
2579+
2580+static void unbind_rdev_from_array (mdk_rdev_t * rdev)
2581+{
2582+ if (!rdev->mddev) {
2583+ MD_BUG();
2584+ return;
2585+ }
2586+ md_list_del(&rdev->same_set);
2587+ MD_INIT_LIST_HEAD(&rdev->same_set);
2588+ rdev->mddev->nb_dev--;
2589+ printk("unbind<%s,%d>\n", partition_name(rdev->dev),
2590+ rdev->mddev->nb_dev);
2591+ rdev->mddev = NULL;
2592+}
2593+
2594+/*
2595+ * prevent the device from being mounted, repartitioned or
2596+ * otherwise reused by a RAID array (or any other kernel
2597+ * subsystem), by opening the device. [simply getting an
2598+ * inode is not enough, the SCSI module usage code needs
2599+ * an explicit open() on the device]
2600+ */
2601+static int lock_rdev (mdk_rdev_t *rdev)
2602+{
2603+ int err = 0;
2604+
2605+ /*
2606+ * First insert a dummy inode.
2607+ */
2608+ if (rdev->inode)
2609+ MD_BUG();
2610+ rdev->inode = get_empty_inode();
2611+ /*
2612+ * we don't care about any other fields
2613+ */
2614+ rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev;
2615+ insert_inode_hash(rdev->inode);
2616+
2617+ memset(&rdev->filp, 0, sizeof(rdev->filp));
2618+ rdev->filp.f_mode = 3; /* read write */
2619+ err = blkdev_open(rdev->inode, &rdev->filp);
2620+ if (err) {
2621+ printk("blkdev_open() failed: %d\n", err);
2622+ clear_inode(rdev->inode);
2623+ rdev->inode = NULL;
2624+ }
2625+ return err;
2626+}
2627+
2628+static void unlock_rdev (mdk_rdev_t *rdev)
2629+{
2630+ blkdev_release(rdev->inode);
2631+ if (!rdev->inode)
2632+ MD_BUG();
2633+ clear_inode(rdev->inode);
2634+ rdev->inode = NULL;
2635+}
2636+
2637+static void export_rdev (mdk_rdev_t * rdev)
2638+{
2639+ printk("export_rdev(%s)\n",partition_name(rdev->dev));
2640+ if (rdev->mddev)
2641+ MD_BUG();
2642+ unlock_rdev(rdev);
2643+ free_disk_sb(rdev);
2644+ md_list_del(&rdev->all);
2645+ MD_INIT_LIST_HEAD(&rdev->all);
2646+ if (rdev->pending.next != &rdev->pending) {
2647+ printk("(%s was pending)\n",partition_name(rdev->dev));
2648+ md_list_del(&rdev->pending);
2649+ MD_INIT_LIST_HEAD(&rdev->pending);
2650+ }
2651+ rdev->dev = 0;
2652+ rdev->faulty = 0;
2653+ kfree(rdev);
2654+}
2655+
2656+static void kick_rdev_from_array (mdk_rdev_t * rdev)
2657+{
2658+ unbind_rdev_from_array(rdev);
2659+ export_rdev(rdev);
2660+}
2661+
2662+static void export_array (mddev_t *mddev)
2663+{
2664+ struct md_list_head *tmp;
2665+ mdk_rdev_t *rdev;
2666+ mdp_super_t *sb = mddev->sb;
2667+
2668+ if (mddev->sb) {
2669+ mddev->sb = NULL;
2670+ free_page((unsigned long) sb);
2671+ }
2672+
2673+ ITERATE_RDEV(mddev,rdev,tmp) {
2674+ if (!rdev->mddev) {
2675+ MD_BUG();
2676+ continue;
2677+ }
2678+ kick_rdev_from_array(rdev);
2679+ }
2680+ if (mddev->nb_dev)
2681+ MD_BUG();
2682+}
2683+
2684+#undef BAD_CSUM
2685+#undef BAD_MAGIC
2686+#undef OUT_OF_MEM
2687+#undef NO_SB
2688+
2689+static void print_desc(mdp_disk_t *desc)
2690+{
2691+ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
2692+ partition_name(MKDEV(desc->major,desc->minor)),
2693+ desc->major,desc->minor,desc->raid_disk,desc->state);
2694+}
2695+
2696+static void print_sb(mdp_super_t *sb)
2697+{
2698+ int i;
2699+
2700+ printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
2701+ sb->major_version, sb->minor_version, sb->patch_version,
2702+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
2703+ sb->ctime);
2704+ printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
2705+ sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
2706+ sb->layout, sb->chunk_size);
2707+ printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
2708+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
2709+ sb->failed_disks, sb->spare_disks,
2710+ sb->sb_csum, (unsigned long)get_unaligned(&sb->events));
2711+
2712+ for (i = 0; i < MD_SB_DISKS; i++) {
2713+ mdp_disk_t *desc;
2714+
2715+ desc = sb->disks + i;
2716+ printk(" D %2d: ", i);
2717+ print_desc(desc);
2718+ }
2719+ printk(" THIS: ");
2720+ print_desc(&sb->this_disk);
2721+
2722+}
2723+
2724+static void print_rdev(mdk_rdev_t *rdev)
2725+{
2726+ printk(" rdev %s: O:%s, SZ:%08d F:%d DN:%d ",
2727+ partition_name(rdev->dev), partition_name(rdev->old_dev),
2728+ rdev->size, rdev->faulty, rdev->desc_nr);
2729+ if (rdev->sb) {
2730+ printk("rdev superblock:\n");
2731+ print_sb(rdev->sb);
2732+ } else
2733+ printk("no rdev superblock!\n");
2734+}
2735+
2736+void md_print_devices (void)
2737+{
2738+ struct md_list_head *tmp, *tmp2;
2739+ mdk_rdev_t *rdev;
2740+ mddev_t *mddev;
2741+
2742+ printk("\n");
2743+ printk(" **********************************\n");
2744+ printk(" * <COMPLETE RAID STATE PRINTOUT> *\n");
2745+ printk(" **********************************\n");
2746+ ITERATE_MDDEV(mddev,tmp) {
2747+ printk("md%d: ", mdidx(mddev));
2748+
2749+ ITERATE_RDEV(mddev,rdev,tmp2)
2750+ printk("<%s>", partition_name(rdev->dev));
2751+
2752+ if (mddev->sb) {
2753+ printk(" array superblock:\n");
2754+ print_sb(mddev->sb);
2755+ } else
2756+ printk(" no array superblock.\n");
2757+
2758+ ITERATE_RDEV(mddev,rdev,tmp2)
2759+ print_rdev(rdev);
2760+ }
2761+ printk(" **********************************\n");
2762+ printk("\n");
2763+}
2764+
2765+static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
2766+{
2767+ int ret;
2768+ mdp_super_t *tmp1, *tmp2;
2769+
2770+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
2771+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
2772+
2773+ if (!tmp1 || !tmp2) {
2774+ ret = 0;
2775+ goto abort;
2776+ }
2777+
2778+ *tmp1 = *sb1;
2779+ *tmp2 = *sb2;
2780+
2781+ /*
2782+ * nr_disks is not constant
2783+ */
2784+ tmp1->nr_disks = 0;
2785+ tmp2->nr_disks = 0;
2786+
2787+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
2788+ ret = 0;
2789+ else
2790+ ret = 1;
2791+
2792+abort:
2793+ if (tmp1)
2794+ kfree(tmp1);
2795+ if (tmp2)
2796+ kfree(tmp2);
2797+
2798+ return ret;
2799+}
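sb_equal() compares two superblocks "modulo one field" by blanking nr_disks on private copies before the memcmp(). The trick restated stand-alone (an int-only struct, so no padding bytes can upset memcmp()):

#include <stdio.h>
#include <string.h>

struct conf {
        int uuid[4];
        int nr_disks;   /* volatile: ignored by the comparison */
        int chunk;
};

static int conf_equal(const struct conf *a, const struct conf *b)
{
        struct conf ta = *a, tb = *b;

        ta.nr_disks = tb.nr_disks = 0;  /* blank the field on the copies */
        return memcmp(&ta, &tb, sizeof(ta)) == 0;
}

int main(void)
{
        struct conf a = { { 1, 2, 3, 4 }, 5, 64 };
        struct conf b = a;

        b.nr_disks = 7;                 /* differs only in the blanked field */
        printf("%d\n", conf_equal(&a, &b));     /* prints 1 */
        return 0;
}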
2800+
2801+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
2802+{
2803+ if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
2804+ (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
2805+ (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
2806+ (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
2807+
2808+ return 1;
2809+
2810+ return 0;
2811+}
2812+
2813+static mdk_rdev_t * find_rdev_all (kdev_t dev)
2814+{
2815+ struct md_list_head *tmp;
2816+ mdk_rdev_t *rdev;
2817+
2818+ tmp = all_raid_disks.next;
2819+ while (tmp != &all_raid_disks) {
2820+ rdev = md_list_entry(tmp, mdk_rdev_t, all);
2821+ if (rdev->dev == dev)
2822+ return rdev;
2823+ tmp = tmp->next;
2824+ }
2825+ return NULL;
2826+}
2827+
2828+#define GETBLK_FAILED KERN_ERR \
2829+"md: getblk failed for device %s\n"
2830+
2831+static int write_disk_sb(mdk_rdev_t * rdev)
2832+{
2833+ struct buffer_head *bh;
2834+ kdev_t dev;
2835+ u32 sb_offset, size;
2836+ mdp_super_t *sb;
2837+
2838+ if (!rdev->sb) {
2839+ MD_BUG();
2840+ return -1;
2841+ }
2842+ if (rdev->faulty) {
2843+ MD_BUG();
2844+ return -1;
2845+ }
2846+ if (rdev->sb->md_magic != MD_SB_MAGIC) {
2847+ MD_BUG();
2848+ return -1;
2849+ }
2850+
2851+ dev = rdev->dev;
2852+ sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
2853+ if (rdev->sb_offset != sb_offset) {
2854+ printk("%s's sb offset has changed from %d to %d, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
2855+ goto skip;
2856+ }
2857+ /*
2858+ * If the disk went offline meanwhile and it's just a spare, then
2859+ * its size has changed to zero silently, and the MD code does
2860+ * not yet know that it's faulty.
2861+ */
2862+ size = calc_dev_size(dev, rdev->mddev, 1);
2863+ if (size != rdev->size) {
2864+ printk("%s's size has changed from %d to %d since import, skipping\n", partition_name(dev), rdev->size, size);
2865+ goto skip;
2866+ }
2867+
2868+ printk("(write) %s's sb offset: %d\n", partition_name(dev), sb_offset);
2869+ fsync_dev(dev);
2870+ set_blocksize(dev, MD_SB_BYTES);
2871+ bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
2872+ if (!bh) {
2873+ printk(GETBLK_FAILED, partition_name(dev));
2874+ return 1;
2875+ }
2876+ memset(bh->b_data,0,bh->b_size);
2877+ sb = (mdp_super_t *) bh->b_data;
2878+ memcpy(sb, rdev->sb, MD_SB_BYTES);
2879+
2880+ mark_buffer_uptodate(bh, 1);
2881+ mark_buffer_dirty(bh, 1);
2882+ ll_rw_block(WRITE, 1, &bh);
2883+ wait_on_buffer(bh);
2884+ brelse(bh);
2885+ fsync_dev(dev);
2886+skip:
2887+ return 0;
2888+}
2889+#undef GETBLK_FAILED
2890+
2891+static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2892+{
2893+ int i, ok = 0;
2894+ mdp_disk_t *desc;
2895+
2896+ for (i = 0; i < MD_SB_DISKS; i++) {
2897+ desc = mddev->sb->disks + i;
2898+#if 0
2899+ if (disk_faulty(desc)) {
2900+ if (MKDEV(desc->major,desc->minor) == rdev->dev)
2901+ ok = 1;
2902+ continue;
2903+ }
2904+#endif
2905+ if (MKDEV(desc->major,desc->minor) == rdev->dev) {
2906+ rdev->sb->this_disk = *desc;
2907+ rdev->desc_nr = desc->number;
2908+ ok = 1;
2909+ break;
2910+ }
2911+ }
2912+
2913+ if (!ok) {
2914+ MD_BUG();
2915+ }
2916+}
2917+
2918+static int sync_sbs(mddev_t * mddev)
2919+{
2920+ mdk_rdev_t *rdev;
2921+ mdp_super_t *sb;
2922+ struct md_list_head *tmp;
2923+
2924+ ITERATE_RDEV(mddev,rdev,tmp) {
2925+ if (rdev->faulty)
2926+ continue;
2927+ sb = rdev->sb;
2928+ *sb = *mddev->sb;
2929+ set_this_disk(mddev, rdev);
2930+ sb->sb_csum = calc_sb_csum(sb);
2931+ }
2932+ return 0;
2933+}
2934+
2935+int md_update_sb(mddev_t * mddev)
2936+{
2937+ int first, err, count = 100;
2938+ struct md_list_head *tmp;
2939+ mdk_rdev_t *rdev;
2940+ __u64 ev;
2941+
2942+repeat:
2943+ mddev->sb->utime = CURRENT_TIME;
2944+ ev = get_unaligned(&mddev->sb->events);
2945+ ++ev;
2946+ put_unaligned(ev,&mddev->sb->events);
2947+ if (ev == (__u64)0) {
2948+ /*
2949+ * oops, this 64-bit counter should never wrap.
2950+ * Either we are somewhere around the year ~1 trillion A.D., assuming
2951+ * 1 reboot per second, or we have a bug:
2952+ */
2953+ MD_BUG();
2954+ --ev;
2955+ put_unaligned(ev,&mddev->sb->events);
2956+ }
2957+ sync_sbs(mddev);
2958+
2959+ /*
2960+ * do not write anything to disk if using
2961+ * nonpersistent superblocks
2962+ */
2963+ if (mddev->sb->not_persistent)
2964+ return 0;
2965+
2966+ printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
2967+ mdidx(mddev));
2968+
2969+ first = 1;
2970+ err = 0;
2971+ ITERATE_RDEV(mddev,rdev,tmp) {
2972+ if (!first) {
2973+ printk(", ");
2974+ }
2975+ first = 0;
2976+ if (rdev->faulty)
2977+ printk("(skipping faulty ");
2978+ printk("%s ", partition_name(rdev->dev));
2979+ if (!rdev->faulty) {
2980+ printk("[events: %08lx]",
2981+ (unsigned long)get_unaligned(&rdev->sb->events));
2982+ err += write_disk_sb(rdev);
2983+ } else
2984+ printk(")\n");
2985+ }
2986+ printk(".\n");
2987+ if (err) {
2988+ printk("errors occured during superblock update, repeating\n");
2989+ if (--count)
2990+ goto repeat;
2991+ printk("excessive errors occured during superblock update, exiting\n");
2992+ }
2993+ return 0;
2994+}
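The event counter bumped at the top of md_update_sb() is the real freshness arbiter: every successful superblock write increments it on all members, so a member that missed writes is recognizably behind. The discipline in miniature, including the never-wrap guard (simplified, without the get/put_unaligned accessors):

#include <stdio.h>

typedef unsigned long long u64;

struct sb { u64 events; };

/* one call per superblock write, mirroring md_update_sb() */
static void bump_events(struct sb *sb)
{
        if (++sb->events == 0) /* a 64-bit wrap would mean a bug */
                --sb->events;  /* refuse to wrap back to "oldest" */
}

int main(void)
{
        struct sb a = { 41 }, b = { 41 };

        bump_events(&a);        /* a saw one more update than b */
        printf("freshest: %s\n", a.events > b.events ? "a" : "b");
        return 0;
}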
2995+
2996+/*
2997+ * Import a device. If 'on_disk', then sanity check the superblock
2998+ *
2999+ * mark the device faulty if:
3000+ *
3001+ * - the device is nonexistent (zero size)
3002+ * - the device has no valid superblock
3003+ *
3004+ * a faulty rdev _never_ has rdev->sb set.
3005+ */
3006+static int md_import_device (kdev_t newdev, int on_disk)
3007+{
3008+ int err;
3009+ mdk_rdev_t *rdev;
3010+ unsigned int size;
3011+
3012+ if (find_rdev_all(newdev))
3013+ return -EEXIST;
3014+
3015+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
3016+ if (!rdev) {
3017+ printk("could not alloc mem for %s!\n", partition_name(newdev));
3018+ return -ENOMEM;
3019+ }
3020+ memset(rdev, 0, sizeof(*rdev));
3021+
3022+ if (!fs_may_mount(newdev)) {
3023+ printk("md: can not import %s, has active inodes!\n",
3024+ partition_name(newdev));
3025+ err = -EBUSY;
3026+ goto abort_free;
3027+ }
3028+
3029+ if ((err = alloc_disk_sb(rdev)))
3030+ goto abort_free;
3031+
3032+ rdev->dev = newdev;
3033+ if (lock_rdev(rdev)) {
3034+ printk("md: could not lock %s, zero-size? Marking faulty.\n",
3035+ partition_name(newdev));
3036+ err = -EINVAL;
3037+ goto abort_free;
3038+ }
3039+ rdev->desc_nr = -1;
3040+ rdev->faulty = 0;
3041+
3042+ size = 0;
3043+ if (blk_size[MAJOR(newdev)])
3044+ size = blk_size[MAJOR(newdev)][MINOR(newdev)];
3045+ if (!size) {
3046+ printk("md: %s has zero size, marking faulty!\n",
3047+ partition_name(newdev));
3048+ err = -EINVAL;
3049+ goto abort_free;
3050+ }
3051+
3052+ if (on_disk) {
3053+ if ((err = read_disk_sb(rdev))) {
3054+ printk("md: could not read %s's sb, not importing!\n",
3055+ partition_name(newdev));
3056+ goto abort_free;
3057+ }
3058+ if ((err = check_disk_sb(rdev))) {
3059+ printk("md: %s has invalid sb, not importing!\n",
3060+ partition_name(newdev));
3061+ goto abort_free;
3062+ }
3063+
3064+ rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
3065+ rdev->sb->this_disk.minor);
3066+ rdev->desc_nr = rdev->sb->this_disk.number;
3067+ }
3068+ md_list_add(&rdev->all, &all_raid_disks);
3069+ MD_INIT_LIST_HEAD(&rdev->pending);
3070+
3071+ if (rdev->faulty && rdev->sb)
3072+ free_disk_sb(rdev);
3073+ return 0;
3074+
3075+abort_free:
3076+ if (rdev->sb) {
3077+ if (rdev->inode)
3078+ unlock_rdev(rdev);
3079+ free_disk_sb(rdev);
3080+ }
3081+ kfree(rdev);
3082+ return err;
3083+}
3084+
3085+/*
3086+ * Check a full RAID array for plausibility
3087+ */
3088+
3089+#define INCONSISTENT KERN_ERR \
3090+"md: fatal superblock inconsistency in %s -- removing from array\n"
3091+
3092+#define OUT_OF_DATE KERN_ERR \
3093+"md: superblock update time inconsistency -- using the most recent one\n"
3094+
3095+#define OLD_VERSION KERN_ALERT \
3096+"md: md%d: unsupported raid array version %d.%d.%d\n"
3097+
3098+#define NOT_CLEAN_IGNORE KERN_ERR \
3099+"md: md%d: raid array is not clean -- starting background reconstruction\n"
3100+
3101+#define UNKNOWN_LEVEL KERN_ERR \
3102+"md: md%d: unsupported raid level %d\n"
3103+
3104+static int analyze_sbs (mddev_t * mddev)
3105+{
3106+ int out_of_date = 0, i;
3107+ struct md_list_head *tmp, *tmp2;
3108+ mdk_rdev_t *rdev, *rdev2, *freshest;
3109+ mdp_super_t *sb;
3110+
3111+ /*
3112+ * Verify the RAID superblock on each real device
3113+ */
3114+ ITERATE_RDEV(mddev,rdev,tmp) {
3115+ if (rdev->faulty) {
3116+ MD_BUG();
3117+ goto abort;
3118+ }
3119+ if (!rdev->sb) {
3120+ MD_BUG();
3121+ goto abort;
3122+ }
3123+ if (check_disk_sb(rdev))
3124+ goto abort;
3125+ }
3126+
3127+ /*
3128+ * The superblock constant part has to be the same
3129+ * for all disks in the array.
3130+ */
3131+ sb = NULL;
3132+
3133+ ITERATE_RDEV(mddev,rdev,tmp) {
3134+ if (!sb) {
3135+ sb = rdev->sb;
3136+ continue;
3137+ }
3138+ if (!sb_equal(sb, rdev->sb)) {
3139+ printk (INCONSISTENT, partition_name(rdev->dev));
3140+ kick_rdev_from_array(rdev);
3141+ continue;
3142+ }
3143+ }
3144+
3145+ /*
3146+ * OK, we have all disks and the array is ready to run. Let's
3147+ * find the freshest superblock, that one will be the superblock
3148+ * that represents the whole array.
3149+ */
3150+ if (!mddev->sb)
3151+ if (alloc_array_sb(mddev))
3152+ goto abort;
3153+ sb = mddev->sb;
3154+ freshest = NULL;
3155+
3156+ ITERATE_RDEV(mddev,rdev,tmp) {
3157+ __u64 ev1, ev2;
3158+ /*
3159+ * if the checksum is invalid, use the superblock
3160+ * only as a last resort. (decrease its age by
3161+ * one event)
3162+ */
3163+ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
3164+ __u64 ev = get_unaligned(&rdev->sb->events);
3165+ if (ev != (__u64)0) {
3166+ --ev;
3167+ put_unaligned(ev,&rdev->sb->events);
3168+ }
3169+ }
3170+
3171+ printk("%s's event counter: %08lx\n", partition_name(rdev->dev),
3172+ (unsigned long)get_unaligned(&rdev->sb->events));
3173+ if (!freshest) {
3174+ freshest = rdev;
3175+ continue;
3176+ }
3177+ /*
3178+ * Find the newest superblock version
3179+ */
3180+ ev1 = get_unaligned(&rdev->sb->events);
3181+ ev2 = get_unaligned(&freshest->sb->events);
3182+ if (ev1 != ev2) {
3183+ out_of_date = 1;
3184+ if (ev1 > ev2)
3185+ freshest = rdev;
3186+ }
3187+ }
3188+ if (out_of_date) {
3189+ printk(OUT_OF_DATE);
3190+ printk("freshest: %s\n", partition_name(freshest->dev));
3191+ }
3192+ memcpy (sb, freshest->sb, sizeof(*sb));
3193+
3194+ /*
3195+ * at this point we have picked the 'best' superblock
3196+ * from all available superblocks.
3197+ * now we validate this superblock and kick out possibly
3198+ * failed disks.
3199+ */
3200+ ITERATE_RDEV(mddev,rdev,tmp) {
3201+ /*
3202+ * Kick all non-fresh devices faulty
3203+ */
3204+ __u64 ev1, ev2;
3205+ ev1 = get_unaligned(&rdev->sb->events);
3206+ ev2 = get_unaligned(&sb->events);
3207+ ++ev1;
3208+ if (ev1 < ev2) {
3209+ printk("md: kicking non-fresh %s from array!\n",
3210+ partition_name(rdev->dev));
3211+ kick_rdev_from_array(rdev);
3212+ continue;
3213+ }
3214+ }
3215+
3216+ /*
3217+ * Fix up changed device names ... but only if this disk has a
3218+ * recent update time. Superblocks with a failed checksum count too.
3219+ */
3220+ ITERATE_RDEV(mddev,rdev,tmp) {
3221+ __u64 ev1, ev2, ev3;
3222+ if (rdev->faulty) { /* REMOVEME */
3223+ MD_BUG();
3224+ goto abort;
3225+ }
3226+ ev1 = get_unaligned(&rdev->sb->events);
3227+ ev2 = get_unaligned(&sb->events);
3228+ ev3 = ev2;
3229+ --ev3;
3230+ if ((rdev->dev != rdev->old_dev) &&
3231+ ((ev1 == ev2) || (ev1 == ev3))) {
3232+ mdp_disk_t *desc;
3233+
3234+ printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev));
3235+ if (rdev->desc_nr == -1) {
3236+ MD_BUG();
3237+ goto abort;
3238+ }
3239+ desc = &sb->disks[rdev->desc_nr];
3240+ if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
3241+ MD_BUG();
3242+ goto abort;
3243+ }
3244+ desc->major = MAJOR(rdev->dev);
3245+ desc->minor = MINOR(rdev->dev);
3246+ desc = &rdev->sb->this_disk;
3247+ desc->major = MAJOR(rdev->dev);
3248+ desc->minor = MINOR(rdev->dev);
3249+ }
3250+ }
3251+
3252+ /*
3253+ * Remove unavailable and faulty devices ...
3254+ *
3255+ * note that if an array becomes completely unrunnable due to
3256+ * missing devices, we do not write the superblock back, so the
3257+ * administrator has a chance to fix things up. The removal thus
3258+ * only happens if it's nonfatal to the contents of the array.
3259+ */
3260+ for (i = 0; i < MD_SB_DISKS; i++) {
3261+ int found;
3262+ mdp_disk_t *desc;
3263+ kdev_t dev;
3264+
3265+ desc = sb->disks + i;
3266+ dev = MKDEV(desc->major, desc->minor);
3267+
3268+ /*
3269+ * We kick faulty devices/descriptors immediately.
3270+ */
3271+ if (disk_faulty(desc)) {
3272+ found = 0;
3273+ ITERATE_RDEV(mddev,rdev,tmp) {
3274+ if (rdev->desc_nr != desc->number)
3275+ continue;
3276+ printk("md%d: kicking faulty %s!\n",
3277+ mdidx(mddev),partition_name(rdev->dev));
3278+ kick_rdev_from_array(rdev);
3279+ found = 1;
3280+ break;
3281+ }
3282+ if (!found) {
3283+ if (dev == MKDEV(0,0))
3284+ continue;
3285+ printk("md%d: removing former faulty %s!\n",
3286+ mdidx(mddev), partition_name(dev));
3287+ }
3288+ remove_descriptor(desc, sb);
3289+ continue;
3290+ }
3291+
3292+ if (dev == MKDEV(0,0))
3293+ continue;
3294+ /*
3295+ * Is this device present in the rdev ring?
3296+ */
3297+ found = 0;
3298+ ITERATE_RDEV(mddev,rdev,tmp) {
3299+ if (rdev->desc_nr == desc->number) {
3300+ found = 1;
3301+ break;
3302+ }
3303+ }
3304+ if (found)
3305+ continue;
3306+
3307+ printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev));
3308+ remove_descriptor(desc, sb);
3309+ }
3310+
3311+ /*
3312+ * Double-check whether all devices mentioned in the
3313+ * superblock are in the rdev ring.
3314+ */
3315+ for (i = 0; i < MD_SB_DISKS; i++) {
3316+ mdp_disk_t *desc;
3317+ kdev_t dev;
3318+
3319+ desc = sb->disks + i;
3320+ dev = MKDEV(desc->major, desc->minor);
3321+
3322+ if (dev == MKDEV(0,0))
3323+ continue;
3324+
3325+ if (disk_faulty(desc)) {
3326+ MD_BUG();
3327+ goto abort;
3328+ }
3329+
3330+ rdev = find_rdev(mddev, dev);
3331+ if (!rdev) {
3332+ MD_BUG();
3333+ goto abort;
3334+ }
3335+ }
3336+
3337+ /*
3338+ * Do a final reality check.
3339+ */
3340+ ITERATE_RDEV(mddev,rdev,tmp) {
3341+ if (rdev->desc_nr == -1) {
3342+ MD_BUG();
3343+ goto abort;
3344+ }
3345+ /*
3346+ * is the desc_nr unique?
3347+ */
3348+ ITERATE_RDEV(mddev,rdev2,tmp2) {
3349+ if ((rdev2 != rdev) &&
3350+ (rdev2->desc_nr == rdev->desc_nr)) {
3351+ MD_BUG();
3352+ goto abort;
3353+ }
3354+ }
3355+ /*
3356+ * is the device unique?
3357+ */
3358+ ITERATE_RDEV(mddev,rdev2,tmp2) {
3359+ if ((rdev2 != rdev) &&
3360+ (rdev2->dev == rdev->dev)) {
3361+ MD_BUG();
3362+ goto abort;
3363+ }
3364+ }
3365+ }
3366+
3367+ /*
3368+ * Check if we can support this RAID array
3369+ */
3370+ if (sb->major_version != MD_MAJOR_VERSION ||
3371+ sb->minor_version > MD_MINOR_VERSION) {
3372+
3373+ printk (OLD_VERSION, mdidx(mddev), sb->major_version,
3374+ sb->minor_version, sb->patch_version);
3375+ goto abort;
3376+ }
3377+
3378+ if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
3379+ (sb->level == 4) || (sb->level == 5)))
3380+ printk (NOT_CLEAN_IGNORE, mdidx(mddev));
3381+
3382+ return 0;
3383+abort:
3384+ return 1;
3385+}
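The kick decision in analyze_sbs() deliberately tolerates a counter that is exactly one behind: such a disk only missed the final mark-clean update and can still be resynchronized, while anything two or more events behind is thrown out. The window as a tiny predicate:

#include <stdio.h>

typedef unsigned long long u64;

/* 1 if a member with counter 'ev' must be kicked, given the freshest
 * counter 'best' -- mirrors the ++ev1; if (ev1 < ev2) test above */
static int non_fresh(u64 ev, u64 best)
{
        return ev + 1 < best;
}

int main(void)
{
        printf("%d %d %d\n",
               non_fresh(100, 100),     /* up to date: keep (0) */
               non_fresh(99, 100),      /* one behind: keep (0) */
               non_fresh(98, 100));     /* stale: kick (1) */
        return 0;
}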
3386+
3387+#undef INCONSISTENT
3388+#undef OUT_OF_DATE
3389+#undef OLD_VERSION
3390+#undef OLD_LEVEL
3391+
3392+static int device_size_calculation (mddev_t * mddev)
3393+{
3394+ int data_disks = 0, persistent;
3395+ unsigned int readahead;
3396+ mdp_super_t *sb = mddev->sb;
3397+ struct md_list_head *tmp;
3398+ mdk_rdev_t *rdev;
3399+
3400+ /*
3401+ * Do device size calculation. Bail out if too small.
3402+ * (we have to do this after having validated chunk_size,
3403+ * because the device size has to be a multiple of chunk_size)
3404+ */
3405+ persistent = !mddev->sb->not_persistent;
3406+ ITERATE_RDEV(mddev,rdev,tmp) {
3407+ if (rdev->faulty)
3408+ continue;
3409+ if (rdev->size) {
3410+ MD_BUG();
3411+ continue;
3412+ }
3413+ rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
3414+ if (rdev->size < sb->chunk_size / 1024) {
3415+ printk (KERN_WARNING
3416+ "Dev %s smaller than chunk_size: %dk < %dk\n",
3417+ partition_name(rdev->dev),
3418+ rdev->size, sb->chunk_size / 1024);
3419+ return -EINVAL;
3420+ }
3421+ }
3422+
3423+ switch (sb->level) {
3424+ case -3:
3425+ data_disks = 1;
3426+ break;
3427+ case -2:
3428+ data_disks = 1;
3429+ break;
3430+ case -1:
3431+ zoned_raid_size(mddev);
3432+ data_disks = 1;
3433+ break;
3434+ case 0:
3435+ zoned_raid_size(mddev);
3436+ data_disks = sb->raid_disks;
3437+ break;
3438+ case 1:
3439+ data_disks = 1;
3440+ break;
3441+ case 4:
3442+ case 5:
3443+ data_disks = sb->raid_disks-1;
3444+ break;
3445+ default:
3446+ printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
3447+ goto abort;
3448+ }
3449+ if (!md_size[mdidx(mddev)])
3450+ md_size[mdidx(mddev)] = sb->size * data_disks;
3451+
3452+ readahead = MD_READAHEAD;
3453+ if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5))
3454+ readahead = mddev->sb->chunk_size * 4 * data_disks;
3455+ if (readahead < data_disks * MAX_SECTORS*512*2)
3456+ readahead = data_disks * MAX_SECTORS*512*2;
3457+ else {
3458+ if (sb->level == -3)
3459+ readahead = 0;
3460+ }
3461+ md_maxreadahead[mdidx(mddev)] = readahead;
3462+
3463+ printk(KERN_INFO "md%d: max total readahead window set to %dk\n",
3464+ mdidx(mddev), readahead/1024);
3465+
3466+ printk(KERN_INFO
3467+ "md%d: %d data-disks, max readahead per data-disk: %dk\n",
3468+ mdidx(mddev), data_disks, readahead/data_disks/1024);
3469+ return 0;
3470+abort:
3471+ return 1;
3472+}
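The switch in device_size_calculation() is md's whole capacity model: striping exposes every member, mirroring one member's worth, RAID-4/5 everything minus one disk of parity. A hedged helper for equal-sized members (linear and the negative "levels" are omitted, and raid0 really sums the chunk-rounded member sizes via zoned_raid_size() rather than multiplying):

#include <stdio.h>

/* usable data disks for 'n' raid disks at 'level';
 * 0 means the level is not modelled by this sketch */
static int data_disks(int level, int n)
{
        switch (level) {
        case 0:  return n;      /* striping: all disks carry data */
        case 1:  return 1;      /* mirroring: n copies of one disk */
        case 4:
        case 5:  return n - 1;  /* one disk's worth of parity */
        default: return 0;
        }
}

int main(void)
{
        int n = 4, size = 1000000, level;       /* four 1000000 KB members */

        for (level = 0; level <= 5; level++)
                if (data_disks(level, n))
                        printf("raid%d: %d KB\n",
                               level, size * data_disks(level, n));
        return 0;
}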
3473+
3474+
3475+#define TOO_BIG_CHUNKSIZE KERN_ERR \
3476+"too big chunk_size: %d > %d\n"
3477+
3478+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
3479+"too small chunk_size: %d < %ld\n"
3480+
3481+#define BAD_CHUNKSIZE KERN_ERR \
3482+"no chunksize specified, see 'man raidtab'\n"
3483+
3484+static int do_md_run (mddev_t * mddev)
3485+{
3486+ int pnum, err;
3487+ int chunk_size;
3488+ struct md_list_head *tmp;
3489+ mdk_rdev_t *rdev;
3490+
3491+
3492+ if (!mddev->nb_dev) {
3493+ MD_BUG();
3494+ return -EINVAL;
3495+ }
3496+
3497+ if (mddev->pers)
3498+ return -EBUSY;
3499+
3500+ /*
3501+ * Resize disks to align partitions size on a given
3502+ * chunk size.
3503+ */
3504+ md_size[mdidx(mddev)] = 0;
3505+
3506+ /*
3507+ * Analyze all RAID superblock(s)
3508+ */
3509+ if (analyze_sbs(mddev)) {
3510+ MD_BUG();
3511+ return -EINVAL;
3512+ }
3513+
3514+ chunk_size = mddev->sb->chunk_size;
3515+ pnum = level_to_pers(mddev->sb->level);
3516+
3517+ mddev->param.chunk_size = chunk_size;
3518+ mddev->param.personality = pnum;
3519+
3520+ if (chunk_size > MAX_CHUNK_SIZE) {
3521+ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
3522+ return -EINVAL;
3523+ }
3524+ /*
3525+ * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
3526+ */
3527+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
3528+ MD_BUG();
3529+ return -EINVAL;
3530+ }
3531+ if (chunk_size < PAGE_SIZE) {
3532+ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
3533+ return -EINVAL;
3534+ }
3535+
3536+ if (pnum >= MAX_PERSONALITY) {
3537+ MD_BUG();
3538+ return -EINVAL;
3539+ }
3540+
3541+ if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
3542+ /*
3543+ * 'default chunksize' in the old md code used to
3544+ * be PAGE_SIZE, baaad.
3545+ * we abort here to be on the safe side. We don't
3546+ * want to continue the bad practice.
3547+ */
3548+ printk(BAD_CHUNKSIZE);
3549+ return -EINVAL;
3550+ }
3551+
3552+ if (!pers[pnum])
3553+ {
3554+#ifdef CONFIG_KMOD
3555+ char module_name[80];
3556+ sprintf (module_name, "md-personality-%d", pnum);
3557+ request_module (module_name);
3558+ if (!pers[pnum])
3559+#endif
3560+ return -EINVAL;
3561+ }
3562+
3563+ if (device_size_calculation(mddev))
3564+ return -EINVAL;
3565+
3566+ /*
3567+ * Drop all container device buffers, from now on
3568+ * the only valid external interface is through the md
3569+ * device.
3570+ */
3571+ ITERATE_RDEV(mddev,rdev,tmp) {
3572+ if (rdev->faulty)
3573+ continue;
3574+ fsync_dev(rdev->dev);
3575+ invalidate_buffers(rdev->dev);
3576+ }
3577+
3578+ mddev->pers = pers[pnum];
3579+
3580+ err = mddev->pers->run(mddev);
3581+ if (err) {
3582+ printk("pers->run() failed ...\n");
3583+ mddev->pers = NULL;
3584+ return -EINVAL;
3585+ }
3586+
3587+ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
3588+ md_update_sb(mddev);
3589+
3590+ /*
3591+ * md_size has units of 1K blocks, which are
3592+ * twice as large as sectors.
3593+ */
3594+ md_hd_struct[mdidx(mddev)].start_sect = 0;
3595+ md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1;
3596+
3597+ read_ahead[MD_MAJOR] = 1024;
3598+ return (0);
3599+}
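The chunk-size test in do_md_run() leans on ffz(~x), which is the index of the lowest set bit of x; shifting 1 back by that index reproduces x exactly when x is a power of two. A user-space equivalent of the check:

#include <stdio.h>

/* index of the lowest set bit: the user-space twin of ffz(~x) */
static int lowest_set_bit(unsigned int x)
{
        int i = 0;

        while (!(x & 1)) {
                x >>= 1;
                i++;
        }
        return i;
}

static int valid_chunk_size(unsigned int chunk)
{
        if (chunk == 0)
                return 0;
        /* a power of two has exactly one set bit */
        return (1u << lowest_set_bit(chunk)) == chunk;
}

int main(void)
{
        printf("%d %d %d\n",
               valid_chunk_size(4096),          /* ok (1) */
               valid_chunk_size(65536),         /* ok (1) */
               valid_chunk_size(24576));        /* 24K: two bits set (0) */
        return 0;
}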
3600+
3601+#undef TOO_BIG_CHUNKSIZE
3602+#undef BAD_CHUNKSIZE
3603+
3604+#define OUT(x) do { err = (x); goto out; } while (0)
3605+
3606+static int restart_array (mddev_t *mddev)
3607+{
3608+ int err = 0;
3609+
3610+ /*
3611+ * Complain if it has no devices
3612+ */
3613+ if (!mddev->nb_dev)
3614+ OUT(-ENXIO);
3615+
3616+ if (mddev->pers) {
3617+ if (!mddev->ro)
3618+ OUT(-EBUSY);
3619+
3620+ mddev->ro = 0;
3621+ set_device_ro(mddev_to_kdev(mddev), 0);
3622+
3623+ printk (KERN_INFO
3624+ "md%d switched to read-write mode.\n", mdidx(mddev));
3625+ /*
3626+ * Kick recovery or resync if necessary
3627+ */
3628+ md_recover_arrays();
3629+ if (mddev->pers->restart_resync)
3630+ mddev->pers->restart_resync(mddev);
3631+ } else
3632+ err = -EINVAL;
3633+
3634+out:
3635+ return err;
3636+}
3637+
3638+#define STILL_MOUNTED KERN_WARNING \
3639+"md: md%d still mounted.\n"
3640+
3641+static int do_md_stop (mddev_t * mddev, int ro)
3642+{
3643+ int err = 0, resync_interrupted = 0;
3644+ kdev_t dev = mddev_to_kdev(mddev);
3645+
3646+ if (!ro && !fs_may_mount (dev)) {
3647+ printk (STILL_MOUNTED, mdidx(mddev));
3648+ OUT(-EBUSY);
3649+ }
3650+
3651+ /*
3652+ * complain if it's already stopped
3653+ */
3654+ if (!mddev->nb_dev)
3655+ OUT(-ENXIO);
3656+
3657+ if (mddev->pers) {
3658+ /*
3659+ * It is safe to call stop here, it only frees private
3660+ * data. Also, it tells us if a device is unstoppable
3661+ * (eg. resyncing is in progress)
3662+ */
3663+ if (mddev->pers->stop_resync)
3664+ if (mddev->pers->stop_resync(mddev))
3665+ resync_interrupted = 1;
3666+
3667+ if (mddev->recovery_running)
3668+ md_interrupt_thread(md_recovery_thread);
3669+
3670+ /*
3671+ * This synchronizes with signal delivery to the
3672+ * resync or reconstruction thread. It also nicely
3673+ * hangs the process if some reconstruction has not
3674+ * finished.
3675+ */
3676+ down(&mddev->recovery_sem);
3677+ up(&mddev->recovery_sem);
3678+
3679+ /*
3680+ * sync and invalidate buffers because we cannot kill the
3681+ * main thread with valid IO transfers still around.
3682+ * the kernel lock protects us from new requests being
3683+ * added after invalidate_buffers().
3684+ */
3685+ fsync_dev (mddev_to_kdev(mddev));
3686+ fsync_dev (dev);
3687+ invalidate_buffers (dev);
3688+
3689+ if (ro) {
3690+ if (mddev->ro)
3691+ OUT(-ENXIO);
3692+ mddev->ro = 1;
3693+ } else {
3694+ if (mddev->ro)
3695+ set_device_ro(dev, 0);
3696+ if (mddev->pers->stop(mddev)) {
3697+ if (mddev->ro)
3698+ set_device_ro(dev, 1);
3699+ OUT(-EBUSY);
3700+ }
3701+ if (mddev->ro)
3702+ mddev->ro = 0;
3703+ }
3704+ if (mddev->sb) {
3705+ /*
3706+ * mark it clean only if there was no resync
3707+ * interrupted.
3708+ */
3709+ if (!mddev->recovery_running && !resync_interrupted) {
3710+ printk("marking sb clean...\n");
3711+ mddev->sb->state |= 1 << MD_SB_CLEAN;
3712+ }
3713+ md_update_sb(mddev);
3714+ }
3715+ if (ro)
3716+ set_device_ro(dev, 1);
3717+ }
3718+
3719+ /*
3720+ * Free resources if final stop
3721+ */
3722+ if (!ro) {
3723+ export_array(mddev);
3724+ md_size[mdidx(mddev)] = 0;
3725+ md_hd_struct[mdidx(mddev)].nr_sects = 0;
3726+ free_mddev(mddev);
3727+
3728+ printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
3729+ } else
3730+ printk (KERN_INFO
3731+ "md%d switched to read-only mode.\n", mdidx(mddev));
3732+out:
3733+ return err;
3734+}
3735+
3736+#undef OUT
3737+
3738+/*
3739+ * We have to safely support old arrays too.
3740+ */
3741+int detect_old_array (mdp_super_t *sb)
3742+{
3743+ if (sb->major_version > 0)
3744+ return 0;
3745+ if (sb->minor_version >= 90)
3746+ return 0;
3747+
3748+ return -EINVAL;
3749+}
3750+
3751+
3752+static void autorun_array (mddev_t *mddev)
3753+{
3754+ mdk_rdev_t *rdev;
3755+ struct md_list_head *tmp;
3756+ int err;
3757+
3758+ if (mddev->disks.prev == &mddev->disks) {
3759+ MD_BUG();
3760+ return;
3761+ }
3762+
3763+ printk("running: ");
3764+
3765+ ITERATE_RDEV(mddev,rdev,tmp) {
3766+ printk("<%s>", partition_name(rdev->dev));
3767+ }
3768+ printk("\nnow!\n");
3769+
3770+ err = do_md_run (mddev);
3771+ if (err) {
3772+ printk("do_md_run() returned %d\n", err);
3773+ /*
3774+ * prevent the writeback of an unrunnable array
3775+ */
3776+ mddev->sb_dirty = 0;
3777+ do_md_stop (mddev, 0);
3778+ }
3779+}
3780+
3781+/*
3782+ * let's try to run arrays based on all disks that have arrived
3783+ * until now. (those are in the ->pending list)
3784+ *
3785+ * the method: pick the first pending disk, collect all disks with
3786+ * the same UUID, remove all from the pending list and put them into
3787+ * the 'same_array' list. Then order this list based on superblock
3788+ * update time (freshest comes first), kick out 'old' disks and
3789+ * compare superblocks. If everything's fine then run it.
3790+ */
3791+static void autorun_devices (void)
3792+{
3793+ struct md_list_head candidates;
3794+ struct md_list_head *tmp;
3795+ mdk_rdev_t *rdev0, *rdev;
3796+ mddev_t *mddev;
3797+ kdev_t md_kdev;
3798+
3799+
3800+ printk("autorun ...\n");
3801+ while (pending_raid_disks.next != &pending_raid_disks) {
3802+ rdev0 = md_list_entry(pending_raid_disks.next,
3803+ mdk_rdev_t, pending);
3804+
3805+ printk("considering %s ...\n", partition_name(rdev0->dev));
3806+ MD_INIT_LIST_HEAD(&candidates);
3807+ ITERATE_RDEV_PENDING(rdev,tmp) {
3808+ if (uuid_equal(rdev0, rdev)) {
3809+ if (!sb_equal(rdev0->sb, rdev->sb)) {
3810+ printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev));
3811+ continue;
3812+ }
3813+ printk(" adding %s ...\n", partition_name(rdev->dev));
3814+ md_list_del(&rdev->pending);
3815+ md_list_add(&rdev->pending, &candidates);
3816+ }
3817+ }
3818+ /*
3819+ * now we have a set of devices, with all of them having
3820+ * mostly sane superblocks. It's time to allocate the
3821+ * mddev.
3822 */
3823- if (analyze_one_sb (realdev))
3824+ md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
3825+ mddev = kdev_to_mddev(md_kdev);
3826+ if (mddev) {
3827+ printk("md%d already running, cannot run %s\n",
3828+ mdidx(mddev), partition_name(rdev0->dev));
3829+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
3830+ export_rdev(rdev);
3831+ continue;
3832+ }
3833+ mddev = alloc_mddev(md_kdev);
3834+ printk("created md%d\n", mdidx(mddev));
3835+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
3836+ bind_rdev_to_array(rdev, mddev);
3837+ md_list_del(&rdev->pending);
3838+ MD_INIT_LIST_HEAD(&rdev->pending);
3839+ }
3840+ autorun_array(mddev);
3841+ }
3842+ printk("... autorun DONE.\n");
3843+}
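autorun_devices() is partition refinement over the pending list: take the head, sweep every disk carrying the same UUID into a candidate set, try to run that set as one array, repeat until the list is empty. The loop shape in miniature, with an int standing in for the UUID:

#include <stdio.h>

#define NPEND 6

int main(void)
{
        /* pending 'disks', keyed by a stand-in for the array UUID */
        int key[NPEND]  = { 7, 3, 7, 3, 3, 9 };
        int used[NPEND] = { 0 };
        int i, j;

        for (i = 0; i < NPEND; i++) {
                if (used[i])
                        continue;
                /* disk i heads a new candidate set */
                printf("array %d:", key[i]);
                for (j = i; j < NPEND; j++)
                        if (!used[j] && key[j] == key[i]) {
                                used[j] = 1;
                                printf(" disk%d", j);
                        }
                printf("\n");
        }
        return 0;
}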
3844+
3845+/*
3846+ * import RAID devices based on one partition
3847+ * if possible, the array gets run as well.
3848+ */
3849+
3850+#define BAD_VERSION KERN_ERR \
3851+"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
3852+
3853+#define OUT_OF_MEM KERN_ALERT \
3854+"md: out of memory.\n"
3855+
3856+#define NO_DEVICE KERN_ERR \
3857+"md: disabled device %s\n"
3858+
3859+#define AUTOADD_FAILED KERN_ERR \
3860+"md: auto-adding devices to md%d FAILED (error %d).\n"
3861+
3862+#define AUTOADD_FAILED_USED KERN_ERR \
3863+"md: cannot auto-add device %s to md%d, already used.\n"
3864+
3865+#define AUTORUN_FAILED KERN_ERR \
3866+"md: auto-running md%d FAILED (error %d).\n"
3867+
3868+#define MDDEV_BUSY KERN_ERR \
3869+"md: cannot auto-add to md%d, already running.\n"
3870+
3871+#define AUTOADDING KERN_INFO \
3872+"md: auto-adding devices to md%d, based on %s's superblock.\n"
3873+
3874+#define AUTORUNNING KERN_INFO \
3875+"md: auto-running md%d.\n"
3876+
3877+static int autostart_array (kdev_t startdev)
3878+{
3879+ int err = -EINVAL, i;
3880+ mdp_super_t *sb = NULL;
3881+ mdk_rdev_t *start_rdev = NULL, *rdev;
3882+
3883+ if (md_import_device(startdev, 1)) {
3884+ printk("could not import %s!\n", partition_name(startdev));
3885+ goto abort;
3886+ }
3887+
3888+ start_rdev = find_rdev_all(startdev);
3889+ if (!start_rdev) {
3890+ MD_BUG();
3891+ goto abort;
3892+ }
3893+ if (start_rdev->faulty) {
3894+ printk("can not autostart based on faulty %s!\n",
3895+ partition_name(startdev));
3896+ goto abort;
3897+ }
3898+ md_list_add(&start_rdev->pending, &pending_raid_disks);
3899+
3900+ sb = start_rdev->sb;
3901+
3902+ err = detect_old_array(sb);
3903+ if (err) {
3904+ printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
3905+ goto abort;
3906+ }
3907+
3908+ for (i = 0; i < MD_SB_DISKS; i++) {
3909+ mdp_disk_t *desc;
3910+ kdev_t dev;
3911+
3912+ desc = sb->disks + i;
3913+ dev = MKDEV(desc->major, desc->minor);
3914+
3915+ if (dev == MKDEV(0,0))
3916+ continue;
3917+ if (dev == startdev)
3918+ continue;
3919+ if (md_import_device(dev, 1)) {
3920+ printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev));
3921+ continue;
3922+ }
3923+ rdev = find_rdev_all(dev);
3924+ if (!rdev) {
3925+ MD_BUG();
3926+ goto abort;
3927+ }
3928+ md_list_add(&rdev->pending, &pending_raid_disks);
3929+ }
3930+
3931+ /*
3932+ * possibly propagate autorun_devices()'s return codes here
3933+ */
3934+ autorun_devices();
3935+ return 0;
3936+
3937+abort:
3938+ if (start_rdev)
3939+ export_rdev(start_rdev);
3940+ return err;
3941+}
3942+
3943+#undef BAD_VERSION
3944+#undef OUT_OF_MEM
3945+#undef NO_DEVICE
3946+#undef AUTOADD_FAILED_USED
3947+#undef AUTOADD_FAILED
3948+#undef AUTORUN_FAILED
3949+#undef AUTOADDING
3950+#undef AUTORUNNING
3951+
3952+struct {
3953+ int set;
3954+ int noautodetect;
3955+
3956+} raid_setup_args md__initdata = { 0, 0 };
3957+
3958+/*
3959+ * Searches all registered partitions for autorun RAID arrays
3960+ * at boot time.
3961+ */
3962+md__initfunc(void autodetect_raid(void))
3963+{
3964+#ifdef CONFIG_AUTODETECT_RAID
3965+ struct gendisk *disk;
3966+ mdk_rdev_t *rdev;
3967+ int i;
3968+
3969+ if (raid_setup_args.noautodetect) {
3970+ printk(KERN_INFO "skipping autodetection of RAID arrays\n");
3971+ return;
3972+ }
3973+ printk(KERN_INFO "autodetecting RAID arrays\n");
3974+
3975+ for (disk = gendisk_head ; disk ; disk = disk->next) {
3976+ for (i = 0; i < disk->max_p*disk->max_nr; i++) {
3977+ kdev_t dev = MKDEV(disk->major,i);
3978+
3979+ if (disk->part[i].type == LINUX_OLD_RAID_PARTITION) {
3980+ printk(KERN_ALERT
3981+"md: %s's partition type has to be changed from type 0x86 to type 0xfd\n"
3982+" to maintain interoperability with other OSs! Autodetection support for\n"
3983+" type 0x86 will be deleted after some migration timeout. Sorry.\n",
3984+ partition_name(dev));
3985+ disk->part[i].type = LINUX_RAID_PARTITION;
3986+ }
3987+ if (disk->part[i].type != LINUX_RAID_PARTITION)
3988+ continue;
3989+
3990+ if (md_import_device(dev,1)) {
3991+ printk(KERN_ALERT "could not import %s!\n",
3992+ partition_name(dev));
3993+ continue;
3994+ }
3995+ /*
3996+ * Sanity checks:
3997+ */
3998+ rdev = find_rdev_all(dev);
3999+ if (!rdev) {
4000+ MD_BUG();
4001+ continue;
4002+ }
4003+ if (rdev->faulty) {
4004+ MD_BUG();
4005+ continue;
4006+ }
4007+ md_list_add(&rdev->pending, &pending_raid_disks);
4008+ }
4009+ }
4010+
4011+ autorun_devices();
4012+#endif
4013+}
4014+
4015+static int get_version (void * arg)
4016+{
4017+ mdu_version_t ver;
4018+
4019+ ver.major = MD_MAJOR_VERSION;
4020+ ver.minor = MD_MINOR_VERSION;
4021+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
4022+
4023+ if (md_copy_to_user(arg, &ver, sizeof(ver)))
4024+ return -EFAULT;
4025+
4026+ return 0;
4027+}
4028+
4029+#define SET_FROM_SB(x) info.x = mddev->sb->x
4030+static int get_array_info (mddev_t * mddev, void * arg)
4031+{
4032+ mdu_array_info_t info;
4033+
4034+ if (!mddev->sb)
4035+ return -EINVAL;
4036+
4037+ SET_FROM_SB(major_version);
4038+ SET_FROM_SB(minor_version);
4039+ SET_FROM_SB(patch_version);
4040+ SET_FROM_SB(ctime);
4041+ SET_FROM_SB(level);
4042+ SET_FROM_SB(size);
4043+ SET_FROM_SB(nr_disks);
4044+ SET_FROM_SB(raid_disks);
4045+ SET_FROM_SB(md_minor);
4046+ SET_FROM_SB(not_persistent);
4047+
4048+ SET_FROM_SB(utime);
4049+ SET_FROM_SB(state);
4050+ SET_FROM_SB(active_disks);
4051+ SET_FROM_SB(working_disks);
4052+ SET_FROM_SB(failed_disks);
4053+ SET_FROM_SB(spare_disks);
4054+
4055+ SET_FROM_SB(layout);
4056+ SET_FROM_SB(chunk_size);
4057+
4058+ if (md_copy_to_user(arg, &info, sizeof(info)))
4059+ return -EFAULT;
4060+
4061+ return 0;
4062+}
4063+#undef SET_FROM_SB
4064+
4065+#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
4066+static int get_disk_info (mddev_t * mddev, void * arg)
4067+{
4068+ mdu_disk_info_t info;
4069+ unsigned int nr;
4070+
4071+ if (!mddev->sb)
4072+ return -EINVAL;
4073+
4074+ if (md_copy_from_user(&info, arg, sizeof(info)))
4075+ return -EFAULT;
4076+
4077+ nr = info.number;
4078+ if (nr >= mddev->sb->nr_disks)
4079+ return -EINVAL;
4080+
4081+ SET_FROM_SB(major);
4082+ SET_FROM_SB(minor);
4083+ SET_FROM_SB(raid_disk);
4084+ SET_FROM_SB(state);
4085+
4086+ if (md_copy_to_user(arg, &info, sizeof(info)))
4087+ return -EFAULT;
4088+
4089+ return 0;
4090+}
4091+#undef SET_FROM_SB
4092+
4093+#define SET_SB(x) mddev->sb->disks[nr].x = info.x
4094+
4095+static int add_new_disk (mddev_t * mddev, void * arg)
4096+{
4097+ int err, size, persistent;
4098+ mdu_disk_info_t info;
4099+ mdk_rdev_t *rdev;
4100+ unsigned int nr;
4101+ kdev_t dev;
4102+
4103+ if (!mddev->sb)
4104+ return -EINVAL;
4105+
4106+ if (md_copy_from_user(&info, arg, sizeof(info)))
4107+ return -EFAULT;
4108+
4109+ nr = info.number;
4110+ if (nr >= mddev->sb->nr_disks)
4111+ return -EINVAL;
4112+
4113+ dev = MKDEV(info.major,info.minor);
4114+
4115+ if (find_rdev_all(dev)) {
4116+ printk("device %s already used in a RAID array!\n",
4117+ partition_name(dev));
4118+ return -EBUSY;
4119+ }
4120+
4121+ SET_SB(number);
4122+ SET_SB(major);
4123+ SET_SB(minor);
4124+ SET_SB(raid_disk);
4125+ SET_SB(state);
4126+
4127+ if ((info.state & (1<<MD_DISK_FAULTY))==0) {
4128+ err = md_import_device (dev, 0);
4129+ if (err) {
4130+ printk("md: error, md_import_device() returned %d\n", err);
4131+ return -EINVAL;
4132+ }
4133+ rdev = find_rdev_all(dev);
4134+ if (!rdev) {
4135+ MD_BUG();
4136 return -EINVAL;
4137+ }
4138+
4139+ rdev->old_dev = dev;
4140+ rdev->desc_nr = info.number;
4141+
4142+ bind_rdev_to_array(rdev, mddev);
4143+
4144+ persistent = !mddev->sb->not_persistent;
4145+ if (!persistent)
4146+ printk("nonpersistent superblock ...\n");
4147+ if (!mddev->sb->chunk_size)
4148+ printk("no chunksize?\n");
4149+
4150+ size = calc_dev_size(dev, mddev, persistent);
4151+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
4152+
4153+ if (!mddev->sb->size || (mddev->sb->size > size))
4154+ mddev->sb->size = size;
4155+ }
4156+
4157+ /*
4158+ * sync all other superblocks with the main superblock
4159+ */
4160+ sync_sbs(mddev);
4161+
4162+ return 0;
4163+}
4164+#undef SET_SB
4165+
4166+static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
4167+{
4168+ int err;
4169+ mdk_rdev_t *rdev;
4170+ mdp_disk_t *disk;
4171+
4172+ if (!mddev->pers)
4173+ return -ENODEV;
4174+
4175+ printk("trying to remove %s from md%d ... \n",
4176+ partition_name(dev), mdidx(mddev));
4177+
4178+ if (!mddev->pers->diskop) {
4179+ printk("md%d: personality does not support diskops!\n",
4180+ mdidx(mddev));
4181+ return -EINVAL;
4182+ }
4183+
4184+ rdev = find_rdev(mddev, dev);
4185+ if (!rdev)
4186+ return -ENXIO;
4187+
4188+ if (rdev->desc_nr == -1) {
4189+ MD_BUG();
4190+ return -EINVAL;
4191+ }
4192+ disk = &mddev->sb->disks[rdev->desc_nr];
4193+ if (disk_active(disk))
4194+ goto busy;
4195+ if (disk_removed(disk)) {
4196+ MD_BUG();
4197+ return -EINVAL;
4198+ }
4199+
4200+ err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
4201+ if (err == -EBUSY)
4202+ goto busy;
4203+ if (err) {
4204+ MD_BUG();
4205+ return -EINVAL;
4206+ }
4207+
4208+ remove_descriptor(disk, mddev->sb);
4209+ kick_rdev_from_array(rdev);
4210+ mddev->sb_dirty = 1;
4211+ md_update_sb(mddev);
4212+
4213+ return 0;
4214+busy:
4215+ printk("cannot remove active disk %s from md%d ... \n",
4216+ partition_name(dev), mdidx(mddev));
4217+ return -EBUSY;
4218+}
4219+
4220+static int hot_add_disk (mddev_t * mddev, kdev_t dev)
4221+{
4222+ int i, err, persistent;
4223+ unsigned int size;
4224+ mdk_rdev_t *rdev;
4225+ mdp_disk_t *disk;
4226+
4227+ if (!mddev->pers)
4228+ return -ENODEV;
4229+
4230+ printk("trying to hot-add %s to md%d ... \n",
4231+ partition_name(dev), mdidx(mddev));
4232+
4233+ if (!mddev->pers->diskop) {
4234+ printk("md%d: personality does not support diskops!\n",
4235+ mdidx(mddev));
4236+ return -EINVAL;
4237+ }
4238+
4239+ persistent = !mddev->sb->not_persistent;
4240+ size = calc_dev_size(dev, mddev, persistent);
4241+
4242+ if (size < mddev->sb->size) {
4243+ printk("md%d: disk size %d blocks < array size %d\n",
4244+ mdidx(mddev), size, mddev->sb->size);
4245+ return -ENOSPC;
4246+ }
4247+
4248+ rdev = find_rdev(mddev, dev);
4249+ if (rdev)
4250+ return -EBUSY;
4251+
4252+ err = md_import_device (dev, 0);
4253+ if (err) {
4254+ printk("md: error, md_import_device() returned %d\n", err);
4255+ return -EINVAL;
4256+ }
4257+ rdev = find_rdev_all(dev);
4258+ if (!rdev) {
4259+ MD_BUG();
4260+ return -EINVAL;
4261+ }
4262+ if (rdev->faulty) {
4263+		printk("md: cannot hot-add faulty %s disk to md%d!\n",
4264+ partition_name(dev), mdidx(mddev));
4265+ err = -EINVAL;
4266+ goto abort_export;
4267+ }
4268+ bind_rdev_to_array(rdev, mddev);
4269+
4270+ /*
4271+ * The rest should better be atomic, we can have disk failures
4272+ * noticed in interrupt contexts ...
4273+ */
4274+ cli();
4275+ rdev->old_dev = dev;
4276+ rdev->size = size;
4277+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
4278+
4279+ disk = mddev->sb->disks + mddev->sb->raid_disks;
4280+ for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
4281+ disk = mddev->sb->disks + i;
4282+
4283+ if (!disk->major && !disk->minor)
4284+ break;
4285+ if (disk_removed(disk))
4286+ break;
4287+ }
4288+ if (i == MD_SB_DISKS) {
4289+ sti();
4290+		printk("md%d: cannot hot-add to full array!\n", mdidx(mddev));
4291+ err = -EBUSY;
4292+ goto abort_unbind_export;
4293+ }
4294+
4295+ if (disk_removed(disk)) {
4296 /*
4297- * hot_add has to bump up nb_dev itself
4298+ * reuse slot
4299 */
4300- if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) {
4301- /*
4302- * FIXME: here we should free up the inode and stuff
4303- */
4304- printk ("FIXME\n");
4305- return -EINVAL;
4306+ if (disk->number != i) {
4307+ sti();
4308+ MD_BUG();
4309+ err = -EINVAL;
4310+ goto abort_unbind_export;
4311 }
4312- } else
4313- md_dev[minor].nb_dev++;
4314+ } else {
4315+ disk->number = i;
4316+ }
4317
4318- printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
4319- return (0);
4320+ disk->raid_disk = disk->number;
4321+ disk->major = MAJOR(dev);
4322+ disk->minor = MINOR(dev);
4323+
4324+ if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
4325+ sti();
4326+ MD_BUG();
4327+ err = -EINVAL;
4328+ goto abort_unbind_export;
4329+ }
4330+
4331+ mark_disk_spare(disk);
4332+ mddev->sb->nr_disks++;
4333+ mddev->sb->spare_disks++;
4334+ mddev->sb->working_disks++;
4335+
4336+ mddev->sb_dirty = 1;
4337+
4338+ sti();
4339+ md_update_sb(mddev);
4340+
4341+ /*
4342+ * Kick recovery, maybe this spare has to be added to the
4343+ * array immediately.
4344+ */
4345+ md_recover_arrays();
4346+
4347+ return 0;
4348+
4349+abort_unbind_export:
4350+ unbind_rdev_from_array(rdev);
4351+
4352+abort_export:
4353+ export_rdev(rdev);
4354+ return err;
4355+}
4356+
4357+#define SET_SB(x) mddev->sb->x = info.x
4358+static int set_array_info (mddev_t * mddev, void * arg)
4359+{
4360+ mdu_array_info_t info;
4361+
4362+ if (mddev->sb) {
4363+ printk("array md%d already has a superblock!\n",
4364+ mdidx(mddev));
4365+ return -EBUSY;
4366+ }
4367+
4368+ if (md_copy_from_user(&info, arg, sizeof(info)))
4369+ return -EFAULT;
4370+
4371+ if (alloc_array_sb(mddev))
4372+ return -ENOMEM;
4373+
4374+ mddev->sb->major_version = MD_MAJOR_VERSION;
4375+ mddev->sb->minor_version = MD_MINOR_VERSION;
4376+ mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
4377+ mddev->sb->ctime = CURRENT_TIME;
4378+
4379+ SET_SB(level);
4380+ SET_SB(size);
4381+ SET_SB(nr_disks);
4382+ SET_SB(raid_disks);
4383+ SET_SB(md_minor);
4384+ SET_SB(not_persistent);
4385+
4386+ SET_SB(state);
4387+ SET_SB(active_disks);
4388+ SET_SB(working_disks);
4389+ SET_SB(failed_disks);
4390+ SET_SB(spare_disks);
4391+
4392+ SET_SB(layout);
4393+ SET_SB(chunk_size);
4394+
4395+ mddev->sb->md_magic = MD_SB_MAGIC;
4396+
4397+ /*
4398+ * Generate a 128 bit UUID
4399+ */
4400+ get_random_bytes(&mddev->sb->set_uuid0, 4);
4401+ get_random_bytes(&mddev->sb->set_uuid1, 4);
4402+ get_random_bytes(&mddev->sb->set_uuid2, 4);
4403+ get_random_bytes(&mddev->sb->set_uuid3, 4);
4404+
4405+ return 0;
4406+}
4407+#undef SET_SB
4408+
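+/*
+ * Illustrative array-creation sequence from userspace (a sketch
+ * only; fd, the chosen values and the missing error handling are
+ * assumptions, not driver code):
+ *
+ *	mdu_array_info_t array;
+ *	mdu_param_t param;
+ *
+ *	memset(&array, 0, sizeof(array));
+ *	array.level = 5;
+ *	array.raid_disks = array.nr_disks = 3;
+ *	array.chunk_size = 64*1024;	/* assumed to be in bytes */
+ *	ioctl(fd, SET_ARRAY_INFO, &array);
+ *	... one ADD_NEW_DISK per component device ...
+ *	ioctl(fd, RUN_ARRAY, &param);
+ */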
4409+static int set_disk_info (mddev_t * mddev, void * arg)
4410+{
4411+	printk("not yet\n");
4412+ return -EINVAL;
4413+}
4414+
4415+static int clear_array (mddev_t * mddev)
4416+{
4417+	printk("not yet\n");
4418+ return -EINVAL;
4419+}
4420+
4421+static int write_raid_info (mddev_t * mddev)
4422+{
4423+	printk("not yet\n");
4424+ return -EINVAL;
4425+}
4426+
4427+static int protect_array (mddev_t * mddev)
4428+{
4429+	printk("not yet\n");
4430+ return -EINVAL;
4431+}
4432+
4433+static int unprotect_array (mddev_t * mddev)
4434+{
4435+	printk("not yet\n");
4436+ return -EINVAL;
4437+}
4438+
4439+static int set_disk_faulty (mddev_t *mddev, kdev_t dev)
4440+{
4441+ int ret;
4442+
4443+ fsync_dev(mddev_to_kdev(mddev));
4444+ ret = md_error(mddev_to_kdev(mddev), dev);
4445+ return ret;
4446 }
4447
4448 static int md_ioctl (struct inode *inode, struct file *file,
4449 unsigned int cmd, unsigned long arg)
4450 {
4451- int minor, err;
4452- struct hd_geometry *loc = (struct hd_geometry *) arg;
4453+ unsigned int minor;
4454+ int err = 0;
4455+ struct hd_geometry *loc = (struct hd_geometry *) arg;
4456+ mddev_t *mddev = NULL;
4457+ kdev_t dev;
4458
4459- if (!capable(CAP_SYS_ADMIN))
4460- return -EACCES;
4461+ if (!md_capable_admin())
4462+ return -EACCES;
4463
4464- if (((minor=MINOR(inode->i_rdev)) & 0x80) &&
4465- (minor & 0x7f) < MAX_PERSONALITY &&
4466- pers[minor & 0x7f] &&
4467- pers[minor & 0x7f]->ioctl)
4468- return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg));
4469-
4470- if (minor >= MAX_MD_DEV)
4471- return -EINVAL;
4472+ dev = inode->i_rdev;
4473+ minor = MINOR(dev);
4474+ if (minor >= MAX_MD_DEVS)
4475+ return -EINVAL;
4476
4477- switch (cmd)
4478- {
4479- case REGISTER_DEV:
4480- return do_md_add (minor, to_kdev_t ((dev_t) arg));
4481+ /*
4482+ * Commands dealing with the RAID driver but not any
4483+ * particular array:
4484+ */
4485+ switch (cmd)
4486+ {
4487+ case RAID_VERSION:
4488+ err = get_version((void *)arg);
4489+ goto done;
4490+
4491+ case PRINT_RAID_DEBUG:
4492+ err = 0;
4493+ md_print_devices();
4494+ goto done_unlock;
4495+
4496+ case BLKGETSIZE: /* Return device size */
4497+ if (!arg) {
4498+ err = -EINVAL;
4499+ goto abort;
4500+ }
4501+ err = md_put_user(md_hd_struct[minor].nr_sects,
4502+ (long *) arg);
4503+ goto done;
4504
4505- case START_MD:
4506- return do_md_run (minor, (int) arg);
4507+ case BLKFLSBUF:
4508+ fsync_dev(dev);
4509+ invalidate_buffers(dev);
4510+ goto done;
4511
4512- case STOP_MD:
4513- return do_md_stop (minor, inode);
4514-
4515- case BLKGETSIZE: /* Return device size */
4516- if (!arg) return -EINVAL;
4517- err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg);
4518- if (err)
4519- return err;
4520- break;
4521-
4522- case BLKFLSBUF:
4523- fsync_dev (inode->i_rdev);
4524- invalidate_buffers (inode->i_rdev);
4525- break;
4526-
4527- case BLKRASET:
4528- if (arg > 0xff)
4529- return -EINVAL;
4530- read_ahead[MAJOR(inode->i_rdev)] = arg;
4531- return 0;
4532-
4533- case BLKRAGET:
4534- if (!arg) return -EINVAL;
4535- err = put_user (read_ahead[MAJOR(inode->i_rdev)], (long *) arg);
4536- if (err)
4537- return err;
4538- break;
4539-
4540- /* We have a problem here : there is no easy way to give a CHS
4541- virtual geometry. We currently pretend that we have a 2 heads
4542- 4 sectors (with a BIG number of cylinders...). This drives dosfs
4543- just mad... ;-) */
4544-
4545- case HDIO_GETGEO:
4546- if (!loc) return -EINVAL;
4547- err = put_user (2, (char *) &loc->heads);
4548- if (err)
4549- return err;
4550- err = put_user (4, (char *) &loc->sectors);
4551- if (err)
4552- return err;
4553- err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders);
4554- if (err)
4555- return err;
4556- err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect,
4557- (long *) &loc->start);
4558- if (err)
4559- return err;
4560- break;
4561-
4562- RO_IOCTLS(inode->i_rdev,arg);
4563+ case BLKRASET:
4564+ if (arg > 0xff) {
4565+ err = -EINVAL;
4566+ goto abort;
4567+ }
4568+ read_ahead[MAJOR(dev)] = arg;
4569+ goto done;
4570
4571- default:
4572- return -EINVAL;
4573- }
4574+ case BLKRAGET:
4575+ if (!arg) {
4576+ err = -EINVAL;
4577+ goto abort;
4578+ }
4579+			err = md_put_user (read_ahead[MAJOR(dev)],
4580+					   (long *) arg);
4581+ goto done;
4582+		default: break;
4583+ }
4584+
4585+ /*
4586+ * Commands creating/starting a new array:
4587+ */
4588+
4589+ mddev = kdev_to_mddev(dev);
4590+
4591+ switch (cmd)
4592+ {
4593+ case SET_ARRAY_INFO:
4594+ case START_ARRAY:
4595+ if (mddev) {
4596+ printk("array md%d already exists!\n",
4597+ mdidx(mddev));
4598+ err = -EEXIST;
4599+ goto abort;
4600+ }
4601+		default: break;
4602+ }
4603+
4604+ switch (cmd)
4605+ {
4606+ case SET_ARRAY_INFO:
4607+ mddev = alloc_mddev(dev);
4608+ if (!mddev) {
4609+ err = -ENOMEM;
4610+ goto abort;
4611+ }
4612+ /*
4613+ * alloc_mddev() should possibly self-lock.
4614+ */
4615+ err = lock_mddev(mddev);
4616+ if (err) {
4617+ printk("ioctl, reason %d, cmd %d\n", err, cmd);
4618+ goto abort;
4619+ }
4620+ err = set_array_info(mddev, (void *)arg);
4621+ if (err) {
4622+			printk("couldn't set array info. %d\n", err);
4623+ goto abort;
4624+ }
4625+ goto done_unlock;
4626+
4627+ case START_ARRAY:
4628+ /*
4629+ * possibly make it lock the array ...
4630+ */
4631+ err = autostart_array((kdev_t)arg);
4632+ if (err) {
4633+ printk("autostart %s failed!\n",
4634+ partition_name((kdev_t)arg));
4635+ goto abort;
4636+ }
4637+ goto done;
4638+
4639+		default: break;
4640+ }
4641+
4642+ /*
4643+ * Commands querying/configuring an existing array:
4644+ */
4645+
4646+ if (!mddev) {
4647+ err = -ENODEV;
4648+ goto abort;
4649+ }
4650+ err = lock_mddev(mddev);
4651+ if (err) {
4652+		printk("ioctl lock interrupted, reason %d, cmd %d\n", err, cmd);
4653+ goto abort;
4654+ }
4655+
4656+ /*
4657+ * Commands even a read-only array can execute:
4658+ */
4659+ switch (cmd)
4660+ {
4661+ case GET_ARRAY_INFO:
4662+ err = get_array_info(mddev, (void *)arg);
4663+ goto done_unlock;
4664+
4665+ case GET_DISK_INFO:
4666+ err = get_disk_info(mddev, (void *)arg);
4667+ goto done_unlock;
4668+
4669+ case RESTART_ARRAY_RW:
4670+ err = restart_array(mddev);
4671+ goto done_unlock;
4672+
4673+ case STOP_ARRAY:
4674+ err = do_md_stop (mddev, 0);
4675+ goto done_unlock;
4676+
4677+ case STOP_ARRAY_RO:
4678+ err = do_md_stop (mddev, 1);
4679+ goto done_unlock;
4680+
4681+ /*
4682+	 * We have a problem here: there is no easy way to give a CHS
4683+	 * virtual geometry. We currently pretend to have a 2-head,
4684+	 * 4-sector geometry (with a BIG number of cylinders...).
4685+	 * This drives dosfs just mad... ;-)
4686+ */
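+	/*
+	 * 2 heads * 4 sectors = 8 sectors per cylinder, hence the
+	 * nr_sects/8 below; e.g. a 1000000-sector array reports
+	 * 125000 cylinders.
+	 */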
4687+ case HDIO_GETGEO:
4688+ if (!loc) {
4689+ err = -EINVAL;
4690+ goto abort_unlock;
4691+ }
4692+ err = md_put_user (2, (char *) &loc->heads);
4693+ if (err)
4694+ goto abort_unlock;
4695+ err = md_put_user (4, (char *) &loc->sectors);
4696+ if (err)
4697+ goto abort_unlock;
4698+ err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
4699+ (short *) &loc->cylinders);
4700+ if (err)
4701+ goto abort_unlock;
4702+ err = md_put_user (md_hd_struct[minor].start_sect,
4703+ (long *) &loc->start);
4704+ goto done_unlock;
4705+ }
4706
4707- return (0);
4708+ /*
4709+ * The remaining ioctls are changing the state of the
4710+ * superblock, so we do not allow read-only arrays
4711+ * here:
4712+ */
4713+ if (mddev->ro) {
4714+ err = -EROFS;
4715+ goto abort_unlock;
4716+ }
4717+
4718+ switch (cmd)
4719+ {
4720+ case CLEAR_ARRAY:
4721+ err = clear_array(mddev);
4722+ goto done_unlock;
4723+
4724+ case ADD_NEW_DISK:
4725+ err = add_new_disk(mddev, (void *)arg);
4726+ goto done_unlock;
4727+
4728+ case HOT_REMOVE_DISK:
4729+ err = hot_remove_disk(mddev, (kdev_t)arg);
4730+ goto done_unlock;
4731+
4732+ case HOT_ADD_DISK:
4733+ err = hot_add_disk(mddev, (kdev_t)arg);
4734+ goto done_unlock;
4735+
4736+ case SET_DISK_INFO:
4737+ err = set_disk_info(mddev, (void *)arg);
4738+ goto done_unlock;
4739+
4740+ case WRITE_RAID_INFO:
4741+ err = write_raid_info(mddev);
4742+ goto done_unlock;
4743+
4744+ case UNPROTECT_ARRAY:
4745+ err = unprotect_array(mddev);
4746+ goto done_unlock;
4747+
4748+ case PROTECT_ARRAY:
4749+ err = protect_array(mddev);
4750+ goto done_unlock;
4751+
4752+ case SET_DISK_FAULTY:
4753+ err = set_disk_faulty(mddev, (kdev_t)arg);
4754+ goto done_unlock;
4755+
4756+ case RUN_ARRAY:
4757+ {
4758+ mdu_param_t param;
4759+
4760+ err = md_copy_from_user(&param, (mdu_param_t *)arg,
4761+ sizeof(param));
4762+ if (err)
4763+ goto abort_unlock;
4764+
4765+ err = do_md_run (mddev);
4766+ /*
4767+ * we have to clean up the mess if
4768+ * the array cannot be run for some
4769+ * reason ...
4770+ */
4771+ if (err) {
4772+ mddev->sb_dirty = 0;
4773+ do_md_stop (mddev, 0);
4774+ }
4775+ goto done_unlock;
4776+ }
4777+
4778+ default:
4779+			printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ioctls.\n", current->comm, current->pid);
4780+ err = -EINVAL;
4781+ goto abort_unlock;
4782+ }
4783+
4784+done_unlock:
4785+abort_unlock:
4786+ if (mddev)
4787+ unlock_mddev(mddev);
4788+ else
4789+		printk("md: bug: unlocking with no mddev\n");
4790+
4791+ return err;
4792+done:
4793+ if (err)
4794+		printk("md: bug: nonzero error on 'done' path\n");
4795+abort:
4796+ return err;
4797 }
4798
4799+
4800+#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
4801+
4802 static int md_open (struct inode *inode, struct file *file)
4803 {
4804- int minor=MINOR(inode->i_rdev);
4805+ /*
4806+ * Always succeed
4807+ */
4808+ return (0);
4809+}
4810+
4811+static void md_release (struct inode *inode, struct file *file)
4812+{
4813+ sync_dev(inode->i_rdev);
4814+}
4815+
4816+
4817+static int md_read (struct inode *inode, struct file *file,
4818+ char *buf, int count)
4819+{
4820+ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
4821
4822- md_dev[minor].busy++;
4823- return (0); /* Always succeed */
4824+ if (!mddev || !mddev->pers)
4825+ return -ENXIO;
4826+
4827+ return block_read (inode, file, buf, count);
4828 }
4829
4830+static int md_write (struct inode *inode, struct file *file,
4831+ const char *buf, int count)
4832+{
4833+ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
4834+
4835+ if (!mddev || !mddev->pers)
4836+ return -ENXIO;
4837
4838-static int md_release (struct inode *inode, struct file *file)
4839+ return block_write (inode, file, buf, count);
4840+}
4841+
4842+static struct file_operations md_fops=
4843 {
4844- int minor=MINOR(inode->i_rdev);
4845+ NULL,
4846+ md_read,
4847+ md_write,
4848+ NULL,
4849+ NULL,
4850+ md_ioctl,
4851+ NULL,
4852+ md_open,
4853+ md_release,
4854+ block_fsync
4855+};
4856+
4857+#else
4858
4859- sync_dev (inode->i_rdev);
4860- md_dev[minor].busy--;
4861- return 0;
4862+static int md_open (struct inode *inode, struct file *file)
4863+{
4864+ /*
4865+ * Always succeed
4866+ */
4867+ return (0);
4868 }
4869
4870+static int md_release (struct inode *inode, struct file *file)
4871+{
4872+ sync_dev(inode->i_rdev);
4873+ return 0;
4874+}
4875
4876 static ssize_t md_read (struct file *file, char *buf, size_t count,
4877 loff_t *ppos)
4878 {
4879- int minor=MINOR(file->f_dentry->d_inode->i_rdev);
4880+ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
4881
4882- if (!md_dev[minor].pers) /* Check if device is being run */
4883- return -ENXIO;
4884+ if (!mddev || !mddev->pers)
4885+ return -ENXIO;
4886
4887- return block_read(file, buf, count, ppos);
4888+ return block_read(file, buf, count, ppos);
4889 }
4890
4891 static ssize_t md_write (struct file *file, const char *buf,
4892 size_t count, loff_t *ppos)
4893 {
4894- int minor=MINOR(file->f_dentry->d_inode->i_rdev);
4895+ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
4896
4897- if (!md_dev[minor].pers) /* Check if device is being run */
4898- return -ENXIO;
4899+ if (!mddev || !mddev->pers)
4900+ return -ENXIO;
4901
4902- return block_write(file, buf, count, ppos);
4903+ return block_write(file, buf, count, ppos);
4904 }
4905
4906 static struct file_operations md_fops=
4907 {
4908- NULL,
4909- md_read,
4910- md_write,
4911- NULL,
4912- NULL,
4913- md_ioctl,
4914- NULL,
4915- md_open,
4916- NULL,
4917- md_release,
4918- block_fsync
4919+ NULL,
4920+ md_read,
4921+ md_write,
4922+ NULL,
4923+ NULL,
4924+ md_ioctl,
4925+ NULL,
4926+ md_open,
4927+ NULL,
4928+ md_release,
4929+ block_fsync
4930 };
4931
4932-int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size)
4933+#endif
4934+
4935+int md_map (kdev_t dev, kdev_t *rdev,
4936+ unsigned long *rsector, unsigned long size)
4937 {
4938- if ((unsigned int) minor >= MAX_MD_DEV)
4939- {
4940- printk ("Bad md device %d\n", minor);
4941- return (-1);
4942- }
4943-
4944- if (!md_dev[minor].pers)
4945- {
4946- printk ("Oops ! md%d not running, giving up !\n", minor);
4947- return (-1);
4948- }
4949+ int err;
4950+ mddev_t *mddev = kdev_to_mddev(dev);
4951
4952- return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size));
4953+ if (!mddev || !mddev->pers) {
4954+ err = -ENXIO;
4955+ goto out;
4956+ }
4957+
4958+ err = mddev->pers->map(mddev, dev, rdev, rsector, size);
4959+out:
4960+ return err;
4961 }
4962
4963-int md_make_request (int minor, int rw, struct buffer_head * bh)
4964+int md_make_request (struct buffer_head * bh, int rw)
4965 {
4966- if (md_dev [minor].pers->make_request) {
4967- if (buffer_locked(bh))
4968- return 0;
4969+ int err;
4970+ mddev_t *mddev = kdev_to_mddev(bh->b_dev);
4971+
4972+ if (!mddev || !mddev->pers) {
4973+ err = -ENXIO;
4974+ goto out;
4975+ }
4976+
4977+ if (mddev->pers->make_request) {
4978+ if (buffer_locked(bh)) {
4979+ err = 0;
4980+ goto out;
4981+ }
4982 set_bit(BH_Lock, &bh->b_state);
4983 if (rw == WRITE || rw == WRITEA) {
4984 if (!buffer_dirty(bh)) {
4985- bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
4986- return 0;
4987+ bh->b_end_io(bh, buffer_uptodate(bh));
4988+ err = 0;
4989+ goto out;
4990 }
4991 }
4992 if (rw == READ || rw == READA) {
4993 if (buffer_uptodate(bh)) {
4994- bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
4995- return 0;
4996+ bh->b_end_io(bh, buffer_uptodate(bh));
4997+ err = 0;
4998+ goto out;
4999 }
5000 }
5001- return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh));
5002+ err = mddev->pers->make_request(mddev, rw, bh);
5003 } else {
5004 make_request (MAJOR(bh->b_rdev), rw, bh);
5005- return 0;
5006+ err = 0;
5007 }
5008+out:
5009+ return err;
5010 }
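+/*
+ * Note: a personality without a make_request hook is assumed to have
+ * had bh->b_rdev/b_rsector remapped via md_map() by the caller, so
+ * its requests simply fall through to the generic make_request()
+ * above.
+ */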
5011
5012 static void do_md_request (void)
5013 {
5014- printk ("Got md request, not good...");
5015- return;
5016+	printk(KERN_ALERT "Got md request, not good...\n");
5017+ return;
5018+}
5019+
5020+int md_thread(void * arg)
5021+{
5022+ mdk_thread_t *thread = arg;
5023+
5024+ md_lock_kernel();
5025+ exit_mm(current);
5026+ exit_files(current);
5027+ exit_fs(current);
5028+
5029+ /*
5030+ * Detach thread
5031+ */
5032+ sys_setsid();
5033+	sprintf(current->comm, "%s", thread->name);
5034+ md_init_signals();
5035+ md_flush_signals();
5036+ thread->tsk = current;
5037+
5038+ /*
5039+	 * md_thread is a 'system-thread', its priority should be very
5040+ * high. We avoid resource deadlocks individually in each
5041+ * raid personality. (RAID5 does preallocation) We also use RR and
5042+ * the very same RT priority as kswapd, thus we will never get
5043+ * into a priority inversion deadlock.
5044+ *
5045+ * we definitely have to have equal or higher priority than
5046+ * bdflush, otherwise bdflush will deadlock if there are too
5047+ * many dirty RAID5 blocks.
5048+ */
5049+ current->policy = SCHED_OTHER;
5050+ current->priority = 40;
5051+
5052+ up(thread->sem);
5053+
5054+ for (;;) {
5055+ cli();
5056+ if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
5057+ if (!thread->run)
5058+ break;
5059+ interruptible_sleep_on(&thread->wqueue);
5060+ }
5061+ sti();
5062+ clear_bit(THREAD_WAKEUP, &thread->flags);
5063+ if (thread->run) {
5064+ thread->run(thread->data);
5065+ run_task_queue(&tq_disk);
5066+ }
5067+ if (md_signal_pending(current)) {
5068+ printk("%8s(%d) flushing signals.\n", current->comm,
5069+ current->pid);
5070+ md_flush_signals();
5071+ }
5072+ }
5073+ sti();
5074+ up(thread->sem);
5075+ return 0;
5076 }
5077
5078-void md_wakeup_thread(struct md_thread *thread)
5079+void md_wakeup_thread(mdk_thread_t *thread)
5080 {
5081 set_bit(THREAD_WAKEUP, &thread->flags);
5082 wake_up(&thread->wqueue);
5083 }
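+/*
+ * Wakeup handshake: md_wakeup_thread() sets THREAD_WAKEUP and then
+ * wakes the queue, while the worker loop in md_thread() re-tests
+ * the bit under cli() before sleeping. With 2.2's global cli() this
+ * closes the test-to-sleep window, so a wakeup is either observed
+ * directly or delivered through the wait queue.
+ */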
5084
5085-struct md_thread *md_register_thread (void (*run) (void *), void *data)
5086+mdk_thread_t *md_register_thread (void (*run) (void *),
5087+ void *data, const char *name)
5088 {
5089- struct md_thread *thread = (struct md_thread *)
5090- kmalloc(sizeof(struct md_thread), GFP_KERNEL);
5091+ mdk_thread_t *thread;
5092 int ret;
5093 struct semaphore sem = MUTEX_LOCKED;
5094
5095- if (!thread) return NULL;
5096+ thread = (mdk_thread_t *) kmalloc
5097+ (sizeof(mdk_thread_t), GFP_KERNEL);
5098+ if (!thread)
5099+ return NULL;
5100
5101- memset(thread, 0, sizeof(struct md_thread));
5102+ memset(thread, 0, sizeof(mdk_thread_t));
5103 init_waitqueue(&thread->wqueue);
5104
5105 thread->sem = &sem;
5106 thread->run = run;
5107 thread->data = data;
5108+ thread->name = name;
5109 ret = kernel_thread(md_thread, thread, 0);
5110 if (ret < 0) {
5111 kfree(thread);
5112@@ -838,270 +3032,407 @@
5113 return thread;
5114 }
5115
5116-void md_unregister_thread (struct md_thread *thread)
5117+void md_interrupt_thread (mdk_thread_t *thread)
5118+{
5119+ if (!thread->tsk) {
5120+ MD_BUG();
5121+ return;
5122+ }
5123+ printk("interrupting MD-thread pid %d\n", thread->tsk->pid);
5124+ send_sig(SIGKILL, thread->tsk, 1);
5125+}
5126+
5127+void md_unregister_thread (mdk_thread_t *thread)
5128 {
5129 struct semaphore sem = MUTEX_LOCKED;
5130
5131 thread->sem = &sem;
5132 thread->run = NULL;
5133- if (thread->tsk)
5134- printk("Killing md_thread %d %p %s\n",
5135- thread->tsk->pid, thread->tsk, thread->tsk->comm);
5136- else
5137- printk("Aiee. md_thread has 0 tsk\n");
5138- send_sig(SIGKILL, thread->tsk, 1);
5139- printk("downing on %p\n", &sem);
5140+ thread->name = NULL;
5141+ if (!thread->tsk) {
5142+ MD_BUG();
5143+ return;
5144+ }
5145+ md_interrupt_thread(thread);
5146 down(&sem);
5147 }
5148
5149-#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))
5150-
5151-int md_thread(void * arg)
5152+void md_recover_arrays (void)
5153 {
5154- struct md_thread *thread = arg;
5155-
5156- lock_kernel();
5157- exit_mm(current);
5158- exit_files(current);
5159- exit_fs(current);
5160-
5161- current->session = 1;
5162- current->pgrp = 1;
5163- sprintf(current->comm, "md_thread");
5164- siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
5165- thread->tsk = current;
5166- up(thread->sem);
5167-
5168- for (;;) {
5169- cli();
5170- if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
5171- do {
5172- spin_lock(&current->sigmask_lock);
5173- flush_signals(current);
5174- spin_unlock(&current->sigmask_lock);
5175- interruptible_sleep_on(&thread->wqueue);
5176- cli();
5177- if (test_bit(THREAD_WAKEUP, &thread->flags))
5178- break;
5179- if (!thread->run) {
5180- sti();
5181- up(thread->sem);
5182- return 0;
5183- }
5184- } while (signal_pending(current));
5185- }
5186- sti();
5187- clear_bit(THREAD_WAKEUP, &thread->flags);
5188- if (thread->run) {
5189- thread->run(thread->data);
5190- run_task_queue(&tq_disk);
5191- }
5192+ if (!md_recovery_thread) {
5193+ MD_BUG();
5194+ return;
5195 }
5196+ md_wakeup_thread(md_recovery_thread);
5197 }
5198
5199-EXPORT_SYMBOL(md_size);
5200-EXPORT_SYMBOL(md_maxreadahead);
5201-EXPORT_SYMBOL(register_md_personality);
5202-EXPORT_SYMBOL(unregister_md_personality);
5203-EXPORT_SYMBOL(partition_name);
5204-EXPORT_SYMBOL(md_dev);
5205-EXPORT_SYMBOL(md_error);
5206-EXPORT_SYMBOL(md_register_thread);
5207-EXPORT_SYMBOL(md_unregister_thread);
5208-EXPORT_SYMBOL(md_update_sb);
5209-EXPORT_SYMBOL(md_map);
5210-EXPORT_SYMBOL(md_wakeup_thread);
5211-EXPORT_SYMBOL(md_do_sync);
5212
5213-#ifdef CONFIG_PROC_FS
5214-static struct proc_dir_entry proc_md = {
5215- PROC_MD, 6, "mdstat",
5216- S_IFREG | S_IRUGO, 1, 0, 0,
5217- 0, &proc_array_inode_operations,
5218-};
5219+int md_error (kdev_t dev, kdev_t rdev)
5220+{
5221+ mddev_t *mddev = kdev_to_mddev(dev);
5222+ mdk_rdev_t * rrdev;
5223+ int rc;
5224+
5225+ if (!mddev) {
5226+ MD_BUG();
5227+ return 0;
5228+ }
5229+ rrdev = find_rdev(mddev, rdev);
5230+ mark_rdev_faulty(rrdev);
5231+ /*
5232+ * if recovery was running, stop it now.
5233+ */
5234+ if (mddev->pers->stop_resync)
5235+ mddev->pers->stop_resync(mddev);
5236+ if (mddev->recovery_running)
5237+ md_interrupt_thread(md_recovery_thread);
5238+ if (mddev->pers->error_handler) {
5239+ rc = mddev->pers->error_handler(mddev, rdev);
5240+ md_recover_arrays();
5241+ return rc;
5242+ }
5243+#if 0
5244+ /*
5245+ * Drop all buffers in the failed array.
5246+ * _not_. This is called from IRQ handlers ...
5247+ */
5248+ invalidate_buffers(rdev);
5249 #endif
5250+ return 0;
5251+}
5252
5253-static void md_geninit (struct gendisk *gdisk)
5254+static int status_unused (char * page)
5255 {
5256- int i;
5257-
5258- for(i=0;i<MAX_MD_DEV;i++)
5259- {
5260- md_blocksizes[i] = 1024;
5261- md_maxreadahead[i] = MD_DEFAULT_DISK_READAHEAD;
5262- md_gendisk.part[i].start_sect=-1; /* avoid partition check */
5263- md_gendisk.part[i].nr_sects=0;
5264- md_dev[i].pers=NULL;
5265- }
5266+ int sz = 0, i = 0;
5267+ mdk_rdev_t *rdev;
5268+ struct md_list_head *tmp;
5269
5270- blksize_size[MD_MAJOR] = md_blocksizes;
5271- max_readahead[MD_MAJOR] = md_maxreadahead;
5272+ sz += sprintf(page + sz, "unused devices: ");
5273
5274-#ifdef CONFIG_PROC_FS
5275- proc_register(&proc_root, &proc_md);
5276-#endif
5277+ ITERATE_RDEV_ALL(rdev,tmp) {
5278+ if (!rdev->same_set.next && !rdev->same_set.prev) {
5279+ /*
5280+ * The device is not yet used by any array.
5281+ */
5282+ i++;
5283+ sz += sprintf(page + sz, "%s ",
5284+ partition_name(rdev->dev));
5285+ }
5286+ }
5287+ if (!i)
5288+ sz += sprintf(page + sz, "<none>");
5289+
5290+ sz += sprintf(page + sz, "\n");
5291+ return sz;
5292 }
5293
5294-int md_error (kdev_t mddev, kdev_t rdev)
5295+
5296+static int status_resync (char * page, mddev_t * mddev)
5297 {
5298- unsigned int minor = MINOR (mddev);
5299- int rc;
5300+ int sz = 0;
5301+ unsigned int blocksize, max_blocks, resync, res, dt, tt, et;
5302
5303- if (MAJOR(mddev) != MD_MAJOR || minor > MAX_MD_DEV)
5304- panic ("md_error gets unknown device\n");
5305- if (!md_dev [minor].pers)
5306- panic ("md_error gets an error for an unknown device\n");
5307- if (md_dev [minor].pers->error_handler) {
5308- rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev);
5309-#if SUPPORT_RECONSTRUCTION
5310- md_wakeup_thread(md_sync_thread);
5311-#endif /* SUPPORT_RECONSTRUCTION */
5312- return rc;
5313- }
5314- return 0;
5315+ resync = mddev->curr_resync;
5316+ blocksize = blksize_size[MD_MAJOR][mdidx(mddev)];
5317+ max_blocks = blk_size[MD_MAJOR][mdidx(mddev)] / (blocksize >> 10);
5318+
5319+ /*
5320+ * Should not happen.
5321+ */
5322+ if (!max_blocks) {
5323+ MD_BUG();
5324+ return 0;
5325+ }
5326+ res = resync*100/max_blocks;
5327+ if (!mddev->recovery_running)
5328+ /*
5329+ * true resync
5330+ */
5331+ sz += sprintf(page + sz, " resync=%u%%", res);
5332+ else
5333+ /*
5334+ * recovery ...
5335+ */
5336+ sz += sprintf(page + sz, " recovery=%u%%", res);
5337+
5338+ /*
5339+ * We do not want to overflow, so the order of operands and
5340+ * the * 100 / 100 trick are important. We do a +1 to be
5341+ * safe against division by zero. We only estimate anyway.
5342+ *
5343+ * dt: time until now
5344+ * tt: total time
5345+ * et: estimated finish time
5346+ */
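+	/*
+	 * Worked example (illustrative numbers): with max_blocks
+	 * 1000000, resync 250000 and dt 600 seconds,
+	 * tt = (600 * (1000000/2501))/100 = 2394, hence et = 1794
+	 * seconds and " finish=29.9min" gets printed.
+	 */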
5347+ dt = ((jiffies - mddev->resync_start) / HZ);
5348+ tt = (dt * (max_blocks / (resync/100+1)))/100;
5349+ if (tt > dt)
5350+ et = tt - dt;
5351+ else
5352+ /*
5353+ * ignore rounding effects near finish time
5354+ */
5355+ et = 0;
5356+
5357+ sz += sprintf(page + sz, " finish=%u.%umin", et / 60, (et % 60)/6);
5358+
5359+ return sz;
5360 }
5361
5362 int get_md_status (char *page)
5363 {
5364- int sz=0, i, j, size;
5365-
5366- sz+=sprintf( page+sz, "Personalities : ");
5367- for (i=0; i<MAX_PERSONALITY; i++)
5368- if (pers[i])
5369- sz+=sprintf (page+sz, "[%d %s] ", i, pers[i]->name);
5370-
5371- page[sz-1]='\n';
5372-
5373- sz+=sprintf (page+sz, "read_ahead ");
5374- if (read_ahead[MD_MAJOR]==INT_MAX)
5375- sz+=sprintf (page+sz, "not set\n");
5376- else
5377- sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
5378+ int sz = 0, j, size;
5379+ struct md_list_head *tmp, *tmp2;
5380+ mdk_rdev_t *rdev;
5381+ mddev_t *mddev;
5382+
5383+ sz += sprintf(page + sz, "Personalities : ");
5384+ for (j = 0; j < MAX_PERSONALITY; j++)
5385+ if (pers[j])
5386+ sz += sprintf(page+sz, "[%s] ", pers[j]->name);
5387+
5388+ sz += sprintf(page+sz, "\n");
5389+
5390+
5391+ sz += sprintf(page+sz, "read_ahead ");
5392+ if (read_ahead[MD_MAJOR] == INT_MAX)
5393+ sz += sprintf(page+sz, "not set\n");
5394+ else
5395+ sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
5396
5397- for (i=0; i<MAX_MD_DEV; i++)
5398- {
5399- sz+=sprintf (page+sz, "md%d : %sactive", i, md_dev[i].pers ? "" : "in");
5400-
5401- if (md_dev[i].pers)
5402- sz+=sprintf (page+sz, " %s", md_dev[i].pers->name);
5403+ ITERATE_MDDEV(mddev,tmp) {
5404+ sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
5405+ mddev->pers ? "" : "in");
5406+ if (mddev->pers) {
5407+ if (mddev->ro)
5408+ sz += sprintf(page + sz, " (read-only)");
5409+ sz += sprintf(page + sz, " %s", mddev->pers->name);
5410+ }
5411
5412- size=0;
5413- for (j=0; j<md_dev[i].nb_dev; j++)
5414- {
5415- sz+=sprintf (page+sz, " %s",
5416- partition_name(md_dev[i].devices[j].dev));
5417- size+=md_dev[i].devices[j].size;
5418- }
5419+ size = 0;
5420+ ITERATE_RDEV(mddev,rdev,tmp2) {
5421+ sz += sprintf(page + sz, " %s[%d]",
5422+ partition_name(rdev->dev), rdev->desc_nr);
5423+ if (rdev->faulty) {
5424+ sz += sprintf(page + sz, "(F)");
5425+ continue;
5426+ }
5427+ size += rdev->size;
5428+ }
5429
5430- if (md_dev[i].nb_dev) {
5431- if (md_dev[i].pers)
5432- sz+=sprintf (page+sz, " %d blocks", md_size[i]);
5433- else
5434- sz+=sprintf (page+sz, " %d blocks", size);
5435- }
5436+ if (mddev->nb_dev) {
5437+ if (mddev->pers)
5438+ sz += sprintf(page + sz, " %d blocks",
5439+ md_size[mdidx(mddev)]);
5440+ else
5441+ sz += sprintf(page + sz, " %d blocks", size);
5442+ }
5443
5444- if (!md_dev[i].pers)
5445- {
5446- sz+=sprintf (page+sz, "\n");
5447- continue;
5448- }
5449+ if (!mddev->pers) {
5450+ sz += sprintf(page+sz, "\n");
5451+ continue;
5452+ }
5453
5454- if (md_dev[i].pers->max_invalid_dev)
5455- sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i));
5456+ sz += mddev->pers->status (page+sz, mddev);
5457
5458- sz+=md_dev[i].pers->status (page+sz, i, md_dev+i);
5459- sz+=sprintf (page+sz, "\n");
5460- }
5461+ if (mddev->curr_resync)
5462+ sz += status_resync (page+sz, mddev);
5463+ else {
5464+ if (md_atomic_read(&mddev->resync_sem.count) != 1)
5465+ sz += sprintf(page + sz, " resync=DELAYED");
5466+ }
5467+ sz += sprintf(page + sz, "\n");
5468+ }
5469+ sz += status_unused (page + sz);
5470
5471- return (sz);
5472+ return (sz);
5473 }
5474
5475-int register_md_personality (int p_num, struct md_personality *p)
5476+int register_md_personality (int pnum, mdk_personality_t *p)
5477 {
5478- int i=(p_num >> PERSONALITY_SHIFT);
5479-
5480- if (i >= MAX_PERSONALITY)
5481- return -EINVAL;
5482+ if (pnum >= MAX_PERSONALITY)
5483+ return -EINVAL;
5484
5485- if (pers[i])
5486- return -EBUSY;
5487+ if (pers[pnum])
5488+ return -EBUSY;
5489
5490- pers[i]=p;
5491- printk ("%s personality registered\n", p->name);
5492- return 0;
5493+ pers[pnum] = p;
5494+ printk(KERN_INFO "%s personality registered\n", p->name);
5495+ return 0;
5496 }
5497
5498-int unregister_md_personality (int p_num)
5499+int unregister_md_personality (int pnum)
5500 {
5501- int i=(p_num >> PERSONALITY_SHIFT);
5502-
5503- if (i >= MAX_PERSONALITY)
5504- return -EINVAL;
5505+ if (pnum >= MAX_PERSONALITY)
5506+ return -EINVAL;
5507
5508- printk ("%s personality unregistered\n", pers[i]->name);
5509- pers[i]=NULL;
5510- return 0;
5511+ printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
5512+ pers[pnum] = NULL;
5513+ return 0;
5514 }
5515
5516-static md_descriptor_t *get_spare(struct md_dev *mddev)
5517+static mdp_disk_t *get_spare(mddev_t *mddev)
5518 {
5519- int i;
5520- md_superblock_t *sb = mddev->sb;
5521- md_descriptor_t *descriptor;
5522- struct real_dev *realdev;
5523-
5524- for (i = 0; i < mddev->nb_dev; i++) {
5525- realdev = &mddev->devices[i];
5526- if (!realdev->sb)
5527+ mdp_super_t *sb = mddev->sb;
5528+ mdp_disk_t *disk;
5529+ mdk_rdev_t *rdev;
5530+ struct md_list_head *tmp;
5531+
5532+ ITERATE_RDEV(mddev,rdev,tmp) {
5533+ if (rdev->faulty)
5534+ continue;
5535+ if (!rdev->sb) {
5536+ MD_BUG();
5537 continue;
5538- descriptor = &sb->disks[realdev->sb->descriptor.number];
5539- if (descriptor->state & (1 << MD_FAULTY_DEVICE))
5540+ }
5541+ disk = &sb->disks[rdev->desc_nr];
5542+ if (disk_faulty(disk)) {
5543+ MD_BUG();
5544 continue;
5545- if (descriptor->state & (1 << MD_ACTIVE_DEVICE))
5546+ }
5547+ if (disk_active(disk))
5548 continue;
5549- return descriptor;
5550+ return disk;
5551 }
5552 return NULL;
5553 }
5554
5555+static int is_mddev_idle (mddev_t *mddev)
5556+{
5557+ mdk_rdev_t * rdev;
5558+ struct md_list_head *tmp;
5559+ int idle;
5560+ unsigned long curr_events;
5561+
5562+ idle = 1;
5563+ ITERATE_RDEV(mddev,rdev,tmp) {
5564+ curr_events = io_events[MAJOR(rdev->dev)];
5565+
5566+ if (curr_events != rdev->last_events) {
5567+// printk("!I(%d)", curr_events-rdev->last_events);
5568+ rdev->last_events = curr_events;
5569+ idle = 0;
5570+ }
5571+ }
5572+ return idle;
5573+}
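+/*
+ * io_events[] is assumed to be a per-major IO activity counter,
+ * maintained by this patch's ll_rw_blk changes; any movement since
+ * the previous sample marks the array busy, which the resync loop
+ * below uses to back off to the guaranteed-minimum speed.
+ */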
5574+
5575 /*
5576 * parallel resyncing thread.
5577- *
5578- * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
5579- * - fix read error handing
5580 */
5581
5582-int md_do_sync(struct md_dev *mddev)
5583+/*
5584+ * Determine correct block size for this device.
5585+ */
5586+unsigned int device_bsize (kdev_t dev)
5587+{
5588+ unsigned int i, correct_size;
5589+
5590+ correct_size = BLOCK_SIZE;
5591+ if (blksize_size[MAJOR(dev)]) {
5592+ i = blksize_size[MAJOR(dev)][MINOR(dev)];
5593+ if (i)
5594+ correct_size = i;
5595+ }
5596+
5597+ return correct_size;
5598+}
5599+
5600+static struct wait_queue *resync_wait = (struct wait_queue *)NULL;
5601+
5602+#define RA_ORDER (1)
5603+#define RA_PAGE_SIZE (PAGE_SIZE*(1<<RA_ORDER))
5604+#define MAX_NR_BLOCKS (RA_PAGE_SIZE/sizeof(struct buffer_head *))
5605+
5606+int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
5607 {
5608- struct buffer_head *bh;
5609- int max_blocks, blocksize, curr_bsize, percent=1, j;
5610- kdev_t read_disk = MKDEV(MD_MAJOR, mddev - md_dev);
5611+ mddev_t *mddev2;
5612+ struct buffer_head **bh;
5613+ unsigned int max_blocks, blocksize, curr_bsize,
5614+ i, ii, j, k, chunk, window, nr_blocks, err, serialize;
5615+ kdev_t read_disk = mddev_to_kdev(mddev);
5616 int major = MAJOR(read_disk), minor = MINOR(read_disk);
5617 unsigned long starttime;
5618+ int max_read_errors = 2*MAX_NR_BLOCKS,
5619+ max_write_errors = 2*MAX_NR_BLOCKS;
5620+ struct md_list_head *tmp;
5621+
5622+retry_alloc:
5623+ bh = (struct buffer_head **) md__get_free_pages(GFP_KERNEL, RA_ORDER);
5624+ if (!bh) {
5625+ printk(KERN_ERR
5626+ "could not alloc bh array for reconstruction ... retrying!\n");
5627+ goto retry_alloc;
5628+ }
5629+
5630+ err = down_interruptible(&mddev->resync_sem);
5631+ if (err)
5632+ goto out_nolock;
5633+
5634+recheck:
5635+ serialize = 0;
5636+ ITERATE_MDDEV(mddev2,tmp) {
5637+ if (mddev2 == mddev)
5638+ continue;
5639+ if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
5640+ printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2));
5641+ serialize = 1;
5642+ break;
5643+ }
5644+ }
5645+ if (serialize) {
5646+ interruptible_sleep_on(&resync_wait);
5647+ if (md_signal_pending(current)) {
5648+ md_flush_signals();
5649+ err = -EINTR;
5650+ goto out;
5651+ }
5652+ goto recheck;
5653+ }
5654+
5655+ mddev->curr_resync = 1;
5656
5657- blocksize = blksize_size[major][minor];
5658+ blocksize = device_bsize(read_disk);
5659 max_blocks = blk_size[major][minor] / (blocksize >> 10);
5660
5661- printk("... resync log\n");
5662- printk(" .... mddev->nb_dev: %d\n", mddev->nb_dev);
5663- printk(" .... raid array: %s\n", kdevname(read_disk));
5664- printk(" .... max_blocks: %d blocksize: %d\n", max_blocks, blocksize);
5665- printk("md: syncing RAID array %s\n", kdevname(read_disk));
5666+ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
5667+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec.\n",
5668+ sysctl_speed_limit);
5669+	printk(KERN_INFO "md: using maximum available idle IO bandwidth for reconstruction.\n");
5670+
5671+ /*
5672+ * Resync has low priority.
5673+ */
5674+ current->priority = 1;
5675+
5676+ is_mddev_idle(mddev); /* this also initializes IO event counters */
5677+ starttime = jiffies;
5678+ mddev->resync_start = starttime;
5679
5680- mddev->busy++;
5681+ /*
5682+ * Tune reconstruction:
5683+ */
5684+ window = md_maxreadahead[mdidx(mddev)]/1024;
5685+ nr_blocks = window / (blocksize >> 10);
5686+ if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
5687+ nr_blocks = MAX_NR_BLOCKS;
5688+	printk(KERN_INFO "md: using %dk window.\n", window);
5689
5690- starttime=jiffies;
5691- for (j = 0; j < max_blocks; j++) {
5692+ for (j = 0; j < max_blocks; j += nr_blocks) {
5693
5694+ if (j)
5695+ mddev->curr_resync = j;
5696 /*
5697 * B careful. When some1 mounts a non-'blocksize' filesystem
5698 * then we get the blocksize changed right under us. Go deal
5699 * with it transparently, recalculate 'blocksize', 'j' and
5700 * 'max_blocks':
5701 */
5702- curr_bsize = blksize_size[major][minor];
5703+ curr_bsize = device_bsize(read_disk);
5704 if (curr_bsize != blocksize) {
5705- diff_blocksize:
5706+ printk(KERN_INFO "md%d: blocksize changed\n",
5707+ mdidx(mddev));
5708+retry_read:
5709 if (curr_bsize > blocksize)
5710 /*
5711 * this is safe, rounds downwards.
5712@@ -1111,114 +3442,384 @@
5713 j *= blocksize/curr_bsize;
5714
5715 blocksize = curr_bsize;
5716+ nr_blocks = window / (blocksize >> 10);
5717+ if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
5718+ nr_blocks = MAX_NR_BLOCKS;
5719 max_blocks = blk_size[major][minor] / (blocksize >> 10);
5720- }
5721- if ((bh = breada (read_disk, j, blocksize, j * blocksize,
5722- max_blocks * blocksize)) != NULL) {
5723- mark_buffer_dirty(bh, 1);
5724- brelse(bh);
5725- } else {
5726+ printk("nr_blocks changed to %d (blocksize %d, j %d, max_blocks %d)\n",
5727+ nr_blocks, blocksize, j, max_blocks);
5728 /*
5729- * FIXME: Ugly, but set_blocksize() isnt safe ...
5730+ * We will retry the current block-group
5731 */
5732- curr_bsize = blksize_size[major][minor];
5733- if (curr_bsize != blocksize)
5734- goto diff_blocksize;
5735+ }
5736
5737- /*
5738- * It's a real read problem. FIXME, handle this
5739- * a better way.
5740- */
5741- printk ( KERN_ALERT
5742- "read error, stopping reconstruction.\n");
5743- mddev->busy--;
5744- return 1;
5745+ /*
5746+ * Cleanup routines expect this
5747+ */
5748+ for (k = 0; k < nr_blocks; k++)
5749+ bh[k] = NULL;
5750+
5751+ chunk = nr_blocks;
5752+ if (chunk > max_blocks-j)
5753+ chunk = max_blocks-j;
5754+
5755+ /*
5756+ * request buffer heads ...
5757+ */
5758+ for (i = 0; i < chunk; i++) {
5759+ bh[i] = getblk (read_disk, j+i, blocksize);
5760+ if (!bh[i])
5761+ goto read_error;
5762+ if (!buffer_dirty(bh[i]))
5763+ mark_buffer_lowprio(bh[i]);
5764 }
5765
5766 /*
5767- * Let's sleep some if we are faster than our speed limit:
5768+ * read buffer heads ...
5769 */
5770- while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT)
5771- {
5772- current->state = TASK_INTERRUPTIBLE;
5773- schedule_timeout(1);
5774+ ll_rw_block (READ, chunk, bh);
5775+ run_task_queue(&tq_disk);
5776+
5777+ /*
5778+ * verify that all of them are OK ...
5779+ */
5780+ for (i = 0; i < chunk; i++) {
5781+ ii = chunk-i-1;
5782+ wait_on_buffer(bh[ii]);
5783+ if (!buffer_uptodate(bh[ii]))
5784+ goto read_error;
5785+ }
5786+
5787+retry_write:
5788+ for (i = 0; i < chunk; i++)
5789+ mark_buffer_dirty_lowprio(bh[i]);
5790+
5791+ ll_rw_block(WRITE, chunk, bh);
5792+ run_task_queue(&tq_disk);
5793+
5794+ for (i = 0; i < chunk; i++) {
5795+ ii = chunk-i-1;
5796+ wait_on_buffer(bh[ii]);
5797+
5798+ if (spare && disk_faulty(spare)) {
5799+ for (k = 0; k < chunk; k++)
5800+ brelse(bh[k]);
5801+				printk(" <SPARE FAILED!>\n");
5802+ err = -EIO;
5803+ goto out;
5804+ }
5805+
5806+ if (!buffer_uptodate(bh[ii])) {
5807+ curr_bsize = device_bsize(read_disk);
5808+ if (curr_bsize != blocksize) {
5809+ printk(KERN_INFO
5810+ "md%d: blocksize changed during write\n",
5811+ mdidx(mddev));
5812+ for (k = 0; k < chunk; k++)
5813+ if (bh[k]) {
5814+ if (buffer_lowprio(bh[k]))
5815+ mark_buffer_clean(bh[k]);
5816+ brelse(bh[k]);
5817+ }
5818+ goto retry_read;
5819+ }
5820+			printk(" <BAD WRITE %8d>\n", j);
5821+ /*
5822+ * Ouch, write error, retry or bail out.
5823+ */
5824+ if (max_write_errors) {
5825+ max_write_errors--;
5826+ printk ( KERN_WARNING "md%d: write error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
5827+ goto retry_write;
5828+ }
5829+ printk ( KERN_ALERT
5830+ "too many write errors, stopping reconstruction.\n");
5831+ for (k = 0; k < chunk; k++)
5832+ if (bh[k]) {
5833+ if (buffer_lowprio(bh[k]))
5834+ mark_buffer_clean(bh[k]);
5835+ brelse(bh[k]);
5836+ }
5837+ err = -EIO;
5838+ goto out;
5839+ }
5840 }
5841
5842 /*
5843- * FIXME: put this status bar thing into /proc
5844+		 * This is the normal 'everything went OK' case:
5845+		 * do the 'free-behind' logic, as we surely don't need
5846+		 * this buffer if it was the only user.
5847 */
5848- if (!(j%(max_blocks/100))) {
5849- if (!(percent%10))
5850- printk (" %03d%% done.\n",percent);
5851+ for (i = 0; i < chunk; i++)
5852+ if (buffer_dirty(bh[i]))
5853+ brelse(bh[i]);
5854 else
5855- printk (".");
5856- percent++;
5857+ bforget(bh[i]);
5858+
5859+
5860+ if (md_signal_pending(current)) {
5861+ /*
5862+ * got a signal, exit.
5863+ */
5864+ mddev->curr_resync = 0;
5865+ printk("md_do_sync() got signal ... exiting\n");
5866+ md_flush_signals();
5867+ err = -EINTR;
5868+ goto out;
5869 }
5870+
5871+ /*
5872+	 * this loop exits only when we are slower than
5873+	 * the 'hard' speed limit, or when the system was IO-idle
5874+	 * for a jiffy.
5875+ * the system might be non-idle CPU-wise, but we only care
5876+ * about not overloading the IO subsystem. (things like an
5877+ * e2fsck being done on the RAID array should execute fast)
5878+ */
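+	/*
+	 * The expression below is the achieved resync rate in KB/sec:
+	 * e.g. (illustrative numbers) blocksize 1024, j = 60000 blocks
+	 * and 120 seconds elapsed give 60000/121 + 1 = 496 KB/sec.
+	 * Above sysctl_speed_limit we keep going only while the array
+	 * is otherwise idle; below it we run at normal priority.
+	 */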
5879+repeat:
5880+ if (md_need_resched(current))
5881+ schedule();
5882+
5883+ if ((blocksize/1024)*j/((jiffies-starttime)/HZ + 1) + 1
5884+ > sysctl_speed_limit) {
5885+ current->priority = 1;
5886+
5887+ if (!is_mddev_idle(mddev)) {
5888+ current->state = TASK_INTERRUPTIBLE;
5889+ md_schedule_timeout(HZ/2);
5890+ if (!md_signal_pending(current))
5891+ goto repeat;
5892+ }
5893+ } else
5894+ current->priority = 40;
5895 }
5896 fsync_dev(read_disk);
5897- printk("md: %s: sync done.\n", kdevname(read_disk));
5898- mddev->busy--;
5899- return 0;
5900+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
5901+ err = 0;
5902+ /*
5903+ * this also signals 'finished resyncing' to md_stop
5904+ */
5905+out:
5906+ up(&mddev->resync_sem);
5907+out_nolock:
5908+ free_pages((unsigned long)bh, RA_ORDER);
5909+ mddev->curr_resync = 0;
5910+ wake_up(&resync_wait);
5911+ return err;
5912+
5913+read_error:
5914+ /*
5915+ * set_blocksize() might change the blocksize. This
5916+	 * should not happen often, but it does happen when e.g.
5917+	 * someone mounts a filesystem that has a non-1k
5918+	 * blocksize. set_blocksize() doesn't touch our
5919+ * buffer, but to avoid aliasing problems we change
5920+ * our internal blocksize too and retry the read.
5921+ */
5922+ curr_bsize = device_bsize(read_disk);
5923+ if (curr_bsize != blocksize) {
5924+ printk(KERN_INFO "md%d: blocksize changed during read\n",
5925+ mdidx(mddev));
5926+ for (k = 0; k < chunk; k++)
5927+ if (bh[k]) {
5928+ if (buffer_lowprio(bh[k]))
5929+ mark_buffer_clean(bh[k]);
5930+ brelse(bh[k]);
5931+ }
5932+ goto retry_read;
5933+ }
5934+
5935+ /*
5936+ * It's a real read problem. We retry and bail out
5937+ * only if it's excessive.
5938+ */
5939+ if (max_read_errors) {
5940+ max_read_errors--;
5941+ printk ( KERN_WARNING "md%d: read error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
5942+ for (k = 0; k < chunk; k++)
5943+ if (bh[k]) {
5944+ if (buffer_lowprio(bh[k]))
5945+ mark_buffer_clean(bh[k]);
5946+ brelse(bh[k]);
5947+ }
5948+ goto retry_read;
5949+ }
5950+ printk ( KERN_ALERT "too many read errors, stopping reconstruction.\n");
5951+ for (k = 0; k < chunk; k++)
5952+ if (bh[k]) {
5953+ if (buffer_lowprio(bh[k]))
5954+ mark_buffer_clean(bh[k]);
5955+ brelse(bh[k]);
5956+ }
5957+ err = -EIO;
5958+ goto out;
5959 }
5960
5961+#undef MAX_NR_BLOCKS
5962+
5963 /*
5964- * This is a kernel thread which: syncs a spare disk with the active array
5965+ * This is a kernel thread which syncs a spare disk with the active array
5966 *
5967 * the amount of foolproofing might seem to be a tad excessive, but an
5968 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
5969 * of my root partition with the first 0.5 gigs of my /home partition ... so
5970 * i'm a bit nervous ;)
5971 */
5972-void mdsyncd (void *data)
5973+void md_do_recovery (void *data)
5974 {
5975- int i;
5976- struct md_dev *mddev;
5977- md_superblock_t *sb;
5978- md_descriptor_t *spare;
5979+ int err;
5980+ mddev_t *mddev;
5981+ mdp_super_t *sb;
5982+ mdp_disk_t *spare;
5983 unsigned long flags;
5984+ struct md_list_head *tmp;
5985
5986- for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) {
5987- if ((sb = mddev->sb) == NULL)
5988+ printk(KERN_INFO "md: recovery thread got woken up ...\n");
5989+restart:
5990+ ITERATE_MDDEV(mddev,tmp) {
5991+ sb = mddev->sb;
5992+ if (!sb)
5993+ continue;
5994+ if (mddev->recovery_running)
5995 continue;
5996 if (sb->active_disks == sb->raid_disks)
5997 continue;
5998- if (!sb->spare_disks)
5999+ if (!sb->spare_disks) {
6000+ printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
6001 continue;
6002+ }
6003+ /*
6004+ * now here we get the spare and resync it.
6005+ */
6006 if ((spare = get_spare(mddev)) == NULL)
6007 continue;
6008- if (!mddev->pers->mark_spare)
6009+ printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
6010+ if (!mddev->pers->diskop)
6011 continue;
6012- if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE))
6013+ if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
6014 continue;
6015- if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) {
6016- mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE);
6017+ down(&mddev->recovery_sem);
6018+ mddev->recovery_running = 1;
6019+ err = md_do_sync(mddev, spare);
6020+ if (err == -EIO) {
6021+ printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
6022+ if (!disk_faulty(spare)) {
6023+ mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
6024+ mark_disk_faulty(spare);
6025+ mark_disk_nonsync(spare);
6026+ mark_disk_inactive(spare);
6027+ sb->spare_disks--;
6028+ sb->working_disks--;
6029+ sb->failed_disks++;
6030+ }
6031+ } else
6032+ if (disk_faulty(spare))
6033+ mddev->pers->diskop(mddev, &spare,
6034+ DISKOP_SPARE_INACTIVE);
6035+ if (err == -EINTR) {
6036+ /*
6037+ * Recovery got interrupted ...
6038+ * signal back that we have finished using the array.
6039+ */
6040+ mddev->pers->diskop(mddev, &spare,
6041+ DISKOP_SPARE_INACTIVE);
6042+ up(&mddev->recovery_sem);
6043+ mddev->recovery_running = 0;
6044 continue;
6045+ } else {
6046+ mddev->recovery_running = 0;
6047+ up(&mddev->recovery_sem);
6048 }
6049 save_flags(flags);
6050 cli();
6051- mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE);
6052- spare->state |= (1 << MD_SYNC_DEVICE);
6053- spare->state |= (1 << MD_ACTIVE_DEVICE);
6054- sb->spare_disks--;
6055- sb->active_disks++;
6056- mddev->sb_dirty = 1;
6057- md_update_sb(mddev - md_dev);
6058+ if (!disk_faulty(spare)) {
6059+ /*
6060+ * the SPARE_ACTIVE diskop possibly changes the
6061+ * pointer too
6062+ */
6063+ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
6064+ mark_disk_sync(spare);
6065+ mark_disk_active(spare);
6066+ sb->active_disks++;
6067+ sb->spare_disks--;
6068+ }
6069 restore_flags(flags);
6070+ mddev->sb_dirty = 1;
6071+ md_update_sb(mddev);
6072+ goto restart;
6073 }
6074+ printk(KERN_INFO "md: recovery thread finished ...\n");
6075
6076 }
6077
6078+int md_notify_reboot(struct notifier_block *this,
6079+ unsigned long code, void *x)
6080+{
6081+ struct md_list_head *tmp;
6082+ mddev_t *mddev;
6083+
6084+ if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
6085+ || (code == MD_SYS_POWER_OFF)) {
6086+
6087+ printk(KERN_INFO "stopping all md devices.\n");
6088+
6089+ ITERATE_MDDEV(mddev,tmp)
6090+ do_md_stop (mddev, 1);
6091+ /*
6092+	 * certain more exotic SCSI devices are known to be
6093+	 * volatile with respect to too-early system reboots. While
6094+	 * the right place to handle this issue is the individual
6095+	 * driver, we do want to have a safe RAID driver ...
6096+ */
6097+ md_mdelay(1000*1);
6098+ }
6099+ return NOTIFY_DONE;
6100+}
6101+
6102+struct notifier_block md_notifier = {
6103+ md_notify_reboot,
6104+ NULL,
6105+ 0
6106+};
6107+
6108+md__initfunc(void raid_setup(char *str, int *ints))
6109+{
6110+ char tmpline[100];
6111+ int len, pos, nr, i;
6112+
6113+ len = strlen(str) + 1;
6114+ nr = 0;
6115+ pos = 0;
6116+
6117+ for (i = 0; i < len; i++) {
6118+ char c = str[i];
6119+
6120+ if (c == ',' || !c) {
6121+ tmpline[pos] = 0;
6122+ if (!strcmp(tmpline,"noautodetect"))
6123+ raid_setup_args.noautodetect = 1;
6124+ nr++;
6125+ pos = 0;
6126+ continue;
6127+ }
6128+ tmpline[pos] = c;
6129+ pos++;
6130+ }
6131+ raid_setup_args.set = 1;
6132+ return;
6133+}
6134+
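+/*
+ * Example: booting with "raid=noautodetect" makes the parser above
+ * set raid_setup_args.noautodetect, disabling RAID partition
+ * autodetection at boot; options are comma-separated and unknown
+ * tokens are silently ignored.
+ */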
6135 #ifdef CONFIG_MD_BOOT
6136 struct {
6137 int set;
6138 int ints[100];
6139 char str[100];
6140-} md_setup_args __initdata = {
6141+} md_setup_args md__initdata = {
6142 0,{0},{0}
6143 };
6144
6145 /* called from init/main.c */
6146-__initfunc(void md_setup(char *str,int *ints))
6147+md__initfunc(void md_setup(char *str,int *ints))
6148 {
6149 int i;
6150 for(i=0;i<=ints[0];i++) {
6151@@ -1230,21 +3831,24 @@
6152 return;
6153 }
6154
6155-__initfunc(void do_md_setup(char *str,int *ints))
6156+md__initfunc(void do_md_setup(char *str,int *ints))
6157 {
6158- int minor, pers, factor, fault;
6159+#if 0
6160+ int minor, pers, chunk_size, fault;
6161 kdev_t dev;
6162 int i=1;
6163
6164+ printk("i plan to phase this out --mingo\n");
6165+
6166 if(ints[0] < 4) {
6167- printk ("md: Too few Arguments (%d).\n", ints[0]);
6168+ printk (KERN_WARNING "md: Too few Arguments (%d).\n", ints[0]);
6169 return;
6170 }
6171
6172 minor=ints[i++];
6173
6174- if (minor >= MAX_MD_DEV) {
6175- printk ("md: Minor device number too high.\n");
6176+ if ((unsigned int)minor >= MAX_MD_DEVS) {
6177+ printk (KERN_WARNING "md: Minor device number too high.\n");
6178 return;
6179 }
6180
6181@@ -1254,18 +3858,20 @@
6182 case -1:
6183 #ifdef CONFIG_MD_LINEAR
6184 pers = LINEAR;
6185- printk ("md: Setting up md%d as linear device.\n",minor);
6186+ printk (KERN_INFO "md: Setting up md%d as linear device.\n",
6187+ minor);
6188 #else
6189- printk ("md: Linear mode not configured."
6190+ printk (KERN_WARNING "md: Linear mode not configured."
6191 "Recompile the kernel with linear mode enabled!\n");
6192 #endif
6193 break;
6194 case 0:
6195 pers = STRIPED;
6196 #ifdef CONFIG_MD_STRIPED
6197- printk ("md: Setting up md%d as a striped device.\n",minor);
6198+ printk (KERN_INFO "md: Setting up md%d as a striped device.\n",
6199+ minor);
6200 #else
6201- printk ("md: Striped mode not configured."
6202+ printk (KERN_WARNING "md: Striped mode not configured."
6203 "Recompile the kernel with striped mode enabled!\n");
6204 #endif
6205 break;
6206@@ -1280,79 +3886,145 @@
6207 break;
6208 */
6209 default:
6210- printk ("md: Unknown or not supported raid level %d.\n", ints[--i]);
6211+ printk (KERN_WARNING "md: Unknown or not supported raid level %d.\n", ints[--i]);
6212 return;
6213 }
6214
6215- if(pers) {
6216+ if (pers) {
6217
6218- factor=ints[i++]; /* Chunksize */
6219- fault =ints[i++]; /* Faultlevel */
6220+ chunk_size = ints[i++]; /* Chunksize */
6221+ fault = ints[i++]; /* Faultlevel */
6222
6223- pers=pers | factor | (fault << FAULT_SHIFT);
6224+ pers = pers | chunk_size | (fault << FAULT_SHIFT);
6225
6226- while( str && (dev = name_to_kdev_t(str))) {
6227- do_md_add (minor, dev);
6228- if((str = strchr (str, ',')) != NULL)
6229- str++;
6230- }
6231+ while( str && (dev = name_to_kdev_t(str))) {
6232+ do_md_add (minor, dev);
6233+ if((str = strchr (str, ',')) != NULL)
6234+ str++;
6235+ }
6236
6237- do_md_run (minor, pers);
6238- printk ("md: Loading md%d.\n",minor);
6239+ do_md_run (minor, pers);
6240+ printk (KERN_INFO "md: Loading md%d.\n",minor);
6241 }
6242-
6243+#endif
6244 }
6245 #endif
6246
6247+void hsm_init (void);
6248+void translucent_init (void);
6249 void linear_init (void);
6250 void raid0_init (void);
6251 void raid1_init (void);
6252 void raid5_init (void);
6253
6254-__initfunc(int md_init (void))
6255+md__initfunc(int md_init (void))
6256 {
6257- printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
6258- MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION,
6259- MAX_MD_DEV, MAX_REAL);
6260-
6261- if (register_blkdev (MD_MAJOR, "md", &md_fops))
6262- {
6263- printk ("Unable to get major %d for md\n", MD_MAJOR);
6264- return (-1);
6265- }
6266-
6267- blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST;
6268- blk_dev[MD_MAJOR].current_request=NULL;
6269- read_ahead[MD_MAJOR]=INT_MAX;
6270- memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev));
6271- md_gendisk.next=gendisk_head;
6272-
6273- gendisk_head=&md_gendisk;
6274-
6275-#if SUPPORT_RECONSTRUCTION
6276- if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL)
6277- printk("md: bug: md_sync_thread == NULL\n");
6278-#endif /* SUPPORT_RECONSTRUCTION */
6279+ static char * name = "mdrecoveryd";
6280+
6281+ printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n",
6282+ MD_MAJOR_VERSION, MD_MINOR_VERSION,
6283+ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL);
6284+
6285+ if (register_blkdev (MD_MAJOR, "md", &md_fops))
6286+ {
6287+ printk (KERN_ALERT "Unable to get major %d for md\n", MD_MAJOR);
6288+ return (-1);
6289+ }
6290+
6291+ blk_dev[MD_MAJOR].request_fn = DEVICE_REQUEST;
6292+ blk_dev[MD_MAJOR].current_request = NULL;
6293+ read_ahead[MD_MAJOR] = INT_MAX;
6294+ md_gendisk.next = gendisk_head;
6295+
6296+ gendisk_head = &md_gendisk;
6297+
6298+ md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
6299+ if (!md_recovery_thread)
6300+ printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n");
6301
6302+ md_register_reboot_notifier(&md_notifier);
6303+ md_register_sysctl();
6304+
6305+#ifdef CONFIG_MD_HSM
6306+ hsm_init ();
6307+#endif
6308+#ifdef CONFIG_MD_TRANSLUCENT
6309+ translucent_init ();
6310+#endif
6311 #ifdef CONFIG_MD_LINEAR
6312- linear_init ();
6313+ linear_init ();
6314 #endif
6315 #ifdef CONFIG_MD_STRIPED
6316- raid0_init ();
6317+ raid0_init ();
6318 #endif
6319 #ifdef CONFIG_MD_MIRRORING
6320- raid1_init ();
6321+ raid1_init ();
6322 #endif
6323 #ifdef CONFIG_MD_RAID5
6324- raid5_init ();
6325+ raid5_init ();
6326+#endif
6327+#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE)
6328+ /*
6329+ * pick a XOR routine, runtime.
6330+ */
6331+ calibrate_xor_block();
6332 #endif
6333- return (0);
6334+
6335+ return (0);
6336 }
6337
6338 #ifdef CONFIG_MD_BOOT
6339-__initfunc(void md_setup_drive(void))
6340+md__initfunc(void md_setup_drive(void))
6341 {
6342 if(md_setup_args.set)
6343 do_md_setup(md_setup_args.str, md_setup_args.ints);
6344 }
6345 #endif
6346+
6347+MD_EXPORT_SYMBOL(md_size);
6348+MD_EXPORT_SYMBOL(register_md_personality);
6349+MD_EXPORT_SYMBOL(unregister_md_personality);
6350+MD_EXPORT_SYMBOL(partition_name);
6351+MD_EXPORT_SYMBOL(md_error);
6352+MD_EXPORT_SYMBOL(md_recover_arrays);
6353+MD_EXPORT_SYMBOL(md_register_thread);
6354+MD_EXPORT_SYMBOL(md_unregister_thread);
6355+MD_EXPORT_SYMBOL(md_update_sb);
6356+MD_EXPORT_SYMBOL(md_map);
6357+MD_EXPORT_SYMBOL(md_wakeup_thread);
6358+MD_EXPORT_SYMBOL(md_do_sync);
6359+MD_EXPORT_SYMBOL(md_print_devices);
6360+MD_EXPORT_SYMBOL(find_rdev_nr);
6361+MD_EXPORT_SYMBOL(md_check_ordering);
6362+MD_EXPORT_SYMBOL(md_interrupt_thread);
6363+MD_EXPORT_SYMBOL(mddev_map);
6364+
6365+#ifdef CONFIG_PROC_FS
6366+static struct proc_dir_entry proc_md = {
6367+ PROC_MD, 6, "mdstat",
6368+ S_IFREG | S_IRUGO, 1, 0, 0,
6369+ 0, &proc_array_inode_operations,
6370+};
6371+#endif
6372+
6373+static void md_geninit (struct gendisk *gdisk)
6374+{
6375+ int i;
6376+
6377+ for (i = 0; i < MAX_MD_DEVS; i++) {
6378+ md_blocksizes[i] = 1024;
6379+ md_maxreadahead[i] = MD_READAHEAD;
6380+ md_gendisk.part[i].start_sect = -1; /* avoid partition check */
6381+ md_gendisk.part[i].nr_sects = 0;
6382+ }
6383+
6384+ printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
6385+
6386+ blksize_size[MD_MAJOR] = md_blocksizes;
6387+ md_set_global_readahead(md_maxreadahead);
6388+
6389+#ifdef CONFIG_PROC_FS
6390+ proc_register(&proc_root, &proc_md);
6391+#endif
6392+}
6393+
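The MD_EXPORT_SYMBOL block above defines the interface a personality
module builds against. As a sketch only -- not part of the patch, with
foo/FOO as placeholder names and the field order taken from the
raid0_personality initializer further below -- a minimal out-of-tree
personality would plug in like this:

static mdk_personality_t foo_personality =
{
	"foo",
	foo_map,		/* map a block onto a component device */
	NULL,			/* no special make_request */
	NULL,			/* no special end_request */
	foo_run,		/* bring the array up */
	foo_stop,		/* tear the array down */
	foo_status,		/* contribute a /proc/mdstat line */
	NULL,			/* no ioctls */
	0,
	NULL,			/* no error_handler */
	NULL,			/* no diskop */
	NULL,			/* no stop resync */
	NULL			/* no restart resync */
};

int init_module (void)
{
	return (register_md_personality (FOO, &foo_personality));
}

void cleanup_module (void)
{
	unregister_md_personality (FOO);
}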
6394diff -ruN linux.orig/drivers/block/raid0.c linux-2.2.16/drivers/block/raid0.c
6395--- linux.orig/drivers/block/raid0.c Tue Jan 4 19:12:14 2000
6396+++ linux-2.2.16/drivers/block/raid0.c Fri Jun 9 11:37:45 2000
6397@@ -1,4 +1,3 @@
6398-
6399 /*
6400 raid0.c : Multiple Devices driver for Linux
6401 Copyright (C) 1994-96 Marc ZYNGIER
6402@@ -18,146 +17,201 @@
6403 */
6404
6405 #include <linux/module.h>
6406-#include <linux/md.h>
6407-#include <linux/raid0.h>
6408-#include <linux/vmalloc.h>
6409+#include <linux/raid/raid0.h>
6410
6411 #define MAJOR_NR MD_MAJOR
6412 #define MD_DRIVER
6413 #define MD_PERSONALITY
6414
6415-static int create_strip_zones (int minor, struct md_dev *mddev)
6416+static int create_strip_zones (mddev_t *mddev)
6417 {
6418- int i, j, c=0;
6419- int current_offset=0;
6420- struct real_dev *smallest_by_zone;
6421- struct raid0_data *data=(struct raid0_data *) mddev->private;
6422-
6423- data->nr_strip_zones=1;
6424-
6425- for (i=1; i<mddev->nb_dev; i++)
6426- {
6427- for (j=0; j<i; j++)
6428- if (mddev->devices[i].size==mddev->devices[j].size)
6429- {
6430- c=1;
6431- break;
6432- }
6433-
6434- if (!c)
6435- data->nr_strip_zones++;
6436-
6437- c=0;
6438- }
6439-
6440- if ((data->strip_zone=vmalloc(sizeof(struct strip_zone)*data->nr_strip_zones)) == NULL)
6441- return 1;
6442-
6443- data->smallest=NULL;
6444-
6445- for (i=0; i<data->nr_strip_zones; i++)
6446- {
6447- data->strip_zone[i].dev_offset=current_offset;
6448- smallest_by_zone=NULL;
6449- c=0;
6450-
6451- for (j=0; j<mddev->nb_dev; j++)
6452- if (mddev->devices[j].size>current_offset)
6453- {
6454- data->strip_zone[i].dev[c++]=mddev->devices+j;
6455- if (!smallest_by_zone ||
6456- smallest_by_zone->size > mddev->devices[j].size)
6457- smallest_by_zone=mddev->devices+j;
6458- }
6459-
6460- data->strip_zone[i].nb_dev=c;
6461- data->strip_zone[i].size=(smallest_by_zone->size-current_offset)*c;
6462-
6463- if (!data->smallest ||
6464- data->smallest->size > data->strip_zone[i].size)
6465- data->smallest=data->strip_zone+i;
6466-
6467- data->strip_zone[i].zone_offset=i ? (data->strip_zone[i-1].zone_offset+
6468- data->strip_zone[i-1].size) : 0;
6469- current_offset=smallest_by_zone->size;
6470- }
6471- return 0;
6472+ int i, c, j, j1, j2;
6473+ int current_offset, curr_zone_offset;
6474+ raid0_conf_t *conf = mddev_to_conf(mddev);
6475+ mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
6476+
6477+ /*
6478+ * The number of 'same size groups'
6479+ */
6480+ conf->nr_strip_zones = 0;
6481+
6482+ ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
6483+ printk("raid0: looking at %s\n", partition_name(rdev1->dev));
6484+ c = 0;
6485+ ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
6486+ printk("raid0: comparing %s(%d) with %s(%d)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size);
6487+ if (rdev2 == rdev1) {
6488+ printk("raid0: END\n");
6489+ break;
6490+ }
6491+ if (rdev2->size == rdev1->size)
6492+ {
6493+ /*
6494+ * Not unique, don't count it as a new
6495+ * group
6496+ */
6497+ printk("raid0: EQUAL\n");
6498+ c = 1;
6499+ break;
6500+ }
6501+ printk("raid0: NOT EQUAL\n");
6502+ }
6503+ if (!c) {
6504+ printk("raid0: ==> UNIQUE\n");
6505+ conf->nr_strip_zones++;
6506+ printk("raid0: %d zones\n", conf->nr_strip_zones);
6507+ }
6508+ }
6509+ printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
6510+
6511+ conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
6512+ conf->nr_strip_zones);
6513+ if (!conf->strip_zone)
6514+ return 1;
6515+
6516+
6517+ conf->smallest = NULL;
6518+ current_offset = 0;
6519+ curr_zone_offset = 0;
6520+
6521+ for (i = 0; i < conf->nr_strip_zones; i++)
6522+ {
6523+ struct strip_zone *zone = conf->strip_zone + i;
6524+
6525+ printk("zone %d\n", i);
6526+ zone->dev_offset = current_offset;
6527+ smallest = NULL;
6528+ c = 0;
6529+
6530+ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
6531+
6532+ printk(" checking %s ...", partition_name(rdev->dev));
6533+ if (rdev->size > current_offset)
6534+ {
6535+ printk(" contained as device %d\n", c);
6536+ zone->dev[c] = rdev;
6537+ c++;
6538+ if (!smallest || (rdev->size < smallest->size)) {
6539+ smallest = rdev;
6540+ printk(" (%d) is smallest!\n", rdev->size);
6541+ }
6542+ } else
6543+ printk(" nope.\n");
6544+ }
6545+
6546+ zone->nb_dev = c;
6547+ zone->size = (smallest->size - current_offset) * c;
6548+ printk(" zone->nb_dev: %d, size: %d\n",zone->nb_dev,zone->size);
6549+
6550+ if (!conf->smallest || (zone->size < conf->smallest->size))
6551+ conf->smallest = zone;
6552+
6553+ zone->zone_offset = curr_zone_offset;
6554+ curr_zone_offset += zone->size;
6555+
6556+ current_offset = smallest->size;
6557+ printk("current zone offset: %d\n", current_offset);
6558+ }
6559+ printk("done.\n");
6560+ return 0;
6561 }
6562
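The zone bookkeeping above becomes clearer with concrete numbers. A
stand-alone sketch -- not patch code, sizes invented -- for three
components of 100, 200 and 300 blocks, assuming unique ascending sizes
so that every device ends exactly one zone:

#include <stdio.h>

int main (void)
{
	int size[3] = { 100, 200, 300 };	/* per-rdev sizes, in blocks */
	int nb_dev = 3;
	int current_offset = 0, curr_zone_offset = 0;
	int i;

	for (i = 0; i < nb_dev; i++) {
		int c = nb_dev - i;	/* devices still larger than current_offset */
		int zsize = (size[i] - current_offset) * c;

		printf("zone %d: nb_dev %d, dev_offset %d, zone_offset %d, size %d\n",
			i, c, current_offset, curr_zone_offset, zsize);
		curr_zone_offset += zsize;
		current_offset = size[i];
	}
	return 0;
}

This prints zone 0 with 3 devices and 300 blocks, zone 1 with 2 devices
and 200 blocks, and zone 2 as the 100-block tail of the largest disk --
exactly the layout create_strip_zones() computes.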
6563-static int raid0_run (int minor, struct md_dev *mddev)
6564+static int raid0_run (mddev_t *mddev)
6565 {
6566- int cur=0, i=0, size, zone0_size, nb_zone;
6567- struct raid0_data *data;
6568-
6569- MOD_INC_USE_COUNT;
6570+ int cur=0, i=0, size, zone0_size, nb_zone;
6571+ raid0_conf_t *conf;
6572
6573- if ((mddev->private=vmalloc (sizeof (struct raid0_data))) == NULL) return 1;
6574- data=(struct raid0_data *) mddev->private;
6575-
6576- if (create_strip_zones (minor, mddev))
6577- {
6578- vfree(data);
6579- return 1;
6580- }
6581-
6582- nb_zone=data->nr_zones=
6583- md_size[minor]/data->smallest->size +
6584- (md_size[minor]%data->smallest->size ? 1 : 0);
6585-
6586- printk ("raid0 : Allocating %ld bytes for hash.\n",(long)sizeof(struct raid0_hash)*nb_zone);
6587- if ((data->hash_table=vmalloc (sizeof (struct raid0_hash)*nb_zone)) == NULL)
6588- {
6589- vfree(data->strip_zone);
6590- vfree(data);
6591- return 1;
6592- }
6593- size=data->strip_zone[cur].size;
6594-
6595- i=0;
6596- while (cur<data->nr_strip_zones)
6597- {
6598- data->hash_table[i].zone0=data->strip_zone+cur;
6599-
6600- if (size>=data->smallest->size)/* If we completely fill the slot */
6601- {
6602- data->hash_table[i++].zone1=NULL;
6603- size-=data->smallest->size;
6604-
6605- if (!size)
6606- {
6607- if (++cur==data->nr_strip_zones) continue;
6608- size=data->strip_zone[cur].size;
6609- }
6610-
6611- continue;
6612- }
6613-
6614- if (++cur==data->nr_strip_zones) /* Last dev, set unit1 as NULL */
6615- {
6616- data->hash_table[i].zone1=NULL;
6617- continue;
6618- }
6619-
6620- zone0_size=size; /* Here, we use a 2nd dev to fill the slot */
6621- size=data->strip_zone[cur].size;
6622- data->hash_table[i++].zone1=data->strip_zone+cur;
6623- size-=(data->smallest->size - zone0_size);
6624- }
6625+ MOD_INC_USE_COUNT;
6626
6627- return (0);
6628+ conf = vmalloc(sizeof (raid0_conf_t));
6629+ if (!conf)
6630+ goto out;
6631+ mddev->private = (void *)conf;
6632+
6633+ if (md_check_ordering(mddev)) {
6634+ printk("raid0: disks are not ordered, aborting!\n");
6635+ goto out_free_conf;
6636+ }
6637+
6638+ if (create_strip_zones (mddev))
6639+ goto out_free_conf;
6640+
6641+ printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]);
6642+ printk("raid0 : conf->smallest->size is %d blocks.\n", conf->smallest->size);
6643+ nb_zone = md_size[mdidx(mddev)]/conf->smallest->size +
6644+ (md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
6645+ printk("raid0 : nb_zone is %d.\n", nb_zone);
6646+ conf->nr_zones = nb_zone;
6647+
6648+ printk("raid0 : Allocating %d bytes for hash.\n",
6649+ (int)(sizeof(struct raid0_hash)*nb_zone));
6650+
6651+ conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
6652+ if (!conf->hash_table)
6653+ goto out_free_zone_conf;
6654+ size = conf->strip_zone[cur].size;
6655+
6656+ i = 0;
6657+ while (cur < conf->nr_strip_zones) {
6658+ conf->hash_table[i].zone0 = conf->strip_zone + cur;
6659+
6660+ /*
6661+ * If we completely fill the slot
6662+ */
6663+ if (size >= conf->smallest->size) {
6664+ conf->hash_table[i++].zone1 = NULL;
6665+ size -= conf->smallest->size;
6666+
6667+ if (!size) {
6668+ if (++cur == conf->nr_strip_zones)
6669+ continue;
6670+ size = conf->strip_zone[cur].size;
6671+ }
6672+ continue;
6673+ }
6674+ if (++cur == conf->nr_strip_zones) {
6675+ /*
6676+ * Last dev, set unit1 as NULL
6677+ */
6678+ conf->hash_table[i].zone1 = NULL;
6679+ continue;
6680+ }
6681+
6682+ /*
6683+ * Here we use a 2nd dev to fill the slot
6684+ */
6685+ zone0_size = size;
6686+ size = conf->strip_zone[cur].size;
6687+ conf->hash_table[i++].zone1 = conf->strip_zone + cur;
6688+ size -= (conf->smallest->size - zone0_size);
6689+ }
6690+ return 0;
6691+
6692+out_free_zone_conf:
6693+ vfree(conf->strip_zone);
6694+ conf->strip_zone = NULL;
6695+
6696+out_free_conf:
6697+ vfree(conf);
6698+ mddev->private = NULL;
6699+out:
6700+ MOD_DEC_USE_COUNT;
6701+ return 1;
6702 }
6703
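raid0_run() above adopts the goto-unwind idiom for its error paths:
each failure label releases only what was already allocated, in reverse
order, before the module count is dropped. Schematically (a sketch with
placeholder sizes, not patch code):

static int example_run (void)
{
	void *conf, *table;

	conf = vmalloc(CONF_BYTES);		/* CONF_BYTES: placeholder */
	if (!conf)
		goto out;
	table = vmalloc(TABLE_BYTES);		/* TABLE_BYTES: placeholder */
	if (!table)
		goto out_free_conf;
	return 0;				/* success keeps both */

out_free_conf:
	vfree(conf);				/* undo in reverse order */
out:
	return 1;
}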
6704-
6705-static int raid0_stop (int minor, struct md_dev *mddev)
6706+static int raid0_stop (mddev_t *mddev)
6707 {
6708- struct raid0_data *data=(struct raid0_data *) mddev->private;
6709+ raid0_conf_t *conf = mddev_to_conf(mddev);
6710
6711- vfree (data->hash_table);
6712- vfree (data->strip_zone);
6713- vfree (data);
6714+ vfree (conf->hash_table);
6715+ conf->hash_table = NULL;
6716+ vfree (conf->strip_zone);
6717+ conf->strip_zone = NULL;
6718+ vfree (conf);
6719+ mddev->private = NULL;
6720
6721- MOD_DEC_USE_COUNT;
6722- return 0;
6723+ MOD_DEC_USE_COUNT;
6724+ return 0;
6725 }
6726
6727 /*
6728@@ -167,135 +221,140 @@
6729 * Of course, those facts may not be valid anymore (and surely won't...)
6730 * Hey guys, there's some work out there ;-)
6731 */
6732-static int raid0_map (struct md_dev *mddev, kdev_t *rdev,
6733+static int raid0_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
6734 unsigned long *rsector, unsigned long size)
6735 {
6736- struct raid0_data *data=(struct raid0_data *) mddev->private;
6737- static struct raid0_hash *hash;
6738- struct strip_zone *zone;
6739- struct real_dev *tmp_dev;
6740- int blk_in_chunk, factor, chunk, chunk_size;
6741- long block, rblock;
6742-
6743- factor=FACTOR(mddev);
6744- chunk_size=(1UL << FACTOR_SHIFT(factor));
6745- block=*rsector >> 1;
6746- hash=data->hash_table+(block/data->smallest->size);
6747-
6748- if (hash - data->hash_table > data->nr_zones)
6749- {
6750- printk(KERN_DEBUG "raid0_map: invalid block %ul\n", block);
6751- return -1;
6752- }
6753-
6754- /* Sanity check */
6755- if ((chunk_size*2)<(*rsector % (chunk_size*2))+size)
6756- {
6757- printk ("raid0_convert : can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
6758- return (-1);
6759- }
6760-
6761- if (block >= (hash->zone0->size +
6762- hash->zone0->zone_offset))
6763- {
6764- if (!hash->zone1)
6765- {
6766- printk ("raid0_convert : hash->zone1==NULL for block %ld\n", block);
6767- return (-1);
6768- }
6769-
6770- zone=hash->zone1;
6771- }
6772- else
6773- zone=hash->zone0;
6774+ raid0_conf_t *conf = mddev_to_conf(mddev);
6775+ struct raid0_hash *hash;
6776+ struct strip_zone *zone;
6777+ mdk_rdev_t *tmp_dev;
6778+ int blk_in_chunk, chunksize_bits, chunk, chunk_size;
6779+ long block, rblock;
6780+
6781+ chunk_size = mddev->param.chunk_size >> 10;
6782+ chunksize_bits = ffz(~chunk_size);
6783+ block = *rsector >> 1;
6784+ hash = conf->hash_table + block / conf->smallest->size;
6785+
6786+ if (hash - conf->hash_table > conf->nr_zones) {
6787+ printk(KERN_DEBUG "raid0_map: invalid block %ld\n", block);
6788+ return -1;
6789+ }
6790+
6791+ /* Sanity check */
6792+ if ((chunk_size * 2) < (*rsector % (chunk_size * 2)) + size)
6793+ goto bad_map;
6794+
6795+ if (!hash)
6796+ goto bad_hash;
6797+
6798+ if (!hash->zone0)
6799+ goto bad_zone0;
6800+
6801+ if (block >= (hash->zone0->size + hash->zone0->zone_offset)) {
6802+ if (!hash->zone1)
6803+ goto bad_zone1;
6804+ zone = hash->zone1;
6805+ } else
6806+ zone = hash->zone0;
6807
6808- blk_in_chunk=block & (chunk_size -1);
6809- chunk=(block - zone->zone_offset) / (zone->nb_dev<<FACTOR_SHIFT(factor));
6810- tmp_dev=zone->dev[(block >> FACTOR_SHIFT(factor)) % zone->nb_dev];
6811- rblock=(chunk << FACTOR_SHIFT(factor)) + blk_in_chunk + zone->dev_offset;
6812+ blk_in_chunk = block & (chunk_size -1);
6813+ chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
6814+ tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev];
6815+ rblock = (chunk << chunksize_bits) + blk_in_chunk + zone->dev_offset;
6816
6817- *rdev=tmp_dev->dev;
6818- *rsector=rblock<<1;
6819+ *rdev = tmp_dev->dev;
6820+ *rsector = rblock << 1;
6821
6822- return (0);
6823+ return 0;
6824+
6825+bad_map:
6826+ printk ("raid0_map bug: can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
6827+ return -1;
6828+bad_hash:
6829+ printk("raid0_map bug: hash==NULL for block %ld\n", block);
6830+ return -1;
6831+bad_zone0:
6832+ printk ("raid0_map bug: hash->zone0==NULL for block %ld\n", block);
6833+ return -1;
6834+bad_zone1:
6835+ printk ("raid0_map bug: hash->zone1==NULL for block %ld\n", block);
6836+ return -1;
6837 }
6838
6839
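The chunk arithmetic in raid0_map() is dense; restated as a stand-alone
sketch, assuming power-of-two chunk sizes and using GCC's
__builtin_ctz() in place of the kernel's ffz(~x):

#include <stdio.h>

static void map_block (long block, int chunk_size, int nb_dev,
			long zone_offset, long dev_offset)
{
	int bits = __builtin_ctz(chunk_size);	/* == ffz(~chunk_size) */
	long blk_in_chunk = block & (chunk_size - 1);
	long chunk = (block - zone_offset) / (nb_dev << bits);
	int idx = (block >> bits) % nb_dev;	/* device within the zone */
	long rblock = (chunk << bits) + blk_in_chunk + dev_offset;

	printf("block %ld -> device %d, block %ld\n", block, idx, rblock);
}

int main (void)
{
	map_block(37, 32, 3, 0, 0);	/* 32k chunks, 3-disk zone at the origin:
					   prints "block 37 -> device 1, block 5" */
	return 0;
}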
6840-static int raid0_status (char *page, int minor, struct md_dev *mddev)
6841+static int raid0_status (char *page, mddev_t *mddev)
6842 {
6843- int sz=0;
6844+ int sz = 0;
6845 #undef MD_DEBUG
6846 #ifdef MD_DEBUG
6847- int j, k;
6848- struct raid0_data *data=(struct raid0_data *) mddev->private;
6849+ int j, k;
6850+ raid0_conf_t *conf = mddev_to_conf(mddev);
6851
6852- sz+=sprintf (page+sz, " ");
6853- for (j=0; j<data->nr_zones; j++)
6854- {
6855- sz+=sprintf (page+sz, "[z%d",
6856- data->hash_table[j].zone0-data->strip_zone);
6857- if (data->hash_table[j].zone1)
6858- sz+=sprintf (page+sz, "/z%d] ",
6859- data->hash_table[j].zone1-data->strip_zone);
6860- else
6861- sz+=sprintf (page+sz, "] ");
6862- }
6863+ sz += sprintf(page + sz, " ");
6864+ for (j = 0; j < conf->nr_zones; j++) {
6865+ sz += sprintf(page + sz, "[z%d",
6866+ conf->hash_table[j].zone0 - conf->strip_zone);
6867+ if (conf->hash_table[j].zone1)
6868+ sz += sprintf(page+sz, "/z%d] ",
6869+ conf->hash_table[j].zone1 - conf->strip_zone);
6870+ else
6871+ sz += sprintf(page+sz, "] ");
6872+ }
6873
6874- sz+=sprintf (page+sz, "\n");
6875+ sz += sprintf(page + sz, "\n");
6876
6877- for (j=0; j<data->nr_strip_zones; j++)
6878- {
6879- sz+=sprintf (page+sz, " z%d=[", j);
6880- for (k=0; k<data->strip_zone[j].nb_dev; k++)
6881- sz+=sprintf (page+sz, "%s/",
6882- partition_name(data->strip_zone[j].dev[k]->dev));
6883- sz--;
6884- sz+=sprintf (page+sz, "] zo=%d do=%d s=%d\n",
6885- data->strip_zone[j].zone_offset,
6886- data->strip_zone[j].dev_offset,
6887- data->strip_zone[j].size);
6888- }
6889+ for (j = 0; j < conf->nr_strip_zones; j++) {
6890+ sz += sprintf(page + sz, " z%d=[", j);
6891+ for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
6892+ sz += sprintf (page+sz, "%s/", partition_name(
6893+ conf->strip_zone[j].dev[k]->dev));
6894+ sz--;
6895+ sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
6896+ conf->strip_zone[j].zone_offset,
6897+ conf->strip_zone[j].dev_offset,
6898+ conf->strip_zone[j].size);
6899+ }
6900 #endif
6901- sz+=sprintf (page+sz, " %dk chunks", 1<<FACTOR_SHIFT(FACTOR(mddev)));
6902- return sz;
6903+ sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
6904+ return sz;
6905 }
6906
6907-
6908-static struct md_personality raid0_personality=
6909+static mdk_personality_t raid0_personality=
6910 {
6911- "raid0",
6912- raid0_map,
6913- NULL, /* no special make_request */
6914- NULL, /* no special end_request */
6915- raid0_run,
6916- raid0_stop,
6917- raid0_status,
6918- NULL, /* no ioctls */
6919- 0,
6920- NULL, /* no error_handler */
6921- NULL, /* hot_add_disk */
6922- NULL, /* hot_remove_disk */
6923- NULL /* mark_spare */
6924+ "raid0",
6925+ raid0_map,
6926+ NULL, /* no special make_request */
6927+ NULL, /* no special end_request */
6928+ raid0_run,
6929+ raid0_stop,
6930+ raid0_status,
6931+ NULL, /* no ioctls */
6932+ 0,
6933+ NULL, /* no error_handler */
6934+ NULL, /* no diskop */
6935+ NULL, /* no stop resync */
6936+ NULL /* no restart resync */
6937 };
6938
6939-
6940 #ifndef MODULE
6941
6942 void raid0_init (void)
6943 {
6944- register_md_personality (RAID0, &raid0_personality);
6945+ register_md_personality (RAID0, &raid0_personality);
6946 }
6947
6948 #else
6949
6950 int init_module (void)
6951 {
6952- return (register_md_personality (RAID0, &raid0_personality));
6953+ return (register_md_personality (RAID0, &raid0_personality));
6954 }
6955
6956 void cleanup_module (void)
6957 {
6958- unregister_md_personality (RAID0);
6959+ unregister_md_personality (RAID0);
6960 }
6961
6962 #endif
6963+
6964diff -ruN linux.orig/drivers/block/raid1.c linux-2.2.16/drivers/block/raid1.c
6965--- linux.orig/drivers/block/raid1.c Thu May 4 02:16:33 2000
6966+++ linux-2.2.16/drivers/block/raid1.c Fri Jun 9 11:37:45 2000
6967@@ -1,6 +1,6 @@
6968-/************************************************************************
6969+/*
6970 * raid1.c : Multiple Devices driver for Linux
6971- * Copyright (C) 1996 Ingo Molnar, Miguel de Icaza, Gadi Oxman
6972+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
6973 *
6974 * RAID-1 management functions.
6975 *
6976@@ -15,50 +15,52 @@
6977 */
6978
6979 #include <linux/module.h>
6980-#include <linux/locks.h>
6981 #include <linux/malloc.h>
6982-#include <linux/md.h>
6983-#include <linux/raid1.h>
6984-#include <asm/bitops.h>
6985+#include <linux/raid/raid1.h>
6986 #include <asm/atomic.h>
6987
6988 #define MAJOR_NR MD_MAJOR
6989 #define MD_DRIVER
6990 #define MD_PERSONALITY
6991
6992-/*
6993- * The following can be used to debug the driver
6994- */
6995-/*#define RAID1_DEBUG*/
6996-#ifdef RAID1_DEBUG
6997-#define PRINTK(x) do { printk x; } while (0);
6998-#else
6999-#define PRINTK(x) do { ; } while (0);
7000-#endif
7001+#define MAX_LINEAR_SECTORS 128
7002
7003 #define MAX(a,b) ((a) > (b) ? (a) : (b))
7004 #define MIN(a,b) ((a) < (b) ? (a) : (b))
7005
7006-static struct md_personality raid1_personality;
7007-static struct md_thread *raid1_thread = NULL;
7008+static mdk_personality_t raid1_personality;
7009 struct buffer_head *raid1_retry_list = NULL;
7010
7011-static int __raid1_map (struct md_dev *mddev, kdev_t *rdev,
7012+static void * raid1_kmalloc (int size)
7013+{
7014+ void * ptr;
7015+ /*
7016+ * now we are fault tolerant rather than nice, but
7017+ * there are a couple of places in the RAID code where we
7018+ * simply cannot afford to fail an allocation because
7019+ * there is no failure return path (e.g. make_request())
7020+ */
7021+ while (!(ptr = kmalloc (size, GFP_KERNEL)))
7022+ printk ("raid1: out of memory, retrying...\n");
7023+
7024+ memset(ptr, 0, size);
7025+ return ptr;
7026+}
7027+
7028+static int __raid1_map (mddev_t *mddev, kdev_t *rdev,
7029 unsigned long *rsector, unsigned long size)
7030 {
7031- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
7032- int i, n = raid_conf->raid_disks;
7033+ raid1_conf_t *conf = mddev_to_conf(mddev);
7034+ int i, disks = MD_SB_DISKS;
7035
7036 /*
7037 * Later we do read balancing on the read side
7038 * now we use the first available disk.
7039 */
7040
7041- PRINTK(("raid1_map().\n"));
7042-
7043- for (i=0; i<n; i++) {
7044- if (raid_conf->mirrors[i].operational) {
7045- *rdev = raid_conf->mirrors[i].dev;
7046+ for (i = 0; i < disks; i++) {
7047+ if (conf->mirrors[i].operational) {
7048+ *rdev = conf->mirrors[i].dev;
7049 return (0);
7050 }
7051 }
7052@@ -67,29 +69,29 @@
7053 return (-1);
7054 }
7055
7056-static int raid1_map (struct md_dev *mddev, kdev_t *rdev,
7057+static int raid1_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
7058 unsigned long *rsector, unsigned long size)
7059 {
7060 return 0;
7061 }
7062
7063-void raid1_reschedule_retry (struct buffer_head *bh)
7064+static void raid1_reschedule_retry (struct buffer_head *bh)
7065 {
7066 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
7067-
7068- PRINTK(("raid1_reschedule_retry().\n"));
7069+ mddev_t *mddev = r1_bh->mddev;
7070+ raid1_conf_t *conf = mddev_to_conf(mddev);
7071
7072 r1_bh->next_retry = raid1_retry_list;
7073 raid1_retry_list = bh;
7074- md_wakeup_thread(raid1_thread);
7075+ md_wakeup_thread(conf->thread);
7076 }
7077
7078 /*
7079- * raid1_end_buffer_io() is called when we have finished servicing a mirrored
7080+ * raid1_end_bh_io() is called when we have finished servicing a mirrored
7081 * operation and are ready to return a success/failure code to the buffer
7082 * cache layer.
7083 */
7084-static inline void raid1_end_buffer_io(struct raid1_bh *r1_bh, int uptodate)
7085+static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
7086 {
7087 struct buffer_head *bh = r1_bh->master_bh;
7088
7089@@ -97,8 +99,6 @@
7090 kfree(r1_bh);
7091 }
7092
7093-int raid1_one_error=0;
7094-
7095 void raid1_end_request (struct buffer_head *bh, int uptodate)
7096 {
7097 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
7098@@ -106,12 +106,7 @@
7099
7100 save_flags(flags);
7101 cli();
7102- PRINTK(("raid1_end_request().\n"));
7103
7104- if (raid1_one_error) {
7105- raid1_one_error=0;
7106- uptodate=0;
7107- }
7108 /*
7109 * this branch is our 'one mirror IO has finished' event handler:
7110 */
7111@@ -136,15 +131,11 @@
7112 */
7113
7114 if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
7115-
7116- PRINTK(("raid1_end_request(), read branch.\n"));
7117-
7118 /*
7119 * we have only one buffer_head on the read side
7120 */
7121 if (uptodate) {
7122- PRINTK(("raid1_end_request(), read branch, uptodate.\n"));
7123- raid1_end_buffer_io(r1_bh, uptodate);
7124+ raid1_end_bh_io(r1_bh, uptodate);
7125 restore_flags(flags);
7126 return;
7127 }
7128@@ -152,71 +143,56 @@
7129 * oops, read error:
7130 */
7131 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
7132- kdevname(bh->b_dev), bh->b_blocknr);
7133- raid1_reschedule_retry (bh);
7134+ partition_name(bh->b_dev), bh->b_blocknr);
7135+ raid1_reschedule_retry(bh);
7136 restore_flags(flags);
7137 return;
7138 }
7139
7140 /*
7141- * WRITE or WRITEA.
7142- */
7143- PRINTK(("raid1_end_request(), write branch.\n"));
7144-
7145- /*
7146+ * WRITE:
7147+ *
7148 * Let's see if all mirrored write operations have finished
7149- * already [we have irqs off, so we can decrease]:
7150+ * already.
7151 */
7152
7153- if (!--r1_bh->remaining) {
7154- struct md_dev *mddev = r1_bh->mddev;
7155- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
7156- int i, n = raid_conf->raid_disks;
7157+ if (atomic_dec_and_test(&r1_bh->remaining)) {
7158+ int i, disks = MD_SB_DISKS;
7159
7160- PRINTK(("raid1_end_request(), remaining == 0.\n"));
7161+ for ( i = 0; i < disks; i++)
7162+ if (r1_bh->mirror_bh[i])
7163+ kfree(r1_bh->mirror_bh[i]);
7164
7165- for ( i=0; i<n; i++)
7166- if (r1_bh->mirror_bh[i]) kfree(r1_bh->mirror_bh[i]);
7167-
7168- raid1_end_buffer_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
7169+ raid1_end_bh_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
7170 }
7171- else PRINTK(("raid1_end_request(), remaining == %u.\n", r1_bh->remaining));
7172 restore_flags(flags);
7173 }
7174
7175-/* This routine checks if the undelying device is an md device and in that
7176- * case it maps the blocks before putting the request on the queue
7177+/*
7178+ * This routine checks if the underlying device is an md device
7179+ * and in that case it maps the blocks before putting the
7180+ * request on the queue
7181 */
7182-static inline void
7183-map_and_make_request (int rw, struct buffer_head *bh)
7184+static void map_and_make_request (int rw, struct buffer_head *bh)
7185 {
7186 if (MAJOR (bh->b_rdev) == MD_MAJOR)
7187- md_map (MINOR (bh->b_rdev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
7188+ md_map (bh->b_rdev, &bh->b_rdev,
7189+ &bh->b_rsector, bh->b_size >> 9);
7190 clear_bit(BH_Lock, &bh->b_state);
7191 make_request (MAJOR (bh->b_rdev), rw, bh);
7192 }
7193
7194-static int
7195-raid1_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
7196+static int raid1_make_request (mddev_t *mddev, int rw,
7197+ struct buffer_head * bh)
7198 {
7199-
7200- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
7201+ raid1_conf_t *conf = mddev_to_conf(mddev);
7202 struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req;
7203 struct raid1_bh * r1_bh;
7204- int n = raid_conf->raid_disks, i, sum_bhs = 0, switch_disks = 0, sectors;
7205+ int disks = MD_SB_DISKS;
7206+ int i, sum_bhs = 0, switch_disks = 0, sectors, lowprio = 0;
7207 struct mirror_info *mirror;
7208
7209- PRINTK(("raid1_make_request().\n"));
7210-
7211- while (!( /* FIXME: now we are rather fault tolerant than nice */
7212- r1_bh = kmalloc (sizeof (struct raid1_bh), GFP_KERNEL)
7213- ) )
7214- {
7215- printk ("raid1_make_request(#1): out of memory\n");
7216- current->policy |= SCHED_YIELD;
7217- schedule();
7218- }
7219- memset (r1_bh, 0, sizeof (struct raid1_bh));
7220+ r1_bh = raid1_kmalloc (sizeof (struct raid1_bh));
7221
7222 /*
7223 * make_request() can abort the operation when READA or WRITEA are being
7224@@ -227,43 +203,65 @@
7225 if (rw == READA) rw = READ;
7226 if (rw == WRITEA) rw = WRITE;
7227
7228- if (rw == WRITE || rw == WRITEA)
7229- mark_buffer_clean(bh); /* Too early ? */
7230+ if (rw == WRITE) {
7231+ /*
7232+ * Too early ?
7233+ */
7234+ mark_buffer_clean(bh);
7235+ /*
7236+ * Not too early. We _first_ clean the bh, then we start
7237+ * the IO, then when the IO has finished, we unlock the
7238+ * bh and mark it uptodate. This way we do not miss the
7239+ * case when the bh got dirty again during the IO.
7240+ */
7241+ }
7242+
7243+ /*
7244+ * special flag for 'lowprio' reconstruction requests ...
7245+ */
7246+ if (buffer_lowprio(bh))
7247+ lowprio = 1;
7248
7249 /*
7250- * i think the read and write branch should be separated completely, since we want
7251- * to do read balancing on the read side for example. Comments? :) --mingo
7252+ * i think the read and write branch should be separated completely,
7253+ * since we want to do read balancing on the read side for example.
7254+ * Comments? :) --mingo
7255 */
7256
7257 r1_bh->master_bh=bh;
7258 r1_bh->mddev=mddev;
7259 r1_bh->cmd = rw;
7260
7261- if (rw==READ || rw==READA) {
7262- int last_used = raid_conf->last_used;
7263- PRINTK(("raid1_make_request(), read branch.\n"));
7264- mirror = raid_conf->mirrors + last_used;
7265+ if (rw==READ) {
7266+ int last_used = conf->last_used;
7267+
7268+ /*
7269+ * read balancing logic:
7270+ */
7271+ mirror = conf->mirrors + last_used;
7272 bh->b_rdev = mirror->dev;
7273 sectors = bh->b_size >> 9;
7274- if (bh->b_blocknr * sectors == raid_conf->next_sect) {
7275- raid_conf->sect_count += sectors;
7276- if (raid_conf->sect_count >= mirror->sect_limit)
7277+
7278+ if (bh->b_blocknr * sectors == conf->next_sect) {
7279+ conf->sect_count += sectors;
7280+ if (conf->sect_count >= mirror->sect_limit)
7281 switch_disks = 1;
7282 } else
7283 switch_disks = 1;
7284- raid_conf->next_sect = (bh->b_blocknr + 1) * sectors;
7285- if (switch_disks) {
7286- PRINTK(("read-balancing: switching %d -> %d (%d sectors)\n", last_used, mirror->next, raid_conf->sect_count));
7287- raid_conf->sect_count = 0;
7288- last_used = raid_conf->last_used = mirror->next;
7289+ conf->next_sect = (bh->b_blocknr + 1) * sectors;
7290+ /*
7291+ * Do not switch disks if full resync is in progress ...
7292+ */
7293+ if (switch_disks && !conf->resync_mirrors) {
7294+ conf->sect_count = 0;
7295+ last_used = conf->last_used = mirror->next;
7296 /*
7297- * Do not switch to write-only disks ... resyncing
7298- * is in progress
7299+ * Do not switch to write-only disks ...
7300+ * reconstruction is in progress
7301 */
7302- while (raid_conf->mirrors[last_used].write_only)
7303- raid_conf->last_used = raid_conf->mirrors[last_used].next;
7304+ while (conf->mirrors[last_used].write_only)
7305+ conf->last_used = conf->mirrors[last_used].next;
7306 }
7307- PRINTK (("raid1 read queue: %d %d\n", MAJOR (bh->b_rdev), MINOR (bh->b_rdev)));
7308 bh_req = &r1_bh->bh_req;
7309 memcpy(bh_req, bh, sizeof(*bh));
7310 bh_req->b_end_io = raid1_end_request;
7311@@ -273,13 +271,12 @@
7312 }
7313
7314 /*
7315- * WRITE or WRITEA.
7316+ * WRITE:
7317 */
7318- PRINTK(("raid1_make_request(n=%d), write branch.\n",n));
7319
7320- for (i = 0; i < n; i++) {
7321+ for (i = 0; i < disks; i++) {
7322
7323- if (!raid_conf->mirrors [i].operational) {
7324+ if (!conf->mirrors[i].operational) {
7325 /*
7326 * the r1_bh->mirror_bh[i] pointer remains NULL
7327 */
7328@@ -287,89 +284,91 @@
7329 continue;
7330 }
7331
7332+ /*
7333+ * special case for reconstruction ...
7334+ */
7335+ if (lowprio && (i == conf->last_used)) {
7336+ mirror_bh[i] = NULL;
7337+ continue;
7338+ }
7339+
7340+ /*
7341+ * We should use a private pool (size depending on NR_REQUEST),
7342+ * to avoid writes filling up the memory with bhs
7343+ *
7344+ * Such pools are much faster than kmalloc anyways (so we waste
7345+ * Such pools are much faster than kmalloc anyway (so we waste
7346+ * almost nothing by not using the master bh when writing and
7347+ * win a lot of cleanness) but for now we are cool enough. --mingo
7348+ *
7349+ * It's safe to sleep here; buffer heads cannot be used in a shared
7350+ * beginning of this function to grok the difference ;)
7351+ */
7352+ mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head));
7353+ /*
7354+ * prepare mirrored bh (fields ordered for max mem throughput):
7355+ */
7356+ mirror_bh[i]->b_blocknr = bh->b_blocknr;
7357+ mirror_bh[i]->b_dev = bh->b_dev;
7358+ mirror_bh[i]->b_rdev = conf->mirrors[i].dev;
7359+ mirror_bh[i]->b_rsector = bh->b_rsector;
7360+ mirror_bh[i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
7361+ if (lowprio)
7362+ mirror_bh[i]->b_state |= (1<<BH_LowPrio);
7363+
7364+ mirror_bh[i]->b_count = 1;
7365+ mirror_bh[i]->b_size = bh->b_size;
7366+ mirror_bh[i]->b_data = bh->b_data;
7367+ mirror_bh[i]->b_list = BUF_LOCKED;
7368+ mirror_bh[i]->b_end_io = raid1_end_request;
7369+ mirror_bh[i]->b_dev_id = r1_bh;
7370+
7371+ r1_bh->mirror_bh[i] = mirror_bh[i];
7372+ sum_bhs++;
7373+ }
7374+
7375+ md_atomic_set(&r1_bh->remaining, sum_bhs);
7376+
7377 /*
7378- * We should use a private pool (size depending on NR_REQUEST),
7379- * to avoid writes filling up the memory with bhs
7380- *
7381- * Such pools are much faster than kmalloc anyways (so we waste almost
7382- * nothing by not using the master bh when writing and win alot of cleanness)
7383- *
7384- * but for now we are cool enough. --mingo
7385- *
7386- * It's safe to sleep here, buffer heads cannot be used in a shared
7387- * manner in the write branch. Look how we lock the buffer at the beginning
7388- * of this function to grok the difference ;)
7389- */
7390- while (!( /* FIXME: now we are rather fault tolerant than nice */
7391- mirror_bh[i] = kmalloc (sizeof (struct buffer_head), GFP_KERNEL)
7392- ) )
7393- {
7394- printk ("raid1_make_request(#2): out of memory\n");
7395- current->policy |= SCHED_YIELD;
7396- schedule();
7397- }
7398- memset (mirror_bh[i], 0, sizeof (struct buffer_head));
7399-
7400- /*
7401- * prepare mirrored bh (fields ordered for max mem throughput):
7402- */
7403- mirror_bh [i]->b_blocknr = bh->b_blocknr;
7404- mirror_bh [i]->b_dev = bh->b_dev;
7405- mirror_bh [i]->b_rdev = raid_conf->mirrors [i].dev;
7406- mirror_bh [i]->b_rsector = bh->b_rsector;
7407- mirror_bh [i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
7408- mirror_bh [i]->b_count = 1;
7409- mirror_bh [i]->b_size = bh->b_size;
7410- mirror_bh [i]->b_data = bh->b_data;
7411- mirror_bh [i]->b_list = BUF_LOCKED;
7412- mirror_bh [i]->b_end_io = raid1_end_request;
7413- mirror_bh [i]->b_dev_id = r1_bh;
7414-
7415- r1_bh->mirror_bh[i] = mirror_bh[i];
7416- sum_bhs++;
7417- }
7418-
7419- r1_bh->remaining = sum_bhs;
7420-
7421- PRINTK(("raid1_make_request(), write branch, sum_bhs=%d.\n",sum_bhs));
7422-
7423- /*
7424- * We have to be a bit careful about the semaphore above, thats why we
7425- * start the requests separately. Since kmalloc() could fail, sleep and
7426- * make_request() can sleep too, this is the safer solution. Imagine,
7427- * end_request decreasing the semaphore before we could have set it up ...
7428- * We could play tricks with the semaphore (presetting it and correcting
7429- * at the end if sum_bhs is not 'n' but we have to do end_request by hand
7430- * if all requests finish until we had a chance to set up the semaphore
7431- * correctly ... lots of races).
7432- */
7433- for (i = 0; i < n; i++)
7434- if (mirror_bh [i] != NULL)
7435- map_and_make_request (rw, mirror_bh [i]);
7436+ * We have to be a bit careful about the semaphore above, that's
7437+ * why we start the requests separately. Since kmalloc() could
7438+ * fail and sleep, and make_request() can sleep too, this is the
7439+ * safer solution. Imagine end_request decreasing the semaphore
7440+ * before we could have set it up ... We could play tricks with
7441+ * the semaphore (presetting it and correcting at the end if
7442+ * sum_bhs is not 'n', but we have to do end_request by hand if
7443+ * all requests finish before we had a chance to set up the
7444+ * semaphore correctly ... lots of races).
7445+ */
7446+ for (i = 0; i < disks; i++)
7447+ if (mirror_bh[i])
7448+ map_and_make_request(rw, mirror_bh[i]);
7449
7450 return (0);
7451 }
7452
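The read-balancing heuristic above -- stay on one mirror while reads
are sequential, rotate after sect_limit sectors or on any seek --
reduces to the following stand-alone sketch (the struct is invented to
collect the conf/mirror fields actually used):

struct balance {
	long next_sect;		/* sector expected if reads stay sequential */
	int sect_count;		/* sectors served on the current mirror */
	int sect_limit;		/* per-mirror cap, MAX_LINEAR_SECTORS here */
	int last_used;		/* mirror currently read from */
	int next[MD_SB_DISKS];	/* the drive ring: next readable mirror */
};

static int pick_mirror (struct balance *b, long blocknr, int sectors)
{
	int switch_disks = 0;

	if (blocknr * sectors == b->next_sect) {
		b->sect_count += sectors;
		if (b->sect_count >= b->sect_limit)
			switch_disks = 1;	/* used this mirror long enough */
	} else
		switch_disks = 1;		/* seek: rotate at once */

	b->next_sect = (blocknr + 1) * sectors;
	if (switch_disks) {
		b->sect_count = 0;
		b->last_used = b->next[b->last_used];
	}
	return b->last_used;
}

The driver additionally refuses to rotate while a full resync is
running, and skips write-only (reconstruction target) mirrors when
walking the ring.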
7453-static int raid1_status (char *page, int minor, struct md_dev *mddev)
7454+static int raid1_status (char *page, mddev_t *mddev)
7455 {
7456- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
7457+ raid1_conf_t *conf = mddev_to_conf(mddev);
7458 int sz = 0, i;
7459
7460- sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
7461- for (i = 0; i < raid_conf->raid_disks; i++)
7462- sz += sprintf (page+sz, "%s", raid_conf->mirrors [i].operational ? "U" : "_");
7463+ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
7464+ conf->working_disks);
7465+ for (i = 0; i < conf->raid_disks; i++)
7466+ sz += sprintf (page+sz, "%s",
7467+ conf->mirrors[i].operational ? "U" : "_");
7468 sz += sprintf (page+sz, "]");
7469 return sz;
7470 }
7471
7472-static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index)
7473+static void unlink_disk (raid1_conf_t *conf, int target)
7474 {
7475- int disks = raid_conf->raid_disks;
7476- int j;
7477+ int disks = MD_SB_DISKS;
7478+ int i;
7479
7480- for (j = 0; j < disks; j++)
7481- if (raid_conf->mirrors [j].next == failed_index)
7482- raid_conf->mirrors [j].next = raid_conf->mirrors [failed_index].next;
7483+ for (i = 0; i < disks; i++)
7484+ if (conf->mirrors[i].next == target)
7485+ conf->mirrors[i].next = conf->mirrors[target].next;
7486 }
7487
7488 #define LAST_DISK KERN_ALERT \
7489@@ -388,48 +387,53 @@
7490 #define ALREADY_SYNCING KERN_INFO \
7491 "raid1: syncing already in progress.\n"
7492
7493-static int raid1_error (struct md_dev *mddev, kdev_t dev)
7494+static void mark_disk_bad (mddev_t *mddev, int failed)
7495 {
7496- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
7497- struct mirror_info *mirror;
7498- md_superblock_t *sb = mddev->sb;
7499- int disks = raid_conf->raid_disks;
7500- int i;
7501+ raid1_conf_t *conf = mddev_to_conf(mddev);
7502+ struct mirror_info *mirror = conf->mirrors+failed;
7503+ mdp_super_t *sb = mddev->sb;
7504+
7505+ mirror->operational = 0;
7506+ unlink_disk(conf, failed);
7507+ mark_disk_faulty(sb->disks+mirror->number);
7508+ mark_disk_nonsync(sb->disks+mirror->number);
7509+ mark_disk_inactive(sb->disks+mirror->number);
7510+ sb->active_disks--;
7511+ sb->working_disks--;
7512+ sb->failed_disks++;
7513+ mddev->sb_dirty = 1;
7514+ md_wakeup_thread(conf->thread);
7515+ conf->working_disks--;
7516+ printk (DISK_FAILED, partition_name (mirror->dev),
7517+ conf->working_disks);
7518+}
7519
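The mark_disk_*() helpers used above are not shown in this hunk; judging
from the open-coded bit operations they replace in raid1_error() below,
they are presumably one-line wrappers of this shape (the MD_DISK_* flag
names are assumptions, not confirmed by this patch excerpt):

static inline void mark_disk_faulty (mdp_disk_t *d)
{
	d->state |= (1 << MD_DISK_FAULTY);	/* assumed flag name */
}

static inline void mark_disk_nonsync (mdp_disk_t *d)
{
	d->state &= ~(1 << MD_DISK_SYNC);	/* assumed flag name */
}

static inline void mark_disk_inactive (mdp_disk_t *d)
{
	d->state &= ~(1 << MD_DISK_ACTIVE);	/* assumed flag name */
}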
7520- PRINTK(("raid1_error called\n"));
7521+static int raid1_error (mddev_t *mddev, kdev_t dev)
7522+{
7523+ raid1_conf_t *conf = mddev_to_conf(mddev);
7524+ struct mirror_info * mirrors = conf->mirrors;
7525+ int disks = MD_SB_DISKS;
7526+ int i;
7527
7528- if (raid_conf->working_disks == 1) {
7529+ if (conf->working_disks == 1) {
7530 /*
7531 * Uh oh, we can do nothing if this is our last disk, but
7532 * first check if this is a queued request for a device
7533 * which has just failed.
7534 */
7535- for (i = 0, mirror = raid_conf->mirrors; i < disks;
7536- i++, mirror++)
7537- if (mirror->dev == dev && !mirror->operational)
7538+ for (i = 0; i < disks; i++) {
7539+ if (mirrors[i].dev==dev && !mirrors[i].operational)
7540 return 0;
7541+ }
7542 printk (LAST_DISK);
7543 } else {
7544- /* Mark disk as unusable */
7545- for (i = 0, mirror = raid_conf->mirrors; i < disks;
7546- i++, mirror++) {
7547- if (mirror->dev == dev && mirror->operational){
7548- mirror->operational = 0;
7549- raid1_fix_links (raid_conf, i);
7550- sb->disks[mirror->number].state |=
7551- (1 << MD_FAULTY_DEVICE);
7552- sb->disks[mirror->number].state &=
7553- ~(1 << MD_SYNC_DEVICE);
7554- sb->disks[mirror->number].state &=
7555- ~(1 << MD_ACTIVE_DEVICE);
7556- sb->active_disks--;
7557- sb->working_disks--;
7558- sb->failed_disks++;
7559- mddev->sb_dirty = 1;
7560- md_wakeup_thread(raid1_thread);
7561- raid_conf->working_disks--;
7562- printk (DISK_FAILED, kdevname (dev),
7563- raid_conf->working_disks);
7564+ /*
7565+ * Mark disk as unusable
7566+ */
7567+ for (i = 0; i < disks; i++) {
7568+ if (mirrors[i].dev==dev && mirrors[i].operational) {
7569+ mark_disk_bad (mddev, i);
7570+ break;
7571 }
7572 }
7573 }
7574@@ -442,219 +446,396 @@
7575 #undef START_SYNCING
7576
7577 /*
7578- * This is the personality-specific hot-addition routine
7579+ * Insert the spare disk into the drive-ring
7580 */
7581+static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror)
7582+{
7583+ int j, next;
7584+ int disks = MD_SB_DISKS;
7585+ struct mirror_info *p = conf->mirrors;
7586+
7587+ for (j = 0; j < disks; j++, p++)
7588+ if (p->operational && !p->write_only) {
7589+ next = p->next;
7590+ p->next = mirror->raid_disk;
7591+ mirror->next = next;
7592+ return;
7593+ }
7594
7595-#define NO_SUPERBLOCK KERN_ERR \
7596-"raid1: cannot hot-add disk to the array with no RAID superblock\n"
7597+ printk("raid1: bug: no read-operational devices\n");
7598+}
7599
7600-#define WRONG_LEVEL KERN_ERR \
7601-"raid1: hot-add: level of disk is not RAID-1\n"
7602+static void print_raid1_conf (raid1_conf_t *conf)
7603+{
7604+ int i;
7605+ struct mirror_info *tmp;
7606
7607-#define HOT_ADD_SUCCEEDED KERN_INFO \
7608-"raid1: device %s hot-added\n"
7609+ printk("RAID1 conf printout:\n");
7610+ if (!conf) {
7611+ printk("(conf==NULL)\n");
7612+ return;
7613+ }
7614+ printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
7615+ conf->raid_disks, conf->nr_disks);
7616
7617-static int raid1_hot_add_disk (struct md_dev *mddev, kdev_t dev)
7618+ for (i = 0; i < MD_SB_DISKS; i++) {
7619+ tmp = conf->mirrors + i;
7620+ printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
7621+ i, tmp->spare,tmp->operational,
7622+ tmp->number,tmp->raid_disk,tmp->used_slot,
7623+ partition_name(tmp->dev));
7624+ }
7625+}
7626+
7627+static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
7628 {
7629+ int err = 0;
7630+ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
7631+ raid1_conf_t *conf = mddev->private;
7632+ struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
7633 unsigned long flags;
7634- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
7635- struct mirror_info *mirror;
7636- md_superblock_t *sb = mddev->sb;
7637- struct real_dev * realdev;
7638- int n;
7639+ mdp_super_t *sb = mddev->sb;
7640+ mdp_disk_t *failed_desc, *spare_desc, *added_desc;
7641+
7642+ save_flags(flags);
7643+ cli();
7644
7645+ print_raid1_conf(conf);
7646 /*
7647- * The device has its superblock already read and it was found
7648- * to be consistent for generic RAID usage. Now we check whether
7649- * it's usable for RAID-1 hot addition.
7650+ * find the disk ...
7651 */
7652+ switch (state) {
7653
7654- n = mddev->nb_dev++;
7655- realdev = &mddev->devices[n];
7656- if (!realdev->sb) {
7657- printk (NO_SUPERBLOCK);
7658- return -EINVAL;
7659- }
7660- if (realdev->sb->level != 1) {
7661- printk (WRONG_LEVEL);
7662- return -EINVAL;
7663+ case DISKOP_SPARE_ACTIVE:
7664+
7665+ /*
7666+ * Find the failed disk within the RAID1 configuration ...
7667+ * (this can only be in the first conf->working_disks part)
7668+ */
7669+ for (i = 0; i < conf->raid_disks; i++) {
7670+ tmp = conf->mirrors + i;
7671+ if ((!tmp->operational && !tmp->spare) ||
7672+ !tmp->used_slot) {
7673+ failed_disk = i;
7674+ break;
7675+ }
7676+ }
7677+ /*
7678+ * When we activate a spare disk we _must_ have a disk in
7679+ * the lower (active) part of the array to replace.
7680+ */
7681+ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
7682+ MD_BUG();
7683+ err = 1;
7684+ goto abort;
7685+ }
7686+ /* fall through */
7687+
7688+ case DISKOP_SPARE_WRITE:
7689+ case DISKOP_SPARE_INACTIVE:
7690+
7691+ /*
7692+ * Find the spare disk ... (can only be in the 'high'
7693+ * area of the array)
7694+ */
7695+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
7696+ tmp = conf->mirrors + i;
7697+ if (tmp->spare && tmp->number == (*d)->number) {
7698+ spare_disk = i;
7699+ break;
7700+ }
7701+ }
7702+ if (spare_disk == -1) {
7703+ MD_BUG();
7704+ err = 1;
7705+ goto abort;
7706+ }
7707+ break;
7708+
7709+ case DISKOP_HOT_REMOVE_DISK:
7710+
7711+ for (i = 0; i < MD_SB_DISKS; i++) {
7712+ tmp = conf->mirrors + i;
7713+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
7714+ if (tmp->operational) {
7715+ err = -EBUSY;
7716+ goto abort;
7717+ }
7718+ removed_disk = i;
7719+ break;
7720+ }
7721+ }
7722+ if (removed_disk == -1) {
7723+ MD_BUG();
7724+ err = 1;
7725+ goto abort;
7726+ }
7727+ break;
7728+
7729+ case DISKOP_HOT_ADD_DISK:
7730+
7731+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
7732+ tmp = conf->mirrors + i;
7733+ if (!tmp->used_slot) {
7734+ added_disk = i;
7735+ break;
7736+ }
7737+ }
7738+ if (added_disk == -1) {
7739+ MD_BUG();
7740+ err = 1;
7741+ goto abort;
7742+ }
7743+ break;
7744 }
7745- /* FIXME: are there other things left we could sanity-check? */
7746
7747+ switch (state) {
7748 /*
7749- * We have to disable interrupts, as our RAID-1 state is used
7750- * from irq handlers as well.
7751+ * Switch the spare disk to write-only mode:
7752 */
7753- save_flags(flags);
7754- cli();
7755+ case DISKOP_SPARE_WRITE:
7756+ sdisk = conf->mirrors + spare_disk;
7757+ sdisk->operational = 1;
7758+ sdisk->write_only = 1;
7759+ break;
7760+ /*
7761+ * Deactivate a spare disk:
7762+ */
7763+ case DISKOP_SPARE_INACTIVE:
7764+ sdisk = conf->mirrors + spare_disk;
7765+ sdisk->operational = 0;
7766+ sdisk->write_only = 0;
7767+ break;
7768+ /*
7769+ * Activate (mark read-write) the (now sync) spare disk,
7770+ * which means we switch its 'raid position' (->raid_disk)
7771+ * with the failed disk. (only the first 'conf->nr_disks'
7772+ * slots are used for 'real' disks and we must preserve this
7773+ * property)
7774+ */
7775+ case DISKOP_SPARE_ACTIVE:
7776
7777- raid_conf->raid_disks++;
7778- mirror = raid_conf->mirrors+n;
7779+ sdisk = conf->mirrors + spare_disk;
7780+ fdisk = conf->mirrors + failed_disk;
7781
7782- mirror->number=n;
7783- mirror->raid_disk=n;
7784- mirror->dev=dev;
7785- mirror->next=0; /* FIXME */
7786- mirror->sect_limit=128;
7787-
7788- mirror->operational=0;
7789- mirror->spare=1;
7790- mirror->write_only=0;
7791-
7792- sb->disks[n].state |= (1 << MD_FAULTY_DEVICE);
7793- sb->disks[n].state &= ~(1 << MD_SYNC_DEVICE);
7794- sb->disks[n].state &= ~(1 << MD_ACTIVE_DEVICE);
7795- sb->nr_disks++;
7796- sb->spare_disks++;
7797+ spare_desc = &sb->disks[sdisk->number];
7798+ failed_desc = &sb->disks[fdisk->number];
7799
7800- restore_flags(flags);
7801+ if (spare_desc != *d) {
7802+ MD_BUG();
7803+ err = 1;
7804+ goto abort;
7805+ }
7806
7807- md_update_sb(MINOR(dev));
7808+ if (spare_desc->raid_disk != sdisk->raid_disk) {
7809+ MD_BUG();
7810+ err = 1;
7811+ goto abort;
7812+ }
7813+
7814+ if (sdisk->raid_disk != spare_disk) {
7815+ MD_BUG();
7816+ err = 1;
7817+ goto abort;
7818+ }
7819
7820- printk (HOT_ADD_SUCCEEDED, kdevname(realdev->dev));
7821+ if (failed_desc->raid_disk != fdisk->raid_disk) {
7822+ MD_BUG();
7823+ err = 1;
7824+ goto abort;
7825+ }
7826
7827- return 0;
7828-}
7829+ if (fdisk->raid_disk != failed_disk) {
7830+ MD_BUG();
7831+ err = 1;
7832+ goto abort;
7833+ }
7834
7835-#undef NO_SUPERBLOCK
7836-#undef WRONG_LEVEL
7837-#undef HOT_ADD_SUCCEEDED
7838+ /*
7839+ * do the switch finally
7840+ */
7841+ xchg_values(*spare_desc, *failed_desc);
7842+ xchg_values(*fdisk, *sdisk);
7843
7844-/*
7845- * Insert the spare disk into the drive-ring
7846- */
7847-static void add_ring(struct raid1_data *raid_conf, struct mirror_info *mirror)
7848-{
7849- int j, next;
7850- struct mirror_info *p = raid_conf->mirrors;
7851+ /*
7852+ * (careful, 'failed' and 'spare' are switched from now on)
7853+ *
7854+ * we want to preserve linear numbering and we want to
7855+ * give the proper raid_disk number to the now activated
7856+ * disk. (this means we switch back these values)
7857+ */
7858+
7859+ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
7860+ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
7861+ xchg_values(spare_desc->number, failed_desc->number);
7862+ xchg_values(sdisk->number, fdisk->number);
7863
7864- for (j = 0; j < raid_conf->raid_disks; j++, p++)
7865- if (p->operational && !p->write_only) {
7866- next = p->next;
7867- p->next = mirror->raid_disk;
7868- mirror->next = next;
7869- return;
7870- }
7871- printk("raid1: bug: no read-operational devices\n");
7872-}
7873+ *d = failed_desc;
7874
7875-static int raid1_mark_spare(struct md_dev *mddev, md_descriptor_t *spare,
7876- int state)
7877-{
7878- int i = 0, failed_disk = -1;
7879- struct raid1_data *raid_conf = mddev->private;
7880- struct mirror_info *mirror = raid_conf->mirrors;
7881- md_descriptor_t *descriptor;
7882- unsigned long flags;
7883+ if (sdisk->dev == MKDEV(0,0))
7884+ sdisk->used_slot = 0;
7885+ /*
7886+ * this really activates the spare.
7887+ */
7888+ fdisk->spare = 0;
7889+ fdisk->write_only = 0;
7890+ link_disk(conf, fdisk);
7891
7892- for (i = 0; i < MD_SB_DISKS; i++, mirror++) {
7893- if (mirror->spare && mirror->number == spare->number)
7894- goto found;
7895- }
7896- return 1;
7897-found:
7898- for (i = 0, mirror = raid_conf->mirrors; i < raid_conf->raid_disks;
7899- i++, mirror++)
7900- if (!mirror->operational)
7901- failed_disk = i;
7902+ /*
7903+ * if we activate a spare, we definitely replace a
7904+ * non-operational disk slot in the 'low' area of
7905+ * the disk array.
7906+ */
7907
7908- save_flags(flags);
7909- cli();
7910- switch (state) {
7911- case SPARE_WRITE:
7912- mirror->operational = 1;
7913- mirror->write_only = 1;
7914- raid_conf->raid_disks = MAX(raid_conf->raid_disks,
7915- mirror->raid_disk + 1);
7916- break;
7917- case SPARE_INACTIVE:
7918- mirror->operational = 0;
7919- mirror->write_only = 0;
7920- break;
7921- case SPARE_ACTIVE:
7922- mirror->spare = 0;
7923- mirror->write_only = 0;
7924- raid_conf->working_disks++;
7925- add_ring(raid_conf, mirror);
7926-
7927- if (failed_disk != -1) {
7928- descriptor = &mddev->sb->disks[raid_conf->mirrors[failed_disk].number];
7929- i = spare->raid_disk;
7930- spare->raid_disk = descriptor->raid_disk;
7931- descriptor->raid_disk = i;
7932- }
7933- break;
7934- default:
7935- printk("raid1_mark_spare: bug: state == %d\n", state);
7936- restore_flags(flags);
7937- return 1;
7938+ conf->working_disks++;
7939+
7940+ break;
7941+
7942+ case DISKOP_HOT_REMOVE_DISK:
7943+ rdisk = conf->mirrors + removed_disk;
7944+
7945+ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
7946+ MD_BUG();
7947+ err = 1;
7948+ goto abort;
7949+ }
7950+ rdisk->dev = MKDEV(0,0);
7951+ rdisk->used_slot = 0;
7952+ conf->nr_disks--;
7953+ break;
7954+
7955+ case DISKOP_HOT_ADD_DISK:
7956+ adisk = conf->mirrors + added_disk;
7957+ added_desc = *d;
7958+
7959+ if (added_disk != added_desc->number) {
7960+ MD_BUG();
7961+ err = 1;
7962+ goto abort;
7963+ }
7964+
7965+ adisk->number = added_desc->number;
7966+ adisk->raid_disk = added_desc->raid_disk;
7967+ adisk->dev = MKDEV(added_desc->major,added_desc->minor);
7968+
7969+ adisk->operational = 0;
7970+ adisk->write_only = 0;
7971+ adisk->spare = 1;
7972+ adisk->used_slot = 1;
7973+ conf->nr_disks++;
7974+
7975+ break;
7976+
7977+ default:
7978+ MD_BUG();
7979+ err = 1;
7980+ goto abort;
7981 }
7982+abort:
7983 restore_flags(flags);
7984- return 0;
7985+ print_raid1_conf(conf);
7986+ return err;
7987 }
7988
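xchg_values() is defined elsewhere in the patch; from its use in the
DISKOP_SPARE_ACTIVE branch it can only be a plain value swap, presumably
something like this (an assumption, relying on the GCC typeof
extension):

#define xchg_values(x, y) do { \
	__typeof__(x) __tmp = (x); \
	(x) = (y); \
	(y) = __tmp; \
} while (0)

The branch swaps the two whole descriptors first and then swaps the
identity fields (number, raid_disk) back, so only the role of the two
slots changes while the linear numbering of the array is preserved.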
7989+
7990+#define IO_ERROR KERN_ALERT \
7991+"raid1: %s: unrecoverable I/O read error for block %lu\n"
7992+
7993+#define REDIRECT_SECTOR KERN_ERR \
7994+"raid1: %s: redirecting sector %lu to another mirror\n"
7995+
7996 /*
7997 * This is a kernel thread which:
7998 *
7999 * 1. Retries failed read operations on working mirrors.
8000 * 2. Updates the raid superblock when problems encounter.
8001 */
8002-void raid1d (void *data)
8003+static void raid1d (void *data)
8004 {
8005 struct buffer_head *bh;
8006 kdev_t dev;
8007 unsigned long flags;
8008- struct raid1_bh * r1_bh;
8009- struct md_dev *mddev;
8010+ struct raid1_bh *r1_bh;
8011+ mddev_t *mddev;
8012
8013- PRINTK(("raid1d() active\n"));
8014- save_flags(flags);
8015- cli();
8016 while (raid1_retry_list) {
8017+ save_flags(flags);
8018+ cli();
8019 bh = raid1_retry_list;
8020 r1_bh = (struct raid1_bh *)(bh->b_dev_id);
8021 raid1_retry_list = r1_bh->next_retry;
8022 restore_flags(flags);
8023
8024- mddev = md_dev + MINOR(bh->b_dev);
8025+ mddev = kdev_to_mddev(bh->b_dev);
8026 if (mddev->sb_dirty) {
8027- printk("dirty sb detected, updating.\n");
8028+ printk(KERN_INFO "dirty sb detected, updating.\n");
8029 mddev->sb_dirty = 0;
8030- md_update_sb(MINOR(bh->b_dev));
8031+ md_update_sb(mddev);
8032 }
8033 dev = bh->b_rdev;
8034- __raid1_map (md_dev + MINOR(bh->b_dev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
8035+ __raid1_map (mddev, &bh->b_rdev, &bh->b_rsector,
8036+ bh->b_size >> 9);
8037 if (bh->b_rdev == dev) {
8038- printk (KERN_ALERT
8039- "raid1: %s: unrecoverable I/O read error for block %lu\n",
8040- kdevname(bh->b_dev), bh->b_blocknr);
8041- raid1_end_buffer_io(r1_bh, 0);
8042+ printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
8043+ raid1_end_bh_io(r1_bh, 0);
8044 } else {
8045- printk (KERN_ERR "raid1: %s: redirecting sector %lu to another mirror\n",
8046- kdevname(bh->b_dev), bh->b_blocknr);
8047+ printk (REDIRECT_SECTOR,
8048+ partition_name(bh->b_dev), bh->b_blocknr);
8049 map_and_make_request (r1_bh->cmd, bh);
8050 }
8051- cli();
8052 }
8053- restore_flags(flags);
8054+}
8055+#undef IO_ERROR
8056+#undef REDIRECT_SECTOR
8057+
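raid1_reschedule_retry() and raid1d() above share a bare LIFO threaded
through b_dev_id/next_retry, with cli() guarding both ends. The
discipline, reduced to a sketch with invented types (save_flags/cli as
in the 2.2-era driver):

struct retry_item {
	struct retry_item *next_retry;
};

static struct retry_item *retry_list = NULL;

static void push_retry (struct retry_item *it)	/* interrupt context */
{
	unsigned long flags;

	save_flags(flags);
	cli();
	it->next_retry = retry_list;
	retry_list = it;
	restore_flags(flags);
}

static struct retry_item *pop_retry (void)	/* raid1d thread context */
{
	struct retry_item *it;
	unsigned long flags;

	save_flags(flags);
	cli();
	it = retry_list;
	if (it)
		retry_list = it->next_retry;
	restore_flags(flags);
	return it;
}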
8058+/*
8059+ * Private kernel thread to reconstruct mirrors after an unclean
8060+ * shutdown.
8061+ */
8062+static void raid1syncd (void *data)
8063+{
8064+ raid1_conf_t *conf = data;
8065+ mddev_t *mddev = conf->mddev;
8066+
8067+ if (!conf->resync_mirrors)
8068+ return;
8069+ if (conf->resync_mirrors == 2)
8070+ return;
8071+ down(&mddev->recovery_sem);
8072+ if (md_do_sync(mddev, NULL)) {
8073+ up(&mddev->recovery_sem);
8074+ return;
8075+ }
8076+ /*
8077+ * Only if everything went Ok.
8078+ */
8079+ conf->resync_mirrors = 0;
8080+ up(&mddev->recovery_sem);
8081 }
8082
8083+
8084 /*
8085 * This will catch the scenario in which one of the mirrors was
8086 * mounted as a normal device rather than as a part of a raid set.
8087+ *
8088+ * check_consistency is very personality-dependent, e.g. RAID5 cannot
8089+ * do this check, it uses another method.
8090 */
8091-static int __check_consistency (struct md_dev *mddev, int row)
8092+static int __check_consistency (mddev_t *mddev, int row)
8093 {
8094- struct raid1_data *raid_conf = mddev->private;
8095+ raid1_conf_t *conf = mddev_to_conf(mddev);
8096+ int disks = MD_SB_DISKS;
8097 kdev_t dev;
8098 struct buffer_head *bh = NULL;
8099 int i, rc = 0;
8100 char *buffer = NULL;
8101
8102- for (i = 0; i < raid_conf->raid_disks; i++) {
8103- if (!raid_conf->mirrors[i].operational)
8104+ for (i = 0; i < disks; i++) {
8105+ printk("(checking disk %d)\n", i);
8106+ if (!conf->mirrors[i].operational)
8107 continue;
8108- dev = raid_conf->mirrors[i].dev;
8109+ printk("(really checking disk %d)\n", i);
8110+ dev = conf->mirrors[i].dev;
8111 set_blocksize(dev, 4096);
8112 if ((bh = bread(dev, row / 4, 4096)) == NULL)
8113 break;
8114@@ -683,167 +864,342 @@
8115 return rc;
8116 }
8117
8118-static int check_consistency (struct md_dev *mddev)
8119+static int check_consistency (mddev_t *mddev)
8120 {
8121- int size = mddev->sb->size;
8122- int row;
8123+ if (__check_consistency(mddev, 0))
8124+/*
8125+ * we do not do this currently, as it's perfectly possible to
8126+ * have an inconsistent array when it's freshly created. Only
8127+ * newly written data has to be consistent.
8128+ */
8129+ return 0;
8130
8131- for (row = 0; row < size; row += size / 8)
8132- if (__check_consistency(mddev, row))
8133- return 1;
8134 return 0;
8135 }
8136
8137-static int raid1_run (int minor, struct md_dev *mddev)
8138+#define INVALID_LEVEL KERN_WARNING \
8139+"raid1: md%d: raid level not set to mirroring (%d)\n"
8140+
8141+#define NO_SB KERN_ERR \
8142+"raid1: disabled mirror %s (couldn't access raid superblock)\n"
8143+
8144+#define ERRORS KERN_ERR \
8145+"raid1: disabled mirror %s (errors detected)\n"
8146+
8147+#define NOT_IN_SYNC KERN_ERR \
8148+"raid1: disabled mirror %s (not in sync)\n"
8149+
8150+#define INCONSISTENT KERN_ERR \
8151+"raid1: disabled mirror %s (inconsistent descriptor)\n"
8152+
8153+#define ALREADY_RUNNING KERN_ERR \
8154+"raid1: disabled mirror %s (mirror %d already operational)\n"
8155+
8156+#define OPERATIONAL KERN_INFO \
8157+"raid1: device %s operational as mirror %d\n"
8158+
8159+#define MEM_ERROR KERN_ERR \
8160+"raid1: couldn't allocate memory for md%d\n"
8161+
8162+#define SPARE KERN_INFO \
8163+"raid1: spare disk %s\n"
8164+
8165+#define NONE_OPERATIONAL KERN_ERR \
8166+"raid1: no operational mirrors for md%d\n"
8167+
8168+#define RUNNING_CKRAID KERN_ERR \
8169+"raid1: detected mirror differences -- running resync\n"
8170+
8171+#define ARRAY_IS_ACTIVE KERN_INFO \
8172+"raid1: raid set md%d active with %d out of %d mirrors\n"
8173+
8174+#define THREAD_ERROR KERN_ERR \
8175+"raid1: couldn't allocate thread for md%d\n"
8176+
8177+#define START_RESYNC KERN_WARNING \
8178+"raid1: raid set md%d not clean; reconstructing mirrors\n"
8179+
8180+static int raid1_run (mddev_t *mddev)
8181 {
8182- struct raid1_data *raid_conf;
8183- int i, j, raid_disk;
8184- md_superblock_t *sb = mddev->sb;
8185- md_descriptor_t *descriptor;
8186- struct real_dev *realdev;
8187+ raid1_conf_t *conf;
8188+ int i, j, disk_idx;
8189+ struct mirror_info *disk;
8190+ mdp_super_t *sb = mddev->sb;
8191+ mdp_disk_t *descriptor;
8192+ mdk_rdev_t *rdev;
8193+ struct md_list_head *tmp;
8194+ int start_recovery = 0;
8195
8196 MOD_INC_USE_COUNT;
8197
8198 if (sb->level != 1) {
8199- printk("raid1: %s: raid level not set to mirroring (%d)\n",
8200- kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
8201- MOD_DEC_USE_COUNT;
8202- return -EIO;
8203- }
8204- /****
8205- * copy the now verified devices into our private RAID1 bookkeeping
8206- * area. [whatever we allocate in raid1_run(), should be freed in
8207- * raid1_stop()]
8208+ printk(INVALID_LEVEL, mdidx(mddev), sb->level);
8209+ goto out;
8210+ }
8211+ /*
8212+ * copy the already verified devices into our private RAID1
8213+ * bookkeeping area. [whatever we allocate in raid1_run(),
8214+ * should be freed in raid1_stop()]
8215 */
8216
8217- while (!( /* FIXME: now we are rather fault tolerant than nice */
8218- mddev->private = kmalloc (sizeof (struct raid1_data), GFP_KERNEL)
8219- ) )
8220- {
8221- printk ("raid1_run(): out of memory\n");
8222- current->policy |= SCHED_YIELD;
8223- schedule();
8224- }
8225- raid_conf = mddev->private;
8226- memset(raid_conf, 0, sizeof(*raid_conf));
8227-
8228- PRINTK(("raid1_run(%d) called.\n", minor));
8229-
8230- for (i = 0; i < mddev->nb_dev; i++) {
8231- realdev = &mddev->devices[i];
8232- if (!realdev->sb) {
8233- printk(KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
8234+ conf = raid1_kmalloc(sizeof(raid1_conf_t));
8235+ mddev->private = conf;
8236+ if (!conf) {
8237+ printk(MEM_ERROR, mdidx(mddev));
8238+ goto out;
8239+ }
8240+
8241+ ITERATE_RDEV(mddev,rdev,tmp) {
8242+ if (rdev->faulty) {
8243+ printk(ERRORS, partition_name(rdev->dev));
8244+ } else {
8245+ if (!rdev->sb) {
8246+ MD_BUG();
8247+ continue;
8248+ }
8249+ }
8250+ if (rdev->desc_nr == -1) {
8251+ MD_BUG();
8252 continue;
8253 }
8254-
8255- /*
8256- * This is important -- we are using the descriptor on
8257- * the disk only to get a pointer to the descriptor on
8258- * the main superblock, which might be more recent.
8259- */
8260- descriptor = &sb->disks[realdev->sb->descriptor.number];
8261- if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
8262- printk(KERN_ERR "raid1: disabled mirror %s (errors detected)\n", kdevname(realdev->dev));
8263+ descriptor = &sb->disks[rdev->desc_nr];
8264+ disk_idx = descriptor->raid_disk;
8265+ disk = conf->mirrors + disk_idx;
8266+
8267+ if (disk_faulty(descriptor)) {
8268+ disk->number = descriptor->number;
8269+ disk->raid_disk = disk_idx;
8270+ disk->dev = rdev->dev;
8271+ disk->sect_limit = MAX_LINEAR_SECTORS;
8272+ disk->operational = 0;
8273+ disk->write_only = 0;
8274+ disk->spare = 0;
8275+ disk->used_slot = 1;
8276 continue;
8277 }
8278- if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
8279- if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
8280- printk(KERN_ERR "raid1: disabled mirror %s (not in sync)\n", kdevname(realdev->dev));
8281+ if (disk_active(descriptor)) {
8282+ if (!disk_sync(descriptor)) {
8283+ printk(NOT_IN_SYNC,
8284+ partition_name(rdev->dev));
8285 continue;
8286 }
8287- raid_disk = descriptor->raid_disk;
8288- if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
8289- printk(KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n", kdevname(realdev->dev));
8290+ if ((descriptor->number > MD_SB_DISKS) ||
8291+ (disk_idx > sb->raid_disks)) {
8292+
8293+ printk(INCONSISTENT,
8294+ partition_name(rdev->dev));
8295 continue;
8296 }
8297- if (raid_conf->mirrors[raid_disk].operational) {
8298- printk(KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n", kdevname(realdev->dev), raid_disk);
8299+ if (disk->operational) {
8300+ printk(ALREADY_RUNNING,
8301+ partition_name(rdev->dev),
8302+ disk_idx);
8303 continue;
8304 }
8305- printk(KERN_INFO "raid1: device %s operational as mirror %d\n", kdevname(realdev->dev), raid_disk);
8306- raid_conf->mirrors[raid_disk].number = descriptor->number;
8307- raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
8308- raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
8309- raid_conf->mirrors[raid_disk].operational = 1;
8310- raid_conf->mirrors[raid_disk].sect_limit = 128;
8311- raid_conf->working_disks++;
8312+ printk(OPERATIONAL, partition_name(rdev->dev),
8313+ disk_idx);
8314+ disk->number = descriptor->number;
8315+ disk->raid_disk = disk_idx;
8316+ disk->dev = rdev->dev;
8317+ disk->sect_limit = MAX_LINEAR_SECTORS;
8318+ disk->operational = 1;
8319+ disk->write_only = 0;
8320+ disk->spare = 0;
8321+ disk->used_slot = 1;
8322+ conf->working_disks++;
8323 } else {
8324 /*
8325 * Must be a spare disk ..
8326 */
8327- printk(KERN_INFO "raid1: spare disk %s\n", kdevname(realdev->dev));
8328- raid_disk = descriptor->raid_disk;
8329- raid_conf->mirrors[raid_disk].number = descriptor->number;
8330- raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
8331- raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
8332- raid_conf->mirrors[raid_disk].sect_limit = 128;
8333-
8334- raid_conf->mirrors[raid_disk].operational = 0;
8335- raid_conf->mirrors[raid_disk].write_only = 0;
8336- raid_conf->mirrors[raid_disk].spare = 1;
8337- }
8338- }
8339- if (!raid_conf->working_disks) {
8340- printk(KERN_ERR "raid1: no operational mirrors for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
8341- kfree(raid_conf);
8342- mddev->private = NULL;
8343- MOD_DEC_USE_COUNT;
8344- return -EIO;
8345- }
8346-
8347- raid_conf->raid_disks = sb->raid_disks;
8348- raid_conf->mddev = mddev;
8349-
8350- for (j = 0; !raid_conf->mirrors[j].operational; j++);
8351- raid_conf->last_used = j;
8352- for (i = raid_conf->raid_disks - 1; i >= 0; i--) {
8353- if (raid_conf->mirrors[i].operational) {
8354- PRINTK(("raid_conf->mirrors[%d].next == %d\n", i, j));
8355- raid_conf->mirrors[i].next = j;
8356+ printk(SPARE, partition_name(rdev->dev));
8357+ disk->number = descriptor->number;
8358+ disk->raid_disk = disk_idx;
8359+ disk->dev = rdev->dev;
8360+ disk->sect_limit = MAX_LINEAR_SECTORS;
8361+ disk->operational = 0;
8362+ disk->write_only = 0;
8363+ disk->spare = 1;
8364+ disk->used_slot = 1;
8365+ }
8366+ }
8367+ if (!conf->working_disks) {
8368+ printk(NONE_OPERATIONAL, mdidx(mddev));
8369+ goto out_free_conf;
8370+ }
8371+
8372+ conf->raid_disks = sb->raid_disks;
8373+ conf->nr_disks = sb->nr_disks;
8374+ conf->mddev = mddev;
8375+
8376+ for (i = 0; i < MD_SB_DISKS; i++) {
8377+
8378+ descriptor = sb->disks+i;
8379+ disk_idx = descriptor->raid_disk;
8380+ disk = conf->mirrors + disk_idx;
8381+
8382+ if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
8383+ !disk->used_slot) {
8384+
8385+ disk->number = descriptor->number;
8386+ disk->raid_disk = disk_idx;
8387+ disk->dev = MKDEV(0,0);
8388+
8389+ disk->operational = 0;
8390+ disk->write_only = 0;
8391+ disk->spare = 0;
8392+ disk->used_slot = 1;
8393+ }
8394+ }
8395+
8396+ /*
8397+ * find the first working one and use it as a starting point
8398+	 * for read balancing.
8399+ */
8400+ for (j = 0; !conf->mirrors[j].operational; j++)
8401+ /* nothing */;
8402+ conf->last_used = j;
8403+
8404+ /*
8405+ * initialize the 'working disks' list.
8406+ */
8407+ for (i = conf->raid_disks - 1; i >= 0; i--) {
8408+ if (conf->mirrors[i].operational) {
8409+ conf->mirrors[i].next = j;
8410 j = i;
8411 }
8412 }
8413
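The two loops above first pick the lowest-numbered operational mirror as the read-balancing start point, then walk the slots backwards so that each operational mirror's 'next' field names the operational mirror after it, the last one wrapping around to the first. A standalone model of the ring construction (plain structs instead of mirror_info):

	#include <stdio.h>

	#define NDISKS 4

	struct mirror { int operational; int next; };

	int main(void)
	{
		struct mirror m[NDISKS] = { {1, 0}, {0, 0}, {1, 0}, {1, 0} };
		int i, j;

		for (j = 0; !m[j].operational; j++)
			;			/* first working mirror */
		for (i = NDISKS - 1; i >= 0; i--) {
			if (m[i].operational) {
				m[i].next = j;	/* link to the next one seen */
				j = i;
			}
		}
		for (i = 0; i < NDISKS; i++)
			if (m[i].operational)
				printf("mirror %d -> mirror %d\n", i, m[i].next);
		return 0;	/* prints the ring 0 -> 2 -> 3 -> 0 */
	}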
8414- if (check_consistency(mddev)) {
8415- printk(KERN_ERR "raid1: detected mirror differences -- run ckraid\n");
8416- sb->state |= 1 << MD_SB_ERRORS;
8417- kfree(raid_conf);
8418- mddev->private = NULL;
8419- MOD_DEC_USE_COUNT;
8420- return -EIO;
8421+ if (conf->working_disks != sb->raid_disks) {
8422+ printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
8423+ start_recovery = 1;
8424 }
8425
8426+ if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
8427+ /*
8428+ * we do sanity checks even if the device says
8429+ * it's clean ...
8430+ */
8431+ if (check_consistency(mddev)) {
8432+ printk(RUNNING_CKRAID);
8433+ sb->state &= ~(1 << MD_SB_CLEAN);
8434+ }
8435+ }
8436+
8437+ {
8438+ const char * name = "raid1d";
8439+
8440+ conf->thread = md_register_thread(raid1d, conf, name);
8441+ if (!conf->thread) {
8442+ printk(THREAD_ERROR, mdidx(mddev));
8443+ goto out_free_conf;
8444+ }
8445+ }
8446+
8447+ if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
8448+ const char * name = "raid1syncd";
8449+
8450+		conf->resync_thread = md_register_thread(raid1syncd, conf, name);
8451+ if (!conf->resync_thread) {
8452+ printk(THREAD_ERROR, mdidx(mddev));
8453+ goto out_free_conf;
8454+ }
8455+
8456+ printk(START_RESYNC, mdidx(mddev));
8457+ conf->resync_mirrors = 1;
8458+ md_wakeup_thread(conf->resync_thread);
8459+ }
8460+
8461 /*
8462 * Regenerate the "device is in sync with the raid set" bit for
8463 * each device.
8464 */
8465- for (i = 0; i < sb->nr_disks ; i++) {
8466- sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
8467+ for (i = 0; i < MD_SB_DISKS; i++) {
8468+ mark_disk_nonsync(sb->disks+i);
8469 for (j = 0; j < sb->raid_disks; j++) {
8470- if (!raid_conf->mirrors[j].operational)
8471+ if (!conf->mirrors[j].operational)
8472 continue;
8473- if (sb->disks[i].number == raid_conf->mirrors[j].number)
8474- sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
8475+ if (sb->disks[i].number == conf->mirrors[j].number)
8476+ mark_disk_sync(sb->disks+i);
8477 }
8478 }
8479- sb->active_disks = raid_conf->working_disks;
8480+ sb->active_disks = conf->working_disks;
8481
8482- printk("raid1: raid set %s active with %d out of %d mirrors\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks);
8483- /* Ok, everything is just fine now */
8484- return (0);
8485+ if (start_recovery)
8486+ md_recover_arrays();
8487+
8488+
8489+ printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
8490+ /*
8491+ * Ok, everything is just fine now
8492+ */
8493+ return 0;
8494+
8495+out_free_conf:
8496+ kfree(conf);
8497+ mddev->private = NULL;
8498+out:
8499+ MOD_DEC_USE_COUNT;
8500+ return -EIO;
8501+}
8502+
8503+#undef INVALID_LEVEL
8504+#undef NO_SB
8505+#undef ERRORS
8506+#undef NOT_IN_SYNC
8507+#undef INCONSISTENT
8508+#undef ALREADY_RUNNING
8509+#undef OPERATIONAL
8510+#undef SPARE
8511+#undef NONE_OPERATIONAL
8512+#undef RUNNING_CKRAID
8513+#undef ARRAY_IS_ACTIVE
8514+
8515+static int raid1_stop_resync (mddev_t *mddev)
8516+{
8517+ raid1_conf_t *conf = mddev_to_conf(mddev);
8518+
8519+ if (conf->resync_thread) {
8520+ if (conf->resync_mirrors) {
8521+ conf->resync_mirrors = 2;
8522+ md_interrupt_thread(conf->resync_thread);
8523+ printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
8524+ return 1;
8525+ }
8526+ return 0;
8527+ }
8528+ return 0;
8529+}
8530+
8531+static int raid1_restart_resync (mddev_t *mddev)
8532+{
8533+ raid1_conf_t *conf = mddev_to_conf(mddev);
8534+
8535+ if (conf->resync_mirrors) {
8536+ if (!conf->resync_thread) {
8537+ MD_BUG();
8538+ return 0;
8539+ }
8540+ conf->resync_mirrors = 1;
8541+ md_wakeup_thread(conf->resync_thread);
8542+ return 1;
8543+ }
8544+ return 0;
8545 }
8546
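raid1_stop_resync() and raid1_restart_resync() above drive a small handshake through conf->resync_mirrors. Reading them together with raid1syncd, the values appear to mean: 0 = no resync pending, 1 = resync running, 2 = resync interrupted and to be restarted later. A sketch of that state machine under those assumed meanings:

	enum resync_state {
		RESYNC_NONE    = 0,	/* nothing pending */
		RESYNC_RUNNING = 1,	/* syncd is (or should be) active */
		RESYNC_RESTART = 2	/* interrupted; resume on restart */
	};

	/* mirrors raid1_stop_resync(): returns 1 if a resync was parked */
	static int stop_resync(enum resync_state *st, int have_thread)
	{
		if (have_thread && *st == RESYNC_RUNNING) {
			*st = RESYNC_RESTART;	/* caller interrupts the thread */
			return 1;
		}
		return 0;
	}

	/* mirrors raid1_restart_resync(): returns 1 if a resync was resumed */
	static int restart_resync(enum resync_state *st, int have_thread)
	{
		if (*st == RESYNC_NONE)
			return 0;
		if (!have_thread)
			return 0;		/* the MD_BUG() case */
		*st = RESYNC_RUNNING;		/* caller wakes the thread */
		return 1;
	}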
8547-static int raid1_stop (int minor, struct md_dev *mddev)
8548+static int raid1_stop (mddev_t *mddev)
8549 {
8550- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
8551+ raid1_conf_t *conf = mddev_to_conf(mddev);
8552
8553- kfree (raid_conf);
8554+ md_unregister_thread(conf->thread);
8555+ if (conf->resync_thread)
8556+ md_unregister_thread(conf->resync_thread);
8557+ kfree(conf);
8558 mddev->private = NULL;
8559 MOD_DEC_USE_COUNT;
8560 return 0;
8561 }
8562
8563-static struct md_personality raid1_personality=
8564+static mdk_personality_t raid1_personality=
8565 {
8566 "raid1",
8567 raid1_map,
8568@@ -855,15 +1211,13 @@
8569 NULL, /* no ioctls */
8570 0,
8571 raid1_error,
8572- raid1_hot_add_disk,
8573- /* raid1_hot_remove_drive */ NULL,
8574- raid1_mark_spare
8575+ raid1_diskop,
8576+ raid1_stop_resync,
8577+ raid1_restart_resync
8578 };
8579
8580 int raid1_init (void)
8581 {
8582- if ((raid1_thread = md_register_thread(raid1d, NULL)) == NULL)
8583- return -EBUSY;
8584 return register_md_personality (RAID1, &raid1_personality);
8585 }
8586
8587@@ -875,7 +1229,6 @@
8588
8589 void cleanup_module (void)
8590 {
8591- md_unregister_thread (raid1_thread);
8592 unregister_md_personality (RAID1);
8593 }
8594 #endif
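Note the shape change in the module glue above: raid1_init() no longer starts a global raid1 thread, because raid1_run() now registers raid1d (and, for unclean arrays, raid1syncd) per array and raid1_stop() unregisters them again. The personality therefore reduces to a table of per-array entry points, roughly of this shape (field names invented for illustration, not the actual mdk_personality_t layout):

	struct personality_ops {
		const char *name;
		int (*run)(void *mddev);	/* starts per-array threads */
		int (*stop)(void *mddev);	/* tears them down again */
		int (*stop_resync)(void *mddev);
		int (*restart_resync)(void *mddev);
	};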
8595diff -ruN linux.orig/drivers/block/raid5.c linux-2.2.16/drivers/block/raid5.c
8596--- linux.orig/drivers/block/raid5.c Fri May 8 09:17:13 1998
8597+++ linux-2.2.16/drivers/block/raid5.c Fri Jun 9 11:37:45 2000
8598@@ -1,4 +1,4 @@
8599-/*****************************************************************************
8600+/*
8601 * raid5.c : Multiple Devices driver for Linux
8602 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
8603 *
8604@@ -14,16 +14,15 @@
8605 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
8606 */
8607
8608+
8609 #include <linux/module.h>
8610 #include <linux/locks.h>
8611 #include <linux/malloc.h>
8612-#include <linux/md.h>
8613-#include <linux/raid5.h>
8614+#include <linux/raid/raid5.h>
8615 #include <asm/bitops.h>
8616 #include <asm/atomic.h>
8617-#include <asm/md.h>
8618
8619-static struct md_personality raid5_personality;
8620+static mdk_personality_t raid5_personality;
8621
8622 /*
8623 * Stripe cache
8624@@ -33,7 +32,7 @@
8625 #define HASH_PAGES_ORDER 0
8626 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
8627 #define HASH_MASK (NR_HASH - 1)
8628-#define stripe_hash(raid_conf, sect, size) ((raid_conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
8629+#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
8630
8631 /*
8632 * The following can be used to debug the driver
8633@@ -46,6 +45,8 @@
8634 #define PRINTK(x) do { ; } while (0)
8635 #endif
8636
8637+static void print_raid5_conf (raid5_conf_t *conf);
8638+
8639 static inline int stripe_locked(struct stripe_head *sh)
8640 {
8641 return test_bit(STRIPE_LOCKED, &sh->state);
8642@@ -61,32 +62,32 @@
8643 */
8644 static inline void lock_stripe(struct stripe_head *sh)
8645 {
8646- struct raid5_data *raid_conf = sh->raid_conf;
8647- if (!test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
8648+ raid5_conf_t *conf = sh->raid_conf;
8649+ if (!md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
8650 PRINTK(("locking stripe %lu\n", sh->sector));
8651- raid_conf->nr_locked_stripes++;
8652+ conf->nr_locked_stripes++;
8653 }
8654 }
8655
8656 static inline void unlock_stripe(struct stripe_head *sh)
8657 {
8658- struct raid5_data *raid_conf = sh->raid_conf;
8659- if (test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
8660+ raid5_conf_t *conf = sh->raid_conf;
8661+ if (md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
8662 PRINTK(("unlocking stripe %lu\n", sh->sector));
8663- raid_conf->nr_locked_stripes--;
8664+ conf->nr_locked_stripes--;
8665 wake_up(&sh->wait);
8666 }
8667 }
8668
8669 static inline void finish_stripe(struct stripe_head *sh)
8670 {
8671- struct raid5_data *raid_conf = sh->raid_conf;
8672+ raid5_conf_t *conf = sh->raid_conf;
8673 unlock_stripe(sh);
8674 sh->cmd = STRIPE_NONE;
8675 sh->phase = PHASE_COMPLETE;
8676- raid_conf->nr_pending_stripes--;
8677- raid_conf->nr_cached_stripes++;
8678- wake_up(&raid_conf->wait_for_stripe);
8679+ conf->nr_pending_stripes--;
8680+ conf->nr_cached_stripes++;
8681+ wake_up(&conf->wait_for_stripe);
8682 }
8683
8684 void __wait_on_stripe(struct stripe_head *sh)
8685@@ -114,7 +115,7 @@
8686 __wait_on_stripe(sh);
8687 }
8688
8689-static inline void remove_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
8690+static inline void remove_hash(raid5_conf_t *conf, struct stripe_head *sh)
8691 {
8692 PRINTK(("remove_hash(), stripe %lu\n", sh->sector));
8693
8694@@ -123,21 +124,22 @@
8695 sh->hash_next->hash_pprev = sh->hash_pprev;
8696 *sh->hash_pprev = sh->hash_next;
8697 sh->hash_pprev = NULL;
8698- raid_conf->nr_hashed_stripes--;
8699+ conf->nr_hashed_stripes--;
8700 }
8701 }
8702
8703-static inline void insert_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
8704+static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
8705 {
8706- struct stripe_head **shp = &stripe_hash(raid_conf, sh->sector, sh->size);
8707+ struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size);
8708
8709- PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", sh->sector, raid_conf->nr_hashed_stripes));
8710+ PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n",
8711+ sh->sector, conf->nr_hashed_stripes));
8712
8713 if ((sh->hash_next = *shp) != NULL)
8714 (*shp)->hash_pprev = &sh->hash_next;
8715 *shp = sh;
8716 sh->hash_pprev = shp;
8717- raid_conf->nr_hashed_stripes++;
8718+ conf->nr_hashed_stripes++;
8719 }
8720
8721 static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size)
8722@@ -145,13 +147,15 @@
8723 struct buffer_head *bh;
8724 unsigned long flags;
8725
8726- save_flags(flags);
8727- cli();
8728- if ((bh = sh->buffer_pool) == NULL)
8729- return NULL;
8730+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
8731+ bh = sh->buffer_pool;
8732+ if (!bh)
8733+ goto out_unlock;
8734 sh->buffer_pool = bh->b_next;
8735 bh->b_size = b_size;
8736- restore_flags(flags);
8737+out_unlock:
8738+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
8739+
8740 return bh;
8741 }
8742
8743@@ -160,12 +164,14 @@
8744 struct buffer_head *bh;
8745 unsigned long flags;
8746
8747- save_flags(flags);
8748- cli();
8749- if ((bh = sh->bh_pool) == NULL)
8750- return NULL;
8751+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
8752+ bh = sh->bh_pool;
8753+ if (!bh)
8754+ goto out_unlock;
8755 sh->bh_pool = bh->b_next;
8756- restore_flags(flags);
8757+out_unlock:
8758+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
8759+
8760 return bh;
8761 }
8762
8763@@ -173,54 +179,52 @@
8764 {
8765 unsigned long flags;
8766
8767- save_flags(flags);
8768- cli();
8769+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
8770 bh->b_next = sh->buffer_pool;
8771 sh->buffer_pool = bh;
8772- restore_flags(flags);
8773+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
8774 }
8775
8776 static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh)
8777 {
8778 unsigned long flags;
8779
8780- save_flags(flags);
8781- cli();
8782+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
8783 bh->b_next = sh->bh_pool;
8784 sh->bh_pool = bh;
8785- restore_flags(flags);
8786+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
8787 }
8788
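The buffer-pool helpers above trade the global save_flags()/cli() critical sections for a per-stripe spinlock, so two stripes no longer serialize against each other just to pop a buffer. A userspace model of the conversion, with a pthread mutex standing in for md_spin_lock_irqsave():

	#include <pthread.h>
	#include <stddef.h>

	struct node { struct node *next; };
	struct pool {
		pthread_mutex_t lock;	/* was: save_flags(); cli(); */
		struct node *head;
	};

	static struct node *pool_get(struct pool *p)
	{
		struct node *n;

		pthread_mutex_lock(&p->lock);
		n = p->head;
		if (n)
			p->head = n->next;
		pthread_mutex_unlock(&p->lock);
		return n;
	}

	static void pool_put(struct pool *p, struct node *n)
	{
		pthread_mutex_lock(&p->lock);
		n->next = p->head;
		p->head = n;
		pthread_mutex_unlock(&p->lock);
	}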
8789-static struct stripe_head *get_free_stripe(struct raid5_data *raid_conf)
8790+static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
8791 {
8792 struct stripe_head *sh;
8793 unsigned long flags;
8794
8795 save_flags(flags);
8796 cli();
8797- if ((sh = raid_conf->free_sh_list) == NULL) {
8798+ if ((sh = conf->free_sh_list) == NULL) {
8799 restore_flags(flags);
8800 return NULL;
8801 }
8802- raid_conf->free_sh_list = sh->free_next;
8803- raid_conf->nr_free_sh--;
8804- if (!raid_conf->nr_free_sh && raid_conf->free_sh_list)
8805+ conf->free_sh_list = sh->free_next;
8806+ conf->nr_free_sh--;
8807+ if (!conf->nr_free_sh && conf->free_sh_list)
8808 printk ("raid5: bug: free_sh_list != NULL, nr_free_sh == 0\n");
8809 restore_flags(flags);
8810- if (sh->hash_pprev || sh->nr_pending || sh->count)
8811+ if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || sh->count)
8812 printk("get_free_stripe(): bug\n");
8813 return sh;
8814 }
8815
8816-static void put_free_stripe(struct raid5_data *raid_conf, struct stripe_head *sh)
8817+static void put_free_stripe(raid5_conf_t *conf, struct stripe_head *sh)
8818 {
8819 unsigned long flags;
8820
8821 save_flags(flags);
8822 cli();
8823- sh->free_next = raid_conf->free_sh_list;
8824- raid_conf->free_sh_list = sh;
8825- raid_conf->nr_free_sh++;
8826+ sh->free_next = conf->free_sh_list;
8827+ conf->free_sh_list = sh;
8828+ conf->nr_free_sh++;
8829 restore_flags(flags);
8830 }
8831
8832@@ -324,8 +328,8 @@
8833
8834 static void kfree_stripe(struct stripe_head *sh)
8835 {
8836- struct raid5_data *raid_conf = sh->raid_conf;
8837- int disks = raid_conf->raid_disks, j;
8838+ raid5_conf_t *conf = sh->raid_conf;
8839+ int disks = conf->raid_disks, j;
8840
8841 PRINTK(("kfree_stripe called, stripe %lu\n", sh->sector));
8842 if (sh->phase != PHASE_COMPLETE || stripe_locked(sh) || sh->count) {
8843@@ -338,19 +342,19 @@
8844 if (sh->bh_new[j] || sh->bh_copy[j])
8845 printk("raid5: bug: sector %lu, new %p, copy %p\n", sh->sector, sh->bh_new[j], sh->bh_copy[j]);
8846 }
8847- remove_hash(raid_conf, sh);
8848- put_free_stripe(raid_conf, sh);
8849+ remove_hash(conf, sh);
8850+ put_free_stripe(conf, sh);
8851 }
8852
8853-static int shrink_stripe_cache(struct raid5_data *raid_conf, int nr)
8854+static int shrink_stripe_cache(raid5_conf_t *conf, int nr)
8855 {
8856 struct stripe_head *sh;
8857 int i, count = 0;
8858
8859- PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, raid_conf->nr_hashed_stripes, raid_conf->clock));
8860+ PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, conf->nr_hashed_stripes, conf->clock));
8861 for (i = 0; i < NR_HASH; i++) {
8862 repeat:
8863- sh = raid_conf->stripe_hashtbl[(i + raid_conf->clock) & HASH_MASK];
8864+ sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK];
8865 for (; sh; sh = sh->hash_next) {
8866 if (sh->phase != PHASE_COMPLETE)
8867 continue;
8868@@ -360,30 +364,30 @@
8869 continue;
8870 kfree_stripe(sh);
8871 if (++count == nr) {
8872- PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
8873- raid_conf->clock = (i + raid_conf->clock) & HASH_MASK;
8874+ PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
8875+ conf->clock = (i + conf->clock) & HASH_MASK;
8876 return nr;
8877 }
8878 goto repeat;
8879 }
8880 }
8881- PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
8882+ PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
8883 return count;
8884 }
8885
8886-static struct stripe_head *find_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
8887+static struct stripe_head *find_stripe(raid5_conf_t *conf, unsigned long sector, int size)
8888 {
8889 struct stripe_head *sh;
8890
8891- if (raid_conf->buffer_size != size) {
8892- PRINTK(("switching size, %d --> %d\n", raid_conf->buffer_size, size));
8893- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
8894- raid_conf->buffer_size = size;
8895+ if (conf->buffer_size != size) {
8896+ PRINTK(("switching size, %d --> %d\n", conf->buffer_size, size));
8897+ shrink_stripe_cache(conf, conf->max_nr_stripes);
8898+ conf->buffer_size = size;
8899 }
8900
8901 PRINTK(("find_stripe, sector %lu\n", sector));
8902- for (sh = stripe_hash(raid_conf, sector, size); sh; sh = sh->hash_next)
8903- if (sh->sector == sector && sh->raid_conf == raid_conf) {
8904+ for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next)
8905+ if (sh->sector == sector && sh->raid_conf == conf) {
8906 if (sh->size == size) {
8907 PRINTK(("found stripe %lu\n", sector));
8908 return sh;
8909@@ -397,7 +401,7 @@
8910 return NULL;
8911 }
8912
8913-static int grow_stripes(struct raid5_data *raid_conf, int num, int priority)
8914+static int grow_stripes(raid5_conf_t *conf, int num, int priority)
8915 {
8916 struct stripe_head *sh;
8917
8918@@ -405,62 +409,64 @@
8919 if ((sh = kmalloc(sizeof(struct stripe_head), priority)) == NULL)
8920 return 1;
8921 memset(sh, 0, sizeof(*sh));
8922- if (grow_buffers(sh, 2 * raid_conf->raid_disks, PAGE_SIZE, priority)) {
8923- shrink_buffers(sh, 2 * raid_conf->raid_disks);
8924+ sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED;
8925+
8926+ if (grow_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) {
8927+ shrink_buffers(sh, 2 * conf->raid_disks);
8928 kfree(sh);
8929 return 1;
8930 }
8931- if (grow_bh(sh, raid_conf->raid_disks, priority)) {
8932- shrink_buffers(sh, 2 * raid_conf->raid_disks);
8933- shrink_bh(sh, raid_conf->raid_disks);
8934+ if (grow_bh(sh, conf->raid_disks, priority)) {
8935+ shrink_buffers(sh, 2 * conf->raid_disks);
8936+ shrink_bh(sh, conf->raid_disks);
8937 kfree(sh);
8938 return 1;
8939 }
8940- put_free_stripe(raid_conf, sh);
8941- raid_conf->nr_stripes++;
8942+ put_free_stripe(conf, sh);
8943+ conf->nr_stripes++;
8944 }
8945 return 0;
8946 }
8947
8948-static void shrink_stripes(struct raid5_data *raid_conf, int num)
8949+static void shrink_stripes(raid5_conf_t *conf, int num)
8950 {
8951 struct stripe_head *sh;
8952
8953 while (num--) {
8954- sh = get_free_stripe(raid_conf);
8955+ sh = get_free_stripe(conf);
8956 if (!sh)
8957 break;
8958- shrink_buffers(sh, raid_conf->raid_disks * 2);
8959- shrink_bh(sh, raid_conf->raid_disks);
8960+ shrink_buffers(sh, conf->raid_disks * 2);
8961+ shrink_bh(sh, conf->raid_disks);
8962 kfree(sh);
8963- raid_conf->nr_stripes--;
8964+ conf->nr_stripes--;
8965 }
8966 }
8967
8968-static struct stripe_head *kmalloc_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
8969+static struct stripe_head *kmalloc_stripe(raid5_conf_t *conf, unsigned long sector, int size)
8970 {
8971 struct stripe_head *sh = NULL, *tmp;
8972 struct buffer_head *buffer_pool, *bh_pool;
8973
8974 PRINTK(("kmalloc_stripe called\n"));
8975
8976- while ((sh = get_free_stripe(raid_conf)) == NULL) {
8977- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes / 8);
8978- if ((sh = get_free_stripe(raid_conf)) != NULL)
8979+ while ((sh = get_free_stripe(conf)) == NULL) {
8980+ shrink_stripe_cache(conf, conf->max_nr_stripes / 8);
8981+ if ((sh = get_free_stripe(conf)) != NULL)
8982 break;
8983- if (!raid_conf->nr_pending_stripes)
8984+ if (!conf->nr_pending_stripes)
8985 printk("raid5: bug: nr_free_sh == 0, nr_pending_stripes == 0\n");
8986- md_wakeup_thread(raid_conf->thread);
8987+ md_wakeup_thread(conf->thread);
8988 PRINTK(("waiting for some stripes to complete\n"));
8989- sleep_on(&raid_conf->wait_for_stripe);
8990+ sleep_on(&conf->wait_for_stripe);
8991 }
8992
8993 /*
8994 * The above might have slept, so perhaps another process
8995 * already created the stripe for us..
8996 */
8997- if ((tmp = find_stripe(raid_conf, sector, size)) != NULL) {
8998- put_free_stripe(raid_conf, sh);
8999+ if ((tmp = find_stripe(conf, sector, size)) != NULL) {
9000+ put_free_stripe(conf, sh);
9001 wait_on_stripe(tmp);
9002 return tmp;
9003 }
9004@@ -472,25 +478,25 @@
9005 sh->bh_pool = bh_pool;
9006 sh->phase = PHASE_COMPLETE;
9007 sh->cmd = STRIPE_NONE;
9008- sh->raid_conf = raid_conf;
9009+ sh->raid_conf = conf;
9010 sh->sector = sector;
9011 sh->size = size;
9012- raid_conf->nr_cached_stripes++;
9013- insert_hash(raid_conf, sh);
9014+ conf->nr_cached_stripes++;
9015+ insert_hash(conf, sh);
9016 } else printk("raid5: bug: kmalloc_stripe() == NULL\n");
9017 return sh;
9018 }
9019
9020-static struct stripe_head *get_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
9021+static struct stripe_head *get_stripe(raid5_conf_t *conf, unsigned long sector, int size)
9022 {
9023 struct stripe_head *sh;
9024
9025 PRINTK(("get_stripe, sector %lu\n", sector));
9026- sh = find_stripe(raid_conf, sector, size);
9027+ sh = find_stripe(conf, sector, size);
9028 if (sh)
9029 wait_on_stripe(sh);
9030 else
9031- sh = kmalloc_stripe(raid_conf, sector, size);
9032+ sh = kmalloc_stripe(conf, sector, size);
9033 return sh;
9034 }
9035
9036@@ -523,7 +529,7 @@
9037 bh->b_end_io(bh, uptodate);
9038 if (!uptodate)
9039 printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for "
9040- "block %lu\n", kdevname(bh->b_dev), bh->b_blocknr);
9041+ "block %lu\n", partition_name(bh->b_dev), bh->b_blocknr);
9042 }
9043
9044 static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate)
9045@@ -537,36 +543,35 @@
9046 static void raid5_end_request (struct buffer_head * bh, int uptodate)
9047 {
9048 struct stripe_head *sh = bh->b_dev_id;
9049- struct raid5_data *raid_conf = sh->raid_conf;
9050- int disks = raid_conf->raid_disks, i;
9051+ raid5_conf_t *conf = sh->raid_conf;
9052+ int disks = conf->raid_disks, i;
9053 unsigned long flags;
9054
9055 PRINTK(("end_request %lu, nr_pending %d\n", sh->sector, sh->nr_pending));
9056- save_flags(flags);
9057- cli();
9058+ md_spin_lock_irqsave(&sh->stripe_lock, flags);
9059 raid5_mark_buffer_uptodate(bh, uptodate);
9060- --sh->nr_pending;
9061- if (!sh->nr_pending) {
9062- md_wakeup_thread(raid_conf->thread);
9063- atomic_inc(&raid_conf->nr_handle);
9064+ if (atomic_dec_and_test(&sh->nr_pending)) {
9065+ md_wakeup_thread(conf->thread);
9066+ atomic_inc(&conf->nr_handle);
9067 }
9068- if (!uptodate)
9069+ if (!uptodate) {
9070 md_error(bh->b_dev, bh->b_rdev);
9071- if (raid_conf->failed_disks) {
9072+ }
9073+ if (conf->failed_disks) {
9074 for (i = 0; i < disks; i++) {
9075- if (raid_conf->disks[i].operational)
9076+ if (conf->disks[i].operational)
9077 continue;
9078 if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i])
9079 continue;
9080- if (bh->b_rdev != raid_conf->disks[i].dev)
9081+ if (bh->b_rdev != conf->disks[i].dev)
9082 continue;
9083 set_bit(STRIPE_ERROR, &sh->state);
9084 }
9085 }
9086- restore_flags(flags);
9087+ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
9088 }
9089
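raid5_end_request() above also switches nr_pending from a plain counter guarded by cli() to an atomic, so the "last completion wakes the daemon" test needs no lock at all. The same pattern expressed with C11 atomics:

	#include <stdatomic.h>
	#include <stdbool.h>

	struct stripe { atomic_int nr_pending; };

	/* true exactly once, for whichever request completes last */
	static bool last_completion(struct stripe *sh)
	{
		/* fetch_sub returns the old value; an old value of 1
		 * means this decrement took the counter to zero */
		return atomic_fetch_sub(&sh->nr_pending, 1) == 1;
	}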
9090-static int raid5_map (struct md_dev *mddev, kdev_t *rdev,
9091+static int raid5_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
9092 unsigned long *rsector, unsigned long size)
9093 {
9094 /* No complex mapping used: the core of the work is done in the
9095@@ -577,11 +582,10 @@
9096
9097 static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i)
9098 {
9099- struct raid5_data *raid_conf = sh->raid_conf;
9100- struct md_dev *mddev = raid_conf->mddev;
9101- int minor = (int) (mddev - md_dev);
9102+ raid5_conf_t *conf = sh->raid_conf;
9103+ mddev_t *mddev = conf->mddev;
9104 char *b_data;
9105- kdev_t dev = MKDEV(MD_MAJOR, minor);
9106+ kdev_t dev = mddev_to_kdev(mddev);
9107 int block = sh->sector / (sh->size >> 9);
9108
9109 b_data = ((volatile struct buffer_head *) bh)->b_data;
9110@@ -589,7 +593,7 @@
9111 init_buffer(bh, dev, block, raid5_end_request, sh);
9112 ((volatile struct buffer_head *) bh)->b_data = b_data;
9113
9114- bh->b_rdev = raid_conf->disks[i].dev;
9115+ bh->b_rdev = conf->disks[i].dev;
9116 bh->b_rsector = sh->sector;
9117
9118 bh->b_state = (1 << BH_Req);
9119@@ -597,33 +601,62 @@
9120 bh->b_list = BUF_LOCKED;
9121 }
9122
9123-static int raid5_error (struct md_dev *mddev, kdev_t dev)
9124+static int raid5_error (mddev_t *mddev, kdev_t dev)
9125 {
9126- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
9127- md_superblock_t *sb = mddev->sb;
9128+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
9129+ mdp_super_t *sb = mddev->sb;
9130 struct disk_info *disk;
9131 int i;
9132
9133 PRINTK(("raid5_error called\n"));
9134- raid_conf->resync_parity = 0;
9135- for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
9136+ conf->resync_parity = 0;
9137+ for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
9138 if (disk->dev == dev && disk->operational) {
9139 disk->operational = 0;
9140- sb->disks[disk->number].state |= (1 << MD_FAULTY_DEVICE);
9141- sb->disks[disk->number].state &= ~(1 << MD_SYNC_DEVICE);
9142- sb->disks[disk->number].state &= ~(1 << MD_ACTIVE_DEVICE);
9143+ mark_disk_faulty(sb->disks+disk->number);
9144+ mark_disk_nonsync(sb->disks+disk->number);
9145+ mark_disk_inactive(sb->disks+disk->number);
9146 sb->active_disks--;
9147 sb->working_disks--;
9148 sb->failed_disks++;
9149 mddev->sb_dirty = 1;
9150- raid_conf->working_disks--;
9151- raid_conf->failed_disks++;
9152- md_wakeup_thread(raid_conf->thread);
9153+ conf->working_disks--;
9154+ conf->failed_disks++;
9155+ md_wakeup_thread(conf->thread);
9156 printk (KERN_ALERT
9157- "RAID5: Disk failure on %s, disabling device."
9158- "Operation continuing on %d devices\n",
9159- kdevname (dev), raid_conf->working_disks);
9160+ "raid5: Disk failure on %s, disabling device."
9161+ " Operation continuing on %d devices\n",
9162+ partition_name (dev), conf->working_disks);
9163+ return -EIO;
9164 }
9165+ }
9166+ /*
9167+ * handle errors in spares (during reconstruction)
9168+ */
9169+ if (conf->spare) {
9170+ disk = conf->spare;
9171+ if (disk->dev == dev) {
9172+ printk (KERN_ALERT
9173+ "raid5: Disk failure on spare %s\n",
9174+ partition_name (dev));
9175+ if (!conf->spare->operational) {
9176+ MD_BUG();
9177+ return -EIO;
9178+ }
9179+ disk->operational = 0;
9180+ disk->write_only = 0;
9181+ conf->spare = NULL;
9182+ mark_disk_faulty(sb->disks+disk->number);
9183+ mark_disk_nonsync(sb->disks+disk->number);
9184+ mark_disk_inactive(sb->disks+disk->number);
9185+ sb->spare_disks--;
9186+ sb->working_disks--;
9187+ sb->failed_disks++;
9188+
9189+ return -EIO;
9190+ }
9191+ }
9192+ MD_BUG();
9193 return 0;
9194 }
9195
9196@@ -634,12 +667,12 @@
9197 static inline unsigned long
9198 raid5_compute_sector (int r_sector, unsigned int raid_disks, unsigned int data_disks,
9199 unsigned int * dd_idx, unsigned int * pd_idx,
9200- struct raid5_data *raid_conf)
9201+ raid5_conf_t *conf)
9202 {
9203 unsigned int stripe;
9204 int chunk_number, chunk_offset;
9205 unsigned long new_sector;
9206- int sectors_per_chunk = raid_conf->chunk_size >> 9;
9207+ int sectors_per_chunk = conf->chunk_size >> 9;
9208
9209 /* First compute the information on this sector */
9210
9211@@ -662,9 +695,9 @@
9212 /*
9213 * Select the parity disk based on the user selected algorithm.
9214 */
9215- if (raid_conf->level == 4)
9216+ if (conf->level == 4)
9217 *pd_idx = data_disks;
9218- else switch (raid_conf->algorithm) {
9219+ else switch (conf->algorithm) {
9220 case ALGORITHM_LEFT_ASYMMETRIC:
9221 *pd_idx = data_disks - stripe % raid_disks;
9222 if (*dd_idx >= *pd_idx)
9223@@ -684,7 +717,7 @@
9224 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
9225 break;
9226 default:
9227- printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
9228+ printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
9229 }
9230
9231 /*
9232@@ -705,16 +738,16 @@
9233
9234 static unsigned long compute_blocknr(struct stripe_head *sh, int i)
9235 {
9236- struct raid5_data *raid_conf = sh->raid_conf;
9237- int raid_disks = raid_conf->raid_disks, data_disks = raid_disks - 1;
9238+ raid5_conf_t *conf = sh->raid_conf;
9239+ int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
9240 unsigned long new_sector = sh->sector, check;
9241- int sectors_per_chunk = raid_conf->chunk_size >> 9;
9242+ int sectors_per_chunk = conf->chunk_size >> 9;
9243 unsigned long stripe = new_sector / sectors_per_chunk;
9244 int chunk_offset = new_sector % sectors_per_chunk;
9245 int chunk_number, dummy1, dummy2, dd_idx = i;
9246 unsigned long r_sector, blocknr;
9247
9248- switch (raid_conf->algorithm) {
9249+ switch (conf->algorithm) {
9250 case ALGORITHM_LEFT_ASYMMETRIC:
9251 case ALGORITHM_RIGHT_ASYMMETRIC:
9252 if (i > sh->pd_idx)
9253@@ -727,14 +760,14 @@
9254 i -= (sh->pd_idx + 1);
9255 break;
9256 default:
9257- printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
9258+ printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
9259 }
9260
9261 chunk_number = stripe * data_disks + i;
9262 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
9263 blocknr = r_sector / (sh->size >> 9);
9264
9265- check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, raid_conf);
9266+ check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
9267 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
9268 printk("compute_blocknr: map not correct\n");
9269 return 0;
9270@@ -742,36 +775,11 @@
9271 return blocknr;
9272 }
9273
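compute_blocknr() above is the inverse of raid5_compute_sector(): it recovers the array-relative block from a (disk, stripe) pair and re-runs the forward mapping as a self-check. For the ALGORITHM_LEFT_ASYMMETRIC case shown in the earlier hunk, the forward mapping works out as below -- a userspace sketch simplified to one sector per chunk, where the line elided by the diff after "if (*dd_idx >= *pd_idx)" is assumed to bump dd_idx past the parity slot, as in the stock driver:

	#include <stdio.h>

	static void map_left_asymmetric(long chunk, int raid_disks,
					int *dd_idx, int *pd_idx)
	{
		int data_disks = raid_disks - 1;
		long stripe = chunk / data_disks;

		/* parity walks backwards one disk per stripe */
		*pd_idx = data_disks - (int)(stripe % raid_disks);
		/* data chunks keep their order, skipping the parity slot */
		*dd_idx = (int)(chunk % data_disks);
		if (*dd_idx >= *pd_idx)
			(*dd_idx)++;
	}

	int main(void)
	{
		int dd, pd;
		long chunk;

		for (chunk = 0; chunk < 9; chunk++) {
			map_left_asymmetric(chunk, 4, &dd, &pd);
			printf("chunk %ld: data on disk %d, parity on disk %d\n",
			       chunk, dd, pd);
		}
		return 0;
	}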
9274-#ifdef HAVE_ARCH_XORBLOCK
9275-static void xor_block(struct buffer_head *dest, struct buffer_head *source)
9276-{
9277- __xor_block((char *) dest->b_data, (char *) source->b_data, dest->b_size);
9278-}
9279-#else
9280-static void xor_block(struct buffer_head *dest, struct buffer_head *source)
9281-{
9282- long lines = dest->b_size / (sizeof (long)) / 8, i;
9283- long *destp = (long *) dest->b_data, *sourcep = (long *) source->b_data;
9284-
9285- for (i = lines; i > 0; i--) {
9286- *(destp + 0) ^= *(sourcep + 0);
9287- *(destp + 1) ^= *(sourcep + 1);
9288- *(destp + 2) ^= *(sourcep + 2);
9289- *(destp + 3) ^= *(sourcep + 3);
9290- *(destp + 4) ^= *(sourcep + 4);
9291- *(destp + 5) ^= *(sourcep + 5);
9292- *(destp + 6) ^= *(sourcep + 6);
9293- *(destp + 7) ^= *(sourcep + 7);
9294- destp += 8;
9295- sourcep += 8;
9296- }
9297-}
9298-#endif
9299-
9300 static void compute_block(struct stripe_head *sh, int dd_idx)
9301 {
9302- struct raid5_data *raid_conf = sh->raid_conf;
9303- int i, disks = raid_conf->raid_disks;
9304+ raid5_conf_t *conf = sh->raid_conf;
9305+ int i, count, disks = conf->raid_disks;
9306+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
9307
9308 PRINTK(("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx));
9309
9310@@ -780,69 +788,100 @@
9311 raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx);
9312
9313 memset(sh->bh_old[dd_idx]->b_data, 0, sh->size);
9314+ bh_ptr[0] = sh->bh_old[dd_idx];
9315+ count = 1;
9316 for (i = 0; i < disks; i++) {
9317 if (i == dd_idx)
9318 continue;
9319 if (sh->bh_old[i]) {
9320- xor_block(sh->bh_old[dd_idx], sh->bh_old[i]);
9321- continue;
9322- } else
9323+ bh_ptr[count++] = sh->bh_old[i];
9324+ } else {
9325 printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
9326+ }
9327+ if (count == MAX_XOR_BLOCKS) {
9328+ xor_block(count, &bh_ptr[0]);
9329+ count = 1;
9330+ }
9331+ }
9332+ if(count != 1) {
9333+ xor_block(count, &bh_ptr[0]);
9334 }
9335 raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1);
9336 }
9337
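compute_block() above rebuilds a missing block by XOR-summing every other member of the stripe, but instead of one xor_block() call per source it now batches up to MAX_XOR_BLOCKS buffers per call (the new xor_block() takes a count plus an array, letting the arch code unroll across several sources at once). The accumulation idiom, modeled on flat byte buffers rather than buffer_heads and with MAX_XOR_BLOCKS assumed to be 5:

	#include <stddef.h>

	#define MAX_XOR_BLOCKS 5
	#define BLK 4096

	/* XOR buffers 1..count-1 into buffer 0 */
	static void xor_block(int count, unsigned char *buf[])
	{
		int i;
		size_t j;

		for (i = 1; i < count; i++)
			for (j = 0; j < BLK; j++)
				buf[0][j] ^= buf[i][j];
	}

	/* dest must start out all zeroes for a plain XOR sum */
	static void xor_all(unsigned char *dest, unsigned char *src[], int nsrc)
	{
		unsigned char *ptr[MAX_XOR_BLOCKS];
		int i, count = 1;

		ptr[0] = dest;	/* slot 0 carries the running result */
		for (i = 0; i < nsrc; i++) {
			ptr[count++] = src[i];
			if (count == MAX_XOR_BLOCKS) {
				xor_block(count, ptr);
				count = 1;	/* keep dest, refill sources */
			}
		}
		if (count != 1)
			xor_block(count, ptr);
	}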
9338 static void compute_parity(struct stripe_head *sh, int method)
9339 {
9340- struct raid5_data *raid_conf = sh->raid_conf;
9341- int i, pd_idx = sh->pd_idx, disks = raid_conf->raid_disks;
9342+ raid5_conf_t *conf = sh->raid_conf;
9343+ int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, lowprio, count;
9344+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
9345
9346 PRINTK(("compute_parity, stripe %lu, method %d\n", sh->sector, method));
9347+ lowprio = 1;
9348 for (i = 0; i < disks; i++) {
9349 if (i == pd_idx || !sh->bh_new[i])
9350 continue;
9351 if (!sh->bh_copy[i])
9352 sh->bh_copy[i] = raid5_kmalloc_buffer(sh, sh->size);
9353 raid5_build_block(sh, sh->bh_copy[i], i);
9354+ if (!buffer_lowprio(sh->bh_new[i]))
9355+ lowprio = 0;
9356+ else
9357+ mark_buffer_lowprio(sh->bh_copy[i]);
9358 mark_buffer_clean(sh->bh_new[i]);
9359 memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size);
9360 }
9361 if (sh->bh_copy[pd_idx] == NULL)
9362 sh->bh_copy[pd_idx] = raid5_kmalloc_buffer(sh, sh->size);
9363 raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx);
9364+ if (lowprio)
9365+ mark_buffer_lowprio(sh->bh_copy[pd_idx]);
9366
9367 if (method == RECONSTRUCT_WRITE) {
9368 memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size);
9369+ bh_ptr[0] = sh->bh_copy[pd_idx];
9370+ count = 1;
9371 for (i = 0; i < disks; i++) {
9372 if (i == sh->pd_idx)
9373 continue;
9374 if (sh->bh_new[i]) {
9375- xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
9376- continue;
9377+ bh_ptr[count++] = sh->bh_copy[i];
9378+ } else if (sh->bh_old[i]) {
9379+ bh_ptr[count++] = sh->bh_old[i];
9380 }
9381- if (sh->bh_old[i]) {
9382- xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
9383- continue;
9384+ if (count == MAX_XOR_BLOCKS) {
9385+ xor_block(count, &bh_ptr[0]);
9386+ count = 1;
9387 }
9388 }
9389+ if (count != 1) {
9390+ xor_block(count, &bh_ptr[0]);
9391+ }
9392 } else if (method == READ_MODIFY_WRITE) {
9393 memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size);
9394+ bh_ptr[0] = sh->bh_copy[pd_idx];
9395+ count = 1;
9396 for (i = 0; i < disks; i++) {
9397 if (i == sh->pd_idx)
9398 continue;
9399 if (sh->bh_new[i] && sh->bh_old[i]) {
9400- xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
9401- xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
9402- continue;
9403+ bh_ptr[count++] = sh->bh_copy[i];
9404+ bh_ptr[count++] = sh->bh_old[i];
9405+ }
9406+ if (count >= (MAX_XOR_BLOCKS - 1)) {
9407+ xor_block(count, &bh_ptr[0]);
9408+ count = 1;
9409 }
9410 }
9411+ if (count != 1) {
9412+ xor_block(count, &bh_ptr[0]);
9413+ }
9414 }
9415 raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1);
9416 }
9417
9418 static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
9419 {
9420- struct raid5_data *raid_conf = sh->raid_conf;
9421+ raid5_conf_t *conf = sh->raid_conf;
9422 struct buffer_head *bh_req;
9423
9424 if (sh->bh_new[dd_idx]) {
9425@@ -860,19 +899,22 @@
9426 if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) {
9427 sh->phase = PHASE_BEGIN;
9428 sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE;
9429- raid_conf->nr_pending_stripes++;
9430- atomic_inc(&raid_conf->nr_handle);
9431+ conf->nr_pending_stripes++;
9432+ atomic_inc(&conf->nr_handle);
9433 }
9434 sh->bh_new[dd_idx] = bh;
9435 sh->bh_req[dd_idx] = bh_req;
9436 sh->cmd_new[dd_idx] = rw;
9437 sh->new[dd_idx] = 1;
9438+
9439+ if (buffer_lowprio(bh))
9440+ mark_buffer_lowprio(bh_req);
9441 }
9442
9443 static void complete_stripe(struct stripe_head *sh)
9444 {
9445- struct raid5_data *raid_conf = sh->raid_conf;
9446- int disks = raid_conf->raid_disks;
9447+ raid5_conf_t *conf = sh->raid_conf;
9448+ int disks = conf->raid_disks;
9449 int i, new = 0;
9450
9451 PRINTK(("complete_stripe %lu\n", sh->sector));
9452@@ -909,6 +951,22 @@
9453 }
9454 }
9455
9456+
9457+static int is_stripe_lowprio(struct stripe_head *sh, int disks)
9458+{
9459+ int i, lowprio = 1;
9460+
9461+ for (i = 0; i < disks; i++) {
9462+ if (sh->bh_new[i])
9463+ if (!buffer_lowprio(sh->bh_new[i]))
9464+ lowprio = 0;
9465+ if (sh->bh_old[i])
9466+ if (!buffer_lowprio(sh->bh_old[i]))
9467+ lowprio = 0;
9468+ }
9469+ return lowprio;
9470+}
9471+
9472 /*
9473 * handle_stripe() is our main logic routine. Note that:
9474 *
9475@@ -919,28 +977,27 @@
9476 * 2. We should be careful to set sh->nr_pending whenever we sleep,
9477 * to prevent re-entry of handle_stripe() for the same sh.
9478 *
9479- * 3. raid_conf->failed_disks and disk->operational can be changed
9480+ * 3. conf->failed_disks and disk->operational can be changed
9481 * from an interrupt. This complicates things a bit, but it allows
9482 * us to stop issuing requests for a failed drive as soon as possible.
9483 */
9484 static void handle_stripe(struct stripe_head *sh)
9485 {
9486- struct raid5_data *raid_conf = sh->raid_conf;
9487- struct md_dev *mddev = raid_conf->mddev;
9488- int minor = (int) (mddev - md_dev);
9489+ raid5_conf_t *conf = sh->raid_conf;
9490+ mddev_t *mddev = conf->mddev;
9491 struct buffer_head *bh;
9492- int disks = raid_conf->raid_disks;
9493- int i, nr = 0, nr_read = 0, nr_write = 0;
9494+ int disks = conf->raid_disks;
9495+ int i, nr = 0, nr_read = 0, nr_write = 0, lowprio;
9496 int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0, parity = 0;
9497 int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
9498 int reading = 0, nr_writing = 0;
9499 int method1 = INT_MAX, method2 = INT_MAX;
9500 int block;
9501 unsigned long flags;
9502- int operational[MD_SB_DISKS], failed_disks = raid_conf->failed_disks;
9503+ int operational[MD_SB_DISKS], failed_disks = conf->failed_disks;
9504
9505 PRINTK(("handle_stripe(), stripe %lu\n", sh->sector));
9506- if (sh->nr_pending) {
9507+ if (md_atomic_read(&sh->nr_pending)) {
9508 printk("handle_stripe(), stripe %lu, io still pending\n", sh->sector);
9509 return;
9510 }
9511@@ -949,9 +1006,9 @@
9512 return;
9513 }
9514
9515- atomic_dec(&raid_conf->nr_handle);
9516+ atomic_dec(&conf->nr_handle);
9517
9518- if (test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
9519+ if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
9520 printk("raid5: restarting stripe %lu\n", sh->sector);
9521 sh->phase = PHASE_BEGIN;
9522 }
9523@@ -969,11 +1026,11 @@
9524 save_flags(flags);
9525 cli();
9526 for (i = 0; i < disks; i++) {
9527- operational[i] = raid_conf->disks[i].operational;
9528- if (i == sh->pd_idx && raid_conf->resync_parity)
9529+ operational[i] = conf->disks[i].operational;
9530+ if (i == sh->pd_idx && conf->resync_parity)
9531 operational[i] = 0;
9532 }
9533- failed_disks = raid_conf->failed_disks;
9534+ failed_disks = conf->failed_disks;
9535 restore_flags(flags);
9536
9537 if (failed_disks > 1) {
9538@@ -1017,7 +1074,7 @@
9539 }
9540
9541 if (nr_write && nr_read)
9542- printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
9543+		printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
9544
9545 if (nr_write) {
9546 /*
9547@@ -1030,7 +1087,7 @@
9548 if (sh->bh_new[i])
9549 continue;
9550 block = (int) compute_blocknr(sh, i);
9551- bh = find_buffer(MKDEV(MD_MAJOR, minor), block, sh->size);
9552+ bh = find_buffer(mddev_to_kdev(mddev), block, sh->size);
9553 if (bh && bh->b_count == 0 && buffer_dirty(bh) && !buffer_locked(bh)) {
9554 PRINTK(("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block));
9555 add_stripe_bh(sh, bh, i, WRITE);
9556@@ -1064,21 +1121,22 @@
9557
9558 if (!method1 || !method2) {
9559 lock_stripe(sh);
9560- sh->nr_pending++;
9561+ lowprio = is_stripe_lowprio(sh, disks);
9562+ atomic_inc(&sh->nr_pending);
9563 sh->phase = PHASE_WRITE;
9564 compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
9565 for (i = 0; i < disks; i++) {
9566- if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
9567+ if (!operational[i] && !conf->spare && !conf->resync_parity)
9568 continue;
9569 if (i == sh->pd_idx || sh->bh_new[i])
9570 nr_writing++;
9571 }
9572
9573- sh->nr_pending = nr_writing;
9574- PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, sh->nr_pending));
9575+ md_atomic_set(&sh->nr_pending, nr_writing);
9576+ PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
9577
9578 for (i = 0; i < disks; i++) {
9579- if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
9580+ if (!operational[i] && !conf->spare && !conf->resync_parity)
9581 continue;
9582 bh = sh->bh_copy[i];
9583 if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL)))
9584@@ -1089,18 +1147,30 @@
9585 bh->b_state |= (1<<BH_Dirty);
9586 PRINTK(("making request for buffer %d\n", i));
9587 clear_bit(BH_Lock, &bh->b_state);
9588- if (!operational[i] && !raid_conf->resync_parity) {
9589- bh->b_rdev = raid_conf->spare->dev;
9590- make_request(MAJOR(raid_conf->spare->dev), WRITE, bh);
9591- } else
9592- make_request(MAJOR(raid_conf->disks[i].dev), WRITE, bh);
9593+ if (!operational[i] && !conf->resync_parity) {
9594+ bh->b_rdev = conf->spare->dev;
9595+ make_request(MAJOR(conf->spare->dev), WRITE, bh);
9596+ } else {
9597+#if 0
9598+ make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
9599+#else
9600+ if (!lowprio || (i==sh->pd_idx))
9601+ make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
9602+ else {
9603+ mark_buffer_clean(bh);
9604+ raid5_end_request(bh,1);
9605+ sh->new[i] = 0;
9606+ }
9607+#endif
9608+ }
9609 }
9610 }
9611 return;
9612 }
9613
9614 lock_stripe(sh);
9615- sh->nr_pending++;
9616+ lowprio = is_stripe_lowprio(sh, disks);
9617+ atomic_inc(&sh->nr_pending);
9618 if (method1 < method2) {
9619 sh->write_method = RECONSTRUCT_WRITE;
9620 for (i = 0; i < disks; i++) {
9621@@ -1110,6 +1180,8 @@
9622 continue;
9623 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
9624 raid5_build_block(sh, sh->bh_old[i], i);
9625+ if (lowprio)
9626+ mark_buffer_lowprio(sh->bh_old[i]);
9627 reading++;
9628 }
9629 } else {
9630@@ -1121,19 +1193,21 @@
9631 continue;
9632 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
9633 raid5_build_block(sh, sh->bh_old[i], i);
9634+ if (lowprio)
9635+ mark_buffer_lowprio(sh->bh_old[i]);
9636 reading++;
9637 }
9638 }
9639 sh->phase = PHASE_READ_OLD;
9640- sh->nr_pending = reading;
9641- PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, sh->nr_pending));
9642+ md_atomic_set(&sh->nr_pending, reading);
9643+ PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)));
9644 for (i = 0; i < disks; i++) {
9645 if (!sh->bh_old[i])
9646 continue;
9647 if (buffer_uptodate(sh->bh_old[i]))
9648 continue;
9649 clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
9650- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
9651+ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
9652 }
9653 } else {
9654 /*
9655@@ -1141,7 +1215,8 @@
9656 */
9657 method1 = nr_read - nr_cache_overwrite;
9658 lock_stripe(sh);
9659- sh->nr_pending++;
9660+		lowprio = is_stripe_lowprio(sh, disks);
9661+ atomic_inc(&sh->nr_pending);
9662
9663 PRINTK(("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1));
9664 if (!method1 || (method1 == 1 && nr_cache == disks - 1)) {
9665@@ -1149,18 +1224,22 @@
9666 for (i = 0; i < disks; i++) {
9667 if (!sh->bh_new[i])
9668 continue;
9669- if (!sh->bh_old[i])
9670+ if (!sh->bh_old[i]) {
9671 compute_block(sh, i);
9672+ if (lowprio)
9673+ mark_buffer_lowprio
9674+ (sh->bh_old[i]);
9675+ }
9676 memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
9677 }
9678- sh->nr_pending--;
9679+ atomic_dec(&sh->nr_pending);
9680 complete_stripe(sh);
9681 return;
9682 }
9683 if (nr_failed_overwrite) {
9684 sh->phase = PHASE_READ_OLD;
9685- sh->nr_pending = (disks - 1) - nr_cache;
9686- PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, sh->nr_pending));
9687+ md_atomic_set(&sh->nr_pending, (disks - 1) - nr_cache);
9688+ PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
9689 for (i = 0; i < disks; i++) {
9690 if (sh->bh_old[i])
9691 continue;
9692@@ -1168,13 +1247,16 @@
9693 continue;
9694 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
9695 raid5_build_block(sh, sh->bh_old[i], i);
9696+ if (lowprio)
9697+ mark_buffer_lowprio(sh->bh_old[i]);
9698 clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
9699- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
9700+ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
9701 }
9702 } else {
9703 sh->phase = PHASE_READ;
9704- sh->nr_pending = nr_read - nr_cache_overwrite;
9705- PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, sh->nr_pending));
9706+ md_atomic_set(&sh->nr_pending,
9707+ nr_read - nr_cache_overwrite);
9708+ PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
9709 for (i = 0; i < disks; i++) {
9710 if (!sh->bh_new[i])
9711 continue;
9712@@ -1182,16 +1264,16 @@
9713 memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
9714 continue;
9715 }
9716- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_req[i]);
9717+ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_req[i]);
9718 }
9719 }
9720 }
9721 }
9722
9723-static int raid5_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
9724+static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
9725 {
9726- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
9727- const unsigned int raid_disks = raid_conf->raid_disks;
9728+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
9729+ const unsigned int raid_disks = conf->raid_disks;
9730 const unsigned int data_disks = raid_disks - 1;
9731 unsigned int dd_idx, pd_idx;
9732 unsigned long new_sector;
9733@@ -1202,15 +1284,15 @@
9734 if (rw == WRITEA) rw = WRITE;
9735
9736 new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks,
9737- &dd_idx, &pd_idx, raid_conf);
9738+ &dd_idx, &pd_idx, conf);
9739
9740 PRINTK(("raid5_make_request, sector %lu\n", new_sector));
9741 repeat:
9742- sh = get_stripe(raid_conf, new_sector, bh->b_size);
9743+ sh = get_stripe(conf, new_sector, bh->b_size);
9744 if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) {
9745 PRINTK(("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd));
9746 lock_stripe(sh);
9747- if (!sh->nr_pending)
9748+ if (!md_atomic_read(&sh->nr_pending))
9749 handle_stripe(sh);
9750 goto repeat;
9751 }
9752@@ -1221,24 +1303,24 @@
9753 printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector);
9754 printk("raid5: bh %p, bh_new %p\n", bh, sh->bh_new[dd_idx]);
9755 lock_stripe(sh);
9756- md_wakeup_thread(raid_conf->thread);
9757+ md_wakeup_thread(conf->thread);
9758 wait_on_stripe(sh);
9759 goto repeat;
9760 }
9761 add_stripe_bh(sh, bh, dd_idx, rw);
9762
9763- md_wakeup_thread(raid_conf->thread);
9764+ md_wakeup_thread(conf->thread);
9765 return 0;
9766 }
9767
9768 static void unplug_devices(struct stripe_head *sh)
9769 {
9770 #if 0
9771- struct raid5_data *raid_conf = sh->raid_conf;
9772+ raid5_conf_t *conf = sh->raid_conf;
9773 int i;
9774
9775- for (i = 0; i < raid_conf->raid_disks; i++)
9776- unplug_device(blk_dev + MAJOR(raid_conf->disks[i].dev));
9777+ for (i = 0; i < conf->raid_disks; i++)
9778+ unplug_device(blk_dev + MAJOR(conf->disks[i].dev));
9779 #endif
9780 }
9781
9782@@ -1252,8 +1334,8 @@
9783 static void raid5d (void *data)
9784 {
9785 struct stripe_head *sh;
9786- struct raid5_data *raid_conf = data;
9787- struct md_dev *mddev = raid_conf->mddev;
9788+ raid5_conf_t *conf = data;
9789+ mddev_t *mddev = conf->mddev;
9790 int i, handled = 0, unplug = 0;
9791 unsigned long flags;
9792
9793@@ -1261,47 +1343,47 @@
9794
9795 if (mddev->sb_dirty) {
9796 mddev->sb_dirty = 0;
9797- md_update_sb((int) (mddev - md_dev));
9798+ md_update_sb(mddev);
9799 }
9800 for (i = 0; i < NR_HASH; i++) {
9801 repeat:
9802- sh = raid_conf->stripe_hashtbl[i];
9803+ sh = conf->stripe_hashtbl[i];
9804 for (; sh; sh = sh->hash_next) {
9805- if (sh->raid_conf != raid_conf)
9806+ if (sh->raid_conf != conf)
9807 continue;
9808 if (sh->phase == PHASE_COMPLETE)
9809 continue;
9810- if (sh->nr_pending)
9811+ if (md_atomic_read(&sh->nr_pending))
9812 continue;
9813- if (sh->sector == raid_conf->next_sector) {
9814- raid_conf->sector_count += (sh->size >> 9);
9815- if (raid_conf->sector_count >= 128)
9816+ if (sh->sector == conf->next_sector) {
9817+ conf->sector_count += (sh->size >> 9);
9818+ if (conf->sector_count >= 128)
9819 unplug = 1;
9820 } else
9821 unplug = 1;
9822 if (unplug) {
9823- PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, raid_conf->sector_count));
9824+ PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, conf->sector_count));
9825 unplug_devices(sh);
9826 unplug = 0;
9827- raid_conf->sector_count = 0;
9828+ conf->sector_count = 0;
9829 }
9830- raid_conf->next_sector = sh->sector + (sh->size >> 9);
9831+ conf->next_sector = sh->sector + (sh->size >> 9);
9832 handled++;
9833 handle_stripe(sh);
9834 goto repeat;
9835 }
9836 }
9837- if (raid_conf) {
9838- PRINTK(("%d stripes handled, nr_handle %d\n", handled, atomic_read(&raid_conf->nr_handle)));
9839+ if (conf) {
9840+ PRINTK(("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle)));
9841 save_flags(flags);
9842 cli();
9843- if (!atomic_read(&raid_conf->nr_handle))
9844- clear_bit(THREAD_WAKEUP, &raid_conf->thread->flags);
9845+ if (!md_atomic_read(&conf->nr_handle))
9846+ clear_bit(THREAD_WAKEUP, &conf->thread->flags);
9847+ restore_flags(flags);
9848 }
9849 PRINTK(("--- raid5d inactive\n"));
9850 }
9851
9852-#if SUPPORT_RECONSTRUCTION
9853 /*
9854 * Private kernel thread for parity reconstruction after an unclean
9855 * shutdown. Reconstruction on spare drives in case of a failed drive
9856@@ -1309,44 +1391,64 @@
9857 */
9858 static void raid5syncd (void *data)
9859 {
9860- struct raid5_data *raid_conf = data;
9861- struct md_dev *mddev = raid_conf->mddev;
9862+ raid5_conf_t *conf = data;
9863+ mddev_t *mddev = conf->mddev;
9864
9865- if (!raid_conf->resync_parity)
9866+ if (!conf->resync_parity)
9867+ return;
9868+ if (conf->resync_parity == 2)
9869+ return;
9870+ down(&mddev->recovery_sem);
9871+	if (md_do_sync(mddev, NULL)) {
9872+ up(&mddev->recovery_sem);
9873+ printk("raid5: resync aborted!\n");
9874 return;
9875- md_do_sync(mddev);
9876- raid_conf->resync_parity = 0;
9877+ }
9878+ conf->resync_parity = 0;
9879+ up(&mddev->recovery_sem);
9880+ printk("raid5: resync finished.\n");
9881 }
9882-#endif /* SUPPORT_RECONSTRUCTION */
9883
9884-static int __check_consistency (struct md_dev *mddev, int row)
9885+static int __check_consistency (mddev_t *mddev, int row)
9886 {
9887- struct raid5_data *raid_conf = mddev->private;
9888+ raid5_conf_t *conf = mddev->private;
9889 kdev_t dev;
9890 struct buffer_head *bh[MD_SB_DISKS], tmp;
9891- int i, rc = 0, nr = 0;
9892+ int i, rc = 0, nr = 0, count;
9893+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
9894
9895- if (raid_conf->working_disks != raid_conf->raid_disks)
9896+ if (conf->working_disks != conf->raid_disks)
9897 return 0;
9898 tmp.b_size = 4096;
9899 if ((tmp.b_data = (char *) get_free_page(GFP_KERNEL)) == NULL)
9900 return 0;
9901+ md_clear_page((unsigned long)tmp.b_data);
9902 memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
9903- for (i = 0; i < raid_conf->raid_disks; i++) {
9904- dev = raid_conf->disks[i].dev;
9905+ for (i = 0; i < conf->raid_disks; i++) {
9906+ dev = conf->disks[i].dev;
9907 set_blocksize(dev, 4096);
9908 if ((bh[i] = bread(dev, row / 4, 4096)) == NULL)
9909 break;
9910 nr++;
9911 }
9912- if (nr == raid_conf->raid_disks) {
9913- for (i = 1; i < nr; i++)
9914- xor_block(&tmp, bh[i]);
9915+ if (nr == conf->raid_disks) {
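		/* xor every data block into tmp, handing xor_block() at most
		 * MAX_XOR_BLOCKS buffers (result plus sources) per call */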
9916+ bh_ptr[0] = &tmp;
9917+ count = 1;
9918+ for (i = 1; i < nr; i++) {
9919+ bh_ptr[count++] = bh[i];
9920+ if (count == MAX_XOR_BLOCKS) {
9921+ xor_block(count, &bh_ptr[0]);
9922+ count = 1;
9923+ }
9924+ }
9925+ if (count != 1) {
9926+ xor_block(count, &bh_ptr[0]);
9927+ }
9928 if (memcmp(tmp.b_data, bh[0]->b_data, 4096))
9929 rc = 1;
9930 }
9931- for (i = 0; i < raid_conf->raid_disks; i++) {
9932- dev = raid_conf->disks[i].dev;
9933+ for (i = 0; i < conf->raid_disks; i++) {
9934+ dev = conf->disks[i].dev;
9935 if (bh[i]) {
9936 bforget(bh[i]);
9937 bh[i] = NULL;
9938@@ -1358,285 +1460,607 @@
9939 return rc;
9940 }
9941
9942-static int check_consistency (struct md_dev *mddev)
9943+static int check_consistency (mddev_t *mddev)
9944 {
9945- int size = mddev->sb->size;
9946- int row;
9947+ if (__check_consistency(mddev, 0))
9948+/*
9949+ * We are not checking this currently, as it's legitimate to have
9950+ * an inconsistent array at creation time.
9951+ */
9952+ return 0;
9953
9954- for (row = 0; row < size; row += size / 8)
9955- if (__check_consistency(mddev, row))
9956- return 1;
9957 return 0;
9958 }
9959
9960-static int raid5_run (int minor, struct md_dev *mddev)
9961+static int raid5_run (mddev_t *mddev)
9962 {
9963- struct raid5_data *raid_conf;
9964+ raid5_conf_t *conf;
9965 int i, j, raid_disk, memory;
9966- md_superblock_t *sb = mddev->sb;
9967- md_descriptor_t *descriptor;
9968- struct real_dev *realdev;
9969+ mdp_super_t *sb = mddev->sb;
9970+ mdp_disk_t *desc;
9971+ mdk_rdev_t *rdev;
9972+ struct disk_info *disk;
9973+ struct md_list_head *tmp;
9974+ int start_recovery = 0;
9975
9976 MOD_INC_USE_COUNT;
9977
9978 if (sb->level != 5 && sb->level != 4) {
9979- printk("raid5: %s: raid level not set to 4/5 (%d)\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
9980+ printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
9981 MOD_DEC_USE_COUNT;
9982 return -EIO;
9983 }
9984
9985- mddev->private = kmalloc (sizeof (struct raid5_data), GFP_KERNEL);
9986- if ((raid_conf = mddev->private) == NULL)
9987+ mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
9988+ if ((conf = mddev->private) == NULL)
9989 goto abort;
9990- memset (raid_conf, 0, sizeof (*raid_conf));
9991- raid_conf->mddev = mddev;
9992+ memset (conf, 0, sizeof (*conf));
9993+ conf->mddev = mddev;
9994
9995- if ((raid_conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
9996+ if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
9997 goto abort;
9998- memset(raid_conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
9999+ memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
10000
10001- init_waitqueue(&raid_conf->wait_for_stripe);
10002- PRINTK(("raid5_run(%d) called.\n", minor));
10003-
10004- for (i = 0; i < mddev->nb_dev; i++) {
10005- realdev = &mddev->devices[i];
10006- if (!realdev->sb) {
10007- printk(KERN_ERR "raid5: disabled device %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
10008- continue;
10009- }
10010+ init_waitqueue(&conf->wait_for_stripe);
10011+ PRINTK(("raid5_run(md%d) called.\n", mdidx(mddev)));
10012
10013+ ITERATE_RDEV(mddev,rdev,tmp) {
10014 /*
10015 * This is important -- we are using the descriptor on
10016 * the disk only to get a pointer to the descriptor on
10017 * the main superblock, which might be more recent.
10018 */
10019- descriptor = &sb->disks[realdev->sb->descriptor.number];
10020- if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
10021- printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", kdevname(realdev->dev));
10022+ desc = sb->disks + rdev->desc_nr;
10023+ raid_disk = desc->raid_disk;
10024+ disk = conf->disks + raid_disk;
10025+
10026+ if (disk_faulty(desc)) {
10027+ printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
10028+ if (!rdev->faulty) {
10029+ MD_BUG();
10030+ goto abort;
10031+ }
10032+ disk->number = desc->number;
10033+ disk->raid_disk = raid_disk;
10034+ disk->dev = rdev->dev;
10035+
10036+ disk->operational = 0;
10037+ disk->write_only = 0;
10038+ disk->spare = 0;
10039+ disk->used_slot = 1;
10040 continue;
10041 }
10042- if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
10043- if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
10044- printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", kdevname(realdev->dev));
10045- continue;
10046+ if (disk_active(desc)) {
10047+ if (!disk_sync(desc)) {
10048+ printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
10049+ MD_BUG();
10050+ goto abort;
10051 }
10052- raid_disk = descriptor->raid_disk;
10053- if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
10054- printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", kdevname(realdev->dev));
10055+ if (raid_disk > sb->raid_disks) {
10056+ printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
10057 continue;
10058 }
10059- if (raid_conf->disks[raid_disk].operational) {
10060- printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", kdevname(realdev->dev), raid_disk);
10061+ if (disk->operational) {
10062+ printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
10063 continue;
10064 }
10065- printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", kdevname(realdev->dev), raid_disk);
10066+ printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
10067
10068- raid_conf->disks[raid_disk].number = descriptor->number;
10069- raid_conf->disks[raid_disk].raid_disk = raid_disk;
10070- raid_conf->disks[raid_disk].dev = mddev->devices[i].dev;
10071- raid_conf->disks[raid_disk].operational = 1;
10072+ disk->number = desc->number;
10073+ disk->raid_disk = raid_disk;
10074+ disk->dev = rdev->dev;
10075+ disk->operational = 1;
10076+ disk->used_slot = 1;
10077
10078- raid_conf->working_disks++;
10079+ conf->working_disks++;
10080 } else {
10081 /*
10082 * Must be a spare disk ..
10083 */
10084- printk(KERN_INFO "raid5: spare disk %s\n", kdevname(realdev->dev));
10085- raid_disk = descriptor->raid_disk;
10086- raid_conf->disks[raid_disk].number = descriptor->number;
10087- raid_conf->disks[raid_disk].raid_disk = raid_disk;
10088- raid_conf->disks[raid_disk].dev = mddev->devices [i].dev;
10089-
10090- raid_conf->disks[raid_disk].operational = 0;
10091- raid_conf->disks[raid_disk].write_only = 0;
10092- raid_conf->disks[raid_disk].spare = 1;
10093- }
10094- }
10095- raid_conf->raid_disks = sb->raid_disks;
10096- raid_conf->failed_disks = raid_conf->raid_disks - raid_conf->working_disks;
10097- raid_conf->mddev = mddev;
10098- raid_conf->chunk_size = sb->chunk_size;
10099- raid_conf->level = sb->level;
10100- raid_conf->algorithm = sb->parity_algorithm;
10101- raid_conf->max_nr_stripes = NR_STRIPES;
10102+ printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
10103+ disk->number = desc->number;
10104+ disk->raid_disk = raid_disk;
10105+ disk->dev = rdev->dev;
10106
10107- if (raid_conf->working_disks != sb->raid_disks && sb->state != (1 << MD_SB_CLEAN)) {
10108- printk(KERN_ALERT "raid5: raid set %s not clean and not all disks are operational -- run ckraid\n", kdevname(MKDEV(MD_MAJOR, minor)));
10109- goto abort;
10110+ disk->operational = 0;
10111+ disk->write_only = 0;
10112+ disk->spare = 1;
10113+ disk->used_slot = 1;
10114+ }
10115 }
10116- if (!raid_conf->chunk_size || raid_conf->chunk_size % 4) {
10117- printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", raid_conf->chunk_size, kdevname(MKDEV(MD_MAJOR, minor)));
10118+
10119+ for (i = 0; i < MD_SB_DISKS; i++) {
10120+ desc = sb->disks + i;
10121+ raid_disk = desc->raid_disk;
10122+ disk = conf->disks + raid_disk;
10123+
10124+ if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
10125+ !conf->disks[raid_disk].used_slot) {
10126+
10127+ disk->number = desc->number;
10128+ disk->raid_disk = raid_disk;
10129+ disk->dev = MKDEV(0,0);
10130+
10131+ disk->operational = 0;
10132+ disk->write_only = 0;
10133+ disk->spare = 0;
10134+ disk->used_slot = 1;
10135+ }
10136+ }
10137+
10138+ conf->raid_disks = sb->raid_disks;
10139+ /*
10140+ * 0 for a fully functional array, 1 for a degraded array.
10141+ */
10142+ conf->failed_disks = conf->raid_disks - conf->working_disks;
10143+ conf->mddev = mddev;
10144+ conf->chunk_size = sb->chunk_size;
10145+ conf->level = sb->level;
10146+ conf->algorithm = sb->layout;
10147+ conf->max_nr_stripes = NR_STRIPES;
10148+
10149+#if 0
10150+ for (i = 0; i < conf->raid_disks; i++) {
10151+ if (!conf->disks[i].used_slot) {
10152+ MD_BUG();
10153+ goto abort;
10154+ }
10155+ }
10156+#endif
10157+ if (!conf->chunk_size || conf->chunk_size % 4) {
10158+ printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
10159 goto abort;
10160 }
10161- if (raid_conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
10162- printk(KERN_ERR "raid5: unsupported parity algorithm %d for %s\n", raid_conf->algorithm, kdevname(MKDEV(MD_MAJOR, minor)));
10163+ if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
10164+ printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
10165 goto abort;
10166 }
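/*
 * (The layout values this check admits are assumed to be the four
 * parity placements from raid5.h, ALGORITHM_LEFT_ASYMMETRIC (0)
 * through ALGORITHM_RIGHT_SYMMETRIC (3); anything above the last
 * one is rejected.)
 */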
10167- if (raid_conf->failed_disks > 1) {
10168- printk(KERN_ERR "raid5: not enough operational devices for %s (%d/%d failed)\n", kdevname(MKDEV(MD_MAJOR, minor)), raid_conf->failed_disks, raid_conf->raid_disks);
10169+ if (conf->failed_disks > 1) {
10170+ printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
10171 goto abort;
10172 }
10173
10174- if ((sb->state & (1 << MD_SB_CLEAN)) && check_consistency(mddev)) {
10175- printk(KERN_ERR "raid5: detected raid-5 xor inconsistenty -- run ckraid\n");
10176- sb->state |= 1 << MD_SB_ERRORS;
10177- goto abort;
10178+ if (conf->working_disks != sb->raid_disks) {
10179+ printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
10180+ start_recovery = 1;
10181 }
10182
10183- if ((raid_conf->thread = md_register_thread(raid5d, raid_conf)) == NULL) {
10184- printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
10185- goto abort;
10186+ if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) &&
10187+ check_consistency(mddev)) {
10188+ printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n");
10189+ sb->state &= ~(1 << MD_SB_CLEAN);
10190 }
10191
10192-#if SUPPORT_RECONSTRUCTION
10193- if ((raid_conf->resync_thread = md_register_thread(raid5syncd, raid_conf)) == NULL) {
10194- printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
10195- goto abort;
10196+ {
10197+ const char * name = "raid5d";
10198+
10199+ conf->thread = md_register_thread(raid5d, conf, name);
10200+ if (!conf->thread) {
10201+ printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
10202+ goto abort;
10203+ }
10204 }
10205-#endif /* SUPPORT_RECONSTRUCTION */
10206
10207- memory = raid_conf->max_nr_stripes * (sizeof(struct stripe_head) +
10208- raid_conf->raid_disks * (sizeof(struct buffer_head) +
10209+ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
10210+ conf->raid_disks * (sizeof(struct buffer_head) +
10211 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
10212- if (grow_stripes(raid_conf, raid_conf->max_nr_stripes, GFP_KERNEL)) {
10213+ if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
10214 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
10215- shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
10216+ shrink_stripes(conf, conf->max_nr_stripes);
10217 goto abort;
10218 } else
10219- printk(KERN_INFO "raid5: allocated %dkB for %s\n", memory, kdevname(MKDEV(MD_MAJOR, minor)));
10220+ printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
10221
10222 /*
10223 * Regenerate the "device is in sync with the raid set" bit for
10224 * each device.
10225 */
10226- for (i = 0; i < sb->nr_disks ; i++) {
10227- sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
10228+ for (i = 0; i < MD_SB_DISKS ; i++) {
10229+ mark_disk_nonsync(sb->disks + i);
10230 for (j = 0; j < sb->raid_disks; j++) {
10231- if (!raid_conf->disks[j].operational)
10232+ if (!conf->disks[j].operational)
10233 continue;
10234- if (sb->disks[i].number == raid_conf->disks[j].number)
10235- sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
10236+ if (sb->disks[i].number == conf->disks[j].number)
10237+ mark_disk_sync(sb->disks + i);
10238 }
10239 }
10240- sb->active_disks = raid_conf->working_disks;
10241+ sb->active_disks = conf->working_disks;
10242
10243 if (sb->active_disks == sb->raid_disks)
10244- printk("raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
10245+ printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
10246 else
10247- printk(KERN_ALERT "raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
10248+ printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
10249+
10250+ if (!start_recovery && ((sb->state & (1 << MD_SB_CLEAN))==0)) {
10251+ const char * name = "raid5syncd";
10252+
10253+ conf->resync_thread = md_register_thread(raid5syncd, conf,name);
10254+ if (!conf->resync_thread) {
10255+ printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
10256+ goto abort;
10257+ }
10258
10259- if ((sb->state & (1 << MD_SB_CLEAN)) == 0) {
10260- printk("raid5: raid set %s not clean; re-constructing parity\n", kdevname(MKDEV(MD_MAJOR, minor)));
10261- raid_conf->resync_parity = 1;
10262-#if SUPPORT_RECONSTRUCTION
10263- md_wakeup_thread(raid_conf->resync_thread);
10264-#endif /* SUPPORT_RECONSTRUCTION */
10265+ printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
10266+ conf->resync_parity = 1;
10267+ md_wakeup_thread(conf->resync_thread);
10268 }
10269
10270+ print_raid5_conf(conf);
10271+ if (start_recovery)
10272+ md_recover_arrays();
10273+ print_raid5_conf(conf);
10274+
10275 /* Ok, everything is just fine now */
10276 return (0);
10277 abort:
10278- if (raid_conf) {
10279- if (raid_conf->stripe_hashtbl)
10280- free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
10281- kfree(raid_conf);
10282+ if (conf) {
10283+ print_raid5_conf(conf);
10284+ if (conf->stripe_hashtbl)
10285+ free_pages((unsigned long) conf->stripe_hashtbl,
10286+ HASH_PAGES_ORDER);
10287+ kfree(conf);
10288 }
10289 mddev->private = NULL;
10290- printk(KERN_ALERT "raid5: failed to run raid set %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
10291+ printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
10292 MOD_DEC_USE_COUNT;
10293 return -EIO;
10294 }
10295
10296-static int raid5_stop (int minor, struct md_dev *mddev)
10297+static int raid5_stop_resync (mddev_t *mddev)
10298+{
10299+ raid5_conf_t *conf = mddev_to_conf(mddev);
10300+ mdk_thread_t *thread = conf->resync_thread;
10301+
10302+ if (thread) {
10303+ if (conf->resync_parity) {
10304+ conf->resync_parity = 2;
10305+ md_interrupt_thread(thread);
10306+ printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
10307+ return 1;
10308+ }
10309+ return 0;
10310+ }
10311+ return 0;
10312+}
10313+
10314+static int raid5_restart_resync (mddev_t *mddev)
10315 {
10316- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
10317+ raid5_conf_t *conf = mddev_to_conf(mddev);
10318
10319- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
10320- shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
10321- md_unregister_thread(raid_conf->thread);
10322-#if SUPPORT_RECONSTRUCTION
10323- md_unregister_thread(raid_conf->resync_thread);
10324-#endif /* SUPPORT_RECONSTRUCTION */
10325- free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
10326- kfree(raid_conf);
10327+ if (conf->resync_parity) {
10328+ if (!conf->resync_thread) {
10329+ MD_BUG();
10330+ return 0;
10331+ }
10332+ printk("raid5: waking up raid5resync.\n");
10333+ conf->resync_parity = 1;
10334+ md_wakeup_thread(conf->resync_thread);
10335+ return 1;
10336+ } else
10337+ printk("raid5: no restart-resync needed.\n");
10338+ return 0;
10339+}
10340+
10341+
10342+static int raid5_stop (mddev_t *mddev)
10343+{
10344+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
10345+
10346+ shrink_stripe_cache(conf, conf->max_nr_stripes);
10347+ shrink_stripes(conf, conf->max_nr_stripes);
10348+ md_unregister_thread(conf->thread);
10349+ if (conf->resync_thread)
10350+ md_unregister_thread(conf->resync_thread);
10351+ free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
10352+ kfree(conf);
10353 mddev->private = NULL;
10354 MOD_DEC_USE_COUNT;
10355 return 0;
10356 }
10357
10358-static int raid5_status (char *page, int minor, struct md_dev *mddev)
10359+static int raid5_status (char *page, mddev_t *mddev)
10360 {
10361- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
10362- md_superblock_t *sb = mddev->sb;
10363+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
10364+ mdp_super_t *sb = mddev->sb;
10365 int sz = 0, i;
10366
10367- sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->parity_algorithm);
10368- sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
10369- for (i = 0; i < raid_conf->raid_disks; i++)
10370- sz += sprintf (page+sz, "%s", raid_conf->disks[i].operational ? "U" : "_");
10371+ sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
10372+ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
10373+ for (i = 0; i < conf->raid_disks; i++)
10374+ sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
10375 sz += sprintf (page+sz, "]");
10376 return sz;
10377 }
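/*
 * With illustrative values, a healthy three-disk set using a 64k
 * chunk renders the status line built above as:
 *
 *	 level 5, 64k chunk, algorithm 2 [3/3] [UUU]
 */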
10378
10379-static int raid5_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, int state)
10380+static void print_raid5_conf (raid5_conf_t *conf)
10381+{
10382+ int i;
10383+ struct disk_info *tmp;
10384+
10385+ printk("RAID5 conf printout:\n");
10386+ if (!conf) {
10387+ printk("(conf==NULL)\n");
10388+ return;
10389+ }
10390+ printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
10391+ conf->working_disks, conf->failed_disks);
10392+
10393+ for (i = 0; i < MD_SB_DISKS; i++) {
10394+ tmp = conf->disks + i;
10395+ printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
10396+ i, tmp->spare,tmp->operational,
10397+ tmp->number,tmp->raid_disk,tmp->used_slot,
10398+ partition_name(tmp->dev));
10399+ }
10400+}
10401+
10402+static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
10403 {
10404- int i = 0, failed_disk = -1;
10405- struct raid5_data *raid_conf = mddev->private;
10406- struct disk_info *disk = raid_conf->disks;
10407+ int err = 0;
10408+ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
10409+ raid5_conf_t *conf = mddev->private;
10410+ struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
10411 unsigned long flags;
10412- md_superblock_t *sb = mddev->sb;
10413- md_descriptor_t *descriptor;
10414+ mdp_super_t *sb = mddev->sb;
10415+ mdp_disk_t *failed_desc, *spare_desc, *added_desc;
10416
10417- for (i = 0; i < MD_SB_DISKS; i++, disk++) {
10418- if (disk->spare && disk->number == spare->number)
10419- goto found;
10420- }
10421- return 1;
10422-found:
10423- for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
10424- if (!disk->operational)
10425- failed_disk = i;
10426- if (failed_disk == -1)
10427- return 1;
10428 save_flags(flags);
10429 cli();
10430+
10431+ print_raid5_conf(conf);
10432+ /*
10433+ * find the disk ...
10434+ */
10435 switch (state) {
10436- case SPARE_WRITE:
10437- disk->operational = 1;
10438- disk->write_only = 1;
10439- raid_conf->spare = disk;
10440- break;
10441- case SPARE_INACTIVE:
10442- disk->operational = 0;
10443- disk->write_only = 0;
10444- raid_conf->spare = NULL;
10445- break;
10446- case SPARE_ACTIVE:
10447- disk->spare = 0;
10448- disk->write_only = 0;
10449
10450- descriptor = &sb->disks[raid_conf->disks[failed_disk].number];
10451- i = spare->raid_disk;
10452- disk->raid_disk = spare->raid_disk = descriptor->raid_disk;
10453- if (disk->raid_disk != failed_disk)
10454- printk("raid5: disk->raid_disk != failed_disk");
10455- descriptor->raid_disk = i;
10456-
10457- raid_conf->spare = NULL;
10458- raid_conf->working_disks++;
10459- raid_conf->failed_disks--;
10460- raid_conf->disks[failed_disk] = *disk;
10461- break;
10462- default:
10463- printk("raid5_mark_spare: bug: state == %d\n", state);
10464- restore_flags(flags);
10465- return 1;
10466+ case DISKOP_SPARE_ACTIVE:
10467+
10468+ /*
10469+ * Find the failed disk within the RAID5 configuration ...
10470+ * (this can only be in the first conf->raid_disks part)
10471+ */
10472+ for (i = 0; i < conf->raid_disks; i++) {
10473+ tmp = conf->disks + i;
10474+ if ((!tmp->operational && !tmp->spare) ||
10475+ !tmp->used_slot) {
10476+ failed_disk = i;
10477+ break;
10478+ }
10479+ }
10480+ /*
10481+ * When we activate a spare disk we _must_ have a disk in
10482+ * the lower (active) part of the array to replace.
10483+ */
10484+ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
10485+ MD_BUG();
10486+ err = 1;
10487+ goto abort;
10488+ }
10489+ /* fall through */
10490+
10491+ case DISKOP_SPARE_WRITE:
10492+ case DISKOP_SPARE_INACTIVE:
10493+
10494+ /*
10495+ * Find the spare disk ... (can only be in the 'high'
10496+ * area of the array)
10497+ */
10498+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
10499+ tmp = conf->disks + i;
10500+ if (tmp->spare && tmp->number == (*d)->number) {
10501+ spare_disk = i;
10502+ break;
10503+ }
10504+ }
10505+ if (spare_disk == -1) {
10506+ MD_BUG();
10507+ err = 1;
10508+ goto abort;
10509+ }
10510+ break;
10511+
10512+ case DISKOP_HOT_REMOVE_DISK:
10513+
10514+ for (i = 0; i < MD_SB_DISKS; i++) {
10515+ tmp = conf->disks + i;
10516+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
10517+ if (tmp->operational) {
10518+ err = -EBUSY;
10519+ goto abort;
10520+ }
10521+ removed_disk = i;
10522+ break;
10523+ }
10524+ }
10525+ if (removed_disk == -1) {
10526+ MD_BUG();
10527+ err = 1;
10528+ goto abort;
10529+ }
10530+ break;
10531+
10532+ case DISKOP_HOT_ADD_DISK:
10533+
10534+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
10535+ tmp = conf->disks + i;
10536+ if (!tmp->used_slot) {
10537+ added_disk = i;
10538+ break;
10539+ }
10540+ }
10541+ if (added_disk == -1) {
10542+ MD_BUG();
10543+ err = 1;
10544+ goto abort;
10545+ }
10546+ break;
10547+ }
10548+
10549+ switch (state) {
10550+ /*
10551+ * Switch the spare disk to write-only mode:
10552+ */
10553+ case DISKOP_SPARE_WRITE:
10554+ if (conf->spare) {
10555+ MD_BUG();
10556+ err = 1;
10557+ goto abort;
10558+ }
10559+ sdisk = conf->disks + spare_disk;
10560+ sdisk->operational = 1;
10561+ sdisk->write_only = 1;
10562+ conf->spare = sdisk;
10563+ break;
10564+ /*
10565+ * Deactivate a spare disk:
10566+ */
10567+ case DISKOP_SPARE_INACTIVE:
10568+ sdisk = conf->disks + spare_disk;
10569+ sdisk->operational = 0;
10570+ sdisk->write_only = 0;
10571+ /*
10572+ * Was the spare being resynced?
10573+ */
10574+ if (conf->spare == sdisk)
10575+ conf->spare = NULL;
10576+ break;
10577+ /*
10578+ * Activate (mark read-write) the (now in-sync) spare disk,
10579+ * which means we switch its 'raid position' (->raid_disk)
10580+ * with the failed disk. (only the first 'conf->raid_disks'
10581+ * slots are used for 'real' disks and we must preserve this
10582+ * property)
10583+ */
10584+ case DISKOP_SPARE_ACTIVE:
10585+ if (!conf->spare) {
10586+ MD_BUG();
10587+ err = 1;
10588+ goto abort;
10589+ }
10590+ sdisk = conf->disks + spare_disk;
10591+ fdisk = conf->disks + failed_disk;
10592+
10593+ spare_desc = &sb->disks[sdisk->number];
10594+ failed_desc = &sb->disks[fdisk->number];
10595+
10596+ if (spare_desc != *d) {
10597+ MD_BUG();
10598+ err = 1;
10599+ goto abort;
10600+ }
10601+
10602+ if (spare_desc->raid_disk != sdisk->raid_disk) {
10603+ MD_BUG();
10604+ err = 1;
10605+ goto abort;
10606+ }
10607+
10608+ if (sdisk->raid_disk != spare_disk) {
10609+ MD_BUG();
10610+ err = 1;
10611+ goto abort;
10612+ }
10613+
10614+ if (failed_desc->raid_disk != fdisk->raid_disk) {
10615+ MD_BUG();
10616+ err = 1;
10617+ goto abort;
10618+ }
10619+
10620+ if (fdisk->raid_disk != failed_disk) {
10621+ MD_BUG();
10622+ err = 1;
10623+ goto abort;
10624+ }
10625+
10626+ /*
10627+ * do the switch finally
10628+ */
10629+ xchg_values(*spare_desc, *failed_desc);
10630+ xchg_values(*fdisk, *sdisk);
10631+
10632+ /*
10633+ * (careful, 'failed' and 'spare' are switched from now on)
10634+ *
10635+ * we want to preserve linear numbering and we want to
10636+ * give the proper raid_disk number to the now activated
10637+ * disk. (this means we switch back these values)
10638+ */
10639+
10640+ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
10641+ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
10642+ xchg_values(spare_desc->number, failed_desc->number);
10643+ xchg_values(sdisk->number, fdisk->number);
10644+
10645+ *d = failed_desc;
10646+
10647+ if (sdisk->dev == MKDEV(0,0))
10648+ sdisk->used_slot = 0;
10649+
10650+ /*
10651+ * this really activates the spare.
10652+ */
10653+ fdisk->spare = 0;
10654+ fdisk->write_only = 0;
10655+
10656+ /*
10657+ * if we activate a spare, we definitely replace a
10658+ * non-operational disk slot in the 'low' area of
10659+ * the disk array.
10660+ */
10661+ conf->failed_disks--;
10662+ conf->working_disks++;
10663+ conf->spare = NULL;
10664+
10665+ break;
10666+
10667+ case DISKOP_HOT_REMOVE_DISK:
10668+ rdisk = conf->disks + removed_disk;
10669+
10670+ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
10671+ MD_BUG();
10672+ err = 1;
10673+ goto abort;
10674+ }
10675+ rdisk->dev = MKDEV(0,0);
10676+ rdisk->used_slot = 0;
10677+
10678+ break;
10679+
10680+ case DISKOP_HOT_ADD_DISK:
10681+ adisk = conf->disks + added_disk;
10682+ added_desc = *d;
10683+
10684+ if (added_disk != added_desc->number) {
10685+ MD_BUG();
10686+ err = 1;
10687+ goto abort;
10688+ }
10689+
10690+ adisk->number = added_desc->number;
10691+ adisk->raid_disk = added_desc->raid_disk;
10692+ adisk->dev = MKDEV(added_desc->major,added_desc->minor);
10693+
10694+ adisk->operational = 0;
10695+ adisk->write_only = 0;
10696+ adisk->spare = 1;
10697+ adisk->used_slot = 1;
10698+
10699+
10700+ break;
10701+
10702+ default:
10703+ MD_BUG();
10704+ err = 1;
10705+ goto abort;
10706 }
10707+abort:
10708 restore_flags(flags);
10709- return 0;
10710+ print_raid5_conf(conf);
10711+ return err;
10712 }
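/*
 * xchg_values(), used for the descriptor/slot switch above, is
 * assumed to be the usual swap-by-temporary helper from the md
 * headers; a minimal sketch of the expected shape:
 *
 *	#define xchg_values(x, y) do { \
 *		__typeof__(x) __tmp = x; x = y; y = __tmp; \
 *	} while (0)
 */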
10713
10714-static struct md_personality raid5_personality=
10715+static mdk_personality_t raid5_personality=
10716 {
10717 "raid5",
10718 raid5_map,
10719@@ -1648,14 +2072,19 @@
10720 NULL, /* no ioctls */
10721 0,
10722 raid5_error,
10723- /* raid5_hot_add_disk, */ NULL,
10724- /* raid1_hot_remove_drive */ NULL,
10725- raid5_mark_spare
10726+ raid5_diskop,
10727+ raid5_stop_resync,
10728+ raid5_restart_resync
10729 };
10730
10731 int raid5_init (void)
10732 {
10733- return register_md_personality (RAID5, &raid5_personality);
10734+ int err;
10735+
10736+ err = register_md_personality (RAID5, &raid5_personality);
10737+ if (err)
10738+ return err;
10739+ return 0;
10740 }
10741
10742 #ifdef MODULE
10743diff -ruN linux.orig/drivers/block/translucent.c linux-2.2.16/drivers/block/translucent.c
10744--- linux.orig/drivers/block/translucent.c Thu Jan 1 01:00:00 1970
10745+++ linux-2.2.16/drivers/block/translucent.c Fri Jun 9 11:37:45 2000
10746@@ -0,0 +1,136 @@
10747+/*
10748+ translucent.c : Translucent RAID driver for Linux
10749+ Copyright (C) 1998 Ingo Molnar
10750+
10751+ Translucent mode management functions.
10752+
10753+ This program is free software; you can redistribute it and/or modify
10754+ it under the terms of the GNU General Public License as published by
10755+ the Free Software Foundation; either version 2, or (at your option)
10756+ any later version.
10757+
10758+ You should have received a copy of the GNU General Public License
10759+ (for example /usr/src/linux/COPYING); if not, write to the Free
10760+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
10761+*/
10762+
10763+#include <linux/module.h>
10764+
10765+#include <linux/raid/md.h>
10766+#include <linux/malloc.h>
10767+
10768+#include <linux/raid/translucent.h>
10769+
10770+#define MAJOR_NR MD_MAJOR
10771+#define MD_DRIVER
10772+#define MD_PERSONALITY
10773+
10774+static int translucent_run (mddev_t *mddev)
10775+{
10776+ translucent_conf_t *conf;
10777+ mdk_rdev_t *rdev;
10778+ int i;
10779+
10780+ MOD_INC_USE_COUNT;
10781+
10782+ conf = kmalloc (sizeof (*conf), GFP_KERNEL);
10783+ if (!conf)
10784+ goto out;
10785+ mddev->private = conf;
10786+
10787+ if (mddev->nb_dev != 2) {
10788+ printk("translucent: this mode needs 2 disks, aborting!\n");
10789+ goto out;
10790+ }
10791+
10792+ if (md_check_ordering(mddev)) {
10793+ printk("translucent: disks are not ordered, aborting!\n");
10794+ goto out;
10795+ }
10796+
10797+ ITERATE_RDEV_ORDERED(mddev,rdev,i) {
10798+ dev_info_t *disk = conf->disks + i;
10799+
10800+ disk->dev = rdev->dev;
10801+ disk->size = rdev->size;
10802+ }
10803+
10804+ return 0;
10805+
10806+out:
10807+ if (conf)
10808+ kfree(conf);
10809+
10810+ MOD_DEC_USE_COUNT;
10811+ return 1;
10812+}
10813+
10814+static int translucent_stop (mddev_t *mddev)
10815+{
10816+ translucent_conf_t *conf = mddev_to_conf(mddev);
10817+
10818+ kfree(conf);
10819+
10820+ MOD_DEC_USE_COUNT;
10821+
10822+ return 0;
10823+}
10824+
10825+
10826+static int translucent_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
10827+ unsigned long *rsector, unsigned long size)
10828+{
10829+ translucent_conf_t *conf = mddev_to_conf(mddev);
10830+
10831+ *rdev = conf->disks[0].dev;
10832+
10833+ return 0;
10834+}
10835+
10836+static int translucent_status (char *page, mddev_t *mddev)
10837+{
10838+ int sz = 0;
10839+
10840+ sz += sprintf(page+sz, " %d%% full", 10);
10841+ return sz;
10842+}
10843+
10844+
10845+static mdk_personality_t translucent_personality=
10846+{
10847+ "translucent",
10848+ translucent_map,
10849+ NULL,
10850+ NULL,
10851+ translucent_run,
10852+ translucent_stop,
10853+ translucent_status,
10854+ NULL,
10855+ 0,
10856+ NULL,
10857+ NULL,
10858+ NULL,
10859+ NULL
10860+};
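/*
 * Matching the slot order of raid5_personality above, the NULL/0
 * entries here are assumed to be the unimplemented make_request,
 * end_request, ioctl, max_ioctl, error_handler, diskop,
 * stop_resync and restart_resync hooks -- consistent with the
 * stubbed translucent_map and translucent_status above.
 */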
10861+
10862+#ifndef MODULE
10863+
10864+md__initfunc(void translucent_init (void))
10865+{
10866+ register_md_personality (TRANSLUCENT, &translucent_personality);
10867+}
10868+
10869+#else
10870+
10871+int init_module (void)
10872+{
10873+ return (register_md_personality (TRANSLUCENT, &translucent_personality));
10874+}
10875+
10876+void cleanup_module (void)
10877+{
10878+ unregister_md_personality (TRANSLUCENT);
10879+}
10880+
10881+#endif
10882+
10883diff -ruN linux.orig/drivers/block/xor.c linux-2.2.16/drivers/block/xor.c
10884--- linux.orig/drivers/block/xor.c Thu Jan 1 01:00:00 1970
10885+++ linux-2.2.16/drivers/block/xor.c Fri Jun 9 11:37:45 2000
10886@@ -0,0 +1,1894 @@
10887+/*
10888+ * xor.c : Multiple Devices driver for Linux
10889+ *
10890+ * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek
10891+ *
10892+ *
10893+ * optimized RAID-5 checksumming functions.
10894+ *
10895+ * This program is free software; you can redistribute it and/or modify
10896+ * it under the terms of the GNU General Public License as published by
10897+ * the Free Software Foundation; either version 2, or (at your option)
10898+ * any later version.
10899+ *
10900+ * You should have received a copy of the GNU General Public License
10901+ * (for example /usr/src/linux/COPYING); if not, write to the Free
10902+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
10903+ */
10904+#include <linux/module.h>
10905+#include <linux/raid/md.h>
10906+#ifdef __sparc_v9__
10907+#include <asm/head.h>
10908+#include <asm/asi.h>
10909+#include <asm/visasm.h>
10910+#endif
10911+
10912+/*
10913+ * we use the 'XOR function template' to register multiple xor
10914+ * functions at runtime. The kernel measures their speed upon bootup
10915+ * and decides which one to use. (compile-time registration is
10916+ * not enough, as certain CPU features like MMX can only be detected
10917+ * at runtime)
10918+ *
10919+ * this architecture makes it pretty easy to add new routines
10920+ * that are faster on certain CPUs, without killing other CPUs'
10921+ * 'native' routines. The current routines are believed to be the
10922+ * physically fastest ones on all CPUs tested, but
10923+ * feel free to prove me wrong and add yet another routine =B-)
10924+ * --mingo
10925+ */
10926+
10927+#define MAX_XOR_BLOCKS 5
10928+
10929+#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr)
10930+
10931+typedef void (*xor_block_t) XOR_ARGS;
10932+xor_block_t xor_block = NULL;
10933+
10934+#ifndef __sparc_v9__
10935+
10936+struct xor_block_template;
10937+
10938+struct xor_block_template {
10939+ char * name;
10940+ xor_block_t xor_block;
10941+ int speed;
10942+ struct xor_block_template * next;
10943+};
10944+
10945+struct xor_block_template * xor_functions = NULL;
10946+
10947+#define XORBLOCK_TEMPLATE(x) \
10948+static void xor_block_##x XOR_ARGS; \
10949+static struct xor_block_template t_xor_block_##x = \
10950+ { #x, xor_block_##x, 0, NULL }; \
10951+static void xor_block_##x XOR_ARGS
10952+
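/*
 * A minimal sketch, not part of this patch, of what a new routine
 * plugged in through XORBLOCK_TEMPLATE above could look like -- a
 * plain 32-bit integer fallback (the name is hypothetical):
 *
 *	XORBLOCK_TEMPLATE(int32_sketch)
 *	{
 *		int words = bh_ptr[0]->b_size / 4;
 *		u32 *dest = (u32 *) bh_ptr[0]->b_data;
 *		unsigned int i, j;
 *
 *		for (i = 1; i < count; i++) {
 *			u32 *src = (u32 *) bh_ptr[i]->b_data;
 *			for (j = 0; j < words; j++)
 *				dest[j] ^= src[j];
 *		}
 *	}
 */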
10953+#ifdef __i386__
10954+
10955+#ifdef CONFIG_X86_XMM
10956+/*
10957+ * Cache avoiding checksumming functions utilizing KNI instructions
10958+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
10959+ */
10960+
10961+XORBLOCK_TEMPLATE(pIII_kni)
10962+{
10963+ char xmm_save[16*4];
10964+ int cr0;
10965+ int lines = (bh_ptr[0]->b_size>>8);
10966+
10967+ __asm__ __volatile__ (
10968+ "movl %%cr0,%0 ;\n\t"
10969+ "clts ;\n\t"
10970+ "movups %%xmm0,(%1) ;\n\t"
10971+ "movups %%xmm1,0x10(%1) ;\n\t"
10972+ "movups %%xmm2,0x20(%1) ;\n\t"
10973+ "movups %%xmm3,0x30(%1) ;\n\t"
10974+ : "=r" (cr0)
10975+ : "r" (xmm_save)
10976+ : "memory" );
10977+
10978+#define OFFS(x) "8*("#x"*2)"
10979+#define PF0(x) \
10980+ " prefetcht0 "OFFS(x)"(%1) ;\n"
10981+#define LD(x,y) \
10982+ " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
10983+#define ST(x,y) \
10984+ " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
10985+#define PF1(x) \
10986+ " prefetchnta "OFFS(x)"(%2) ;\n"
10987+#define PF2(x) \
10988+ " prefetchnta "OFFS(x)"(%3) ;\n"
10989+#define PF3(x) \
10990+ " prefetchnta "OFFS(x)"(%4) ;\n"
10991+#define PF4(x) \
10992+ " prefetchnta "OFFS(x)"(%5) ;\n"
10993+#define PF5(x) \
10994+ " prefetchnta "OFFS(x)"(%6) ;\n"
10995+#define XO1(x,y) \
10996+ " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
10997+#define XO2(x,y) \
10998+ " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
10999+#define XO3(x,y) \
11000+ " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
11001+#define XO4(x,y) \
11002+ " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
11003+#define XO5(x,y) \
11004+ " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
11005+
11006+ switch(count) {
11007+ case 2:
11008+ __asm__ __volatile__ (
11009+#undef BLOCK
11010+#define BLOCK(i) \
11011+ LD(i,0) \
11012+ LD(i+1,1) \
11013+ PF1(i) \
11014+ PF1(i+2) \
11015+ LD(i+2,2) \
11016+ LD(i+3,3) \
11017+ PF0(i+4) \
11018+ PF0(i+6) \
11019+ XO1(i,0) \
11020+ XO1(i+1,1) \
11021+ XO1(i+2,2) \
11022+ XO1(i+3,3) \
11023+ ST(i,0) \
11024+ ST(i+1,1) \
11025+ ST(i+2,2) \
11026+ ST(i+3,3) \
11027+
11028+
11029+ PF0(0)
11030+ PF0(2)
11031+
11032+ " .align 32,0x90 ;\n"
11033+ " 1: ;\n"
11034+
11035+ BLOCK(0)
11036+ BLOCK(4)
11037+ BLOCK(8)
11038+ BLOCK(12)
11039+
11040+ " addl $256, %1 ;\n"
11041+ " addl $256, %2 ;\n"
11042+ " decl %0 ;\n"
11043+ " jnz 1b ;\n"
11044+
11045+ :
11046+ : "r" (lines),
11047+ "r" (bh_ptr[0]->b_data),
11048+ "r" (bh_ptr[1]->b_data)
11049+ : "memory" );
11050+ break;
11051+ case 3:
11052+ __asm__ __volatile__ (
11053+#undef BLOCK
11054+#define BLOCK(i) \
11055+ PF1(i) \
11056+ PF1(i+2) \
11057+ LD(i,0) \
11058+ LD(i+1,1) \
11059+ LD(i+2,2) \
11060+ LD(i+3,3) \
11061+ PF2(i) \
11062+ PF2(i+2) \
11063+ PF0(i+4) \
11064+ PF0(i+6) \
11065+ XO1(i,0) \
11066+ XO1(i+1,1) \
11067+ XO1(i+2,2) \
11068+ XO1(i+3,3) \
11069+ XO2(i,0) \
11070+ XO2(i+1,1) \
11071+ XO2(i+2,2) \
11072+ XO2(i+3,3) \
11073+ ST(i,0) \
11074+ ST(i+1,1) \
11075+ ST(i+2,2) \
11076+ ST(i+3,3) \
11077+
11078+
11079+ PF0(0)
11080+ PF0(2)
11081+
11082+ " .align 32,0x90 ;\n"
11083+ " 1: ;\n"
11084+
11085+ BLOCK(0)
11086+ BLOCK(4)
11087+ BLOCK(8)
11088+ BLOCK(12)
11089+
11090+ " addl $256, %1 ;\n"
11091+ " addl $256, %2 ;\n"
11092+ " addl $256, %3 ;\n"
11093+ " decl %0 ;\n"
11094+ " jnz 1b ;\n"
11095+ :
11096+ : "r" (lines),
11097+ "r" (bh_ptr[0]->b_data),
11098+ "r" (bh_ptr[1]->b_data),
11099+ "r" (bh_ptr[2]->b_data)
11100+ : "memory" );
11101+ break;
11102+ case 4:
11103+ __asm__ __volatile__ (
11104+#undef BLOCK
11105+#define BLOCK(i) \
11106+ PF1(i) \
11107+ PF1(i+2) \
11108+ LD(i,0) \
11109+ LD(i+1,1) \
11110+ LD(i+2,2) \
11111+ LD(i+3,3) \
11112+ PF2(i) \
11113+ PF2(i+2) \
11114+ XO1(i,0) \
11115+ XO1(i+1,1) \
11116+ XO1(i+2,2) \
11117+ XO1(i+3,3) \
11118+ PF3(i) \
11119+ PF3(i+2) \
11120+ PF0(i+4) \
11121+ PF0(i+6) \
11122+ XO2(i,0) \
11123+ XO2(i+1,1) \
11124+ XO2(i+2,2) \
11125+ XO2(i+3,3) \
11126+ XO3(i,0) \
11127+ XO3(i+1,1) \
11128+ XO3(i+2,2) \
11129+ XO3(i+3,3) \
11130+ ST(i,0) \
11131+ ST(i+1,1) \
11132+ ST(i+2,2) \
11133+ ST(i+3,3) \
11134+
11135+
11136+ PF0(0)
11137+ PF0(2)
11138+
11139+ " .align 32,0x90 ;\n"
11140+ " 1: ;\n"
11141+
11142+ BLOCK(0)
11143+ BLOCK(4)
11144+ BLOCK(8)
11145+ BLOCK(12)
11146+
11147+ " addl $256, %1 ;\n"
11148+ " addl $256, %2 ;\n"
11149+ " addl $256, %3 ;\n"
11150+ " addl $256, %4 ;\n"
11151+ " decl %0 ;\n"
11152+ " jnz 1b ;\n"
11153+
11154+ :
11155+ : "r" (lines),
11156+ "r" (bh_ptr[0]->b_data),
11157+ "r" (bh_ptr[1]->b_data),
11158+ "r" (bh_ptr[2]->b_data),
11159+ "r" (bh_ptr[3]->b_data)
11160+ : "memory" );
11161+ break;
11162+ case 5:
11163+ __asm__ __volatile__ (
11164+#undef BLOCK
11165+#define BLOCK(i) \
11166+ PF1(i) \
11167+ PF1(i+2) \
11168+ LD(i,0) \
11169+ LD(i+1,1) \
11170+ LD(i+2,2) \
11171+ LD(i+3,3) \
11172+ PF2(i) \
11173+ PF2(i+2) \
11174+ XO1(i,0) \
11175+ XO1(i+1,1) \
11176+ XO1(i+2,2) \
11177+ XO1(i+3,3) \
11178+ PF3(i) \
11179+ PF3(i+2) \
11180+ XO2(i,0) \
11181+ XO2(i+1,1) \
11182+ XO2(i+2,2) \
11183+ XO2(i+3,3) \
11184+ PF4(i) \
11185+ PF4(i+2) \
11186+ PF0(i+4) \
11187+ PF0(i+6) \
11188+ XO3(i,0) \
11189+ XO3(i+1,1) \
11190+ XO3(i+2,2) \
11191+ XO3(i+3,3) \
11192+ XO4(i,0) \
11193+ XO4(i+1,1) \
11194+ XO4(i+2,2) \
11195+ XO4(i+3,3) \
11196+ ST(i,0) \
11197+ ST(i+1,1) \
11198+ ST(i+2,2) \
11199+ ST(i+3,3) \
11200+
11201+
11202+ PF0(0)
11203+ PF0(2)
11204+
11205+ " .align 32,0x90 ;\n"
11206+ " 1: ;\n"
11207+
11208+ BLOCK(0)
11209+ BLOCK(4)
11210+ BLOCK(8)
11211+ BLOCK(12)
11212+
11213+ " addl $256, %1 ;\n"
11214+ " addl $256, %2 ;\n"
11215+ " addl $256, %3 ;\n"
11216+ " addl $256, %4 ;\n"
11217+ " addl $256, %5 ;\n"
11218+ " decl %0 ;\n"
11219+ " jnz 1b ;\n"
11220+
11221+ :
11222+ : "r" (lines),
11223+ "r" (bh_ptr[0]->b_data),
11224+ "r" (bh_ptr[1]->b_data),
11225+ "r" (bh_ptr[2]->b_data),
11226+ "r" (bh_ptr[3]->b_data),
11227+ "r" (bh_ptr[4]->b_data)
11228+ : "memory");
11229+ break;
11230+ }
11231+
11232+ __asm__ __volatile__ (
11233+ "sfence ;\n\t"
11234+ "movups (%1),%%xmm0 ;\n\t"
11235+ "movups 0x10(%1),%%xmm1 ;\n\t"
11236+ "movups 0x20(%1),%%xmm2 ;\n\t"
11237+ "movups 0x30(%1),%%xmm3 ;\n\t"
11238+ "movl %0,%%cr0 ;\n\t"
11239+ :
11240+ : "r" (cr0), "r" (xmm_save)
11241+ : "memory" );
11242+}
11243+
11244+#undef OFFS
11245+#undef LD
11246+#undef ST
11247+#undef PF0
11248+#undef PF1
11249+#undef PF2
11250+#undef PF3
11251+#undef PF4
11252+#undef PF5
11253+#undef XO1
11254+#undef XO2
11255+#undef XO3
11256+#undef XO4
11257+#undef XO5
11258+#undef BLOCK
11259+
11260+#endif /* CONFIG_X86_XMM */
11261+
11262+/*
11263+ * high-speed RAID5 checksumming functions utilizing MMX instructions
11264+ * Copyright (C) 1998 Ingo Molnar
11265+ */
11266+XORBLOCK_TEMPLATE(pII_mmx)
11267+{
11268+ char fpu_save[108];
11269+ int lines = (bh_ptr[0]->b_size>>7);
11270+
11271+ if (!(current->flags & PF_USEDFPU))
11272+ __asm__ __volatile__ ( " clts;\n");
11273+
11274+ __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
11275+
11276+#define LD(x,y) \
11277+ " movq 8*("#x")(%1), %%mm"#y" ;\n"
11278+#define ST(x,y) \
11279+ " movq %%mm"#y", 8*("#x")(%1) ;\n"
11280+#define XO1(x,y) \
11281+ " pxor 8*("#x")(%2), %%mm"#y" ;\n"
11282+#define XO2(x,y) \
11283+ " pxor 8*("#x")(%3), %%mm"#y" ;\n"
11284+#define XO3(x,y) \
11285+ " pxor 8*("#x")(%4), %%mm"#y" ;\n"
11286+#define XO4(x,y) \
11287+ " pxor 8*("#x")(%5), %%mm"#y" ;\n"
11288+
11289+ switch(count) {
11290+ case 2:
11291+ __asm__ __volatile__ (
11292+#undef BLOCK
11293+#define BLOCK(i) \
11294+ LD(i,0) \
11295+ LD(i+1,1) \
11296+ LD(i+2,2) \
11297+ LD(i+3,3) \
11298+ XO1(i,0) \
11299+ ST(i,0) \
11300+ XO1(i+1,1) \
11301+ ST(i+1,1) \
11302+ XO1(i+2,2) \
11303+ ST(i+2,2) \
11304+ XO1(i+3,3) \
11305+ ST(i+3,3)
11306+
11307+ " .align 32,0x90 ;\n"
11308+ " 1: ;\n"
11309+
11310+ BLOCK(0)
11311+ BLOCK(4)
11312+ BLOCK(8)
11313+ BLOCK(12)
11314+
11315+ " addl $128, %1 ;\n"
11316+ " addl $128, %2 ;\n"
11317+ " decl %0 ;\n"
11318+ " jnz 1b ;\n"
11319+ :
11320+ : "r" (lines),
11321+ "r" (bh_ptr[0]->b_data),
11322+ "r" (bh_ptr[1]->b_data)
11323+ : "memory");
11324+ break;
11325+ case 3:
11326+ __asm__ __volatile__ (
11327+#undef BLOCK
11328+#define BLOCK(i) \
11329+ LD(i,0) \
11330+ LD(i+1,1) \
11331+ LD(i+2,2) \
11332+ LD(i+3,3) \
11333+ XO1(i,0) \
11334+ XO1(i+1,1) \
11335+ XO1(i+2,2) \
11336+ XO1(i+3,3) \
11337+ XO2(i,0) \
11338+ ST(i,0) \
11339+ XO2(i+1,1) \
11340+ ST(i+1,1) \
11341+ XO2(i+2,2) \
11342+ ST(i+2,2) \
11343+ XO2(i+3,3) \
11344+ ST(i+3,3)
11345+
11346+ " .align 32,0x90 ;\n"
11347+ " 1: ;\n"
11348+
11349+ BLOCK(0)
11350+ BLOCK(4)
11351+ BLOCK(8)
11352+ BLOCK(12)
11353+
11354+ " addl $128, %1 ;\n"
11355+ " addl $128, %2 ;\n"
11356+ " addl $128, %3 ;\n"
11357+ " decl %0 ;\n"
11358+ " jnz 1b ;\n"
11359+ :
11360+ : "r" (lines),
11361+ "r" (bh_ptr[0]->b_data),
11362+ "r" (bh_ptr[1]->b_data),
11363+ "r" (bh_ptr[2]->b_data)
11364+ : "memory");
11365+ break;
11366+ case 4:
11367+ __asm__ __volatile__ (
11368+#undef BLOCK
11369+#define BLOCK(i) \
11370+ LD(i,0) \
11371+ LD(i+1,1) \
11372+ LD(i+2,2) \
11373+ LD(i+3,3) \
11374+ XO1(i,0) \
11375+ XO1(i+1,1) \
11376+ XO1(i+2,2) \
11377+ XO1(i+3,3) \
11378+ XO2(i,0) \
11379+ XO2(i+1,1) \
11380+ XO2(i+2,2) \
11381+ XO2(i+3,3) \
11382+ XO3(i,0) \
11383+ ST(i,0) \
11384+ XO3(i+1,1) \
11385+ ST(i+1,1) \
11386+ XO3(i+2,2) \
11387+ ST(i+2,2) \
11388+ XO3(i+3,3) \
11389+ ST(i+3,3)
11390+
11391+ " .align 32,0x90 ;\n"
11392+ " 1: ;\n"
11393+
11394+ BLOCK(0)
11395+ BLOCK(4)
11396+ BLOCK(8)
11397+ BLOCK(12)
11398+
11399+ " addl $128, %1 ;\n"
11400+ " addl $128, %2 ;\n"
11401+ " addl $128, %3 ;\n"
11402+ " addl $128, %4 ;\n"
11403+ " decl %0 ;\n"
11404+ " jnz 1b ;\n"
11405+ :
11406+ : "r" (lines),
11407+ "r" (bh_ptr[0]->b_data),
11408+ "r" (bh_ptr[1]->b_data),
11409+ "r" (bh_ptr[2]->b_data),
11410+ "r" (bh_ptr[3]->b_data)
11411+ : "memory");
11412+ break;
11413+ case 5:
11414+ __asm__ __volatile__ (
11415+#undef BLOCK
11416+#define BLOCK(i) \
11417+ LD(i,0) \
11418+ LD(i+1,1) \
11419+ LD(i+2,2) \
11420+ LD(i+3,3) \
11421+ XO1(i,0) \
11422+ XO1(i+1,1) \
11423+ XO1(i+2,2) \
11424+ XO1(i+3,3) \
11425+ XO2(i,0) \
11426+ XO2(i+1,1) \
11427+ XO2(i+2,2) \
11428+ XO2(i+3,3) \
11429+ XO3(i,0) \
11430+ XO3(i+1,1) \
11431+ XO3(i+2,2) \
11432+ XO3(i+3,3) \
11433+ XO4(i,0) \
11434+ ST(i,0) \
11435+ XO4(i+1,1) \
11436+ ST(i+1,1) \
11437+ XO4(i+2,2) \
11438+ ST(i+2,2) \
11439+ XO4(i+3,3) \
11440+ ST(i+3,3)
11441+
11442+ " .align 32,0x90 ;\n"
11443+ " 1: ;\n"
11444+
11445+ BLOCK(0)
11446+ BLOCK(4)
11447+ BLOCK(8)
11448+ BLOCK(12)
11449+
11450+ " addl $128, %1 ;\n"
11451+ " addl $128, %2 ;\n"
11452+ " addl $128, %3 ;\n"
11453+ " addl $128, %4 ;\n"
11454+ " addl $128, %5 ;\n"
11455+ " decl %0 ;\n"
11456+ " jnz 1b ;\n"
11457+ :
11458+ : "r" (lines),
11459+ "r" (bh_ptr[0]->b_data),
11460+ "r" (bh_ptr[1]->b_data),
11461+ "r" (bh_ptr[2]->b_data),
11462+ "r" (bh_ptr[3]->b_data),
11463+ "r" (bh_ptr[4]->b_data)
11464+ : "memory");
11465+ break;
11466+ }
11467+
11468+ __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
11469+
11470+ if (!(current->flags & PF_USEDFPU))
11471+ stts();
11472+}
11473+
11474+#undef LD
11475+#undef XO1
11476+#undef XO2
11477+#undef XO3
11478+#undef XO4
11479+#undef ST
11480+#undef BLOCK
11481+
11482+XORBLOCK_TEMPLATE(p5_mmx)
11483+{
11484+ char fpu_save[108];
11485+ int lines = (bh_ptr[0]->b_size>>6);
11486+
11487+ if (!(current->flags & PF_USEDFPU))
11488+ __asm__ __volatile__ ( " clts;\n");
11489+
11490+ __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
11491+
11492+ switch(count) {
11493+ case 2:
11494+ __asm__ __volatile__ (
11495+
11496+ " .align 32,0x90 ;\n"
11497+ " 1: ;\n"
11498+ " movq (%1), %%mm0 ;\n"
11499+ " movq 8(%1), %%mm1 ;\n"
11500+ " pxor (%2), %%mm0 ;\n"
11501+ " movq 16(%1), %%mm2 ;\n"
11502+ " movq %%mm0, (%1) ;\n"
11503+ " pxor 8(%2), %%mm1 ;\n"
11504+ " movq 24(%1), %%mm3 ;\n"
11505+ " movq %%mm1, 8(%1) ;\n"
11506+ " pxor 16(%2), %%mm2 ;\n"
11507+ " movq 32(%1), %%mm4 ;\n"
11508+ " movq %%mm2, 16(%1) ;\n"
11509+ " pxor 24(%2), %%mm3 ;\n"
11510+ " movq 40(%1), %%mm5 ;\n"
11511+ " movq %%mm3, 24(%1) ;\n"
11512+ " pxor 32(%2), %%mm4 ;\n"
11513+ " movq 48(%1), %%mm6 ;\n"
11514+ " movq %%mm4, 32(%1) ;\n"
11515+ " pxor 40(%2), %%mm5 ;\n"
11516+ " movq 56(%1), %%mm7 ;\n"
11517+ " movq %%mm5, 40(%1) ;\n"
11518+ " pxor 48(%2), %%mm6 ;\n"
11519+ " pxor 56(%2), %%mm7 ;\n"
11520+ " movq %%mm6, 48(%1) ;\n"
11521+ " movq %%mm7, 56(%1) ;\n"
11522+
11523+ " addl $64, %1 ;\n"
11524+ " addl $64, %2 ;\n"
11525+ " decl %0 ;\n"
11526+ " jnz 1b ;\n"
11527+
11528+ :
11529+ : "r" (lines),
11530+ "r" (bh_ptr[0]->b_data),
11531+ "r" (bh_ptr[1]->b_data)
11532+ : "memory" );
11533+ break;
11534+ case 3:
11535+ __asm__ __volatile__ (
11536+
11537+ " .align 32,0x90 ;\n"
11538+ " 1: ;\n"
11539+ " movq (%1), %%mm0 ;\n"
11540+ " movq 8(%1), %%mm1 ;\n"
11541+ " pxor (%2), %%mm0 ;\n"
11542+ " movq 16(%1), %%mm2 ;\n"
11543+ " pxor 8(%2), %%mm1 ;\n"
11544+ " pxor (%3), %%mm0 ;\n"
11545+ " pxor 16(%2), %%mm2 ;\n"
11546+ " movq %%mm0, (%1) ;\n"
11547+ " pxor 8(%3), %%mm1 ;\n"
11548+ " pxor 16(%3), %%mm2 ;\n"
11549+ " movq 24(%1), %%mm3 ;\n"
11550+ " movq %%mm1, 8(%1) ;\n"
11551+ " movq 32(%1), %%mm4 ;\n"
11552+ " movq 40(%1), %%mm5 ;\n"
11553+ " pxor 24(%2), %%mm3 ;\n"
11554+ " movq %%mm2, 16(%1) ;\n"
11555+ " pxor 32(%2), %%mm4 ;\n"
11556+ " pxor 24(%3), %%mm3 ;\n"
11557+ " pxor 40(%2), %%mm5 ;\n"
11558+ " movq %%mm3, 24(%1) ;\n"
11559+ " pxor 32(%3), %%mm4 ;\n"
11560+ " pxor 40(%3), %%mm5 ;\n"
11561+ " movq 48(%1), %%mm6 ;\n"
11562+ " movq %%mm4, 32(%1) ;\n"
11563+ " movq 56(%1), %%mm7 ;\n"
11564+ " pxor 48(%2), %%mm6 ;\n"
11565+ " movq %%mm5, 40(%1) ;\n"
11566+ " pxor 56(%2), %%mm7 ;\n"
11567+ " pxor 48(%3), %%mm6 ;\n"
11568+ " pxor 56(%3), %%mm7 ;\n"
11569+ " movq %%mm6, 48(%1) ;\n"
11570+ " movq %%mm7, 56(%1) ;\n"
11571+
11572+ " addl $64, %1 ;\n"
11573+ " addl $64, %2 ;\n"
11574+ " addl $64, %3 ;\n"
11575+ " decl %0 ;\n"
11576+ " jnz 1b ;\n"
11577+
11578+ :
11579+ : "r" (lines),
11580+ "r" (bh_ptr[0]->b_data),
11581+ "r" (bh_ptr[1]->b_data),
11582+ "r" (bh_ptr[2]->b_data)
11583+ : "memory" );
11584+ break;
11585+ case 4:
11586+ __asm__ __volatile__ (
11587+
11588+ " .align 32,0x90 ;\n"
11589+ " 1: ;\n"
11590+ " movq (%1), %%mm0 ;\n"
11591+ " movq 8(%1), %%mm1 ;\n"
11592+ " pxor (%2), %%mm0 ;\n"
11593+ " movq 16(%1), %%mm2 ;\n"
11594+ " pxor 8(%2), %%mm1 ;\n"
11595+ " pxor (%3), %%mm0 ;\n"
11596+ " pxor 16(%2), %%mm2 ;\n"
11597+ " pxor 8(%3), %%mm1 ;\n"
11598+ " pxor (%4), %%mm0 ;\n"
11599+ " movq 24(%1), %%mm3 ;\n"
11600+ " pxor 16(%3), %%mm2 ;\n"
11601+ " pxor 8(%4), %%mm1 ;\n"
11602+ " movq %%mm0, (%1) ;\n"
11603+ " movq 32(%1), %%mm4 ;\n"
11604+ " pxor 24(%2), %%mm3 ;\n"
11605+ " pxor 16(%4), %%mm2 ;\n"
11606+ " movq %%mm1, 8(%1) ;\n"
11607+ " movq 40(%1), %%mm5 ;\n"
11608+ " pxor 32(%2), %%mm4 ;\n"
11609+ " pxor 24(%3), %%mm3 ;\n"
11610+ " movq %%mm2, 16(%1) ;\n"
11611+ " pxor 40(%2), %%mm5 ;\n"
11612+ " pxor 32(%3), %%mm4 ;\n"
11613+ " pxor 24(%4), %%mm3 ;\n"
11614+ " movq %%mm3, 24(%1) ;\n"
11615+ " movq 56(%1), %%mm7 ;\n"
11616+ " movq 48(%1), %%mm6 ;\n"
11617+ " pxor 40(%3), %%mm5 ;\n"
11618+ " pxor 32(%4), %%mm4 ;\n"
11619+ " pxor 48(%2), %%mm6 ;\n"
11620+ " movq %%mm4, 32(%1) ;\n"
11621+ " pxor 56(%2), %%mm7 ;\n"
11622+ " pxor 40(%4), %%mm5 ;\n"
11623+ " pxor 48(%3), %%mm6 ;\n"
11624+ " pxor 56(%3), %%mm7 ;\n"
11625+ " movq %%mm5, 40(%1) ;\n"
11626+ " pxor 48(%4), %%mm6 ;\n"
11627+ " pxor 56(%4), %%mm7 ;\n"
11628+ " movq %%mm6, 48(%1) ;\n"
11629+ " movq %%mm7, 56(%1) ;\n"
11630+
11631+ " addl $64, %1 ;\n"
11632+ " addl $64, %2 ;\n"
11633+ " addl $64, %3 ;\n"
11634+ " addl $64, %4 ;\n"
11635+ " decl %0 ;\n"
11636+ " jnz 1b ;\n"
11637+
11638+ :
11639+ : "r" (lines),
11640+ "r" (bh_ptr[0]->b_data),
11641+ "r" (bh_ptr[1]->b_data),
11642+ "r" (bh_ptr[2]->b_data),
11643+ "r" (bh_ptr[3]->b_data)
11644+ : "memory" );
11645+ break;
11646+ case 5:
11647+ __asm__ __volatile__ (
11648+
11649+ " .align 32,0x90 ;\n"
11650+ " 1: ;\n"
11651+ " movq (%1), %%mm0 ;\n"
11652+ " movq 8(%1), %%mm1 ;\n"
11653+ " pxor (%2), %%mm0 ;\n"
11654+ " pxor 8(%2), %%mm1 ;\n"
11655+ " movq 16(%1), %%mm2 ;\n"
11656+ " pxor (%3), %%mm0 ;\n"
11657+ " pxor 8(%3), %%mm1 ;\n"
11658+ " pxor 16(%2), %%mm2 ;\n"
11659+ " pxor (%4), %%mm0 ;\n"
11660+ " pxor 8(%4), %%mm1 ;\n"
11661+ " pxor 16(%3), %%mm2 ;\n"
11662+ " movq 24(%1), %%mm3 ;\n"
11663+ " pxor (%5), %%mm0 ;\n"
11664+ " pxor 8(%5), %%mm1 ;\n"
11665+ " movq %%mm0, (%1) ;\n"
11666+ " pxor 16(%4), %%mm2 ;\n"
11667+ " pxor 24(%2), %%mm3 ;\n"
11668+ " movq %%mm1, 8(%1) ;\n"
11669+ " pxor 16(%5), %%mm2 ;\n"
11670+ " pxor 24(%3), %%mm3 ;\n"
11671+ " movq 32(%1), %%mm4 ;\n"
11672+ " movq %%mm2, 16(%1) ;\n"
11673+ " pxor 24(%4), %%mm3 ;\n"
11674+ " pxor 32(%2), %%mm4 ;\n"
11675+ " movq 40(%1), %%mm5 ;\n"
11676+ " pxor 24(%5), %%mm3 ;\n"
11677+ " pxor 32(%3), %%mm4 ;\n"
11678+ " pxor 40(%2), %%mm5 ;\n"
11679+ " movq %%mm3, 24(%1) ;\n"
11680+ " pxor 32(%4), %%mm4 ;\n"
11681+ " pxor 40(%3), %%mm5 ;\n"
11682+ " movq 48(%1), %%mm6 ;\n"
11683+ " movq 56(%1), %%mm7 ;\n"
11684+ " pxor 32(%5), %%mm4 ;\n"
11685+ " pxor 40(%4), %%mm5 ;\n"
11686+ " pxor 48(%2), %%mm6 ;\n"
11687+ " pxor 56(%2), %%mm7 ;\n"
11688+ " movq %%mm4, 32(%1) ;\n"
11689+ " pxor 48(%3), %%mm6 ;\n"
11690+ " pxor 56(%3), %%mm7 ;\n"
11691+ " pxor 40(%5), %%mm5 ;\n"
11692+ " pxor 48(%4), %%mm6 ;\n"
11693+ " pxor 56(%4), %%mm7 ;\n"
11694+ " movq %%mm5, 40(%1) ;\n"
11695+ " pxor 48(%5), %%mm6 ;\n"
11696+ " pxor 56(%5), %%mm7 ;\n"
11697+ " movq %%mm6, 48(%1) ;\n"
11698+ " movq %%mm7, 56(%1) ;\n"
11699+
11700+ " addl $64, %1 ;\n"
11701+ " addl $64, %2 ;\n"
11702+ " addl $64, %3 ;\n"
11703+ " addl $64, %4 ;\n"
11704+ " addl $64, %5 ;\n"
11705+ " decl %0 ;\n"
11706+ " jnz 1b ;\n"
11707+
11708+ :
11709+ : "r" (lines),
11710+ "r" (bh_ptr[0]->b_data),
11711+ "r" (bh_ptr[1]->b_data),
11712+ "r" (bh_ptr[2]->b_data),
11713+ "r" (bh_ptr[3]->b_data),
11714+ "r" (bh_ptr[4]->b_data)
11715+ : "memory" );
11716+ break;
11717+ }
11718+
11719+ __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
11720+
11721+ if (!(current->flags & PF_USEDFPU))
11722+ stts();
11723+}
11724+#endif /* __i386__ */
11725+#endif /* !__sparc_v9__ */
11726+
11727+#ifdef __sparc_v9__
11728+/*
11729+ * High speed xor_block operation for RAID4/5 utilizing the
11730+ * UltraSparc Visual Instruction Set.
11731+ *
11732+ * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
11733+ *
11734+ * Requirements:
11735+ * !(((long)dest | (long)sourceN) & (64 - 1)) &&
11736+ * !(len & 127) && len >= 256
11737+ *
11738+ * It is done in pure assembly, as otherwise gcc makes it
11739+ * a non-leaf function, which is not what we want.
11740+ * Also, we don't measure the speeds as on other architectures,
11741+ * because the measuring routine does not take into account cold
11742+ * caches or the fact that xor_block_VIS bypasses the caches.
11743+ * xor_block_32regs might be 5% faster for count 2 if caches are
11744+ * hot and things are just right (for count 3 VIS is about as fast
11745+ * as 32regs with hot caches, and for counts 4 and 5 VIS is always
11746+ * faster by a good margin), but I think it is better not to pollute
11747+ * the caches. Actually, if I were just fighting for speed on hot
11748+ * caches, I could write a hybrid VIS/integer routine doing two
11749+ * 64B blocks in VIS and two in IEUs, but I really care more about
11750+ * caches.
11751+ */
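/*
 * The requirements above, restated as the check a hypothetical
 * caller could apply before choosing this routine (a sketch, not
 * part of the patch): all buffers 64-byte aligned, length a
 * multiple of 128 and at least 256 bytes:
 *
 *	static inline int vis_xor_ok(long dest, long src, unsigned long len)
 *	{
 *		return !((dest | src) & (64 - 1)) && !(len & 127) && len >= 256;
 *	}
 */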
11752+extern void *VISenter(void);
11753+extern void xor_block_VIS XOR_ARGS;
11754+
11755+void __xor_block_VIS(void)
11756+{
11757+__asm__ ("
11758+ .globl xor_block_VIS
11759+xor_block_VIS:
11760+ ldx [%%o1 + 0], %%o4
11761+ ldx [%%o1 + 8], %%o3
11762+ ldx [%%o4 + %1], %%g5
11763+ ldx [%%o4 + %0], %%o4
11764+ ldx [%%o3 + %0], %%o3
11765+ rd %%fprs, %%o5
11766+ andcc %%o5, %2, %%g0
11767+ be,pt %%icc, 297f
11768+ sethi %%hi(%5), %%g1
11769+ jmpl %%g1 + %%lo(%5), %%g7
11770+ add %%g7, 8, %%g7
11771+297: wr %%g0, %4, %%fprs
11772+ membar #LoadStore|#StoreLoad|#StoreStore
11773+ sub %%g5, 64, %%g5
11774+ ldda [%%o4] %3, %%f0
11775+ ldda [%%o3] %3, %%f16
11776+ cmp %%o0, 4
11777+ bgeu,pt %%xcc, 10f
11778+ cmp %%o0, 3
11779+ be,pn %%xcc, 13f
11780+ mov -64, %%g1
11781+ sub %%g5, 64, %%g5
11782+ rd %%asi, %%g1
11783+ wr %%g0, %3, %%asi
11784+
11785+2: ldda [%%o4 + 64] %%asi, %%f32
11786+ fxor %%f0, %%f16, %%f16
11787+ fxor %%f2, %%f18, %%f18
11788+ fxor %%f4, %%f20, %%f20
11789+ fxor %%f6, %%f22, %%f22
11790+ fxor %%f8, %%f24, %%f24
11791+ fxor %%f10, %%f26, %%f26
11792+ fxor %%f12, %%f28, %%f28
11793+ fxor %%f14, %%f30, %%f30
11794+ stda %%f16, [%%o4] %3
11795+ ldda [%%o3 + 64] %%asi, %%f48
11796+ ldda [%%o4 + 128] %%asi, %%f0
11797+ fxor %%f32, %%f48, %%f48
11798+ fxor %%f34, %%f50, %%f50
11799+ add %%o4, 128, %%o4
11800+ fxor %%f36, %%f52, %%f52
11801+ add %%o3, 128, %%o3
11802+ fxor %%f38, %%f54, %%f54
11803+ subcc %%g5, 128, %%g5
11804+ fxor %%f40, %%f56, %%f56
11805+ fxor %%f42, %%f58, %%f58
11806+ fxor %%f44, %%f60, %%f60
11807+ fxor %%f46, %%f62, %%f62
11808+ stda %%f48, [%%o4 - 64] %%asi
11809+ bne,pt %%xcc, 2b
11810+ ldda [%%o3] %3, %%f16
11811+
11812+ ldda [%%o4 + 64] %%asi, %%f32
11813+ fxor %%f0, %%f16, %%f16
11814+ fxor %%f2, %%f18, %%f18
11815+ fxor %%f4, %%f20, %%f20
11816+ fxor %%f6, %%f22, %%f22
11817+ fxor %%f8, %%f24, %%f24
11818+ fxor %%f10, %%f26, %%f26
11819+ fxor %%f12, %%f28, %%f28
11820+ fxor %%f14, %%f30, %%f30
11821+ stda %%f16, [%%o4] %3
11822+ ldda [%%o3 + 64] %%asi, %%f48
11823+ membar #Sync
11824+ fxor %%f32, %%f48, %%f48
11825+ fxor %%f34, %%f50, %%f50
11826+ fxor %%f36, %%f52, %%f52
11827+ fxor %%f38, %%f54, %%f54
11828+ fxor %%f40, %%f56, %%f56
11829+ fxor %%f42, %%f58, %%f58
11830+ fxor %%f44, %%f60, %%f60
11831+ fxor %%f46, %%f62, %%f62
11832+ stda %%f48, [%%o4 + 64] %%asi
11833+ membar #Sync|#StoreStore|#StoreLoad
11834+ wr %%g0, 0, %%fprs
11835+ retl
11836+ wr %%g1, %%g0, %%asi
11837+
11838+13: ldx [%%o1 + 16], %%o2
11839+ ldx [%%o2 + %0], %%o2
11840+
11841+3: ldda [%%o2] %3, %%f32
11842+ fxor %%f0, %%f16, %%f48
11843+ fxor %%f2, %%f18, %%f50
11844+ add %%o4, 64, %%o4
11845+ fxor %%f4, %%f20, %%f52
11846+ fxor %%f6, %%f22, %%f54
11847+ add %%o3, 64, %%o3
11848+ fxor %%f8, %%f24, %%f56
11849+ fxor %%f10, %%f26, %%f58
11850+ fxor %%f12, %%f28, %%f60
11851+ fxor %%f14, %%f30, %%f62
11852+ ldda [%%o4] %3, %%f0
11853+ fxor %%f48, %%f32, %%f48
11854+ fxor %%f50, %%f34, %%f50
11855+ fxor %%f52, %%f36, %%f52
11856+ fxor %%f54, %%f38, %%f54
11857+ add %%o2, 64, %%o2
11858+ fxor %%f56, %%f40, %%f56
11859+ fxor %%f58, %%f42, %%f58
11860+ subcc %%g5, 64, %%g5
11861+ fxor %%f60, %%f44, %%f60
11862+ fxor %%f62, %%f46, %%f62
11863+ stda %%f48, [%%o4 + %%g1] %3
11864+ bne,pt %%xcc, 3b
11865+ ldda [%%o3] %3, %%f16
11866+
11867+ ldda [%%o2] %3, %%f32
11868+ fxor %%f0, %%f16, %%f48
11869+ fxor %%f2, %%f18, %%f50
11870+ fxor %%f4, %%f20, %%f52
11871+ fxor %%f6, %%f22, %%f54
11872+ fxor %%f8, %%f24, %%f56
11873+ fxor %%f10, %%f26, %%f58
11874+ fxor %%f12, %%f28, %%f60
11875+ fxor %%f14, %%f30, %%f62
11876+ membar #Sync
11877+ fxor %%f48, %%f32, %%f48
11878+ fxor %%f50, %%f34, %%f50
11879+ fxor %%f52, %%f36, %%f52
11880+ fxor %%f54, %%f38, %%f54
11881+ fxor %%f56, %%f40, %%f56
11882+ fxor %%f58, %%f42, %%f58
11883+ fxor %%f60, %%f44, %%f60
11884+ fxor %%f62, %%f46, %%f62
11885+ stda %%f48, [%%o4] %3
11886+ membar #Sync|#StoreStore|#StoreLoad
11887+ retl
11888+ wr %%g0, 0, %%fprs
11889+
11890+10: cmp %%o0, 5
11891+ be,pt %%xcc, 15f
11892+ mov -64, %%g1
11893+
11894+14: ldx [%%o1 + 16], %%o2
11895+ ldx [%%o1 + 24], %%o0
11896+ ldx [%%o2 + %0], %%o2
11897+ ldx [%%o0 + %0], %%o0
11898+
11899+4: ldda [%%o2] %3, %%f32
11900+ fxor %%f0, %%f16, %%f16
11901+ fxor %%f2, %%f18, %%f18
11902+ add %%o4, 64, %%o4
11903+ fxor %%f4, %%f20, %%f20
11904+ fxor %%f6, %%f22, %%f22
11905+ add %%o3, 64, %%o3
11906+ fxor %%f8, %%f24, %%f24
11907+ fxor %%f10, %%f26, %%f26
11908+ fxor %%f12, %%f28, %%f28
11909+ fxor %%f14, %%f30, %%f30
11910+ ldda [%%o0] %3, %%f48
11911+ fxor %%f16, %%f32, %%f32
11912+ fxor %%f18, %%f34, %%f34
11913+ fxor %%f20, %%f36, %%f36
11914+ fxor %%f22, %%f38, %%f38
11915+ add %%o2, 64, %%o2
11916+ fxor %%f24, %%f40, %%f40
11917+ fxor %%f26, %%f42, %%f42
11918+ fxor %%f28, %%f44, %%f44
11919+ fxor %%f30, %%f46, %%f46
11920+ ldda [%%o4] %3, %%f0
11921+ fxor %%f32, %%f48, %%f48
11922+ fxor %%f34, %%f50, %%f50
11923+ fxor %%f36, %%f52, %%f52
11924+ add %%o0, 64, %%o0
11925+ fxor %%f38, %%f54, %%f54
11926+ fxor %%f40, %%f56, %%f56
11927+ fxor %%f42, %%f58, %%f58
11928+ subcc %%g5, 64, %%g5
11929+ fxor %%f44, %%f60, %%f60
11930+ fxor %%f46, %%f62, %%f62
11931+ stda %%f48, [%%o4 + %%g1] %3
11932+ bne,pt %%xcc, 4b
11933+ ldda [%%o3] %3, %%f16
11934+
11935+ ldda [%%o2] %3, %%f32
11936+ fxor %%f0, %%f16, %%f16
11937+ fxor %%f2, %%f18, %%f18
11938+ fxor %%f4, %%f20, %%f20
11939+ fxor %%f6, %%f22, %%f22
11940+ fxor %%f8, %%f24, %%f24
11941+ fxor %%f10, %%f26, %%f26
11942+ fxor %%f12, %%f28, %%f28
11943+ fxor %%f14, %%f30, %%f30
11944+ ldda [%%o0] %3, %%f48
11945+ fxor %%f16, %%f32, %%f32
11946+ fxor %%f18, %%f34, %%f34
11947+ fxor %%f20, %%f36, %%f36
11948+ fxor %%f22, %%f38, %%f38
11949+ fxor %%f24, %%f40, %%f40
11950+ fxor %%f26, %%f42, %%f42
11951+ fxor %%f28, %%f44, %%f44
11952+ fxor %%f30, %%f46, %%f46
11953+ membar #Sync
11954+ fxor %%f32, %%f48, %%f48
11955+ fxor %%f34, %%f50, %%f50
11956+ fxor %%f36, %%f52, %%f52
11957+ fxor %%f38, %%f54, %%f54
11958+ fxor %%f40, %%f56, %%f56
11959+ fxor %%f42, %%f58, %%f58
11960+ fxor %%f44, %%f60, %%f60
11961+ fxor %%f46, %%f62, %%f62
11962+ stda %%f48, [%%o4] %3
11963+ membar #Sync|#StoreStore|#StoreLoad
11964+ retl
11965+ wr %%g0, 0, %%fprs
11966+
11967+15: ldx [%%o1 + 16], %%o2
11968+ ldx [%%o1 + 24], %%o0
11969+ ldx [%%o1 + 32], %%o1
11970+ ldx [%%o2 + %0], %%o2
11971+ ldx [%%o0 + %0], %%o0
11972+ ldx [%%o1 + %0], %%o1
11973+
11974+5: ldda [%%o2] %3, %%f32
11975+ fxor %%f0, %%f16, %%f48
11976+ fxor %%f2, %%f18, %%f50
11977+ add %%o4, 64, %%o4
11978+ fxor %%f4, %%f20, %%f52
11979+ fxor %%f6, %%f22, %%f54
11980+ add %%o3, 64, %%o3
11981+ fxor %%f8, %%f24, %%f56
11982+ fxor %%f10, %%f26, %%f58
11983+ fxor %%f12, %%f28, %%f60
11984+ fxor %%f14, %%f30, %%f62
11985+ ldda [%%o0] %3, %%f16
11986+ fxor %%f48, %%f32, %%f48
11987+ fxor %%f50, %%f34, %%f50
11988+ fxor %%f52, %%f36, %%f52
11989+ fxor %%f54, %%f38, %%f54
11990+ add %%o2, 64, %%o2
11991+ fxor %%f56, %%f40, %%f56
11992+ fxor %%f58, %%f42, %%f58
11993+ fxor %%f60, %%f44, %%f60
11994+ fxor %%f62, %%f46, %%f62
11995+ ldda [%%o1] %3, %%f32
11996+ fxor %%f48, %%f16, %%f48
11997+ fxor %%f50, %%f18, %%f50
11998+ add %%o0, 64, %%o0
11999+ fxor %%f52, %%f20, %%f52
12000+ fxor %%f54, %%f22, %%f54
12001+ add %%o1, 64, %%o1
12002+ fxor %%f56, %%f24, %%f56
12003+ fxor %%f58, %%f26, %%f58
12004+ fxor %%f60, %%f28, %%f60
12005+ fxor %%f62, %%f30, %%f62
12006+ ldda [%%o4] %3, %%f0
12007+ fxor %%f48, %%f32, %%f48
12008+ fxor %%f50, %%f34, %%f50
12009+ fxor %%f52, %%f36, %%f52
12010+ fxor %%f54, %%f38, %%f54
12011+ fxor %%f56, %%f40, %%f56
12012+ fxor %%f58, %%f42, %%f58
12013+ subcc %%g5, 64, %%g5
12014+ fxor %%f60, %%f44, %%f60
12015+ fxor %%f62, %%f46, %%f62
12016+ stda %%f48, [%%o4 + %%g1] %3
12017+ bne,pt %%xcc, 5b
12018+ ldda [%%o3] %3, %%f16
12019+
12020+ ldda [%%o2] %3, %%f32
12021+ fxor %%f0, %%f16, %%f48
12022+ fxor %%f2, %%f18, %%f50
12023+ fxor %%f4, %%f20, %%f52
12024+ fxor %%f6, %%f22, %%f54
12025+ fxor %%f8, %%f24, %%f56
12026+ fxor %%f10, %%f26, %%f58
12027+ fxor %%f12, %%f28, %%f60
12028+ fxor %%f14, %%f30, %%f62
12029+ ldda [%%o0] %3, %%f16
12030+ fxor %%f48, %%f32, %%f48
12031+ fxor %%f50, %%f34, %%f50
12032+ fxor %%f52, %%f36, %%f52
12033+ fxor %%f54, %%f38, %%f54
12034+ fxor %%f56, %%f40, %%f56
12035+ fxor %%f58, %%f42, %%f58
12036+ fxor %%f60, %%f44, %%f60
12037+ fxor %%f62, %%f46, %%f62
12038+ ldda [%%o1] %3, %%f32
12039+ fxor %%f48, %%f16, %%f48
12040+ fxor %%f50, %%f18, %%f50
12041+ fxor %%f52, %%f20, %%f52
12042+ fxor %%f54, %%f22, %%f54
12043+ fxor %%f56, %%f24, %%f56
12044+ fxor %%f58, %%f26, %%f58
12045+ fxor %%f60, %%f28, %%f60
12046+ fxor %%f62, %%f30, %%f62
12047+ membar #Sync
12048+ fxor %%f48, %%f32, %%f48
12049+ fxor %%f50, %%f34, %%f50
12050+ fxor %%f52, %%f36, %%f52
12051+ fxor %%f54, %%f38, %%f54
12052+ fxor %%f56, %%f40, %%f56
12053+ fxor %%f58, %%f42, %%f58
12054+ fxor %%f60, %%f44, %%f60
12055+ fxor %%f62, %%f46, %%f62
12056+ stda %%f48, [%%o4] %3
12057+ membar #Sync|#StoreStore|#StoreLoad
12058+ retl
12059+ wr %%g0, 0, %%fprs
12060+ " : :
12061+ "i" (&((struct buffer_head *)0)->b_data),
12062+ "i" (&((struct buffer_head *)0)->b_size),
12063+ "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P),
12064+ "i" (FPRS_FEF), "i" (VISenter));
12065+}
12066+#endif /* __sparc_v9__ */
12067+
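
All the xor_block() implementations in this file, the VIS routine above and the SPARC ldd/std, 8regs and 32regs variants below, share one contract: the first buffer head's data becomes the XOR of all count buffers. A minimal stand-alone C model of that contract (the struct and names below are illustrative stand-ins, not part of the patch):

/* Stand-alone model of the xor_block() contract; struct buf is a
 * simplified stand-in for struct buffer_head. */
#include <stddef.h>

struct buf { unsigned char *data; size_t size; };

/* b[0] ^= b[1] ^ ... ^ b[count-1]; all buffers are b[0]->size bytes. */
static void xor_block_model(int count, struct buf **b)
{
	size_t i;
	int j;

	for (i = 0; i < b[0]->size; i++)
		for (j = 1; j < count; j++)
			b[0]->data[i] ^= b[j]->data[i];
}

Every optimized variant below is this double loop, unrolled and scheduled for a particular register file or cache.
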
12068+#if defined(__sparc__) && !defined(__sparc_v9__)
12069+/*
12070+ * High speed xor_block operation for RAID4/5 utilizing the
12071+ * ldd/std SPARC instructions.
12072+ *
12073+ * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
12074+ *
12075+ */
12076+
12077+XORBLOCK_TEMPLATE(SPARC)
12078+{
12079+ int size = bh_ptr[0]->b_size;
12080+ int lines = size / (sizeof (long)) / 8, i;
12081+ long *destp = (long *) bh_ptr[0]->b_data;
12082+ long *source1 = (long *) bh_ptr[1]->b_data;
12083+ long *source2, *source3, *source4;
12084+
12085+ switch (count) {
12086+ case 2:
12087+ for (i = lines; i > 0; i--) {
12088+ __asm__ __volatile__("
12089+ ldd [%0 + 0x00], %%g2
12090+ ldd [%0 + 0x08], %%g4
12091+ ldd [%0 + 0x10], %%o0
12092+ ldd [%0 + 0x18], %%o2
12093+ ldd [%1 + 0x00], %%o4
12094+ ldd [%1 + 0x08], %%l0
12095+ ldd [%1 + 0x10], %%l2
12096+ ldd [%1 + 0x18], %%l4
12097+ xor %%g2, %%o4, %%g2
12098+ xor %%g3, %%o5, %%g3
12099+ xor %%g4, %%l0, %%g4
12100+ xor %%g5, %%l1, %%g5
12101+ xor %%o0, %%l2, %%o0
12102+ xor %%o1, %%l3, %%o1
12103+ xor %%o2, %%l4, %%o2
12104+ xor %%o3, %%l5, %%o3
12105+ std %%g2, [%0 + 0x00]
12106+ std %%g4, [%0 + 0x08]
12107+ std %%o0, [%0 + 0x10]
12108+ std %%o2, [%0 + 0x18]
12109+ " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0",
12110+ "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5");
12111+ destp += 8;
12112+ source1 += 8;
12113+ }
12114+ break;
12115+ case 3:
12116+ source2 = (long *) bh_ptr[2]->b_data;
12117+ for (i = lines; i > 0; i--) {
12118+ __asm__ __volatile__("
12119+ ldd [%0 + 0x00], %%g2
12120+ ldd [%0 + 0x08], %%g4
12121+ ldd [%0 + 0x10], %%o0
12122+ ldd [%0 + 0x18], %%o2
12123+ ldd [%1 + 0x00], %%o4
12124+ ldd [%1 + 0x08], %%l0
12125+ ldd [%1 + 0x10], %%l2
12126+ ldd [%1 + 0x18], %%l4
12127+ xor %%g2, %%o4, %%g2
12128+ xor %%g3, %%o5, %%g3
12129+ ldd [%2 + 0x00], %%o4
12130+ xor %%g4, %%l0, %%g4
12131+ xor %%g5, %%l1, %%g5
12132+ ldd [%2 + 0x08], %%l0
12133+ xor %%o0, %%l2, %%o0
12134+ xor %%o1, %%l3, %%o1
12135+ ldd [%2 + 0x10], %%l2
12136+ xor %%o2, %%l4, %%o2
12137+ xor %%o3, %%l5, %%o3
12138+ ldd [%2 + 0x18], %%l4
12139+ xor %%g2, %%o4, %%g2
12140+ xor %%g3, %%o5, %%g3
12141+ xor %%g4, %%l0, %%g4
12142+ xor %%g5, %%l1, %%g5
12143+ xor %%o0, %%l2, %%o0
12144+ xor %%o1, %%l3, %%o1
12145+ xor %%o2, %%l4, %%o2
12146+ xor %%o3, %%l5, %%o3
12147+ std %%g2, [%0 + 0x00]
12148+ std %%g4, [%0 + 0x08]
12149+ std %%o0, [%0 + 0x10]
12150+ std %%o2, [%0 + 0x18]
12151+ " : : "r" (destp), "r" (source1), "r" (source2)
12152+ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
12153+ "l0", "l1", "l2", "l3", "l4", "l5");
12154+ destp += 8;
12155+ source1 += 8;
12156+ source2 += 8;
12157+ }
12158+ break;
12159+ case 4:
12160+ source2 = (long *) bh_ptr[2]->b_data;
12161+ source3 = (long *) bh_ptr[3]->b_data;
12162+ for (i = lines; i > 0; i--) {
12163+ __asm__ __volatile__("
12164+ ldd [%0 + 0x00], %%g2
12165+ ldd [%0 + 0x08], %%g4
12166+ ldd [%0 + 0x10], %%o0
12167+ ldd [%0 + 0x18], %%o2
12168+ ldd [%1 + 0x00], %%o4
12169+ ldd [%1 + 0x08], %%l0
12170+ ldd [%1 + 0x10], %%l2
12171+ ldd [%1 + 0x18], %%l4
12172+ xor %%g2, %%o4, %%g2
12173+ xor %%g3, %%o5, %%g3
12174+ ldd [%2 + 0x00], %%o4
12175+ xor %%g4, %%l0, %%g4
12176+ xor %%g5, %%l1, %%g5
12177+ ldd [%2 + 0x08], %%l0
12178+ xor %%o0, %%l2, %%o0
12179+ xor %%o1, %%l3, %%o1
12180+ ldd [%2 + 0x10], %%l2
12181+ xor %%o2, %%l4, %%o2
12182+ xor %%o3, %%l5, %%o3
12183+ ldd [%2 + 0x18], %%l4
12184+ xor %%g2, %%o4, %%g2
12185+ xor %%g3, %%o5, %%g3
12186+ ldd [%3 + 0x00], %%o4
12187+ xor %%g4, %%l0, %%g4
12188+ xor %%g5, %%l1, %%g5
12189+ ldd [%3 + 0x08], %%l0
12190+ xor %%o0, %%l2, %%o0
12191+ xor %%o1, %%l3, %%o1
12192+ ldd [%3 + 0x10], %%l2
12193+ xor %%o2, %%l4, %%o2
12194+ xor %%o3, %%l5, %%o3
12195+ ldd [%3 + 0x18], %%l4
12196+ xor %%g2, %%o4, %%g2
12197+ xor %%g3, %%o5, %%g3
12198+ xor %%g4, %%l0, %%g4
12199+ xor %%g5, %%l1, %%g5
12200+ xor %%o0, %%l2, %%o0
12201+ xor %%o1, %%l3, %%o1
12202+ xor %%o2, %%l4, %%o2
12203+ xor %%o3, %%l5, %%o3
12204+ std %%g2, [%0 + 0x00]
12205+ std %%g4, [%0 + 0x08]
12206+ std %%o0, [%0 + 0x10]
12207+ std %%o2, [%0 + 0x18]
12208+ " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3)
12209+ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
12210+ "l0", "l1", "l2", "l3", "l4", "l5");
12211+ destp += 8;
12212+ source1 += 8;
12213+ source2 += 8;
12214+ source3 += 8;
12215+ }
12216+ break;
12217+ case 5:
12218+ source2 = (long *) bh_ptr[2]->b_data;
12219+ source3 = (long *) bh_ptr[3]->b_data;
12220+ source4 = (long *) bh_ptr[4]->b_data;
12221+ for (i = lines; i > 0; i--) {
12222+ __asm__ __volatile__("
12223+ ldd [%0 + 0x00], %%g2
12224+ ldd [%0 + 0x08], %%g4
12225+ ldd [%0 + 0x10], %%o0
12226+ ldd [%0 + 0x18], %%o2
12227+ ldd [%1 + 0x00], %%o4
12228+ ldd [%1 + 0x08], %%l0
12229+ ldd [%1 + 0x10], %%l2
12230+ ldd [%1 + 0x18], %%l4
12231+ xor %%g2, %%o4, %%g2
12232+ xor %%g3, %%o5, %%g3
12233+ ldd [%2 + 0x00], %%o4
12234+ xor %%g4, %%l0, %%g4
12235+ xor %%g5, %%l1, %%g5
12236+ ldd [%2 + 0x08], %%l0
12237+ xor %%o0, %%l2, %%o0
12238+ xor %%o1, %%l3, %%o1
12239+ ldd [%2 + 0x10], %%l2
12240+ xor %%o2, %%l4, %%o2
12241+ xor %%o3, %%l5, %%o3
12242+ ldd [%2 + 0x18], %%l4
12243+ xor %%g2, %%o4, %%g2
12244+ xor %%g3, %%o5, %%g3
12245+ ldd [%3 + 0x00], %%o4
12246+ xor %%g4, %%l0, %%g4
12247+ xor %%g5, %%l1, %%g5
12248+ ldd [%3 + 0x08], %%l0
12249+ xor %%o0, %%l2, %%o0
12250+ xor %%o1, %%l3, %%o1
12251+ ldd [%3 + 0x10], %%l2
12252+ xor %%o2, %%l4, %%o2
12253+ xor %%o3, %%l5, %%o3
12254+ ldd [%3 + 0x18], %%l4
12255+ xor %%g2, %%o4, %%g2
12256+ xor %%g3, %%o5, %%g3
12257+ ldd [%4 + 0x00], %%o4
12258+ xor %%g4, %%l0, %%g4
12259+ xor %%g5, %%l1, %%g5
12260+ ldd [%4 + 0x08], %%l0
12261+ xor %%o0, %%l2, %%o0
12262+ xor %%o1, %%l3, %%o1
12263+ ldd [%4 + 0x10], %%l2
12264+ xor %%o2, %%l4, %%o2
12265+ xor %%o3, %%l5, %%o3
12266+ ldd [%4 + 0x18], %%l4
12267+ xor %%g2, %%o4, %%g2
12268+ xor %%g3, %%o5, %%g3
12269+ xor %%g4, %%l0, %%g4
12270+ xor %%g5, %%l1, %%g5
12271+ xor %%o0, %%l2, %%o0
12272+ xor %%o1, %%l3, %%o1
12273+ xor %%o2, %%l4, %%o2
12274+ xor %%o3, %%l5, %%o3
12275+ std %%g2, [%0 + 0x00]
12276+ std %%g4, [%0 + 0x08]
12277+ std %%o0, [%0 + 0x10]
12278+ std %%o2, [%0 + 0x18]
12279+ " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4)
12280+ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
12281+ "l0", "l1", "l2", "l3", "l4", "l5");
12282+ destp += 8;
12283+ source1 += 8;
12284+ source2 += 8;
12285+ source3 += 8;
12286+ source4 += 8;
12287+ }
12288+ break;
12289+ }
12290+}
12291+#endif /* __sparc_v[78]__ */
12292+
12293+#ifndef __sparc_v9__
12294+
12295+/*
12296+ * this one works reasonably on any x86 CPU
12297+ * (send me an assembly version for inclusion if you can make it faster)
12298+ *
12299+ * this one is just as fast as if it were written in pure assembly
12300+ * on x86. the reason for this separate version is that the
12301+ * fast open-coded xor routine "32regs" produces suboptimal code
12302+ * on x86, due to the lack of registers.
12303+ */
12304+XORBLOCK_TEMPLATE(8regs)
12305+{
12306+ int len = bh_ptr[0]->b_size;
12307+ long *destp = (long *) bh_ptr[0]->b_data;
12308+ long *source1, *source2, *source3, *source4;
12309+ long lines = len / (sizeof (long)) / 8, i;
12310+
12311+ switch(count) {
12312+ case 2:
12313+ source1 = (long *) bh_ptr[1]->b_data;
12314+ for (i = lines; i > 0; i--) {
12315+ *(destp + 0) ^= *(source1 + 0);
12316+ *(destp + 1) ^= *(source1 + 1);
12317+ *(destp + 2) ^= *(source1 + 2);
12318+ *(destp + 3) ^= *(source1 + 3);
12319+ *(destp + 4) ^= *(source1 + 4);
12320+ *(destp + 5) ^= *(source1 + 5);
12321+ *(destp + 6) ^= *(source1 + 6);
12322+ *(destp + 7) ^= *(source1 + 7);
12323+ source1 += 8;
12324+ destp += 8;
12325+ }
12326+ break;
12327+ case 3:
12328+ source2 = (long *) bh_ptr[2]->b_data;
12329+ source1 = (long *) bh_ptr[1]->b_data;
12330+ for (i = lines; i > 0; i--) {
12331+ *(destp + 0) ^= *(source1 + 0);
12332+ *(destp + 0) ^= *(source2 + 0);
12333+ *(destp + 1) ^= *(source1 + 1);
12334+ *(destp + 1) ^= *(source2 + 1);
12335+ *(destp + 2) ^= *(source1 + 2);
12336+ *(destp + 2) ^= *(source2 + 2);
12337+ *(destp + 3) ^= *(source1 + 3);
12338+ *(destp + 3) ^= *(source2 + 3);
12339+ *(destp + 4) ^= *(source1 + 4);
12340+ *(destp + 4) ^= *(source2 + 4);
12341+ *(destp + 5) ^= *(source1 + 5);
12342+ *(destp + 5) ^= *(source2 + 5);
12343+ *(destp + 6) ^= *(source1 + 6);
12344+ *(destp + 6) ^= *(source2 + 6);
12345+ *(destp + 7) ^= *(source1 + 7);
12346+ *(destp + 7) ^= *(source2 + 7);
12347+ source1 += 8;
12348+ source2 += 8;
12349+ destp += 8;
12350+ }
12351+ break;
12352+ case 4:
12353+ source3 = (long *) bh_ptr[3]->b_data;
12354+ source2 = (long *) bh_ptr[2]->b_data;
12355+ source1 = (long *) bh_ptr[1]->b_data;
12356+ for (i = lines; i > 0; i--) {
12357+ *(destp + 0) ^= *(source1 + 0);
12358+ *(destp + 0) ^= *(source2 + 0);
12359+ *(destp + 0) ^= *(source3 + 0);
12360+ *(destp + 1) ^= *(source1 + 1);
12361+ *(destp + 1) ^= *(source2 + 1);
12362+ *(destp + 1) ^= *(source3 + 1);
12363+ *(destp + 2) ^= *(source1 + 2);
12364+ *(destp + 2) ^= *(source2 + 2);
12365+ *(destp + 2) ^= *(source3 + 2);
12366+ *(destp + 3) ^= *(source1 + 3);
12367+ *(destp + 3) ^= *(source2 + 3);
12368+ *(destp + 3) ^= *(source3 + 3);
12369+ *(destp + 4) ^= *(source1 + 4);
12370+ *(destp + 4) ^= *(source2 + 4);
12371+ *(destp + 4) ^= *(source3 + 4);
12372+ *(destp + 5) ^= *(source1 + 5);
12373+ *(destp + 5) ^= *(source2 + 5);
12374+ *(destp + 5) ^= *(source3 + 5);
12375+ *(destp + 6) ^= *(source1 + 6);
12376+ *(destp + 6) ^= *(source2 + 6);
12377+ *(destp + 6) ^= *(source3 + 6);
12378+ *(destp + 7) ^= *(source1 + 7);
12379+ *(destp + 7) ^= *(source2 + 7);
12380+ *(destp + 7) ^= *(source3 + 7);
12381+ source1 += 8;
12382+ source2 += 8;
12383+ source3 += 8;
12384+ destp += 8;
12385+ }
12386+ break;
12387+ case 5:
12388+ source4 = (long *) bh_ptr[4]->b_data;
12389+ source3 = (long *) bh_ptr[3]->b_data;
12390+ source2 = (long *) bh_ptr[2]->b_data;
12391+ source1 = (long *) bh_ptr[1]->b_data;
12392+ for (i = lines; i > 0; i--) {
12393+ *(destp + 0) ^= *(source1 + 0);
12394+ *(destp + 0) ^= *(source2 + 0);
12395+ *(destp + 0) ^= *(source3 + 0);
12396+ *(destp + 0) ^= *(source4 + 0);
12397+ *(destp + 1) ^= *(source1 + 1);
12398+ *(destp + 1) ^= *(source2 + 1);
12399+ *(destp + 1) ^= *(source3 + 1);
12400+ *(destp + 1) ^= *(source4 + 1);
12401+ *(destp + 2) ^= *(source1 + 2);
12402+ *(destp + 2) ^= *(source2 + 2);
12403+ *(destp + 2) ^= *(source3 + 2);
12404+ *(destp + 2) ^= *(source4 + 2);
12405+ *(destp + 3) ^= *(source1 + 3);
12406+ *(destp + 3) ^= *(source2 + 3);
12407+ *(destp + 3) ^= *(source3 + 3);
12408+ *(destp + 3) ^= *(source4 + 3);
12409+ *(destp + 4) ^= *(source1 + 4);
12410+ *(destp + 4) ^= *(source2 + 4);
12411+ *(destp + 4) ^= *(source3 + 4);
12412+ *(destp + 4) ^= *(source4 + 4);
12413+ *(destp + 5) ^= *(source1 + 5);
12414+ *(destp + 5) ^= *(source2 + 5);
12415+ *(destp + 5) ^= *(source3 + 5);
12416+ *(destp + 5) ^= *(source4 + 5);
12417+ *(destp + 6) ^= *(source1 + 6);
12418+ *(destp + 6) ^= *(source2 + 6);
12419+ *(destp + 6) ^= *(source3 + 6);
12420+ *(destp + 6) ^= *(source4 + 6);
12421+ *(destp + 7) ^= *(source1 + 7);
12422+ *(destp + 7) ^= *(source2 + 7);
12423+ *(destp + 7) ^= *(source3 + 7);
12424+ *(destp + 7) ^= *(source4 + 7);
12425+ source1 += 8;
12426+ source2 += 8;
12427+ source3 += 8;
12428+ source4 += 8;
12429+ destp += 8;
12430+ }
12431+ break;
12432+ }
12433+}
12434+
12435+/*
12436+ * platform-independent RAID5 checksum calculation; this should
12437+ * be very fast on any platform that has a decent number of
12438+ * registers (32 or more).
12439+ */
12440+XORBLOCK_TEMPLATE(32regs)
12441+{
12442+ int size = bh_ptr[0]->b_size;
12443+ int lines = size / (sizeof (long)) / 8, i;
12444+ long *destp = (long *) bh_ptr[0]->b_data;
12445+ long *source1, *source2, *source3, *source4;
12446+
12447+ /* LOTS of registers available...
12448+ We do explicit loop-unrolling here for code which
12449+ favours RISC machines. In fact this is almost direct
12450+ RISC assembly on Alpha and SPARC :-) */
12451+
12452+
12453+ switch(count) {
12454+ case 2:
12455+ source1 = (long *) bh_ptr[1]->b_data;
12456+ for (i = lines; i > 0; i--) {
12457+ register long d0, d1, d2, d3, d4, d5, d6, d7;
12458+ d0 = destp[0]; /* Pull the stuff into registers */
12459+ d1 = destp[1]; /* ... in bursts, if possible. */
12460+ d2 = destp[2];
12461+ d3 = destp[3];
12462+ d4 = destp[4];
12463+ d5 = destp[5];
12464+ d6 = destp[6];
12465+ d7 = destp[7];
12466+ d0 ^= source1[0];
12467+ d1 ^= source1[1];
12468+ d2 ^= source1[2];
12469+ d3 ^= source1[3];
12470+ d4 ^= source1[4];
12471+ d5 ^= source1[5];
12472+ d6 ^= source1[6];
12473+ d7 ^= source1[7];
12474+ destp[0] = d0; /* Store the result (in bursts) */
12475+ destp[1] = d1;
12476+ destp[2] = d2;
12477+ destp[3] = d3;
12478+ destp[4] = d4; /* Store the result (in bursts) */
12479+ destp[5] = d5;
12480+ destp[6] = d6;
12481+ destp[7] = d7;
12482+ source1 += 8;
12483+ destp += 8;
12484+ }
12485+ break;
12486+ case 3:
12487+ source2 = (long *) bh_ptr[2]->b_data;
12488+ source1 = (long *) bh_ptr[1]->b_data;
12489+ for (i = lines; i > 0; i--) {
12490+ register long d0, d1, d2, d3, d4, d5, d6, d7;
12491+ d0 = destp[0]; /* Pull the stuff into registers */
12492+ d1 = destp[1]; /* ... in bursts, if possible. */
12493+ d2 = destp[2];
12494+ d3 = destp[3];
12495+ d4 = destp[4];
12496+ d5 = destp[5];
12497+ d6 = destp[6];
12498+ d7 = destp[7];
12499+ d0 ^= source1[0];
12500+ d1 ^= source1[1];
12501+ d2 ^= source1[2];
12502+ d3 ^= source1[3];
12503+ d4 ^= source1[4];
12504+ d5 ^= source1[5];
12505+ d6 ^= source1[6];
12506+ d7 ^= source1[7];
12507+ d0 ^= source2[0];
12508+ d1 ^= source2[1];
12509+ d2 ^= source2[2];
12510+ d3 ^= source2[3];
12511+ d4 ^= source2[4];
12512+ d5 ^= source2[5];
12513+ d6 ^= source2[6];
12514+ d7 ^= source2[7];
12515+ destp[0] = d0; /* Store the result (in bursts) */
12516+ destp[1] = d1;
12517+ destp[2] = d2;
12518+ destp[3] = d3;
12519+ destp[4] = d4; /* Store the result (in bursts) */
12520+ destp[5] = d5;
12521+ destp[6] = d6;
12522+ destp[7] = d7;
12523+ source1 += 8;
12524+ source2 += 8;
12525+ destp += 8;
12526+ }
12527+ break;
12528+ case 4:
12529+ source3 = (long *) bh_ptr[3]->b_data;
12530+ source2 = (long *) bh_ptr[2]->b_data;
12531+ source1 = (long *) bh_ptr[1]->b_data;
12532+ for (i = lines; i > 0; i--) {
12533+ register long d0, d1, d2, d3, d4, d5, d6, d7;
12534+ d0 = destp[0]; /* Pull the stuff into registers */
12535+ d1 = destp[1]; /* ... in bursts, if possible. */
12536+ d2 = destp[2];
12537+ d3 = destp[3];
12538+ d4 = destp[4];
12539+ d5 = destp[5];
12540+ d6 = destp[6];
12541+ d7 = destp[7];
12542+ d0 ^= source1[0];
12543+ d1 ^= source1[1];
12544+ d2 ^= source1[2];
12545+ d3 ^= source1[3];
12546+ d4 ^= source1[4];
12547+ d5 ^= source1[5];
12548+ d6 ^= source1[6];
12549+ d7 ^= source1[7];
12550+ d0 ^= source2[0];
12551+ d1 ^= source2[1];
12552+ d2 ^= source2[2];
12553+ d3 ^= source2[3];
12554+ d4 ^= source2[4];
12555+ d5 ^= source2[5];
12556+ d6 ^= source2[6];
12557+ d7 ^= source2[7];
12558+ d0 ^= source3[0];
12559+ d1 ^= source3[1];
12560+ d2 ^= source3[2];
12561+ d3 ^= source3[3];
12562+ d4 ^= source3[4];
12563+ d5 ^= source3[5];
12564+ d6 ^= source3[6];
12565+ d7 ^= source3[7];
12566+ destp[0] = d0; /* Store the result (in bursts) */
12567+ destp[1] = d1;
12568+ destp[2] = d2;
12569+ destp[3] = d3;
12570+ destp[4] = d4; /* Store the result (in bursts) */
12571+ destp[5] = d5;
12572+ destp[6] = d6;
12573+ destp[7] = d7;
12574+ source1 += 8;
12575+ source2 += 8;
12576+ source3 += 8;
12577+ destp += 8;
12578+ }
12579+ break;
12580+ case 5:
12581+ source4 = (long *) bh_ptr[4]->b_data;
12582+ source3 = (long *) bh_ptr[3]->b_data;
12583+ source2 = (long *) bh_ptr[2]->b_data;
12584+ source1 = (long *) bh_ptr[1]->b_data;
12585+ for (i = lines; i > 0; i--) {
12586+ register long d0, d1, d2, d3, d4, d5, d6, d7;
12587+ d0 = destp[0]; /* Pull the stuff into registers */
12588+ d1 = destp[1]; /* ... in bursts, if possible. */
12589+ d2 = destp[2];
12590+ d3 = destp[3];
12591+ d4 = destp[4];
12592+ d5 = destp[5];
12593+ d6 = destp[6];
12594+ d7 = destp[7];
12595+ d0 ^= source1[0];
12596+ d1 ^= source1[1];
12597+ d2 ^= source1[2];
12598+ d3 ^= source1[3];
12599+ d4 ^= source1[4];
12600+ d5 ^= source1[5];
12601+ d6 ^= source1[6];
12602+ d7 ^= source1[7];
12603+ d0 ^= source2[0];
12604+ d1 ^= source2[1];
12605+ d2 ^= source2[2];
12606+ d3 ^= source2[3];
12607+ d4 ^= source2[4];
12608+ d5 ^= source2[5];
12609+ d6 ^= source2[6];
12610+ d7 ^= source2[7];
12611+ d0 ^= source3[0];
12612+ d1 ^= source3[1];
12613+ d2 ^= source3[2];
12614+ d3 ^= source3[3];
12615+ d4 ^= source3[4];
12616+ d5 ^= source3[5];
12617+ d6 ^= source3[6];
12618+ d7 ^= source3[7];
12619+ d0 ^= source4[0];
12620+ d1 ^= source4[1];
12621+ d2 ^= source4[2];
12622+ d3 ^= source4[3];
12623+ d4 ^= source4[4];
12624+ d5 ^= source4[5];
12625+ d6 ^= source4[6];
12626+ d7 ^= source4[7];
12627+ destp[0] = d0; /* Store the result (in bursts) */
12628+ destp[1] = d1;
12629+ destp[2] = d2;
12630+ destp[3] = d3;
12631+ destp[4] = d4; /* Store the result (in bursts) */
12632+ destp[5] = d5;
12633+ destp[6] = d6;
12634+ destp[7] = d7;
12635+ source1 += 8;
12636+ source2 += 8;
12637+ source3 += 8;
12638+ source4 += 8;
12639+ destp += 8;
12640+ }
12641+ break;
12642+ }
12643+}
12644+
12645+/*
12646+ * (the -6*32 shift factor colors the cache)
12647+ */
12648+#define SIZE (PAGE_SIZE-6*32)
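
With 4 kB pages this makes SIZE = 4096 - 6*32 = 3904 bytes, i.e. six 32-byte cache lines short of a page, so the two benchmark buffers set up in calibrate_xor_block() below do not collide on the same lines of a small direct-mapped L1 cache.
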
12649+
12650+static void xor_speed ( struct xor_block_template * func,
12651+ struct buffer_head *b1, struct buffer_head *b2)
12652+{
12653+ int speed;
12654+ unsigned long now;
12655+ int i, count, max;
12656+ struct buffer_head *bh_ptr[6];
12657+
12658+ func->next = xor_functions;
12659+ xor_functions = func;
12660+ bh_ptr[0] = b1;
12661+ bh_ptr[1] = b2;
12662+
12663+ /*
12664+ * count the number of XOR passes done during a whole jiffy
12665+ * and calculate the checksumming speed from this.
12666+ * (we use an order-2 page allocation to have a guaranteed
12667+ * L1-cache color layout)
12668+ */
12669+ max = 0;
12670+ for (i = 0; i < 5; i++) {
12671+ now = jiffies;
12672+ count = 0;
12673+ while (jiffies == now) {
12674+ mb();
12675+ func->xor_block(2,bh_ptr);
12676+ mb();
12677+ count++;
12678+ mb();
12679+ }
12680+ if (count > max)
12681+ max = count;
12682+ }
12683+
12684+ speed = max * (HZ*SIZE/1024);
12685+ func->speed = speed;
12686+
12687+ printk( " %-10s: %5d.%03d MB/sec\n", func->name,
12688+ speed / 1000, speed % 1000);
12689+}
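
xor_speed() counts complete XOR passes over the SIZE-byte buffers during one jiffy; with HZ jiffies per second, speed = max * (HZ*SIZE/1024) comes out in KB/s, which the printk formats as MB/s. A user-space analogue of the same measurement, timing one wall-clock second instead of one jiffy (the harness below is hypothetical; only SIZE comes from the patch):

/* Hypothetical user-space analogue of xor_speed(). */
#include <stdio.h>
#include <time.h>

#define SIZE 3904			/* PAGE_SIZE - 6*32 on 4 kB pages */

static unsigned char a[SIZE], b[SIZE];

static void xor_once(void)
{
	int i;

	for (i = 0; i < SIZE; i++)
		a[i] ^= b[i];
}

int main(void)
{
	clock_t start = clock();
	long count = 0;
	long long kb;

	while (clock() - start < CLOCKS_PER_SEC) {
		xor_once();
		count++;
	}
	/* passes/sec * KB/pass = KB/s; the kernel's
	 * speed = max * (HZ*SIZE/1024) is the same unit. */
	kb = (long long) count * SIZE / 1024;
	printf("%lld.%03lld MB/sec\n", kb / 1000, kb % 1000);
	return 0;
}
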
12690+
12691+static inline void pick_fastest_function(void)
12692+{
12693+ struct xor_block_template *f, *fastest;
12694+
12695+ fastest = xor_functions;
12696+ for (f = fastest; f; f = f->next) {
12697+ if (f->speed > fastest->speed)
12698+ fastest = f;
12699+ }
12700+#ifdef CONFIG_X86_XMM
12701+ if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
12702+ fastest = &t_xor_block_pIII_kni;
12703+ }
12704+#endif
12705+ xor_block = fastest->xor_block;
12706+ printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
12707+ fastest->speed / 1000, fastest->speed % 1000);
12708+}
12709+
12710+
12711+void calibrate_xor_block(void)
12712+{
12713+ struct buffer_head b1, b2;
12714+
12715+ memset(&b1,0,sizeof(b1));
12716+ b2 = b1;
12717+
12718+ b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2);
12719+ if (!b1.b_data) {
12720+ pick_fastest_function();
12721+ return;
12722+ }
12723+ b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE;
12724+
12725+ b1.b_size = SIZE;
12726+
12727+ printk(KERN_INFO "raid5: measuring checksumming speed\n");
12728+
12729+ sti(); /* should be safe */
12730+
12731+#if defined(__sparc__) && !defined(__sparc_v9__)
12732+ printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n");
12733+ xor_speed(&t_xor_block_SPARC,&b1,&b2);
12734+#endif
12735+
12736+#ifdef CONFIG_X86_XMM
12737+ if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
12738+ printk(KERN_INFO
12739+ "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n");
12740+ /* we force the use of the KNI xor block because it
12741+ can write around the L2 cache. we may also be able
12742+ to load into the L1 cache only, depending on how
12743+ the CPU deals with a load to a line that is
12744+ being prefetched.
12745+ */
12746+ xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
12747+ }
12748+#endif /* CONFIG_X86_XMM */
12749+
12750+#ifdef __i386__
12751+
12752+ if (md_cpu_has_mmx()) {
12753+ printk(KERN_INFO
12754+ "raid5: MMX detected, trying high-speed MMX checksum routines\n");
12755+ xor_speed(&t_xor_block_pII_mmx,&b1,&b2);
12756+ xor_speed(&t_xor_block_p5_mmx,&b1,&b2);
12757+ }
12758+
12759+#endif /* __i386__ */
12760+
12761+
12762+ xor_speed(&t_xor_block_8regs,&b1,&b2);
12763+ xor_speed(&t_xor_block_32regs,&b1,&b2);
12764+
12765+ free_pages((unsigned long)b1.b_data,2);
12766+ pick_fastest_function();
12767+}
12768+
12769+#else /* __sparc_v9__ */
12770+
12771+void calibrate_xor_block(void)
12772+{
12773+ printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n");
12774+ xor_block = xor_block_VIS;
12775+}
12776+
12777+#endif /* __sparc_v9__ */
12778+
12779+MD_EXPORT_SYMBOL(xor_block);
12780+
12781diff -ruN linux.orig/include/asm-alpha/md.h linux-2.2.16/include/asm-alpha/md.h
12782--- linux.orig/include/asm-alpha/md.h Fri May 8 09:17:13 1998
12783+++ linux-2.2.16/include/asm-alpha/md.h Thu Jan 1 01:00:00 1970
12784@@ -1,13 +0,0 @@
12785-/* $Id$
12786- * md.h: High speed xor_block operation for RAID4/5
12787- *
12788- */
12789-
12790-#ifndef __ASM_MD_H
12791-#define __ASM_MD_H
12792-
12793-/* #define HAVE_ARCH_XORBLOCK */
12794-
12795-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
12796-
12797-#endif /* __ASM_MD_H */
12798diff -ruN linux.orig/include/asm-i386/md.h linux-2.2.16/include/asm-i386/md.h
12799--- linux.orig/include/asm-i386/md.h Fri May 8 09:17:13 1998
12800+++ linux-2.2.16/include/asm-i386/md.h Thu Jan 1 01:00:00 1970
12801@@ -1,13 +0,0 @@
12802-/* $Id$
12803- * md.h: High speed xor_block operation for RAID4/5
12804- *
12805- */
12806-
12807-#ifndef __ASM_MD_H
12808-#define __ASM_MD_H
12809-
12810-/* #define HAVE_ARCH_XORBLOCK */
12811-
12812-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
12813-
12814-#endif /* __ASM_MD_H */
12815diff -ruN linux.orig/include/asm-m68k/md.h linux-2.2.16/include/asm-m68k/md.h
12816--- linux.orig/include/asm-m68k/md.h Fri May 8 09:15:22 1998
12817+++ linux-2.2.16/include/asm-m68k/md.h Thu Jan 1 01:00:00 1970
12818@@ -1,13 +0,0 @@
12819-/* $Id$
12820- * md.h: High speed xor_block operation for RAID4/5
12821- *
12822- */
12823-
12824-#ifndef __ASM_MD_H
12825-#define __ASM_MD_H
12826-
12827-/* #define HAVE_ARCH_XORBLOCK */
12828-
12829-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
12830-
12831-#endif /* __ASM_MD_H */
12832diff -ruN linux.orig/include/asm-ppc/md.h linux-2.2.16/include/asm-ppc/md.h
12833--- linux.orig/include/asm-ppc/md.h Wed Oct 27 02:53:42 1999
12834+++ linux-2.2.16/include/asm-ppc/md.h Thu Jan 1 01:00:00 1970
12835@@ -1,13 +0,0 @@
12836-/* $Id$
12837- * md.h: High speed xor_block operation for RAID4/5
12838- *
12839- */
12840-
12841-#ifndef __ASM_MD_H
12842-#define __ASM_MD_H
12843-
12844-/* #define HAVE_ARCH_XORBLOCK */
12845-
12846-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
12847-
12848-#endif /* __ASM_MD_H */
12849diff -ruN linux.orig/include/asm-sparc/md.h linux-2.2.16/include/asm-sparc/md.h
12850--- linux.orig/include/asm-sparc/md.h Tue Jan 13 00:15:54 1998
12851+++ linux-2.2.16/include/asm-sparc/md.h Thu Jan 1 01:00:00 1970
12852@@ -1,13 +0,0 @@
12853-/* $Id$
12854- * md.h: High speed xor_block operation for RAID4/5
12855- *
12856- */
12857-
12858-#ifndef __ASM_MD_H
12859-#define __ASM_MD_H
12860-
12861-/* #define HAVE_ARCH_XORBLOCK */
12862-
12863-#define MD_XORBLOCK_ALIGNMENT sizeof(long)
12864-
12865-#endif /* __ASM_MD_H */
12866diff -ruN linux.orig/include/asm-sparc64/md.h linux-2.2.16/include/asm-sparc64/md.h
12867--- linux.orig/include/asm-sparc64/md.h Tue Jan 13 00:15:58 1998
12868+++ linux-2.2.16/include/asm-sparc64/md.h Thu Jan 1 01:00:00 1970
12869@@ -1,91 +0,0 @@
12870-/* $Id$
12871- * md.h: High speed xor_block operation for RAID4/5
12872- * utilizing the UltraSparc Visual Instruction Set.
12873- *
12874- * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
12875- */
12876-
12877-#ifndef __ASM_MD_H
12878-#define __ASM_MD_H
12879-
12880-#include <asm/head.h>
12881-#include <asm/asi.h>
12882-
12883-#define HAVE_ARCH_XORBLOCK
12884-
12885-#define MD_XORBLOCK_ALIGNMENT 64
12886-
12887-/* void __xor_block (char *dest, char *src, long len)
12888- * {
12889- * while (len--) *dest++ ^= *src++;
12890- * }
12891- *
12892- * Requirements:
12893- * !(((long)dest | (long)src) & (MD_XORBLOCK_ALIGNMENT - 1)) &&
12894- * !(len & 127) && len >= 256
12895- */
12896-
12897-static inline void __xor_block (char *dest, char *src, long len)
12898-{
12899- __asm__ __volatile__ ("
12900- wr %%g0, %3, %%fprs
12901- wr %%g0, %4, %%asi
12902- membar #LoadStore|#StoreLoad|#StoreStore
12903- sub %2, 128, %2
12904- ldda [%0] %4, %%f0
12905- ldda [%1] %4, %%f16
12906-1: ldda [%0 + 64] %%asi, %%f32
12907- fxor %%f0, %%f16, %%f16
12908- fxor %%f2, %%f18, %%f18
12909- fxor %%f4, %%f20, %%f20
12910- fxor %%f6, %%f22, %%f22
12911- fxor %%f8, %%f24, %%f24
12912- fxor %%f10, %%f26, %%f26
12913- fxor %%f12, %%f28, %%f28
12914- fxor %%f14, %%f30, %%f30
12915- stda %%f16, [%0] %4
12916- ldda [%1 + 64] %%asi, %%f48
12917- ldda [%0 + 128] %%asi, %%f0
12918- fxor %%f32, %%f48, %%f48
12919- fxor %%f34, %%f50, %%f50
12920- add %0, 128, %0
12921- fxor %%f36, %%f52, %%f52
12922- add %1, 128, %1
12923- fxor %%f38, %%f54, %%f54
12924- subcc %2, 128, %2
12925- fxor %%f40, %%f56, %%f56
12926- fxor %%f42, %%f58, %%f58
12927- fxor %%f44, %%f60, %%f60
12928- fxor %%f46, %%f62, %%f62
12929- stda %%f48, [%0 - 64] %%asi
12930- bne,pt %%xcc, 1b
12931- ldda [%1] %4, %%f16
12932- ldda [%0 + 64] %%asi, %%f32
12933- fxor %%f0, %%f16, %%f16
12934- fxor %%f2, %%f18, %%f18
12935- fxor %%f4, %%f20, %%f20
12936- fxor %%f6, %%f22, %%f22
12937- fxor %%f8, %%f24, %%f24
12938- fxor %%f10, %%f26, %%f26
12939- fxor %%f12, %%f28, %%f28
12940- fxor %%f14, %%f30, %%f30
12941- stda %%f16, [%0] %4
12942- ldda [%1 + 64] %%asi, %%f48
12943- membar #Sync
12944- fxor %%f32, %%f48, %%f48
12945- fxor %%f34, %%f50, %%f50
12946- fxor %%f36, %%f52, %%f52
12947- fxor %%f38, %%f54, %%f54
12948- fxor %%f40, %%f56, %%f56
12949- fxor %%f42, %%f58, %%f58
12950- fxor %%f44, %%f60, %%f60
12951- fxor %%f46, %%f62, %%f62
12952- stda %%f48, [%0 + 64] %%asi
12953- membar #Sync|#StoreStore|#StoreLoad
12954- wr %%g0, 0, %%fprs
12955- " : :
12956- "r" (dest), "r" (src), "r" (len), "i" (FPRS_FEF), "i" (ASI_BLK_P) :
12957- "cc", "memory");
12958-}
12959-
12960-#endif /* __ASM_MD_H */
12961Binary files linux.orig/include/linux/.sysctl.h.rej.swp and linux-2.2.16/include/linux/.sysctl.h.rej.swp differ
12962diff -ruN linux.orig/include/linux/blkdev.h linux-2.2.16/include/linux/blkdev.h
12963--- linux.orig/include/linux/blkdev.h Wed Jun 7 23:26:44 2000
12964+++ linux-2.2.16/include/linux/blkdev.h Fri Jun 9 11:37:44 2000
12965@@ -93,8 +93,9 @@
12966 extern void make_request(int major,int rw, struct buffer_head * bh);
12967
12968 /* md needs this function to remap requests */
12969-extern int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size);
12970-extern int md_make_request (int minor, int rw, struct buffer_head * bh);
12971+extern int md_map (kdev_t dev, kdev_t *rdev,
12972+ unsigned long *rsector, unsigned long size);
12973+extern int md_make_request (struct buffer_head * bh, int rw);
12974 extern int md_error (kdev_t mddev, kdev_t rdev);
12975
12976 extern int * blk_size[MAX_BLKDEV];
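
md_map() now identifies the array by kdev_t instead of by raw minor number. A hypothetical caller-side sketch of what this interface supports, remapping repeatedly until the request names a real disk (the loop shape and error handling are assumptions, not taken from this hunk):

/* Hypothetical caller of the new md_map() calling convention. */
static int remap_to_real_device(kdev_t dev, kdev_t *rdev,
				unsigned long *rsector,
				unsigned long size)
{
	*rdev = dev;
	while (MAJOR(*rdev) == MD_MAJOR)	/* arrays may stack */
		if (md_map(*rdev, rdev, rsector, size) < 0)
			return -1;		/* array not running */
	return 0;				/* *rdev is a real disk */
}
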
12977diff -ruN linux.orig/include/linux/fs.h linux-2.2.16/include/linux/fs.h
12978--- linux.orig/include/linux/fs.h Wed Jun 7 23:26:44 2000
12979+++ linux-2.2.16/include/linux/fs.h Fri Jun 9 11:37:44 2000
12980@@ -185,6 +185,7 @@
12981 #define BH_Lock 2 /* 1 if the buffer is locked */
12982 #define BH_Req 3 /* 0 if the buffer has been invalidated */
12983 #define BH_Protected 6 /* 1 if the buffer is protected */
12984+#define BH_LowPrio 7 /* 1 if the buffer is lowprio */
12985
12986 /*
12987 * Try to keep the most commonly used fields in single cache lines (16
12988@@ -755,6 +756,7 @@
12989 extern void refile_buffer(struct buffer_head * buf);
12990 extern void set_writetime(struct buffer_head * buf, int flag);
12991 extern int try_to_free_buffers(struct page *);
12992+extern void cache_drop_behind(struct buffer_head *bh);
12993
12994 extern int nr_buffers;
12995 extern long buffermem;
12996@@ -775,6 +777,25 @@
12997 }
12998 }
12999
13000+extern inline void mark_buffer_highprio(struct buffer_head * bh)
13001+{
13002+ clear_bit(BH_LowPrio, &bh->b_state);
13003+}
13004+
13005+extern inline void mark_buffer_lowprio(struct buffer_head * bh)
13006+{
13007+ /*
13008+ * dirty buffers cannot be marked lowprio.
13009+ */
13010+ if (!buffer_dirty(bh))
13011+ set_bit(BH_LowPrio, &bh->b_state);
13012+}
13013+
13014+static inline int buffer_lowprio(struct buffer_head * bh)
13015+{
13016+ return test_bit(BH_LowPrio, &bh->b_state);
13017+}
13018+
13019 extern inline void mark_buffer_dirty(struct buffer_head * bh, int flag)
13020 {
13021 if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
13022@@ -782,6 +803,23 @@
13023 if (bh->b_list != BUF_DIRTY)
13024 refile_buffer(bh);
13025 }
13026+ /*
13027+ * if a buffer gets marked dirty then it has to lose
13028+ * its lowprio state.
13029+ */
13030+ mark_buffer_highprio(bh);
13031+}
13032+
13033+extern inline void mark_buffer_dirty_lowprio(struct buffer_head * bh)
13034+{
13035+ if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
13036+ if (bh->b_list != BUF_DIRTY)
13037+ refile_buffer(bh);
13038+ /*
13039+ * Mark it lowprio only if it was not dirty before!
13040+ */
13041+ set_bit(BH_LowPrio, &bh->b_state);
13042+ }
13043 }
13044
13045 extern int check_disk_change(kdev_t dev);
13046@@ -855,6 +893,7 @@
13047 extern struct buffer_head * find_buffer(kdev_t dev, int block, int size);
13048 extern void ll_rw_block(int, int, struct buffer_head * bh[]);
13049 extern int is_read_only(kdev_t);
13050+extern int is_device_idle(kdev_t);
13051 extern void __brelse(struct buffer_head *);
13052 extern inline void brelse(struct buffer_head *buf)
13053 {
13054@@ -870,8 +909,12 @@
13055 extern void set_blocksize(kdev_t dev, int size);
13056 extern unsigned int get_hardblocksize(kdev_t dev);
13057 extern struct buffer_head * bread(kdev_t dev, int block, int size);
13058+extern struct buffer_head * buffer_ready (kdev_t dev, int block, int size);
13059+extern void bread_ahead (kdev_t dev, int block, int size);
13060 extern struct buffer_head * breada(kdev_t dev,int block, int size,
13061 unsigned int pos, unsigned int filesize);
13062+extern struct buffer_head * breada_blocks(kdev_t dev,int block,
13063+ int size, int blocks);
13064
13065 extern int brw_page(int, struct page *, kdev_t, int [], int, int);
13066
13067diff -ruN linux.orig/include/linux/md.h linux-2.2.16/include/linux/md.h
13068--- linux.orig/include/linux/md.h Fri May 8 09:17:13 1998
13069+++ linux-2.2.16/include/linux/md.h Thu Jan 1 01:00:00 1970
13070@@ -1,300 +0,0 @@
13071-/*
13072- md.h : Multiple Devices driver for Linux
13073- Copyright (C) 1994-96 Marc ZYNGIER
13074- <zyngier@ufr-info-p7.ibp.fr> or
13075- <maz@gloups.fdn.fr>
13076-
13077- This program is free software; you can redistribute it and/or modify
13078- it under the terms of the GNU General Public License as published by
13079- the Free Software Foundation; either version 2, or (at your option)
13080- any later version.
13081-
13082- You should have received a copy of the GNU General Public License
13083- (for example /usr/src/linux/COPYING); if not, write to the Free
13084- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13085-*/
13086-
13087-#ifndef _MD_H
13088-#define _MD_H
13089-
13090-#include <linux/major.h>
13091-#include <linux/ioctl.h>
13092-#include <linux/types.h>
13093-
13094-/*
13095- * Different major versions are not compatible.
13096- * Different minor versions are only downward compatible.
13097- * Different patchlevel versions are downward and upward compatible.
13098- */
13099-#define MD_MAJOR_VERSION 0
13100-#define MD_MINOR_VERSION 36
13101-#define MD_PATCHLEVEL_VERSION 6
13102-
13103-#define MD_DEFAULT_DISK_READAHEAD (256 * 1024)
13104-
13105-/* ioctls */
13106-#define REGISTER_DEV _IO (MD_MAJOR, 1)
13107-#define START_MD _IO (MD_MAJOR, 2)
13108-#define STOP_MD _IO (MD_MAJOR, 3)
13109-#define REGISTER_DEV_NEW _IO (MD_MAJOR, 4)
13110-
13111-/*
13112- personalities :
13113- Byte 0 : Chunk size factor
13114- Byte 1 : Fault tolerance count for each physical device
13115- ( 0 means no fault tolerance,
13116- 0xFF means always tolerate faults), not used by now.
13117- Byte 2 : Personality
13118- Byte 3 : Reserved.
13119- */
13120-
13121-#define FAULT_SHIFT 8
13122-#define PERSONALITY_SHIFT 16
13123-
13124-#define FACTOR_MASK 0x000000FFUL
13125-#define FAULT_MASK 0x0000FF00UL
13126-#define PERSONALITY_MASK 0x00FF0000UL
13127-
13128-#define MD_RESERVED 0 /* Not used by now */
13129-#define LINEAR (1UL << PERSONALITY_SHIFT)
13130-#define STRIPED (2UL << PERSONALITY_SHIFT)
13131-#define RAID0 STRIPED
13132-#define RAID1 (3UL << PERSONALITY_SHIFT)
13133-#define RAID5 (4UL << PERSONALITY_SHIFT)
13134-#define MAX_PERSONALITY 5
13135-
13136-/*
13137- * MD superblock.
13138- *
13139- * The MD superblock maintains some statistics on each MD configuration.
13140- * Each real device in the MD set contains it near the end of the device.
13141- * Some of the ideas are copied from the ext2fs implementation.
13142- *
13143- * We currently use 4096 bytes as follows:
13144- *
13145- * word offset function
13146- *
13147- * 0 - 31 Constant generic MD device information.
13148- * 32 - 63 Generic state information.
13149- * 64 - 127 Personality specific information.
13150- * 128 - 511 12 32-words descriptors of the disks in the raid set.
13151- * 512 - 911 Reserved.
13152- * 912 - 1023 Disk specific descriptor.
13153- */
13154-
13155-/*
13156- * If x is the real device size in bytes, we return an apparent size of:
13157- *
13158- * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
13159- *
13160- * and place the 4kB superblock at offset y.
13161- */
13162-#define MD_RESERVED_BYTES (64 * 1024)
13163-#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
13164-#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
13165-
13166-#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
13167-#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
13168-
13169-#define MD_SB_BYTES 4096
13170-#define MD_SB_WORDS (MD_SB_BYTES / 4)
13171-#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
13172-#define MD_SB_SECTORS (MD_SB_BYTES / 512)
13173-
13174-/*
13175- * The following are counted in 32-bit words
13176- */
13177-#define MD_SB_GENERIC_OFFSET 0
13178-#define MD_SB_PERSONALITY_OFFSET 64
13179-#define MD_SB_DISKS_OFFSET 128
13180-#define MD_SB_DESCRIPTOR_OFFSET 992
13181-
13182-#define MD_SB_GENERIC_CONSTANT_WORDS 32
13183-#define MD_SB_GENERIC_STATE_WORDS 32
13184-#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
13185-#define MD_SB_PERSONALITY_WORDS 64
13186-#define MD_SB_DISKS_WORDS 384
13187-#define MD_SB_DESCRIPTOR_WORDS 32
13188-#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
13189-#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
13190-#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
13191-
13192-/*
13193- * Device "operational" state bits
13194- */
13195-#define MD_FAULTY_DEVICE 0 /* Device is faulty / operational */
13196-#define MD_ACTIVE_DEVICE 1 /* Device is a part or the raid set / spare disk */
13197-#define MD_SYNC_DEVICE 2 /* Device is in sync with the raid set */
13198-
13199-typedef struct md_device_descriptor_s {
13200- __u32 number; /* 0 Device number in the entire set */
13201- __u32 major; /* 1 Device major number */
13202- __u32 minor; /* 2 Device minor number */
13203- __u32 raid_disk; /* 3 The role of the device in the raid set */
13204- __u32 state; /* 4 Operational state */
13205- __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
13206-} md_descriptor_t;
13207-
13208-#define MD_SB_MAGIC 0xa92b4efc
13209-
13210-/*
13211- * Superblock state bits
13212- */
13213-#define MD_SB_CLEAN 0
13214-#define MD_SB_ERRORS 1
13215-
13216-typedef struct md_superblock_s {
13217-
13218- /*
13219- * Constant generic information
13220- */
13221- __u32 md_magic; /* 0 MD identifier */
13222- __u32 major_version; /* 1 major version to which the set conforms */
13223- __u32 minor_version; /* 2 minor version to which the set conforms */
13224- __u32 patch_version; /* 3 patchlevel version to which the set conforms */
13225- __u32 gvalid_words; /* 4 Number of non-reserved words in this section */
13226- __u32 set_magic; /* 5 Raid set identifier */
13227- __u32 ctime; /* 6 Creation time */
13228- __u32 level; /* 7 Raid personality (mirroring, raid5, ...) */
13229- __u32 size; /* 8 Apparent size of each individual disk, in kB */
13230- __u32 nr_disks; /* 9 Number of total disks in the raid set */
13231- __u32 raid_disks; /* 10 Number of disks in a fully functional raid set */
13232- __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 11];
13233-
13234- /*
13235- * Generic state information
13236- */
13237- __u32 utime; /* 0 Superblock update time */
13238- __u32 state; /* 1 State bits (clean, ...) */
13239- __u32 active_disks; /* 2 Number of currently active disks (some non-faulty disks might not be in sync) */
13240- __u32 working_disks; /* 3 Number of working disks */
13241- __u32 failed_disks; /* 4 Number of failed disks */
13242- __u32 spare_disks; /* 5 Number of spare disks */
13243- __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 6];
13244-
13245- /*
13246- * Personality information
13247- */
13248- __u32 parity_algorithm;
13249- __u32 chunk_size;
13250- __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 2];
13251-
13252- /*
13253- * Disks information
13254- */
13255- md_descriptor_t disks[MD_SB_DISKS];
13256-
13257- /*
13258- * Reserved
13259- */
13260- __u32 reserved[MD_SB_RESERVED_WORDS];
13261-
13262- /*
13263- * Active descriptor
13264- */
13265- md_descriptor_t descriptor;
13266-} md_superblock_t;
13267-
13268-#ifdef __KERNEL__
13269-
13270-#include <linux/mm.h>
13271-#include <linux/fs.h>
13272-#include <linux/blkdev.h>
13273-#include <asm/semaphore.h>
13274-
13275-/*
13276- * Kernel-based reconstruction is mostly working, but still requires
13277- * some additional work.
13278- */
13279-#define SUPPORT_RECONSTRUCTION 0
13280-
13281-#define MAX_REAL 8 /* Max number of physical dev per md dev */
13282-#define MAX_MD_DEV 4 /* Max number of md dev */
13283-
13284-#define FACTOR(a) ((a)->repartition & FACTOR_MASK)
13285-#define MAX_FAULT(a) (((a)->repartition & FAULT_MASK)>>8)
13286-#define PERSONALITY(a) ((a)->repartition & PERSONALITY_MASK)
13287-
13288-#define FACTOR_SHIFT(a) (PAGE_SHIFT + (a) - 10)
13289-
13290-struct real_dev
13291-{
13292- kdev_t dev; /* Device number */
13293- int size; /* Device size (in blocks) */
13294- int offset; /* Real device offset (in blocks) in md dev
13295- (only used in linear mode) */
13296- struct inode *inode; /* Lock inode */
13297- md_superblock_t *sb;
13298- u32 sb_offset;
13299-};
13300-
13301-struct md_dev;
13302-
13303-#define SPARE_INACTIVE 0
13304-#define SPARE_WRITE 1
13305-#define SPARE_ACTIVE 2
13306-
13307-struct md_personality
13308-{
13309- char *name;
13310- int (*map)(struct md_dev *mddev, kdev_t *rdev,
13311- unsigned long *rsector, unsigned long size);
13312- int (*make_request)(struct md_dev *mddev, int rw, struct buffer_head * bh);
13313- void (*end_request)(struct buffer_head * bh, int uptodate);
13314- int (*run)(int minor, struct md_dev *mddev);
13315- int (*stop)(int minor, struct md_dev *mddev);
13316- int (*status)(char *page, int minor, struct md_dev *mddev);
13317- int (*ioctl)(struct inode *inode, struct file *file,
13318- unsigned int cmd, unsigned long arg);
13319- int max_invalid_dev;
13320- int (*error_handler)(struct md_dev *mddev, kdev_t dev);
13321-
13322-/*
13323- * Some personalities (RAID-1, RAID-5) can get disks hot-added and
13324- * hot-removed. Hot removal is different from failure. (failure marks
13325- * a disk inactive, but the disk is still part of the array)
13326- */
13327- int (*hot_add_disk) (struct md_dev *mddev, kdev_t dev);
13328- int (*hot_remove_disk) (struct md_dev *mddev, kdev_t dev);
13329- int (*mark_spare) (struct md_dev *mddev, md_descriptor_t *descriptor, int state);
13330-};
13331-
13332-struct md_dev
13333-{
13334- struct real_dev devices[MAX_REAL];
13335- struct md_personality *pers;
13336- md_superblock_t *sb;
13337- int sb_dirty;
13338- int repartition;
13339- int busy;
13340- int nb_dev;
13341- void *private;
13342-};
13343-
13344-struct md_thread {
13345- void (*run) (void *data);
13346- void *data;
13347- struct wait_queue *wqueue;
13348- unsigned long flags;
13349- struct semaphore *sem;
13350- struct task_struct *tsk;
13351-};
13352-
13353-#define THREAD_WAKEUP 0
13354-
13355-extern struct md_dev md_dev[MAX_MD_DEV];
13356-extern int md_size[MAX_MD_DEV];
13357-extern int md_maxreadahead[MAX_MD_DEV];
13358-
13359-extern char *partition_name (kdev_t dev);
13360-
13361-extern int register_md_personality (int p_num, struct md_personality *p);
13362-extern int unregister_md_personality (int p_num);
13363-extern struct md_thread *md_register_thread (void (*run) (void *data), void *data);
13364-extern void md_unregister_thread (struct md_thread *thread);
13365-extern void md_wakeup_thread(struct md_thread *thread);
13366-extern int md_update_sb (int minor);
13367-extern int md_do_sync(struct md_dev *mddev);
13368-
13369-#endif __KERNEL__
13370-#endif _MD_H
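
As a worked example of the reserved-area arithmetic in the header removed above (stand-alone, with the relevant macros copied out):

/* Worked example of MD_NEW_SIZE_SECTORS(): round the device down to a
 * 64 kB boundary and reserve the last 64 kB for the superblock. */
#include <stdio.h>

#define MD_RESERVED_BYTES	(64 * 1024)
#define MD_RESERVED_SECTORS	(MD_RESERVED_BYTES / 512)	/* 128 */
#define MD_NEW_SIZE_SECTORS(x) \
	(((x) & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)

int main(void)
{
	unsigned long dev = 4196289;	/* odd-sized device, in sectors */

	/* 4196289 rounds down to 4196224; minus 128 leaves 4196096
	 * usable sectors, and the superblock sits at sector 4196096. */
	printf("apparent size: %lu sectors\n", MD_NEW_SIZE_SECTORS(dev));
	return 0;
}
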
13371diff -ruN linux.orig/include/linux/raid/hsm.h linux-2.2.16/include/linux/raid/hsm.h
13372--- linux.orig/include/linux/raid/hsm.h Thu Jan 1 01:00:00 1970
13373+++ linux-2.2.16/include/linux/raid/hsm.h Fri Jun 9 11:37:44 2000
13374@@ -0,0 +1,65 @@
13375+#ifndef _HSM_H
13376+#define _HSM_H
13377+
13378+#include <linux/raid/md.h>
13379+
13380+#if __alpha__
13381+#error fix cpu_addr on Alpha first
13382+#endif
13383+
13384+#include <linux/raid/hsm_p.h>
13385+
13386+#define index_pv(lv,index) ((lv)->vg->pv_array+(index)->data.phys_nr)
13387+#define index_dev(lv,index) index_pv((lv),(index))->dev
13388+#define index_block(lv,index) (index)->data.phys_block
13389+#define index_child(index) ((lv_lptr_t *)((index)->cpu_addr))
13390+
13391+#define ptr_to_cpuaddr(ptr) ((__u32) (ptr))
13392+
13393+
13394+typedef struct pv_bg_desc_s {
13395+ unsigned int free_blocks;
13396+ pv_block_group_t *bg;
13397+} pv_bg_desc_t;
13398+
13399+typedef struct pv_s pv_t;
13400+typedef struct vg_s vg_t;
13401+typedef struct lv_s lv_t;
13402+
13403+struct pv_s
13404+{
13405+ int phys_nr;
13406+ kdev_t dev;
13407+ pv_sb_t *pv_sb;
13408+ pv_bg_desc_t *bg_array;
13409+};
13410+
13411+struct lv_s
13412+{
13413+ int log_id;
13414+ vg_t *vg;
13415+
13416+ unsigned int max_indices;
13417+ unsigned int free_indices;
13418+ lv_lptr_t root_index;
13419+
13420+ kdev_t dev;
13421+};
13422+
13423+struct vg_s
13424+{
13425+ int nr_pv;
13426+ pv_t pv_array [MD_SB_DISKS];
13427+
13428+ int nr_lv;
13429+ lv_t lv_array [HSM_MAX_LVS_PER_VG];
13430+
13431+ vg_sb_t *vg_sb;
13432+ mddev_t *mddev;
13433+};
13434+
13435+#define kdev_to_lv(dev) ((lv_t *) mddev_map[MINOR(dev)].data)
13436+#define mddev_to_vg(mddev) ((vg_t *) mddev->private)
13437+
13438+#endif
13439+
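
The index macros above chain an lv_lptr_t through cpu_addr to its in-core child block, and through phys_nr/phys_block to a PV and device. Purely as illustration, since HSM is explicitly marked unusable in this patch, a sketch of descending to the first leaf (assuming every visited index block is already in core):

/* Illustrative walk of an LV index tree using the macros above. */
static kdev_t first_leaf_device(lv_t *lv)
{
	lv_lptr_t *index = &lv->root_index;

	while (index_present(index))		/* cpu_addr -> child block */
		index = &index_child(index)[0];	/* first entry of child */
	return index_dev(lv, index);		/* PV holding the data block */
}
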
13440diff -ruN linux.orig/include/linux/raid/hsm_p.h linux-2.2.16/include/linux/raid/hsm_p.h
13441--- linux.orig/include/linux/raid/hsm_p.h Thu Jan 1 01:00:00 1970
13442+++ linux-2.2.16/include/linux/raid/hsm_p.h Fri Jun 9 11:37:44 2000
13443@@ -0,0 +1,237 @@
13444+#ifndef _HSM_P_H
13445+#define _HSM_P_H
13446+
13447+#define HSM_BLOCKSIZE 4096
13448+#define HSM_BLOCKSIZE_WORDS (HSM_BLOCKSIZE/4)
13449+#define PACKED __attribute__ ((packed))
13450+
13451+/*
13452+ * Identifies a block in physical space
13453+ */
13454+typedef struct phys_idx_s {
13455+ __u16 phys_nr;
13456+ __u32 phys_block;
13457+
13458+} PACKED phys_idx_t;
13459+
13460+/*
13461+ * Identifies a block in logical space
13462+ */
13463+typedef struct log_idx_s {
13464+ __u16 log_id;
13465+ __u32 log_index;
13466+
13467+} PACKED log_idx_t;
13468+
13469+/*
13470+ * Describes one PV
13471+ */
13472+#define HSM_PV_SB_MAGIC 0xf091ae9fU
13473+
13474+#define HSM_PV_SB_GENERIC_WORDS 32
13475+#define HSM_PV_SB_RESERVED_WORDS \
13476+ (HSM_BLOCKSIZE_WORDS - HSM_PV_SB_GENERIC_WORDS)
13477+
13478+/*
13479+ * On-disk PV identification data, on block 0 in any PV.
13480+ */
13481+typedef struct pv_sb_s
13482+{
13483+ __u32 pv_magic; /* 0 */
13484+
13485+ __u32 pv_uuid0; /* 1 */
13486+ __u32 pv_uuid1; /* 2 */
13487+ __u32 pv_uuid2; /* 3 */
13488+ __u32 pv_uuid3; /* 4 */
13489+
13490+ __u32 pv_major; /* 5 */
13491+ __u32 pv_minor; /* 6 */
13492+ __u32 pv_patch; /* 7 */
13493+
13494+ __u32 pv_ctime; /* 8 Creation time */
13495+
13496+ __u32 pv_total_size; /* 9 size of this PV, in blocks */
13497+ __u32 pv_first_free; /* 10 first free block */
13498+ __u32 pv_first_used; /* 11 first used block */
13499+ __u32 pv_blocks_left; /* 12 unallocated blocks */
13500+ __u32 pv_bg_size; /* 13 size of a block group, in blocks */
13501+ __u32 pv_block_size; /* 14 size of blocks, in bytes */
13502+ __u32 pv_pptr_size; /* 15 size of block descriptor, in bytes */
13503+ __u32 pv_block_groups; /* 16 number of block groups */
13504+
13505+ __u32 __reserved1[HSM_PV_SB_GENERIC_WORDS - 17];
13506+
13507+ /*
13508+ * Reserved
13509+ */
13510+ __u32 __reserved2[HSM_PV_SB_RESERVED_WORDS];
13511+
13512+} PACKED pv_sb_t;
13513+
13514+/*
13515+ * this is pretty much arbitrary, but has to be less than ~64
13516+ */
13517+#define HSM_MAX_LVS_PER_VG 32
13518+
13519+#define HSM_VG_SB_GENERIC_WORDS 32
13520+
13521+#define LV_DESCRIPTOR_WORDS 8
13522+#define HSM_VG_SB_RESERVED_WORDS (HSM_BLOCKSIZE_WORDS - \
13523+ LV_DESCRIPTOR_WORDS*HSM_MAX_LVS_PER_VG - HSM_VG_SB_GENERIC_WORDS)
13524+
13525+#if (HSM_VG_SB_RESERVED_WORDS < 0)
13526+#error you messed this one up dude ...
13527+#endif
13528+
13529+typedef struct lv_descriptor_s
13530+{
13531+ __u32 lv_id; /* 0 */
13532+ phys_idx_t lv_root_idx; /* 1 */
13533+ __u16 __reserved; /* 2 */
13534+ __u32 lv_max_indices; /* 3 */
13535+ __u32 lv_free_indices; /* 4 */
13536+ __u32 md_id; /* 5 */
13537+
13538+ __u32 reserved[LV_DESCRIPTOR_WORDS - 6];
13539+
13540+} PACKED lv_descriptor_t;
13541+
13542+#define HSM_VG_SB_MAGIC 0x98320d7aU
13543+/*
13544+ * On-disk VG identification data, in block 1 on all PVs
13545+ */
13546+typedef struct vg_sb_s
13547+{
13548+ __u32 vg_magic; /* 0 */
13549+ __u32 nr_lvs; /* 1 */
13550+
13551+ __u32 __reserved1[HSM_VG_SB_GENERIC_WORDS - 2];
13552+
13553+ lv_descriptor_t lv_array [HSM_MAX_LVS_PER_VG];
13554+ /*
13555+ * Reserved
13556+ */
13557+ __u32 __reserved2[HSM_VG_SB_RESERVED_WORDS];
13558+
13559+} PACKED vg_sb_t;
13560+
13561+/*
13562+ * Describes one LV
13563+ */
13564+
13565+#define HSM_LV_SB_MAGIC 0xe182bd8aU
13566+
13567+/* do we need lv_sb_t? */
13568+
13569+typedef struct lv_sb_s
13570+{
13571+ /*
13572+ * On-disk LV identifier
13573+ */
13574+ __u32 lv_magic; /* 0 LV identifier */
13575+ __u32 lv_uuid0; /* 1 */
13576+ __u32 lv_uuid1; /* 2 */
13577+ __u32 lv_uuid2; /* 3 */
13578+ __u32 lv_uuid3; /* 4 */
13579+
13580+ __u32 lv_major; /* 5 PV identifier */
13581+ __u32 lv_minor; /* 6 PV identifier */
13582+ __u32 lv_patch; /* 7 PV identifier */
13583+
13584+ __u32 ctime; /* 8 Creation time */
13585+ __u32 size; /* 9 size of this LV, in blocks */
13586+ phys_idx_t start; /* 10 position of root index block */
13587+ log_idx_t first_free; /* 11-12 first free index */
13588+
13589+ /*
13590+ * Reserved
13591+ */
13592+ __u32 reserved[HSM_BLOCKSIZE_WORDS-13];
13593+
13594+} PACKED lv_sb_t;
13595+
13596+/*
13597+ * Pointer pointing from the physical space, points to
13598+ * the LV owning this block. It also contains various
13599+ * statistics about the physical block.
13600+ */
13601+typedef struct pv_pptr_s
13602+{
13603+ union {
13604+ /* case 1 */
13605+ struct {
13606+ log_idx_t owner;
13607+ log_idx_t predicted;
13608+ __u32 last_referenced;
13609+ } used;
13610+ /* case 2 */
13611+ struct {
13612+ __u16 log_id;
13613+ __u16 __unused1;
13614+ __u32 next_free;
13615+ __u32 __unused2;
13616+ __u32 __unused3;
13617+ } free;
13618+ } u;
13619+} PACKED pv_pptr_t;
13620+
13621+static __inline__ int pv_pptr_free (const pv_pptr_t * pptr)
13622+{
13623+ return !pptr->u.free.log_id;
13624+}
13625+
13626+
13627+#define DATA_BLOCKS_PER_BG ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1))
13628+
13629+#define TOTAL_BLOCKS_PER_BG (DATA_BLOCKS_PER_BG+1)
13630+/*
13631+ * A table of pointers filling up a single block, managing
13632+ * the next DATA_BLOCKS_PER_BG physical blocks. Such block
13633+ * groups form the physical space of blocks.
13634+ */
13635+typedef struct pv_block_group_s
13636+{
13637+ __u8 used_bitmap[(DATA_BLOCKS_PER_BG+7)/8];
13638+
13639+ pv_pptr_t blocks[DATA_BLOCKS_PER_BG];
13640+
13641+} PACKED pv_block_group_t;
13642+
13643+/*
13644+ * Pointer from the logical space, points to
13645+ * the (PV,block) containing this logical block
13646+ */
13647+typedef struct lv_lptr_s
13648+{
13649+ phys_idx_t data;
13650+ __u16 __reserved;
13651+ __u32 cpu_addr;
13652+ __u32 __reserved2;
13653+
13654+} PACKED lv_lptr_t;
13655+
13656+static __inline__ int index_free (const lv_lptr_t * index)
13657+{
13658+ return !index->data.phys_block;
13659+}
13660+
13661+static __inline__ int index_present (const lv_lptr_t * index)
13662+{
13663+ return index->cpu_addr;
13664+}
13665+
13666+
13667+#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t))
13668+/*
13669+ * A table of pointers filling up a single block, managing
13670+ * HSM_LPTRS_PER_BLOCK logical blocks. Such block groups form
13671+ * the logical space of blocks.
13672+ */
13673+typedef struct lv_index_block_s
13674+{
13675+ lv_lptr_t blocks[HSM_LPTRS_PER_BLOCK];
13676+
13677+} PACKED lv_index_block_t;
13678+
13679+#endif
13680+
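
A quick consistency check of the block-group sizing above: with 4 kB blocks and 16-byte packed pv_pptr_t entries (two 6-byte log_idx_t fields plus a __u32), DATA_BLOCKS_PER_BG = 32768/129 = 254, and the used bitmap plus pointer table fill one block exactly. A stand-alone sketch (PPTR_SIZE is written out by hand here instead of using sizeof):

/* Sanity check of the pv_block_group_t layout arithmetic. */
#include <assert.h>

#define HSM_BLOCKSIZE	4096
#define PPTR_SIZE	16	/* sizeof(pv_pptr_t), packed */
#define DATA_BLOCKS_PER_BG ((HSM_BLOCKSIZE*8)/(8*PPTR_SIZE+1))

int main(void)
{
	assert(DATA_BLOCKS_PER_BG == 254);
	/* 32-byte bitmap + 254 * 16-byte pointers == one 4 kB block */
	assert((DATA_BLOCKS_PER_BG + 7) / 8
	       + DATA_BLOCKS_PER_BG * PPTR_SIZE == HSM_BLOCKSIZE);
	return 0;
}
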
13681diff -ruN linux.orig/include/linux/raid/linear.h linux-2.2.16/include/linux/raid/linear.h
13682--- linux.orig/include/linux/raid/linear.h Thu Jan 1 01:00:00 1970
13683+++ linux-2.2.16/include/linux/raid/linear.h Fri Jun 9 11:37:44 2000
13684@@ -0,0 +1,32 @@
13685+#ifndef _LINEAR_H
13686+#define _LINEAR_H
13687+
13688+#include <linux/raid/md.h>
13689+
13690+struct dev_info {
13691+ kdev_t dev;
13692+ int size;
13693+ unsigned int offset;
13694+};
13695+
13696+typedef struct dev_info dev_info_t;
13697+
13698+struct linear_hash
13699+{
13700+ dev_info_t *dev0, *dev1;
13701+};
13702+
13703+struct linear_private_data
13704+{
13705+ struct linear_hash *hash_table;
13706+ dev_info_t disks[MD_SB_DISKS];
13707+ dev_info_t *smallest;
13708+ int nr_zones;
13709+};
13710+
13711+
13712+typedef struct linear_private_data linear_conf_t;
13713+
13714+#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
13715+
13716+#endif
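
Each personality keeps its state behind the opaque mddev->private pointer and recovers it with its own mddev_to_conf() cast. A hedged sketch of a consumer; the function is hypothetical, but its signature matches the status() hook that md_k.h declares later in this patch:

static int linear_status_sketch(char *page, mddev_t *mddev)
{
	linear_conf_t *conf = mddev_to_conf(mddev);	/* cast of mddev->private */

	return sprintf(page, " %d zones", conf->nr_zones);
}
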
13717diff -ruN linux.orig/include/linux/raid/md.h linux-2.2.16/include/linux/raid/md.h
13718--- linux.orig/include/linux/raid/md.h Thu Jan 1 01:00:00 1970
13719+++ linux-2.2.16/include/linux/raid/md.h Fri Jun 9 11:37:44 2000
13720@@ -0,0 +1,96 @@
13721+/*
13722+ md.h : Multiple Devices driver for Linux
13723+ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
13724+ Copyright (C) 1994-96 Marc ZYNGIER
13725+ <zyngier@ufr-info-p7.ibp.fr> or
13726+ <maz@gloups.fdn.fr>
13727+
13728+ This program is free software; you can redistribute it and/or modify
13729+ it under the terms of the GNU General Public License as published by
13730+ the Free Software Foundation; either version 2, or (at your option)
13731+ any later version.
13732+
13733+ You should have received a copy of the GNU General Public License
13734+ (for example /usr/src/linux/COPYING); if not, write to the Free
13735+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13736+*/
13737+
13738+#ifndef _MD_H
13739+#define _MD_H
13740+
13741+#include <linux/mm.h>
13742+#include <linux/fs.h>
13743+#include <linux/blkdev.h>
13744+#include <asm/semaphore.h>
13745+#include <linux/major.h>
13746+#include <linux/ioctl.h>
13747+#include <linux/types.h>
13748+#include <asm/bitops.h>
13749+#include <linux/module.h>
13750+#include <linux/mm.h>
13751+#include <linux/hdreg.h>
13752+#include <linux/sysctl.h>
13753+#include <linux/fs.h>
13754+#include <linux/proc_fs.h>
13755+#include <linux/smp_lock.h>
13756+#include <linux/delay.h>
13757+#include <net/checksum.h>
13758+#include <linux/random.h>
13759+#include <linux/locks.h>
13760+#include <asm/io.h>
13761+
13762+#include <linux/raid/md_compatible.h>
13763+/*
13764+ * 'md_p.h' holds the 'physical' layout of RAID devices
13765+ * 'md_u.h' holds the user <=> kernel API
13766+ *
13767+ * 'md_k.h' holds kernel internal definitions
13768+ */
13769+
13770+#include <linux/raid/md_p.h>
13771+#include <linux/raid/md_u.h>
13772+#include <linux/raid/md_k.h>
13773+
13774+/*
13775+ * Different major versions are not compatible.
13776+ * Different minor versions are only downward compatible.
13777+ * Different patchlevel versions are downward and upward compatible.
13778+ */
13779+#define MD_MAJOR_VERSION 0
13780+#define MD_MINOR_VERSION 90
13781+#define MD_PATCHLEVEL_VERSION 0
13782+
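
A sketch of the compatibility rule stated above, applied to an on-disk superblock (mdp_super_t is defined in md_p.h below); this illustrative helper is not the driver's actual check:

static int sb_version_ok_sketch(mdp_super_t *sb)
{
	if (sb->major_version != MD_MAJOR_VERSION)
		return 0;	/* different major versions are incompatible */
	if (sb->minor_version > MD_MINOR_VERSION)
		return 0;	/* minor versions are only downward compatible */
	return 1;		/* patchlevel differences never matter */
}
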
13783+extern int md_size[MAX_MD_DEVS];
13784+extern struct hd_struct md_hd_struct[MAX_MD_DEVS];
13785+
13786+extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
13787+extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev);
13788+extern char * partition_name (kdev_t dev);
13789+extern int register_md_personality (int p_num, mdk_personality_t *p);
13790+extern int unregister_md_personality (int p_num);
13791+extern mdk_thread_t * md_register_thread (void (*run) (void *data),
13792+ void *data, const char *name);
13793+extern void md_unregister_thread (mdk_thread_t *thread);
13794+extern void md_wakeup_thread(mdk_thread_t *thread);
13795+extern void md_interrupt_thread (mdk_thread_t *thread);
13796+extern int md_update_sb (mddev_t *mddev);
13797+extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
13798+extern void md_recover_arrays (void);
13799+extern int md_check_ordering (mddev_t *mddev);
13800+extern void autodetect_raid(void);
13801+extern struct gendisk * find_gendisk (kdev_t dev);
13802+extern int md_notify_reboot(struct notifier_block *this,
13803+ unsigned long code, void *x);
13804+#if CONFIG_BLK_DEV_MD
13805+extern void raid_setup(char *str,int *ints) md__init;
13806+#endif
13807+#ifdef CONFIG_MD_BOOT
13808+extern void md_setup(char *str,int *ints) md__init;
13809+#endif
13810+
13811+extern void md_print_devices (void);
13812+
13813+#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
13814+
13815+#endif /* _MD_H */
13816+
13817diff -ruN linux.orig/include/linux/raid/md_compatible.h linux-2.2.16/include/linux/raid/md_compatible.h
13818--- linux.orig/include/linux/raid/md_compatible.h Thu Jan 1 01:00:00 1970
13819+++ linux-2.2.16/include/linux/raid/md_compatible.h Fri Jun 9 11:37:44 2000
13820@@ -0,0 +1,387 @@
13821+
13822+/*
13823+ md.h : Multiple Devices driver compatibility layer for Linux 2.0/2.2
13824+ Copyright (C) 1998 Ingo Molnar
13825+
13826+ This program is free software; you can redistribute it and/or modify
13827+ it under the terms of the GNU General Public License as published by
13828+ the Free Software Foundation; either version 2, or (at your option)
13829+ any later version.
13830+
13831+ You should have received a copy of the GNU General Public License
13832+ (for example /usr/src/linux/COPYING); if not, write to the Free
13833+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13834+*/
13835+
13836+#include <linux/version.h>
13837+
13838+#ifndef _MD_COMPATIBLE_H
13839+#define _MD_COMPATIBLE_H
13840+
13841+#define LinuxVersionCode(v, p, s) (((v)<<16)+((p)<<8)+(s))
13842+
13843+#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
13844+
13845+/* 000 */
13846+#define md__get_free_pages(x,y) __get_free_pages(x,y,GFP_KERNEL)
13847+
13848+#ifdef __i386__
13849+/* 001 */
13850+extern __inline__ int md_cpu_has_mmx(void)
13851+{
13852+ return x86_capability & 0x00800000;
13853+}
13854+#endif
13855+
13856+/* 002 */
13857+#define md_clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
13858+
13859+/* 003 */
13860+/*
13861+ * someone please suggest a sane compatibility layer for modules
13862+ */
13863+#define MD_EXPORT_SYMBOL(x)
13864+
13865+/* 004 */
13866+static inline unsigned long
13867+md_copy_from_user(void *to, const void *from, unsigned long n)
13868+{
13869+ int err;
13870+
13871+ err = verify_area(VERIFY_READ,from,n);
13872+ if (!err)
13873+ memcpy_fromfs(to, from, n);
13874+ return err;
13875+}
13876+
13877+/* 005 */
13878+extern inline unsigned long
13879+md_copy_to_user(void *to, const void *from, unsigned long n)
13880+{
13881+ int err;
13882+
13883+ err = verify_area(VERIFY_WRITE,to,n);
13884+ if (!err)
13885+ memcpy_tofs(to, from, n);
13886+ return err;
13887+}
13888+
13889+/* 006 */
13890+#define md_put_user(x,ptr) \
13891+({ \
13892+ int __err; \
13893+ \
13894+ __err = verify_area(VERIFY_WRITE,ptr,sizeof(*ptr)); \
13895+ if (!__err) \
13896+ put_user(x,ptr); \
13897+ __err; \
13898+})
13899+
13900+/* 007 */
13901+extern inline int md_capable_admin(void)
13902+{
13903+ return suser();
13904+}
13905+
13906+/* 008 */
13907+#define MD_FILE_TO_INODE(file) ((file)->f_inode)
13908+
13909+/* 009 */
13910+extern inline void md_flush_signals (void)
13911+{
13912+ current->signal = 0;
13913+}
13914+
13915+/* 010 */
13916+#define __S(nr) (1<<((nr)-1))
13917+extern inline void md_init_signals (void)
13918+{
13919+ current->exit_signal = SIGCHLD;
13920+ current->blocked = ~(__S(SIGKILL));
13921+}
13922+#undef __S
13923+
13924+/* 011 */
13925+extern inline unsigned long md_signal_pending (struct task_struct * tsk)
13926+{
13927+ return (tsk->signal & ~tsk->blocked);
13928+}
13929+
13930+/* 012 */
13931+#define md_set_global_readahead(x) read_ahead[MD_MAJOR] = MD_READAHEAD
13932+
13933+/* 013 */
13934+#define md_mdelay(n) (\
13935+ {unsigned long msec=(n); while (msec--) udelay(1000);})
13936+
13937+/* 014 */
13938+#define MD_SYS_DOWN 0
13939+#define MD_SYS_HALT 0
13940+#define MD_SYS_POWER_OFF 0
13941+
13942+/* 015 */
13943+#define md_register_reboot_notifier(x)
13944+
13945+/* 016 */
13946+extern __inline__ unsigned long
13947+md_test_and_set_bit(int nr, void * addr)
13948+{
13949+ unsigned long flags;
13950+ unsigned long oldbit;
13951+
13952+ save_flags(flags);
13953+ cli();
13954+ oldbit = test_bit(nr,addr);
13955+ set_bit(nr,addr);
13956+ restore_flags(flags);
13957+ return oldbit;
13958+}
13959+
13960+/* 017 */
13961+extern __inline__ unsigned long
13962+md_test_and_clear_bit(int nr, void * addr)
13963+{
13964+ unsigned long flags;
13965+ unsigned long oldbit;
13966+
13967+ save_flags(flags);
13968+ cli();
13969+ oldbit = test_bit(nr,addr);
13970+ clear_bit(nr,addr);
13971+ restore_flags(flags);
13972+ return oldbit;
13973+}
13974+
13975+/* 018 */
13976+#define md_atomic_read(x) (*(volatile int *)(x))
13977+#define md_atomic_set(x,y) (*(volatile int *)(x) = (y))
13978+
13979+/* 019 */
13980+extern __inline__ void md_lock_kernel (void)
13981+{
13982+#if __SMP__
13983+ lock_kernel();
13984+ syscall_count++;
13985+#endif
13986+}
13987+
13988+extern __inline__ void md_unlock_kernel (void)
13989+{
13990+#if __SMP__
13991+ syscall_count--;
13992+ unlock_kernel();
13993+#endif
13994+}
13995+/* 020 */
13996+
13997+#define md__init
13998+#define md__initdata
13999+#define md__initfunc(__arginit) __arginit
14000+
14001+/* 021 */
14002+
14003+/* 022 */
14004+
14005+struct md_list_head {
14006+ struct md_list_head *next, *prev;
14007+};
14008+
14009+#define MD_LIST_HEAD(name) \
14010+ struct md_list_head name = { &name, &name }
14011+
14012+#define MD_INIT_LIST_HEAD(ptr) do { \
14013+ (ptr)->next = (ptr); (ptr)->prev = (ptr); \
14014+} while (0)
14015+
14016+static __inline__ void md__list_add(struct md_list_head * new,
14017+ struct md_list_head * prev,
14018+ struct md_list_head * next)
14019+{
14020+ next->prev = new;
14021+ new->next = next;
14022+ new->prev = prev;
14023+ prev->next = new;
14024+}
14025+
14026+static __inline__ void md_list_add(struct md_list_head *new,
14027+ struct md_list_head *head)
14028+{
14029+ md__list_add(new, head, head->next);
14030+}
14031+
14032+static __inline__ void md__list_del(struct md_list_head * prev,
14033+ struct md_list_head * next)
14034+{
14035+ next->prev = prev;
14036+ prev->next = next;
14037+}
14038+
14039+static __inline__ void md_list_del(struct md_list_head *entry)
14040+{
14041+ md__list_del(entry->prev, entry->next);
14042+}
14043+
14044+static __inline__ int md_list_empty(struct md_list_head *head)
14045+{
14046+ return head->next == head;
14047+}
14048+
14049+#define md_list_entry(ptr, type, member) \
14050+ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
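
md_list_entry() is the classic container-of computation: subtract the member's offset within the enclosing type from the address of the embedded list head. For instance, with the same_set member of mdk_rdev_s from md_k.h (hypothetical helper):

static mdk_rdev_t *rdev_of_sketch(struct md_list_head *entry)
{
	/* (char *)entry minus the offset of same_set within mdk_rdev_t */
	return md_list_entry(entry, mdk_rdev_t, same_set);
}
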
14051+
14052+/* 023 */
14053+
14054+static __inline__ signed long md_schedule_timeout(signed long timeout)
14055+{
14056+ current->timeout = jiffies + timeout;
14057+ schedule();
14058+ return 0;
14059+}
14060+
14061+/* 024 */
14062+#define md_need_resched(tsk) (need_resched)
14063+
14064+/* 025 */
14065+typedef struct { int gcc_is_buggy; } md_spinlock_t;
14066+#define MD_SPIN_LOCK_UNLOCKED (md_spinlock_t) { 0 }
14067+
14068+#define md_spin_lock_irq cli
14069+#define md_spin_unlock_irq sti
14070+#define md_spin_unlock_irqrestore(x,flags) restore_flags(flags)
14071+#define md_spin_lock_irqsave(x,flags) do { save_flags(flags); cli(); } while (0)
14072+
14073+/* END */
14074+
14075+#else
14076+
14077+#include <linux/reboot.h>
14078+#include <linux/vmalloc.h>
14079+
14080+/* 000 */
14081+#define md__get_free_pages(x,y) __get_free_pages(x,y)
14082+
14083+#ifdef __i386__
14084+/* 001 */
14085+extern __inline__ int md_cpu_has_mmx(void)
14086+{
14087+ return boot_cpu_data.x86_capability & X86_FEATURE_MMX;
14088+}
14089+#endif
14090+
14091+/* 002 */
14092+#define md_clear_page(page) clear_page(page)
14093+
14094+/* 003 */
14095+#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x)
14096+
14097+/* 004 */
14098+#define md_copy_to_user(x,y,z) copy_to_user(x,y,z)
14099+
14100+/* 005 */
14101+#define md_copy_from_user(x,y,z) copy_from_user(x,y,z)
14102+
14103+/* 006 */
14104+#define md_put_user put_user
14105+
14106+/* 007 */
14107+extern inline int md_capable_admin(void)
14108+{
14109+ return capable(CAP_SYS_ADMIN);
14110+}
14111+
14112+/* 008 */
14113+#define MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode)
14114+
14115+/* 009 */
14116+extern inline void md_flush_signals (void)
14117+{
14118+ spin_lock(&current->sigmask_lock);
14119+ flush_signals(current);
14120+ spin_unlock(&current->sigmask_lock);
14121+}
14122+
14123+/* 010 */
14124+extern inline void md_init_signals (void)
14125+{
14126+ current->exit_signal = SIGCHLD;
14127+ siginitsetinv(&current->blocked, sigmask(SIGKILL));
14128+}
14129+
14130+/* 011 */
14131+#define md_signal_pending signal_pending
14132+
14133+/* 012 */
14134+extern inline void md_set_global_readahead(int * table)
14135+{
14136+ max_readahead[MD_MAJOR] = table;
14137+}
14138+
14139+/* 013 */
14140+#define md_mdelay(x) mdelay(x)
14141+
14142+/* 014 */
14143+#define MD_SYS_DOWN SYS_DOWN
14144+#define MD_SYS_HALT SYS_HALT
14145+#define MD_SYS_POWER_OFF SYS_POWER_OFF
14146+
14147+/* 015 */
14148+#define md_register_reboot_notifier register_reboot_notifier
14149+
14150+/* 016 */
14151+#define md_test_and_set_bit test_and_set_bit
14152+
14153+/* 017 */
14154+#define md_test_and_clear_bit test_and_clear_bit
14155+
14156+/* 018 */
14157+#define md_atomic_read atomic_read
14158+#define md_atomic_set atomic_set
14159+
14160+/* 019 */
14161+#define md_lock_kernel lock_kernel
14162+#define md_unlock_kernel unlock_kernel
14163+
14164+/* 020 */
14165+
14166+#include <linux/init.h>
14167+
14168+#define md__init __init
14169+#define md__initdata __initdata
14170+#define md__initfunc(__arginit) __initfunc(__arginit)
14171+
14172+/* 021 */
14173+
14174+
14175+/* 022 */
14176+
14177+#define md_list_head list_head
14178+#define MD_LIST_HEAD(name) LIST_HEAD(name)
14179+#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr)
14180+#define md_list_add list_add
14181+#define md_list_del list_del
14182+#define md_list_empty list_empty
14183+
14184+#define md_list_entry(ptr, type, member) list_entry(ptr, type, member)
14185+
14186+/* 023 */
14187+
14188+#define md_schedule_timeout schedule_timeout
14189+
14190+/* 024 */
14191+#define md_need_resched(tsk) ((tsk)->need_resched)
14192+
14193+/* 025 */
14194+#define md_spinlock_t spinlock_t
14195+#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED
14196+
14197+#define md_spin_lock_irq spin_lock_irq
14198+#define md_spin_unlock_irq spin_unlock_irq
14199+#define md_spin_unlock_irqrestore spin_unlock_irqrestore
14200+#define md_spin_lock_irqsave spin_lock_irqsave
14201+
14202+/* END */
14203+
14204+#endif
14205+
14206+#endif /* _MD_COMPATIBLE_H */
14207+
14208diff -ruN linux.orig/include/linux/raid/md_k.h linux-2.2.16/include/linux/raid/md_k.h
14209--- linux.orig/include/linux/raid/md_k.h Thu Jan 1 01:00:00 1970
14210+++ linux-2.2.16/include/linux/raid/md_k.h Fri Jun 9 11:37:44 2000
14211@@ -0,0 +1,338 @@
14212+/*
14213+ md_k.h : kernel internal structure of the Linux MD driver
14214+ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
14215+
14216+ This program is free software; you can redistribute it and/or modify
14217+ it under the terms of the GNU General Public License as published by
14218+ the Free Software Foundation; either version 2, or (at your option)
14219+ any later version.
14220+
14221+ You should have received a copy of the GNU General Public License
14222+ (for example /usr/src/linux/COPYING); if not, write to the Free
14223+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
14224+*/
14225+
14226+#ifndef _MD_K_H
14227+#define _MD_K_H
14228+
14229+#define MD_RESERVED 0UL
14230+#define LINEAR 1UL
14231+#define STRIPED 2UL
14232+#define RAID0 STRIPED
14233+#define RAID1 3UL
14234+#define RAID5 4UL
14235+#define TRANSLUCENT 5UL
14236+#define HSM 6UL
14237+#define MAX_PERSONALITY 7UL
14238+
14239+extern inline int pers_to_level (int pers)
14240+{
14241+ switch (pers) {
14242+ case HSM: return -3;
14243+ case TRANSLUCENT: return -2;
14244+ case LINEAR: return -1;
14245+ case RAID0: return 0;
14246+ case RAID1: return 1;
14247+ case RAID5: return 5;
14248+ }
14249+ panic("pers_to_level()");
14250+}
14251+
14252+extern inline int level_to_pers (int level)
14253+{
14254+ switch (level) {
14255+ case -3: return HSM;
14256+ case -2: return TRANSLUCENT;
14257+ case -1: return LINEAR;
14258+ case 0: return RAID0;
14259+ case 1: return RAID1;
14260+ case 4:
14261+ case 5: return RAID5;
14262+ }
14263+ return MD_RESERVED;
14264+}
14265+
14266+typedef struct mddev_s mddev_t;
14267+typedef struct mdk_rdev_s mdk_rdev_t;
14268+
14269+#if (MINORBITS != 8)
14270+#error MD does not handle bigger kdev yet
14271+#endif
14272+
14273+#define MAX_REAL 12 /* Max number of disks per md dev */
14274+#define MAX_MD_DEVS (1<<MINORBITS) /* Max number of md dev */
14275+
14276+/*
14277+ * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
14278+ * the personality. (e.g. HSM uses this to identify individual LVs)
14279+ */
14280+typedef struct dev_mapping_s {
14281+ mddev_t *mddev;
14282+ void *data;
14283+} dev_mapping_t;
14284+
14285+extern dev_mapping_t mddev_map [MAX_MD_DEVS];
14286+
14287+extern inline mddev_t * kdev_to_mddev (kdev_t dev)
14288+{
14289+ return mddev_map[MINOR(dev)].mddev;
14290+}
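
The 'data' slot travels with the mddev in the same table, so a personality that multiplexes one array across several minors (as the HSM remark above suggests for LVs) can look it up the same way. A hypothetical accessor:

static void *kdev_to_pers_data_sketch(kdev_t dev)
{
	/* 'data' is opaque to the MD core; only the personality interprets it */
	return mddev_map[MINOR(dev)].data;
}
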
14291+
14292+/*
14293+ * options passed in raidrun:
14294+ */
14295+
14296+#define MAX_CHUNK_SIZE (4096*1024)
14297+
14298+/*
14299+ * default readahead
14300+ */
14301+#define MD_READAHEAD (256 * 512)
14302+
14303+extern inline int disk_faulty(mdp_disk_t * d)
14304+{
14305+ return d->state & (1 << MD_DISK_FAULTY);
14306+}
14307+
14308+extern inline int disk_active(mdp_disk_t * d)
14309+{
14310+ return d->state & (1 << MD_DISK_ACTIVE);
14311+}
14312+
14313+extern inline int disk_sync(mdp_disk_t * d)
14314+{
14315+ return d->state & (1 << MD_DISK_SYNC);
14316+}
14317+
14318+extern inline int disk_spare(mdp_disk_t * d)
14319+{
14320+ return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
14321+}
14322+
14323+extern inline int disk_removed(mdp_disk_t * d)
14324+{
14325+ return d->state & (1 << MD_DISK_REMOVED);
14326+}
14327+
14328+extern inline void mark_disk_faulty(mdp_disk_t * d)
14329+{
14330+ d->state |= (1 << MD_DISK_FAULTY);
14331+}
14332+
14333+extern inline void mark_disk_active(mdp_disk_t * d)
14334+{
14335+ d->state |= (1 << MD_DISK_ACTIVE);
14336+}
14337+
14338+extern inline void mark_disk_sync(mdp_disk_t * d)
14339+{
14340+ d->state |= (1 << MD_DISK_SYNC);
14341+}
14342+
14343+extern inline void mark_disk_spare(mdp_disk_t * d)
14344+{
14345+ d->state = 0;
14346+}
14347+
14348+extern inline void mark_disk_removed(mdp_disk_t * d)
14349+{
14350+ d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
14351+}
14352+
14353+extern inline void mark_disk_inactive(mdp_disk_t * d)
14354+{
14355+ d->state &= ~(1 << MD_DISK_ACTIVE);
14356+}
14357+
14358+extern inline void mark_disk_nonsync(mdp_disk_t * d)
14359+{
14360+ d->state &= ~(1 << MD_DISK_SYNC);
14361+}
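
Since the helpers manipulate single bits, the resulting state words are easy to tabulate; the values in the comments follow directly from the MD_DISK_* bit numbers in md_p.h:

static void disk_state_walkthrough(mdp_disk_t *d)
{
	mark_disk_spare(d);	/* state == 0, so disk_spare() is true */
	mark_disk_active(d);
	mark_disk_sync(d);	/* state == 0x6: ACTIVE (bit 1) | SYNC (bit 2) */
	mark_disk_removed(d);	/* state == 0x9: FAULTY (bit 0) | REMOVED (bit 3) */
}
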
14362+
14363+/*
14364+ * MD's 'extended' device
14365+ */
14366+struct mdk_rdev_s
14367+{
14368+ struct md_list_head same_set; /* RAID devices within the same set */
14369+ struct md_list_head all; /* all RAID devices */
14370+ struct md_list_head pending; /* undetected RAID devices */
14371+
14372+ kdev_t dev; /* Device number */
14373+ kdev_t old_dev; /* "" when it was last imported */
14374+ int size; /* Device size (in blocks) */
14375+ mddev_t *mddev; /* RAID array if running */
14376+ unsigned long last_events; /* IO event timestamp */
14377+
14378+ struct inode *inode; /* Lock inode */
14379+ struct file filp; /* Lock file */
14380+
14381+ mdp_super_t *sb;
14382+ int sb_offset;
14383+
14384+ int faulty; /* if faulty do not issue IO requests */
14385+ int desc_nr; /* descriptor index in the superblock */
14386+};
14387+
14388+
14389+/*
14390+ * disk operations in a working array:
14391+ */
14392+#define DISKOP_SPARE_INACTIVE 0
14393+#define DISKOP_SPARE_WRITE 1
14394+#define DISKOP_SPARE_ACTIVE 2
14395+#define DISKOP_HOT_REMOVE_DISK 3
14396+#define DISKOP_HOT_ADD_DISK 4
14397+
14398+typedef struct mdk_personality_s mdk_personality_t;
14399+
14400+struct mddev_s
14401+{
14402+ void *private;
14403+ mdk_personality_t *pers;
14404+ int __minor;
14405+ mdp_super_t *sb;
14406+ int nb_dev;
14407+ struct md_list_head disks;
14408+ int sb_dirty;
14409+ mdu_param_t param;
14410+ int ro;
14411+ unsigned int curr_resync;
14412+ unsigned long resync_start;
14413+ char *name;
14414+ int recovery_running;
14415+ struct semaphore reconfig_sem;
14416+ struct semaphore recovery_sem;
14417+ struct semaphore resync_sem;
14418+ struct md_list_head all_mddevs;
14419+};
14420+
14421+struct mdk_personality_s
14422+{
14423+ char *name;
14424+ int (*map)(mddev_t *mddev, kdev_t dev, kdev_t *rdev,
14425+ unsigned long *rsector, unsigned long size);
14426+ int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh);
14427+ void (*end_request)(struct buffer_head * bh, int uptodate);
14428+ int (*run)(mddev_t *mddev);
14429+ int (*stop)(mddev_t *mddev);
14430+ int (*status)(char *page, mddev_t *mddev);
14431+ int (*ioctl)(struct inode *inode, struct file *file,
14432+ unsigned int cmd, unsigned long arg);
14433+ int max_invalid_dev;
14434+ int (*error_handler)(mddev_t *mddev, kdev_t dev);
14435+
14436+/*
14437+ * Some personalities (RAID-1, RAID-5) can have disks hot-added and
14438+ * hot-removed. Hot removal is different from failure: failure marks
14439+ * a disk inactive, but the disk remains part of the array. The interface
14440+ * to such operations is the 'pers->diskop()' function, which can be NULL.
14441+ *
14442+ * The diskop function can change the pointer to the incoming
14443+ * descriptor, but must do so very carefully. (Currently only
14444+ * SPARE_ACTIVE expects such a change.)
14445+ */
14446+ int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
14447+
14448+ int (*stop_resync)(mddev_t *mddev);
14449+ int (*restart_resync)(mddev_t *mddev);
14450+};
14451+
14452+
14453+/*
14454+ * Currently we index md_array directly, based on the minor
14455+ * number. This will have to change to dynamic allocation
14456+ * once we start supporting partitioning of md devices.
14457+ */
14458+extern inline int mdidx (mddev_t * mddev)
14459+{
14460+ return mddev->__minor;
14461+}
14462+
14463+extern inline kdev_t mddev_to_kdev(mddev_t * mddev)
14464+{
14465+ return MKDEV(MD_MAJOR, mdidx(mddev));
14466+}
14467+
14468+extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev);
14469+extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
14470+
14471+/*
14472+ * iterates through some rdev ringlist. It's safe to remove the
14473+ * current 'rdev'. Don't touch 'tmp' though.
14474+ */
14475+#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \
14476+ \
14477+ for (tmp = head.next; \
14478+ rdev = md_list_entry(tmp, mdk_rdev_t, field), \
14479+ tmp = tmp->next, tmp->prev != &head \
14480+ ; )
14481+/*
14482+ * iterates through the 'same array disks' ringlist
14483+ */
14484+#define ITERATE_RDEV(mddev,rdev,tmp) \
14485+ ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
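
A typical traversal; rdev->faulty is the flag mdk_rdev_s reserves above for exactly this kind of check. Sketch only:

static int count_faulty_sketch(mddev_t *mddev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;
	int n = 0;

	ITERATE_RDEV(mddev, rdev, tmp)
		if (rdev->faulty)
			n++;
	return n;
}
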
14486+
14487+/*
14488+ * Same as above, but assumes that the devices have rdev->desc_nr numbered
14489+ * from 0 to mddev->nb_dev-1, and iterates through the rdevs in ascending order.
14490+ */
14491+#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \
14492+ for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
14493+
14494+
14495+/*
14496+ * Iterates through all 'RAID managed disks'
14497+ */
14498+#define ITERATE_RDEV_ALL(rdev,tmp) \
14499+ ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
14500+
14501+/*
14502+ * Iterates through 'pending RAID disks'
14503+ */
14504+#define ITERATE_RDEV_PENDING(rdev,tmp) \
14505+ ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
14506+
14507+/*
14508+ * iterates through all used mddevs in the system.
14509+ */
14510+#define ITERATE_MDDEV(mddev,tmp) \
14511+ \
14512+ for (tmp = all_mddevs.next; \
14513+ mddev = md_list_entry(tmp, mddev_t, all_mddevs), \
14514+ tmp = tmp->next, tmp->prev != &all_mddevs \
14515+ ; )
14516+
14517+extern inline int lock_mddev (mddev_t * mddev)
14518+{
14519+ return down_interruptible(&mddev->reconfig_sem);
14520+}
14521+
14522+extern inline void unlock_mddev (mddev_t * mddev)
14523+{
14524+ up(&mddev->reconfig_sem);
14525+}
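
lock_mddev() is interruptible, so callers must be prepared for it to fail. A sketch of the resulting critical-section shape (the function and the sb_dirty update are illustrative):

static int reconfig_sketch(mddev_t *mddev)
{
	if (lock_mddev(mddev))
		return -EINTR;		/* interrupted by a signal */
	mddev->sb_dirty = 1;		/* ... reconfigure under the semaphore ... */
	unlock_mddev(mddev);
	return 0;
}
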
14526+
14527+#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
14528+ x = y; y = __tmp; } while (0)
14529+
14530+typedef struct mdk_thread_s {
14531+ void (*run) (void *data);
14532+ void *data;
14533+ struct wait_queue *wqueue;
14534+ unsigned long flags;
14535+ struct semaphore *sem;
14536+ struct task_struct *tsk;
14537+ const char *name;
14538+} mdk_thread_t;
14539+
14540+#define THREAD_WAKEUP 0
14541+
14542+typedef struct dev_name_s {
14543+ struct md_list_head list;
14544+ kdev_t dev;
14545+ char name [MAX_DISKNAME_LEN];
14546+} dev_name_t;
14547+
14548+#endif /* _MD_K_H */
14549+
14550diff -ruN linux.orig/include/linux/raid/md_p.h linux-2.2.16/include/linux/raid/md_p.h
14551--- linux.orig/include/linux/raid/md_p.h Thu Jan 1 01:00:00 1970
14552+++ linux-2.2.16/include/linux/raid/md_p.h Fri Jun 9 11:37:44 2000
14553@@ -0,0 +1,161 @@
14554+/*
14555+ md_p.h : physical layout of Linux RAID devices
14556+ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
14557+
14558+ This program is free software; you can redistribute it and/or modify
14559+ it under the terms of the GNU General Public License as published by
14560+ the Free Software Foundation; either version 2, or (at your option)
14561+ any later version.
14562+
14563+ You should have received a copy of the GNU General Public License
14564+ (for example /usr/src/linux/COPYING); if not, write to the Free
14565+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
14566+*/
14567+
14568+#ifndef _MD_P_H
14569+#define _MD_P_H
14570+
14571+/*
14572+ * RAID superblock.
14573+ *
14574+ * The RAID superblock maintains some statistics on each RAID configuration.
14575+ * Each real device in the RAID set contains it near the end of the device.
14576+ * Some of the ideas are copied from the ext2fs implementation.
14577+ *
14578+ * We currently use 4096 bytes as follows:
14579+ *
14580+ * word offset function
14581+ *
14582+ * 0 - 31 Constant generic RAID device information.
14583+ * 32 - 63 Generic state information.
14584+ * 64 - 127 Personality specific information.
14585+ * 128 - 511 12 32-word descriptors of the disks in the raid set.
14586+ * 512 - 991 Reserved.
14587+ * 992 - 1023 Disk specific descriptor.
14588+ */
14589+
14590+/*
14591+ * If x is the real device size in bytes, we return an apparent size of:
14592+ *
14593+ * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
14594+ *
14595+ * and place the 4kB superblock at offset y.
14596+ */
14597+#define MD_RESERVED_BYTES (64 * 1024)
14598+#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
14599+#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
14600+
14601+#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
14602+#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
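
Worked through in sectors: MD_RESERVED_SECTORS is 128, so a 2000-sector device is rounded down to a 64kB boundary (1920 sectors) and then loses one reserved area, leaving an apparent size of 1792 sectors; the 8-sector superblock (MD_SB_SECTORS) begins right there. A standalone check of the arithmetic:

#include <stdio.h>

#define MD_RESERVED_SECTORS	128
#define MD_NEW_SIZE_SECTORS(x)	((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)

int main(void)
{
	unsigned long dev_sectors = 2000;

	/* (2000 & ~127) - 128 == 1920 - 128 == 1792 */
	printf("apparent size: %lu sectors\n", MD_NEW_SIZE_SECTORS(dev_sectors));
	return 0;
}
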
14603+
14604+#define MD_SB_BYTES 4096
14605+#define MD_SB_WORDS (MD_SB_BYTES / 4)
14606+#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
14607+#define MD_SB_SECTORS (MD_SB_BYTES / 512)
14608+
14609+/*
14610+ * The following are counted in 32-bit words
14611+ */
14612+#define MD_SB_GENERIC_OFFSET 0
14613+#define MD_SB_PERSONALITY_OFFSET 64
14614+#define MD_SB_DISKS_OFFSET 128
14615+#define MD_SB_DESCRIPTOR_OFFSET 992
14616+
14617+#define MD_SB_GENERIC_CONSTANT_WORDS 32
14618+#define MD_SB_GENERIC_STATE_WORDS 32
14619+#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
14620+#define MD_SB_PERSONALITY_WORDS 64
14621+#define MD_SB_DISKS_WORDS 384
14622+#define MD_SB_DESCRIPTOR_WORDS 32
14623+#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
14624+#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
14625+#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
14626+
14627+/*
14628+ * Device "operational" state bits
14629+ */
14630+#define MD_DISK_FAULTY 0 /* disk is faulty / operational */
14631+#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */
14632+#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
14633+#define MD_DISK_REMOVED 3 /* disk has been removed from the raid set */
14634+
14635+typedef struct mdp_device_descriptor_s {
14636+ __u32 number; /* 0 Device number in the entire set */
14637+ __u32 major; /* 1 Device major number */
14638+ __u32 minor; /* 2 Device minor number */
14639+ __u32 raid_disk; /* 3 The role of the device in the raid set */
14640+ __u32 state; /* 4 Operational state */
14641+ __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
14642+} mdp_disk_t;
14643+
14644+#define MD_SB_MAGIC 0xa92b4efc
14645+
14646+/*
14647+ * Superblock state bits
14648+ */
14649+#define MD_SB_CLEAN 0
14650+#define MD_SB_ERRORS 1
14651+
14652+typedef struct mdp_superblock_s {
14653+ /*
14654+ * Constant generic information
14655+ */
14656+ __u32 md_magic; /* 0 MD identifier */
14657+ __u32 major_version; /* 1 major version to which the set conforms */
14658+ __u32 minor_version; /* 2 minor version ... */
14659+ __u32 patch_version; /* 3 patchlevel version ... */
14660+ __u32 gvalid_words; /* 4 Number of used words in this section */
14661+ __u32 set_uuid0; /* 5 Raid set identifier */
14662+ __u32 ctime; /* 6 Creation time */
14663+ __u32 level; /* 7 Raid personality */
14664+ __u32 size; /* 8 Apparent size of each individual disk */
14665+ __u32 nr_disks; /* 9 total disks in the raid set */
14666+ __u32 raid_disks; /* 10 disks in a fully functional raid set */
14667+ __u32 md_minor; /* 11 preferred MD minor device number */
14668+ __u32 not_persistent; /* 12 does it have a persistent superblock */
14669+ __u32 set_uuid1; /* 13 Raid set identifier #2 */
14670+ __u32 set_uuid2; /* 14 Raid set identifier #3 */
14671+ __u32 set_uuid3; /* 15 Raid set identifier #4 */
14672+ __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
14673+
14674+ /*
14675+ * Generic state information
14676+ */
14677+ __u32 utime; /* 0 Superblock update time */
14678+ __u32 state; /* 1 State bits (clean, ...) */
14679+ __u32 active_disks; /* 2 Number of currently active disks */
14680+ __u32 working_disks; /* 3 Number of working disks */
14681+ __u32 failed_disks; /* 4 Number of failed disks */
14682+ __u32 spare_disks; /* 5 Number of spare disks */
14683+ __u32 sb_csum; /* 6 checksum of the whole superblock */
14684+ __u64 events; /* 7 number of superblock updates (64-bit!) */
14685+ __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9];
14686+
14687+ /*
14688+ * Personality information
14689+ */
14690+ __u32 layout; /* 0 the array's physical layout */
14691+ __u32 chunk_size; /* 1 chunk size in bytes */
14692+ __u32 root_pv; /* 2 LV root PV */
14693+ __u32 root_block; /* 3 LV root block */
14694+ __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
14695+
14696+ /*
14697+ * Disks information
14698+ */
14699+ mdp_disk_t disks[MD_SB_DISKS];
14700+
14701+ /*
14702+ * Reserved
14703+ */
14704+ __u32 reserved[MD_SB_RESERVED_WORDS];
14705+
14706+ /*
14707+ * Active descriptor
14708+ */
14709+ mdp_disk_t this_disk;
14710+
14711+} mdp_super_t;
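
sb_csum covers the superblock that contains it, which implies the field must be excluded from its own sum. A hedged sketch of that construction; the plain byte sum is a stand-in, not the checksum routine the driver actually uses:

static unsigned int sb_csum_sketch(mdp_super_t *sb)
{
	unsigned char *p = (unsigned char *) sb;
	unsigned int saved = sb->sb_csum, csum = 0;
	int i;

	sb->sb_csum = 0;		/* keep the field out of its own sum */
	for (i = 0; i < MD_SB_BYTES; i++)
		csum += p[i];
	sb->sb_csum = saved;
	return csum;
}
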
14712+
14713+#endif /* _MD_P_H */
14714+
14715diff -ruN linux.orig/include/linux/raid/md_u.h linux-2.2.16/include/linux/raid/md_u.h
14716--- linux.orig/include/linux/raid/md_u.h Thu Jan 1 01:00:00 1970
14717+++ linux-2.2.16/include/linux/raid/md_u.h Fri Jun 9 11:37:44 2000
14718@@ -0,0 +1,115 @@
14719+/*
14720+ md_u.h : user <=> kernel API between Linux raidtools and RAID drivers
14721+ Copyright (C) 1998 Ingo Molnar
14722+
14723+ This program is free software; you can redistribute it and/or modify
14724+ it under the terms of the GNU General Public License as published by
14725+ the Free Software Foundation; either version 2, or (at your option)
14726+ any later version.
14727+
14728+ You should have received a copy of the GNU General Public License
14729+ (for example /usr/src/linux/COPYING); if not, write to the Free
14730+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
14731+*/
14732+
14733+#ifndef _MD_U_H
14734+#define _MD_U_H
14735+
14736+/* ioctls */
14737+
14738+/* status */
14739+#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t)
14740+#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t)
14741+#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t)
14742+#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13)
14743+
14744+/* configuration */
14745+#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20)
14746+#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t)
14747+#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22)
14748+#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t)
14749+#define SET_DISK_INFO _IO (MD_MAJOR, 0x24)
14750+#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25)
14751+#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26)
14752+#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27)
14753+#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28)
14754+#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29)
14755+
14756+/* usage */
14757+#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t)
14758+#define START_ARRAY _IO (MD_MAJOR, 0x31)
14759+#define STOP_ARRAY _IO (MD_MAJOR, 0x32)
14760+#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33)
14761+#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34)
14762+
14763+typedef struct mdu_version_s {
14764+ int major;
14765+ int minor;
14766+ int patchlevel;
14767+} mdu_version_t;
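
From userland these are ordinary ioctls on an open array device, which is how raidtools drives the kernel. A hedged sketch, assuming the md_u.h definitions are in scope and using an illustrative device path:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

static int md_version_sketch(void)
{
	mdu_version_t ver;
	int fd = open("/dev/md0", O_RDONLY);	/* illustrative path */

	if (fd < 0)
		return -1;
	if (ioctl(fd, RAID_VERSION, &ver) < 0)
		ver.major = -1;
	close(fd);
	return ver.major;	/* expect 0 (version 0.90.0) from this driver */
}
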
14768+
14769+typedef struct mdu_array_info_s {
14770+ /*
14771+ * Generic constant information
14772+ */
14773+ int major_version;
14774+ int minor_version;
14775+ int patch_version;
14776+ int ctime;
14777+ int level;
14778+ int size;
14779+ int nr_disks;
14780+ int raid_disks;
14781+ int md_minor;
14782+ int not_persistent;
14783+
14784+ /*
14785+ * Generic state information
14786+ */
14787+ int utime; /* 0 Superblock update time */
14788+ int state; /* 1 State bits (clean, ...) */
14789+ int active_disks; /* 2 Number of currently active disks */
14790+ int working_disks; /* 3 Number of working disks */
14791+ int failed_disks; /* 4 Number of failed disks */
14792+ int spare_disks; /* 5 Number of spare disks */
14793+
14794+ /*
14795+ * Personality information
14796+ */
14797+ int layout; /* 0 the array's physical layout */
14798+ int chunk_size; /* 1 chunk size in bytes */
14799+
14800+} mdu_array_info_t;
14801+
14802+typedef struct mdu_disk_info_s {
14803+ /*
14804+ * configuration/status of one particular disk
14805+ */
14806+ int number;
14807+ int major;
14808+ int minor;
14809+ int raid_disk;
14810+ int state;
14811+
14812+} mdu_disk_info_t;
14813+
14814+typedef struct mdu_start_info_s {
14815+ /*
14816+ * configuration/status of one particular disk
14817+ */
14818+ int major;
14819+ int minor;
14820+ int raid_disk;
14821+ int state;
14822+
14823+} mdu_start_info_t;
14824+
14825+typedef struct mdu_param_s
14826+{
14827+ int personality; /* 1,2,3,4 */
14828+ int chunk_size; /* in bytes */
14829+ int max_fault; /* unused for now */
14830+} mdu_param_t;
14831+
14832+#endif /* _MD_U_H */
14833+
14834diff -ruN linux.orig/include/linux/raid/raid0.h linux-2.2.16/include/linux/raid/raid0.h
14835--- linux.orig/include/linux/raid/raid0.h Thu Jan 1 01:00:00 1970
14836+++ linux-2.2.16/include/linux/raid/raid0.h Fri Jun 9 11:37:44 2000
14837@@ -0,0 +1,33 @@
14838+#ifndef _RAID0_H
14839+#define _RAID0_H
14840+
14841+#include <linux/raid/md.h>
14842+
14843+struct strip_zone
14844+{
14845+ int zone_offset; /* Zone offset in md_dev */
14846+ int dev_offset; /* Zone offset in real dev */
14847+ int size; /* Zone size */
14848+ int nb_dev; /* # of devices attached to the zone */
14849+ mdk_rdev_t *dev[MAX_REAL]; /* Devices attached to the zone */
14850+};
14851+
14852+struct raid0_hash
14853+{
14854+ struct strip_zone *zone0, *zone1;
14855+};
14856+
14857+struct raid0_private_data
14858+{
14859+ struct raid0_hash *hash_table; /* Dynamically allocated */
14860+ struct strip_zone *strip_zone; /* This one too */
14861+ int nr_strip_zones;
14862+ struct strip_zone *smallest;
14863+ int nr_zones;
14864+};
14865+
14866+typedef struct raid0_private_data raid0_conf_t;
14867+
14868+#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
14869+
14870+#endif
14871diff -ruN linux.orig/include/linux/raid/raid1.h linux-2.2.16/include/linux/raid/raid1.h
14872--- linux.orig/include/linux/raid/raid1.h Thu Jan 1 01:00:00 1970
14873+++ linux-2.2.16/include/linux/raid/raid1.h Fri Jun 9 11:37:44 2000
14874@@ -0,0 +1,64 @@
14875+#ifndef _RAID1_H
14876+#define _RAID1_H
14877+
14878+#include <linux/raid/md.h>
14879+
14880+struct mirror_info {
14881+ int number;
14882+ int raid_disk;
14883+ kdev_t dev;
14884+ int next;
14885+ int sect_limit;
14886+
14887+ /*
14888+ * State bits:
14889+ */
14890+ int operational;
14891+ int write_only;
14892+ int spare;
14893+
14894+ int used_slot;
14895+};
14896+
14897+struct raid1_private_data {
14898+ mddev_t *mddev;
14899+ struct mirror_info mirrors[MD_SB_DISKS];
14900+ int nr_disks;
14901+ int raid_disks;
14902+ int working_disks;
14903+ int last_used;
14904+ unsigned long next_sect;
14905+ int sect_count;
14906+ mdk_thread_t *thread, *resync_thread;
14907+ int resync_mirrors;
14908+ struct mirror_info *spare;
14909+};
14910+
14911+typedef struct raid1_private_data raid1_conf_t;
14912+
14913+/*
14914+ * this is the only point in the RAID code where we violate
14915+ * C type safety. mddev->private is an 'opaque' pointer.
14916+ */
14917+#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private)
14918+
14919+/*
14920+ * this is our 'private' 'collective' RAID1 buffer head.
14921+ * it contains information about what kind of IO operations were started
14922+ * for this RAID1 operation, and about their status:
14923+ */
14924+
14925+struct raid1_bh {
14926+ atomic_t remaining; /* 'have we finished' count,
14927+ * used from IRQ handlers
14928+ */
14929+ int cmd;
14930+ unsigned long state;
14931+ mddev_t *mddev;
14932+ struct buffer_head *master_bh;
14933+ struct buffer_head *mirror_bh [MD_SB_DISKS];
14934+ struct buffer_head bh_req;
14935+ struct buffer_head *next_retry;
14936+};
14937+
14938+#endif
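
The atomic 'remaining' counter is what lets the per-mirror IOs complete in any order from interrupt context: each finishing buffer decrements it, and whichever one reaches zero completes the master request. A hedged sketch of the pattern, not the driver's actual end_io routine:

static void mirror_end_io_sketch(struct raid1_bh *r1_bh, int uptodate)
{
	if (atomic_dec_and_test(&r1_bh->remaining))
		r1_bh->master_bh->b_end_io(r1_bh->master_bh, uptodate);
}
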
14939diff -ruN linux.orig/include/linux/raid/raid5.h linux-2.2.16/include/linux/raid/raid5.h
14940--- linux.orig/include/linux/raid/raid5.h Thu Jan 1 01:00:00 1970
14941+++ linux-2.2.16/include/linux/raid/raid5.h Fri Jun 9 11:37:44 2000
14942@@ -0,0 +1,113 @@
14943+#ifndef _RAID5_H
14944+#define _RAID5_H
14945+
14946+#include <linux/raid/md.h>
14947+#include <linux/raid/xor.h>
14948+
14949+struct disk_info {
14950+ kdev_t dev;
14951+ int operational;
14952+ int number;
14953+ int raid_disk;
14954+ int write_only;
14955+ int spare;
14956+ int used_slot;
14957+};
14958+
14959+struct stripe_head {
14960+ md_spinlock_t stripe_lock;
14961+ struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
14962+ struct stripe_head *free_next; /* pool of free sh's */
14963+ struct buffer_head *buffer_pool; /* pool of free buffers */
14964+ struct buffer_head *bh_pool; /* pool of free bh's */
14965+ struct raid5_private_data *raid_conf;
14966+ struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
14967+ struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
14968+ struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
14969+ struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
14970+ int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
14971+ int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
14972+ unsigned long sector; /* sector of this row */
14973+ int size; /* buffers size */
14974+ int pd_idx; /* parity disk index */
14975+ atomic_t nr_pending; /* nr of pending cmds */
14976+ unsigned long state; /* state flags */
14977+ int cmd; /* stripe cmd */
14978+ int count; /* nr of waiters */
14979+ int write_method; /* reconstruct-write / read-modify-write */
14980+ int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
14981+ struct wait_queue *wait; /* processes waiting for this stripe */
14982+};
14983+
14984+/*
14985+ * Phase
14986+ */
14987+#define PHASE_BEGIN 0
14988+#define PHASE_READ_OLD 1
14989+#define PHASE_WRITE 2
14990+#define PHASE_READ 3
14991+#define PHASE_COMPLETE 4
14992+
14993+/*
14994+ * Write method
14995+ */
14996+#define METHOD_NONE 0
14997+#define RECONSTRUCT_WRITE 1
14998+#define READ_MODIFY_WRITE 2
14999+
15000+/*
15001+ * Stripe state
15002+ */
15003+#define STRIPE_LOCKED 0
15004+#define STRIPE_ERROR 1
15005+
15006+/*
15007+ * Stripe commands
15008+ */
15009+#define STRIPE_NONE 0
15010+#define STRIPE_WRITE 1
15011+#define STRIPE_READ 2
15012+
15013+struct raid5_private_data {
15014+ struct stripe_head **stripe_hashtbl;
15015+ mddev_t *mddev;
15016+ mdk_thread_t *thread, *resync_thread;
15017+ struct disk_info disks[MD_SB_DISKS];
15018+ struct disk_info *spare;
15019+ int buffer_size;
15020+ int chunk_size, level, algorithm;
15021+ int raid_disks, working_disks, failed_disks;
15022+ int sector_count;
15023+ unsigned long next_sector;
15024+ atomic_t nr_handle;
15025+ struct stripe_head *next_free_stripe;
15026+ int nr_stripes;
15027+ int resync_parity;
15028+ int max_nr_stripes;
15029+ int clock;
15030+ int nr_hashed_stripes;
15031+ int nr_locked_stripes;
15032+ int nr_pending_stripes;
15033+ int nr_cached_stripes;
15034+
15035+ /*
15036+ * Free stripes pool
15037+ */
15038+ int nr_free_sh;
15039+ struct stripe_head *free_sh_list;
15040+ struct wait_queue *wait_for_stripe;
15041+};
15042+
15043+typedef struct raid5_private_data raid5_conf_t;
15044+
15045+#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
15046+
15047+/*
15048+ * Our supported algorithms
15049+ */
15050+#define ALGORITHM_LEFT_ASYMMETRIC 0
15051+#define ALGORITHM_RIGHT_ASYMMETRIC 1
15052+#define ALGORITHM_LEFT_SYMMETRIC 2
15053+#define ALGORITHM_RIGHT_SYMMETRIC 3
15054+
15055+#endif
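
The four algorithms differ only in where parity lands for a given stripe and how the data blocks number around it. As an illustration, a left-asymmetric layout rotates parity from the last disk backwards, one position per stripe; a sketch of that placement rule (paraphrased, not quoted from raid5.c):

static int pd_idx_left_asymmetric_sketch(unsigned long stripe, int raid_disks)
{
	/* stripe 0 -> last disk, stripe 1 -> second-to-last, ... cyclically */
	return raid_disks - 1 - (int) (stripe % raid_disks);
}
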
15056diff -ruN linux.orig/include/linux/raid/translucent.h linux-2.2.16/include/linux/raid/translucent.h
15057--- linux.orig/include/linux/raid/translucent.h Thu Jan 1 01:00:00 1970
15058+++ linux-2.2.16/include/linux/raid/translucent.h Fri Jun 9 11:37:44 2000
15059@@ -0,0 +1,23 @@
15060+#ifndef _TRANSLUCENT_H
15061+#define _TRANSLUCENT_H
15062+
15063+#include <linux/raid/md.h>
15064+
15065+typedef struct dev_info dev_info_t;
15066+
15067+struct dev_info {
15068+ kdev_t dev;
15069+ int size;
15070+};
15071+
15072+struct translucent_private_data
15073+{
15074+ dev_info_t disks[MD_SB_DISKS];
15075+};
15076+
15077+
15078+typedef struct translucent_private_data translucent_conf_t;
15079+
15080+#define mddev_to_conf(mddev) ((translucent_conf_t *) mddev->private)
15081+
15082+#endif
15083diff -ruN linux.orig/include/linux/raid/xor.h linux-2.2.16/include/linux/raid/xor.h
15084--- linux.orig/include/linux/raid/xor.h Thu Jan 1 01:00:00 1970
15085+++ linux-2.2.16/include/linux/raid/xor.h Fri Jun 9 11:37:44 2000
15086@@ -0,0 +1,12 @@
15087+#ifndef _XOR_H
15088+#define _XOR_H
15089+
15090+#include <linux/raid/md.h>
15091+
15092+#define MAX_XOR_BLOCKS 5
15093+
15094+extern void calibrate_xor_block(void);
15095+extern void (*xor_block)(unsigned int count,
15096+ struct buffer_head **bh_ptr);
15097+
15098+#endif
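
xor_block is a function pointer so that calibrate_xor_block() can install the fastest routine for the machine (compare md_cpu_has_mmx() in md_compatible.h) once at startup. As we read the convention, the sources are folded into the first buffer; a hedged usage sketch:

static void xor_into_first_sketch(struct buffer_head **bh_ptr, unsigned int count)
{
	/* count <= MAX_XOR_BLOCKS; bh_ptr[0] ends up as the XOR of all buffers */
	xor_block(count, bh_ptr);
}
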
15099diff -ruN linux.orig/include/linux/raid0.h linux-2.2.16/include/linux/raid0.h
15100--- linux.orig/include/linux/raid0.h Tue Oct 29 14:20:24 1996
15101+++ linux-2.2.16/include/linux/raid0.h Thu Jan 1 01:00:00 1970
15102@@ -1,27 +0,0 @@
15103-#ifndef _RAID0_H
15104-#define _RAID0_H
15105-
15106-struct strip_zone
15107-{
15108- int zone_offset; /* Zone offset in md_dev */
15109- int dev_offset; /* Zone offset in real dev */
15110- int size; /* Zone size */
15111- int nb_dev; /* Number of devices attached to the zone */
15112- struct real_dev *dev[MAX_REAL]; /* Devices attached to the zone */
15113-};
15114-
15115-struct raid0_hash
15116-{
15117- struct strip_zone *zone0, *zone1;
15118-};
15119-
15120-struct raid0_data
15121-{
15122- struct raid0_hash *hash_table; /* Dynamically allocated */
15123- struct strip_zone *strip_zone; /* This one too */
15124- int nr_strip_zones;
15125- struct strip_zone *smallest;
15126- int nr_zones;
15127-};
15128-
15129-#endif
15130diff -ruN linux.orig/include/linux/raid1.h linux-2.2.16/include/linux/raid1.h
15131--- linux.orig/include/linux/raid1.h Fri May 8 09:17:13 1998
15132+++ linux-2.2.16/include/linux/raid1.h Thu Jan 1 01:00:00 1970
15133@@ -1,49 +0,0 @@
15134-#ifndef _RAID1_H
15135-#define _RAID1_H
15136-
15137-#include <linux/md.h>
15138-
15139-struct mirror_info {
15140- int number;
15141- int raid_disk;
15142- kdev_t dev;
15143- int next;
15144- int sect_limit;
15145-
15146- /*
15147- * State bits:
15148- */
15149- int operational;
15150- int write_only;
15151- int spare;
15152-};
15153-
15154-struct raid1_data {
15155- struct md_dev *mddev;
15156- struct mirror_info mirrors[MD_SB_DISKS]; /* RAID1 devices, 2 to MD_SB_DISKS */
15157- int raid_disks;
15158- int working_disks; /* Number of working disks */
15159- int last_used;
15160- unsigned long next_sect;
15161- int sect_count;
15162- int resync_running;
15163-};
15164-
15165-/*
15166- * this is our 'private' 'collective' RAID1 buffer head.
15167- * it contains information about what kind of IO operations were started
15168- * for this RAID5 operation, and about their status:
15169- */
15170-
15171-struct raid1_bh {
15172- unsigned int remaining;
15173- int cmd;
15174- unsigned long state;
15175- struct md_dev *mddev;
15176- struct buffer_head *master_bh;
15177- struct buffer_head *mirror_bh [MD_SB_DISKS];
15178- struct buffer_head bh_req;
15179- struct buffer_head *next_retry;
15180-};
15181-
15182-#endif
15183diff -ruN linux.orig/include/linux/raid5.h linux-2.2.16/include/linux/raid5.h
15184--- linux.orig/include/linux/raid5.h Fri May 8 09:17:13 1998
15185+++ linux-2.2.16/include/linux/raid5.h Thu Jan 1 01:00:00 1970
15186@@ -1,110 +0,0 @@
15187-#ifndef _RAID5_H
15188-#define _RAID5_H
15189-
15190-#ifdef __KERNEL__
15191-#include <linux/md.h>
15192-#include <asm/atomic.h>
15193-
15194-struct disk_info {
15195- kdev_t dev;
15196- int operational;
15197- int number;
15198- int raid_disk;
15199- int write_only;
15200- int spare;
15201-};
15202-
15203-struct stripe_head {
15204- struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
15205- struct stripe_head *free_next; /* pool of free sh's */
15206- struct buffer_head *buffer_pool; /* pool of free buffers */
15207- struct buffer_head *bh_pool; /* pool of free bh's */
15208- struct raid5_data *raid_conf;
15209- struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
15210- struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
15211- struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
15212- struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
15213- int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
15214- int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
15215- unsigned long sector; /* sector of this row */
15216- int size; /* buffers size */
15217- int pd_idx; /* parity disk index */
15218- int nr_pending; /* nr of pending cmds */
15219- unsigned long state; /* state flags */
15220- int cmd; /* stripe cmd */
15221- int count; /* nr of waiters */
15222- int write_method; /* reconstruct-write / read-modify-write */
15223- int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
15224- struct wait_queue *wait; /* processes waiting for this stripe */
15225-};
15226-
15227-/*
15228- * Phase
15229- */
15230-#define PHASE_BEGIN 0
15231-#define PHASE_READ_OLD 1
15232-#define PHASE_WRITE 2
15233-#define PHASE_READ 3
15234-#define PHASE_COMPLETE 4
15235-
15236-/*
15237- * Write method
15238- */
15239-#define METHOD_NONE 0
15240-#define RECONSTRUCT_WRITE 1
15241-#define READ_MODIFY_WRITE 2
15242-
15243-/*
15244- * Stripe state
15245- */
15246-#define STRIPE_LOCKED 0
15247-#define STRIPE_ERROR 1
15248-
15249-/*
15250- * Stripe commands
15251- */
15252-#define STRIPE_NONE 0
15253-#define STRIPE_WRITE 1
15254-#define STRIPE_READ 2
15255-
15256-struct raid5_data {
15257- struct stripe_head **stripe_hashtbl;
15258- struct md_dev *mddev;
15259- struct md_thread *thread, *resync_thread;
15260- struct disk_info disks[MD_SB_DISKS];
15261- struct disk_info *spare;
15262- int buffer_size;
15263- int chunk_size, level, algorithm;
15264- int raid_disks, working_disks, failed_disks;
15265- int sector_count;
15266- unsigned long next_sector;
15267- atomic_t nr_handle;
15268- struct stripe_head *next_free_stripe;
15269- int nr_stripes;
15270- int resync_parity;
15271- int max_nr_stripes;
15272- int clock;
15273- int nr_hashed_stripes;
15274- int nr_locked_stripes;
15275- int nr_pending_stripes;
15276- int nr_cached_stripes;
15277-
15278- /*
15279- * Free stripes pool
15280- */
15281- int nr_free_sh;
15282- struct stripe_head *free_sh_list;
15283- struct wait_queue *wait_for_stripe;
15284-};
15285-
15286-#endif
15287-
15288-/*
15289- * Our supported algorithms
15290- */
15291-#define ALGORITHM_LEFT_ASYMMETRIC 0
15292-#define ALGORITHM_RIGHT_ASYMMETRIC 1
15293-#define ALGORITHM_LEFT_SYMMETRIC 2
15294-#define ALGORITHM_RIGHT_SYMMETRIC 3
15295-
15296-#endif
15297diff -ruN linux.orig/include/linux/sysctl.h linux-2.2.16/include/linux/sysctl.h
15298--- linux.orig/include/linux/sysctl.h Wed Jun 7 23:26:44 2000
15299+++ linux-2.2.16/include/linux/sysctl.h Fri Jun 9 11:45:55 2000
15300@@ -430,7 +430,8 @@
15301 /* CTL_DEV names: */
15302 enum {
15303 DEV_CDROM=1,
15304- DEV_HWMON=2
15305+ DEV_HWMON=2,
15306+ DEV_MD=3
15307 };
15308
15309 /* /proc/sys/dev/cdrom */
15310@@ -441,6 +442,11 @@
15311 DEV_CDROM_DEBUG=4,
15312 DEV_CDROM_LOCK=5,
15313 DEV_CDROM_CHECK_MEDIA=6
15314+};
15315+
15316+/* /proc/sys/dev/md */
15317+enum {
15318+ DEV_MD_SPEED_LIMIT=1
15319 };
15320
15321 #ifdef __KERNEL__
15322diff -ruN linux.orig/init/main.c linux-2.2.16/init/main.c
15323--- linux.orig/init/main.c Wed Jun 7 23:26:44 2000
15324+++ linux-2.2.16/init/main.c Fri Jun 9 11:37:44 2000
15325@@ -19,6 +19,7 @@
15326 #include <linux/utsname.h>
15327 #include <linux/ioport.h>
15328 #include <linux/init.h>
15329+#include <linux/raid/md.h>
15330 #include <linux/smp_lock.h>
15331 #include <linux/blk.h>
15332 #include <linux/hdreg.h>
15333@@ -540,7 +541,7 @@
15334 #ifdef CONFIG_BLK_DEV_FD
15335 { "fd", 0x0200 },
15336 #endif
15337-#ifdef CONFIG_MD_BOOT
15338+#if CONFIG_MD_BOOT || CONFIG_AUTODETECT_RAID
15339 { "md", 0x0900 },
15340 #endif
15341 #ifdef CONFIG_BLK_DEV_XD
15342@@ -1042,6 +1043,9 @@
15343 #ifdef CONFIG_MD_BOOT
15344 { "md=", md_setup},
15345 #endif
15346+#if CONFIG_BLK_DEV_MD
15347+ { "raid=", raid_setup},
15348+#endif
15349 #ifdef CONFIG_ADBMOUSE
15350 { "adb_buttons=", adb_mouse_setup },
15351 #endif
15352@@ -1580,6 +1584,9 @@
15353 while (pid != wait(&i));
15354 if (MAJOR(real_root_dev) != RAMDISK_MAJOR
15355 || MINOR(real_root_dev) != 0) {
15356+#ifdef CONFIG_BLK_DEV_MD
15357+ autodetect_raid();
15358+#endif
15359 error = change_root(real_root_dev,"/initrd");
15360 if (error)
15361 printk(KERN_ERR "Change root to /initrd: "