1 diff -ruN linux.orig/Documentation/Configure.help linux-2.2.16/Documentation/Configure.help
2 --- linux.orig/Documentation/Configure.help Wed Jun 7 23:26:42 2000
3 +++ linux-2.2.16/Documentation/Configure.help Fri Jun 9 11:37:48 2000
8 +Autodetect RAID partitions
9 +CONFIG_AUTODETECT_RAID
10 + This feature lets the kernel detect RAID partitions on bootup.
11 + An autodetect RAID partition is a normal partition with partition
12 + type 0xfd. Use this if you want to boot RAID devices, or want to
13 + run them automatically.
17 If you say Y here, then your multiple devices driver will be able to
18 @@ -1039,6 +1046,21 @@
19 Documentation/modules.txt.
23 +Translucent Block Device Support (EXPERIMENTAL)
24 +CONFIG_MD_TRANSLUCENT
25 + DO NOT USE THIS STUFF YET!
27 + Currently there is only a placeholder there as the implementation
30 +Hierarchical Storage Management support (EXPERIMENTAL)
32 + DO NOT USE THIS STUFF YET!
34 + I have released this so people can comment on the architecture,
35 + but user-space tools are still unusable so there is nothing much
36 + you can do with this.
38 Boot support (linear, striped)
40 diff -ruN linux.orig/arch/i386/defconfig linux-2.2.16/arch/i386/defconfig
41 --- linux.orig/arch/i386/defconfig Thu May 4 02:16:30 2000
42 +++ linux-2.2.16/arch/i386/defconfig Fri Jun 9 11:37:45 2000
45 # CONFIG_BLK_DEV_LOOP is not set
46 # CONFIG_BLK_DEV_NBD is not set
47 -# CONFIG_BLK_DEV_MD is not set
49 +CONFIG_AUTODETECT_RAID=y
50 +CONFIG_MD_TRANSLUCENT=y
53 +CONFIG_MD_MIRRORING=y
57 # CONFIG_BLK_DEV_RAM is not set
58 # CONFIG_BLK_DEV_XD is not set
59 # CONFIG_BLK_DEV_DAC960 is not set
60 diff -ruN linux.orig/arch/sparc/config.in linux-2.2.16/arch/sparc/config.in
61 --- linux.orig/arch/sparc/config.in Wed Jun 7 23:26:42 2000
62 +++ linux-2.2.16/arch/sparc/config.in Fri Jun 9 11:37:45 2000
66 # For a description of the syntax of this configuration file,
67 # see Documentation/kbuild/config-language.txt.
71 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
72 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
73 + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
74 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
75 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
76 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
77 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
78 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
79 + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
81 +if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
82 + bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
85 tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
86 diff -ruN linux.orig/arch/sparc/defconfig linux-2.2.16/arch/sparc/defconfig
87 --- linux.orig/arch/sparc/defconfig Thu May 4 02:16:32 2000
88 +++ linux-2.2.16/arch/sparc/defconfig Fri Jun 9 11:37:45 2000
93 +# CONFIG_AUTODETECT_RAID is not set
98 +# CONFIG_MD_TRANSLUCENT is not set
99 +# CONFIG_MD_HSM is not set
101 CONFIG_BLK_DEV_INITRD=y
102 CONFIG_BLK_DEV_LOOP=m
103 diff -ruN linux.orig/arch/sparc64/config.in linux-2.2.16/arch/sparc64/config.in
104 --- linux.orig/arch/sparc64/config.in Wed Jun 7 23:26:42 2000
105 +++ linux-2.2.16/arch/sparc64/config.in Fri Jun 9 11:37:48 2000
108 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
109 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
110 + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
111 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
112 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
113 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
114 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
115 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
116 + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
118 +if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
119 + bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
122 tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
123 diff -ruN linux.orig/arch/sparc64/defconfig linux-2.2.16/arch/sparc64/defconfig
124 --- linux.orig/arch/sparc64/defconfig Wed Jun 7 23:26:42 2000
125 +++ linux-2.2.16/arch/sparc64/defconfig Fri Jun 9 11:37:48 2000
126 @@ -107,10 +107,13 @@
130 +# CONFIG_AUTODETECT_RAID is not set
133 CONFIG_MD_MIRRORING=m
135 +# CONFIG_MD_TRANSLUCENT is not set
136 +# CONFIG_MD_HSM is not set
138 CONFIG_BLK_DEV_INITRD=y
139 CONFIG_BLK_DEV_LOOP=m
140 diff -ruN linux.orig/drivers/block/Config.in linux-2.2.16/drivers/block/Config.in
141 --- linux.orig/drivers/block/Config.in Wed Jun 7 23:26:42 2000
142 +++ linux-2.2.16/drivers/block/Config.in Fri Jun 9 11:37:45 2000
143 @@ -102,10 +102,13 @@
145 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
146 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
147 + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
148 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
149 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
150 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
151 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
152 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
153 + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
155 if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
156 bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
157 diff -ruN linux.orig/drivers/block/Makefile linux-2.2.16/drivers/block/Makefile
158 --- linux.orig/drivers/block/Makefile Thu May 4 02:16:32 2000
159 +++ linux-2.2.16/drivers/block/Makefile Fri Jun 9 11:37:45 2000
160 @@ -282,10 +282,28 @@
163 ifeq ($(CONFIG_MD_RAID5),y)
167 ifeq ($(CONFIG_MD_RAID5),m)
173 +ifeq ($(CONFIG_MD_TRANSLUCENT),y)
174 +L_OBJS += translucent.o
176 + ifeq ($(CONFIG_MD_TRANSLUCENT),m)
177 + M_OBJS += translucent.o
181 +ifeq ($(CONFIG_MD_HSM),y)
184 + ifeq ($(CONFIG_MD_HSM),m)
189 diff -ruN linux.orig/drivers/block/genhd.c linux-2.2.16/drivers/block/genhd.c
190 --- linux.orig/drivers/block/genhd.c Wed Jun 7 23:26:42 2000
191 +++ linux-2.2.16/drivers/block/genhd.c Fri Jun 9 11:37:45 2000
193 #include <linux/string.h>
194 #include <linux/blk.h>
195 #include <linux/init.h>
196 +#include <linux/raid/md.h>
198 #include <asm/system.h>
199 #include <asm/byteorder.h>
200 @@ -1649,6 +1650,9 @@
205 +#ifdef CONFIG_BLK_DEV_MD
208 #ifdef CONFIG_MD_BOOT
210 diff -ruN linux.orig/drivers/block/hsm.c linux-2.2.16/drivers/block/hsm.c
211 --- linux.orig/drivers/block/hsm.c Thu Jan 1 01:00:00 1970
212 +++ linux-2.2.16/drivers/block/hsm.c Fri Jun 9 11:37:45 2000
215 + hsm.c : HSM RAID driver for Linux
216 + Copyright (C) 1998 Ingo Molnar
218 + HSM mode management functions.
220 + This program is free software; you can redistribute it and/or modify
221 + it under the terms of the GNU General Public License as published by
222 + the Free Software Foundation; either version 2, or (at your option)
225 + You should have received a copy of the GNU General Public License
226 + (for example /usr/src/linux/COPYING); if not, write to the Free
227 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
230 +#include <linux/module.h>
232 +#include <linux/raid/md.h>
233 +#include <linux/malloc.h>
235 +#include <linux/raid/hsm.h>
236 +#include <linux/blk.h>
238 +#define MAJOR_NR MD_MAJOR
240 +#define MD_PERSONALITY
246 +#define dprintk(x,y...) printk(x,##y)
248 +#define dprintk(x,y...) do { } while (0)
251 +void print_bh(struct buffer_head *bh)
253 + dprintk("bh %p: %lx %lx %x %x %lx %p %lx %p %x %p %x %lx\n", bh,
254 + bh->b_blocknr, bh->b_size, bh->b_dev, bh->b_rdev,
255 + bh->b_rsector, bh->b_this_page, bh->b_state,
256 + bh->b_next_free, bh->b_count, bh->b_data,
257 + bh->b_list, bh->b_flushtime
261 +static int check_bg (pv_t *pv, pv_block_group_t * bg)
265 + dprintk("checking bg ...\n");
267 + for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
268 + if (pv_pptr_free(bg->blocks + i)) {
270 + if (test_bit(i, bg->used_bitmap)) {
271 + printk("hm, bit %d set?\n", i);
274 + if (!test_bit(i, bg->used_bitmap)) {
275 + printk("hm, bit %d not set?\n", i);
279 + dprintk("%d free blocks in bg ...\n", free);
283 +static void get_bg (pv_t *pv, pv_bg_desc_t *desc, int nr)
285 + unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
286 + struct buffer_head *bh;
288 + dprintk("... getting BG at %u ...\n", bg_pos);
290 + bh = bread (pv->dev, bg_pos, HSM_BLOCKSIZE);
295 + desc->bg = (pv_block_group_t *) bh->b_data;
296 + desc->free_blocks = check_bg(pv, desc->bg);
299 +static int find_free_block (lv_t *lv, pv_t *pv, pv_bg_desc_t *desc, int nr,
300 + unsigned int lblock, lv_lptr_t * index)
304 + for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
305 + pv_pptr_t * bptr = desc->bg->blocks + i;
306 + if (pv_pptr_free(bptr)) {
307 + unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
309 + if (test_bit(i, desc->bg->used_bitmap)) {
313 + bptr->u.used.owner.log_id = lv->log_id;
314 + bptr->u.used.owner.log_index = lblock;
315 + index->data.phys_nr = pv->phys_nr;
316 + index->data.phys_block = bg_pos + i + 1;
317 + set_bit(i, desc->bg->used_bitmap);
318 + desc->free_blocks--;
319 + dprintk(".....free blocks left in bg %p: %d\n",
320 + desc->bg, desc->free_blocks);
327 +static int __get_free_block (lv_t *lv, pv_t *pv,
328 + unsigned int lblock, lv_lptr_t * index)
332 + dprintk("trying to get free block for lblock %d ...\n", lblock);
334 + for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
335 + pv_bg_desc_t *desc = pv->bg_array + i;
337 + dprintk("looking at desc #%d (%p)...\n", i, desc->bg);
339 + get_bg(pv, desc, i);
341 + if (desc->bg && desc->free_blocks)
342 + return find_free_block(lv, pv, desc, i,
345 + dprintk("hsm: pv %s full!\n", partition_name(pv->dev));
349 +static int get_free_block (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
353 + if (!lv->free_indices)
357 + err = __get_free_block(lv, lv->vg->pv_array + 0, lblock, index);
359 + if (err || !index->data.phys_block) {
364 + lv->free_indices--;
370 + * fix me: wordsize assumptions ...
372 +#define INDEX_BITS 8
373 +#define INDEX_DEPTH (32/INDEX_BITS)
374 +#define INDEX_MASK ((1<<INDEX_BITS) - 1)
376 +static void print_index_list (lv_t *lv, lv_lptr_t *index)
381 + dprintk("... block <%u,%u,%x> [.", index->data.phys_nr,
382 + index->data.phys_block, index->cpu_addr);
384 + tmp = index_child(index);
385 + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
386 + if (index_block(lv, tmp))
387 + dprintk("(%d->%d)", i, index_block(lv, tmp));
393 +static int read_index_group (lv_t *lv, lv_lptr_t *index)
395 + lv_lptr_t *index_group, *tmp;
396 + struct buffer_head *bh;
399 + dprintk("reading index group <%s:%d>\n",
400 + partition_name(index_dev(lv, index)), index_block(lv, index));
402 + bh = bread(index_dev(lv, index), index_block(lv, index), HSM_BLOCKSIZE);
407 + if (!buffer_uptodate(bh))
410 + index_group = (lv_lptr_t *) bh->b_data;
412 + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
413 + if (index_block(lv, tmp)) {
414 + dprintk("index group has BLOCK %d, non-present.\n", i);
419 + index->cpu_addr = ptr_to_cpuaddr(index_group);
421 + dprintk("have read index group %p at block %d.\n",
422 + index_group, index_block(lv, index));
423 + print_index_list(lv, index);
428 +static int alloc_index_group (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
430 + struct buffer_head *bh;
431 + lv_lptr_t * index_group;
433 + if (get_free_block(lv, lblock, index))
436 + dprintk("creating block for index group <%s:%d>\n",
437 + partition_name(index_dev(lv, index)), index_block(lv, index));
439 + bh = getblk(index_dev(lv, index),
440 + index_block(lv, index), HSM_BLOCKSIZE);
442 + index_group = (lv_lptr_t *) bh->b_data;
443 + md_clear_page(index_group);
444 + mark_buffer_uptodate(bh, 1);
446 + index->cpu_addr = ptr_to_cpuaddr(index_group);
448 + dprintk("allocated index group %p at block %d.\n",
449 + index_group, index_block(lv, index));
453 +static lv_lptr_t * alloc_fixed_index (lv_t *lv, unsigned int lblock)
455 + lv_lptr_t * index = index_child(&lv->root_index);
458 + for (l = INDEX_DEPTH-1; l >= 0; l--) {
459 + idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
463 + if (!index_present(index)) {
464 + dprintk("no group, level %u, pos %u\n", l, idx);
465 + if (alloc_index_group(lv, lblock, index))
468 + index = index_child(index);
470 + if (!index_block(lv,index)) {
471 + dprintk("no data, pos %u\n", idx);
472 + if (get_free_block(lv, lblock, index))
480 +static lv_lptr_t * find_index (lv_t *lv, unsigned int lblock)
482 + lv_lptr_t * index = index_child(&lv->root_index);
485 + for (l = INDEX_DEPTH-1; l >= 0; l--) {
486 + idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
490 + if (index_free(index))
492 + if (!index_present(index))
493 + read_index_group(lv, index);
494 + if (!index_present(index)) {
498 + index = index_child(index);
500 + if (!index_block(lv,index))
505 +static int read_root_index(lv_t *lv)
508 + lv_lptr_t *index = &lv->root_index;
510 + if (!index_block(lv, index)) {
511 + printk("LV has no root index yet, creating.\n");
513 + err = alloc_index_group (lv, 0, index);
515 + printk("could not create index group, err:%d\n", err);
518 + lv->vg->vg_sb->lv_array[lv->log_id].lv_root_idx =
519 + lv->root_index.data;
521 + printk("LV already has a root index.\n");
522 + printk("... at <%s:%d>.\n",
523 + partition_name(index_dev(lv, index)),
524 + index_block(lv, index));
526 + read_index_group(lv, index);
531 +static int init_pv(pv_t *pv)
533 + struct buffer_head *bh;
536 + bh = bread (pv->dev, 0, HSM_BLOCKSIZE);
542 + pv_sb = (pv_sb_t *) bh->b_data;
545 + if (pv_sb->pv_magic != HSM_PV_SB_MAGIC) {
546 + printk("%s is not a PV, has magic %x instead of %x!\n",
547 + partition_name(pv->dev), pv_sb->pv_magic,
551 + printk("%s detected as a valid PV (#%d).\n", partition_name(pv->dev),
553 + printk("... created under HSM version %d.%d.%d, at %x.\n",
554 + pv_sb->pv_major, pv_sb->pv_minor, pv_sb->pv_patch, pv_sb->pv_ctime);
555 + printk("... total # of blocks: %d (%d left unallocated).\n",
556 + pv_sb->pv_total_size, pv_sb->pv_blocks_left);
558 + printk("... block size: %d bytes.\n", pv_sb->pv_block_size);
559 + printk("... block descriptor size: %d bytes.\n", pv_sb->pv_pptr_size);
560 + printk("... block group size: %d blocks.\n", pv_sb->pv_bg_size);
561 + printk("... # of block groups: %d.\n", pv_sb->pv_block_groups);
563 + if (pv_sb->pv_block_groups*sizeof(pv_bg_desc_t) > PAGE_SIZE) {
567 + pv->bg_array = (pv_bg_desc_t *)__get_free_page(GFP_KERNEL);
568 + if (!pv->bg_array) {
572 + memset(pv->bg_array, 0, PAGE_SIZE);
577 +static int free_pv(pv_t *pv)
579 + struct buffer_head *bh;
581 + dprintk("freeing PV %d ...\n", pv->phys_nr);
583 + if (pv->bg_array) {
586 + dprintk(".... freeing BGs ...\n");
587 + for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
588 + unsigned int bg_pos = i * pv->pv_sb->pv_bg_size + 2;
589 + pv_bg_desc_t *desc = pv->bg_array + i;
592 + dprintk(".... freeing BG %d ...\n", i);
593 + bh = getblk (pv->dev, bg_pos, HSM_BLOCKSIZE);
594 + mark_buffer_dirty(bh, 1);
599 + free_page((unsigned long)pv->bg_array);
603 + bh = getblk (pv->dev, 0, HSM_BLOCKSIZE);
608 + mark_buffer_dirty(bh, 1);
615 +struct semaphore hsm_sem = MUTEX;
617 +#define HSM_SECTORS (HSM_BLOCKSIZE/512)
619 +static int hsm_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
620 + unsigned long *rsector, unsigned long bsectors)
622 + lv_t *lv = kdev_to_lv(dev);
624 + unsigned int lblock = *rsector / HSM_SECTORS;
625 + unsigned int offset = *rsector % HSM_SECTORS;
629 + printk("HSM: md%d not a Logical Volume!\n", mdidx(mddev));
632 + if (offset + bsectors > HSM_SECTORS) {
637 + index = find_index(lv, lblock);
639 + printk("no block %u yet ... allocating\n", lblock);
640 + index = alloc_fixed_index(lv, lblock);
645 + printk(" %u <%s : %ld(%ld)> -> ", lblock,
646 + partition_name(*rdev), *rsector, bsectors);
648 + *rdev = index_dev(lv, index);
649 + *rsector = index_block(lv, index) * HSM_SECTORS + offset;
651 + printk(" <%s : %ld> %u\n",
652 + partition_name(*rdev), *rsector, index_block(lv, index));
659 +static void free_index (lv_t *lv, lv_lptr_t * index)
661 + struct buffer_head *bh;
663 + printk("tryin to get cached block for index group <%s:%d>\n",
664 + partition_name(index_dev(lv, index)), index_block(lv, index));
666 + bh = getblk(index_dev(lv, index), index_block(lv, index),HSM_BLOCKSIZE);
668 + printk("....FREEING ");
669 + print_index_list(lv, index);
672 + if (!buffer_uptodate(bh))
674 + if ((lv_lptr_t *)bh->b_data != index_child(index)) {
675 + printk("huh? b_data is %p, index content is %p.\n",
676 + bh->b_data, index_child(index));
678 + printk("good, b_data == index content == %p.\n",
679 + index_child(index));
680 + printk("b_count == %d, writing.\n", bh->b_count);
681 + mark_buffer_dirty(bh, 1);
686 + printk("FAILED!\n");
688 + print_index_list(lv, index);
689 + index_child(index) = NULL;
692 +static void free_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
698 + nr_dots = (INDEX_DEPTH-level)*3;
699 + memcpy(dots,"...............",nr_dots);
702 + dprintk("%s level %d index group block:\n", dots, level);
706 + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
707 + if (index->data.phys_block) {
708 + dprintk("%s block <%u,%u,%x>\n", dots,
709 + index->data.phys_nr,
710 + index->data.phys_block,
712 + if (level && index_present(index)) {
713 + dprintk("%s==> deeper one level\n", dots);
714 + free_index_group(lv, level-1,
715 + index_child(index));
716 + dprintk("%s freeing index group block %p ...",
717 + dots, index_child(index));
718 + free_index(lv, index);
723 + dprintk("%s DONE: level %d index group block.\n", dots, level);
726 +static void free_lv_indextree (lv_t *lv)
728 + dprintk("freeing LV %d ...\n", lv->log_id);
729 + dprintk("..root index: %p\n", index_child(&lv->root_index));
730 + dprintk("..INDEX TREE:\n");
731 + free_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
732 + dprintk("..freeing root index %p ...", index_child(&lv->root_index));
733 + dprintk("root block <%u,%u,%x>\n", lv->root_index.data.phys_nr,
734 + lv->root_index.data.phys_block, lv->root_index.cpu_addr);
735 + free_index(lv, &lv->root_index);
736 + dprintk("..INDEX TREE done.\n");
737 + fsync_dev(lv->vg->pv_array[0].dev); /* fix me */
738 + lv->vg->vg_sb->lv_array[lv->log_id].lv_free_indices = lv->free_indices;
741 +static void print_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
747 + nr_dots = (INDEX_DEPTH-level)*3;
748 + memcpy(dots,"...............",nr_dots);
751 + dprintk("%s level %d index group block:\n", dots, level);
754 + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
755 + index = index_0 + i;
756 + if (index->data.phys_block) {
757 + dprintk("%s block <%u,%u,%x>\n", dots,
758 + index->data.phys_nr,
759 + index->data.phys_block,
761 + if (level && index_present(index)) {
762 + dprintk("%s==> deeper one level\n", dots);
763 + print_index_group(lv, level-1,
764 + index_child(index));
768 + dprintk("%s DONE: level %d index group block.\n", dots, level);
771 +static void print_lv (lv_t *lv)
773 + dprintk("printing LV %d ...\n", lv->log_id);
774 + dprintk("..root index: %p\n", index_child(&lv->root_index));
775 + dprintk("..INDEX TREE:\n");
776 + print_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
777 + dprintk("..INDEX TREE done.\n");
780 +static int map_lv (lv_t *lv)
782 + kdev_t dev = lv->dev;
783 + unsigned int nr = MINOR(dev);
784 + mddev_t *mddev = lv->vg->mddev;
786 + if (MAJOR(dev) != MD_MAJOR) {
790 + if (kdev_to_mddev(dev)) {
794 + md_hd_struct[nr].start_sect = 0;
795 + md_hd_struct[nr].nr_sects = md_size[mdidx(mddev)] << 1;
796 + md_size[nr] = md_size[mdidx(mddev)];
797 + add_mddev_mapping(mddev, dev, lv);
802 +static int unmap_lv (lv_t *lv)
804 + kdev_t dev = lv->dev;
805 + unsigned int nr = MINOR(dev);
807 + if (MAJOR(dev) != MD_MAJOR) {
811 + md_hd_struct[nr].start_sect = 0;
812 + md_hd_struct[nr].nr_sects = 0;
814 + del_mddev_mapping(lv->vg->mddev, dev);
819 +static int init_vg (vg_t *vg)
825 + struct buffer_head *bh;
826 + lv_descriptor_t *lv_desc;
829 + * fix me: read all PVs and compare the SB
831 + dev = vg->pv_array[0].dev;
832 + bh = bread (dev, 1, HSM_BLOCKSIZE);
838 + vg_sb = (vg_sb_t *) bh->b_data;
841 + if (vg_sb->vg_magic != HSM_VG_SB_MAGIC) {
842 + printk("%s is not a valid VG, has magic %x instead of %x!\n",
843 + partition_name(dev), vg_sb->vg_magic,
849 + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
851 + lv_desc = vg->vg_sb->lv_array + i;
853 + id = lv_desc->lv_id;
855 + printk("... LV desc %d empty\n", i);
858 + if (id >= HSM_MAX_LVS_PER_VG) {
863 + lv = vg->lv_array + id;
870 + lv->max_indices = lv_desc->lv_max_indices;
871 + lv->free_indices = lv_desc->lv_free_indices;
872 + lv->root_index.data = lv_desc->lv_root_idx;
873 + lv->dev = MKDEV(MD_MAJOR, lv_desc->md_id);
878 + if (read_root_index(lv)) {
881 + memset(lv, 0, sizeof(*lv));
884 + if (vg->nr_lv != vg_sb->nr_lvs)
890 +static int hsm_run (mddev_t *mddev)
898 + vg = kmalloc (sizeof (*vg), GFP_KERNEL);
901 + memset(vg, 0, sizeof(*vg));
902 + mddev->private = vg;
905 + if (md_check_ordering(mddev)) {
906 + printk("hsm: disks are not ordered, aborting!\n");
910 + set_blocksize (mddev_to_kdev(mddev), HSM_BLOCKSIZE);
912 + vg->nr_pv = mddev->nb_dev;
913 + ITERATE_RDEV_ORDERED(mddev,rdev,i) {
914 + pv_t *pv = vg->pv_array + i;
916 + pv->dev = rdev->dev;
917 + fsync_dev (pv->dev);
918 + set_blocksize (pv->dev, HSM_BLOCKSIZE);
931 + mddev->private = NULL;
938 +static int hsm_stop (mddev_t *mddev)
944 + vg = mddev_to_vg(mddev);
946 + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
947 + lv = vg->lv_array + i;
951 + free_lv_indextree(lv);
954 + for (i = 0; i < vg->nr_pv; i++)
955 + free_pv(vg->pv_array + i);
965 +static int hsm_status (char *page, mddev_t *mddev)
971 + vg = mddev_to_vg(mddev);
973 + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
974 + lv = vg->lv_array + i;
977 + sz += sprintf(page+sz, "<LV%d %d/%d blocks used> ", lv->log_id,
978 + lv->max_indices - lv->free_indices, lv->max_indices);
984 +static mdk_personality_t hsm_personality=
1003 +md__initfunc(void hsm_init (void))
1005 + register_md_personality (HSM, &hsm_personality);
1010 +int init_module (void)
1012 + return (register_md_personality (HSM, &hsm_personality));
1015 +void cleanup_module (void)
1017 + unregister_md_personality (HSM);
1023 + * This Linus-trick catches bugs via the linker.
1026 +extern void __BUG__in__hsm_dot_c_1(void);
1027 +extern void __BUG__in__hsm_dot_c_2(void);
1028 +extern void __BUG__in__hsm_dot_c_3(void);
1029 +extern void __BUG__in__hsm_dot_c_4(void);
1030 +extern void __BUG__in__hsm_dot_c_5(void);
1031 +extern void __BUG__in__hsm_dot_c_6(void);
1032 +extern void __BUG__in__hsm_dot_c_7(void);
1034 +void bugcatcher (void)
1036 + if (sizeof(pv_block_group_t) != HSM_BLOCKSIZE)
1037 + __BUG__in__hsm_dot_c_1();
1038 + if (sizeof(lv_index_block_t) != HSM_BLOCKSIZE)
1039 + __BUG__in__hsm_dot_c_2();
1041 + if (sizeof(pv_sb_t) != HSM_BLOCKSIZE)
1042 + __BUG__in__hsm_dot_c_4();
1043 + if (sizeof(lv_sb_t) != HSM_BLOCKSIZE)
1044 + __BUG__in__hsm_dot_c_3();
1045 + if (sizeof(vg_sb_t) != HSM_BLOCKSIZE)
1046 + __BUG__in__hsm_dot_c_6();
1048 + if (sizeof(lv_lptr_t) != 16)
1049 + __BUG__in__hsm_dot_c_5();
1050 + if (sizeof(pv_pptr_t) != 16)
1051 + __BUG__in__hsm_dot_c_6();
1054 diff -ruN linux.orig/drivers/block/linear.c linux-2.2.16/drivers/block/linear.c
1055 --- linux.orig/drivers/block/linear.c Sat Nov 8 20:39:12 1997
1056 +++ linux-2.2.16/drivers/block/linear.c Fri Jun 9 11:37:45 2000
1060 linear.c : Multiple Devices driver for Linux
1061 Copyright (C) 1994-96 Marc ZYNGIER
1062 @@ -19,186 +18,207 @@
1064 #include <linux/module.h>
1066 -#include <linux/md.h>
1067 +#include <linux/raid/md.h>
1068 #include <linux/malloc.h>
1069 -#include <linux/init.h>
1071 -#include "linear.h"
1072 +#include <linux/raid/linear.h>
1074 #define MAJOR_NR MD_MAJOR
1076 #define MD_PERSONALITY
1078 -static int linear_run (int minor, struct md_dev *mddev)
1079 +static int linear_run (mddev_t *mddev)
1081 - int cur=0, i, size, dev0_size, nb_zone;
1082 - struct linear_data *data;
1084 - MOD_INC_USE_COUNT;
1086 - mddev->private=kmalloc (sizeof (struct linear_data), GFP_KERNEL);
1087 - data=(struct linear_data *) mddev->private;
1090 - Find out the smallest device. This was previously done
1091 - at registry time, but since it violates modularity,
1092 - I moved it here... Any comment ? ;-)
1095 - data->smallest=mddev->devices;
1096 - for (i=1; i<mddev->nb_dev; i++)
1097 - if (data->smallest->size > mddev->devices[i].size)
1098 - data->smallest=mddev->devices+i;
1100 - nb_zone=data->nr_zones=
1101 - md_size[minor]/data->smallest->size +
1102 - (md_size[minor]%data->smallest->size ? 1 : 0);
1104 - data->hash_table=kmalloc (sizeof (struct linear_hash)*nb_zone, GFP_KERNEL);
1106 - size=mddev->devices[cur].size;
1107 + linear_conf_t *conf;
1108 + struct linear_hash *table;
1110 + int size, i, j, nb_zone;
1111 + unsigned int curr_offset;
1113 + MOD_INC_USE_COUNT;
1115 + conf = kmalloc (sizeof (*conf), GFP_KERNEL);
1118 + mddev->private = conf;
1120 + if (md_check_ordering(mddev)) {
1121 + printk("linear: disks are not ordered, aborting!\n");
1125 + * Find the smallest device.
1128 + conf->smallest = NULL;
1130 + ITERATE_RDEV_ORDERED(mddev,rdev,j) {
1131 + dev_info_t *disk = conf->disks + j;
1133 + disk->dev = rdev->dev;
1134 + disk->size = rdev->size;
1135 + disk->offset = curr_offset;
1137 + curr_offset += disk->size;
1139 + if (!conf->smallest || (disk->size < conf->smallest->size))
1140 + conf->smallest = disk;
1143 + nb_zone = conf->nr_zones =
1144 + md_size[mdidx(mddev)] / conf->smallest->size +
1145 + ((md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
1147 + conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
1149 + if (!conf->hash_table)
1153 + * Here we generate the linear hash table
1155 + table = conf->hash_table;
1158 + for (j = 0; j < mddev->nb_dev; j++) {
1159 + dev_info_t *disk = conf->disks + j;
1162 + table->dev1 = disk;
1165 + size += disk->size;
1168 + table->dev0 = disk;
1169 + size -= conf->smallest->size;
1172 + table->dev1 = NULL;
1176 + table->dev1 = NULL;
1183 + MOD_DEC_USE_COUNT;
1187 +static int linear_stop (mddev_t *mddev)
1189 + linear_conf_t *conf = mddev_to_conf(mddev);
1191 + kfree(conf->hash_table);
1195 - while (cur<mddev->nb_dev)
1197 - data->hash_table[i].dev0=mddev->devices+cur;
1198 + MOD_DEC_USE_COUNT;
1200 - if (size>=data->smallest->size) /* If we completely fill the slot */
1202 - data->hash_table[i++].dev1=NULL;
1203 - size-=data->smallest->size;
1207 - if (++cur==mddev->nb_dev) continue;
1208 - size=mddev->devices[cur].size;
1214 - if (++cur==mddev->nb_dev) /* Last dev, set dev1 as NULL */
1216 - data->hash_table[i].dev1=NULL;
1220 - dev0_size=size; /* Here, we use a 2nd dev to fill the slot */
1221 - size=mddev->devices[cur].size;
1222 - data->hash_table[i++].dev1=mddev->devices+cur;
1223 - size-=(data->smallest->size - dev0_size);
1229 -static int linear_stop (int minor, struct md_dev *mddev)
1231 - struct linear_data *data=(struct linear_data *) mddev->private;
1233 - kfree (data->hash_table);
1236 - MOD_DEC_USE_COUNT;
1243 -static int linear_map (struct md_dev *mddev, kdev_t *rdev,
1244 +static int linear_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
1245 unsigned long *rsector, unsigned long size)
1247 - struct linear_data *data=(struct linear_data *) mddev->private;
1248 - struct linear_hash *hash;
1249 - struct real_dev *tmp_dev;
1252 - block=*rsector >> 1;
1253 - hash=data->hash_table+(block/data->smallest->size);
1255 - if (block >= (hash->dev0->size + hash->dev0->offset))
1259 - printk ("linear_map : hash->dev1==NULL for block %ld\n", block);
1263 - tmp_dev=hash->dev1;
1266 - tmp_dev=hash->dev0;
1267 + linear_conf_t *conf = mddev_to_conf(mddev);
1268 + struct linear_hash *hash;
1269 + dev_info_t *tmp_dev;
1272 + block = *rsector >> 1;
1273 + hash = conf->hash_table + (block / conf->smallest->size);
1275 + if (block >= (hash->dev0->size + hash->dev0->offset))
1279 + printk ("linear_map : hash->dev1==NULL for block %ld\n",
1283 + tmp_dev = hash->dev1;
1285 + tmp_dev = hash->dev0;
1287 - if (block >= (tmp_dev->size + tmp_dev->offset) || block < tmp_dev->offset)
1288 - printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
1289 - block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
1290 + if (block >= (tmp_dev->size + tmp_dev->offset)
1291 + || block < tmp_dev->offset)
1292 + printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
1293 + block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
1295 - *rdev=tmp_dev->dev;
1296 - *rsector=(block-(tmp_dev->offset)) << 1;
1297 + *rdev = tmp_dev->dev;
1298 + *rsector = (block - tmp_dev->offset) << 1;
1304 -static int linear_status (char *page, int minor, struct md_dev *mddev)
1305 +static int linear_status (char *page, mddev_t *mddev)
1313 - struct linear_data *data=(struct linear_data *) mddev->private;
1315 + linear_conf_t *conf = mddev_to_conf(mddev);
1317 - sz+=sprintf (page+sz, " ");
1318 - for (j=0; j<data->nr_zones; j++)
1320 - sz+=sprintf (page+sz, "[%s",
1321 - partition_name (data->hash_table[j].dev0->dev));
1323 - if (data->hash_table[j].dev1)
1324 - sz+=sprintf (page+sz, "/%s] ",
1325 - partition_name(data->hash_table[j].dev1->dev));
1327 - sz+=sprintf (page+sz, "] ");
1330 - sz+=sprintf (page+sz, "\n");
1331 + sz += sprintf(page+sz, " ");
1332 + for (j = 0; j < conf->nr_zones; j++)
1334 + sz += sprintf(page+sz, "[%s",
1335 + partition_name(conf->hash_table[j].dev0->dev));
1337 + if (conf->hash_table[j].dev1)
1338 + sz += sprintf(page+sz, "/%s] ",
1339 + partition_name(conf->hash_table[j].dev1->dev));
1341 + sz += sprintf(page+sz, "] ");
1343 + sz += sprintf(page+sz, "\n");
1345 - sz+=sprintf (page+sz, " %dk rounding", 1<<FACTOR_SHIFT(FACTOR(mddev)));
1347 + sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
1352 -static struct md_personality linear_personality=
1353 +static mdk_personality_t linear_personality=
1362 - NULL, /* no ioctls */
1382 -__initfunc(void linear_init (void))
1383 +md__initfunc(void linear_init (void))
1385 - register_md_personality (LINEAR, &linear_personality);
1386 + register_md_personality (LINEAR, &linear_personality);
1391 int init_module (void)
1393 - return (register_md_personality (LINEAR, &linear_personality));
1394 + return (register_md_personality (LINEAR, &linear_personality));
1397 void cleanup_module (void)
1399 - unregister_md_personality (LINEAR);
1400 + unregister_md_personality (LINEAR);
1405 diff -ruN linux.orig/drivers/block/linear.h linux-2.2.16/drivers/block/linear.h
1406 --- linux.orig/drivers/block/linear.h Fri Nov 22 15:07:23 1996
1407 +++ linux-2.2.16/drivers/block/linear.h Thu Jan 1 01:00:00 1970
1414 - struct real_dev *dev0, *dev1;
1419 - struct linear_hash *hash_table; /* Dynamically allocated */
1420 - struct real_dev *smallest;
1425 diff -ruN linux.orig/drivers/block/ll_rw_blk.c linux-2.2.16/drivers/block/ll_rw_blk.c
1426 --- linux.orig/drivers/block/ll_rw_blk.c Wed Jun 7 23:26:42 2000
1427 +++ linux-2.2.16/drivers/block/ll_rw_blk.c Fri Jun 9 11:37:45 2000
1430 #include <asm/uaccess.h>
1431 #include <linux/blk.h>
1432 +#include <linux/raid/md.h>
1434 #include <linux/module.h>
1437 spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
1440 + * per-major idle-IO detection
1442 +unsigned long io_events[MAX_BLKDEV] = {0, };
1445 * used to wait on when there are no free requests
1447 struct wait_queue * wait_for_request;
1450 /* Maybe the above fixes it, and maybe it doesn't boot. Life is interesting */
1452 + if (!buffer_lowprio(bh))
1453 + io_events[major]++;
1455 if (blk_size[major]) {
1456 unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
1458 bh[i]->b_rsector=bh[i]->b_blocknr*(bh[i]->b_size >> 9);
1459 #ifdef CONFIG_BLK_DEV_MD
1460 if (major==MD_MAJOR &&
1461 - md_map (MINOR(bh[i]->b_dev), &bh[i]->b_rdev,
1462 + md_map (bh[i]->b_dev, &bh[i]->b_rdev,
1463 &bh[i]->b_rsector, bh[i]->b_size >> 9)) {
1465 "Bad md_map in ll_rw_block\n");
1467 set_bit(BH_Req, &bh[i]->b_state);
1468 #ifdef CONFIG_BLK_DEV_MD
1469 if (MAJOR(bh[i]->b_dev) == MD_MAJOR) {
1470 - md_make_request(MINOR (bh[i]->b_dev), rw, bh[i]);
1471 + md_make_request(bh[i], rw);
1475 diff -ruN linux.orig/drivers/block/md.c linux-2.2.16/drivers/block/md.c
1476 --- linux.orig/drivers/block/md.c Wed Jun 7 23:26:42 2000
1477 +++ linux-2.2.16/drivers/block/md.c Fri Jun 9 11:43:43 2000
1481 md.c : Multiple Devices driver for Linux
1482 - Copyright (C) 1994-96 Marc ZYNGIER
1483 - <zyngier@ufr-info-p7.ibp.fr> or
1484 - <maz@gloups.fdn.fr>
1485 + Copyright (C) 1998, 1999 Ingo Molnar
1487 - A lot of inspiration came from hd.c ...
1488 + completely rewritten, based on the MD driver code from Marc Zyngier
1490 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
1491 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
1494 - RAID-1/RAID-5 extensions by:
1495 - Ingo Molnar, Miguel de Icaza, Gadi Oxman
1496 + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
1497 + - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
1498 + - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
1499 + - kmod support by: Cyrus Durgin
1500 + - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
1502 - Changes for kmod by:
1505 This program is free software; you can redistribute it and/or modify
1506 it under the terms of the GNU General Public License as published by
1507 the Free Software Foundation; either version 2, or (at your option)
1508 @@ -26,809 +22,3007 @@
1509 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1513 - * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
1514 - * the extra system load does not show up that much. Increase it if your
1515 - * system can take more.
1517 -#define SPEED_LIMIT 1024
1518 +#include <linux/raid/md.h>
1519 +#include <linux/raid/xor.h>
1521 -#include <linux/config.h>
1522 -#include <linux/module.h>
1523 -#include <linux/version.h>
1524 -#include <linux/malloc.h>
1525 -#include <linux/mm.h>
1526 -#include <linux/md.h>
1527 -#include <linux/hdreg.h>
1528 -#include <linux/stat.h>
1529 -#include <linux/fs.h>
1530 -#include <linux/proc_fs.h>
1531 -#include <linux/blkdev.h>
1532 -#include <linux/genhd.h>
1533 -#include <linux/smp_lock.h>
1535 #include <linux/kmod.h>
1537 -#include <linux/errno.h>
1538 -#include <linux/init.h>
1540 #define __KERNEL_SYSCALLS__
1541 #include <linux/unistd.h>
1543 +#include <asm/unaligned.h>
1545 +extern asmlinkage int sys_sched_yield(void);
1546 +extern asmlinkage int sys_setsid(void);
1548 +extern unsigned long io_events[MAX_BLKDEV];
1550 #define MAJOR_NR MD_MAJOR
1553 #include <linux/blk.h>
1554 -#include <asm/uaccess.h>
1555 -#include <asm/bitops.h>
1556 -#include <asm/atomic.h>
1558 #ifdef CONFIG_MD_BOOT
1559 -extern kdev_t name_to_kdev_t(char *line) __init;
1560 +extern kdev_t name_to_kdev_t(char *line) md__init;
1563 -static struct hd_struct md_hd_struct[MAX_MD_DEV];
1564 -static int md_blocksizes[MAX_MD_DEV];
1565 -int md_maxreadahead[MAX_MD_DEV];
1566 -#if SUPPORT_RECONSTRUCTION
1567 -static struct md_thread *md_sync_thread = NULL;
1568 -#endif /* SUPPORT_RECONSTRUCTION */
1569 +static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, };
1572 + * these have to be allocated separately because external
1573 + * subsystems want to have a pre-defined structure
1575 +struct hd_struct md_hd_struct[MAX_MD_DEVS];
1576 +static int md_blocksizes[MAX_MD_DEVS];
1577 +static int md_maxreadahead[MAX_MD_DEVS];
1578 +static mdk_thread_t *md_recovery_thread = NULL;
1580 -int md_size[MAX_MD_DEV]={0, };
1581 +int md_size[MAX_MD_DEVS] = {0, };
1583 static void md_geninit (struct gendisk *);
1585 static struct gendisk md_gendisk=
1611 -static struct md_personality *pers[MAX_PERSONALITY]={NULL, };
1612 -struct md_dev md_dev[MAX_MD_DEV];
1614 -int md_thread(void * arg);
1616 + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
1617 + * is 100 KB/sec, so the extra system load does not show up that much.
1618 + * Increase it if you want to have more _guaranteed_ speed. Note that
1619 + * the RAID driver will use the maximum available bandwidth if the IO
1620 + * subsystem is idle.
1622 + * you can change it via /proc/sys/dev/speed-limit
1625 -static struct gendisk *find_gendisk (kdev_t dev)
1627 - struct gendisk *tmp=gendisk_head;
1628 +static int sysctl_speed_limit = 100;
1630 - while (tmp != NULL)
1632 - if (tmp->major==MAJOR(dev))
1637 +static struct ctl_table_header *md_table_header;
1641 +static ctl_table md_table[] = {
1642 + {DEV_MD_SPEED_LIMIT, "speed-limit",
1643 + &sysctl_speed_limit, sizeof(int), 0644, NULL, &proc_dointvec},
1647 -char *partition_name (kdev_t dev)
1649 - static char name[40]; /* This should be long
1650 - enough for a device name ! */
1651 - struct gendisk *hd = find_gendisk (dev);
1652 +static ctl_table md_dir_table[] = {
1653 + {DEV_MD, "md", NULL, 0, 0555, md_table},
1659 - sprintf (name, "[dev %s]", kdevname(dev));
1662 +static ctl_table md_root_table[] = {
1663 + {CTL_DEV, "dev", NULL, 0, 0555, md_dir_table},
1667 - return disk_name (hd, MINOR(dev), name); /* routine in genhd.c */
1668 +static void md_register_sysctl(void)
1670 + md_table_header = register_sysctl_table(md_root_table, 1);
1673 -static int legacy_raid_sb (int minor, int pnum)
1674 +void md_unregister_sysctl(void)
1677 + unregister_sysctl_table(md_table_header);
1681 + * The mapping between kdev and mddev is not necessarily a simple
1682 + * one! Eg. HSM uses several sub-devices to implement Logical
1683 + * Volumes. All these sub-devices map to the same mddev.
1685 +dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, };
1687 - factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
1688 +void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
1690 + unsigned int minor = MINOR(dev);
1693 - * do size and offset calculations.
1695 - for (i=0; i<md_dev[minor].nb_dev; i++) {
1696 - md_dev[minor].devices[i].size &= ~(factor - 1);
1697 - md_size[minor] += md_dev[minor].devices[i].size;
1698 - md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset +
1699 - md_dev[minor].devices[i-1].size) : 0;
1700 + if (MAJOR(dev) != MD_MAJOR) {
1704 - if (pnum == RAID0 >> PERSONALITY_SHIFT)
1705 - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev;
1707 + if (mddev_map[minor].mddev != NULL) {
1711 + mddev_map[minor].mddev = mddev;
1712 + mddev_map[minor].data = data;
1715 -static void free_sb (struct md_dev *mddev)
1716 +void del_mddev_mapping (mddev_t * mddev, kdev_t dev)
1719 - struct real_dev *realdev;
1720 + unsigned int minor = MINOR(dev);
1723 - free_page((unsigned long) mddev->sb);
1725 + if (MAJOR(dev) != MD_MAJOR) {
1729 - for (i = 0; i <mddev->nb_dev; i++) {
1730 - realdev = mddev->devices + i;
1731 - if (realdev->sb) {
1732 - free_page((unsigned long) realdev->sb);
1733 - realdev->sb = NULL;
1735 + if (mddev_map[minor].mddev != mddev) {
1739 + mddev_map[minor].mddev = NULL;
1740 + mddev_map[minor].data = NULL;
1744 - * Check one RAID superblock for generic plausibility
1745 + * Enables to iterate over all existing md arrays
1747 +static MD_LIST_HEAD(all_mddevs);
1749 -#define BAD_MAGIC KERN_ERR \
1750 -"md: %s: invalid raid superblock magic (%x) on block %u\n"
1751 +static mddev_t * alloc_mddev (kdev_t dev)
1755 -#define OUT_OF_MEM KERN_ALERT \
1756 -"md: out of memory.\n"
1757 + if (MAJOR(dev) != MD_MAJOR) {
1761 + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
1765 + memset(mddev, 0, sizeof(*mddev));
1767 -#define NO_DEVICE KERN_ERR \
1768 -"md: disabled device %s\n"
1769 + mddev->__minor = MINOR(dev);
1770 + mddev->reconfig_sem = MUTEX;
1771 + mddev->recovery_sem = MUTEX;
1772 + mddev->resync_sem = MUTEX;
1773 + MD_INIT_LIST_HEAD(&mddev->disks);
1775 + * The 'base' mddev is the one with data NULL.
1776 + * personalities can create additional mddevs
1779 + add_mddev_mapping(mddev, dev, 0);
1780 + md_list_add(&mddev->all_mddevs, &all_mddevs);
1787 -static int analyze_one_sb (struct real_dev * rdev)
1788 +static void free_mddev (mddev_t *mddev)
1790 - int ret = FAILURE;
1791 - struct buffer_head *bh;
1792 - kdev_t dev = rdev->dev;
1793 - md_superblock_t *sb;
1800 - * Read the superblock, it's at the end of the disk
1801 + * Make sure nobody else is using this mddev
1802 + * (careful, we rely on the global kernel lock here)
1804 - rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]);
1805 - set_blocksize (dev, MD_SB_BYTES);
1806 - bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
1809 - sb = (md_superblock_t *) bh->b_data;
1810 - if (sb->md_magic != MD_SB_MAGIC) {
1811 - printk (BAD_MAGIC, kdevname(dev),
1812 - sb->md_magic, rdev->sb_offset);
1815 - rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
1817 - printk (OUT_OF_MEM);
1820 - memcpy (rdev->sb, bh->b_data, MD_SB_BYTES);
1821 + while (md_atomic_read(&mddev->resync_sem.count) != 1)
1823 + while (md_atomic_read(&mddev->recovery_sem.count) != 1)
1826 - rdev->size = sb->size;
1828 - printk (NO_DEVICE,kdevname(rdev->dev));
1834 + del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
1835 + md_list_del(&mddev->all_mddevs);
1836 + MD_INIT_LIST_HEAD(&mddev->all_mddevs);
1848 - * Check a full RAID array for plausibility
1850 +struct gendisk * find_gendisk (kdev_t dev)
1852 + struct gendisk *tmp = gendisk_head;
1854 -#define INCONSISTENT KERN_ERR \
1855 -"md: superblock inconsistency -- run ckraid\n"
1856 + while (tmp != NULL) {
1857 + if (tmp->major == MAJOR(dev))
1864 -#define OUT_OF_DATE KERN_ERR \
1865 -"md: superblock update time inconsistenty -- using the most recent one\n"
1866 +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
1868 + mdk_rdev_t * rdev;
1869 + struct md_list_head *tmp;
1871 -#define OLD_VERSION KERN_ALERT \
1872 -"md: %s: unsupported raid array version %d.%d.%d\n"
1873 + ITERATE_RDEV(mddev,rdev,tmp) {
1874 + if (rdev->desc_nr == nr)
1880 -#define NOT_CLEAN KERN_ERR \
1881 -"md: %s: raid array is not clean -- run ckraid\n"
1882 +mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
1884 + struct md_list_head *tmp;
1887 -#define NOT_CLEAN_IGNORE KERN_ERR \
1888 -"md: %s: raid array is not clean -- reconstructing parity\n"
1889 + ITERATE_RDEV(mddev,rdev,tmp) {
1890 + if (rdev->dev == dev)
1896 -#define UNKNOWN_LEVEL KERN_ERR \
1897 -"md: %s: unsupported raid level %d\n"
1898 +static MD_LIST_HEAD(device_names);
1900 -static int analyze_sbs (int minor, int pnum)
1901 +char * partition_name (kdev_t dev)
1903 - struct md_dev *mddev = md_dev + minor;
1904 - int i, N = mddev->nb_dev, out_of_date = 0;
1905 - struct real_dev * disks = mddev->devices;
1906 - md_superblock_t *sb, *freshest = NULL;
1907 + struct gendisk *hd;
1908 + static char nomem [] = "<nomem>";
1909 + dev_name_t *dname;
1910 + struct md_list_head *tmp = device_names.next;
1913 - * RAID-0 and linear don't use a RAID superblock
1915 - if (pnum == RAID0 >> PERSONALITY_SHIFT ||
1916 - pnum == LINEAR >> PERSONALITY_SHIFT)
1917 - return legacy_raid_sb (minor, pnum);
1918 + while (tmp != &device_names) {
1919 + dname = md_list_entry(tmp, dev_name_t, list);
1920 + if (dname->dev == dev)
1921 + return dname->name;
1925 + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
1930 - * Verify the RAID superblock on each real device
1931 + * ok, add this new device name to the list
1933 - for (i = 0; i < N; i++)
1934 - if (analyze_one_sb(disks+i))
1936 + hd = find_gendisk (dev);
1939 + sprintf (dname->name, "[dev %s]", kdevname(dev));
1941 + disk_name (hd, MINOR(dev), dname->name);
1944 + md_list_add(&dname->list, &device_names);
1946 + return dname->name;
1949 +static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
1952 + unsigned int size = 0;
1954 + if (blk_size[MAJOR(dev)])
1955 + size = blk_size[MAJOR(dev)][MINOR(dev)];
1957 + size = MD_NEW_SIZE_BLOCKS(size);
1961 +static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
1963 + unsigned int size;
1965 + size = calc_dev_sboffset(dev, mddev, persistent);
1970 + if (mddev->sb->chunk_size)
1971 + size &= ~(mddev->sb->chunk_size/1024 - 1);
1976 + * We check whether all devices are numbered from 0 to nb_dev-1. The
1977 + * order is guaranteed even after device name changes.
1979 + * Some personalities (raid0, linear) use this. Personalities that
1980 + * provide data have to be able to deal with loss of individual
1981 + * disks, so they do their checking themselves.
1983 +int md_check_ordering (mddev_t *mddev)
1987 + struct md_list_head *tmp;
1990 - * The superblock constant part has to be the same
1991 - * for all disks in the array.
1992 + * First, all devices must be fully functional
1995 - for (i = 0; i < N; i++) {
2003 - disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) {
2004 - printk (INCONSISTENT);
2005 + ITERATE_RDEV(mddev,rdev,tmp) {
2006 + if (rdev->faulty) {
2007 + printk("md: md%d's device %s faulty, aborting.\n",
2008 + mdidx(mddev), partition_name(rdev->dev));
2014 - * OK, we have all disks and the array is ready to run. Let's
2015 - * find the freshest superblock, that one will be the superblock
2016 - * that represents the whole array.
2018 - if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL)
2020 + ITERATE_RDEV(mddev,rdev,tmp) {
2023 + if (c != mddev->nb_dev) {
2027 - for (i = 0; i < N; i++) {
2031 - freshest = disks[i].sb;
2035 - * Find the newest superblock version
2037 - if (disks[i].sb->utime != freshest->utime) {
2039 - if (disks[i].sb->utime > freshest->utime)
2040 - freshest = disks[i].sb;
2044 - printk(OUT_OF_DATE);
2045 - memcpy (sb, freshest, sizeof(*freshest));
2048 - * Check if we can support this RAID array
2050 - if (sb->major_version != MD_MAJOR_VERSION ||
2051 - sb->minor_version > MD_MINOR_VERSION) {
2053 - printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)),
2054 - sb->major_version, sb->minor_version,
2055 - sb->patch_version);
2056 + if (mddev->nb_dev != mddev->sb->raid_disks) {
2057 + printk("md: md%d, array needs %d disks, has %d, aborting.\n",
2058 + mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
2063 - * We need to add this as a superblock option.
2064 + * Now the numbering check
2066 -#if SUPPORT_RECONSTRUCTION
2067 - if (sb->state != (1 << MD_SB_CLEAN)) {
2068 - if (sb->level == 1) {
2069 - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
2070 + for (i = 0; i < mddev->nb_dev; i++) {
2072 + ITERATE_RDEV(mddev,rdev,tmp) {
2073 + if (rdev->desc_nr == i)
2077 + printk("md: md%d, missing disk #%d, aborting.\n",
2081 - printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor)));
2084 - if (sb->state != (1 << MD_SB_CLEAN)) {
2085 - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
2088 -#endif /* SUPPORT_RECONSTRUCTION */
2090 - switch (sb->level) {
2092 - md_size[minor] = sb->size;
2093 - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD;
2097 - md_size[minor] = sb->size * (sb->raid_disks - 1);
2098 - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1);
2101 - printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)),
2105 + printk("md: md%d, too many disks #%d, aborting.\n",
2116 -#undef INCONSISTENT
2122 -int md_update_sb(int minor)
2123 +static unsigned int zoned_raid_size (mddev_t *mddev)
2125 - struct md_dev *mddev = md_dev + minor;
2126 - struct buffer_head *bh;
2127 - md_superblock_t *sb = mddev->sb;
2128 - struct real_dev *realdev;
2132 + unsigned int mask;
2133 + mdk_rdev_t * rdev;
2134 + struct md_list_head *tmp;
2136 - sb->utime = CURRENT_TIME;
2137 - for (i = 0; i < mddev->nb_dev; i++) {
2138 - realdev = mddev->devices + i;
2141 - dev = realdev->dev;
2142 - sb_offset = realdev->sb_offset;
2143 - set_blocksize(dev, MD_SB_BYTES);
2144 - printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset);
2145 - bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
2147 - sb = (md_superblock_t *) bh->b_data;
2148 - memcpy(sb, mddev->sb, MD_SB_BYTES);
2149 - memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4);
2150 - mark_buffer_uptodate(bh, 1);
2151 - mark_buffer_dirty(bh, 1);
2152 - ll_rw_block(WRITE, 1, &bh);
2153 - wait_on_buffer(bh);
2156 - invalidate_buffers(dev);
2158 - printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
2164 + * do size and offset calculations.
2166 + mask = ~(mddev->sb->chunk_size/1024 - 1);
2167 +printk("mask %08x\n", mask);
2169 + ITERATE_RDEV(mddev,rdev,tmp) {
2170 +printk(" rdev->size: %d\n", rdev->size);
2171 + rdev->size &= mask;
2172 +printk(" masked rdev->size: %d\n", rdev->size);
2173 + md_size[mdidx(mddev)] += rdev->size;
2174 +printk(" new md_size: %d\n", md_size[mdidx(mddev)]);
2179 -static int do_md_run (int minor, int repart)
2180 +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
2182 - int pnum, i, min, factor, err;
2183 + if (disk_active(disk)) {
2184 + sb->working_disks--;
2186 + if (disk_spare(disk)) {
2187 + sb->spare_disks--;
2188 + sb->working_disks--;
2190 + sb->failed_disks--;
2196 + mark_disk_removed(disk);
2199 - if (!md_dev[minor].nb_dev)
2202 - if (md_dev[minor].pers)
2204 +#define BAD_MAGIC KERN_ERR \
2205 +"md: invalid raid superblock magic on %s\n"
2207 - md_dev[minor].repartition=repart;
2209 - if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT))
2210 - >= MAX_PERSONALITY)
2213 - /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
2214 - if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){
2215 - for (i = 0; i < md_dev [minor].nb_dev; i++)
2216 - if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR)
2222 - char module_name[80];
2223 - sprintf (module_name, "md-personality-%d", pnum);
2224 - request_module (module_name);
2230 - factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
2232 - md_blocksizes[minor] <<= FACTOR_SHIFT(FACTOR((md_dev+minor)));
2233 +#define BAD_MINOR KERN_ERR \
2234 +"md: %s: invalid raid minor (%x)\n"
2236 - for (i=0; i<md_dev[minor].nb_dev; i++)
2237 - if (md_dev[minor].devices[i].size<min)
2239 - printk ("Dev %s smaller than %dk, cannot shrink\n",
2240 - partition_name (md_dev[minor].devices[i].dev), min);
2244 - for (i=0; i<md_dev[minor].nb_dev; i++) {
2245 - fsync_dev(md_dev[minor].devices[i].dev);
2246 - invalidate_buffers(md_dev[minor].devices[i].dev);
2249 - /* Resize devices according to the factor. It is used to align
2250 - partitions size on a given chunk size. */
2254 - * Analyze the raid superblock
2256 - if (analyze_sbs(minor, pnum))
2258 +#define OUT_OF_MEM KERN_ALERT \
2259 +"md: out of memory.\n"
2261 - md_dev[minor].pers=pers[pnum];
2263 - if ((err=md_dev[minor].pers->run (minor, md_dev+minor)))
2265 - md_dev[minor].pers=NULL;
2266 - free_sb(md_dev + minor);
2270 - if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT)
2272 - md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN);
2273 - md_update_sb(minor);
2276 - /* FIXME : We assume here we have blocks
2277 - that are twice as large as sectors.
2278 - THIS MAY NOT BE TRUE !!! */
2279 - md_hd_struct[minor].start_sect=0;
2280 - md_hd_struct[minor].nr_sects=md_size[minor]<<1;
2282 - read_ahead[MD_MAJOR] = 128;
2285 +#define NO_SB KERN_ERR \
2286 +"md: disabled device %s, could not read superblock.\n"
2288 +#define BAD_CSUM KERN_WARNING \
2289 +"md: invalid superblock checksum on %s\n"
2291 -static int do_md_stop (int minor, struct inode *inode)
2292 +static int alloc_array_sb (mddev_t * mddev)
2296 - if (inode->i_count>1 || md_dev[minor].busy>1) {
2298 - * ioctl : one open channel
2300 - printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
2301 - minor, inode->i_count, md_dev[minor].busy);
2305 - if (md_dev[minor].pers) {
2307 - * It is safe to call stop here, it only frees private
2308 - * data. Also, it tells us if a device is unstoppable
2309 - * (eg. resyncing is in progress)
2311 - if (md_dev[minor].pers->stop (minor, md_dev+minor))
2314 - * The device won't exist anymore -> flush it now
2316 - fsync_dev (inode->i_rdev);
2317 - invalidate_buffers (inode->i_rdev);
2318 - if (md_dev[minor].sb) {
2319 - md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
2320 - md_update_sb(minor);
2327 - /* Remove locks. */
2328 - if (md_dev[minor].sb)
2329 - free_sb(md_dev + minor);
2330 - for (i=0; i<md_dev[minor].nb_dev; i++)
2331 - clear_inode (md_dev[minor].devices[i].inode);
2333 - md_dev[minor].nb_dev=md_size[minor]=0;
2334 - md_hd_struct[minor].nr_sects=0;
2335 - md_dev[minor].pers=NULL;
2337 - read_ahead[MD_MAJOR] = 128;
2341 + mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
2344 + md_clear_page((unsigned long)mddev->sb);
2348 -static int do_md_add (int minor, kdev_t dev)
2349 +static int alloc_disk_sb (mdk_rdev_t * rdev)
2353 - struct real_dev *realdev;
2357 - if (md_dev[minor].nb_dev==MAX_REAL)
2358 + rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
2360 + printk (OUT_OF_MEM);
2363 + md_clear_page((unsigned long)rdev->sb);
2365 - if (!fs_may_mount (dev))
2370 - if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) {
2371 - printk("md_add(): zero device size, huh, bailing out.\n");
2373 +static void free_disk_sb (mdk_rdev_t * rdev)
2376 + free_page((unsigned long) rdev->sb);
2378 + rdev->sb_offset = 0;
2381 + if (!rdev->faulty)
2386 - if (md_dev[minor].pers) {
2388 - * The array is already running, hot-add the drive, or
2391 - if (!md_dev[minor].pers->hot_add_disk)
2395 +static void mark_rdev_faulty (mdk_rdev_t * rdev)
2397 + unsigned long flags;
2403 + save_flags(flags);
2405 + free_disk_sb(rdev);
2407 + restore_flags(flags);
2410 +static int read_disk_sb (mdk_rdev_t * rdev)
2412 + int ret = -EINVAL;
2413 + struct buffer_head *bh = NULL;
2414 + kdev_t dev = rdev->dev;
2424 - * Careful. We cannot increase nb_dev for a running array.
2425 + * Calculate the position of the superblock,
2426 + * it's at the end of the disk
2428 - i=md_dev[minor].nb_dev;
2429 - realdev = &md_dev[minor].devices[i];
2432 - /* Lock the device by inserting a dummy inode. This doesn't
2433 - smell very good, but I need to be consistent with the
2434 - mount stuff, specially with fs_may_mount. If someone have
2435 - a better idea, please help ! */
2437 - realdev->inode=get_empty_inode ();
2438 - realdev->inode->i_dev=dev; /* don't care about other fields */
2439 - insert_inode_hash (realdev->inode);
2441 - /* Sizes are now rounded at run time */
2443 -/* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/
2445 - realdev->size=blk_size[MAJOR(dev)][MINOR(dev)];
2446 + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
2447 + rdev->sb_offset = sb_offset;
2448 + printk("(read) %s's sb offset: %d", partition_name(dev),
2451 + set_blocksize (dev, MD_SB_BYTES);
2452 + bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
2456 - * Check the superblock for consistency.
2457 - * The personality itself has to check whether it's getting
2458 - * added with the proper flags. The personality has to be
2461 + sb = (mdp_super_t *) bh->b_data;
2462 + memcpy (rdev->sb, sb, MD_SB_BYTES);
2464 + printk (NO_SB,partition_name(rdev->dev));
2467 + printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events));
2475 +static unsigned int calc_sb_csum (mdp_super_t * sb)
2477 + unsigned int disk_csum, csum;
2479 + disk_csum = sb->sb_csum;
2481 + csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
2482 + sb->sb_csum = disk_csum;
2487 + * Check one RAID superblock for generic plausibility
2490 +static int check_disk_sb (mdk_rdev_t * rdev)
2493 + int ret = -EINVAL;
2501 + if (sb->md_magic != MD_SB_MAGIC) {
2502 + printk (BAD_MAGIC, partition_name(rdev->dev));
2506 + if (sb->md_minor >= MAX_MD_DEVS) {
2507 + printk (BAD_MINOR, partition_name(rdev->dev),
2512 + if (calc_sb_csum(sb) != sb->sb_csum)
2513 + printk(BAD_CSUM, partition_name(rdev->dev));
2519 +static kdev_t dev_unit(kdev_t dev)
2521 + unsigned int mask;
2522 + struct gendisk *hd = find_gendisk(dev);
2526 + mask = ~((1 << hd->minor_shift) - 1);
2528 + return MKDEV(MAJOR(dev), MINOR(dev) & mask);
2531 +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
2533 + struct md_list_head *tmp;
2536 + ITERATE_RDEV(mddev,rdev,tmp)
2537 + if (dev_unit(rdev->dev) == dev_unit(dev))
2543 +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
2545 + struct md_list_head *tmp;
2548 + ITERATE_RDEV(mddev1,rdev,tmp)
2549 + if (match_dev_unit(mddev2, rdev->dev))
2555 +static MD_LIST_HEAD(all_raid_disks);
2556 +static MD_LIST_HEAD(pending_raid_disks);
2558 +static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
2560 + mdk_rdev_t *same_pdev;
2562 + if (rdev->mddev) {
2566 + same_pdev = match_dev_unit(mddev, rdev->dev);
2568 + printk( KERN_WARNING
2569 +"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
2570 +" protection against single-disk failure might be compromised.\n",
2571 + mdidx(mddev), partition_name(rdev->dev),
2572 + partition_name(same_pdev->dev));
2574 + md_list_add(&rdev->same_set, &mddev->disks);
2575 + rdev->mddev = mddev;
2577 + printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
2580 +static void unbind_rdev_from_array (mdk_rdev_t * rdev)
2582 + if (!rdev->mddev) {
2586 + md_list_del(&rdev->same_set);
2587 + MD_INIT_LIST_HEAD(&rdev->same_set);
2588 + rdev->mddev->nb_dev--;
2589 + printk("unbind<%s,%d>\n", partition_name(rdev->dev),
2590 + rdev->mddev->nb_dev);
2591 + rdev->mddev = NULL;
2595 + * prevent the device from being mounted, repartitioned or
2596 + * otherwise reused by a RAID array (or any other kernel
2597 + * subsystem), by opening the device. [simply getting an
2598 + * inode is not enough, the SCSI module usage code needs
2599 + * an explicit open() on the device]
2601 +static int lock_rdev (mdk_rdev_t *rdev)
2606 + * First insert a dummy inode.
2610 + rdev->inode = get_empty_inode();
2612 + * we don't care about any other fields
2614 + rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev;
2615 + insert_inode_hash(rdev->inode);
2617 + memset(&rdev->filp, 0, sizeof(rdev->filp));
2618 + rdev->filp.f_mode = 3; /* read write */
2619 + err = blkdev_open(rdev->inode, &rdev->filp);
2621 + printk("blkdev_open() failed: %d\n", err);
2622 + clear_inode(rdev->inode);
2623 + rdev->inode = NULL;
2628 +static void unlock_rdev (mdk_rdev_t *rdev)
2630 + blkdev_release(rdev->inode);
2633 + clear_inode(rdev->inode);
2634 + rdev->inode = NULL;
2637 +static void export_rdev (mdk_rdev_t * rdev)
2639 + printk("export_rdev(%s)\n",partition_name(rdev->dev));
2642 + unlock_rdev(rdev);
2643 + free_disk_sb(rdev);
2644 + md_list_del(&rdev->all);
2645 + MD_INIT_LIST_HEAD(&rdev->all);
2646 + if (rdev->pending.next != &rdev->pending) {
2647 + printk("(%s was pending)\n",partition_name(rdev->dev));
2648 + md_list_del(&rdev->pending);
2649 + MD_INIT_LIST_HEAD(&rdev->pending);
2656 +static void kick_rdev_from_array (mdk_rdev_t * rdev)
2658 + unbind_rdev_from_array(rdev);
2659 + export_rdev(rdev);
2662 +static void export_array (mddev_t *mddev)
2664 + struct md_list_head *tmp;
2666 + mdp_super_t *sb = mddev->sb;
2670 + free_page((unsigned long) sb);
2673 + ITERATE_RDEV(mddev,rdev,tmp) {
2674 + if (!rdev->mddev) {
2678 + kick_rdev_from_array(rdev);
2680 + if (mddev->nb_dev)
2689 +static void print_desc(mdp_disk_t *desc)
2691 + printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
2692 + partition_name(MKDEV(desc->major,desc->minor)),
2693 + desc->major,desc->minor,desc->raid_disk,desc->state);
2696 +static void print_sb(mdp_super_t *sb)
2700 + printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
2701 + sb->major_version, sb->minor_version, sb->patch_version,
2702 + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
2704 + printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
2705 + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
2706 + sb->layout, sb->chunk_size);
2707 + printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
2708 + sb->utime, sb->state, sb->active_disks, sb->working_disks,
2709 + sb->failed_disks, sb->spare_disks,
2710 + sb->sb_csum, (unsigned long)get_unaligned(&sb->events));
2712 + for (i = 0; i < MD_SB_DISKS; i++) {
2715 + desc = sb->disks + i;
2716 + printk(" D %2d: ", i);
2719 + printk(" THIS: ");
2720 + print_desc(&sb->this_disk);
2724 +static void print_rdev(mdk_rdev_t *rdev)
2726 + printk(" rdev %s: O:%s, SZ:%08d F:%d DN:%d ",
2727 + partition_name(rdev->dev), partition_name(rdev->old_dev),
2728 + rdev->size, rdev->faulty, rdev->desc_nr);
2730 + printk("rdev superblock:\n");
2731 + print_sb(rdev->sb);
2733 + printk("no rdev superblock!\n");
2736 +void md_print_devices (void)
2738 + struct md_list_head *tmp, *tmp2;
2743 + printk(" **********************************\n");
2744 + printk(" * <COMPLETE RAID STATE PRINTOUT> *\n");
2745 + printk(" **********************************\n");
2746 + ITERATE_MDDEV(mddev,tmp) {
2747 + printk("md%d: ", mdidx(mddev));
2749 + ITERATE_RDEV(mddev,rdev,tmp2)
2750 + printk("<%s>", partition_name(rdev->dev));
2753 + printk(" array superblock:\n");
2754 + print_sb(mddev->sb);
2756 + printk(" no array superblock.\n");
2758 + ITERATE_RDEV(mddev,rdev,tmp2)
2761 + printk(" **********************************\n");
2765 +static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
2768 + mdp_super_t *tmp1, *tmp2;
2770 + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
2771 + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
2773 + if (!tmp1 || !tmp2) {
2782 + * nr_disks is not constant
2784 + tmp1->nr_disks = 0;
2785 + tmp2->nr_disks = 0;
2787 + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
2801 +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
2803 + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
2804 + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
2805 + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
2806 + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
2813 +static mdk_rdev_t * find_rdev_all (kdev_t dev)
2815 + struct md_list_head *tmp;
2818 + tmp = all_raid_disks.next;
2819 + while (tmp != &all_raid_disks) {
2820 + rdev = md_list_entry(tmp, mdk_rdev_t, all);
2821 + if (rdev->dev == dev)
2828 +#define GETBLK_FAILED KERN_ERR \
2829 +"md: getblk failed for device %s\n"
2831 +static int write_disk_sb(mdk_rdev_t * rdev)
2833 + struct buffer_head *bh;
2835 + u32 sb_offset, size;
2842 + if (rdev->faulty) {
2846 + if (rdev->sb->md_magic != MD_SB_MAGIC) {
2852 + sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
2853 + if (rdev->sb_offset != sb_offset) {
2854 + printk("%s's sb offset has changed from %d to %d, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
2858 + * If the disk went offline meanwhile and it's just a spare, then
2859 + * its size has changed to zero silently, and the MD code does
2860 + * not yet know that it's faulty.
2862 + size = calc_dev_size(dev, rdev->mddev, 1);
2863 + if (size != rdev->size) {
2864 + printk("%s's size has changed from %d to %d since import, skipping\n", partition_name(dev), rdev->size, size);
2868 + printk("(write) %s's sb offset: %d\n", partition_name(dev), sb_offset);
2870 + set_blocksize(dev, MD_SB_BYTES);
2871 + bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
2873 + printk(GETBLK_FAILED, partition_name(dev));
2876 + memset(bh->b_data,0,bh->b_size);
2877 + sb = (mdp_super_t *) bh->b_data;
2878 + memcpy(sb, rdev->sb, MD_SB_BYTES);
2880 + mark_buffer_uptodate(bh, 1);
2881 + mark_buffer_dirty(bh, 1);
2882 + ll_rw_block(WRITE, 1, &bh);
2883 + wait_on_buffer(bh);
2889 +#undef GETBLK_FAILED
2891 +static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2896 + for (i = 0; i < MD_SB_DISKS; i++) {
2897 + desc = mddev->sb->disks + i;
2899 + if (disk_faulty(desc)) {
2900 + if (MKDEV(desc->major,desc->minor) == rdev->dev)
2905 + if (MKDEV(desc->major,desc->minor) == rdev->dev) {
2906 + rdev->sb->this_disk = *desc;
2907 + rdev->desc_nr = desc->number;
2918 +static int sync_sbs(mddev_t * mddev)
2922 + struct md_list_head *tmp;
2924 + ITERATE_RDEV(mddev,rdev,tmp) {
2929 + set_this_disk(mddev, rdev);
2930 + sb->sb_csum = calc_sb_csum(sb);
2935 +int md_update_sb(mddev_t * mddev)
2937 + int first, err, count = 100;
2938 + struct md_list_head *tmp;
2943 + mddev->sb->utime = CURRENT_TIME;
2944 + ev = get_unaligned(&mddev->sb->events);
2946 + put_unaligned(ev,&mddev->sb->events);
2947 + if (ev == (__u64)0) {
2949 + * oops, this 64-bit counter should never wrap.
2950 + * Either we are in around ~1 trillion A.C., assuming
2951 + * 1 reboot per second, or we have a bug:
2955 + put_unaligned(ev,&mddev->sb->events);
2960 + * do not write anything to disk if using
2961 + * nonpersistent superblocks
2963 + if (mddev->sb->not_persistent)
2966 + printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
2971 + ITERATE_RDEV(mddev,rdev,tmp) {
2977 + printk("(skipping faulty ");
2978 + printk("%s ", partition_name(rdev->dev));
2979 + if (!rdev->faulty) {
2980 + printk("[events: %08lx]",
2981 + (unsigned long)get_unaligned(&rdev->sb->events));
2982 + err += write_disk_sb(rdev);
2988 + printk("errors occured during superblock update, repeating\n");
2991 + printk("excessive errors occured during superblock update, exiting\n");
2997 + * Import a device. If 'on_disk', then sanity check the superblock
2999 + * mark the device faulty if:
3001 + * - the device is nonexistent (zero size)
3002 + * - the device has no valid superblock
3004 + * a faulty rdev _never_ has rdev->sb set.
3006 +static int md_import_device (kdev_t newdev, int on_disk)
3010 + unsigned int size;
3012 + if (find_rdev_all(newdev))
3015 + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
3017 + printk("could not alloc mem for %s!\n", partition_name(newdev));
3020 + memset(rdev, 0, sizeof(*rdev));
3022 + if (!fs_may_mount(newdev)) {
3023 + printk("md: can not import %s, has active inodes!\n",
3024 + partition_name(newdev));
3029 + if ((err = alloc_disk_sb(rdev)))
3032 + rdev->dev = newdev;
3033 + if (lock_rdev(rdev)) {
3034 + printk("md: could not lock %s, zero-size? Marking faulty.\n",
3035 + partition_name(newdev));
3039 + rdev->desc_nr = -1;
3043 + if (blk_size[MAJOR(newdev)])
3044 + size = blk_size[MAJOR(newdev)][MINOR(newdev)];
3046 + printk("md: %s has zero size, marking faulty!\n",
3047 + partition_name(newdev));
3053 + if ((err = read_disk_sb(rdev))) {
3054 + printk("md: could not read %s's sb, not importing!\n",
3055 + partition_name(newdev));
3058 + if ((err = check_disk_sb(rdev))) {
3059 + printk("md: %s has invalid sb, not importing!\n",
3060 + partition_name(newdev));
3064 + rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
3065 + rdev->sb->this_disk.minor);
3066 + rdev->desc_nr = rdev->sb->this_disk.number;
3068 + md_list_add(&rdev->all, &all_raid_disks);
3069 + MD_INIT_LIST_HEAD(&rdev->pending);
3071 + if (rdev->faulty && rdev->sb)
3072 + free_disk_sb(rdev);
3078 + unlock_rdev(rdev);
3079 + free_disk_sb(rdev);
3086 + * Check a full RAID array for plausibility
3089 +#define INCONSISTENT KERN_ERR \
3090 +"md: fatal superblock inconsistency in %s -- removing from array\n"
3092 +#define OUT_OF_DATE KERN_ERR \
3093 +"md: superblock update time inconsistency -- using the most recent one\n"
3095 +#define OLD_VERSION KERN_ALERT \
3096 +"md: md%d: unsupported raid array version %d.%d.%d\n"
3098 +#define NOT_CLEAN_IGNORE KERN_ERR \
3099 +"md: md%d: raid array is not clean -- starting background reconstruction\n"
3101 +#define UNKNOWN_LEVEL KERN_ERR \
3102 +"md: md%d: unsupported raid level %d\n"
3104 +static int analyze_sbs (mddev_t * mddev)
3106 + int out_of_date = 0, i;
3107 + struct md_list_head *tmp, *tmp2;
3108 + mdk_rdev_t *rdev, *rdev2, *freshest;
3112 + * Verify the RAID superblock on each real device
3114 + ITERATE_RDEV(mddev,rdev,tmp) {
3115 + if (rdev->faulty) {
3123 + if (check_disk_sb(rdev))
3128 + * The superblock constant part has to be the same
3129 + * for all disks in the array.
3133 + ITERATE_RDEV(mddev,rdev,tmp) {
3138 + if (!sb_equal(sb, rdev->sb)) {
3139 + printk (INCONSISTENT, partition_name(rdev->dev));
3140 + kick_rdev_from_array(rdev);
3146 + * OK, we have all disks and the array is ready to run. Let's
3147 + * find the freshest superblock, that one will be the superblock
3148 + * that represents the whole array.
3151 + if (alloc_array_sb(mddev))
3156 + ITERATE_RDEV(mddev,rdev,tmp) {
3159 + * if the checksum is invalid, use the superblock
3160 + * only as a last resort. (decrease its age by
3163 + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
3164 + __u64 ev = get_unaligned(&rdev->sb->events);
3165 + if (ev != (__u64)0) {
3167 + put_unaligned(ev,&rdev->sb->events);
3171 + printk("%s's event counter: %08lx\n", partition_name(rdev->dev),
3172 + (unsigned long)get_unaligned(&rdev->sb->events));
3178 + * Find the newest superblock version
3180 + ev1 = get_unaligned(&rdev->sb->events);
3181 + ev2 = get_unaligned(&freshest->sb->events);
3188 + if (out_of_date) {
3189 + printk(OUT_OF_DATE);
3190 + printk("freshest: %s\n", partition_name(freshest->dev));
3192 + memcpy (sb, freshest->sb, sizeof(*sb));
3195 + * at this point we have picked the 'best' superblock
3196 + * from all available superblocks.
3197 + * now we validate this superblock and kick out possibly
3200 + ITERATE_RDEV(mddev,rdev,tmp) {
3202 + * Kick all non-fresh devices faulty
3205 + ev1 = get_unaligned(&rdev->sb->events);
3206 + ev2 = get_unaligned(&sb->events);
3209 + printk("md: kicking non-fresh %s from array!\n",
3210 + partition_name(rdev->dev));
3211 + kick_rdev_from_array(rdev);
3217 + * Fix up changed device names ... but only if this disk has a
3218 + * recent update time. Use faulty checksum ones too.
3220 + ITERATE_RDEV(mddev,rdev,tmp) {
3221 + __u64 ev1, ev2, ev3;
3222 + if (rdev->faulty) { /* REMOVEME */
3226 + ev1 = get_unaligned(&rdev->sb->events);
3227 + ev2 = get_unaligned(&sb->events);
3230 + if ((rdev->dev != rdev->old_dev) &&
3231 + ((ev1 == ev2) || (ev1 == ev3))) {
3234 + printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev));
3235 + if (rdev->desc_nr == -1) {
3239 + desc = &sb->disks[rdev->desc_nr];
3240 + if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
3244 + desc->major = MAJOR(rdev->dev);
3245 + desc->minor = MINOR(rdev->dev);
3246 + desc = &rdev->sb->this_disk;
3247 + desc->major = MAJOR(rdev->dev);
3248 + desc->minor = MINOR(rdev->dev);
3253 + * Remove unavailable and faulty devices ...
3255 + * note that if an array becomes completely unrunnable due to
3256 + * missing devices, we do not write the superblock back, so the
3257 + * administrator has a chance to fix things up. The removal thus
3258 + * only happens if it's nonfatal to the contents of the array.
3260 + for (i = 0; i < MD_SB_DISKS; i++) {
3265 + desc = sb->disks + i;
3266 + dev = MKDEV(desc->major, desc->minor);
3269 + * We kick faulty devices/descriptors immediately.
3271 + if (disk_faulty(desc)) {
3273 + ITERATE_RDEV(mddev,rdev,tmp) {
3274 + if (rdev->desc_nr != desc->number)
3276 + printk("md%d: kicking faulty %s!\n",
3277 + mdidx(mddev),partition_name(rdev->dev));
3278 + kick_rdev_from_array(rdev);
3283 + if (dev == MKDEV(0,0))
3285 + printk("md%d: removing former faulty %s!\n",
3286 + mdidx(mddev), partition_name(dev));
3288 + remove_descriptor(desc, sb);
3292 + if (dev == MKDEV(0,0))
3295 + * Is this device present in the rdev ring?
3298 + ITERATE_RDEV(mddev,rdev,tmp) {
3299 + if (rdev->desc_nr == desc->number) {
3307 + printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev));
3308 + remove_descriptor(desc, sb);
3312 + * Double check whether all devices mentioned in the
3313 + * superblock are in the rdev ring.
3315 + for (i = 0; i < MD_SB_DISKS; i++) {
3319 + desc = sb->disks + i;
3320 + dev = MKDEV(desc->major, desc->minor);
3322 + if (dev == MKDEV(0,0))
3325 + if (disk_faulty(desc)) {
3330 + rdev = find_rdev(mddev, dev);
3338 + * Do a final reality check.
3340 + ITERATE_RDEV(mddev,rdev,tmp) {
3341 + if (rdev->desc_nr == -1) {
3346 + * is the desc_nr unique?
3348 + ITERATE_RDEV(mddev,rdev2,tmp2) {
3349 + if ((rdev2 != rdev) &&
3350 + (rdev2->desc_nr == rdev->desc_nr)) {
3356 + * is the device unique?
3358 + ITERATE_RDEV(mddev,rdev2,tmp2) {
3359 + if ((rdev2 != rdev) &&
3360 + (rdev2->dev == rdev->dev)) {
3368 + * Check if we can support this RAID array
3370 + if (sb->major_version != MD_MAJOR_VERSION ||
3371 + sb->minor_version > MD_MINOR_VERSION) {
3373 + printk (OLD_VERSION, mdidx(mddev), sb->major_version,
3374 + sb->minor_version, sb->patch_version);
3378 + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
3379 + (sb->level == 4) || (sb->level == 5)))
3380 + printk (NOT_CLEAN_IGNORE, mdidx(mddev));
3387 +#undef INCONSISTENT
3392 +static int device_size_calculation (mddev_t * mddev)
3394 + int data_disks = 0, persistent;
3395 + unsigned int readahead;
3396 + mdp_super_t *sb = mddev->sb;
3397 + struct md_list_head *tmp;
3401 + * Do device size calculation. Bail out if too small.
3402 + * (we have to do this after having validated chunk_size,
3403 + * because device size has to be modulo chunk_size)
3405 + persistent = !mddev->sb->not_persistent;
3406 + ITERATE_RDEV(mddev,rdev,tmp) {
3413 + rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
3414 + if (rdev->size < sb->chunk_size / 1024) {
3415 + printk (KERN_WARNING
3416 + "Dev %s smaller than chunk_size: %dk < %dk\n",
3417 + partition_name(rdev->dev),
3418 + rdev->size, sb->chunk_size / 1024);
3423 + switch (sb->level) {
3431 + zoned_raid_size(mddev);
3435 + zoned_raid_size(mddev);
3436 + data_disks = sb->raid_disks;
3443 + data_disks = sb->raid_disks-1;
3446 + printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
3449 + if (!md_size[mdidx(mddev)])
3450 + md_size[mdidx(mddev)] = sb->size * data_disks;
3452 + readahead = MD_READAHEAD;
3453 + if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5))
3454 + readahead = mddev->sb->chunk_size * 4 * data_disks;
3455 + if (readahead < data_disks * MAX_SECTORS*512*2)
3456 + readahead = data_disks * MAX_SECTORS*512*2;
3458 + if (sb->level == -3)
3461 + md_maxreadahead[mdidx(mddev)] = readahead;
3463 + printk(KERN_INFO "md%d: max total readahead window set to %dk\n",
3464 + mdidx(mddev), readahead/1024);
3467 + "md%d: %d data-disks, max readahead per data-disk: %dk\n",
3468 + mdidx(mddev), data_disks, readahead/data_disks/1024);
3475 +#define TOO_BIG_CHUNKSIZE KERN_ERR \
3476 +"too big chunk_size: %d > %d\n"
3478 +#define TOO_SMALL_CHUNKSIZE KERN_ERR \
3479 +"too small chunk_size: %d < %ld\n"
3481 +#define BAD_CHUNKSIZE KERN_ERR \
3482 +"no chunksize specified, see 'man raidtab'\n"
3484 +static int do_md_run (mddev_t * mddev)
3488 + struct md_list_head *tmp;
3492 + if (!mddev->nb_dev) {
3501 + * Resize disks to align partitions size on a given
3504 + md_size[mdidx(mddev)] = 0;
3507 + * Analyze all RAID superblock(s)
3509 + if (analyze_sbs(mddev)) {
3514 + chunk_size = mddev->sb->chunk_size;
3515 + pnum = level_to_pers(mddev->sb->level);
3517 + mddev->param.chunk_size = chunk_size;
3518 + mddev->param.personality = pnum;
3520 + if (chunk_size > MAX_CHUNK_SIZE) {
3521 + printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
3525 + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
3527 + if ( (1 << ffz(~chunk_size)) != chunk_size) {
3531 + if (chunk_size < PAGE_SIZE) {
3532 + printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
3536 + if (pnum >= MAX_PERSONALITY) {
3541 + if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
3543 + * 'default chunksize' in the old md code used to
3544 + * be PAGE_SIZE, baaad.
3545 + * we abort here to be on the safe side. We don't
3546 + * want to continue the bad practice.
3548 + printk(BAD_CHUNKSIZE);
3555 + char module_name[80];
3556 + sprintf (module_name, "md-personality-%d", pnum);
3557 + request_module (module_name);
3563 + if (device_size_calculation(mddev))
3567 + * Drop all container device buffers, from now on
3568 + * the only valid external interface is through the md
3571 + ITERATE_RDEV(mddev,rdev,tmp) {
3574 + fsync_dev(rdev->dev);
3575 + invalidate_buffers(rdev->dev);
3578 + mddev->pers = pers[pnum];
3580 + err = mddev->pers->run(mddev);
3582 + printk("pers->run() failed ...\n");
3583 + mddev->pers = NULL;
3587 + mddev->sb->state &= ~(1 << MD_SB_CLEAN);
3588 + md_update_sb(mddev);
3591 + * md_size has units of 1K blocks, which are
3592 + * twice as large as sectors.
3594 + md_hd_struct[mdidx(mddev)].start_sect = 0;
3595 + md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1;
3597 + read_ahead[MD_MAJOR] = 1024;
3601 +#undef TOO_BIG_CHUNKSIZE
3602 +#undef BAD_CHUNKSIZE
3604 +#define OUT(x) do { err = (x); goto out; } while (0)
3606 +static int restart_array (mddev_t *mddev)
3611 + * Complain if it has no devices
3613 + if (!mddev->nb_dev)
3616 + if (mddev->pers) {
3621 + set_device_ro(mddev_to_kdev(mddev), 0);
3624 + "md%d switched to read-write mode.\n", mdidx(mddev));
3626 + * Kick recovery or resync if necessary
3628 + md_recover_arrays();
3629 + if (mddev->pers->restart_resync)
3630 + mddev->pers->restart_resync(mddev);
3638 +#define STILL_MOUNTED KERN_WARNING \
3639 +"md: md%d still mounted.\n"
3641 +static int do_md_stop (mddev_t * mddev, int ro)
3643 + int err = 0, resync_interrupted = 0;
3644 + kdev_t dev = mddev_to_kdev(mddev);
3646 + if (!ro && !fs_may_mount (dev)) {
3647 + printk (STILL_MOUNTED, mdidx(mddev));
3652 + * complain if it's already stopped
3654 + if (!mddev->nb_dev)
3657 + if (mddev->pers) {
3659 + * It is safe to call stop here, it only frees private
3660 + * data. Also, it tells us if a device is unstoppable
3661 + * (eg. resyncing is in progress)
3663 + if (mddev->pers->stop_resync)
3664 + if (mddev->pers->stop_resync(mddev))
3665 + resync_interrupted = 1;
3667 + if (mddev->recovery_running)
3668 + md_interrupt_thread(md_recovery_thread);
3671 + * This synchronizes with signal delivery to the
3672 + * resync or reconstruction thread. It also nicely
3673 + * hangs the process if some reconstruction has not
3676 + down(&mddev->recovery_sem);
3677 + up(&mddev->recovery_sem);
3680 + * sync and invalidate buffers because we cannot kill the
3681 + * main thread with valid IO transfers still around.
3682 + * the kernel lock protects us from new requests being
3683 + * added after invalidate_buffers().
3685 + fsync_dev (mddev_to_kdev(mddev));
3687 + invalidate_buffers (dev);
3695 + set_device_ro(dev, 0);
3696 + if (mddev->pers->stop(mddev)) {
3698 + set_device_ro(dev, 1);
3706 + * mark it clean only if there was no resync
3709 + if (!mddev->recovery_running && !resync_interrupted) {
3710 + printk("marking sb clean...\n");
3711 + mddev->sb->state |= 1 << MD_SB_CLEAN;
3713 + md_update_sb(mddev);
3716 + set_device_ro(dev, 1);
3720 + * Free resources if final stop
3723 + export_array(mddev);
3724 + md_size[mdidx(mddev)] = 0;
3725 + md_hd_struct[mdidx(mddev)].nr_sects = 0;
3726 + free_mddev(mddev);
3728 + printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
3731 + "md%d switched to read-only mode.\n", mdidx(mddev));
3739 + * We have to safely support old arrays too.
3741 +int detect_old_array (mdp_super_t *sb)
3743 + if (sb->major_version > 0)
3745 + if (sb->minor_version >= 90)
3752 +static void autorun_array (mddev_t *mddev)
3755 + struct md_list_head *tmp;
3758 + if (mddev->disks.prev == &mddev->disks) {
3763 + printk("running: ");
3765 + ITERATE_RDEV(mddev,rdev,tmp) {
3766 + printk("<%s>", partition_name(rdev->dev));
3768 + printk("\nnow!\n");
3770 + err = do_md_run (mddev);
3772 + printk("do_md_run() returned %d\n", err);
3774 + * prevent the writeback of an unrunnable array
3776 + mddev->sb_dirty = 0;
3777 + do_md_stop (mddev, 0);
3782 + * lets try to run arrays based on all disks that have arrived
3783 + * until now. (those are in the ->pending list)
3785 + * the method: pick the first pending disk, collect all disks with
3786 + * the same UUID, remove all from the pending list and put them into
3787 + * the 'same_array' list. Then order this list based on superblock
3788 + * update time (freshest comes first), kick out 'old' disks and
3789 + * compare superblocks. If everything's fine then run it.
3791 +static void autorun_devices (void)
3793 + struct md_list_head candidates;
3794 + struct md_list_head *tmp;
3795 + mdk_rdev_t *rdev0, *rdev;
3800 + printk("autorun ...\n");
3801 + while (pending_raid_disks.next != &pending_raid_disks) {
3802 + rdev0 = md_list_entry(pending_raid_disks.next,
3803 + mdk_rdev_t, pending);
3805 + printk("considering %s ...\n", partition_name(rdev0->dev));
3806 + MD_INIT_LIST_HEAD(&candidates);
3807 + ITERATE_RDEV_PENDING(rdev,tmp) {
3808 + if (uuid_equal(rdev0, rdev)) {
3809 + if (!sb_equal(rdev0->sb, rdev->sb)) {
3810 + printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev));
3813 + printk(" adding %s ...\n", partition_name(rdev->dev));
3814 + md_list_del(&rdev->pending);
3815 + md_list_add(&rdev->pending, &candidates);
3819 + * now we have a set of devices, with all of them having
3820 + * mostly sane superblocks. It's time to allocate the
3823 - if (analyze_one_sb (realdev))
3824 + md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
3825 + mddev = kdev_to_mddev(md_kdev);
3827 + printk("md%d already running, cannot run %s\n",
3828 + mdidx(mddev), partition_name(rdev0->dev));
3829 + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
3830 + export_rdev(rdev);
3833 + mddev = alloc_mddev(md_kdev);
3834 + printk("created md%d\n", mdidx(mddev));
3835 + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
3836 + bind_rdev_to_array(rdev, mddev);
3837 + md_list_del(&rdev->pending);
3838 + MD_INIT_LIST_HEAD(&rdev->pending);
3840 + autorun_array(mddev);
3842 + printk("... autorun DONE.\n");
3846 + * import RAID devices based on one partition
3847 + * if possible, the array gets run as well.
3850 +#define BAD_VERSION KERN_ERR \
3851 +"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
3853 +#define OUT_OF_MEM KERN_ALERT \
3854 +"md: out of memory.\n"
3856 +#define NO_DEVICE KERN_ERR \
3857 +"md: disabled device %s\n"
3859 +#define AUTOADD_FAILED KERN_ERR \
3860 +"md: auto-adding devices to md%d FAILED (error %d).\n"
3862 +#define AUTOADD_FAILED_USED KERN_ERR \
3863 +"md: cannot auto-add device %s to md%d, already used.\n"
3865 +#define AUTORUN_FAILED KERN_ERR \
3866 +"md: auto-running md%d FAILED (error %d).\n"
3868 +#define MDDEV_BUSY KERN_ERR \
3869 +"md: cannot auto-add to md%d, already running.\n"
3871 +#define AUTOADDING KERN_INFO \
3872 +"md: auto-adding devices to md%d, based on %s's superblock.\n"
3874 +#define AUTORUNNING KERN_INFO \
3875 +"md: auto-running md%d.\n"
3877 +static int autostart_array (kdev_t startdev)
3879 + int err = -EINVAL, i;
3880 + mdp_super_t *sb = NULL;
3881 + mdk_rdev_t *start_rdev = NULL, *rdev;
3883 + if (md_import_device(startdev, 1)) {
3884 + printk("could not import %s!\n", partition_name(startdev));
3888 + start_rdev = find_rdev_all(startdev);
3889 + if (!start_rdev) {
3893 + if (start_rdev->faulty) {
3894 + printk("can not autostart based on faulty %s!\n",
3895 + partition_name(startdev));
3898 + md_list_add(&start_rdev->pending, &pending_raid_disks);
3900 + sb = start_rdev->sb;
3902 + err = detect_old_array(sb);
3904 + printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
3908 + for (i = 0; i < MD_SB_DISKS; i++) {
3912 + desc = sb->disks + i;
3913 + dev = MKDEV(desc->major, desc->minor);
3915 + if (dev == MKDEV(0,0))
3917 + if (dev == startdev)
3919 + if (md_import_device(dev, 1)) {
3920 + printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev));
3923 + rdev = find_rdev_all(dev);
3928 + md_list_add(&rdev->pending, &pending_raid_disks);
3932 + * possibly return codes
3934 + autorun_devices();
3939 + export_rdev(start_rdev);
3946 +#undef AUTOADD_FAILED_USED
3947 +#undef AUTOADD_FAILED
3948 +#undef AUTORUN_FAILED
3956 +} raid_setup_args md__initdata = { 0, 0 };
3959 + * Searches all registered partitions for autorun RAID arrays
3962 +md__initfunc(void autodetect_raid(void))
3964 +#ifdef CONFIG_AUTODETECT_RAID
3965 + struct gendisk *disk;
3969 + if (raid_setup_args.noautodetect) {
3970 + printk(KERN_INFO "skipping autodetection of RAID arrays\n");
3973 + printk(KERN_INFO "autodetecting RAID arrays\n");
3975 + for (disk = gendisk_head ; disk ; disk = disk->next) {
3976 + for (i = 0; i < disk->max_p*disk->max_nr; i++) {
3977 + kdev_t dev = MKDEV(disk->major,i);
3979 + if (disk->part[i].type == LINUX_OLD_RAID_PARTITION) {
3981 +"md: %s's partition type has to be changed from type 0x86 to type 0xfd\n"
3982 +" to maintain interoperability with other OSs! Autodetection support for\n"
3983 +" type 0x86 will be deleted after some migration timeout. Sorry.\n",
3984 + partition_name(dev));
3985 + disk->part[i].type = LINUX_RAID_PARTITION;
3987 + if (disk->part[i].type != LINUX_RAID_PARTITION)
3990 + if (md_import_device(dev,1)) {
3991 + printk(KERN_ALERT "could not import %s!\n",
3992 + partition_name(dev));
3998 + rdev = find_rdev_all(dev);
4003 + if (rdev->faulty) {
4007 + md_list_add(&rdev->pending, &pending_raid_disks);
4011 + autorun_devices();
4015 +static int get_version (void * arg)
4017 + mdu_version_t ver;
4019 + ver.major = MD_MAJOR_VERSION;
4020 + ver.minor = MD_MINOR_VERSION;
4021 + ver.patchlevel = MD_PATCHLEVEL_VERSION;
4023 + if (md_copy_to_user(arg, &ver, sizeof(ver)))
4029 +#define SET_FROM_SB(x) info.x = mddev->sb->x
4030 +static int get_array_info (mddev_t * mddev, void * arg)
4032 + mdu_array_info_t info;
4037 + SET_FROM_SB(major_version);
4038 + SET_FROM_SB(minor_version);
4039 + SET_FROM_SB(patch_version);
4040 + SET_FROM_SB(ctime);
4041 + SET_FROM_SB(level);
4042 + SET_FROM_SB(size);
4043 + SET_FROM_SB(nr_disks);
4044 + SET_FROM_SB(raid_disks);
4045 + SET_FROM_SB(md_minor);
4046 + SET_FROM_SB(not_persistent);
4048 + SET_FROM_SB(utime);
4049 + SET_FROM_SB(state);
4050 + SET_FROM_SB(active_disks);
4051 + SET_FROM_SB(working_disks);
4052 + SET_FROM_SB(failed_disks);
4053 + SET_FROM_SB(spare_disks);
4055 + SET_FROM_SB(layout);
4056 + SET_FROM_SB(chunk_size);
4058 + if (md_copy_to_user(arg, &info, sizeof(info)))
4065 +#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
4066 +static int get_disk_info (mddev_t * mddev, void * arg)
4068 + mdu_disk_info_t info;
4074 + if (md_copy_from_user(&info, arg, sizeof(info)))
4078 + if (nr >= mddev->sb->nr_disks)
4081 + SET_FROM_SB(major);
4082 + SET_FROM_SB(minor);
4083 + SET_FROM_SB(raid_disk);
4084 + SET_FROM_SB(state);
4086 + if (md_copy_to_user(arg, &info, sizeof(info)))
4093 +#define SET_SB(x) mddev->sb->disks[nr].x = info.x
4095 +static int add_new_disk (mddev_t * mddev, void * arg)
4097 + int err, size, persistent;
4098 + mdu_disk_info_t info;
4106 + if (md_copy_from_user(&info, arg, sizeof(info)))
4110 + if (nr >= mddev->sb->nr_disks)
4113 + dev = MKDEV(info.major,info.minor);
4115 + if (find_rdev_all(dev)) {
4116 + printk("device %s already used in a RAID array!\n",
4117 + partition_name(dev));
4124 + SET_SB(raid_disk);
4127 + if ((info.state & (1<<MD_DISK_FAULTY))==0) {
4128 + err = md_import_device (dev, 0);
4130 + printk("md: error, md_import_device() returned %d\n", err);
4133 + rdev = find_rdev_all(dev);
4139 + rdev->old_dev = dev;
4140 + rdev->desc_nr = info.number;
4142 + bind_rdev_to_array(rdev, mddev);
4144 + persistent = !mddev->sb->not_persistent;
4146 + printk("nonpersistent superblock ...\n");
4147 + if (!mddev->sb->chunk_size)
4148 + printk("no chunksize?\n");
4150 + size = calc_dev_size(dev, mddev, persistent);
4151 + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
4153 + if (!mddev->sb->size || (mddev->sb->size > size))
4154 + mddev->sb->size = size;
4158 + * sync all other superblocks with the main superblock
4166 +static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
4175 + printk("trying to remove %s from md%d ... \n",
4176 + partition_name(dev), mdidx(mddev));
4178 + if (!mddev->pers->diskop) {
4179 + printk("md%d: personality does not support diskops!\n",
4184 + rdev = find_rdev(mddev, dev);
4188 + if (rdev->desc_nr == -1) {
4192 + disk = &mddev->sb->disks[rdev->desc_nr];
4193 + if (disk_active(disk))
4195 + if (disk_removed(disk)) {
4200 + err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
4201 + if (err == -EBUSY)
4208 + remove_descriptor(disk, mddev->sb);
4209 + kick_rdev_from_array(rdev);
4210 + mddev->sb_dirty = 1;
4211 + md_update_sb(mddev);
4215 + printk("cannot remove active disk %s from md%d ... \n",
4216 + partition_name(dev), mdidx(mddev));
4220 +static int hot_add_disk (mddev_t * mddev, kdev_t dev)
4222 + int i, err, persistent;
4223 + unsigned int size;
4230 + printk("trying to hot-add %s to md%d ... \n",
4231 + partition_name(dev), mdidx(mddev));
4233 + if (!mddev->pers->diskop) {
4234 + printk("md%d: personality does not support diskops!\n",
4239 + persistent = !mddev->sb->not_persistent;
4240 + size = calc_dev_size(dev, mddev, persistent);
4242 + if (size < mddev->sb->size) {
4243 + printk("md%d: disk size %d blocks < array size %d\n",
4244 + mdidx(mddev), size, mddev->sb->size);
4248 + rdev = find_rdev(mddev, dev);
4252 + err = md_import_device (dev, 0);
4254 + printk("md: error, md_import_device() returned %d\n", err);
4257 + rdev = find_rdev_all(dev);
4262 + if (rdev->faulty) {
4263 + printk("md: can not hot-add faulty %s disk to md%d!\n",
4264 + partition_name(dev), mdidx(mddev));
4266 + goto abort_export;
4268 + bind_rdev_to_array(rdev, mddev);
4271 + * The rest should better be atomic, we can have disk failures
4272 + * noticed in interrupt contexts ...
4275 + rdev->old_dev = dev;
4276 + rdev->size = size;
4277 + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
4279 + disk = mddev->sb->disks + mddev->sb->raid_disks;
4280 + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
4281 + disk = mddev->sb->disks + i;
4283 + if (!disk->major && !disk->minor)
4285 + if (disk_removed(disk))
4288 + if (i == MD_SB_DISKS) {
4290 + printk("md%d: can not hot-add to full array!\n", mdidx(mddev));
4292 + goto abort_unbind_export;
4295 + if (disk_removed(disk)) {
4297 - * hot_add has to bump up nb_dev itself
4300 - if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) {
4302 - * FIXME: here we should free up the inode and stuff
4304 - printk ("FIXME\n");
4306 + if (disk->number != i) {
4310 + goto abort_unbind_export;
4313 - md_dev[minor].nb_dev++;
4318 - printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
4320 + disk->raid_disk = disk->number;
4321 + disk->major = MAJOR(dev);
4322 + disk->minor = MINOR(dev);
4324 + if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
4328 + goto abort_unbind_export;
4331 + mark_disk_spare(disk);
4332 + mddev->sb->nr_disks++;
4333 + mddev->sb->spare_disks++;
4334 + mddev->sb->working_disks++;
4336 + mddev->sb_dirty = 1;
4339 + md_update_sb(mddev);
4342 + * Kick recovery, maybe this spare has to be added to the
4343 + * array immediately.
4345 + md_recover_arrays();
4349 +abort_unbind_export:
4350 + unbind_rdev_from_array(rdev);
4353 + export_rdev(rdev);
4357 +#define SET_SB(x) mddev->sb->x = info.x
4358 +static int set_array_info (mddev_t * mddev, void * arg)
4360 + mdu_array_info_t info;
4363 + printk("array md%d already has a superblock!\n",
4368 + if (md_copy_from_user(&info, arg, sizeof(info)))
4371 + if (alloc_array_sb(mddev))
4374 + mddev->sb->major_version = MD_MAJOR_VERSION;
4375 + mddev->sb->minor_version = MD_MINOR_VERSION;
4376 + mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
4377 + mddev->sb->ctime = CURRENT_TIME;
4382 + SET_SB(raid_disks);
4384 + SET_SB(not_persistent);
4387 + SET_SB(active_disks);
4388 + SET_SB(working_disks);
4389 + SET_SB(failed_disks);
4390 + SET_SB(spare_disks);
4393 + SET_SB(chunk_size);
4395 + mddev->sb->md_magic = MD_SB_MAGIC;
4398 + * Generate a 128 bit UUID
4400 + get_random_bytes(&mddev->sb->set_uuid0, 4);
4401 + get_random_bytes(&mddev->sb->set_uuid1, 4);
4402 + get_random_bytes(&mddev->sb->set_uuid2, 4);
4403 + get_random_bytes(&mddev->sb->set_uuid3, 4);
4409 +static int set_disk_info (mddev_t * mddev, void * arg)
4411 + printk("not yet");
4415 +static int clear_array (mddev_t * mddev)
4417 + printk("not yet");
4421 +static int write_raid_info (mddev_t * mddev)
4423 + printk("not yet");
4427 +static int protect_array (mddev_t * mddev)
4429 + printk("not yet");
4433 +static int unprotect_array (mddev_t * mddev)
4435 + printk("not yet");
4439 +static int set_disk_faulty (mddev_t *mddev, kdev_t dev)
4443 + fsync_dev(mddev_to_kdev(mddev));
4444 + ret = md_error(mddev_to_kdev(mddev), dev);
4448 static int md_ioctl (struct inode *inode, struct file *file,
4449 unsigned int cmd, unsigned long arg)
4452 - struct hd_geometry *loc = (struct hd_geometry *) arg;
4453 + unsigned int minor;
4455 + struct hd_geometry *loc = (struct hd_geometry *) arg;
4456 + mddev_t *mddev = NULL;
4459 - if (!capable(CAP_SYS_ADMIN))
4461 + if (!md_capable_admin())
4464 - if (((minor=MINOR(inode->i_rdev)) & 0x80) &&
4465 - (minor & 0x7f) < MAX_PERSONALITY &&
4466 - pers[minor & 0x7f] &&
4467 - pers[minor & 0x7f]->ioctl)
4468 - return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg));
4470 - if (minor >= MAX_MD_DEV)
4472 + dev = inode->i_rdev;
4473 + minor = MINOR(dev);
4474 + if (minor >= MAX_MD_DEVS)
4479 - case REGISTER_DEV:
4480 - return do_md_add (minor, to_kdev_t ((dev_t) arg));
4482 + * Commands dealing with the RAID driver but not any
4483 + * particular array:
4487 + case RAID_VERSION:
4488 + err = get_version((void *)arg);
4491 + case PRINT_RAID_DEBUG:
4493 + md_print_devices();
4496 + case BLKGETSIZE: /* Return device size */
4501 + err = md_put_user(md_hd_struct[minor].nr_sects,
4506 - return do_md_run (minor, (int) arg);
4509 + invalidate_buffers(dev);
4513 - return do_md_stop (minor, inode);
4515 - case BLKGETSIZE: /* Return device size */
4516 - if (!arg) return -EINVAL;
4517 - err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg);
4523 - fsync_dev (inode->i_rdev);
4524 - invalidate_buffers (inode->i_rdev);
4530 - read_ahead[MAJOR(inode->i_rdev)] = arg;
4534 - if (!arg) return -EINVAL;
4535 - err = put_user (read_ahead[MAJOR(inode->i_rdev)], (long *) arg);
4540 - /* We have a problem here : there is no easy way to give a CHS
4541 - virtual geometry. We currently pretend that we have a 2 heads
4542 - 4 sectors (with a BIG number of cylinders...). This drives dosfs
4543 - just mad... ;-) */
4546 - if (!loc) return -EINVAL;
4547 - err = put_user (2, (char *) &loc->heads);
4550 - err = put_user (4, (char *) &loc->sectors);
4553 - err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders);
4556 - err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect,
4557 - (long *) &loc->start);
4562 - RO_IOCTLS(inode->i_rdev,arg);
4568 + read_ahead[MAJOR(dev)] = arg;
4579 + err = md_put_user (read_ahead[
4580 + MAJOR(dev)], (long *) arg);
4586 + * Commands creating/starting a new array:
4589 + mddev = kdev_to_mddev(dev);
4593 + case SET_ARRAY_INFO:
4596 + printk("array md%d already exists!\n",
4606 + case SET_ARRAY_INFO:
4607 + mddev = alloc_mddev(dev);
4613 + * alloc_mddev() should possibly self-lock.
4615 + err = lock_mddev(mddev);
4617 + printk("ioctl, reason %d, cmd %d\n", err, cmd);
4620 + err = set_array_info(mddev, (void *)arg);
4622 + printk("couldnt set array info. %d\n", err);
4629 + * possibly make it lock the array ...
4631 + err = autostart_array((kdev_t)arg);
4633 + printk("autostart %s failed!\n",
4634 + partition_name((kdev_t)arg));
4643 + * Commands querying/configuring an existing array:
4650 + err = lock_mddev(mddev);
4652 + printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
4657 + * Commands even a read-only array can execute:
4661 + case GET_ARRAY_INFO:
4662 + err = get_array_info(mddev, (void *)arg);
4665 + case GET_DISK_INFO:
4666 + err = get_disk_info(mddev, (void *)arg);
4669 + case RESTART_ARRAY_RW:
4670 + err = restart_array(mddev);
4674 + err = do_md_stop (mddev, 0);
4677 + case STOP_ARRAY_RO:
4678 + err = do_md_stop (mddev, 1);
4682 + * We have a problem here : there is no easy way to give a CHS
4683 + * virtual geometry. We currently pretend that we have a 2 heads
4684 + * 4 sectors (with a BIG number of cylinders...). This drives
4685 + * dosfs just mad... ;-)
4690 + goto abort_unlock;
4692 + err = md_put_user (2, (char *) &loc->heads);
4694 + goto abort_unlock;
4695 + err = md_put_user (4, (char *) &loc->sectors);
4697 + goto abort_unlock;
4698 + err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
4699 + (short *) &loc->cylinders);
4701 + goto abort_unlock;
4702 + err = md_put_user (md_hd_struct[minor].start_sect,
4703 + (long *) &loc->start);
4709 + * The remaining ioctls are changing the state of the
4710 + * superblock, so we do not allow read-only arrays
4715 + goto abort_unlock;
4721 + err = clear_array(mddev);
4724 + case ADD_NEW_DISK:
4725 + err = add_new_disk(mddev, (void *)arg);
4728 + case HOT_REMOVE_DISK:
4729 + err = hot_remove_disk(mddev, (kdev_t)arg);
4732 + case HOT_ADD_DISK:
4733 + err = hot_add_disk(mddev, (kdev_t)arg);
4736 + case SET_DISK_INFO:
4737 + err = set_disk_info(mddev, (void *)arg);
4740 + case WRITE_RAID_INFO:
4741 + err = write_raid_info(mddev);
4744 + case UNPROTECT_ARRAY:
4745 + err = unprotect_array(mddev);
4748 + case PROTECT_ARRAY:
4749 + err = protect_array(mddev);
4752 + case SET_DISK_FAULTY:
4753 + err = set_disk_faulty(mddev, (kdev_t)arg);
4758 + mdu_param_t param;
4760 + err = md_copy_from_user(¶m, (mdu_param_t *)arg,
4763 + goto abort_unlock;
4765 + err = do_md_run (mddev);
4767 + * we have to clean up the mess if
4768 + * the array cannot be run for some
4772 + mddev->sb_dirty = 0;
4773 + do_md_stop (mddev, 0);
4779 + printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current->comm, current->pid);
4781 + goto abort_unlock;
4787 + unlock_mddev(mddev);
4789 + printk("huh11?\n");
4794 + printk("huh12?\n");
4800 +#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
4802 static int md_open (struct inode *inode, struct file *file)
4804 - int minor=MINOR(inode->i_rdev);
4811 +static void md_release (struct inode *inode, struct file *file)
4813 + sync_dev(inode->i_rdev);
4817 +static int md_read (struct inode *inode, struct file *file,
4818 + char *buf, int count)
4820 + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
4822 - md_dev[minor].busy++;
4823 - return (0); /* Always succeed */
4824 + if (!mddev || !mddev->pers)
4827 + return block_read (inode, file, buf, count);
4830 +static int md_write (struct inode *inode, struct file *file,
4831 + const char *buf, int count)
4833 + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
4835 + if (!mddev || !mddev->pers)
4838 -static int md_release (struct inode *inode, struct file *file)
4839 + return block_write (inode, file, buf, count);
4842 +static struct file_operations md_fops=
4844 - int minor=MINOR(inode->i_rdev);
4859 - sync_dev (inode->i_rdev);
4860 - md_dev[minor].busy--;
4862 +static int md_open (struct inode *inode, struct file *file)
4870 +static int md_release (struct inode *inode, struct file *file)
4872 + sync_dev(inode->i_rdev);
4876 static ssize_t md_read (struct file *file, char *buf, size_t count,
4879 - int minor=MINOR(file->f_dentry->d_inode->i_rdev);
4880 + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
4882 - if (!md_dev[minor].pers) /* Check if device is being run */
4884 + if (!mddev || !mddev->pers)
4887 - return block_read(file, buf, count, ppos);
4888 + return block_read(file, buf, count, ppos);
4891 static ssize_t md_write (struct file *file, const char *buf,
4892 size_t count, loff_t *ppos)
4894 - int minor=MINOR(file->f_dentry->d_inode->i_rdev);
4895 + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
4897 - if (!md_dev[minor].pers) /* Check if device is being run */
4899 + if (!mddev || !mddev->pers)
4902 - return block_write(file, buf, count, ppos);
4903 + return block_write(file, buf, count, ppos);
4906 static struct file_operations md_fops=
4932 -int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size)
4935 +int md_map (kdev_t dev, kdev_t *rdev,
4936 + unsigned long *rsector, unsigned long size)
4938 - if ((unsigned int) minor >= MAX_MD_DEV)
4940 - printk ("Bad md device %d\n", minor);
4944 - if (!md_dev[minor].pers)
4946 - printk ("Oops ! md%d not running, giving up !\n", minor);
4950 + mddev_t *mddev = kdev_to_mddev(dev);
4952 - return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size));
4953 + if (!mddev || !mddev->pers) {
4958 + err = mddev->pers->map(mddev, dev, rdev, rsector, size);
4963 -int md_make_request (int minor, int rw, struct buffer_head * bh)
4964 +int md_make_request (struct buffer_head * bh, int rw)
4966 - if (md_dev [minor].pers->make_request) {
4967 - if (buffer_locked(bh))
4970 + mddev_t *mddev = kdev_to_mddev(bh->b_dev);
4972 + if (!mddev || !mddev->pers) {
4977 + if (mddev->pers->make_request) {
4978 + if (buffer_locked(bh)) {
4982 set_bit(BH_Lock, &bh->b_state);
4983 if (rw == WRITE || rw == WRITEA) {
4984 if (!buffer_dirty(bh)) {
4985 - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
4987 + bh->b_end_io(bh, buffer_uptodate(bh));
4992 if (rw == READ || rw == READA) {
4993 if (buffer_uptodate(bh)) {
4994 - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
4996 + bh->b_end_io(bh, buffer_uptodate(bh));
5001 - return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh));
5002 + err = mddev->pers->make_request(mddev, rw, bh);
5004 make_request (MAJOR(bh->b_rdev), rw, bh);
5012 static void do_md_request (void)
5014 - printk ("Got md request, not good...");
5016 + printk(KERN_ALERT "Got md request, not good...");
5020 +int md_thread(void * arg)
5022 + mdk_thread_t *thread = arg;
5026 + exit_files(current);
5033 + sprintf(current->comm, thread->name);
5034 + md_init_signals();
5035 + md_flush_signals();
5036 + thread->tsk = current;
5039 + * md_thread is a 'system-thread', it's priority should be very
5040 + * high. We avoid resource deadlocks individually in each
5041 + * raid personality. (RAID5 does preallocation) We also use RR and
5042 + * the very same RT priority as kswapd, thus we will never get
5043 + * into a priority inversion deadlock.
5045 + * we definitely have to have equal or higher priority than
5046 + * bdflush, otherwise bdflush will deadlock if there are too
5047 + * many dirty RAID5 blocks.
5049 + current->policy = SCHED_OTHER;
5050 + current->priority = 40;
5056 + if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
5059 + interruptible_sleep_on(&thread->wqueue);
5062 + clear_bit(THREAD_WAKEUP, &thread->flags);
5063 + if (thread->run) {
5064 + thread->run(thread->data);
5065 + run_task_queue(&tq_disk);
5067 + if (md_signal_pending(current)) {
5068 + printk("%8s(%d) flushing signals.\n", current->comm,
5070 + md_flush_signals();
5078 -void md_wakeup_thread(struct md_thread *thread)
5079 +void md_wakeup_thread(mdk_thread_t *thread)
5081 set_bit(THREAD_WAKEUP, &thread->flags);
5082 wake_up(&thread->wqueue);
5085 -struct md_thread *md_register_thread (void (*run) (void *), void *data)
5086 +mdk_thread_t *md_register_thread (void (*run) (void *),
5087 + void *data, const char *name)
5089 - struct md_thread *thread = (struct md_thread *)
5090 - kmalloc(sizeof(struct md_thread), GFP_KERNEL);
5091 + mdk_thread_t *thread;
5093 struct semaphore sem = MUTEX_LOCKED;
5095 - if (!thread) return NULL;
5096 + thread = (mdk_thread_t *) kmalloc
5097 + (sizeof(mdk_thread_t), GFP_KERNEL);
5101 - memset(thread, 0, sizeof(struct md_thread));
5102 + memset(thread, 0, sizeof(mdk_thread_t));
5103 init_waitqueue(&thread->wqueue);
5107 thread->data = data;
5108 + thread->name = name;
5109 ret = kernel_thread(md_thread, thread, 0);
5112 @@ -838,270 +3032,407 @@
5116 -void md_unregister_thread (struct md_thread *thread)
5117 +void md_interrupt_thread (mdk_thread_t *thread)
5119 + if (!thread->tsk) {
5123 + printk("interrupting MD-thread pid %d\n", thread->tsk->pid);
5124 + send_sig(SIGKILL, thread->tsk, 1);
5127 +void md_unregister_thread (mdk_thread_t *thread)
5129 struct semaphore sem = MUTEX_LOCKED;
5134 - printk("Killing md_thread %d %p %s\n",
5135 - thread->tsk->pid, thread->tsk, thread->tsk->comm);
5137 - printk("Aiee. md_thread has 0 tsk\n");
5138 - send_sig(SIGKILL, thread->tsk, 1);
5139 - printk("downing on %p\n", &sem);
5140 + thread->name = NULL;
5141 + if (!thread->tsk) {
5145 + md_interrupt_thread(thread);
5149 -#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))
5151 -int md_thread(void * arg)
5152 +void md_recover_arrays (void)
5154 - struct md_thread *thread = arg;
5158 - exit_files(current);
5161 - current->session = 1;
5162 - current->pgrp = 1;
5163 - sprintf(current->comm, "md_thread");
5164 - siginitsetinv(¤t->blocked, SHUTDOWN_SIGS);
5165 - thread->tsk = current;
5170 - if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
5172 - spin_lock(¤t->sigmask_lock);
5173 - flush_signals(current);
5174 - spin_unlock(¤t->sigmask_lock);
5175 - interruptible_sleep_on(&thread->wqueue);
5177 - if (test_bit(THREAD_WAKEUP, &thread->flags))
5179 - if (!thread->run) {
5184 - } while (signal_pending(current));
5187 - clear_bit(THREAD_WAKEUP, &thread->flags);
5188 - if (thread->run) {
5189 - thread->run(thread->data);
5190 - run_task_queue(&tq_disk);
5192 + if (!md_recovery_thread) {
5196 + md_wakeup_thread(md_recovery_thread);
5199 -EXPORT_SYMBOL(md_size);
5200 -EXPORT_SYMBOL(md_maxreadahead);
5201 -EXPORT_SYMBOL(register_md_personality);
5202 -EXPORT_SYMBOL(unregister_md_personality);
5203 -EXPORT_SYMBOL(partition_name);
5204 -EXPORT_SYMBOL(md_dev);
5205 -EXPORT_SYMBOL(md_error);
5206 -EXPORT_SYMBOL(md_register_thread);
5207 -EXPORT_SYMBOL(md_unregister_thread);
5208 -EXPORT_SYMBOL(md_update_sb);
5209 -EXPORT_SYMBOL(md_map);
5210 -EXPORT_SYMBOL(md_wakeup_thread);
5211 -EXPORT_SYMBOL(md_do_sync);
5213 -#ifdef CONFIG_PROC_FS
5214 -static struct proc_dir_entry proc_md = {
5215 - PROC_MD, 6, "mdstat",
5216 - S_IFREG | S_IRUGO, 1, 0, 0,
5217 - 0, &proc_array_inode_operations,
5219 +int md_error (kdev_t dev, kdev_t rdev)
5221 + mddev_t *mddev = kdev_to_mddev(dev);
5222 + mdk_rdev_t * rrdev;
5229 + rrdev = find_rdev(mddev, rdev);
5230 + mark_rdev_faulty(rrdev);
5232 + * if recovery was running, stop it now.
5234 + if (mddev->pers->stop_resync)
5235 + mddev->pers->stop_resync(mddev);
5236 + if (mddev->recovery_running)
5237 + md_interrupt_thread(md_recovery_thread);
5238 + if (mddev->pers->error_handler) {
5239 + rc = mddev->pers->error_handler(mddev, rdev);
5240 + md_recover_arrays();
5245 + * Drop all buffers in the failed array.
5246 + * _not_. This is called from IRQ handlers ...
5248 + invalidate_buffers(rdev);
5253 -static void md_geninit (struct gendisk *gdisk)
5254 +static int status_unused (char * page)
5258 - for(i=0;i<MAX_MD_DEV;i++)
5260 - md_blocksizes[i] = 1024;
5261 - md_maxreadahead[i] = MD_DEFAULT_DISK_READAHEAD;
5262 - md_gendisk.part[i].start_sect=-1; /* avoid partition check */
5263 - md_gendisk.part[i].nr_sects=0;
5264 - md_dev[i].pers=NULL;
5266 + int sz = 0, i = 0;
5268 + struct md_list_head *tmp;
5270 - blksize_size[MD_MAJOR] = md_blocksizes;
5271 - max_readahead[MD_MAJOR] = md_maxreadahead;
5272 + sz += sprintf(page + sz, "unused devices: ");
5274 -#ifdef CONFIG_PROC_FS
5275 - proc_register(&proc_root, &proc_md);
5277 + ITERATE_RDEV_ALL(rdev,tmp) {
5278 + if (!rdev->same_set.next && !rdev->same_set.prev) {
5280 + * The device is not yet used by any array.
5283 + sz += sprintf(page + sz, "%s ",
5284 + partition_name(rdev->dev));
5288 + sz += sprintf(page + sz, "<none>");
5290 + sz += sprintf(page + sz, "\n");
5294 -int md_error (kdev_t mddev, kdev_t rdev)
5296 +static int status_resync (char * page, mddev_t * mddev)
5298 - unsigned int minor = MINOR (mddev);
5301 + unsigned int blocksize, max_blocks, resync, res, dt, tt, et;
5303 - if (MAJOR(mddev) != MD_MAJOR || minor > MAX_MD_DEV)
5304 - panic ("md_error gets unknown device\n");
5305 - if (!md_dev [minor].pers)
5306 - panic ("md_error gets an error for an unknown device\n");
5307 - if (md_dev [minor].pers->error_handler) {
5308 - rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev);
5309 -#if SUPPORT_RECONSTRUCTION
5310 - md_wakeup_thread(md_sync_thread);
5311 -#endif /* SUPPORT_RECONSTRUCTION */
5315 + resync = mddev->curr_resync;
5316 + blocksize = blksize_size[MD_MAJOR][mdidx(mddev)];
5317 + max_blocks = blk_size[MD_MAJOR][mdidx(mddev)] / (blocksize >> 10);
5320 + * Should not happen.
5322 + if (!max_blocks) {
5326 + res = resync*100/max_blocks;
5327 + if (!mddev->recovery_running)
5331 + sz += sprintf(page + sz, " resync=%u%%", res);
5336 + sz += sprintf(page + sz, " recovery=%u%%", res);
5339 + * We do not want to overflow, so the order of operands and
5340 + * the * 100 / 100 trick are important. We do a +1 to be
5341 + * safe against division by zero. We only estimate anyway.
5343 + * dt: time until now
5345 + * et: estimated finish time
5347 + dt = ((jiffies - mddev->resync_start) / HZ);
5348 + tt = (dt * (max_blocks / (resync/100+1)))/100;
5353 + * ignore rounding effects near finish time
5357 + sz += sprintf(page + sz, " finish=%u.%umin", et / 60, (et % 60)/6);
5362 int get_md_status (char *page)
5364 - int sz=0, i, j, size;
5366 - sz+=sprintf( page+sz, "Personalities : ");
5367 - for (i=0; i<MAX_PERSONALITY; i++)
5369 - sz+=sprintf (page+sz, "[%d %s] ", i, pers[i]->name);
5373 - sz+=sprintf (page+sz, "read_ahead ");
5374 - if (read_ahead[MD_MAJOR]==INT_MAX)
5375 - sz+=sprintf (page+sz, "not set\n");
5377 - sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
5378 + int sz = 0, j, size;
5379 + struct md_list_head *tmp, *tmp2;
5383 + sz += sprintf(page + sz, "Personalities : ");
5384 + for (j = 0; j < MAX_PERSONALITY; j++)
5386 + sz += sprintf(page+sz, "[%s] ", pers[j]->name);
5388 + sz += sprintf(page+sz, "\n");
5391 + sz += sprintf(page+sz, "read_ahead ");
5392 + if (read_ahead[MD_MAJOR] == INT_MAX)
5393 + sz += sprintf(page+sz, "not set\n");
5395 + sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
5397 - for (i=0; i<MAX_MD_DEV; i++)
5399 - sz+=sprintf (page+sz, "md%d : %sactive", i, md_dev[i].pers ? "" : "in");
5401 - if (md_dev[i].pers)
5402 - sz+=sprintf (page+sz, " %s", md_dev[i].pers->name);
5403 + ITERATE_MDDEV(mddev,tmp) {
5404 + sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
5405 + mddev->pers ? "" : "in");
5406 + if (mddev->pers) {
5408 + sz += sprintf(page + sz, " (read-only)");
5409 + sz += sprintf(page + sz, " %s", mddev->pers->name);
5413 - for (j=0; j<md_dev[i].nb_dev; j++)
5415 - sz+=sprintf (page+sz, " %s",
5416 - partition_name(md_dev[i].devices[j].dev));
5417 - size+=md_dev[i].devices[j].size;
5420 + ITERATE_RDEV(mddev,rdev,tmp2) {
5421 + sz += sprintf(page + sz, " %s[%d]",
5422 + partition_name(rdev->dev), rdev->desc_nr);
5423 + if (rdev->faulty) {
5424 + sz += sprintf(page + sz, "(F)");
5427 + size += rdev->size;
5430 - if (md_dev[i].nb_dev) {
5431 - if (md_dev[i].pers)
5432 - sz+=sprintf (page+sz, " %d blocks", md_size[i]);
5434 - sz+=sprintf (page+sz, " %d blocks", size);
5436 + if (mddev->nb_dev) {
5438 + sz += sprintf(page + sz, " %d blocks",
5439 + md_size[mdidx(mddev)]);
5441 + sz += sprintf(page + sz, " %d blocks", size);
5444 - if (!md_dev[i].pers)
5446 - sz+=sprintf (page+sz, "\n");
5449 + if (!mddev->pers) {
5450 + sz += sprintf(page+sz, "\n");
5454 - if (md_dev[i].pers->max_invalid_dev)
5455 - sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i));
5456 + sz += mddev->pers->status (page+sz, mddev);
5458 - sz+=md_dev[i].pers->status (page+sz, i, md_dev+i);
5459 - sz+=sprintf (page+sz, "\n");
5461 + if (mddev->curr_resync)
5462 + sz += status_resync (page+sz, mddev);
5464 + if (md_atomic_read(&mddev->resync_sem.count) != 1)
5465 + sz += sprintf(page + sz, " resync=DELAYED");
5467 + sz += sprintf(page + sz, "\n");
5469 + sz += status_unused (page + sz);
5475 -int register_md_personality (int p_num, struct md_personality *p)
5476 +int register_md_personality (int pnum, mdk_personality_t *p)
5478 - int i=(p_num >> PERSONALITY_SHIFT);
5480 - if (i >= MAX_PERSONALITY)
5482 + if (pnum >= MAX_PERSONALITY)
5491 - printk ("%s personality registered\n", p->name);
5494 + printk(KERN_INFO "%s personality registered\n", p->name);
5498 -int unregister_md_personality (int p_num)
5499 +int unregister_md_personality (int pnum)
5501 - int i=(p_num >> PERSONALITY_SHIFT);
5503 - if (i >= MAX_PERSONALITY)
5505 + if (pnum >= MAX_PERSONALITY)
5508 - printk ("%s personality unregistered\n", pers[i]->name);
5511 + printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
5512 + pers[pnum] = NULL;
5516 -static md_descriptor_t *get_spare(struct md_dev *mddev)
5517 +static mdp_disk_t *get_spare(mddev_t *mddev)
5520 - md_superblock_t *sb = mddev->sb;
5521 - md_descriptor_t *descriptor;
5522 - struct real_dev *realdev;
5524 - for (i = 0; i < mddev->nb_dev; i++) {
5525 - realdev = &mddev->devices[i];
5527 + mdp_super_t *sb = mddev->sb;
5530 + struct md_list_head *tmp;
5532 + ITERATE_RDEV(mddev,rdev,tmp) {
5538 - descriptor = &sb->disks[realdev->sb->descriptor.number];
5539 - if (descriptor->state & (1 << MD_FAULTY_DEVICE))
5541 + disk = &sb->disks[rdev->desc_nr];
5542 + if (disk_faulty(disk)) {
5545 - if (descriptor->state & (1 << MD_ACTIVE_DEVICE))
5547 + if (disk_active(disk))
5549 - return descriptor;
5555 +static int is_mddev_idle (mddev_t *mddev)
5557 + mdk_rdev_t * rdev;
5558 + struct md_list_head *tmp;
5560 + unsigned long curr_events;
5563 + ITERATE_RDEV(mddev,rdev,tmp) {
5564 + curr_events = io_events[MAJOR(rdev->dev)];
5566 + if (curr_events != rdev->last_events) {
5567 +// printk("!I(%d)", curr_events-rdev->last_events);
5568 + rdev->last_events = curr_events;
5576 * parallel resyncing thread.
5578 - * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
5579 - * - fix read error handing
5582 -int md_do_sync(struct md_dev *mddev)
5584 + * Determine correct block size for this device.
5586 +unsigned int device_bsize (kdev_t dev)
5588 + unsigned int i, correct_size;
5590 + correct_size = BLOCK_SIZE;
5591 + if (blksize_size[MAJOR(dev)]) {
5592 + i = blksize_size[MAJOR(dev)][MINOR(dev)];
5597 + return correct_size;
5600 +static struct wait_queue *resync_wait = (struct wait_queue *)NULL;
5602 +#define RA_ORDER (1)
5603 +#define RA_PAGE_SIZE (PAGE_SIZE*(1<<RA_ORDER))
5604 +#define MAX_NR_BLOCKS (RA_PAGE_SIZE/sizeof(struct buffer_head *))
5606 +int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
5608 - struct buffer_head *bh;
5609 - int max_blocks, blocksize, curr_bsize, percent=1, j;
5610 - kdev_t read_disk = MKDEV(MD_MAJOR, mddev - md_dev);
5612 + struct buffer_head **bh;
5613 + unsigned int max_blocks, blocksize, curr_bsize,
5614 + i, ii, j, k, chunk, window, nr_blocks, err, serialize;
5615 + kdev_t read_disk = mddev_to_kdev(mddev);
5616 int major = MAJOR(read_disk), minor = MINOR(read_disk);
5617 unsigned long starttime;
5618 + int max_read_errors = 2*MAX_NR_BLOCKS,
5619 + max_write_errors = 2*MAX_NR_BLOCKS;
5620 + struct md_list_head *tmp;
5623 + bh = (struct buffer_head **) md__get_free_pages(GFP_KERNEL, RA_ORDER);
5626 + "could not alloc bh array for reconstruction ... retrying!\n");
5630 + err = down_interruptible(&mddev->resync_sem);
5636 + ITERATE_MDDEV(mddev2,tmp) {
5637 + if (mddev2 == mddev)
5639 + if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
5640 + printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2));
5646 + interruptible_sleep_on(&resync_wait);
5647 + if (md_signal_pending(current)) {
5648 + md_flush_signals();
5655 + mddev->curr_resync = 1;
5657 - blocksize = blksize_size[major][minor];
5658 + blocksize = device_bsize(read_disk);
5659 max_blocks = blk_size[major][minor] / (blocksize >> 10);
5661 - printk("... resync log\n");
5662 - printk(" .... mddev->nb_dev: %d\n", mddev->nb_dev);
5663 - printk(" .... raid array: %s\n", kdevname(read_disk));
5664 - printk(" .... max_blocks: %d blocksize: %d\n", max_blocks, blocksize);
5665 - printk("md: syncing RAID array %s\n", kdevname(read_disk));
5666 + printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
5667 + printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec.\n",
5668 + sysctl_speed_limit);
5669 + printk(KERN_INFO "md: using maximum available idle IO bandwith for reconstruction.\n");
5672 + * Resync has low priority.
5674 + current->priority = 1;
5676 + is_mddev_idle(mddev); /* this also initializes IO event counters */
5677 + starttime = jiffies;
5678 + mddev->resync_start = starttime;
5682 + * Tune reconstruction:
5684 + window = md_maxreadahead[mdidx(mddev)]/1024;
5685 + nr_blocks = window / (blocksize >> 10);
5686 + if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
5687 + nr_blocks = MAX_NR_BLOCKS;
5688 + printk(KERN_INFO "md: using %dk window.\n",window);
5690 - starttime=jiffies;
5691 - for (j = 0; j < max_blocks; j++) {
5692 + for (j = 0; j < max_blocks; j += nr_blocks) {
5695 + mddev->curr_resync = j;
5697 * B careful. When some1 mounts a non-'blocksize' filesystem
5698 * then we get the blocksize changed right under us. Go deal
5699 * with it transparently, recalculate 'blocksize', 'j' and
5702 - curr_bsize = blksize_size[major][minor];
5703 + curr_bsize = device_bsize(read_disk);
5704 if (curr_bsize != blocksize) {
5706 + printk(KERN_INFO "md%d: blocksize changed\n",
5709 if (curr_bsize > blocksize)
5711 * this is safe, rounds downwards.
5712 @@ -1111,114 +3442,384 @@
5713 j *= blocksize/curr_bsize;
5715 blocksize = curr_bsize;
5716 + nr_blocks = window / (blocksize >> 10);
5717 + if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
5718 + nr_blocks = MAX_NR_BLOCKS;
5719 max_blocks = blk_size[major][minor] / (blocksize >> 10);
5721 - if ((bh = breada (read_disk, j, blocksize, j * blocksize,
5722 - max_blocks * blocksize)) != NULL) {
5723 - mark_buffer_dirty(bh, 1);
5726 + printk("nr_blocks changed to %d (blocksize %d, j %d, max_blocks %d)\n",
5727 + nr_blocks, blocksize, j, max_blocks);
5729 - * FIXME: Ugly, but set_blocksize() isnt safe ...
5730 + * We will retry the current block-group
5732 - curr_bsize = blksize_size[major][minor];
5733 - if (curr_bsize != blocksize)
5734 - goto diff_blocksize;
5738 - * It's a real read problem. FIXME, handle this
5741 - printk ( KERN_ALERT
5742 - "read error, stopping reconstruction.\n");
5746 + * Cleanup routines expect this
5748 + for (k = 0; k < nr_blocks; k++)
5751 + chunk = nr_blocks;
5752 + if (chunk > max_blocks-j)
5753 + chunk = max_blocks-j;
5756 + * request buffer heads ...
5758 + for (i = 0; i < chunk; i++) {
5759 + bh[i] = getblk (read_disk, j+i, blocksize);
5762 + if (!buffer_dirty(bh[i]))
5763 + mark_buffer_lowprio(bh[i]);
5767 - * Let's sleep some if we are faster than our speed limit:
5768 + * read buffer heads ...
5770 - while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT)
5772 - current->state = TASK_INTERRUPTIBLE;
5773 - schedule_timeout(1);
5774 + ll_rw_block (READ, chunk, bh);
5775 + run_task_queue(&tq_disk);
5778 + * verify that all of them are OK ...
5780 + for (i = 0; i < chunk; i++) {
5782 + wait_on_buffer(bh[ii]);
5783 + if (!buffer_uptodate(bh[ii]))
5788 + for (i = 0; i < chunk; i++)
5789 + mark_buffer_dirty_lowprio(bh[i]);
5791 + ll_rw_block(WRITE, chunk, bh);
5792 + run_task_queue(&tq_disk);
5794 + for (i = 0; i < chunk; i++) {
5796 + wait_on_buffer(bh[ii]);
5798 + if (spare && disk_faulty(spare)) {
5799 + for (k = 0; k < chunk; k++)
5801 + printk(" <SPARE FAILED!>\n ");
5806 + if (!buffer_uptodate(bh[ii])) {
5807 + curr_bsize = device_bsize(read_disk);
5808 + if (curr_bsize != blocksize) {
5810 + "md%d: blocksize changed during write\n",
5812 + for (k = 0; k < chunk; k++)
5814 + if (buffer_lowprio(bh[k]))
5815 + mark_buffer_clean(bh[k]);
5820 + printk(" BAD WRITE %8d>\n", j);
5822 + * Ouch, write error, retry or bail out.
5824 + if (max_write_errors) {
5825 + max_write_errors--;
5826 + printk ( KERN_WARNING "md%d: write error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
5829 + printk ( KERN_ALERT
5830 + "too many write errors, stopping reconstruction.\n");
5831 + for (k = 0; k < chunk; k++)
5833 + if (buffer_lowprio(bh[k]))
5834 + mark_buffer_clean(bh[k]);
5843 - * FIXME: put this status bar thing into /proc
5844 + * This is the normal 'everything went OK' case
5845 + * do a 'free-behind' logic, we sure dont need
5846 + * this buffer if it was the only user.
5848 - if (!(j%(max_blocks/100))) {
5849 - if (!(percent%10))
5850 - printk (" %03d%% done.\n",percent);
5851 + for (i = 0; i < chunk; i++)
5852 + if (buffer_dirty(bh[i]))
5860 + if (md_signal_pending(current)) {
5862 + * got a signal, exit.
5864 + mddev->curr_resync = 0;
5865 + printk("md_do_sync() got signal ... exiting\n");
5866 + md_flush_signals();
5872 + * this loop exits only if either when we are slower than
5873 + * the 'hard' speed limit, or the system was IO-idle for
5875 + * the system might be non-idle CPU-wise, but we only care
5876 + * about not overloading the IO subsystem. (things like an
5877 + * e2fsck being done on the RAID array should execute fast)
5880 + if (md_need_resched(current))
5883 + if ((blocksize/1024)*j/((jiffies-starttime)/HZ + 1) + 1
5884 + > sysctl_speed_limit) {
5885 + current->priority = 1;
5887 + if (!is_mddev_idle(mddev)) {
5888 + current->state = TASK_INTERRUPTIBLE;
5889 + md_schedule_timeout(HZ/2);
5890 + if (!md_signal_pending(current))
5894 + current->priority = 40;
5896 fsync_dev(read_disk);
5897 - printk("md: %s: sync done.\n", kdevname(read_disk));
5900 + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
5903 + * this also signals 'finished resyncing' to md_stop
5906 + up(&mddev->resync_sem);
5908 + free_pages((unsigned long)bh, RA_ORDER);
5909 + mddev->curr_resync = 0;
5910 + wake_up(&resync_wait);
5915 + * set_blocksize() might change the blocksize. This
5916 + * should not happen often, but it happens when eg.
5917 + * someone mounts a filesystem that has non-1k
5918 + * blocksize. set_blocksize() doesnt touch our
5919 + * buffer, but to avoid aliasing problems we change
5920 + * our internal blocksize too and retry the read.
5922 + curr_bsize = device_bsize(read_disk);
5923 + if (curr_bsize != blocksize) {
5924 + printk(KERN_INFO "md%d: blocksize changed during read\n",
5926 + for (k = 0; k < chunk; k++)
5928 + if (buffer_lowprio(bh[k]))
5929 + mark_buffer_clean(bh[k]);
5936 + * It's a real read problem. We retry and bail out
5937 + * only if it's excessive.
5939 + if (max_read_errors) {
5940 + max_read_errors--;
5941 + printk ( KERN_WARNING "md%d: read error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
5942 + for (k = 0; k < chunk; k++)
5944 + if (buffer_lowprio(bh[k]))
5945 + mark_buffer_clean(bh[k]);
5950 + printk ( KERN_ALERT "too many read errors, stopping reconstruction.\n");
5951 + for (k = 0; k < chunk; k++)
5953 + if (buffer_lowprio(bh[k]))
5954 + mark_buffer_clean(bh[k]);
5961 +#undef MAX_NR_BLOCKS
5964 - * This is a kernel thread which: syncs a spare disk with the active array
5965 + * This is a kernel thread which syncs a spare disk with the active array
5967 * the amount of foolproofing might seem to be a tad excessive, but an
5968 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
5969 * of my root partition with the first 0.5 gigs of my /home partition ... so
5970 * i'm a bit nervous ;)
5972 -void mdsyncd (void *data)
5973 +void md_do_recovery (void *data)
5976 - struct md_dev *mddev;
5977 - md_superblock_t *sb;
5978 - md_descriptor_t *spare;
5982 + mdp_disk_t *spare;
5983 unsigned long flags;
5984 + struct md_list_head *tmp;
5986 - for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) {
5987 - if ((sb = mddev->sb) == NULL)
5988 + printk(KERN_INFO "md: recovery thread got woken up ...\n");
5990 + ITERATE_MDDEV(mddev,tmp) {
5994 + if (mddev->recovery_running)
5996 if (sb->active_disks == sb->raid_disks)
5998 - if (!sb->spare_disks)
5999 + if (!sb->spare_disks) {
6000 + printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
6004 + * now here we get the spare and resync it.
6006 if ((spare = get_spare(mddev)) == NULL)
6008 - if (!mddev->pers->mark_spare)
6009 + printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
6010 + if (!mddev->pers->diskop)
6012 - if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE))
6013 + if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
6015 - if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) {
6016 - mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE);
6017 + down(&mddev->recovery_sem);
6018 + mddev->recovery_running = 1;
6019 + err = md_do_sync(mddev, spare);
6020 + if (err == -EIO) {
6021 + printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
6022 + if (!disk_faulty(spare)) {
6023 + mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
6024 + mark_disk_faulty(spare);
6025 + mark_disk_nonsync(spare);
6026 + mark_disk_inactive(spare);
6027 + sb->spare_disks--;
6028 + sb->working_disks--;
6029 + sb->failed_disks++;
6032 + if (disk_faulty(spare))
6033 + mddev->pers->diskop(mddev, &spare,
6034 + DISKOP_SPARE_INACTIVE);
6035 + if (err == -EINTR) {
6037 + * Recovery got interrupted ...
6038 + * signal back that we have finished using the array.
6040 + mddev->pers->diskop(mddev, &spare,
6041 + DISKOP_SPARE_INACTIVE);
6042 + up(&mddev->recovery_sem);
6043 + mddev->recovery_running = 0;
6046 + mddev->recovery_running = 0;
6047 + up(&mddev->recovery_sem);
6051 - mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE);
6052 - spare->state |= (1 << MD_SYNC_DEVICE);
6053 - spare->state |= (1 << MD_ACTIVE_DEVICE);
6054 - sb->spare_disks--;
6055 - sb->active_disks++;
6056 - mddev->sb_dirty = 1;
6057 - md_update_sb(mddev - md_dev);
6058 + if (!disk_faulty(spare)) {
6060 + * the SPARE_ACTIVE diskop possibly changes the
6063 + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
6064 + mark_disk_sync(spare);
6065 + mark_disk_active(spare);
6066 + sb->active_disks++;
6067 + sb->spare_disks--;
6069 restore_flags(flags);
6070 + mddev->sb_dirty = 1;
6071 + md_update_sb(mddev);
6074 + printk(KERN_INFO "md: recovery thread finished ...\n");
6078 +int md_notify_reboot(struct notifier_block *this,
6079 + unsigned long code, void *x)
6081 + struct md_list_head *tmp;
6084 + if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
6085 + || (code == MD_SYS_POWER_OFF)) {
6087 + printk(KERN_INFO "stopping all md devices.\n");
6089 + ITERATE_MDDEV(mddev,tmp)
6090 + do_md_stop (mddev, 1);
6092 + * certain more exotic SCSI devices are known to be
6093 + * volatile wrt too early system reboots. While the
6094 + * right place to handle this issue is the given
6095 + * driver, we do want to have a safe RAID driver ...
6097 + md_mdelay(1000*1);
6099 + return NOTIFY_DONE;
6102 +struct notifier_block md_notifier = {
6108 +md__initfunc(void raid_setup(char *str, int *ints))
6110 + char tmpline[100];
6111 + int len, pos, nr, i;
6113 + len = strlen(str) + 1;
6117 + for (i = 0; i < len; i++) {
6120 + if (c == ',' || !c) {
6122 + if (!strcmp(tmpline,"noautodetect"))
6123 + raid_setup_args.noautodetect = 1;
6131 + raid_setup_args.set = 1;
6135 #ifdef CONFIG_MD_BOOT
6140 -} md_setup_args __initdata = {
6141 +} md_setup_args md__initdata = {
6145 /* called from init/main.c */
6146 -__initfunc(void md_setup(char *str,int *ints))
6147 +md__initfunc(void md_setup(char *str,int *ints))
6150 for(i=0;i<=ints[0];i++) {
6151 @@ -1230,21 +3831,24 @@
6155 -__initfunc(void do_md_setup(char *str,int *ints))
6156 +md__initfunc(void do_md_setup(char *str,int *ints))
6158 - int minor, pers, factor, fault;
6160 + int minor, pers, chunk_size, fault;
6164 + printk("i plan to phase this out --mingo\n");
6167 - printk ("md: Too few Arguments (%d).\n", ints[0]);
6168 + printk (KERN_WARNING "md: Too few Arguments (%d).\n", ints[0]);
6174 - if (minor >= MAX_MD_DEV) {
6175 - printk ("md: Minor device number too high.\n");
6176 + if ((unsigned int)minor >= MAX_MD_DEVS) {
6177 + printk (KERN_WARNING "md: Minor device number too high.\n");
6181 @@ -1254,18 +3858,20 @@
6183 #ifdef CONFIG_MD_LINEAR
6185 - printk ("md: Setting up md%d as linear device.\n",minor);
6186 + printk (KERN_INFO "md: Setting up md%d as linear device.\n",
6189 - printk ("md: Linear mode not configured."
6190 + printk (KERN_WARNING "md: Linear mode not configured."
6191 "Recompile the kernel with linear mode enabled!\n");
6196 #ifdef CONFIG_MD_STRIPED
6197 - printk ("md: Setting up md%d as a striped device.\n",minor);
6198 + printk (KERN_INFO "md: Setting up md%d as a striped device.\n",
6201 - printk ("md: Striped mode not configured."
6202 + printk (KERN_WARNING "md: Striped mode not configured."
6203 "Recompile the kernel with striped mode enabled!\n");
6206 @@ -1280,79 +3886,145 @@
6210 - printk ("md: Unknown or not supported raid level %d.\n", ints[--i]);
6211 + printk (KERN_WARNING "md: Unknown or not supported raid level %d.\n", ints[--i]);
6218 - factor=ints[i++]; /* Chunksize */
6219 - fault =ints[i++]; /* Faultlevel */
6220 + chunk_size = ints[i++]; /* Chunksize */
6221 + fault = ints[i++]; /* Faultlevel */
6223 - pers=pers | factor | (fault << FAULT_SHIFT);
6224 + pers = pers | chunk_size | (fault << FAULT_SHIFT);
6226 - while( str && (dev = name_to_kdev_t(str))) {
6227 - do_md_add (minor, dev);
6228 - if((str = strchr (str, ',')) != NULL)
6231 + while( str && (dev = name_to_kdev_t(str))) {
6232 + do_md_add (minor, dev);
6233 + if((str = strchr (str, ',')) != NULL)
6237 - do_md_run (minor, pers);
6238 - printk ("md: Loading md%d.\n",minor);
6239 + do_md_run (minor, pers);
6240 + printk (KERN_INFO "md: Loading md%d.\n",minor);
6247 +void hsm_init (void);
6248 +void translucent_init (void);
6249 void linear_init (void);
6250 void raid0_init (void);
6251 void raid1_init (void);
6252 void raid5_init (void);
6254 -__initfunc(int md_init (void))
6255 +md__initfunc(int md_init (void))
6257 - printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
6258 - MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION,
6259 - MAX_MD_DEV, MAX_REAL);
6261 - if (register_blkdev (MD_MAJOR, "md", &md_fops))
6263 - printk ("Unable to get major %d for md\n", MD_MAJOR);
6267 - blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST;
6268 - blk_dev[MD_MAJOR].current_request=NULL;
6269 - read_ahead[MD_MAJOR]=INT_MAX;
6270 - memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev));
6271 - md_gendisk.next=gendisk_head;
6273 - gendisk_head=&md_gendisk;
6275 -#if SUPPORT_RECONSTRUCTION
6276 - if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL)
6277 - printk("md: bug: md_sync_thread == NULL\n");
6278 -#endif /* SUPPORT_RECONSTRUCTION */
6279 + static char * name = "mdrecoveryd";
6281 + printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n",
6282 + MD_MAJOR_VERSION, MD_MINOR_VERSION,
6283 + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL);
6285 + if (register_blkdev (MD_MAJOR, "md", &md_fops))
6287 + printk (KERN_ALERT "Unable to get major %d for md\n", MD_MAJOR);
6291 + blk_dev[MD_MAJOR].request_fn = DEVICE_REQUEST;
6292 + blk_dev[MD_MAJOR].current_request = NULL;
6293 + read_ahead[MD_MAJOR] = INT_MAX;
6294 + md_gendisk.next = gendisk_head;
6296 + gendisk_head = &md_gendisk;
6298 + md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
6299 + if (!md_recovery_thread)
6300 + printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n");
6302 + md_register_reboot_notifier(&md_notifier);
6303 + md_register_sysctl();
6305 +#ifdef CONFIG_MD_HSM
6308 +#ifdef CONFIG_MD_TRANSLUCENT
6309 + translucent_init ();
6311 #ifdef CONFIG_MD_LINEAR
6315 #ifdef CONFIG_MD_STRIPED
6319 #ifdef CONFIG_MD_MIRRORING
6323 #ifdef CONFIG_MD_RAID5
6327 +#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE)
6329 +	 * pick an XOR routine at runtime.
6331 + calibrate_xor_block();
6338 #ifdef CONFIG_MD_BOOT
6339 -__initfunc(void md_setup_drive(void))
6340 +md__initfunc(void md_setup_drive(void))
6342 if(md_setup_args.set)
6343 do_md_setup(md_setup_args.str, md_setup_args.ints);
6347 +MD_EXPORT_SYMBOL(md_size);
6348 +MD_EXPORT_SYMBOL(register_md_personality);
6349 +MD_EXPORT_SYMBOL(unregister_md_personality);
6350 +MD_EXPORT_SYMBOL(partition_name);
6351 +MD_EXPORT_SYMBOL(md_error);
6352 +MD_EXPORT_SYMBOL(md_recover_arrays);
6353 +MD_EXPORT_SYMBOL(md_register_thread);
6354 +MD_EXPORT_SYMBOL(md_unregister_thread);
6355 +MD_EXPORT_SYMBOL(md_update_sb);
6356 +MD_EXPORT_SYMBOL(md_map);
6357 +MD_EXPORT_SYMBOL(md_wakeup_thread);
6358 +MD_EXPORT_SYMBOL(md_do_sync);
6359 +MD_EXPORT_SYMBOL(md_print_devices);
6360 +MD_EXPORT_SYMBOL(find_rdev_nr);
6361 +MD_EXPORT_SYMBOL(md_check_ordering);
6362 +MD_EXPORT_SYMBOL(md_interrupt_thread);
6363 +MD_EXPORT_SYMBOL(mddev_map);
6365 +#ifdef CONFIG_PROC_FS
6366 +static struct proc_dir_entry proc_md = {
6367 + PROC_MD, 6, "mdstat",
6368 + S_IFREG | S_IRUGO, 1, 0, 0,
6369 + 0, &proc_array_inode_operations,
6373 +static void md_geninit (struct gendisk *gdisk)
6377 + for(i = 0; i < MAX_MD_DEVS; i++) {
6378 + md_blocksizes[i] = 1024;
6379 + md_maxreadahead[i] = MD_READAHEAD;
6380 + md_gendisk.part[i].start_sect = -1; /* avoid partition check */
6381 + md_gendisk.part[i].nr_sects = 0;
6384 + printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
6386 + blksize_size[MD_MAJOR] = md_blocksizes;
6387 + md_set_global_readahead(md_maxreadahead);
6389 +#ifdef CONFIG_PROC_FS
6390 + proc_register(&proc_root, &proc_md);
6394 diff -ruN linux.orig/drivers/block/raid0.c linux-2.2.16/drivers/block/raid0.c
6395 --- linux.orig/drivers/block/raid0.c Tue Jan 4 19:12:14 2000
6396 +++ linux-2.2.16/drivers/block/raid0.c Fri Jun 9 11:37:45 2000
6400 raid0.c : Multiple Devices driver for Linux
6401 Copyright (C) 1994-96 Marc ZYNGIER
6402 @@ -18,146 +17,201 @@
6405 #include <linux/module.h>
6406 -#include <linux/md.h>
6407 -#include <linux/raid0.h>
6408 -#include <linux/vmalloc.h>
6409 +#include <linux/raid/raid0.h>
6411 #define MAJOR_NR MD_MAJOR
6413 #define MD_PERSONALITY
6415 -static int create_strip_zones (int minor, struct md_dev *mddev)
6416 +static int create_strip_zones (mddev_t *mddev)
6419 - int current_offset=0;
6420 - struct real_dev *smallest_by_zone;
6421 - struct raid0_data *data=(struct raid0_data *) mddev->private;
6423 - data->nr_strip_zones=1;
6425 - for (i=1; i<mddev->nb_dev; i++)
6427 - for (j=0; j<i; j++)
6428 - if (mddev->devices[i].size==mddev->devices[j].size)
6435 - data->nr_strip_zones++;
6440 - if ((data->strip_zone=vmalloc(sizeof(struct strip_zone)*data->nr_strip_zones)) == NULL)
6443 - data->smallest=NULL;
6445 - for (i=0; i<data->nr_strip_zones; i++)
6447 - data->strip_zone[i].dev_offset=current_offset;
6448 - smallest_by_zone=NULL;
6451 - for (j=0; j<mddev->nb_dev; j++)
6452 - if (mddev->devices[j].size>current_offset)
6454 - data->strip_zone[i].dev[c++]=mddev->devices+j;
6455 - if (!smallest_by_zone ||
6456 - smallest_by_zone->size > mddev->devices[j].size)
6457 - smallest_by_zone=mddev->devices+j;
6460 - data->strip_zone[i].nb_dev=c;
6461 - data->strip_zone[i].size=(smallest_by_zone->size-current_offset)*c;
6463 - if (!data->smallest ||
6464 - data->smallest->size > data->strip_zone[i].size)
6465 - data->smallest=data->strip_zone+i;
6467 - data->strip_zone[i].zone_offset=i ? (data->strip_zone[i-1].zone_offset+
6468 - data->strip_zone[i-1].size) : 0;
6469 - current_offset=smallest_by_zone->size;
6472 + int i, c, j, j1, j2;
6473 + int current_offset, curr_zone_offset;
6474 + raid0_conf_t *conf = mddev_to_conf(mddev);
6475 + mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
6478 + * The number of 'same size groups'
6480 + conf->nr_strip_zones = 0;
6482 + ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
6483 + printk("raid0: looking at %s\n", partition_name(rdev1->dev));
6485 + ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
6486 + printk("raid0: comparing %s(%d) with %s(%d)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size);
6487 + if (rdev2 == rdev1) {
6488 + printk("raid0: END\n");
6491 + if (rdev2->size == rdev1->size)
6494 +				 * Not unique, don't count it as a new
6497 + printk("raid0: EQUAL\n");
6501 + printk("raid0: NOT EQUAL\n");
6504 + printk("raid0: ==> UNIQUE\n");
6505 + conf->nr_strip_zones++;
6506 + printk("raid0: %d zones\n", conf->nr_strip_zones);
6509 + printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
6511 + conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
6512 + conf->nr_strip_zones);
6513 + if (!conf->strip_zone)
6517 + conf->smallest = NULL;
6518 + current_offset = 0;
6519 + curr_zone_offset = 0;
6521 + for (i = 0; i < conf->nr_strip_zones; i++)
6523 + struct strip_zone *zone = conf->strip_zone + i;
6525 + printk("zone %d\n", i);
6526 + zone->dev_offset = current_offset;
6530 + ITERATE_RDEV_ORDERED(mddev,rdev,j) {
6532 + printk(" checking %s ...", partition_name(rdev->dev));
6533 + if (rdev->size > current_offset)
6535 + printk(" contained as device %d\n", c);
6536 + zone->dev[c] = rdev;
6538 + if (!smallest || (rdev->size <smallest->size)) {
6540 + printk(" (%d) is smallest!.\n", rdev->size);
6543 + printk(" nope.\n");
6547 + zone->size = (smallest->size - current_offset) * c;
6548 + printk(" zone->nb_dev: %d, size: %d\n",zone->nb_dev,zone->size);
6550 + if (!conf->smallest || (zone->size < conf->smallest->size))
6551 + conf->smallest = zone;
6553 + zone->zone_offset = curr_zone_offset;
6554 + curr_zone_offset += zone->size;
6556 + current_offset = smallest->size;
6557 + printk("current zone offset: %d\n", current_offset);
6559 + printk("done.\n");
6563 -static int raid0_run (int minor, struct md_dev *mddev)
6564 +static int raid0_run (mddev_t *mddev)
6566 - int cur=0, i=0, size, zone0_size, nb_zone;
6567 - struct raid0_data *data;
6569 - MOD_INC_USE_COUNT;
6570 + int cur=0, i=0, size, zone0_size, nb_zone;
6571 + raid0_conf_t *conf;
6573 - if ((mddev->private=vmalloc (sizeof (struct raid0_data))) == NULL) return 1;
6574 - data=(struct raid0_data *) mddev->private;
6576 - if (create_strip_zones (minor, mddev))
6582 - nb_zone=data->nr_zones=
6583 - md_size[minor]/data->smallest->size +
6584 - (md_size[minor]%data->smallest->size ? 1 : 0);
6586 - printk ("raid0 : Allocating %ld bytes for hash.\n",(long)sizeof(struct raid0_hash)*nb_zone);
6587 - if ((data->hash_table=vmalloc (sizeof (struct raid0_hash)*nb_zone)) == NULL)
6589 - vfree(data->strip_zone);
6593 - size=data->strip_zone[cur].size;
6596 - while (cur<data->nr_strip_zones)
6598 - data->hash_table[i].zone0=data->strip_zone+cur;
6600 - if (size>=data->smallest->size)/* If we completely fill the slot */
6602 - data->hash_table[i++].zone1=NULL;
6603 - size-=data->smallest->size;
6607 - if (++cur==data->nr_strip_zones) continue;
6608 - size=data->strip_zone[cur].size;
6614 - if (++cur==data->nr_strip_zones) /* Last dev, set unit1 as NULL */
6616 - data->hash_table[i].zone1=NULL;
6620 - zone0_size=size; /* Here, we use a 2nd dev to fill the slot */
6621 - size=data->strip_zone[cur].size;
6622 - data->hash_table[i++].zone1=data->strip_zone+cur;
6623 - size-=(data->smallest->size - zone0_size);
6625 + MOD_INC_USE_COUNT;
6628 + conf = vmalloc(sizeof (raid0_conf_t));
6631 + mddev->private = (void *)conf;
6633 + if (md_check_ordering(mddev)) {
6634 + printk("raid0: disks are not ordered, aborting!\n");
6635 + goto out_free_conf;
6638 + if (create_strip_zones (mddev))
6639 + goto out_free_conf;
6641 + printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]);
6642 + printk("raid0 : conf->smallest->size is %d blocks.\n", conf->smallest->size);
6643 + nb_zone = md_size[mdidx(mddev)]/conf->smallest->size +
6644 + (md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
6645 + printk("raid0 : nb_zone is %d.\n", nb_zone);
6646 + conf->nr_zones = nb_zone;
6648 + printk("raid0 : Allocating %d bytes for hash.\n",
6649 + sizeof(struct raid0_hash)*nb_zone);
6651 + conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
6652 + if (!conf->hash_table)
6653 + goto out_free_zone_conf;
6654 + size = conf->strip_zone[cur].size;
6657 + while (cur < conf->nr_strip_zones) {
6658 + conf->hash_table[i].zone0 = conf->strip_zone + cur;
6661 + * If we completely fill the slot
6663 + if (size >= conf->smallest->size) {
6664 + conf->hash_table[i++].zone1 = NULL;
6665 + size -= conf->smallest->size;
6668 + if (++cur == conf->nr_strip_zones)
6670 + size = conf->strip_zone[cur].size;
6674 + if (++cur == conf->nr_strip_zones) {
6676 + * Last dev, set unit1 as NULL
6678 + conf->hash_table[i].zone1=NULL;
6683 + * Here we use a 2nd dev to fill the slot
6685 + zone0_size = size;
6686 + size = conf->strip_zone[cur].size;
6687 + conf->hash_table[i++].zone1 = conf->strip_zone + cur;
6688 + size -= (conf->smallest->size - zone0_size);
6692 +out_free_zone_conf:
6693 + vfree(conf->strip_zone);
6694 + conf->strip_zone = NULL;
6698 + mddev->private = NULL;
6700 + MOD_DEC_USE_COUNT;
6705 -static int raid0_stop (int minor, struct md_dev *mddev)
6706 +static int raid0_stop (mddev_t *mddev)
6708 - struct raid0_data *data=(struct raid0_data *) mddev->private;
6709 + raid0_conf_t *conf = mddev_to_conf(mddev);
6711 - vfree (data->hash_table);
6712 - vfree (data->strip_zone);
6714 + vfree (conf->hash_table);
6715 + conf->hash_table = NULL;
6716 + vfree (conf->strip_zone);
6717 + conf->strip_zone = NULL;
6719 + mddev->private = NULL;
6721 - MOD_DEC_USE_COUNT;
6723 + MOD_DEC_USE_COUNT;
6728 @@ -167,135 +221,140 @@
6729 * Of course, those facts may not be valid anymore (and surely won't...)
6730 * Hey guys, there's some work out there ;-)
6732 -static int raid0_map (struct md_dev *mddev, kdev_t *rdev,
6733 +static int raid0_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
6734 unsigned long *rsector, unsigned long size)
6736 - struct raid0_data *data=(struct raid0_data *) mddev->private;
6737 - static struct raid0_hash *hash;
6738 - struct strip_zone *zone;
6739 - struct real_dev *tmp_dev;
6740 - int blk_in_chunk, factor, chunk, chunk_size;
6741 - long block, rblock;
6743 - factor=FACTOR(mddev);
6744 - chunk_size=(1UL << FACTOR_SHIFT(factor));
6745 - block=*rsector >> 1;
6746 - hash=data->hash_table+(block/data->smallest->size);
6748 - if (hash - data->hash_table > data->nr_zones)
6750 - printk(KERN_DEBUG "raid0_map: invalid block %ul\n", block);
6754 - /* Sanity check */
6755 - if ((chunk_size*2)<(*rsector % (chunk_size*2))+size)
6757 - printk ("raid0_convert : can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
6761 - if (block >= (hash->zone0->size +
6762 - hash->zone0->zone_offset))
6766 - printk ("raid0_convert : hash->zone1==NULL for block %ld\n", block);
6774 + raid0_conf_t *conf = mddev_to_conf(mddev);
6775 + struct raid0_hash *hash;
6776 + struct strip_zone *zone;
6777 + mdk_rdev_t *tmp_dev;
6778 + int blk_in_chunk, chunksize_bits, chunk, chunk_size;
6779 + long block, rblock;
6781 + chunk_size = mddev->param.chunk_size >> 10;
6782 + chunksize_bits = ffz(~chunk_size);
6783 + block = *rsector >> 1;
6784 + hash = conf->hash_table + block / conf->smallest->size;
6786 + if (hash - conf->hash_table > conf->nr_zones) {
6787 +		printk(KERN_DEBUG "raid0_map: invalid block %ld\n", block);
6791 + /* Sanity check */
6792 + if ((chunk_size * 2) < (*rsector % (chunk_size * 2)) + size)
6801 + if (block >= (hash->zone0->size + hash->zone0->zone_offset)) {
6804 + zone = hash->zone1;
6806 + zone = hash->zone0;
6808 - blk_in_chunk=block & (chunk_size -1);
6809 - chunk=(block - zone->zone_offset) / (zone->nb_dev<<FACTOR_SHIFT(factor));
6810 - tmp_dev=zone->dev[(block >> FACTOR_SHIFT(factor)) % zone->nb_dev];
6811 - rblock=(chunk << FACTOR_SHIFT(factor)) + blk_in_chunk + zone->dev_offset;
6812 + blk_in_chunk = block & (chunk_size -1);
6813 + chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
6814 + tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev];
6815 + rblock = (chunk << chunksize_bits) + blk_in_chunk + zone->dev_offset;
6817 - *rdev=tmp_dev->dev;
6818 - *rsector=rblock<<1;
6819 + *rdev = tmp_dev->dev;
6820 + *rsector = rblock << 1;
6826 + printk ("raid0_map bug: can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
6829 + printk("raid0_map bug: hash==NULL for block %ld\n", block);
6832 + printk ("raid0_map bug: hash->zone0==NULL for block %ld\n", block);
6835 + printk ("raid0_map bug: hash->zone1==NULL for block %ld\n", block);
6840 -static int raid0_status (char *page, int minor, struct md_dev *mddev)
6841 +static int raid0_status (char *page, mddev_t *mddev)
6848 - struct raid0_data *data=(struct raid0_data *) mddev->private;
6850 + raid0_conf_t *conf = mddev_to_conf(mddev);
6852 - sz+=sprintf (page+sz, " ");
6853 - for (j=0; j<data->nr_zones; j++)
6855 - sz+=sprintf (page+sz, "[z%d",
6856 - data->hash_table[j].zone0-data->strip_zone);
6857 - if (data->hash_table[j].zone1)
6858 - sz+=sprintf (page+sz, "/z%d] ",
6859 - data->hash_table[j].zone1-data->strip_zone);
6861 - sz+=sprintf (page+sz, "] ");
6863 + sz += sprintf(page + sz, " ");
6864 + for (j = 0; j < conf->nr_zones; j++) {
6865 + sz += sprintf(page + sz, "[z%d",
6866 + conf->hash_table[j].zone0 - conf->strip_zone);
6867 + if (conf->hash_table[j].zone1)
6868 + sz += sprintf(page+sz, "/z%d] ",
6869 + conf->hash_table[j].zone1 - conf->strip_zone);
6871 + sz += sprintf(page+sz, "] ");
6874 - sz+=sprintf (page+sz, "\n");
6875 + sz += sprintf(page + sz, "\n");
6877 - for (j=0; j<data->nr_strip_zones; j++)
6879 - sz+=sprintf (page+sz, " z%d=[", j);
6880 - for (k=0; k<data->strip_zone[j].nb_dev; k++)
6881 - sz+=sprintf (page+sz, "%s/",
6882 - partition_name(data->strip_zone[j].dev[k]->dev));
6884 - sz+=sprintf (page+sz, "] zo=%d do=%d s=%d\n",
6885 - data->strip_zone[j].zone_offset,
6886 - data->strip_zone[j].dev_offset,
6887 - data->strip_zone[j].size);
6889 + for (j = 0; j < conf->nr_strip_zones; j++) {
6890 + sz += sprintf(page + sz, " z%d=[", j);
6891 + for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
6892 + sz += sprintf (page+sz, "%s/", partition_name(
6893 + conf->strip_zone[j].dev[k]->dev));
6895 + sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
6896 + conf->strip_zone[j].zone_offset,
6897 + conf->strip_zone[j].dev_offset,
6898 + conf->strip_zone[j].size);
6901 - sz+=sprintf (page+sz, " %dk chunks", 1<<FACTOR_SHIFT(FACTOR(mddev)));
6903 + sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
6908 -static struct md_personality raid0_personality=
6909 +static mdk_personality_t raid0_personality=
6913 - NULL, /* no special make_request */
6914 - NULL, /* no special end_request */
6918 - NULL, /* no ioctls */
6920 - NULL, /* no error_handler */
6921 - NULL, /* hot_add_disk */
6922 - NULL, /* hot_remove_disk */
6923 - NULL /* mark_spare */
6926 + NULL, /* no special make_request */
6927 + NULL, /* no special end_request */
6931 + NULL, /* no ioctls */
6933 + NULL, /* no error_handler */
6934 + NULL, /* no diskop */
6935 + NULL, /* no stop resync */
6936 + NULL /* no restart resync */
6942 void raid0_init (void)
6944 - register_md_personality (RAID0, &raid0_personality);
6945 + register_md_personality (RAID0, &raid0_personality);
6950 int init_module (void)
6952 - return (register_md_personality (RAID0, &raid0_personality));
6953 + return (register_md_personality (RAID0, &raid0_personality));
6956 void cleanup_module (void)
6958 - unregister_md_personality (RAID0);
6959 + unregister_md_personality (RAID0);
6964 diff -ruN linux.orig/drivers/block/raid1.c linux-2.2.16/drivers/block/raid1.c
6965 --- linux.orig/drivers/block/raid1.c Thu May 4 02:16:33 2000
6966 +++ linux-2.2.16/drivers/block/raid1.c Fri Jun 9 11:37:45 2000
6968 -/************************************************************************
6970 * raid1.c : Multiple Devices driver for Linux
6971 - * Copyright (C) 1996 Ingo Molnar, Miguel de Icaza, Gadi Oxman
6972 + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
6974 * RAID-1 management functions.
6979 #include <linux/module.h>
6980 -#include <linux/locks.h>
6981 #include <linux/malloc.h>
6982 -#include <linux/md.h>
6983 -#include <linux/raid1.h>
6984 -#include <asm/bitops.h>
6985 +#include <linux/raid/raid1.h>
6986 #include <asm/atomic.h>
6988 #define MAJOR_NR MD_MAJOR
6990 #define MD_PERSONALITY
6993 - * The following can be used to debug the driver
6995 -/*#define RAID1_DEBUG*/
6997 -#define PRINTK(x) do { printk x; } while (0);
6999 -#define PRINTK(x) do { ; } while (0);
7001 +#define MAX_LINEAR_SECTORS 128
7003 #define MAX(a,b) ((a) > (b) ? (a) : (b))
7004 #define MIN(a,b) ((a) < (b) ? (a) : (b))
7006 -static struct md_personality raid1_personality;
7007 -static struct md_thread *raid1_thread = NULL;
7008 +static mdk_personality_t raid1_personality;
7009 struct buffer_head *raid1_retry_list = NULL;
7011 -static int __raid1_map (struct md_dev *mddev, kdev_t *rdev,
7012 +static void * raid1_kmalloc (int size)
7016 +	 * now we favor fault tolerance over niceness, but
7017 + * there are a couple of places in the RAID code where we
7018 + * simply can not afford to fail an allocation because
7019 + * there is no failure return path (eg. make_request())
7021 +	while (!(ptr = kmalloc (size, GFP_KERNEL)))
7022 + printk ("raid1: out of memory, retrying...\n");
7024 + memset(ptr, 0, size);
7028 +static int __raid1_map (mddev_t *mddev, kdev_t *rdev,
7029 unsigned long *rsector, unsigned long size)
7031 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
7032 - int i, n = raid_conf->raid_disks;
7033 + raid1_conf_t *conf = mddev_to_conf(mddev);
7034 + int i, disks = MD_SB_DISKS;
7037 * Later we do read balancing on the read side
7038 * now we use the first available disk.
7041 - PRINTK(("raid1_map().\n"));
7043 - for (i=0; i<n; i++) {
7044 - if (raid_conf->mirrors[i].operational) {
7045 - *rdev = raid_conf->mirrors[i].dev;
7046 + for (i = 0; i < disks; i++) {
7047 + if (conf->mirrors[i].operational) {
7048 + *rdev = conf->mirrors[i].dev;
7056 -static int raid1_map (struct md_dev *mddev, kdev_t *rdev,
7057 +static int raid1_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
7058 unsigned long *rsector, unsigned long size)
7063 -void raid1_reschedule_retry (struct buffer_head *bh)
7064 +static void raid1_reschedule_retry (struct buffer_head *bh)
7066 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
7068 - PRINTK(("raid1_reschedule_retry().\n"));
7069 + mddev_t *mddev = r1_bh->mddev;
7070 + raid1_conf_t *conf = mddev_to_conf(mddev);
7072 r1_bh->next_retry = raid1_retry_list;
7073 raid1_retry_list = bh;
7074 - md_wakeup_thread(raid1_thread);
7075 + md_wakeup_thread(conf->thread);
7079 - * raid1_end_buffer_io() is called when we have finished servicing a mirrored
7080 + * raid1_end_bh_io() is called when we have finished servicing a mirrored
7081 * operation and are ready to return a success/failure code to the buffer
7084 -static inline void raid1_end_buffer_io(struct raid1_bh *r1_bh, int uptodate)
7085 +static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
7087 struct buffer_head *bh = r1_bh->master_bh;
7093 -int raid1_one_error=0;
7095 void raid1_end_request (struct buffer_head *bh, int uptodate)
7097 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
7098 @@ -106,12 +106,7 @@
7102 - PRINTK(("raid1_end_request().\n"));
7104 - if (raid1_one_error) {
7105 - raid1_one_error=0;
7109 * this branch is our 'one mirror IO has finished' event handler:
7111 @@ -136,15 +131,11 @@
7114 if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
7116 - PRINTK(("raid1_end_request(), read branch.\n"));
7119 * we have only one buffer_head on the read side
7122 - PRINTK(("raid1_end_request(), read branch, uptodate.\n"));
7123 - raid1_end_buffer_io(r1_bh, uptodate);
7124 + raid1_end_bh_io(r1_bh, uptodate);
7125 restore_flags(flags);
7128 @@ -152,71 +143,56 @@
7131 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
7132 - kdevname(bh->b_dev), bh->b_blocknr);
7133 - raid1_reschedule_retry (bh);
7134 + partition_name(bh->b_dev), bh->b_blocknr);
7135 + raid1_reschedule_retry(bh);
7136 restore_flags(flags);
7141 - * WRITE or WRITEA.
7143 - PRINTK(("raid1_end_request(), write branch.\n"));
7148 * Let's see if all mirrored write operations have finished
7149 - * already [we have irqs off, so we can decrease]:
7153 - if (!--r1_bh->remaining) {
7154 - struct md_dev *mddev = r1_bh->mddev;
7155 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
7156 - int i, n = raid_conf->raid_disks;
7157 + if (atomic_dec_and_test(&r1_bh->remaining)) {
7158 + int i, disks = MD_SB_DISKS;
7160 - PRINTK(("raid1_end_request(), remaining == 0.\n"));
7161 + for ( i = 0; i < disks; i++)
7162 + if (r1_bh->mirror_bh[i])
7163 + kfree(r1_bh->mirror_bh[i]);
7165 - for ( i=0; i<n; i++)
7166 - if (r1_bh->mirror_bh[i]) kfree(r1_bh->mirror_bh[i]);
7168 - raid1_end_buffer_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
7169 + raid1_end_bh_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
7171 - else PRINTK(("raid1_end_request(), remaining == %u.\n", r1_bh->remaining));
7172 restore_flags(flags);
7175 -/* This routine checks if the undelying device is an md device and in that
7176 - * case it maps the blocks before putting the request on the queue
6178 + * This routine checks if the underlying device is an md device
7179 + * and in that case it maps the blocks before putting the
7180 + * request on the queue
7183 -map_and_make_request (int rw, struct buffer_head *bh)
7184 +static void map_and_make_request (int rw, struct buffer_head *bh)
7186 if (MAJOR (bh->b_rdev) == MD_MAJOR)
7187 - md_map (MINOR (bh->b_rdev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
7188 + md_map (bh->b_rdev, &bh->b_rdev,
7189 + &bh->b_rsector, bh->b_size >> 9);
7190 clear_bit(BH_Lock, &bh->b_state);
7191 make_request (MAJOR (bh->b_rdev), rw, bh);
7195 -raid1_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
7196 +static int raid1_make_request (mddev_t *mddev, int rw,
7197 + struct buffer_head * bh)
7200 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
7201 + raid1_conf_t *conf = mddev_to_conf(mddev);
7202 struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req;
7203 struct raid1_bh * r1_bh;
7204 - int n = raid_conf->raid_disks, i, sum_bhs = 0, switch_disks = 0, sectors;
7205 + int disks = MD_SB_DISKS;
7206 + int i, sum_bhs = 0, switch_disks = 0, sectors, lowprio = 0;
7207 struct mirror_info *mirror;
7209 - PRINTK(("raid1_make_request().\n"));
7211 - while (!( /* FIXME: now we are rather fault tolerant than nice */
7212 - r1_bh = kmalloc (sizeof (struct raid1_bh), GFP_KERNEL)
7215 - printk ("raid1_make_request(#1): out of memory\n");
7216 - current->policy |= SCHED_YIELD;
7219 - memset (r1_bh, 0, sizeof (struct raid1_bh));
7220 + r1_bh = raid1_kmalloc (sizeof (struct raid1_bh));
7223 * make_request() can abort the operation when READA or WRITEA are being
7224 @@ -227,43 +203,65 @@
7225 if (rw == READA) rw = READ;
7226 if (rw == WRITEA) rw = WRITE;
7228 - if (rw == WRITE || rw == WRITEA)
7229 - mark_buffer_clean(bh); /* Too early ? */
7230 + if (rw == WRITE) {
7234 + mark_buffer_clean(bh);
7236 + * not too early. we _first_ clean the bh, then we start
7237 + * the IO, then when the IO has finished, we unlock the
7238 + * bh and mark it uptodate. This way we do not miss the
7239 + * case when the bh got dirty again during the IO.
7244 + * special flag for 'lowprio' reconstruction requests ...
7246 + if (buffer_lowprio(bh))
7250 - * i think the read and write branch should be separated completely, since we want
7251 - * to do read balancing on the read side for example. Comments? :) --mingo
7252 + * i think the read and write branch should be separated completely,
7253 + * since we want to do read balancing on the read side for example.
7254 + * Comments? :) --mingo
7257 r1_bh->master_bh=bh;
7261 - if (rw==READ || rw==READA) {
7262 - int last_used = raid_conf->last_used;
7263 - PRINTK(("raid1_make_request(), read branch.\n"));
7264 - mirror = raid_conf->mirrors + last_used;
7266 + int last_used = conf->last_used;
7269 + * read balancing logic:
7271 + mirror = conf->mirrors + last_used;
7272 bh->b_rdev = mirror->dev;
7273 sectors = bh->b_size >> 9;
7274 - if (bh->b_blocknr * sectors == raid_conf->next_sect) {
7275 - raid_conf->sect_count += sectors;
7276 - if (raid_conf->sect_count >= mirror->sect_limit)
7278 + if (bh->b_blocknr * sectors == conf->next_sect) {
7279 + conf->sect_count += sectors;
7280 + if (conf->sect_count >= mirror->sect_limit)
7284 - raid_conf->next_sect = (bh->b_blocknr + 1) * sectors;
7285 - if (switch_disks) {
7286 - PRINTK(("read-balancing: switching %d -> %d (%d sectors)\n", last_used, mirror->next, raid_conf->sect_count));
7287 - raid_conf->sect_count = 0;
7288 - last_used = raid_conf->last_used = mirror->next;
7289 + conf->next_sect = (bh->b_blocknr + 1) * sectors;
7291 + * Do not switch disks if full resync is in progress ...
7293 + if (switch_disks && !conf->resync_mirrors) {
7294 + conf->sect_count = 0;
7295 + last_used = conf->last_used = mirror->next;
7297 - * Do not switch to write-only disks ... resyncing
7299 + * Do not switch to write-only disks ...
7300 + * reconstruction is in progress
7302 - while (raid_conf->mirrors[last_used].write_only)
7303 - raid_conf->last_used = raid_conf->mirrors[last_used].next;
7304 + while (conf->mirrors[last_used].write_only)
7305 + conf->last_used = conf->mirrors[last_used].next;
7307 - PRINTK (("raid1 read queue: %d %d\n", MAJOR (bh->b_rdev), MINOR (bh->b_rdev)));
7308 bh_req = &r1_bh->bh_req;
7309 memcpy(bh_req, bh, sizeof(*bh));
7310 bh_req->b_end_io = raid1_end_request;
7311 @@ -273,13 +271,12 @@
7315 - * WRITE or WRITEA.
7318 - PRINTK(("raid1_make_request(n=%d), write branch.\n",n));
7320 - for (i = 0; i < n; i++) {
7321 + for (i = 0; i < disks; i++) {
7323 - if (!raid_conf->mirrors [i].operational) {
7324 + if (!conf->mirrors[i].operational) {
7326 * the r1_bh->mirror_bh[i] pointer remains NULL
7328 @@ -287,89 +284,91 @@
7333 + * special case for reconstruction ...
7335 + if (lowprio && (i == conf->last_used)) {
7336 + mirror_bh[i] = NULL;
7341 + * We should use a private pool (size depending on NR_REQUEST),
7342 + * to avoid writes filling up the memory with bhs
7344 +		 * Such pools are much faster than kmalloc anyway (so we waste
7345 + * almost nothing by not using the master bh when writing and
7346 +		 * win a lot of cleanness) but for now we are cool enough. --mingo
7348 + * It's safe to sleep here, buffer heads cannot be used in a shared
7349 + * manner in the write branch. Look how we lock the buffer at the
7350 + * beginning of this function to grok the difference ;)
7352 + mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head));
7354 + * prepare mirrored bh (fields ordered for max mem throughput):
7356 + mirror_bh[i]->b_blocknr = bh->b_blocknr;
7357 + mirror_bh[i]->b_dev = bh->b_dev;
7358 + mirror_bh[i]->b_rdev = conf->mirrors[i].dev;
7359 + mirror_bh[i]->b_rsector = bh->b_rsector;
7360 + mirror_bh[i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
7362 + mirror_bh[i]->b_state |= (1<<BH_LowPrio);
7364 + mirror_bh[i]->b_count = 1;
7365 + mirror_bh[i]->b_size = bh->b_size;
7366 + mirror_bh[i]->b_data = bh->b_data;
7367 + mirror_bh[i]->b_list = BUF_LOCKED;
7368 + mirror_bh[i]->b_end_io = raid1_end_request;
7369 + mirror_bh[i]->b_dev_id = r1_bh;
7371 + r1_bh->mirror_bh[i] = mirror_bh[i];
7375 + md_atomic_set(&r1_bh->remaining, sum_bhs);
7378 - * We should use a private pool (size depending on NR_REQUEST),
7379 - * to avoid writes filling up the memory with bhs
7381 - * Such pools are much faster than kmalloc anyways (so we waste almost
7382 - * nothing by not using the master bh when writing and win alot of cleanness)
7384 - * but for now we are cool enough. --mingo
7386 - * It's safe to sleep here, buffer heads cannot be used in a shared
7387 - * manner in the write branch. Look how we lock the buffer at the beginning
7388 - * of this function to grok the difference ;)
7390 - while (!( /* FIXME: now we are rather fault tolerant than nice */
7391 - mirror_bh[i] = kmalloc (sizeof (struct buffer_head), GFP_KERNEL)
7394 - printk ("raid1_make_request(#2): out of memory\n");
7395 - current->policy |= SCHED_YIELD;
7398 - memset (mirror_bh[i], 0, sizeof (struct buffer_head));
7401 - * prepare mirrored bh (fields ordered for max mem throughput):
7403 - mirror_bh [i]->b_blocknr = bh->b_blocknr;
7404 - mirror_bh [i]->b_dev = bh->b_dev;
7405 - mirror_bh [i]->b_rdev = raid_conf->mirrors [i].dev;
7406 - mirror_bh [i]->b_rsector = bh->b_rsector;
7407 - mirror_bh [i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
7408 - mirror_bh [i]->b_count = 1;
7409 - mirror_bh [i]->b_size = bh->b_size;
7410 - mirror_bh [i]->b_data = bh->b_data;
7411 - mirror_bh [i]->b_list = BUF_LOCKED;
7412 - mirror_bh [i]->b_end_io = raid1_end_request;
7413 - mirror_bh [i]->b_dev_id = r1_bh;
7415 - r1_bh->mirror_bh[i] = mirror_bh[i];
7419 - r1_bh->remaining = sum_bhs;
7421 - PRINTK(("raid1_make_request(), write branch, sum_bhs=%d.\n",sum_bhs));
7424 - * We have to be a bit careful about the semaphore above, thats why we
7425 - * start the requests separately. Since kmalloc() could fail, sleep and
7426 - * make_request() can sleep too, this is the safer solution. Imagine,
7427 - * end_request decreasing the semaphore before we could have set it up ...
7428 - * We could play tricks with the semaphore (presetting it and correcting
7429 - * at the end if sum_bhs is not 'n' but we have to do end_request by hand
7430 - * if all requests finish until we had a chance to set up the semaphore
7431 - * correctly ... lots of races).
7433 - for (i = 0; i < n; i++)
7434 - if (mirror_bh [i] != NULL)
7435 - map_and_make_request (rw, mirror_bh [i]);
7436 + * We have to be a bit careful about the semaphore above, thats
7437 + * why we start the requests separately. Since kmalloc() could
7438 + * fail, sleep and make_request() can sleep too, this is the
7439 + * safer solution. Imagine, end_request decreasing the semaphore
7440 + * before we could have set it up ... We could play tricks with
7441 + * the semaphore (presetting it and correcting at the end if
7442 + * sum_bhs is not 'n' but we have to do end_request by hand if
7443 + * all requests finish until we had a chance to set up the
7444 + * semaphore correctly ... lots of races).
7446 + for (i = 0; i < disks; i++)
7448 + map_and_make_request(rw, mirror_bh[i]);
7453 -static int raid1_status (char *page, int minor, struct md_dev *mddev)
7454 +static int raid1_status (char *page, mddev_t *mddev)
7456 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
7457 + raid1_conf_t *conf = mddev_to_conf(mddev);
7460 - sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
7461 - for (i = 0; i < raid_conf->raid_disks; i++)
7462 - sz += sprintf (page+sz, "%s", raid_conf->mirrors [i].operational ? "U" : "_");
7463 + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
7464 + conf->working_disks);
7465 + for (i = 0; i < conf->raid_disks; i++)
7466 + sz += sprintf (page+sz, "%s",
7467 + conf->mirrors[i].operational ? "U" : "_");
7468 sz += sprintf (page+sz, "]");
7472 -static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index)
7473 +static void unlink_disk (raid1_conf_t *conf, int target)
7475 - int disks = raid_conf->raid_disks;
7477 + int disks = MD_SB_DISKS;
7480 - for (j = 0; j < disks; j++)
7481 - if (raid_conf->mirrors [j].next == failed_index)
7482 - raid_conf->mirrors [j].next = raid_conf->mirrors [failed_index].next;
7483 + for (i = 0; i < disks; i++)
7484 + if (conf->mirrors[i].next == target)
7485 + conf->mirrors[i].next = conf->mirrors[target].next;
7488 #define LAST_DISK KERN_ALERT \
7489 @@ -388,48 +387,53 @@
7490 #define ALREADY_SYNCING KERN_INFO \
7491 "raid1: syncing already in progress.\n"
7493 -static int raid1_error (struct md_dev *mddev, kdev_t dev)
7494 +static void mark_disk_bad (mddev_t *mddev, int failed)
7496 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
7497 - struct mirror_info *mirror;
7498 - md_superblock_t *sb = mddev->sb;
7499 - int disks = raid_conf->raid_disks;
7501 + raid1_conf_t *conf = mddev_to_conf(mddev);
7502 + struct mirror_info *mirror = conf->mirrors+failed;
7503 + mdp_super_t *sb = mddev->sb;
7505 + mirror->operational = 0;
7506 + unlink_disk(conf, failed);
7507 + mark_disk_faulty(sb->disks+mirror->number);
7508 + mark_disk_nonsync(sb->disks+mirror->number);
7509 + mark_disk_inactive(sb->disks+mirror->number);
7510 + sb->active_disks--;
7511 + sb->working_disks--;
7512 + sb->failed_disks++;
7513 + mddev->sb_dirty = 1;
7514 + md_wakeup_thread(conf->thread);
7515 + conf->working_disks--;
7516 + printk (DISK_FAILED, partition_name (mirror->dev),
7517 + conf->working_disks);
7520 - PRINTK(("raid1_error called\n"));
7521 +static int raid1_error (mddev_t *mddev, kdev_t dev)
7523 + raid1_conf_t *conf = mddev_to_conf(mddev);
7524 + struct mirror_info * mirrors = conf->mirrors;
7525 + int disks = MD_SB_DISKS;
7528 - if (raid_conf->working_disks == 1) {
7529 + if (conf->working_disks == 1) {
7531 * Uh oh, we can do nothing if this is our last disk, but
7532 * first check if this is a queued request for a device
7533 * which has just failed.
7535 - for (i = 0, mirror = raid_conf->mirrors; i < disks;
7537 - if (mirror->dev == dev && !mirror->operational)
7538 + for (i = 0; i < disks; i++) {
7539 + if (mirrors[i].dev==dev && !mirrors[i].operational)
7544 - /* Mark disk as unusable */
7545 - for (i = 0, mirror = raid_conf->mirrors; i < disks;
7547 - if (mirror->dev == dev && mirror->operational){
7548 - mirror->operational = 0;
7549 - raid1_fix_links (raid_conf, i);
7550 - sb->disks[mirror->number].state |=
7551 - (1 << MD_FAULTY_DEVICE);
7552 - sb->disks[mirror->number].state &=
7553 - ~(1 << MD_SYNC_DEVICE);
7554 - sb->disks[mirror->number].state &=
7555 - ~(1 << MD_ACTIVE_DEVICE);
7556 - sb->active_disks--;
7557 - sb->working_disks--;
7558 - sb->failed_disks++;
7559 - mddev->sb_dirty = 1;
7560 - md_wakeup_thread(raid1_thread);
7561 - raid_conf->working_disks--;
7562 - printk (DISK_FAILED, kdevname (dev),
7563 - raid_conf->working_disks);
7565 + * Mark disk as unusable
7567 + for (i = 0; i < disks; i++) {
7568 + if (mirrors[i].dev==dev && mirrors[i].operational) {
7569 + mark_disk_bad (mddev, i);
7574 @@ -442,219 +446,396 @@
7575 #undef START_SYNCING
7578 - * This is the personality-specific hot-addition routine
7579 + * Insert the spare disk into the drive-ring
7581 +static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror)
7584 + int disks = MD_SB_DISKS;
7585 + struct mirror_info *p = conf->mirrors;
7587 + for (j = 0; j < disks; j++, p++)
7588 + if (p->operational && !p->write_only) {
7590 + p->next = mirror->raid_disk;
7591 + mirror->next = next;
7595 -#define NO_SUPERBLOCK KERN_ERR \
7596 -"raid1: cannot hot-add disk to the array with no RAID superblock\n"
7597 + printk("raid1: bug: no read-operational devices\n");
7600 -#define WRONG_LEVEL KERN_ERR \
7601 -"raid1: hot-add: level of disk is not RAID-1\n"
7602 +static void print_raid1_conf (raid1_conf_t *conf)
7605 + struct mirror_info *tmp;
7607 -#define HOT_ADD_SUCCEEDED KERN_INFO \
7608 -"raid1: device %s hot-added\n"
7609 + printk("RAID1 conf printout:\n");
7611 + printk("(conf==NULL)\n");
7614 + printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
7615 + conf->raid_disks, conf->nr_disks);
7617 -static int raid1_hot_add_disk (struct md_dev *mddev, kdev_t dev)
7618 + for (i = 0; i < MD_SB_DISKS; i++) {
7619 + tmp = conf->mirrors + i;
7620 + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
7621 + i, tmp->spare,tmp->operational,
7622 + tmp->number,tmp->raid_disk,tmp->used_slot,
7623 + partition_name(tmp->dev));
7627 +static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
7630 + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
7631 + raid1_conf_t *conf = mddev->private;
7632 + struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
7633 unsigned long flags;
7634 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
7635 - struct mirror_info *mirror;
7636 - md_superblock_t *sb = mddev->sb;
7637 - struct real_dev * realdev;
7639 + mdp_super_t *sb = mddev->sb;
7640 + mdp_disk_t *failed_desc, *spare_desc, *added_desc;
7642 + save_flags(flags);
7645 + print_raid1_conf(conf);
7647 - * The device has its superblock already read and it was found
7648 - * to be consistent for generic RAID usage. Now we check whether
7649 - * it's usable for RAID-1 hot addition.
7650 + * find the disk ...
7654 - n = mddev->nb_dev++;
7655 - realdev = &mddev->devices[n];
7656 - if (!realdev->sb) {
7657 - printk (NO_SUPERBLOCK);
7660 - if (realdev->sb->level != 1) {
7661 - printk (WRONG_LEVEL);
7663 + case DISKOP_SPARE_ACTIVE:
7666 + * Find the failed disk within the RAID1 configuration ...
7667 + * (this can only be in the first conf->working_disks part)
7669 + for (i = 0; i < conf->raid_disks; i++) {
7670 + tmp = conf->mirrors + i;
7671 + if ((!tmp->operational && !tmp->spare) ||
7672 + !tmp->used_slot) {
7678 + * When we activate a spare disk we _must_ have a disk in
7679 + * the lower (active) part of the array to replace.
7681 + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
7686 + /* fall through */
7688 + case DISKOP_SPARE_WRITE:
7689 + case DISKOP_SPARE_INACTIVE:
7692 + * Find the spare disk ... (can only be in the 'high'
7693 + * area of the array)
7695 + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
7696 + tmp = conf->mirrors + i;
7697 + if (tmp->spare && tmp->number == (*d)->number) {
7702 + if (spare_disk == -1) {
7709 + case DISKOP_HOT_REMOVE_DISK:
7711 + for (i = 0; i < MD_SB_DISKS; i++) {
7712 + tmp = conf->mirrors + i;
7713 + if (tmp->used_slot && (tmp->number == (*d)->number)) {
7714 + if (tmp->operational) {
7722 + if (removed_disk == -1) {
7729 + case DISKOP_HOT_ADD_DISK:
7731 + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
7732 + tmp = conf->mirrors + i;
7733 + if (!tmp->used_slot) {
7738 + if (added_disk == -1) {
7745 - /* FIXME: are there other things left we could sanity-check? */
7749 - * We have to disable interrupts, as our RAID-1 state is used
7750 - * from irq handlers as well.
7751 + * Switch the spare disk to write-only mode:
7753 - save_flags(flags);
7755 + case DISKOP_SPARE_WRITE:
7756 + sdisk = conf->mirrors + spare_disk;
7757 + sdisk->operational = 1;
7758 + sdisk->write_only = 1;
7761 + * Deactivate a spare disk:
7763 + case DISKOP_SPARE_INACTIVE:
7764 + sdisk = conf->mirrors + spare_disk;
7765 + sdisk->operational = 0;
7766 + sdisk->write_only = 0;
7769 + * Activate (mark read-write) the (now sync) spare disk,
7770 + * which means we switch it's 'raid position' (->raid_disk)
7771 + * with the failed disk. (only the first 'conf->nr_disks'
7772 + * slots are used for 'real' disks and we must preserve this
7775 + case DISKOP_SPARE_ACTIVE:
7777 - raid_conf->raid_disks++;
7778 - mirror = raid_conf->mirrors+n;
7779 + sdisk = conf->mirrors + spare_disk;
7780 + fdisk = conf->mirrors + failed_disk;
7783 - mirror->raid_disk=n;
7785 - mirror->next=0; /* FIXME */
7786 - mirror->sect_limit=128;
7788 - mirror->operational=0;
7790 - mirror->write_only=0;
7792 - sb->disks[n].state |= (1 << MD_FAULTY_DEVICE);
7793 - sb->disks[n].state &= ~(1 << MD_SYNC_DEVICE);
7794 - sb->disks[n].state &= ~(1 << MD_ACTIVE_DEVICE);
7796 - sb->spare_disks++;
7797 + spare_desc = &sb->disks[sdisk->number];
7798 + failed_desc = &sb->disks[fdisk->number];
7800 - restore_flags(flags);
7801 + if (spare_desc != *d) {
7807 - md_update_sb(MINOR(dev));
7808 + if (spare_desc->raid_disk != sdisk->raid_disk) {
7814 + if (sdisk->raid_disk != spare_disk) {
7820 - printk (HOT_ADD_SUCCEEDED, kdevname(realdev->dev));
7821 + if (failed_desc->raid_disk != fdisk->raid_disk) {
7829 + if (fdisk->raid_disk != failed_disk) {
7835 -#undef NO_SUPERBLOCK
7837 -#undef HOT_ADD_SUCCEEDED
7839 + * do the switch finally
7841 + xchg_values(*spare_desc, *failed_desc);
7842 + xchg_values(*fdisk, *sdisk);
7845 - * Insert the spare disk into the drive-ring
7847 -static void add_ring(struct raid1_data *raid_conf, struct mirror_info *mirror)
7850 - struct mirror_info *p = raid_conf->mirrors;
7852 + * (careful, 'failed' and 'spare' are switched from now on)
7854 + * we want to preserve linear numbering and we want to
7855 + * give the proper raid_disk number to the now activated
7856 + * disk. (this means we switch back these values)
7859 + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
7860 + xchg_values(sdisk->raid_disk, fdisk->raid_disk);
7861 + xchg_values(spare_desc->number, failed_desc->number);
7862 + xchg_values(sdisk->number, fdisk->number);
7864 - for (j = 0; j < raid_conf->raid_disks; j++, p++)
7865 - if (p->operational && !p->write_only) {
7867 - p->next = mirror->raid_disk;
7868 - mirror->next = next;
7871 - printk("raid1: bug: no read-operational devices\n");
7875 -static int raid1_mark_spare(struct md_dev *mddev, md_descriptor_t *spare,
7878 - int i = 0, failed_disk = -1;
7879 - struct raid1_data *raid_conf = mddev->private;
7880 - struct mirror_info *mirror = raid_conf->mirrors;
7881 - md_descriptor_t *descriptor;
7882 - unsigned long flags;
7883 + if (sdisk->dev == MKDEV(0,0))
7884 + sdisk->used_slot = 0;
7886 + * this really activates the spare.
7889 + fdisk->write_only = 0;
7890 + link_disk(conf, fdisk);
7892 - for (i = 0; i < MD_SB_DISKS; i++, mirror++) {
7893 - if (mirror->spare && mirror->number == spare->number)
7898 - for (i = 0, mirror = raid_conf->mirrors; i < raid_conf->raid_disks;
7900 - if (!mirror->operational)
7903 + * if we activate a spare, we definitely replace a
7904 + * non-operational disk slot in the 'low' area of
7908 - save_flags(flags);
7912 - mirror->operational = 1;
7913 - mirror->write_only = 1;
7914 - raid_conf->raid_disks = MAX(raid_conf->raid_disks,
7915 - mirror->raid_disk + 1);
7917 - case SPARE_INACTIVE:
7918 - mirror->operational = 0;
7919 - mirror->write_only = 0;
7921 - case SPARE_ACTIVE:
7922 - mirror->spare = 0;
7923 - mirror->write_only = 0;
7924 - raid_conf->working_disks++;
7925 - add_ring(raid_conf, mirror);
7927 - if (failed_disk != -1) {
7928 - descriptor = &mddev->sb->disks[raid_conf->mirrors[failed_disk].number];
7929 - i = spare->raid_disk;
7930 - spare->raid_disk = descriptor->raid_disk;
7931 - descriptor->raid_disk = i;
7935 - printk("raid1_mark_spare: bug: state == %d\n", state);
7936 - restore_flags(flags);
7938 + conf->working_disks++;
7942 + case DISKOP_HOT_REMOVE_DISK:
7943 + rdisk = conf->mirrors + removed_disk;
7945 + if (rdisk->spare && (removed_disk < conf->raid_disks)) {
7950 + rdisk->dev = MKDEV(0,0);
7951 + rdisk->used_slot = 0;
7955 + case DISKOP_HOT_ADD_DISK:
7956 + adisk = conf->mirrors + added_disk;
7959 + if (added_disk != added_desc->number) {
7965 + adisk->number = added_desc->number;
7966 + adisk->raid_disk = added_desc->raid_disk;
7967 + adisk->dev = MKDEV(added_desc->major,added_desc->minor);
7969 + adisk->operational = 0;
7970 + adisk->write_only = 0;
7972 + adisk->used_slot = 1;
7983 restore_flags(flags);
7985 + print_raid1_conf(conf);
7990 +#define IO_ERROR KERN_ALERT \
7991 +"raid1: %s: unrecoverable I/O read error for block %lu\n"
7993 +#define REDIRECT_SECTOR KERN_ERR \
7994 +"raid1: %s: redirecting sector %lu to another mirror\n"
7997 * This is a kernel thread which:
7999 * 1. Retries failed read operations on working mirrors.
8000 * 2. Updates the raid superblock when problems encounter.
8002 -void raid1d (void *data)
8003 +static void raid1d (void *data)
8005 struct buffer_head *bh;
8007 unsigned long flags;
8008 - struct raid1_bh * r1_bh;
8009 - struct md_dev *mddev;
8010 + struct raid1_bh *r1_bh;
8013 - PRINTK(("raid1d() active\n"));
8014 - save_flags(flags);
8016 while (raid1_retry_list) {
8017 + save_flags(flags);
8019 bh = raid1_retry_list;
8020 r1_bh = (struct raid1_bh *)(bh->b_dev_id);
8021 raid1_retry_list = r1_bh->next_retry;
8022 restore_flags(flags);
8024 - mddev = md_dev + MINOR(bh->b_dev);
8025 + mddev = kdev_to_mddev(bh->b_dev);
8026 if (mddev->sb_dirty) {
8027 - printk("dirty sb detected, updating.\n");
8028 + printk(KERN_INFO "dirty sb detected, updating.\n");
8029 mddev->sb_dirty = 0;
8030 - md_update_sb(MINOR(bh->b_dev));
8031 + md_update_sb(mddev);
8034 - __raid1_map (md_dev + MINOR(bh->b_dev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
8035 + __raid1_map (mddev, &bh->b_rdev, &bh->b_rsector,
8037 if (bh->b_rdev == dev) {
8038 - printk (KERN_ALERT
8039 - "raid1: %s: unrecoverable I/O read error for block %lu\n",
8040 - kdevname(bh->b_dev), bh->b_blocknr);
8041 - raid1_end_buffer_io(r1_bh, 0);
8042 + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
8043 + raid1_end_bh_io(r1_bh, 0);
8045 - printk (KERN_ERR "raid1: %s: redirecting sector %lu to another mirror\n",
8046 - kdevname(bh->b_dev), bh->b_blocknr);
8047 + printk (REDIRECT_SECTOR,
8048 + partition_name(bh->b_dev), bh->b_blocknr);
8049 map_and_make_request (r1_bh->cmd, bh);
8053 - restore_flags(flags);
8056 +#undef REDIRECT_SECTOR
8059 + * Private kernel thread to reconstruct mirrors after an unclean
8062 +static void raid1syncd (void *data)
8064 + raid1_conf_t *conf = data;
8065 + mddev_t *mddev = conf->mddev;
8067 + if (!conf->resync_mirrors)
8069 + if (conf->resync_mirrors == 2)
8071 + down(&mddev->recovery_sem);
8072 + if (md_do_sync(mddev, NULL)) {
8073 + up(&mddev->recovery_sem);
8077 + * Only if everything went Ok.
8079 + conf->resync_mirrors = 0;
8080 + up(&mddev->recovery_sem);
8085 * This will catch the scenario in which one of the mirrors was
8086 * mounted as a normal device rather than as a part of a raid set.
8088 + * check_consistency is very personality-dependent, eg. RAID5 cannot
8089 + * do this check, it uses another method.
8091 -static int __check_consistency (struct md_dev *mddev, int row)
8092 +static int __check_consistency (mddev_t *mddev, int row)
8094 - struct raid1_data *raid_conf = mddev->private;
8095 + raid1_conf_t *conf = mddev_to_conf(mddev);
8096 + int disks = MD_SB_DISKS;
8098 struct buffer_head *bh = NULL;
8100 char *buffer = NULL;
8102 - for (i = 0; i < raid_conf->raid_disks; i++) {
8103 - if (!raid_conf->mirrors[i].operational)
8104 + for (i = 0; i < disks; i++) {
8105 + printk("(checking disk %d)\n",i);
8106 + if (!conf->mirrors[i].operational)
8108 - dev = raid_conf->mirrors[i].dev;
8109 + printk("(really checking disk %d)\n",i);
8110 + dev = conf->mirrors[i].dev;
8111 set_blocksize(dev, 4096);
8112 if ((bh = bread(dev, row / 4, 4096)) == NULL)
8114 @@ -683,167 +864,342 @@
8118 -static int check_consistency (struct md_dev *mddev)
8119 +static int check_consistency (mddev_t *mddev)
8121 - int size = mddev->sb->size;
8123 + if (__check_consistency(mddev, 0))
8125 + * we do not do this currently, as it's perfectly possible to
8126 + * have an inconsistent array when it's freshly created. Only
8127 + * newly written data has to be consistent.
8131 - for (row = 0; row < size; row += size / 8)
8132 - if (__check_consistency(mddev, row))
8137 -static int raid1_run (int minor, struct md_dev *mddev)
8138 +#define INVALID_LEVEL KERN_WARNING \
8139 +"raid1: md%d: raid level not set to mirroring (%d)\n"
8141 +#define NO_SB KERN_ERR \
8142 +"raid1: disabled mirror %s (couldn't access raid superblock)\n"
8144 +#define ERRORS KERN_ERR \
8145 +"raid1: disabled mirror %s (errors detected)\n"
8147 +#define NOT_IN_SYNC KERN_ERR \
8148 +"raid1: disabled mirror %s (not in sync)\n"
8150 +#define INCONSISTENT KERN_ERR \
8151 +"raid1: disabled mirror %s (inconsistent descriptor)\n"
8153 +#define ALREADY_RUNNING KERN_ERR \
8154 +"raid1: disabled mirror %s (mirror %d already operational)\n"
8156 +#define OPERATIONAL KERN_INFO \
8157 +"raid1: device %s operational as mirror %d\n"
8159 +#define MEM_ERROR KERN_ERR \
8160 +"raid1: couldn't allocate memory for md%d\n"
8162 +#define SPARE KERN_INFO \
8163 +"raid1: spare disk %s\n"
8165 +#define NONE_OPERATIONAL KERN_ERR \
8166 +"raid1: no operational mirrors for md%d\n"
8168 +#define RUNNING_CKRAID KERN_ERR \
8169 +"raid1: detected mirror differences -- running resync\n"
8171 +#define ARRAY_IS_ACTIVE KERN_INFO \
8172 +"raid1: raid set md%d active with %d out of %d mirrors\n"
8174 +#define THREAD_ERROR KERN_ERR \
8175 +"raid1: couldn't allocate thread for md%d\n"
8177 +#define START_RESYNC KERN_WARNING \
8178 +"raid1: raid set md%d not clean; reconstructing mirrors\n"
8180 +static int raid1_run (mddev_t *mddev)
8182 - struct raid1_data *raid_conf;
8183 - int i, j, raid_disk;
8184 - md_superblock_t *sb = mddev->sb;
8185 - md_descriptor_t *descriptor;
8186 - struct real_dev *realdev;
8187 + raid1_conf_t *conf;
8188 + int i, j, disk_idx;
8189 + struct mirror_info *disk;
8190 + mdp_super_t *sb = mddev->sb;
8191 + mdp_disk_t *descriptor;
8193 + struct md_list_head *tmp;
8194 + int start_recovery = 0;
8198 if (sb->level != 1) {
8199 - printk("raid1: %s: raid level not set to mirroring (%d)\n",
8200 - kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
8201 - MOD_DEC_USE_COUNT;
8205 - * copy the now verified devices into our private RAID1 bookkeeping
8206 - * area. [whatever we allocate in raid1_run(), should be freed in
8208 + printk(INVALID_LEVEL, mdidx(mddev), sb->level);
8212 + * copy the already verified devices into our private RAID1
8213 + * bookkeeping area. [whatever we allocate in raid1_run(),
8214 + * should be freed in raid1_stop()]
8217 - while (!( /* FIXME: now we are rather fault tolerant than nice */
8218 - mddev->private = kmalloc (sizeof (struct raid1_data), GFP_KERNEL)
8221 - printk ("raid1_run(): out of memory\n");
8222 - current->policy |= SCHED_YIELD;
8225 - raid_conf = mddev->private;
8226 - memset(raid_conf, 0, sizeof(*raid_conf));
8228 - PRINTK(("raid1_run(%d) called.\n", minor));
8230 - for (i = 0; i < mddev->nb_dev; i++) {
8231 - realdev = &mddev->devices[i];
8232 - if (!realdev->sb) {
8233 - printk(KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
8234 + conf = raid1_kmalloc(sizeof(raid1_conf_t));
8235 + mddev->private = conf;
8237 + printk(MEM_ERROR, mdidx(mddev));
8241 + ITERATE_RDEV(mddev,rdev,tmp) {
8242 + if (rdev->faulty) {
8243 + printk(ERRORS, partition_name(rdev->dev));
8250 + if (rdev->desc_nr == -1) {
8256 - * This is important -- we are using the descriptor on
8257 - * the disk only to get a pointer to the descriptor on
8258 - * the main superblock, which might be more recent.
8260 - descriptor = &sb->disks[realdev->sb->descriptor.number];
8261 - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
8262 - printk(KERN_ERR "raid1: disabled mirror %s (errors detected)\n", kdevname(realdev->dev));
8263 + descriptor = &sb->disks[rdev->desc_nr];
8264 + disk_idx = descriptor->raid_disk;
8265 + disk = conf->mirrors + disk_idx;
8267 + if (disk_faulty(descriptor)) {
8268 + disk->number = descriptor->number;
8269 + disk->raid_disk = disk_idx;
8270 + disk->dev = rdev->dev;
8271 + disk->sect_limit = MAX_LINEAR_SECTORS;
8272 + disk->operational = 0;
8273 + disk->write_only = 0;
8275 + disk->used_slot = 1;
8278 - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
8279 - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
8280 - printk(KERN_ERR "raid1: disabled mirror %s (not in sync)\n", kdevname(realdev->dev));
8281 + if (disk_active(descriptor)) {
8282 + if (!disk_sync(descriptor)) {
8283 + printk(NOT_IN_SYNC,
8284 + partition_name(rdev->dev));
8287 - raid_disk = descriptor->raid_disk;
8288 - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
8289 - printk(KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n", kdevname(realdev->dev));
8290 + if ((descriptor->number > MD_SB_DISKS) ||
8291 + (disk_idx > sb->raid_disks)) {
8293 + printk(INCONSISTENT,
8294 + partition_name(rdev->dev));
8297 - if (raid_conf->mirrors[raid_disk].operational) {
8298 - printk(KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n", kdevname(realdev->dev), raid_disk);
8299 + if (disk->operational) {
8300 + printk(ALREADY_RUNNING,
8301 + partition_name(rdev->dev),
8305 - printk(KERN_INFO "raid1: device %s operational as mirror %d\n", kdevname(realdev->dev), raid_disk);
8306 - raid_conf->mirrors[raid_disk].number = descriptor->number;
8307 - raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
8308 - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
8309 - raid_conf->mirrors[raid_disk].operational = 1;
8310 - raid_conf->mirrors[raid_disk].sect_limit = 128;
8311 - raid_conf->working_disks++;
8312 + printk(OPERATIONAL, partition_name(rdev->dev),
8314 + disk->number = descriptor->number;
8315 + disk->raid_disk = disk_idx;
8316 + disk->dev = rdev->dev;
8317 + disk->sect_limit = MAX_LINEAR_SECTORS;
8318 + disk->operational = 1;
8319 + disk->write_only = 0;
8321 + disk->used_slot = 1;
8322 + conf->working_disks++;
8325 * Must be a spare disk ..
8327 - printk(KERN_INFO "raid1: spare disk %s\n", kdevname(realdev->dev));
8328 - raid_disk = descriptor->raid_disk;
8329 - raid_conf->mirrors[raid_disk].number = descriptor->number;
8330 - raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
8331 - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
8332 - raid_conf->mirrors[raid_disk].sect_limit = 128;
8334 - raid_conf->mirrors[raid_disk].operational = 0;
8335 - raid_conf->mirrors[raid_disk].write_only = 0;
8336 - raid_conf->mirrors[raid_disk].spare = 1;
8339 - if (!raid_conf->working_disks) {
8340 - printk(KERN_ERR "raid1: no operational mirrors for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
8342 - mddev->private = NULL;
8343 - MOD_DEC_USE_COUNT;
8347 - raid_conf->raid_disks = sb->raid_disks;
8348 - raid_conf->mddev = mddev;
8350 - for (j = 0; !raid_conf->mirrors[j].operational; j++);
8351 - raid_conf->last_used = j;
8352 - for (i = raid_conf->raid_disks - 1; i >= 0; i--) {
8353 - if (raid_conf->mirrors[i].operational) {
8354 - PRINTK(("raid_conf->mirrors[%d].next == %d\n", i, j));
8355 - raid_conf->mirrors[i].next = j;
8356 + printk(SPARE, partition_name(rdev->dev));
8357 + disk->number = descriptor->number;
8358 + disk->raid_disk = disk_idx;
8359 + disk->dev = rdev->dev;
8360 + disk->sect_limit = MAX_LINEAR_SECTORS;
8361 + disk->operational = 0;
8362 + disk->write_only = 0;
8364 + disk->used_slot = 1;
8367 + if (!conf->working_disks) {
8368 + printk(NONE_OPERATIONAL, mdidx(mddev));
8369 + goto out_free_conf;
8372 + conf->raid_disks = sb->raid_disks;
8373 + conf->nr_disks = sb->nr_disks;
8374 + conf->mddev = mddev;
8376 + for (i = 0; i < MD_SB_DISKS; i++) {
8378 + descriptor = sb->disks+i;
8379 + disk_idx = descriptor->raid_disk;
8380 + disk = conf->mirrors + disk_idx;
8382 + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
8383 + !disk->used_slot) {
8385 + disk->number = descriptor->number;
8386 + disk->raid_disk = disk_idx;
8387 + disk->dev = MKDEV(0,0);
8389 + disk->operational = 0;
8390 + disk->write_only = 0;
8392 + disk->used_slot = 1;
8397 + * find the first working one and use it as a starting point
8398 + * to read balancing.
8400 + for (j = 0; !conf->mirrors[j].operational; j++)
8402 + conf->last_used = j;
8405 + * initialize the 'working disks' list.
8407 + for (i = conf->raid_disks - 1; i >= 0; i--) {
8408 + if (conf->mirrors[i].operational) {
8409 + conf->mirrors[i].next = j;
8414 - if (check_consistency(mddev)) {
8415 - printk(KERN_ERR "raid1: detected mirror differences -- run ckraid\n");
8416 - sb->state |= 1 << MD_SB_ERRORS;
8418 - mddev->private = NULL;
8419 - MOD_DEC_USE_COUNT;
8421 + if (conf->working_disks != sb->raid_disks) {
8422 + printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
8423 + start_recovery = 1;
8426 + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
8428 + * we do sanity checks even if the device says
8431 + if (check_consistency(mddev)) {
8432 + printk(RUNNING_CKRAID);
8433 + sb->state &= ~(1 << MD_SB_CLEAN);
8438 + const char * name = "raid1d";
8440 + conf->thread = md_register_thread(raid1d, conf, name);
8441 + if (!conf->thread) {
8442 + printk(THREAD_ERROR, mdidx(mddev));
8443 + goto out_free_conf;
8447 + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
8448 + const char * name = "raid1syncd";
8450 + conf->resync_thread = md_register_thread(raid1syncd, conf,name);
8451 + if (!conf->resync_thread) {
8452 + printk(THREAD_ERROR, mdidx(mddev));
8453 + goto out_free_conf;
8456 + printk(START_RESYNC, mdidx(mddev));
8457 + conf->resync_mirrors = 1;
8458 + md_wakeup_thread(conf->resync_thread);
8462 * Regenerate the "device is in sync with the raid set" bit for
8465 - for (i = 0; i < sb->nr_disks ; i++) {
8466 - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
8467 + for (i = 0; i < MD_SB_DISKS; i++) {
8468 + mark_disk_nonsync(sb->disks+i);
8469 for (j = 0; j < sb->raid_disks; j++) {
8470 - if (!raid_conf->mirrors[j].operational)
8471 + if (!conf->mirrors[j].operational)
8473 - if (sb->disks[i].number == raid_conf->mirrors[j].number)
8474 - sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
8475 + if (sb->disks[i].number == conf->mirrors[j].number)
8476 + mark_disk_sync(sb->disks+i);
8479 - sb->active_disks = raid_conf->working_disks;
8480 + sb->active_disks = conf->working_disks;
8482 - printk("raid1: raid set %s active with %d out of %d mirrors\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks);
8483 - /* Ok, everything is just fine now */
8485 + if (start_recovery)
8486 + md_recover_arrays();
8489 + printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
8491 + * Ok, everything is just fine now
8497 + mddev->private = NULL;
8499 + MOD_DEC_USE_COUNT;
8503 +#undef INVALID_LEVEL
8507 +#undef INCONSISTENT
8508 +#undef ALREADY_RUNNING
8511 +#undef NONE_OPERATIONAL
8512 +#undef RUNNING_CKRAID
8513 +#undef ARRAY_IS_ACTIVE
8515 +static int raid1_stop_resync (mddev_t *mddev)
8517 + raid1_conf_t *conf = mddev_to_conf(mddev);
8519 + if (conf->resync_thread) {
8520 + if (conf->resync_mirrors) {
8521 + conf->resync_mirrors = 2;
8522 + md_interrupt_thread(conf->resync_thread);
8523 + printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
8531 +static int raid1_restart_resync (mddev_t *mddev)
8533 + raid1_conf_t *conf = mddev_to_conf(mddev);
8535 + if (conf->resync_mirrors) {
8536 + if (!conf->resync_thread) {
8540 + conf->resync_mirrors = 1;
8541 + md_wakeup_thread(conf->resync_thread);
8547 -static int raid1_stop (int minor, struct md_dev *mddev)
8548 +static int raid1_stop (mddev_t *mddev)
8550 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
8551 + raid1_conf_t *conf = mddev_to_conf(mddev);
8553 - kfree (raid_conf);
8554 + md_unregister_thread(conf->thread);
8555 + if (conf->resync_thread)
8556 + md_unregister_thread(conf->resync_thread);
8558 mddev->private = NULL;
8563 -static struct md_personality raid1_personality=
8564 +static mdk_personality_t raid1_personality=
8568 @@ -855,15 +1211,13 @@
8569 NULL, /* no ioctls */
8572 - raid1_hot_add_disk,
8573 - /* raid1_hot_remove_drive */ NULL,
8576 + raid1_stop_resync,
8577 + raid1_restart_resync
8580 int raid1_init (void)
8582 - if ((raid1_thread = md_register_thread(raid1d, NULL)) == NULL)
8584 return register_md_personality (RAID1, &raid1_personality);
8587 @@ -875,7 +1229,6 @@
8589 void cleanup_module (void)
8591 - md_unregister_thread (raid1_thread);
8592 unregister_md_personality (RAID1);
8595 diff -ruN linux.orig/drivers/block/raid5.c linux-2.2.16/drivers/block/raid5.c
8596 --- linux.orig/drivers/block/raid5.c Fri May 8 09:17:13 1998
8597 +++ linux-2.2.16/drivers/block/raid5.c Fri Jun 9 11:37:45 2000
8599 -/*****************************************************************************
8601 * raid5.c : Multiple Devices driver for Linux
8602 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
8605 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
8609 #include <linux/module.h>
8610 #include <linux/locks.h>
8611 #include <linux/malloc.h>
8612 -#include <linux/md.h>
8613 -#include <linux/raid5.h>
8614 +#include <linux/raid/raid5.h>
8615 #include <asm/bitops.h>
8616 #include <asm/atomic.h>
8617 -#include <asm/md.h>
8619 -static struct md_personality raid5_personality;
8620 +static mdk_personality_t raid5_personality;
8625 #define HASH_PAGES_ORDER 0
8626 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
8627 #define HASH_MASK (NR_HASH - 1)
8628 -#define stripe_hash(raid_conf, sect, size) ((raid_conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
8629 +#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
8632 * The following can be used to debug the driver
8634 #define PRINTK(x) do { ; } while (0)
8637 +static void print_raid5_conf (raid5_conf_t *conf);
8639 static inline int stripe_locked(struct stripe_head *sh)
8641 return test_bit(STRIPE_LOCKED, &sh->state);
8644 static inline void lock_stripe(struct stripe_head *sh)
8646 - struct raid5_data *raid_conf = sh->raid_conf;
8647 - if (!test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
8648 + raid5_conf_t *conf = sh->raid_conf;
8649 + if (!md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
8650 PRINTK(("locking stripe %lu\n", sh->sector));
8651 - raid_conf->nr_locked_stripes++;
8652 + conf->nr_locked_stripes++;
8656 static inline void unlock_stripe(struct stripe_head *sh)
8658 - struct raid5_data *raid_conf = sh->raid_conf;
8659 - if (test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
8660 + raid5_conf_t *conf = sh->raid_conf;
8661 + if (md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
8662 PRINTK(("unlocking stripe %lu\n", sh->sector));
8663 - raid_conf->nr_locked_stripes--;
8664 + conf->nr_locked_stripes--;
8669 static inline void finish_stripe(struct stripe_head *sh)
8671 - struct raid5_data *raid_conf = sh->raid_conf;
8672 + raid5_conf_t *conf = sh->raid_conf;
8674 sh->cmd = STRIPE_NONE;
8675 sh->phase = PHASE_COMPLETE;
8676 - raid_conf->nr_pending_stripes--;
8677 - raid_conf->nr_cached_stripes++;
8678 - wake_up(&raid_conf->wait_for_stripe);
8679 + conf->nr_pending_stripes--;
8680 + conf->nr_cached_stripes++;
8681 + wake_up(&conf->wait_for_stripe);
8684 void __wait_on_stripe(struct stripe_head *sh)
8686 __wait_on_stripe(sh);
8689 -static inline void remove_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
8690 +static inline void remove_hash(raid5_conf_t *conf, struct stripe_head *sh)
8692 PRINTK(("remove_hash(), stripe %lu\n", sh->sector));
8694 @@ -123,21 +124,22 @@
8695 sh->hash_next->hash_pprev = sh->hash_pprev;
8696 *sh->hash_pprev = sh->hash_next;
8697 sh->hash_pprev = NULL;
8698 - raid_conf->nr_hashed_stripes--;
8699 + conf->nr_hashed_stripes--;
8703 -static inline void insert_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
8704 +static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
8706 - struct stripe_head **shp = &stripe_hash(raid_conf, sh->sector, sh->size);
8707 + struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size);
8709 - PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", sh->sector, raid_conf->nr_hashed_stripes));
8710 + PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n",
8711 + sh->sector, conf->nr_hashed_stripes));
8713 if ((sh->hash_next = *shp) != NULL)
8714 (*shp)->hash_pprev = &sh->hash_next;
8716 sh->hash_pprev = shp;
8717 - raid_conf->nr_hashed_stripes++;
8718 + conf->nr_hashed_stripes++;
8721 static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size)
8722 @@ -145,13 +147,15 @@
8723 struct buffer_head *bh;
8724 unsigned long flags;
8726 - save_flags(flags);
8728 - if ((bh = sh->buffer_pool) == NULL)
8730 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
8731 + bh = sh->buffer_pool;
8734 sh->buffer_pool = bh->b_next;
8735 bh->b_size = b_size;
8736 - restore_flags(flags);
8738 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
8743 @@ -160,12 +164,14 @@
8744 struct buffer_head *bh;
8745 unsigned long flags;
8747 - save_flags(flags);
8749 - if ((bh = sh->bh_pool) == NULL)
8751 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
8755 sh->bh_pool = bh->b_next;
8756 - restore_flags(flags);
8758 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
8763 @@ -173,54 +179,52 @@
8765 unsigned long flags;
8767 - save_flags(flags);
8769 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
8770 bh->b_next = sh->buffer_pool;
8771 sh->buffer_pool = bh;
8772 - restore_flags(flags);
8773 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
8776 static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh)
8778 unsigned long flags;
8780 - save_flags(flags);
8782 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
8783 bh->b_next = sh->bh_pool;
8785 - restore_flags(flags);
8786 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
8789 -static struct stripe_head *get_free_stripe(struct raid5_data *raid_conf)
8790 +static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
8792 struct stripe_head *sh;
8793 unsigned long flags;
8797 - if ((sh = raid_conf->free_sh_list) == NULL) {
8798 + if ((sh = conf->free_sh_list) == NULL) {
8799 restore_flags(flags);
8802 - raid_conf->free_sh_list = sh->free_next;
8803 - raid_conf->nr_free_sh--;
8804 - if (!raid_conf->nr_free_sh && raid_conf->free_sh_list)
8805 + conf->free_sh_list = sh->free_next;
8806 + conf->nr_free_sh--;
8807 + if (!conf->nr_free_sh && conf->free_sh_list)
8808 printk ("raid5: bug: free_sh_list != NULL, nr_free_sh == 0\n");
8809 restore_flags(flags);
8810 - if (sh->hash_pprev || sh->nr_pending || sh->count)
8811 + if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || sh->count)
8812 printk("get_free_stripe(): bug\n");
8816 -static void put_free_stripe(struct raid5_data *raid_conf, struct stripe_head *sh)
8817 +static void put_free_stripe(raid5_conf_t *conf, struct stripe_head *sh)
8819 unsigned long flags;
8823 - sh->free_next = raid_conf->free_sh_list;
8824 - raid_conf->free_sh_list = sh;
8825 - raid_conf->nr_free_sh++;
8826 + sh->free_next = conf->free_sh_list;
8827 + conf->free_sh_list = sh;
8828 + conf->nr_free_sh++;
8829 restore_flags(flags);
8834 static void kfree_stripe(struct stripe_head *sh)
8836 - struct raid5_data *raid_conf = sh->raid_conf;
8837 - int disks = raid_conf->raid_disks, j;
8838 + raid5_conf_t *conf = sh->raid_conf;
8839 + int disks = conf->raid_disks, j;
8841 PRINTK(("kfree_stripe called, stripe %lu\n", sh->sector));
8842 if (sh->phase != PHASE_COMPLETE || stripe_locked(sh) || sh->count) {
8843 @@ -338,19 +342,19 @@
8844 if (sh->bh_new[j] || sh->bh_copy[j])
8845 printk("raid5: bug: sector %lu, new %p, copy %p\n", sh->sector, sh->bh_new[j], sh->bh_copy[j]);
8847 - remove_hash(raid_conf, sh);
8848 - put_free_stripe(raid_conf, sh);
8849 + remove_hash(conf, sh);
8850 + put_free_stripe(conf, sh);
8853 -static int shrink_stripe_cache(struct raid5_data *raid_conf, int nr)
8854 +static int shrink_stripe_cache(raid5_conf_t *conf, int nr)
8856 struct stripe_head *sh;
8859 - PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, raid_conf->nr_hashed_stripes, raid_conf->clock));
8860 + PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, conf->nr_hashed_stripes, conf->clock));
8861 for (i = 0; i < NR_HASH; i++) {
8863 - sh = raid_conf->stripe_hashtbl[(i + raid_conf->clock) & HASH_MASK];
8864 + sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK];
8865 for (; sh; sh = sh->hash_next) {
8866 if (sh->phase != PHASE_COMPLETE)
8868 @@ -360,30 +364,30 @@
8871 if (++count == nr) {
8872 - PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
8873 - raid_conf->clock = (i + raid_conf->clock) & HASH_MASK;
8874 + PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
8875 + conf->clock = (i + conf->clock) & HASH_MASK;
8881 - PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
8882 + PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
8886 -static struct stripe_head *find_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
8887 +static struct stripe_head *find_stripe(raid5_conf_t *conf, unsigned long sector, int size)
8889 struct stripe_head *sh;
8891 - if (raid_conf->buffer_size != size) {
8892 - PRINTK(("switching size, %d --> %d\n", raid_conf->buffer_size, size));
8893 - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
8894 - raid_conf->buffer_size = size;
8895 + if (conf->buffer_size != size) {
8896 + PRINTK(("switching size, %d --> %d\n", conf->buffer_size, size));
8897 + shrink_stripe_cache(conf, conf->max_nr_stripes);
8898 + conf->buffer_size = size;
8901 PRINTK(("find_stripe, sector %lu\n", sector));
8902 - for (sh = stripe_hash(raid_conf, sector, size); sh; sh = sh->hash_next)
8903 - if (sh->sector == sector && sh->raid_conf == raid_conf) {
8904 + for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next)
8905 + if (sh->sector == sector && sh->raid_conf == conf) {
8906 if (sh->size == size) {
8907 PRINTK(("found stripe %lu\n", sector));
8913 -static int grow_stripes(struct raid5_data *raid_conf, int num, int priority)
8914 +static int grow_stripes(raid5_conf_t *conf, int num, int priority)
8916 struct stripe_head *sh;
8918 @@ -405,62 +409,64 @@
8919 if ((sh = kmalloc(sizeof(struct stripe_head), priority)) == NULL)
8921 memset(sh, 0, sizeof(*sh));
8922 - if (grow_buffers(sh, 2 * raid_conf->raid_disks, PAGE_SIZE, priority)) {
8923 - shrink_buffers(sh, 2 * raid_conf->raid_disks);
8924 + sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED;
8926 + if (grow_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) {
8927 + shrink_buffers(sh, 2 * conf->raid_disks);
8931 - if (grow_bh(sh, raid_conf->raid_disks, priority)) {
8932 - shrink_buffers(sh, 2 * raid_conf->raid_disks);
8933 - shrink_bh(sh, raid_conf->raid_disks);
8934 + if (grow_bh(sh, conf->raid_disks, priority)) {
8935 + shrink_buffers(sh, 2 * conf->raid_disks);
8936 + shrink_bh(sh, conf->raid_disks);
8940 - put_free_stripe(raid_conf, sh);
8941 - raid_conf->nr_stripes++;
8942 + put_free_stripe(conf, sh);
8943 + conf->nr_stripes++;
8948 -static void shrink_stripes(struct raid5_data *raid_conf, int num)
8949 +static void shrink_stripes(raid5_conf_t *conf, int num)
8951 struct stripe_head *sh;
8954 - sh = get_free_stripe(raid_conf);
8955 + sh = get_free_stripe(conf);
8958 - shrink_buffers(sh, raid_conf->raid_disks * 2);
8959 - shrink_bh(sh, raid_conf->raid_disks);
8960 + shrink_buffers(sh, conf->raid_disks * 2);
8961 + shrink_bh(sh, conf->raid_disks);
8963 - raid_conf->nr_stripes--;
8964 + conf->nr_stripes--;
8968 -static struct stripe_head *kmalloc_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
8969 +static struct stripe_head *kmalloc_stripe(raid5_conf_t *conf, unsigned long sector, int size)
8971 struct stripe_head *sh = NULL, *tmp;
8972 struct buffer_head *buffer_pool, *bh_pool;
8974 PRINTK(("kmalloc_stripe called\n"));
8976 - while ((sh = get_free_stripe(raid_conf)) == NULL) {
8977 - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes / 8);
8978 - if ((sh = get_free_stripe(raid_conf)) != NULL)
8979 + while ((sh = get_free_stripe(conf)) == NULL) {
8980 + shrink_stripe_cache(conf, conf->max_nr_stripes / 8);
8981 + if ((sh = get_free_stripe(conf)) != NULL)
8983 - if (!raid_conf->nr_pending_stripes)
8984 + if (!conf->nr_pending_stripes)
8985 printk("raid5: bug: nr_free_sh == 0, nr_pending_stripes == 0\n");
8986 - md_wakeup_thread(raid_conf->thread);
8987 + md_wakeup_thread(conf->thread);
8988 PRINTK(("waiting for some stripes to complete\n"));
8989 - sleep_on(&raid_conf->wait_for_stripe);
8990 + sleep_on(&conf->wait_for_stripe);
8994 * The above might have slept, so perhaps another process
8995 * already created the stripe for us..
8997 - if ((tmp = find_stripe(raid_conf, sector, size)) != NULL) {
8998 - put_free_stripe(raid_conf, sh);
8999 + if ((tmp = find_stripe(conf, sector, size)) != NULL) {
9000 + put_free_stripe(conf, sh);
9001 wait_on_stripe(tmp);
9004 @@ -472,25 +478,25 @@
9005 sh->bh_pool = bh_pool;
9006 sh->phase = PHASE_COMPLETE;
9007 sh->cmd = STRIPE_NONE;
9008 - sh->raid_conf = raid_conf;
9009 + sh->raid_conf = conf;
9010 sh->sector = sector;
9012 - raid_conf->nr_cached_stripes++;
9013 - insert_hash(raid_conf, sh);
9014 + conf->nr_cached_stripes++;
9015 + insert_hash(conf, sh);
9016 } else printk("raid5: bug: kmalloc_stripe() == NULL\n");
9020 -static struct stripe_head *get_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
9021 +static struct stripe_head *get_stripe(raid5_conf_t *conf, unsigned long sector, int size)
9023 struct stripe_head *sh;
9025 PRINTK(("get_stripe, sector %lu\n", sector));
9026 - sh = find_stripe(raid_conf, sector, size);
9027 + sh = find_stripe(conf, sector, size);
9031 - sh = kmalloc_stripe(raid_conf, sector, size);
9032 + sh = kmalloc_stripe(conf, sector, size);
9037 bh->b_end_io(bh, uptodate);
9039 printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for "
9040 - "block %lu\n", kdevname(bh->b_dev), bh->b_blocknr);
9041 + "block %lu\n", partition_name(bh->b_dev), bh->b_blocknr);
9044 static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate)
9045 @@ -537,36 +543,35 @@
9046 static void raid5_end_request (struct buffer_head * bh, int uptodate)
9048 struct stripe_head *sh = bh->b_dev_id;
9049 - struct raid5_data *raid_conf = sh->raid_conf;
9050 - int disks = raid_conf->raid_disks, i;
9051 + raid5_conf_t *conf = sh->raid_conf;
9052 + int disks = conf->raid_disks, i;
9053 unsigned long flags;
9055 PRINTK(("end_request %lu, nr_pending %d\n", sh->sector, sh->nr_pending));
9056 - save_flags(flags);
9058 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
9059 raid5_mark_buffer_uptodate(bh, uptodate);
9061 - if (!sh->nr_pending) {
9062 - md_wakeup_thread(raid_conf->thread);
9063 - atomic_inc(&raid_conf->nr_handle);
9064 + if (atomic_dec_and_test(&sh->nr_pending)) {
9065 + md_wakeup_thread(conf->thread);
9066 + atomic_inc(&conf->nr_handle);
9070 md_error(bh->b_dev, bh->b_rdev);
9071 - if (raid_conf->failed_disks) {
9073 + if (conf->failed_disks) {
9074 for (i = 0; i < disks; i++) {
9075 - if (raid_conf->disks[i].operational)
9076 + if (conf->disks[i].operational)
9078 if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i])
9080 - if (bh->b_rdev != raid_conf->disks[i].dev)
9081 + if (bh->b_rdev != conf->disks[i].dev)
9083 set_bit(STRIPE_ERROR, &sh->state);
9086 - restore_flags(flags);
9087 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
9090 -static int raid5_map (struct md_dev *mddev, kdev_t *rdev,
9091 +static int raid5_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
9092 unsigned long *rsector, unsigned long size)
9094 /* No complex mapping used: the core of the work is done in the
9095 @@ -577,11 +582,10 @@
9097 static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i)
9099 - struct raid5_data *raid_conf = sh->raid_conf;
9100 - struct md_dev *mddev = raid_conf->mddev;
9101 - int minor = (int) (mddev - md_dev);
9102 + raid5_conf_t *conf = sh->raid_conf;
9103 + mddev_t *mddev = conf->mddev;
9105 - kdev_t dev = MKDEV(MD_MAJOR, minor);
9106 + kdev_t dev = mddev_to_kdev(mddev);
9107 int block = sh->sector / (sh->size >> 9);
9109 b_data = ((volatile struct buffer_head *) bh)->b_data;
9111 init_buffer(bh, dev, block, raid5_end_request, sh);
9112 ((volatile struct buffer_head *) bh)->b_data = b_data;
9114 - bh->b_rdev = raid_conf->disks[i].dev;
9115 + bh->b_rdev = conf->disks[i].dev;
9116 bh->b_rsector = sh->sector;
9118 bh->b_state = (1 << BH_Req);
9119 @@ -597,33 +601,62 @@
9120 bh->b_list = BUF_LOCKED;
9123 -static int raid5_error (struct md_dev *mddev, kdev_t dev)
9124 +static int raid5_error (mddev_t *mddev, kdev_t dev)
9126 - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
9127 - md_superblock_t *sb = mddev->sb;
9128 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
9129 + mdp_super_t *sb = mddev->sb;
9130 struct disk_info *disk;
9133 PRINTK(("raid5_error called\n"));
9134 - raid_conf->resync_parity = 0;
9135 - for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
9136 + conf->resync_parity = 0;
9137 + for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
9138 if (disk->dev == dev && disk->operational) {
9139 disk->operational = 0;
9140 - sb->disks[disk->number].state |= (1 << MD_FAULTY_DEVICE);
9141 - sb->disks[disk->number].state &= ~(1 << MD_SYNC_DEVICE);
9142 - sb->disks[disk->number].state &= ~(1 << MD_ACTIVE_DEVICE);
9143 + mark_disk_faulty(sb->disks+disk->number);
9144 + mark_disk_nonsync(sb->disks+disk->number);
9145 + mark_disk_inactive(sb->disks+disk->number);
9147 sb->working_disks--;
9149 mddev->sb_dirty = 1;
9150 - raid_conf->working_disks--;
9151 - raid_conf->failed_disks++;
9152 - md_wakeup_thread(raid_conf->thread);
9153 + conf->working_disks--;
9154 + conf->failed_disks++;
9155 + md_wakeup_thread(conf->thread);
9157 - "RAID5: Disk failure on %s, disabling device."
9158 - "Operation continuing on %d devices\n",
9159 - kdevname (dev), raid_conf->working_disks);
9160 + "raid5: Disk failure on %s, disabling device."
9161 + " Operation continuing on %d devices\n",
9162 + partition_name (dev), conf->working_disks);
9167 + * handle errors in spares (during reconstruction)
9169 + if (conf->spare) {
9170 + disk = conf->spare;
9171 + if (disk->dev == dev) {
9172 + printk (KERN_ALERT
9173 + "raid5: Disk failure on spare %s\n",
9174 + partition_name (dev));
9175 + if (!conf->spare->operational) {
9179 + disk->operational = 0;
9180 + disk->write_only = 0;
9181 + conf->spare = NULL;
9182 + mark_disk_faulty(sb->disks+disk->number);
9183 + mark_disk_nonsync(sb->disks+disk->number);
9184 + mark_disk_inactive(sb->disks+disk->number);
9185 + sb->spare_disks--;
9186 + sb->working_disks--;
9187 + sb->failed_disks++;
9196 @@ -634,12 +667,12 @@
9197 static inline unsigned long
9198 raid5_compute_sector (int r_sector, unsigned int raid_disks, unsigned int data_disks,
9199 unsigned int * dd_idx, unsigned int * pd_idx,
9200 - struct raid5_data *raid_conf)
9201 + raid5_conf_t *conf)
9203 unsigned int stripe;
9204 int chunk_number, chunk_offset;
9205 unsigned long new_sector;
9206 - int sectors_per_chunk = raid_conf->chunk_size >> 9;
9207 + int sectors_per_chunk = conf->chunk_size >> 9;
9209 /* First compute the information on this sector */
9213 * Select the parity disk based on the user selected algorithm.
9215 - if (raid_conf->level == 4)
9216 + if (conf->level == 4)
9217 *pd_idx = data_disks;
9218 - else switch (raid_conf->algorithm) {
9219 + else switch (conf->algorithm) {
9220 case ALGORITHM_LEFT_ASYMMETRIC:
9221 *pd_idx = data_disks - stripe % raid_disks;
9222 if (*dd_idx >= *pd_idx)
9224 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
9227 - printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
9228 + printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
9232 @@ -705,16 +738,16 @@
9234 static unsigned long compute_blocknr(struct stripe_head *sh, int i)
9236 - struct raid5_data *raid_conf = sh->raid_conf;
9237 - int raid_disks = raid_conf->raid_disks, data_disks = raid_disks - 1;
9238 + raid5_conf_t *conf = sh->raid_conf;
9239 + int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
9240 unsigned long new_sector = sh->sector, check;
9241 - int sectors_per_chunk = raid_conf->chunk_size >> 9;
9242 + int sectors_per_chunk = conf->chunk_size >> 9;
9243 unsigned long stripe = new_sector / sectors_per_chunk;
9244 int chunk_offset = new_sector % sectors_per_chunk;
9245 int chunk_number, dummy1, dummy2, dd_idx = i;
9246 unsigned long r_sector, blocknr;
9248 - switch (raid_conf->algorithm) {
9249 + switch (conf->algorithm) {
9250 case ALGORITHM_LEFT_ASYMMETRIC:
9251 case ALGORITHM_RIGHT_ASYMMETRIC:
9253 @@ -727,14 +760,14 @@
9254 i -= (sh->pd_idx + 1);
9257 - printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
9258 + printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
9261 chunk_number = stripe * data_disks + i;
9262 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
9263 blocknr = r_sector / (sh->size >> 9);
9265 - check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, raid_conf);
9266 + check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
9267 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
9268 printk("compute_blocknr: map not correct\n");
9270 @@ -742,36 +775,11 @@
9274 -#ifdef HAVE_ARCH_XORBLOCK
9275 -static void xor_block(struct buffer_head *dest, struct buffer_head *source)
9277 - __xor_block((char *) dest->b_data, (char *) source->b_data, dest->b_size);
9280 -static void xor_block(struct buffer_head *dest, struct buffer_head *source)
9282 - long lines = dest->b_size / (sizeof (long)) / 8, i;
9283 - long *destp = (long *) dest->b_data, *sourcep = (long *) source->b_data;
9285 - for (i = lines; i > 0; i--) {
9286 - *(destp + 0) ^= *(sourcep + 0);
9287 - *(destp + 1) ^= *(sourcep + 1);
9288 - *(destp + 2) ^= *(sourcep + 2);
9289 - *(destp + 3) ^= *(sourcep + 3);
9290 - *(destp + 4) ^= *(sourcep + 4);
9291 - *(destp + 5) ^= *(sourcep + 5);
9292 - *(destp + 6) ^= *(sourcep + 6);
9293 - *(destp + 7) ^= *(sourcep + 7);
9300 static void compute_block(struct stripe_head *sh, int dd_idx)
9302 - struct raid5_data *raid_conf = sh->raid_conf;
9303 - int i, disks = raid_conf->raid_disks;
9304 + raid5_conf_t *conf = sh->raid_conf;
9305 + int i, count, disks = conf->raid_disks;
9306 + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
9308 PRINTK(("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx));
9310 @@ -780,69 +788,100 @@
9311 raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx);
9313 memset(sh->bh_old[dd_idx]->b_data, 0, sh->size);
9314 + bh_ptr[0] = sh->bh_old[dd_idx];
9316 for (i = 0; i < disks; i++) {
9319 if (sh->bh_old[i]) {
9320 - xor_block(sh->bh_old[dd_idx], sh->bh_old[i]);
9323 + bh_ptr[count++] = sh->bh_old[i];
9325 printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
9327 + if (count == MAX_XOR_BLOCKS) {
9328 + xor_block(count, &bh_ptr[0]);
9333 + xor_block(count, &bh_ptr[0]);
9335 raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1);
9338 static void compute_parity(struct stripe_head *sh, int method)
9340 - struct raid5_data *raid_conf = sh->raid_conf;
9341 - int i, pd_idx = sh->pd_idx, disks = raid_conf->raid_disks;
9342 + raid5_conf_t *conf = sh->raid_conf;
9343 + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, lowprio, count;
9344 + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
9346 PRINTK(("compute_parity, stripe %lu, method %d\n", sh->sector, method));
9348 for (i = 0; i < disks; i++) {
9349 if (i == pd_idx || !sh->bh_new[i])
9351 if (!sh->bh_copy[i])
9352 sh->bh_copy[i] = raid5_kmalloc_buffer(sh, sh->size);
9353 raid5_build_block(sh, sh->bh_copy[i], i);
9354 + if (!buffer_lowprio(sh->bh_new[i]))
9357 + mark_buffer_lowprio(sh->bh_copy[i]);
9358 mark_buffer_clean(sh->bh_new[i]);
9359 memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size);
9361 if (sh->bh_copy[pd_idx] == NULL)
9362 sh->bh_copy[pd_idx] = raid5_kmalloc_buffer(sh, sh->size);
9363 raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx);
9365 + mark_buffer_lowprio(sh->bh_copy[pd_idx]);
9367 if (method == RECONSTRUCT_WRITE) {
9368 memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size);
9369 + bh_ptr[0] = sh->bh_copy[pd_idx];
9371 for (i = 0; i < disks; i++) {
9372 if (i == sh->pd_idx)
9374 if (sh->bh_new[i]) {
9375 - xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
9377 + bh_ptr[count++] = sh->bh_copy[i];
9378 + } else if (sh->bh_old[i]) {
9379 + bh_ptr[count++] = sh->bh_old[i];
9381 - if (sh->bh_old[i]) {
9382 - xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
9384 + if (count == MAX_XOR_BLOCKS) {
9385 + xor_block(count, &bh_ptr[0]);
9390 + xor_block(count, &bh_ptr[0]);
9392 } else if (method == READ_MODIFY_WRITE) {
9393 memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size);
9394 + bh_ptr[0] = sh->bh_copy[pd_idx];
9396 for (i = 0; i < disks; i++) {
9397 if (i == sh->pd_idx)
9399 if (sh->bh_new[i] && sh->bh_old[i]) {
9400 - xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
9401 - xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
9403 + bh_ptr[count++] = sh->bh_copy[i];
9404 + bh_ptr[count++] = sh->bh_old[i];
9406 + if (count >= (MAX_XOR_BLOCKS - 1)) {
9407 + xor_block(count, &bh_ptr[0]);
9412 + xor_block(count, &bh_ptr[0]);
9415 raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1);
9418 static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
9420 - struct raid5_data *raid_conf = sh->raid_conf;
9421 + raid5_conf_t *conf = sh->raid_conf;
9422 struct buffer_head *bh_req;
9424 if (sh->bh_new[dd_idx]) {
9425 @@ -860,19 +899,22 @@
9426 if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) {
9427 sh->phase = PHASE_BEGIN;
9428 sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE;
9429 - raid_conf->nr_pending_stripes++;
9430 - atomic_inc(&raid_conf->nr_handle);
9431 + conf->nr_pending_stripes++;
9432 + atomic_inc(&conf->nr_handle);
9434 sh->bh_new[dd_idx] = bh;
9435 sh->bh_req[dd_idx] = bh_req;
9436 sh->cmd_new[dd_idx] = rw;
9437 sh->new[dd_idx] = 1;
9439 + if (buffer_lowprio(bh))
9440 + mark_buffer_lowprio(bh_req);
9443 static void complete_stripe(struct stripe_head *sh)
9445 - struct raid5_data *raid_conf = sh->raid_conf;
9446 - int disks = raid_conf->raid_disks;
9447 + raid5_conf_t *conf = sh->raid_conf;
9448 + int disks = conf->raid_disks;
9451 PRINTK(("complete_stripe %lu\n", sh->sector));
9452 @@ -909,6 +951,22 @@
9457 +static int is_stripe_lowprio(struct stripe_head *sh, int disks)
9459 + int i, lowprio = 1;
9461 + for (i = 0; i < disks; i++) {
9462 + if (sh->bh_new[i])
9463 + if (!buffer_lowprio(sh->bh_new[i]))
9465 + if (sh->bh_old[i])
9466 + if (!buffer_lowprio(sh->bh_old[i]))
9473 * handle_stripe() is our main logic routine. Note that:
9475 @@ -919,28 +977,27 @@
9476 * 2. We should be careful to set sh->nr_pending whenever we sleep,
9477 * to prevent re-entry of handle_stripe() for the same sh.
9479 - * 3. raid_conf->failed_disks and disk->operational can be changed
9480 + * 3. conf->failed_disks and disk->operational can be changed
9481 * from an interrupt. This complicates things a bit, but it allows
9482 * us to stop issuing requests for a failed drive as soon as possible.
9484 static void handle_stripe(struct stripe_head *sh)
9486 - struct raid5_data *raid_conf = sh->raid_conf;
9487 - struct md_dev *mddev = raid_conf->mddev;
9488 - int minor = (int) (mddev - md_dev);
9489 + raid5_conf_t *conf = sh->raid_conf;
9490 + mddev_t *mddev = conf->mddev;
9491 struct buffer_head *bh;
9492 - int disks = raid_conf->raid_disks;
9493 - int i, nr = 0, nr_read = 0, nr_write = 0;
9494 + int disks = conf->raid_disks;
9495 + int i, nr = 0, nr_read = 0, nr_write = 0, lowprio;
9496 int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0, parity = 0;
9497 int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
9498 int reading = 0, nr_writing = 0;
9499 int method1 = INT_MAX, method2 = INT_MAX;
9501 unsigned long flags;
9502 - int operational[MD_SB_DISKS], failed_disks = raid_conf->failed_disks;
9503 + int operational[MD_SB_DISKS], failed_disks = conf->failed_disks;
9505 PRINTK(("handle_stripe(), stripe %lu\n", sh->sector));
9506 - if (sh->nr_pending) {
9507 + if (md_atomic_read(&sh->nr_pending)) {
9508 printk("handle_stripe(), stripe %lu, io still pending\n", sh->sector);
9511 @@ -949,9 +1006,9 @@
9515 - atomic_dec(&raid_conf->nr_handle);
9516 + atomic_dec(&conf->nr_handle);
9518 - if (test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
9519 + if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
9520 printk("raid5: restarting stripe %lu\n", sh->sector);
9521 sh->phase = PHASE_BEGIN;
9523 @@ -969,11 +1026,11 @@
9526 for (i = 0; i < disks; i++) {
9527 - operational[i] = raid_conf->disks[i].operational;
9528 - if (i == sh->pd_idx && raid_conf->resync_parity)
9529 + operational[i] = conf->disks[i].operational;
9530 + if (i == sh->pd_idx && conf->resync_parity)
9533 - failed_disks = raid_conf->failed_disks;
9534 + failed_disks = conf->failed_disks;
9535 restore_flags(flags);
9537 if (failed_disks > 1) {
9538 @@ -1017,7 +1074,7 @@
9541 if (nr_write && nr_read)
9542 - printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
9543 + printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
9547 @@ -1030,7 +1087,7 @@
9550 block = (int) compute_blocknr(sh, i);
9551 - bh = find_buffer(MKDEV(MD_MAJOR, minor), block, sh->size);
9552 + bh = find_buffer(mddev_to_kdev(mddev), block, sh->size);
9553 if (bh && bh->b_count == 0 && buffer_dirty(bh) && !buffer_locked(bh)) {
9554 PRINTK(("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block));
9555 add_stripe_bh(sh, bh, i, WRITE);
9556 @@ -1064,21 +1121,22 @@
9558 if (!method1 || !method2) {
9561 + lowprio = is_stripe_lowprio(sh, disks);
9562 + atomic_inc(&sh->nr_pending);
9563 sh->phase = PHASE_WRITE;
9564 compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
9565 for (i = 0; i < disks; i++) {
9566 - if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
9567 + if (!operational[i] && !conf->spare && !conf->resync_parity)
9569 if (i == sh->pd_idx || sh->bh_new[i])
9573 - sh->nr_pending = nr_writing;
9574 - PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, sh->nr_pending));
9575 + md_atomic_set(&sh->nr_pending, nr_writing);
9576 + PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
9578 for (i = 0; i < disks; i++) {
9579 - if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
9580 + if (!operational[i] && !conf->spare && !conf->resync_parity)
9582 bh = sh->bh_copy[i];
9583 if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL)))
9584 @@ -1089,18 +1147,30 @@
9585 bh->b_state |= (1<<BH_Dirty);
9586 PRINTK(("making request for buffer %d\n", i));
9587 clear_bit(BH_Lock, &bh->b_state);
9588 - if (!operational[i] && !raid_conf->resync_parity) {
9589 - bh->b_rdev = raid_conf->spare->dev;
9590 - make_request(MAJOR(raid_conf->spare->dev), WRITE, bh);
9592 - make_request(MAJOR(raid_conf->disks[i].dev), WRITE, bh);
9593 + if (!operational[i] && !conf->resync_parity) {
9594 + bh->b_rdev = conf->spare->dev;
9595 + make_request(MAJOR(conf->spare->dev), WRITE, bh);
9598 + make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
9600 + if (!lowprio || (i==sh->pd_idx))
9601 + make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
9603 + mark_buffer_clean(bh);
9604 + raid5_end_request(bh,1);
9616 + lowprio = is_stripe_lowprio(sh, disks);
9617 + atomic_inc(&sh->nr_pending);
9618 if (method1 < method2) {
9619 sh->write_method = RECONSTRUCT_WRITE;
9620 for (i = 0; i < disks; i++) {
9621 @@ -1110,6 +1180,8 @@
9623 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
9624 raid5_build_block(sh, sh->bh_old[i], i);
9626 + mark_buffer_lowprio(sh->bh_old[i]);
9630 @@ -1121,19 +1193,21 @@
9632 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
9633 raid5_build_block(sh, sh->bh_old[i], i);
9635 + mark_buffer_lowprio(sh->bh_old[i]);
9639 sh->phase = PHASE_READ_OLD;
9640 - sh->nr_pending = reading;
9641 - PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, sh->nr_pending));
9642 + md_atomic_set(&sh->nr_pending, reading);
9643 + PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)));
9644 for (i = 0; i < disks; i++) {
9647 if (buffer_uptodate(sh->bh_old[i]))
9649 clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
9650 - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
9651 + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
9655 @@ -1141,7 +1215,8 @@
9657 method1 = nr_read - nr_cache_overwrite;
9660 + lowprio = is_stripe_lowprio(sh,disks);
9661 + atomic_inc(&sh->nr_pending);
9663 PRINTK(("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1));
9664 if (!method1 || (method1 == 1 && nr_cache == disks - 1)) {
9665 @@ -1149,18 +1224,22 @@
9666 for (i = 0; i < disks; i++) {
9669 - if (!sh->bh_old[i])
9670 + if (!sh->bh_old[i]) {
9671 compute_block(sh, i);
9673 + mark_buffer_lowprio
9676 memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
9679 + atomic_dec(&sh->nr_pending);
9680 complete_stripe(sh);
9683 if (nr_failed_overwrite) {
9684 sh->phase = PHASE_READ_OLD;
9685 - sh->nr_pending = (disks - 1) - nr_cache;
9686 - PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, sh->nr_pending));
9687 + md_atomic_set(&sh->nr_pending, (disks - 1) - nr_cache);
9688 + PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
9689 for (i = 0; i < disks; i++) {
9692 @@ -1168,13 +1247,16 @@
9694 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
9695 raid5_build_block(sh, sh->bh_old[i], i);
9697 + mark_buffer_lowprio(sh->bh_old[i]);
9698 clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
9699 - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
9700 + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
9703 sh->phase = PHASE_READ;
9704 - sh->nr_pending = nr_read - nr_cache_overwrite;
9705 - PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, sh->nr_pending));
9706 + md_atomic_set(&sh->nr_pending,
9707 + nr_read - nr_cache_overwrite);
9708 + PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
9709 for (i = 0; i < disks; i++) {
9712 @@ -1182,16 +1264,16 @@
9713 memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
9716 - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_req[i]);
9717 + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_req[i]);
9723 -static int raid5_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
9724 +static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
9726 - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
9727 - const unsigned int raid_disks = raid_conf->raid_disks;
9728 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
9729 + const unsigned int raid_disks = conf->raid_disks;
9730 const unsigned int data_disks = raid_disks - 1;
9731 unsigned int dd_idx, pd_idx;
9732 unsigned long new_sector;
9733 @@ -1202,15 +1284,15 @@
9734 if (rw == WRITEA) rw = WRITE;
9736 new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks,
9737 - &dd_idx, &pd_idx, raid_conf);
9738 + &dd_idx, &pd_idx, conf);
9740 PRINTK(("raid5_make_request, sector %lu\n", new_sector));
9742 - sh = get_stripe(raid_conf, new_sector, bh->b_size);
9743 + sh = get_stripe(conf, new_sector, bh->b_size);
9744 if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) {
9745 PRINTK(("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd));
9747 - if (!sh->nr_pending)
9748 + if (!md_atomic_read(&sh->nr_pending))
9752 @@ -1221,24 +1303,24 @@
9753 printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector);
9754 printk("raid5: bh %p, bh_new %p\n", bh, sh->bh_new[dd_idx]);
9756 - md_wakeup_thread(raid_conf->thread);
9757 + md_wakeup_thread(conf->thread);
9761 add_stripe_bh(sh, bh, dd_idx, rw);
9763 - md_wakeup_thread(raid_conf->thread);
9764 + md_wakeup_thread(conf->thread);
9768 static void unplug_devices(struct stripe_head *sh)
9771 - struct raid5_data *raid_conf = sh->raid_conf;
9772 + raid5_conf_t *conf = sh->raid_conf;
9775 - for (i = 0; i < raid_conf->raid_disks; i++)
9776 - unplug_device(blk_dev + MAJOR(raid_conf->disks[i].dev));
9777 + for (i = 0; i < conf->raid_disks; i++)
9778 + unplug_device(blk_dev + MAJOR(conf->disks[i].dev));
9782 @@ -1252,8 +1334,8 @@
9783 static void raid5d (void *data)
9785 struct stripe_head *sh;
9786 - struct raid5_data *raid_conf = data;
9787 - struct md_dev *mddev = raid_conf->mddev;
9788 + raid5_conf_t *conf = data;
9789 + mddev_t *mddev = conf->mddev;
9790 int i, handled = 0, unplug = 0;
9791 unsigned long flags;
9793 @@ -1261,47 +1343,47 @@
9795 if (mddev->sb_dirty) {
9796 mddev->sb_dirty = 0;
9797 - md_update_sb((int) (mddev - md_dev));
9798 + md_update_sb(mddev);
9800 for (i = 0; i < NR_HASH; i++) {
9802 - sh = raid_conf->stripe_hashtbl[i];
9803 + sh = conf->stripe_hashtbl[i];
9804 for (; sh; sh = sh->hash_next) {
9805 - if (sh->raid_conf != raid_conf)
9806 + if (sh->raid_conf != conf)
9808 if (sh->phase == PHASE_COMPLETE)
9810 - if (sh->nr_pending)
9811 + if (md_atomic_read(&sh->nr_pending))
9813 - if (sh->sector == raid_conf->next_sector) {
9814 - raid_conf->sector_count += (sh->size >> 9);
9815 - if (raid_conf->sector_count >= 128)
9816 + if (sh->sector == conf->next_sector) {
9817 + conf->sector_count += (sh->size >> 9);
9818 + if (conf->sector_count >= 128)
9823 - PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, raid_conf->sector_count));
9824 + PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, conf->sector_count));
9827 - raid_conf->sector_count = 0;
9828 + conf->sector_count = 0;
9830 - raid_conf->next_sector = sh->sector + (sh->size >> 9);
9831 + conf->next_sector = sh->sector + (sh->size >> 9);
9838 - PRINTK(("%d stripes handled, nr_handle %d\n", handled, atomic_read(&raid_conf->nr_handle)));
9840 + PRINTK(("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle)));
9843 - if (!atomic_read(&raid_conf->nr_handle))
9844 - clear_bit(THREAD_WAKEUP, &raid_conf->thread->flags);
9845 + if (!md_atomic_read(&conf->nr_handle))
9846 + clear_bit(THREAD_WAKEUP, &conf->thread->flags);
9847 + restore_flags(flags);
9849 PRINTK(("--- raid5d inactive\n"));
9852 -#if SUPPORT_RECONSTRUCTION
9854 * Private kernel thread for parity reconstruction after an unclean
9855 * shutdown. Reconstruction on spare drives in case of a failed drive
9856 @@ -1309,44 +1391,64 @@
9858 static void raid5syncd (void *data)
9860 - struct raid5_data *raid_conf = data;
9861 - struct md_dev *mddev = raid_conf->mddev;
9862 + raid5_conf_t *conf = data;
9863 + mddev_t *mddev = conf->mddev;
9865 - if (!raid_conf->resync_parity)
9866 + if (!conf->resync_parity)
9868 + if (conf->resync_parity == 2)
9870 + down(&mddev->recovery_sem);
9871 + if (md_do_sync(mddev,NULL)) {
9872 + up(&mddev->recovery_sem);
9873 + printk("raid5: resync aborted!\n");
9875 - md_do_sync(mddev);
9876 - raid_conf->resync_parity = 0;
9878 + conf->resync_parity = 0;
9879 + up(&mddev->recovery_sem);
9880 + printk("raid5: resync finished.\n");
9882 -#endif /* SUPPORT_RECONSTRUCTION */
9884 -static int __check_consistency (struct md_dev *mddev, int row)
9885 +static int __check_consistency (mddev_t *mddev, int row)
9887 - struct raid5_data *raid_conf = mddev->private;
9888 + raid5_conf_t *conf = mddev->private;
9890 struct buffer_head *bh[MD_SB_DISKS], tmp;
9891 - int i, rc = 0, nr = 0;
9892 + int i, rc = 0, nr = 0, count;
9893 + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
9895 - if (raid_conf->working_disks != raid_conf->raid_disks)
9896 + if (conf->working_disks != conf->raid_disks)
9899 if ((tmp.b_data = (char *) get_free_page(GFP_KERNEL)) == NULL)
9901 + md_clear_page((unsigned long)tmp.b_data);
9902 memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
9903 - for (i = 0; i < raid_conf->raid_disks; i++) {
9904 - dev = raid_conf->disks[i].dev;
9905 + for (i = 0; i < conf->raid_disks; i++) {
9906 + dev = conf->disks[i].dev;
9907 set_blocksize(dev, 4096);
9908 if ((bh[i] = bread(dev, row / 4, 4096)) == NULL)
9912 - if (nr == raid_conf->raid_disks) {
9913 - for (i = 1; i < nr; i++)
9914 - xor_block(&tmp, bh[i]);
9915 + if (nr == conf->raid_disks) {
9918 + for (i = 1; i < nr; i++) {
9919 + bh_ptr[count++] = bh[i];
9920 + if (count == MAX_XOR_BLOCKS) {
9921 + xor_block(count, &bh_ptr[0]);
9926 + xor_block(count, &bh_ptr[0]);
9928 if (memcmp(tmp.b_data, bh[0]->b_data, 4096))
9931 - for (i = 0; i < raid_conf->raid_disks; i++) {
9932 - dev = raid_conf->disks[i].dev;
9933 + for (i = 0; i < conf->raid_disks; i++) {
9934 + dev = conf->disks[i].dev;
9938 @@ -1358,285 +1460,607 @@
9942 -static int check_consistency (struct md_dev *mddev)
9943 +static int check_consistency (mddev_t *mddev)
9945 - int size = mddev->sb->size;
9947 + if (__check_consistency(mddev, 0))
9949 + * We are not checking this currently, as it's legitimate to have
9950 + * an inconsistent array, at creation time.
9954 - for (row = 0; row < size; row += size / 8)
9955 - if (__check_consistency(mddev, row))
9960 -static int raid5_run (int minor, struct md_dev *mddev)
9961 +static int raid5_run (mddev_t *mddev)
9963 - struct raid5_data *raid_conf;
9964 + raid5_conf_t *conf;
9965 int i, j, raid_disk, memory;
9966 - md_superblock_t *sb = mddev->sb;
9967 - md_descriptor_t *descriptor;
9968 - struct real_dev *realdev;
9969 + mdp_super_t *sb = mddev->sb;
9972 + struct disk_info *disk;
9973 + struct md_list_head *tmp;
9974 + int start_recovery = 0;
9978 if (sb->level != 5 && sb->level != 4) {
9979 - printk("raid5: %s: raid level not set to 4/5 (%d)\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
9980 + printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
9985 - mddev->private = kmalloc (sizeof (struct raid5_data), GFP_KERNEL);
9986 - if ((raid_conf = mddev->private) == NULL)
9987 + mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
9988 + if ((conf = mddev->private) == NULL)
9990 - memset (raid_conf, 0, sizeof (*raid_conf));
9991 - raid_conf->mddev = mddev;
9992 + memset (conf, 0, sizeof (*conf));
9993 + conf->mddev = mddev;
9995 - if ((raid_conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
9996 + if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
9998 - memset(raid_conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
9999 + memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
10001 - init_waitqueue(&raid_conf->wait_for_stripe);
10002 - PRINTK(("raid5_run(%d) called.\n", minor));
10004 - for (i = 0; i < mddev->nb_dev; i++) {
10005 - realdev = &mddev->devices[i];
10006 - if (!realdev->sb) {
10007 - printk(KERN_ERR "raid5: disabled device %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
10010 + init_waitqueue(&conf->wait_for_stripe);
10011 + PRINTK(("raid5_run(md%d) called.\n", mdidx(mddev)));
10013 + ITERATE_RDEV(mddev,rdev,tmp) {
10015 * This is important -- we are using the descriptor on
10016 * the disk only to get a pointer to the descriptor on
10017 * the main superblock, which might be more recent.
10019 - descriptor = &sb->disks[realdev->sb->descriptor.number];
10020 - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
10021 - printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", kdevname(realdev->dev));
10022 + desc = sb->disks + rdev->desc_nr;
10023 + raid_disk = desc->raid_disk;
10024 + disk = conf->disks + raid_disk;
10026 + if (disk_faulty(desc)) {
10027 + printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
10028 + if (!rdev->faulty) {
10032 + disk->number = desc->number;
10033 + disk->raid_disk = raid_disk;
10034 + disk->dev = rdev->dev;
10036 + disk->operational = 0;
10037 + disk->write_only = 0;
10039 + disk->used_slot = 1;
10042 - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
10043 - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
10044 - printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", kdevname(realdev->dev));
10046 + if (disk_active(desc)) {
10047 + if (!disk_sync(desc)) {
10048 + printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
10052 - raid_disk = descriptor->raid_disk;
10053 - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
10054 - printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", kdevname(realdev->dev));
10055 + if (raid_disk > sb->raid_disks) {
10056 + printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
10059 - if (raid_conf->disks[raid_disk].operational) {
10060 - printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", kdevname(realdev->dev), raid_disk);
10061 + if (disk->operational) {
10062 + printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
10065 - printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", kdevname(realdev->dev), raid_disk);
10066 + printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
10068 - raid_conf->disks[raid_disk].number = descriptor->number;
10069 - raid_conf->disks[raid_disk].raid_disk = raid_disk;
10070 - raid_conf->disks[raid_disk].dev = mddev->devices[i].dev;
10071 - raid_conf->disks[raid_disk].operational = 1;
10072 + disk->number = desc->number;
10073 + disk->raid_disk = raid_disk;
10074 + disk->dev = rdev->dev;
10075 + disk->operational = 1;
10076 + disk->used_slot = 1;
10078 - raid_conf->working_disks++;
10079 + conf->working_disks++;
10082 * Must be a spare disk ..
10084 - printk(KERN_INFO "raid5: spare disk %s\n", kdevname(realdev->dev));
10085 - raid_disk = descriptor->raid_disk;
10086 - raid_conf->disks[raid_disk].number = descriptor->number;
10087 - raid_conf->disks[raid_disk].raid_disk = raid_disk;
10088 - raid_conf->disks[raid_disk].dev = mddev->devices [i].dev;
10090 - raid_conf->disks[raid_disk].operational = 0;
10091 - raid_conf->disks[raid_disk].write_only = 0;
10092 - raid_conf->disks[raid_disk].spare = 1;
10095 - raid_conf->raid_disks = sb->raid_disks;
10096 - raid_conf->failed_disks = raid_conf->raid_disks - raid_conf->working_disks;
10097 - raid_conf->mddev = mddev;
10098 - raid_conf->chunk_size = sb->chunk_size;
10099 - raid_conf->level = sb->level;
10100 - raid_conf->algorithm = sb->parity_algorithm;
10101 - raid_conf->max_nr_stripes = NR_STRIPES;
10102 + printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
10103 + disk->number = desc->number;
10104 + disk->raid_disk = raid_disk;
10105 + disk->dev = rdev->dev;
10107 - if (raid_conf->working_disks != sb->raid_disks && sb->state != (1 << MD_SB_CLEAN)) {
10108 - printk(KERN_ALERT "raid5: raid set %s not clean and not all disks are operational -- run ckraid\n", kdevname(MKDEV(MD_MAJOR, minor)));
10110 + disk->operational = 0;
10111 + disk->write_only = 0;
10113 + disk->used_slot = 1;
10116 - if (!raid_conf->chunk_size || raid_conf->chunk_size % 4) {
10117 - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", raid_conf->chunk_size, kdevname(MKDEV(MD_MAJOR, minor)));
10119 + for (i = 0; i < MD_SB_DISKS; i++) {
10120 + desc = sb->disks + i;
10121 + raid_disk = desc->raid_disk;
10122 + disk = conf->disks + raid_disk;
10124 + if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
10125 + !conf->disks[raid_disk].used_slot) {
10127 + disk->number = desc->number;
10128 + disk->raid_disk = raid_disk;
10129 + disk->dev = MKDEV(0,0);
10131 + disk->operational = 0;
10132 + disk->write_only = 0;
10134 + disk->used_slot = 1;
10138 + conf->raid_disks = sb->raid_disks;
10140 + * 0 for a fully functional array, 1 for a degraded array.
10142 + conf->failed_disks = conf->raid_disks - conf->working_disks;
10143 + conf->mddev = mddev;
10144 + conf->chunk_size = sb->chunk_size;
10145 + conf->level = sb->level;
10146 + conf->algorithm = sb->layout;
10147 + conf->max_nr_stripes = NR_STRIPES;
10150 + for (i = 0; i < conf->raid_disks; i++) {
10151 + if (!conf->disks[i].used_slot) {
10157 + if (!conf->chunk_size || conf->chunk_size % 4) {
10158 + printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
10161 - if (raid_conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
10162 - printk(KERN_ERR "raid5: unsupported parity algorithm %d for %s\n", raid_conf->algorithm, kdevname(MKDEV(MD_MAJOR, minor)));
10163 + if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
10164 + printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
10167 - if (raid_conf->failed_disks > 1) {
10168 - printk(KERN_ERR "raid5: not enough operational devices for %s (%d/%d failed)\n", kdevname(MKDEV(MD_MAJOR, minor)), raid_conf->failed_disks, raid_conf->raid_disks);
10169 + if (conf->failed_disks > 1) {
10170 + printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
10174 - if ((sb->state & (1 << MD_SB_CLEAN)) && check_consistency(mddev)) {
10175 - printk(KERN_ERR "raid5: detected raid-5 xor inconsistenty -- run ckraid\n");
10176 - sb->state |= 1 << MD_SB_ERRORS;
10178 + if (conf->working_disks != sb->raid_disks) {
10179 + printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
10180 + start_recovery = 1;
10183 - if ((raid_conf->thread = md_register_thread(raid5d, raid_conf)) == NULL) {
10184 - printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
10186 + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) &&
10187 + check_consistency(mddev)) {
10188 + printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n");
10189 + sb->state &= ~(1 << MD_SB_CLEAN);
10192 -#if SUPPORT_RECONSTRUCTION
10193 - if ((raid_conf->resync_thread = md_register_thread(raid5syncd, raid_conf)) == NULL) {
10194 - printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
10197 + const char * name = "raid5d";
10199 + conf->thread = md_register_thread(raid5d, conf, name);
10200 + if (!conf->thread) {
10201 + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
10205 -#endif /* SUPPORT_RECONSTRUCTION */
10207 - memory = raid_conf->max_nr_stripes * (sizeof(struct stripe_head) +
10208 - raid_conf->raid_disks * (sizeof(struct buffer_head) +
10209 + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
10210 + conf->raid_disks * (sizeof(struct buffer_head) +
10211 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
10212 - if (grow_stripes(raid_conf, raid_conf->max_nr_stripes, GFP_KERNEL)) {
10213 + if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
10214 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
10215 - shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
10216 + shrink_stripes(conf, conf->max_nr_stripes);
10219 - printk(KERN_INFO "raid5: allocated %dkB for %s\n", memory, kdevname(MKDEV(MD_MAJOR, minor)));
10220 + printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
10223 * Regenerate the "device is in sync with the raid set" bit for
10226 - for (i = 0; i < sb->nr_disks ; i++) {
10227 - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
10228 + for (i = 0; i < MD_SB_DISKS ; i++) {
10229 + mark_disk_nonsync(sb->disks + i);
10230 for (j = 0; j < sb->raid_disks; j++) {
10231 - if (!raid_conf->disks[j].operational)
10232 + if (!conf->disks[j].operational)
10234 - if (sb->disks[i].number == raid_conf->disks[j].number)
10235 - sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
10236 + if (sb->disks[i].number == conf->disks[j].number)
10237 + mark_disk_sync(sb->disks + i);
10240 - sb->active_disks = raid_conf->working_disks;
10241 + sb->active_disks = conf->working_disks;
10243 if (sb->active_disks == sb->raid_disks)
10244 - printk("raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
10245 + printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
10247 - printk(KERN_ALERT "raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
10248 + printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
10250 + if (!start_recovery && ((sb->state & (1 << MD_SB_CLEAN))==0)) {
10251 + const char * name = "raid5syncd";
10253 + conf->resync_thread = md_register_thread(raid5syncd, conf,name);
10254 + if (!conf->resync_thread) {
10255 + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
10259 - if ((sb->state & (1 << MD_SB_CLEAN)) == 0) {
10260 - printk("raid5: raid set %s not clean; re-constructing parity\n", kdevname(MKDEV(MD_MAJOR, minor)));
10261 - raid_conf->resync_parity = 1;
10262 -#if SUPPORT_RECONSTRUCTION
10263 - md_wakeup_thread(raid_conf->resync_thread);
10264 -#endif /* SUPPORT_RECONSTRUCTION */
10265 + printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
10266 + conf->resync_parity = 1;
10267 + md_wakeup_thread(conf->resync_thread);
10270 + print_raid5_conf(conf);
10271 + if (start_recovery)
10272 + md_recover_arrays();
10273 + print_raid5_conf(conf);
10275 /* Ok, everything is just fine now */
10279 - if (raid_conf->stripe_hashtbl)
10280 - free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
10281 - kfree(raid_conf);
10283 + print_raid5_conf(conf);
10284 + if (conf->stripe_hashtbl)
10285 + free_pages((unsigned long) conf->stripe_hashtbl,
10286 + HASH_PAGES_ORDER);
10289 mddev->private = NULL;
10290 - printk(KERN_ALERT "raid5: failed to run raid set %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
10291 + printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
10296 -static int raid5_stop (int minor, struct md_dev *mddev)
10297 +static int raid5_stop_resync (mddev_t *mddev)
10299 + raid5_conf_t *conf = mddev_to_conf(mddev);
10300 + mdk_thread_t *thread = conf->resync_thread;
10303 + if (conf->resync_parity) {
10304 + conf->resync_parity = 2;
10305 + md_interrupt_thread(thread);
10306 + printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
10314 +static int raid5_restart_resync (mddev_t *mddev)
10316 - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
10317 + raid5_conf_t *conf = mddev_to_conf(mddev);
10319 - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
10320 - shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
10321 - md_unregister_thread(raid_conf->thread);
10322 -#if SUPPORT_RECONSTRUCTION
10323 - md_unregister_thread(raid_conf->resync_thread);
10324 -#endif /* SUPPORT_RECONSTRUCTION */
10325 - free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
10326 - kfree(raid_conf);
10327 + if (conf->resync_parity) {
10328 + if (!conf->resync_thread) {
10332 + printk("raid5: waking up raid5resync.\n");
10333 + conf->resync_parity = 1;
10334 + md_wakeup_thread(conf->resync_thread);
10337 + printk("raid5: no restart-resync needed.\n");
10342 +static int raid5_stop (mddev_t *mddev)
10344 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
10346 + shrink_stripe_cache(conf, conf->max_nr_stripes);
10347 + shrink_stripes(conf, conf->max_nr_stripes);
10348 + md_unregister_thread(conf->thread);
10349 + if (conf->resync_thread)
10350 + md_unregister_thread(conf->resync_thread);
10351 + free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
10353 mddev->private = NULL;
10358 -static int raid5_status (char *page, int minor, struct md_dev *mddev)
10359 +static int raid5_status (char *page, mddev_t *mddev)
10361 - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
10362 - md_superblock_t *sb = mddev->sb;
10363 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
10364 + mdp_super_t *sb = mddev->sb;
10367 - sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->parity_algorithm);
10368 - sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
10369 - for (i = 0; i < raid_conf->raid_disks; i++)
10370 - sz += sprintf (page+sz, "%s", raid_conf->disks[i].operational ? "U" : "_");
10371 + sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
10372 + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
10373 + for (i = 0; i < conf->raid_disks; i++)
10374 + sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
10375 sz += sprintf (page+sz, "]");
10379 -static int raid5_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, int state)
10380 +static void print_raid5_conf (raid5_conf_t *conf)
10383 + struct disk_info *tmp;
10385 + printk("RAID5 conf printout:\n");
10387 + printk("(conf==NULL)\n");
10390 + printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
10391 + conf->working_disks, conf->failed_disks);
10393 + for (i = 0; i < MD_SB_DISKS; i++) {
10394 + tmp = conf->disks + i;
10395 + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
10396 + i, tmp->spare,tmp->operational,
10397 + tmp->number,tmp->raid_disk,tmp->used_slot,
10398 + partition_name(tmp->dev));
10402 +static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
10404 - int i = 0, failed_disk = -1;
10405 - struct raid5_data *raid_conf = mddev->private;
10406 - struct disk_info *disk = raid_conf->disks;
10408 + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
10409 + raid5_conf_t *conf = mddev->private;
10410 + struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
10411 unsigned long flags;
10412 - md_superblock_t *sb = mddev->sb;
10413 - md_descriptor_t *descriptor;
10414 + mdp_super_t *sb = mddev->sb;
10415 + mdp_disk_t *failed_desc, *spare_desc, *added_desc;
10417 - for (i = 0; i < MD_SB_DISKS; i++, disk++) {
10418 - if (disk->spare && disk->number == spare->number)
10423 - for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
10424 - if (!disk->operational)
10426 - if (failed_disk == -1)
10431 + print_raid5_conf(conf);
10433 + * find the disk ...
10436 - case SPARE_WRITE:
10437 - disk->operational = 1;
10438 - disk->write_only = 1;
10439 - raid_conf->spare = disk;
10441 - case SPARE_INACTIVE:
10442 - disk->operational = 0;
10443 - disk->write_only = 0;
10444 - raid_conf->spare = NULL;
10446 - case SPARE_ACTIVE:
10448 - disk->write_only = 0;
10450 - descriptor = &sb->disks[raid_conf->disks[failed_disk].number];
10451 - i = spare->raid_disk;
10452 - disk->raid_disk = spare->raid_disk = descriptor->raid_disk;
10453 - if (disk->raid_disk != failed_disk)
10454 - printk("raid5: disk->raid_disk != failed_disk");
10455 - descriptor->raid_disk = i;
10457 - raid_conf->spare = NULL;
10458 - raid_conf->working_disks++;
10459 - raid_conf->failed_disks--;
10460 - raid_conf->disks[failed_disk] = *disk;
10463 - printk("raid5_mark_spare: bug: state == %d\n", state);
10464 - restore_flags(flags);
10466 + case DISKOP_SPARE_ACTIVE:
10469 + * Find the failed disk within the RAID5 configuration ...
10470 + * (this can only be in the first conf->raid_disks part)
10472 + for (i = 0; i < conf->raid_disks; i++) {
10473 + tmp = conf->disks + i;
10474 + if ((!tmp->operational && !tmp->spare) ||
10475 + !tmp->used_slot) {
10481 + * When we activate a spare disk we _must_ have a disk in
10482 + * the lower (active) part of the array to replace.
10484 + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
10489 + /* fall through */
10491 + case DISKOP_SPARE_WRITE:
10492 + case DISKOP_SPARE_INACTIVE:
10495 + * Find the spare disk ... (can only be in the 'high'
10496 + * area of the array)
10498 + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
10499 + tmp = conf->disks + i;
10500 + if (tmp->spare && tmp->number == (*d)->number) {
10505 + if (spare_disk == -1) {
10512 + case DISKOP_HOT_REMOVE_DISK:
10514 + for (i = 0; i < MD_SB_DISKS; i++) {
10515 + tmp = conf->disks + i;
10516 + if (tmp->used_slot && (tmp->number == (*d)->number)) {
10517 + if (tmp->operational) {
10521 + removed_disk = i;
10525 + if (removed_disk == -1) {
10532 + case DISKOP_HOT_ADD_DISK:
10534 + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
10535 + tmp = conf->disks + i;
10536 + if (!tmp->used_slot) {
10541 + if (added_disk == -1) {
10551 + * Switch the spare disk to write-only mode:
10553 + case DISKOP_SPARE_WRITE:
10554 + if (conf->spare) {
10559 + sdisk = conf->disks + spare_disk;
10560 + sdisk->operational = 1;
10561 + sdisk->write_only = 1;
10562 + conf->spare = sdisk;
10565 + * Deactivate a spare disk:
10567 + case DISKOP_SPARE_INACTIVE:
10568 + sdisk = conf->disks + spare_disk;
10569 + sdisk->operational = 0;
10570 + sdisk->write_only = 0;
10572 + * Was the spare being resynced?
10574 + if (conf->spare == sdisk)
10575 + conf->spare = NULL;
10578 + * Activate (mark read-write) the (now sync) spare disk,
10579 + * which means we switch its 'raid position' (->raid_disk)
10580 + * with the failed disk. (only the first 'conf->raid_disks'
10581 + * slots are used for 'real' disks and we must preserve this
10584 + case DISKOP_SPARE_ACTIVE:
10585 + if (!conf->spare) {
10590 + sdisk = conf->disks + spare_disk;
10591 + fdisk = conf->disks + failed_disk;
10593 + spare_desc = &sb->disks[sdisk->number];
10594 + failed_desc = &sb->disks[fdisk->number];
10596 + if (spare_desc != *d) {
10602 + if (spare_desc->raid_disk != sdisk->raid_disk) {
10608 + if (sdisk->raid_disk != spare_disk) {
10614 + if (failed_desc->raid_disk != fdisk->raid_disk) {
10620 + if (fdisk->raid_disk != failed_disk) {
10627 + * do the switch finally
10629 + xchg_values(*spare_desc, *failed_desc);
10630 + xchg_values(*fdisk, *sdisk);
10633 + * (careful, 'failed' and 'spare' are switched from now on)
10635 + * we want to preserve linear numbering and we want to
10636 + * give the proper raid_disk number to the now activated
10637 + * disk. (this means we switch back these values)
10640 + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
10641 + xchg_values(sdisk->raid_disk, fdisk->raid_disk);
10642 + xchg_values(spare_desc->number, failed_desc->number);
10643 + xchg_values(sdisk->number, fdisk->number);
10645 + *d = failed_desc;
10647 + if (sdisk->dev == MKDEV(0,0))
10648 + sdisk->used_slot = 0;
10651 + * this really activates the spare.
10653 + fdisk->spare = 0;
10654 + fdisk->write_only = 0;
10657 + * if we activate a spare, we definitely replace a
10658 + * non-operational disk slot in the 'low' area of
10659 + * the disk array.
10661 + conf->failed_disks--;
10662 + conf->working_disks++;
10663 + conf->spare = NULL;
10667 + case DISKOP_HOT_REMOVE_DISK:
10668 + rdisk = conf->disks + removed_disk;
10670 + if (rdisk->spare && (removed_disk < conf->raid_disks)) {
10675 + rdisk->dev = MKDEV(0,0);
10676 + rdisk->used_slot = 0;
10680 + case DISKOP_HOT_ADD_DISK:
10681 + adisk = conf->disks + added_disk;
10684 + if (added_disk != added_desc->number) {
10690 + adisk->number = added_desc->number;
10691 + adisk->raid_disk = added_desc->raid_disk;
10692 + adisk->dev = MKDEV(added_desc->major,added_desc->minor);
10694 + adisk->operational = 0;
10695 + adisk->write_only = 0;
10696 + adisk->spare = 1;
10697 + adisk->used_slot = 1;
10708 restore_flags(flags);
10710 + print_raid5_conf(conf);
10714 -static struct md_personality raid5_personality=
10715 +static mdk_personality_t raid5_personality=
10719 @@ -1648,14 +2072,19 @@
10720 NULL, /* no ioctls */
10723 - /* raid5_hot_add_disk, */ NULL,
10724 - /* raid1_hot_remove_drive */ NULL,
10727 + raid5_stop_resync,
10728 + raid5_restart_resync
10731 int raid5_init (void)
10733 - return register_md_personality (RAID5, &raid5_personality);
10736 + err = register_md_personality (RAID5, &raid5_personality);
10743 diff -ruN linux.orig/drivers/block/translucent.c linux-2.2.16/drivers/block/translucent.c
10744 --- linux.orig/drivers/block/translucent.c Thu Jan 1 01:00:00 1970
10745 +++ linux-2.2.16/drivers/block/translucent.c Fri Jun 9 11:37:45 2000
10748 + translucent.c : Translucent RAID driver for Linux
10749 + Copyright (C) 1998 Ingo Molnar
10751 + Translucent mode management functions.
10753 + This program is free software; you can redistribute it and/or modify
10754 + it under the terms of the GNU General Public License as published by
10755 + the Free Software Foundation; either version 2, or (at your option)
10756 + any later version.
10758 + You should have received a copy of the GNU General Public License
10759 + (for example /usr/src/linux/COPYING); if not, write to the Free
10760 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
10763 +#include <linux/module.h>
10765 +#include <linux/raid/md.h>
10766 +#include <linux/malloc.h>
10768 +#include <linux/raid/translucent.h>
10770 +#define MAJOR_NR MD_MAJOR
10772 +#define MD_PERSONALITY
10774 +static int translucent_run (mddev_t *mddev)
10776 + translucent_conf_t *conf;
10777 + mdk_rdev_t *rdev;
10780 + MOD_INC_USE_COUNT;
10782 + conf = kmalloc (sizeof (*conf), GFP_KERNEL);
10785 + mddev->private = conf;
10787 + if (mddev->nb_dev != 2) {
10788 + printk("translucent: this mode needs 2 disks, aborting!\n");
10792 + if (md_check_ordering(mddev)) {
10793 + printk("translucent: disks are not ordered, aborting!\n");
10797 + ITERATE_RDEV_ORDERED(mddev,rdev,i) {
10798 + dev_info_t *disk = conf->disks + i;
10800 + disk->dev = rdev->dev;
10801 + disk->size = rdev->size;
10810 + MOD_DEC_USE_COUNT;
10814 +static int translucent_stop (mddev_t *mddev)
10816 + translucent_conf_t *conf = mddev_to_conf(mddev);
10820 + MOD_DEC_USE_COUNT;
10826 +static int translucent_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
10827 + unsigned long *rsector, unsigned long size)
10829 + translucent_conf_t *conf = mddev_to_conf(mddev);
10831 + *rdev = conf->disks[0].dev;
10836 +static int translucent_status (char *page, mddev_t *mddev)
10840 + sz += sprintf(page+sz, " %d%% full", 10);
10845 +static mdk_personality_t translucent_personality=
10852 + translucent_stop,
10853 + translucent_status,
10864 +md__initfunc(void translucent_init (void))
10866 + register_md_personality (TRANSLUCENT, &translucent_personality);
10871 +int init_module (void)
10873 + return (register_md_personality (TRANSLUCENT, &translucent_personality));
10876 +void cleanup_module (void)
10878 + unregister_md_personality (TRANSLUCENT);
10883 diff -ruN linux.orig/drivers/block/xor.c linux-2.2.16/drivers/block/xor.c
10884 --- linux.orig/drivers/block/xor.c Thu Jan 1 01:00:00 1970
10885 +++ linux-2.2.16/drivers/block/xor.c Fri Jun 9 11:37:45 2000
10888 + * xor.c : Multiple Devices driver for Linux
10890 + * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek
10893 + * optimized RAID-5 checksumming functions.
10895 + * This program is free software; you can redistribute it and/or modify
10896 + * it under the terms of the GNU General Public License as published by
10897 + * the Free Software Foundation; either version 2, or (at your option)
10898 + * any later version.
10900 + * You should have received a copy of the GNU General Public License
10901 + * (for example /usr/src/linux/COPYING); if not, write to the Free
10902 + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
10904 +#include <linux/module.h>
10905 +#include <linux/raid/md.h>
10906 +#ifdef __sparc_v9__
10907 +#include <asm/head.h>
10908 +#include <asm/asi.h>
10909 +#include <asm/visasm.h>
10913 + * we use the 'XOR function template' to register multiple xor
10914 + * functions runtime. The kernel measures their speed upon bootup
10915 + * and decides which one to use. (compile-time registration is
10916 + * not enough as certain CPU features like MMX can only be detected
10919 + * this architecture makes it pretty easy to add new routines
10920 + * that are faster on certain CPUs, without killing other CPU's
10921 + * 'native' routine. Although the current routines are believed
10922 + * to be the physically fastest ones on all CPUs tested, but
10923 + * feel free to prove me wrong and add yet another routine =B-)
10927 +#define MAX_XOR_BLOCKS 5
10929 +#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr)
10931 +typedef void (*xor_block_t) XOR_ARGS;
10932 +xor_block_t xor_block = NULL;
10934 +#ifndef __sparc_v9__
10936 +struct xor_block_template;
10938 +struct xor_block_template {
10940 + xor_block_t xor_block;
10942 + struct xor_block_template * next;
10945 +struct xor_block_template * xor_functions = NULL;
10947 +#define XORBLOCK_TEMPLATE(x) \
10948 +static void xor_block_##x XOR_ARGS; \
10949 +static struct xor_block_template t_xor_block_##x = \
10950 + { #x, xor_block_##x, 0, NULL }; \
10951 +static void xor_block_##x XOR_ARGS
10955 +#ifdef CONFIG_X86_XMM
10957 + * Cache avoiding checksumming functions utilizing KNI instructions
10958 + * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
10961 +XORBLOCK_TEMPLATE(pIII_kni)
10963 + char xmm_save[16*4];
10965 + int lines = (bh_ptr[0]->b_size>>8);
10967 + __asm__ __volatile__ (
10968 + "movl %%cr0,%0 ;\n\t"
10970 + "movups %%xmm0,(%1) ;\n\t"
10971 + "movups %%xmm1,0x10(%1) ;\n\t"
10972 + "movups %%xmm2,0x20(%1) ;\n\t"
10973 + "movups %%xmm3,0x30(%1) ;\n\t"
10978 +#define OFFS(x) "8*("#x"*2)"
10980 + " prefetcht0 "OFFS(x)"(%1) ;\n"
10982 + " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
10984 + " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
10986 + " prefetchnta "OFFS(x)"(%2) ;\n"
10988 + " prefetchnta "OFFS(x)"(%3) ;\n"
10990 + " prefetchnta "OFFS(x)"(%4) ;\n"
10992 + " prefetchnta "OFFS(x)"(%5) ;\n"
10994 + " prefetchnta "OFFS(x)"(%6) ;\n"
10995 +#define XO1(x,y) \
10996 + " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
10997 +#define XO2(x,y) \
10998 + " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
10999 +#define XO3(x,y) \
11000 + " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
11001 +#define XO4(x,y) \
11002 + " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
11003 +#define XO5(x,y) \
11004 + " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
11008 + __asm__ __volatile__ (
11010 +#define BLOCK(i) \
11032 + " .align 32,0x90 ;\n"
11040 + " addl $256, %1 ;\n"
11041 + " addl $256, %2 ;\n"
11047 + "r" (bh_ptr[0]->b_data),
11048 + "r" (bh_ptr[1]->b_data)
11052 + __asm__ __volatile__ (
11054 +#define BLOCK(i) \
11082 + " .align 32,0x90 ;\n"
11090 + " addl $256, %1 ;\n"
11091 + " addl $256, %2 ;\n"
11092 + " addl $256, %3 ;\n"
11097 + "r" (bh_ptr[0]->b_data),
11098 + "r" (bh_ptr[1]->b_data),
11099 + "r" (bh_ptr[2]->b_data)
11103 + __asm__ __volatile__ (
11105 +#define BLOCK(i) \
11139 + " .align 32,0x90 ;\n"
11147 + " addl $256, %1 ;\n"
11148 + " addl $256, %2 ;\n"
11149 + " addl $256, %3 ;\n"
11150 + " addl $256, %4 ;\n"
11156 + "r" (bh_ptr[0]->b_data),
11157 + "r" (bh_ptr[1]->b_data),
11158 + "r" (bh_ptr[2]->b_data),
11159 + "r" (bh_ptr[3]->b_data)
11163 + __asm__ __volatile__ (
11165 +#define BLOCK(i) \
11205 + " .align 32,0x90 ;\n"
11213 + " addl $256, %1 ;\n"
11214 + " addl $256, %2 ;\n"
11215 + " addl $256, %3 ;\n"
11216 + " addl $256, %4 ;\n"
11217 + " addl $256, %5 ;\n"
11223 + "r" (bh_ptr[0]->b_data),
11224 + "r" (bh_ptr[1]->b_data),
11225 + "r" (bh_ptr[2]->b_data),
11226 + "r" (bh_ptr[3]->b_data),
11227 + "r" (bh_ptr[4]->b_data)
11232 + __asm__ __volatile__ (
11234 + "movups (%1),%%xmm0 ;\n\t"
11235 + "movups 0x10(%1),%%xmm1 ;\n\t"
11236 + "movups 0x20(%1),%%xmm2 ;\n\t"
11237 + "movups 0x30(%1),%%xmm3 ;\n\t"
11238 + "movl %0,%%cr0 ;\n\t"
11240 + : "r" (cr0), "r" (xmm_save)
11260 +#endif /* CONFIG_X86_XMM */
11263 + * high-speed RAID5 checksumming functions utilizing MMX instructions
11264 + * Copyright (C) 1998 Ingo Molnar
11266 +XORBLOCK_TEMPLATE(pII_mmx)
11268 + char fpu_save[108];
11269 + int lines = (bh_ptr[0]->b_size>>7);
11271 + if (!(current->flags & PF_USEDFPU))
11272 + __asm__ __volatile__ ( " clts;\n");
11274 + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
11277 + " movq 8*("#x")(%1), %%mm"#y" ;\n"
11279 + " movq %%mm"#y", 8*("#x")(%1) ;\n"
11280 +#define XO1(x,y) \
11281 + " pxor 8*("#x")(%2), %%mm"#y" ;\n"
11282 +#define XO2(x,y) \
11283 + " pxor 8*("#x")(%3), %%mm"#y" ;\n"
11284 +#define XO3(x,y) \
11285 + " pxor 8*("#x")(%4), %%mm"#y" ;\n"
11286 +#define XO4(x,y) \
11287 + " pxor 8*("#x")(%5), %%mm"#y" ;\n"
11291 + __asm__ __volatile__ (
11293 +#define BLOCK(i) \
11307 + " .align 32,0x90 ;\n"
11315 + " addl $128, %1 ;\n"
11316 + " addl $128, %2 ;\n"
11321 + "r" (bh_ptr[0]->b_data),
11322 + "r" (bh_ptr[1]->b_data)
11326 + __asm__ __volatile__ (
11328 +#define BLOCK(i) \
11346 + " .align 32,0x90 ;\n"
11354 + " addl $128, %1 ;\n"
11355 + " addl $128, %2 ;\n"
11356 + " addl $128, %3 ;\n"
11361 + "r" (bh_ptr[0]->b_data),
11362 + "r" (bh_ptr[1]->b_data),
11363 + "r" (bh_ptr[2]->b_data)
11367 + __asm__ __volatile__ (
11369 +#define BLOCK(i) \
11391 + " .align 32,0x90 ;\n"
11399 + " addl $128, %1 ;\n"
11400 + " addl $128, %2 ;\n"
11401 + " addl $128, %3 ;\n"
11402 + " addl $128, %4 ;\n"
11407 + "r" (bh_ptr[0]->b_data),
11408 + "r" (bh_ptr[1]->b_data),
11409 + "r" (bh_ptr[2]->b_data),
11410 + "r" (bh_ptr[3]->b_data)
11414 + __asm__ __volatile__ (
11416 +#define BLOCK(i) \
11442 + " .align 32,0x90 ;\n"
11450 + " addl $128, %1 ;\n"
11451 + " addl $128, %2 ;\n"
11452 + " addl $128, %3 ;\n"
11453 + " addl $128, %4 ;\n"
11454 + " addl $128, %5 ;\n"
11459 + "r" (bh_ptr[0]->b_data),
11460 + "r" (bh_ptr[1]->b_data),
11461 + "r" (bh_ptr[2]->b_data),
11462 + "r" (bh_ptr[3]->b_data),
11463 + "r" (bh_ptr[4]->b_data)
11468 + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
11470 + if (!(current->flags & PF_USEDFPU))
11482 +XORBLOCK_TEMPLATE(p5_mmx)
11484 + char fpu_save[108];
11485 + int lines = (bh_ptr[0]->b_size>>6);
11487 + if (!(current->flags & PF_USEDFPU))
11488 + __asm__ __volatile__ ( " clts;\n");
11490 + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
11494 + __asm__ __volatile__ (
11496 + " .align 32,0x90 ;\n"
11498 + " movq (%1), %%mm0 ;\n"
11499 + " movq 8(%1), %%mm1 ;\n"
11500 + " pxor (%2), %%mm0 ;\n"
11501 + " movq 16(%1), %%mm2 ;\n"
11502 + " movq %%mm0, (%1) ;\n"
11503 + " pxor 8(%2), %%mm1 ;\n"
11504 + " movq 24(%1), %%mm3 ;\n"
11505 + " movq %%mm1, 8(%1) ;\n"
11506 + " pxor 16(%2), %%mm2 ;\n"
11507 + " movq 32(%1), %%mm4 ;\n"
11508 + " movq %%mm2, 16(%1) ;\n"
11509 + " pxor 24(%2), %%mm3 ;\n"
11510 + " movq 40(%1), %%mm5 ;\n"
11511 + " movq %%mm3, 24(%1) ;\n"
11512 + " pxor 32(%2), %%mm4 ;\n"
11513 + " movq 48(%1), %%mm6 ;\n"
11514 + " movq %%mm4, 32(%1) ;\n"
11515 + " pxor 40(%2), %%mm5 ;\n"
11516 + " movq 56(%1), %%mm7 ;\n"
11517 + " movq %%mm5, 40(%1) ;\n"
11518 + " pxor 48(%2), %%mm6 ;\n"
11519 + " pxor 56(%2), %%mm7 ;\n"
11520 + " movq %%mm6, 48(%1) ;\n"
11521 + " movq %%mm7, 56(%1) ;\n"
11523 + " addl $64, %1 ;\n"
11524 + " addl $64, %2 ;\n"
11530 + "r" (bh_ptr[0]->b_data),
11531 + "r" (bh_ptr[1]->b_data)
11535 + __asm__ __volatile__ (
11537 + " .align 32,0x90 ;\n"
11539 + " movq (%1), %%mm0 ;\n"
11540 + " movq 8(%1), %%mm1 ;\n"
11541 + " pxor (%2), %%mm0 ;\n"
11542 + " movq 16(%1), %%mm2 ;\n"
11543 + " pxor 8(%2), %%mm1 ;\n"
11544 + " pxor (%3), %%mm0 ;\n"
11545 + " pxor 16(%2), %%mm2 ;\n"
11546 + " movq %%mm0, (%1) ;\n"
11547 + " pxor 8(%3), %%mm1 ;\n"
11548 + " pxor 16(%3), %%mm2 ;\n"
11549 + " movq 24(%1), %%mm3 ;\n"
11550 + " movq %%mm1, 8(%1) ;\n"
11551 + " movq 32(%1), %%mm4 ;\n"
11552 + " movq 40(%1), %%mm5 ;\n"
11553 + " pxor 24(%2), %%mm3 ;\n"
11554 + " movq %%mm2, 16(%1) ;\n"
11555 + " pxor 32(%2), %%mm4 ;\n"
11556 + " pxor 24(%3), %%mm3 ;\n"
11557 + " pxor 40(%2), %%mm5 ;\n"
11558 + " movq %%mm3, 24(%1) ;\n"
11559 + " pxor 32(%3), %%mm4 ;\n"
11560 + " pxor 40(%3), %%mm5 ;\n"
11561 + " movq 48(%1), %%mm6 ;\n"
11562 + " movq %%mm4, 32(%1) ;\n"
11563 + " movq 56(%1), %%mm7 ;\n"
11564 + " pxor 48(%2), %%mm6 ;\n"
11565 + " movq %%mm5, 40(%1) ;\n"
11566 + " pxor 56(%2), %%mm7 ;\n"
11567 + " pxor 48(%3), %%mm6 ;\n"
11568 + " pxor 56(%3), %%mm7 ;\n"
11569 + " movq %%mm6, 48(%1) ;\n"
11570 + " movq %%mm7, 56(%1) ;\n"
11572 + " addl $64, %1 ;\n"
11573 + " addl $64, %2 ;\n"
11574 + " addl $64, %3 ;\n"
11580 + "r" (bh_ptr[0]->b_data),
11581 + "r" (bh_ptr[1]->b_data),
11582 + "r" (bh_ptr[2]->b_data)
11586 + __asm__ __volatile__ (
11588 + " .align 32,0x90 ;\n"
11590 + " movq (%1), %%mm0 ;\n"
11591 + " movq 8(%1), %%mm1 ;\n"
11592 + " pxor (%2), %%mm0 ;\n"
11593 + " movq 16(%1), %%mm2 ;\n"
11594 + " pxor 8(%2), %%mm1 ;\n"
11595 + " pxor (%3), %%mm0 ;\n"
11596 + " pxor 16(%2), %%mm2 ;\n"
11597 + " pxor 8(%3), %%mm1 ;\n"
11598 + " pxor (%4), %%mm0 ;\n"
11599 + " movq 24(%1), %%mm3 ;\n"
11600 + " pxor 16(%3), %%mm2 ;\n"
11601 + " pxor 8(%4), %%mm1 ;\n"
11602 + " movq %%mm0, (%1) ;\n"
11603 + " movq 32(%1), %%mm4 ;\n"
11604 + " pxor 24(%2), %%mm3 ;\n"
11605 + " pxor 16(%4), %%mm2 ;\n"
11606 + " movq %%mm1, 8(%1) ;\n"
11607 + " movq 40(%1), %%mm5 ;\n"
11608 + " pxor 32(%2), %%mm4 ;\n"
11609 + " pxor 24(%3), %%mm3 ;\n"
11610 + " movq %%mm2, 16(%1) ;\n"
11611 + " pxor 40(%2), %%mm5 ;\n"
11612 + " pxor 32(%3), %%mm4 ;\n"
11613 + " pxor 24(%4), %%mm3 ;\n"
11614 + " movq %%mm3, 24(%1) ;\n"
11615 + " movq 56(%1), %%mm7 ;\n"
11616 + " movq 48(%1), %%mm6 ;\n"
11617 + " pxor 40(%3), %%mm5 ;\n"
11618 + " pxor 32(%4), %%mm4 ;\n"
11619 + " pxor 48(%2), %%mm6 ;\n"
11620 + " movq %%mm4, 32(%1) ;\n"
11621 + " pxor 56(%2), %%mm7 ;\n"
11622 + " pxor 40(%4), %%mm5 ;\n"
11623 + " pxor 48(%3), %%mm6 ;\n"
11624 + " pxor 56(%3), %%mm7 ;\n"
11625 + " movq %%mm5, 40(%1) ;\n"
11626 + " pxor 48(%4), %%mm6 ;\n"
11627 + " pxor 56(%4), %%mm7 ;\n"
11628 + " movq %%mm6, 48(%1) ;\n"
11629 + " movq %%mm7, 56(%1) ;\n"
11631 + " addl $64, %1 ;\n"
11632 + " addl $64, %2 ;\n"
11633 + " addl $64, %3 ;\n"
11634 + " addl $64, %4 ;\n"
11640 + "r" (bh_ptr[0]->b_data),
11641 + "r" (bh_ptr[1]->b_data),
11642 + "r" (bh_ptr[2]->b_data),
11643 + "r" (bh_ptr[3]->b_data)
11647 + __asm__ __volatile__ (
11649 + " .align 32,0x90 ;\n"
11651 + " movq (%1), %%mm0 ;\n"
11652 + " movq 8(%1), %%mm1 ;\n"
11653 + " pxor (%2), %%mm0 ;\n"
11654 + " pxor 8(%2), %%mm1 ;\n"
11655 + " movq 16(%1), %%mm2 ;\n"
11656 + " pxor (%3), %%mm0 ;\n"
11657 + " pxor 8(%3), %%mm1 ;\n"
11658 + " pxor 16(%2), %%mm2 ;\n"
11659 + " pxor (%4), %%mm0 ;\n"
11660 + " pxor 8(%4), %%mm1 ;\n"
11661 + " pxor 16(%3), %%mm2 ;\n"
11662 + " movq 24(%1), %%mm3 ;\n"
11663 + " pxor (%5), %%mm0 ;\n"
11664 + " pxor 8(%5), %%mm1 ;\n"
11665 + " movq %%mm0, (%1) ;\n"
11666 + " pxor 16(%4), %%mm2 ;\n"
11667 + " pxor 24(%2), %%mm3 ;\n"
11668 + " movq %%mm1, 8(%1) ;\n"
11669 + " pxor 16(%5), %%mm2 ;\n"
11670 + " pxor 24(%3), %%mm3 ;\n"
11671 + " movq 32(%1), %%mm4 ;\n"
11672 + " movq %%mm2, 16(%1) ;\n"
11673 + " pxor 24(%4), %%mm3 ;\n"
11674 + " pxor 32(%2), %%mm4 ;\n"
11675 + " movq 40(%1), %%mm5 ;\n"
11676 + " pxor 24(%5), %%mm3 ;\n"
11677 + " pxor 32(%3), %%mm4 ;\n"
11678 + " pxor 40(%2), %%mm5 ;\n"
11679 + " movq %%mm3, 24(%1) ;\n"
11680 + " pxor 32(%4), %%mm4 ;\n"
11681 + " pxor 40(%3), %%mm5 ;\n"
11682 + " movq 48(%1), %%mm6 ;\n"
11683 + " movq 56(%1), %%mm7 ;\n"
11684 + " pxor 32(%5), %%mm4 ;\n"
11685 + " pxor 40(%4), %%mm5 ;\n"
11686 + " pxor 48(%2), %%mm6 ;\n"
11687 + " pxor 56(%2), %%mm7 ;\n"
11688 + " movq %%mm4, 32(%1) ;\n"
11689 + " pxor 48(%3), %%mm6 ;\n"
11690 + " pxor 56(%3), %%mm7 ;\n"
11691 + " pxor 40(%5), %%mm5 ;\n"
11692 + " pxor 48(%4), %%mm6 ;\n"
11693 + " pxor 56(%4), %%mm7 ;\n"
11694 + " movq %%mm5, 40(%1) ;\n"
11695 + " pxor 48(%5), %%mm6 ;\n"
11696 + " pxor 56(%5), %%mm7 ;\n"
11697 + " movq %%mm6, 48(%1) ;\n"
11698 + " movq %%mm7, 56(%1) ;\n"
11700 + " addl $64, %1 ;\n"
11701 + " addl $64, %2 ;\n"
11702 + " addl $64, %3 ;\n"
11703 + " addl $64, %4 ;\n"
11704 + " addl $64, %5 ;\n"
11710 + "r" (bh_ptr[0]->b_data),
11711 + "r" (bh_ptr[1]->b_data),
11712 + "r" (bh_ptr[2]->b_data),
11713 + "r" (bh_ptr[3]->b_data),
11714 + "r" (bh_ptr[4]->b_data)
11719 + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
11721 + if (!(current->flags & PF_USEDFPU))
11724 +#endif /* __i386__ */
11725 +#endif /* !__sparc_v9__ */
11727 +#ifdef __sparc_v9__
11729 + * High speed xor_block operation for RAID4/5 utilizing the
11730 + * UltraSparc Visual Instruction Set.
11732 + * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
11735 + * !(((long)dest | (long)sourceN) & (64 - 1)) &&
11736 + * !(len & 127) && len >= 256
11738 + * It is done in pure assembly, as otherwise gcc makes it
11739 + * a non-leaf function, which is not what we want.
11740 + * Also, we don't measure the speeds as on other architectures,
11741 + * as the measuring routine does not take into account cold caches
11742 + * and the fact that xor_block_VIS bypasses the caches.
11743 + * xor_block_32regs might be 5% faster for count 2 if caches are hot
11744 + * and things just right (for count 3 VIS is about as fast as 32regs for
11745 + * hot caches and for count 4 and 5 VIS is faster by good margin always),
11746 + * but I think it is better not to pollute the caches.
11747 + * Actually, if I'd just fight for speed for hot caches, I could
11748 + * write a hybrid VIS/integer routine, which would do always two
11749 + * 64B blocks in VIS and two in IEUs, but I really care more about
11752 +extern void *VISenter(void);
11753 +extern void xor_block_VIS XOR_ARGS;
11755 +void __xor_block_VIS(void)
11758 + .globl xor_block_VIS
11760 + ldx [%%o1 + 0], %%o4
11761 + ldx [%%o1 + 8], %%o3
11762 + ldx [%%o4 + %1], %%g5
11763 + ldx [%%o4 + %0], %%o4
11764 + ldx [%%o3 + %0], %%o3
11766 + andcc %%o5, %2, %%g0
11767 + be,pt %%icc, 297f
11768 + sethi %%hi(%5), %%g1
11769 + jmpl %%g1 + %%lo(%5), %%g7
11770 + add %%g7, 8, %%g7
11771 +297: wr %%g0, %4, %%fprs
11772 + membar #LoadStore|#StoreLoad|#StoreStore
11773 + sub %%g5, 64, %%g5
11774 + ldda [%%o4] %3, %%f0
11775 + ldda [%%o3] %3, %%f16
11777 + bgeu,pt %%xcc, 10f
11781 + sub %%g5, 64, %%g5
11783 + wr %%g0, %3, %%asi
11785 +2: ldda [%%o4 + 64] %%asi, %%f32
11786 + fxor %%f0, %%f16, %%f16
11787 + fxor %%f2, %%f18, %%f18
11788 + fxor %%f4, %%f20, %%f20
11789 + fxor %%f6, %%f22, %%f22
11790 + fxor %%f8, %%f24, %%f24
11791 + fxor %%f10, %%f26, %%f26
11792 + fxor %%f12, %%f28, %%f28
11793 + fxor %%f14, %%f30, %%f30
11794 + stda %%f16, [%%o4] %3
11795 + ldda [%%o3 + 64] %%asi, %%f48
11796 + ldda [%%o4 + 128] %%asi, %%f0
11797 + fxor %%f32, %%f48, %%f48
11798 + fxor %%f34, %%f50, %%f50
11799 + add %%o4, 128, %%o4
11800 + fxor %%f36, %%f52, %%f52
11801 + add %%o3, 128, %%o3
11802 + fxor %%f38, %%f54, %%f54
11803 + subcc %%g5, 128, %%g5
11804 + fxor %%f40, %%f56, %%f56
11805 + fxor %%f42, %%f58, %%f58
11806 + fxor %%f44, %%f60, %%f60
11807 + fxor %%f46, %%f62, %%f62
11808 + stda %%f48, [%%o4 - 64] %%asi
11810 + ldda [%%o3] %3, %%f16
11812 + ldda [%%o4 + 64] %%asi, %%f32
11813 + fxor %%f0, %%f16, %%f16
11814 + fxor %%f2, %%f18, %%f18
11815 + fxor %%f4, %%f20, %%f20
11816 + fxor %%f6, %%f22, %%f22
11817 + fxor %%f8, %%f24, %%f24
11818 + fxor %%f10, %%f26, %%f26
11819 + fxor %%f12, %%f28, %%f28
11820 + fxor %%f14, %%f30, %%f30
11821 + stda %%f16, [%%o4] %3
11822 + ldda [%%o3 + 64] %%asi, %%f48
11824 + fxor %%f32, %%f48, %%f48
11825 + fxor %%f34, %%f50, %%f50
11826 + fxor %%f36, %%f52, %%f52
11827 + fxor %%f38, %%f54, %%f54
11828 + fxor %%f40, %%f56, %%f56
11829 + fxor %%f42, %%f58, %%f58
11830 + fxor %%f44, %%f60, %%f60
11831 + fxor %%f46, %%f62, %%f62
11832 + stda %%f48, [%%o4 + 64] %%asi
11833 + membar #Sync|#StoreStore|#StoreLoad
11834 + wr %%g0, 0, %%fprs
11836 + wr %%g1, %%g0, %%asi
11838 +13: ldx [%%o1 + 16], %%o2
11839 + ldx [%%o2 + %0], %%o2
11841 +3: ldda [%%o2] %3, %%f32
11842 + fxor %%f0, %%f16, %%f48
11843 + fxor %%f2, %%f18, %%f50
11844 + add %%o4, 64, %%o4
11845 + fxor %%f4, %%f20, %%f52
11846 + fxor %%f6, %%f22, %%f54
11847 + add %%o3, 64, %%o3
11848 + fxor %%f8, %%f24, %%f56
11849 + fxor %%f10, %%f26, %%f58
11850 + fxor %%f12, %%f28, %%f60
11851 + fxor %%f14, %%f30, %%f62
11852 + ldda [%%o4] %3, %%f0
11853 + fxor %%f48, %%f32, %%f48
11854 + fxor %%f50, %%f34, %%f50
11855 + fxor %%f52, %%f36, %%f52
11856 + fxor %%f54, %%f38, %%f54
11857 + add %%o2, 64, %%o2
11858 + fxor %%f56, %%f40, %%f56
11859 + fxor %%f58, %%f42, %%f58
11860 + subcc %%g5, 64, %%g5
11861 + fxor %%f60, %%f44, %%f60
11862 + fxor %%f62, %%f46, %%f62
11863 + stda %%f48, [%%o4 + %%g1] %3
11865 + ldda [%%o3] %3, %%f16
11867 + ldda [%%o2] %3, %%f32
11868 + fxor %%f0, %%f16, %%f48
11869 + fxor %%f2, %%f18, %%f50
11870 + fxor %%f4, %%f20, %%f52
11871 + fxor %%f6, %%f22, %%f54
11872 + fxor %%f8, %%f24, %%f56
11873 + fxor %%f10, %%f26, %%f58
11874 + fxor %%f12, %%f28, %%f60
11875 + fxor %%f14, %%f30, %%f62
11877 + fxor %%f48, %%f32, %%f48
11878 + fxor %%f50, %%f34, %%f50
11879 + fxor %%f52, %%f36, %%f52
11880 + fxor %%f54, %%f38, %%f54
11881 + fxor %%f56, %%f40, %%f56
11882 + fxor %%f58, %%f42, %%f58
11883 + fxor %%f60, %%f44, %%f60
11884 + fxor %%f62, %%f46, %%f62
11885 + stda %%f48, [%%o4] %3
11886 + membar #Sync|#StoreStore|#StoreLoad
11888 + wr %%g0, 0, %%fprs
11894 +14: ldx [%%o1 + 16], %%o2
11895 + ldx [%%o1 + 24], %%o0
11896 + ldx [%%o2 + %0], %%o2
11897 + ldx [%%o0 + %0], %%o0
11899 +4: ldda [%%o2] %3, %%f32
11900 + fxor %%f0, %%f16, %%f16
11901 + fxor %%f2, %%f18, %%f18
11902 + add %%o4, 64, %%o4
11903 + fxor %%f4, %%f20, %%f20
11904 + fxor %%f6, %%f22, %%f22
11905 + add %%o3, 64, %%o3
11906 + fxor %%f8, %%f24, %%f24
11907 + fxor %%f10, %%f26, %%f26
11908 + fxor %%f12, %%f28, %%f28
11909 + fxor %%f14, %%f30, %%f30
11910 + ldda [%%o0] %3, %%f48
11911 + fxor %%f16, %%f32, %%f32
11912 + fxor %%f18, %%f34, %%f34
11913 + fxor %%f20, %%f36, %%f36
11914 + fxor %%f22, %%f38, %%f38
11915 + add %%o2, 64, %%o2
11916 + fxor %%f24, %%f40, %%f40
11917 + fxor %%f26, %%f42, %%f42
11918 + fxor %%f28, %%f44, %%f44
11919 + fxor %%f30, %%f46, %%f46
11920 + ldda [%%o4] %3, %%f0
11921 + fxor %%f32, %%f48, %%f48
11922 + fxor %%f34, %%f50, %%f50
11923 + fxor %%f36, %%f52, %%f52
11924 + add %%o0, 64, %%o0
11925 + fxor %%f38, %%f54, %%f54
11926 + fxor %%f40, %%f56, %%f56
11927 + fxor %%f42, %%f58, %%f58
11928 + subcc %%g5, 64, %%g5
11929 + fxor %%f44, %%f60, %%f60
11930 + fxor %%f46, %%f62, %%f62
11931 + stda %%f48, [%%o4 + %%g1] %3
11933 + ldda [%%o3] %3, %%f16
11935 + ldda [%%o2] %3, %%f32
11936 + fxor %%f0, %%f16, %%f16
11937 + fxor %%f2, %%f18, %%f18
11938 + fxor %%f4, %%f20, %%f20
11939 + fxor %%f6, %%f22, %%f22
11940 + fxor %%f8, %%f24, %%f24
11941 + fxor %%f10, %%f26, %%f26
11942 + fxor %%f12, %%f28, %%f28
11943 + fxor %%f14, %%f30, %%f30
11944 + ldda [%%o0] %3, %%f48
11945 + fxor %%f16, %%f32, %%f32
11946 + fxor %%f18, %%f34, %%f34
11947 + fxor %%f20, %%f36, %%f36
11948 + fxor %%f22, %%f38, %%f38
11949 + fxor %%f24, %%f40, %%f40
11950 + fxor %%f26, %%f42, %%f42
11951 + fxor %%f28, %%f44, %%f44
11952 + fxor %%f30, %%f46, %%f46
11954 + fxor %%f32, %%f48, %%f48
11955 + fxor %%f34, %%f50, %%f50
11956 + fxor %%f36, %%f52, %%f52
11957 + fxor %%f38, %%f54, %%f54
11958 + fxor %%f40, %%f56, %%f56
11959 + fxor %%f42, %%f58, %%f58
11960 + fxor %%f44, %%f60, %%f60
11961 + fxor %%f46, %%f62, %%f62
11962 + stda %%f48, [%%o4] %3
11963 + membar #Sync|#StoreStore|#StoreLoad
11965 + wr %%g0, 0, %%fprs
11967 +15: ldx [%%o1 + 16], %%o2
11968 + ldx [%%o1 + 24], %%o0
11969 + ldx [%%o1 + 32], %%o1
11970 + ldx [%%o2 + %0], %%o2
11971 + ldx [%%o0 + %0], %%o0
11972 + ldx [%%o1 + %0], %%o1
11974 +5: ldda [%%o2] %3, %%f32
11975 + fxor %%f0, %%f16, %%f48
11976 + fxor %%f2, %%f18, %%f50
11977 + add %%o4, 64, %%o4
11978 + fxor %%f4, %%f20, %%f52
11979 + fxor %%f6, %%f22, %%f54
11980 + add %%o3, 64, %%o3
11981 + fxor %%f8, %%f24, %%f56
11982 + fxor %%f10, %%f26, %%f58
11983 + fxor %%f12, %%f28, %%f60
11984 + fxor %%f14, %%f30, %%f62
11985 + ldda [%%o0] %3, %%f16
11986 + fxor %%f48, %%f32, %%f48
11987 + fxor %%f50, %%f34, %%f50
11988 + fxor %%f52, %%f36, %%f52
11989 + fxor %%f54, %%f38, %%f54
11990 + add %%o2, 64, %%o2
11991 + fxor %%f56, %%f40, %%f56
11992 + fxor %%f58, %%f42, %%f58
11993 + fxor %%f60, %%f44, %%f60
11994 + fxor %%f62, %%f46, %%f62
11995 + ldda [%%o1] %3, %%f32
11996 + fxor %%f48, %%f16, %%f48
11997 + fxor %%f50, %%f18, %%f50
11998 + add %%o0, 64, %%o0
11999 + fxor %%f52, %%f20, %%f52
12000 + fxor %%f54, %%f22, %%f54
12001 + add %%o1, 64, %%o1
12002 + fxor %%f56, %%f24, %%f56
12003 + fxor %%f58, %%f26, %%f58
12004 + fxor %%f60, %%f28, %%f60
12005 + fxor %%f62, %%f30, %%f62
12006 + ldda [%%o4] %3, %%f0
12007 + fxor %%f48, %%f32, %%f48
12008 + fxor %%f50, %%f34, %%f50
12009 + fxor %%f52, %%f36, %%f52
12010 + fxor %%f54, %%f38, %%f54
12011 + fxor %%f56, %%f40, %%f56
12012 + fxor %%f58, %%f42, %%f58
12013 + subcc %%g5, 64, %%g5
12014 + fxor %%f60, %%f44, %%f60
12015 + fxor %%f62, %%f46, %%f62
12016 + stda %%f48, [%%o4 + %%g1] %3
12018 + ldda [%%o3] %3, %%f16
12020 + ldda [%%o2] %3, %%f32
12021 + fxor %%f0, %%f16, %%f48
12022 + fxor %%f2, %%f18, %%f50
12023 + fxor %%f4, %%f20, %%f52
12024 + fxor %%f6, %%f22, %%f54
12025 + fxor %%f8, %%f24, %%f56
12026 + fxor %%f10, %%f26, %%f58
12027 + fxor %%f12, %%f28, %%f60
12028 + fxor %%f14, %%f30, %%f62
12029 + ldda [%%o0] %3, %%f16
12030 + fxor %%f48, %%f32, %%f48
12031 + fxor %%f50, %%f34, %%f50
12032 + fxor %%f52, %%f36, %%f52
12033 + fxor %%f54, %%f38, %%f54
12034 + fxor %%f56, %%f40, %%f56
12035 + fxor %%f58, %%f42, %%f58
12036 + fxor %%f60, %%f44, %%f60
12037 + fxor %%f62, %%f46, %%f62
12038 + ldda [%%o1] %3, %%f32
12039 + fxor %%f48, %%f16, %%f48
12040 + fxor %%f50, %%f18, %%f50
12041 + fxor %%f52, %%f20, %%f52
12042 + fxor %%f54, %%f22, %%f54
12043 + fxor %%f56, %%f24, %%f56
12044 + fxor %%f58, %%f26, %%f58
12045 + fxor %%f60, %%f28, %%f60
12046 + fxor %%f62, %%f30, %%f62
12048 + fxor %%f48, %%f32, %%f48
12049 + fxor %%f50, %%f34, %%f50
12050 + fxor %%f52, %%f36, %%f52
12051 + fxor %%f54, %%f38, %%f54
12052 + fxor %%f56, %%f40, %%f56
12053 + fxor %%f58, %%f42, %%f58
12054 + fxor %%f60, %%f44, %%f60
12055 + fxor %%f62, %%f46, %%f62
12056 + stda %%f48, [%%o4] %3
12057 + membar #Sync|#StoreStore|#StoreLoad
12059 + wr %%g0, 0, %%fprs
12061 + "i" (&((struct buffer_head *)0)->b_data),
12062 + "i" (&((struct buffer_head *)0)->b_size),
12063 + "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P),
12064 + "i" (FPRS_FEF), "i" (VISenter));
12066 +#endif /* __sparc_v9__ */
12068 +#if defined(__sparc__) && !defined(__sparc_v9__)
12070 + * High speed xor_block operation for RAID4/5 utilizing the
12071 + * ldd/std SPARC instructions.
12073 + * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
12077 +XORBLOCK_TEMPLATE(SPARC)
12079 + int size = bh_ptr[0]->b_size;
12080 + int lines = size / (sizeof (long)) / 8, i;
12081 + long *destp = (long *) bh_ptr[0]->b_data;
12082 + long *source1 = (long *) bh_ptr[1]->b_data;
12083 + long *source2, *source3, *source4;
12087 + for (i = lines; i > 0; i--) {
12088 + __asm__ __volatile__("
12089 + ldd [%0 + 0x00], %%g2
12090 + ldd [%0 + 0x08], %%g4
12091 + ldd [%0 + 0x10], %%o0
12092 + ldd [%0 + 0x18], %%o2
12093 + ldd [%1 + 0x00], %%o4
12094 + ldd [%1 + 0x08], %%l0
12095 + ldd [%1 + 0x10], %%l2
12096 + ldd [%1 + 0x18], %%l4
12097 + xor %%g2, %%o4, %%g2
12098 + xor %%g3, %%o5, %%g3
12099 + xor %%g4, %%l0, %%g4
12100 + xor %%g5, %%l1, %%g5
12101 + xor %%o0, %%l2, %%o0
12102 + xor %%o1, %%l3, %%o1
12103 + xor %%o2, %%l4, %%o2
12104 + xor %%o3, %%l5, %%o3
12105 + std %%g2, [%0 + 0x00]
12106 + std %%g4, [%0 + 0x08]
12107 + std %%o0, [%0 + 0x10]
12108 + std %%o2, [%0 + 0x18]
12109 + " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0",
12110 + "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5");
12116 + source2 = (long *) bh_ptr[2]->b_data;
12117 + for (i = lines; i > 0; i--) {
12118 + __asm__ __volatile__("
12119 + ldd [%0 + 0x00], %%g2
12120 + ldd [%0 + 0x08], %%g4
12121 + ldd [%0 + 0x10], %%o0
12122 + ldd [%0 + 0x18], %%o2
12123 + ldd [%1 + 0x00], %%o4
12124 + ldd [%1 + 0x08], %%l0
12125 + ldd [%1 + 0x10], %%l2
12126 + ldd [%1 + 0x18], %%l4
12127 + xor %%g2, %%o4, %%g2
12128 + xor %%g3, %%o5, %%g3
12129 + ldd [%2 + 0x00], %%o4
12130 + xor %%g4, %%l0, %%g4
12131 + xor %%g5, %%l1, %%g5
12132 + ldd [%2 + 0x08], %%l0
12133 + xor %%o0, %%l2, %%o0
12134 + xor %%o1, %%l3, %%o1
12135 + ldd [%2 + 0x10], %%l2
12136 + xor %%o2, %%l4, %%o2
12137 + xor %%o3, %%l5, %%o3
12138 + ldd [%2 + 0x18], %%l4
12139 + xor %%g2, %%o4, %%g2
12140 + xor %%g3, %%o5, %%g3
12141 + xor %%g4, %%l0, %%g4
12142 + xor %%g5, %%l1, %%g5
12143 + xor %%o0, %%l2, %%o0
12144 + xor %%o1, %%l3, %%o1
12145 + xor %%o2, %%l4, %%o2
12146 + xor %%o3, %%l5, %%o3
12147 + std %%g2, [%0 + 0x00]
12148 + std %%g4, [%0 + 0x08]
12149 + std %%o0, [%0 + 0x10]
12150 + std %%o2, [%0 + 0x18]
12151 + " : : "r" (destp), "r" (source1), "r" (source2)
12152 + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
12153 + "l0", "l1", "l2", "l3", "l4", "l5");
12160 + source2 = (long *) bh_ptr[2]->b_data;
12161 + source3 = (long *) bh_ptr[3]->b_data;
12162 + for (i = lines; i > 0; i--) {
12163 + __asm__ __volatile__("
12164 + ldd [%0 + 0x00], %%g2
12165 + ldd [%0 + 0x08], %%g4
12166 + ldd [%0 + 0x10], %%o0
12167 + ldd [%0 + 0x18], %%o2
12168 + ldd [%1 + 0x00], %%o4
12169 + ldd [%1 + 0x08], %%l0
12170 + ldd [%1 + 0x10], %%l2
12171 + ldd [%1 + 0x18], %%l4
12172 + xor %%g2, %%o4, %%g2
12173 + xor %%g3, %%o5, %%g3
12174 + ldd [%2 + 0x00], %%o4
12175 + xor %%g4, %%l0, %%g4
12176 + xor %%g5, %%l1, %%g5
12177 + ldd [%2 + 0x08], %%l0
12178 + xor %%o0, %%l2, %%o0
12179 + xor %%o1, %%l3, %%o1
12180 + ldd [%2 + 0x10], %%l2
12181 + xor %%o2, %%l4, %%o2
12182 + xor %%o3, %%l5, %%o3
12183 + ldd [%2 + 0x18], %%l4
12184 + xor %%g2, %%o4, %%g2
12185 + xor %%g3, %%o5, %%g3
12186 + ldd [%3 + 0x00], %%o4
12187 + xor %%g4, %%l0, %%g4
12188 + xor %%g5, %%l1, %%g5
12189 + ldd [%3 + 0x08], %%l0
12190 + xor %%o0, %%l2, %%o0
12191 + xor %%o1, %%l3, %%o1
12192 + ldd [%3 + 0x10], %%l2
12193 + xor %%o2, %%l4, %%o2
12194 + xor %%o3, %%l5, %%o3
12195 + ldd [%3 + 0x18], %%l4
12196 + xor %%g2, %%o4, %%g2
12197 + xor %%g3, %%o5, %%g3
12198 + xor %%g4, %%l0, %%g4
12199 + xor %%g5, %%l1, %%g5
12200 + xor %%o0, %%l2, %%o0
12201 + xor %%o1, %%l3, %%o1
12202 + xor %%o2, %%l4, %%o2
12203 + xor %%o3, %%l5, %%o3
12204 + std %%g2, [%0 + 0x00]
12205 + std %%g4, [%0 + 0x08]
12206 + std %%o0, [%0 + 0x10]
12207 + std %%o2, [%0 + 0x18]
12208 + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3)
12209 + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
12210 + "l0", "l1", "l2", "l3", "l4", "l5");
12218 + source2 = (long *) bh_ptr[2]->b_data;
12219 + source3 = (long *) bh_ptr[3]->b_data;
12220 + source4 = (long *) bh_ptr[4]->b_data;
12221 + for (i = lines; i > 0; i--) {
12222 + __asm__ __volatile__("
12223 + ldd [%0 + 0x00], %%g2
12224 + ldd [%0 + 0x08], %%g4
12225 + ldd [%0 + 0x10], %%o0
12226 + ldd [%0 + 0x18], %%o2
12227 + ldd [%1 + 0x00], %%o4
12228 + ldd [%1 + 0x08], %%l0
12229 + ldd [%1 + 0x10], %%l2
12230 + ldd [%1 + 0x18], %%l4
12231 + xor %%g2, %%o4, %%g2
12232 + xor %%g3, %%o5, %%g3
12233 + ldd [%2 + 0x00], %%o4
12234 + xor %%g4, %%l0, %%g4
12235 + xor %%g5, %%l1, %%g5
12236 + ldd [%2 + 0x08], %%l0
12237 + xor %%o0, %%l2, %%o0
12238 + xor %%o1, %%l3, %%o1
12239 + ldd [%2 + 0x10], %%l2
12240 + xor %%o2, %%l4, %%o2
12241 + xor %%o3, %%l5, %%o3
12242 + ldd [%2 + 0x18], %%l4
12243 + xor %%g2, %%o4, %%g2
12244 + xor %%g3, %%o5, %%g3
12245 + ldd [%3 + 0x00], %%o4
12246 + xor %%g4, %%l0, %%g4
12247 + xor %%g5, %%l1, %%g5
12248 + ldd [%3 + 0x08], %%l0
12249 + xor %%o0, %%l2, %%o0
12250 + xor %%o1, %%l3, %%o1
12251 + ldd [%3 + 0x10], %%l2
12252 + xor %%o2, %%l4, %%o2
12253 + xor %%o3, %%l5, %%o3
12254 + ldd [%3 + 0x18], %%l4
12255 + xor %%g2, %%o4, %%g2
12256 + xor %%g3, %%o5, %%g3
12257 + ldd [%4 + 0x00], %%o4
12258 + xor %%g4, %%l0, %%g4
12259 + xor %%g5, %%l1, %%g5
12260 + ldd [%4 + 0x08], %%l0
12261 + xor %%o0, %%l2, %%o0
12262 + xor %%o1, %%l3, %%o1
12263 + ldd [%4 + 0x10], %%l2
12264 + xor %%o2, %%l4, %%o2
12265 + xor %%o3, %%l5, %%o3
12266 + ldd [%4 + 0x18], %%l4
12267 + xor %%g2, %%o4, %%g2
12268 + xor %%g3, %%o5, %%g3
12269 + xor %%g4, %%l0, %%g4
12270 + xor %%g5, %%l1, %%g5
12271 + xor %%o0, %%l2, %%o0
12272 + xor %%o1, %%l3, %%o1
12273 + xor %%o2, %%l4, %%o2
12274 + xor %%o3, %%l5, %%o3
12275 + std %%g2, [%0 + 0x00]
12276 + std %%g4, [%0 + 0x08]
12277 + std %%o0, [%0 + 0x10]
12278 + std %%o2, [%0 + 0x18]
12279 + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4)
12280 + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
12281 + "l0", "l1", "l2", "l3", "l4", "l5");
12291 +#endif /* __sparc_v[78]__ */
12293 +#ifndef __sparc_v9__
12296 + * this one works reasonably on any x86 CPU
12297 + * (send me an assembly version for inclusion if you can make it faster)
12299 + * this one is just as fast as written in pure assembly on x86.
12300 + * the reason for this separate version is that the
12301 + * fast open-coded xor routine "32reg" produces suboptimal code
12302 + * on x86, due to lack of registers.
12304 +XORBLOCK_TEMPLATE(8regs)
12306 + int len = bh_ptr[0]->b_size;
12307 + long *destp = (long *) bh_ptr[0]->b_data;
12308 + long *source1, *source2, *source3, *source4;
12309 + long lines = len / (sizeof (long)) / 8, i;
12313 + source1 = (long *) bh_ptr[1]->b_data;
12314 + for (i = lines; i > 0; i--) {
12315 + *(destp + 0) ^= *(source1 + 0);
12316 + *(destp + 1) ^= *(source1 + 1);
12317 + *(destp + 2) ^= *(source1 + 2);
12318 + *(destp + 3) ^= *(source1 + 3);
12319 + *(destp + 4) ^= *(source1 + 4);
12320 + *(destp + 5) ^= *(source1 + 5);
12321 + *(destp + 6) ^= *(source1 + 6);
12322 + *(destp + 7) ^= *(source1 + 7);
12328 + source2 = (long *) bh_ptr[2]->b_data;
12329 + source1 = (long *) bh_ptr[1]->b_data;
12330 + for (i = lines; i > 0; i--) {
12331 + *(destp + 0) ^= *(source1 + 0);
12332 + *(destp + 0) ^= *(source2 + 0);
12333 + *(destp + 1) ^= *(source1 + 1);
12334 + *(destp + 1) ^= *(source2 + 1);
12335 + *(destp + 2) ^= *(source1 + 2);
12336 + *(destp + 2) ^= *(source2 + 2);
12337 + *(destp + 3) ^= *(source1 + 3);
12338 + *(destp + 3) ^= *(source2 + 3);
12339 + *(destp + 4) ^= *(source1 + 4);
12340 + *(destp + 4) ^= *(source2 + 4);
12341 + *(destp + 5) ^= *(source1 + 5);
12342 + *(destp + 5) ^= *(source2 + 5);
12343 + *(destp + 6) ^= *(source1 + 6);
12344 + *(destp + 6) ^= *(source2 + 6);
12345 + *(destp + 7) ^= *(source1 + 7);
12346 + *(destp + 7) ^= *(source2 + 7);
12353 + source3 = (long *) bh_ptr[3]->b_data;
12354 + source2 = (long *) bh_ptr[2]->b_data;
12355 + source1 = (long *) bh_ptr[1]->b_data;
12356 + for (i = lines; i > 0; i--) {
12357 + *(destp + 0) ^= *(source1 + 0);
12358 + *(destp + 0) ^= *(source2 + 0);
12359 + *(destp + 0) ^= *(source3 + 0);
12360 + *(destp + 1) ^= *(source1 + 1);
12361 + *(destp + 1) ^= *(source2 + 1);
12362 + *(destp + 1) ^= *(source3 + 1);
12363 + *(destp + 2) ^= *(source1 + 2);
12364 + *(destp + 2) ^= *(source2 + 2);
12365 + *(destp + 2) ^= *(source3 + 2);
12366 + *(destp + 3) ^= *(source1 + 3);
12367 + *(destp + 3) ^= *(source2 + 3);
12368 + *(destp + 3) ^= *(source3 + 3);
12369 + *(destp + 4) ^= *(source1 + 4);
12370 + *(destp + 4) ^= *(source2 + 4);
12371 + *(destp + 4) ^= *(source3 + 4);
12372 + *(destp + 5) ^= *(source1 + 5);
12373 + *(destp + 5) ^= *(source2 + 5);
12374 + *(destp + 5) ^= *(source3 + 5);
12375 + *(destp + 6) ^= *(source1 + 6);
12376 + *(destp + 6) ^= *(source2 + 6);
12377 + *(destp + 6) ^= *(source3 + 6);
12378 + *(destp + 7) ^= *(source1 + 7);
12379 + *(destp + 7) ^= *(source2 + 7);
12380 + *(destp + 7) ^= *(source3 + 7);
12388 + source4 = (long *) bh_ptr[4]->b_data;
12389 + source3 = (long *) bh_ptr[3]->b_data;
12390 + source2 = (long *) bh_ptr[2]->b_data;
12391 + source1 = (long *) bh_ptr[1]->b_data;
12392 + for (i = lines; i > 0; i--) {
12393 + *(destp + 0) ^= *(source1 + 0);
12394 + *(destp + 0) ^= *(source2 + 0);
12395 + *(destp + 0) ^= *(source3 + 0);
12396 + *(destp + 0) ^= *(source4 + 0);
12397 + *(destp + 1) ^= *(source1 + 1);
12398 + *(destp + 1) ^= *(source2 + 1);
12399 + *(destp + 1) ^= *(source3 + 1);
12400 + *(destp + 1) ^= *(source4 + 1);
12401 + *(destp + 2) ^= *(source1 + 2);
12402 + *(destp + 2) ^= *(source2 + 2);
12403 + *(destp + 2) ^= *(source3 + 2);
12404 + *(destp + 2) ^= *(source4 + 2);
12405 + *(destp + 3) ^= *(source1 + 3);
12406 + *(destp + 3) ^= *(source2 + 3);
12407 + *(destp + 3) ^= *(source3 + 3);
12408 + *(destp + 3) ^= *(source4 + 3);
12409 + *(destp + 4) ^= *(source1 + 4);
12410 + *(destp + 4) ^= *(source2 + 4);
12411 + *(destp + 4) ^= *(source3 + 4);
12412 + *(destp + 4) ^= *(source4 + 4);
12413 + *(destp + 5) ^= *(source1 + 5);
12414 + *(destp + 5) ^= *(source2 + 5);
12415 + *(destp + 5) ^= *(source3 + 5);
12416 + *(destp + 5) ^= *(source4 + 5);
12417 + *(destp + 6) ^= *(source1 + 6);
12418 + *(destp + 6) ^= *(source2 + 6);
12419 + *(destp + 6) ^= *(source3 + 6);
12420 + *(destp + 6) ^= *(source4 + 6);
12421 + *(destp + 7) ^= *(source1 + 7);
12422 + *(destp + 7) ^= *(source2 + 7);
12423 + *(destp + 7) ^= *(source3 + 7);
12424 + *(destp + 7) ^= *(source4 + 7);
12436 + * platform independent RAID5 checksum calculation, this should
12437 + * be very fast on any platform that has a decent amount of
12438 + * registers. (32 or more)
12440 +XORBLOCK_TEMPLATE(32regs)
12442 + int size = bh_ptr[0]->b_size;
12443 + int lines = size / (sizeof (long)) / 8, i;
12444 + long *destp = (long *) bh_ptr[0]->b_data;
12445 + long *source1, *source2, *source3, *source4;
12447 + /* LOTS of registers available...
12448 + We do explicit loop-unrolling here for code which
12449 + favours RISC machines. In fact this is almost direct
12450 + RISC assembly on Alpha and SPARC :-) */
12455 + source1 = (long *) bh_ptr[1]->b_data;
12456 + for (i = lines; i > 0; i--) {
12457 + register long d0, d1, d2, d3, d4, d5, d6, d7;
12458 + d0 = destp[0]; /* Pull the stuff into registers */
12459 + d1 = destp[1]; /* ... in bursts, if possible. */
12466 + d0 ^= source1[0];
12467 + d1 ^= source1[1];
12468 + d2 ^= source1[2];
12469 + d3 ^= source1[3];
12470 + d4 ^= source1[4];
12471 + d5 ^= source1[5];
12472 + d6 ^= source1[6];
12473 + d7 ^= source1[7];
12474 + destp[0] = d0; /* Store the result (in bursts) */
12478 + destp[4] = d4; /* Store the result (in bursts) */
12487 + source2 = (long *) bh_ptr[2]->b_data;
12488 + source1 = (long *) bh_ptr[1]->b_data;
12489 + for (i = lines; i > 0; i--) {
12490 + register long d0, d1, d2, d3, d4, d5, d6, d7;
12491 + d0 = destp[0]; /* Pull the stuff into registers */
12492 + d1 = destp[1]; /* ... in bursts, if possible. */
12499 + d0 ^= source1[0];
12500 + d1 ^= source1[1];
12501 + d2 ^= source1[2];
12502 + d3 ^= source1[3];
12503 + d4 ^= source1[4];
12504 + d5 ^= source1[5];
12505 + d6 ^= source1[6];
12506 + d7 ^= source1[7];
12507 + d0 ^= source2[0];
12508 + d1 ^= source2[1];
12509 + d2 ^= source2[2];
12510 + d3 ^= source2[3];
12511 + d4 ^= source2[4];
12512 + d5 ^= source2[5];
12513 + d6 ^= source2[6];
12514 + d7 ^= source2[7];
12515 + destp[0] = d0; /* Store the result (in bursts) */
12519 + destp[4] = d4; /* Store the result (in bursts) */
12529 + source3 = (long *) bh_ptr[3]->b_data;
12530 + source2 = (long *) bh_ptr[2]->b_data;
12531 + source1 = (long *) bh_ptr[1]->b_data;
12532 + for (i = lines; i > 0; i--) {
12533 + register long d0, d1, d2, d3, d4, d5, d6, d7;
12534 + d0 = destp[0]; /* Pull the stuff into registers */
12535 + d1 = destp[1]; /* ... in bursts, if possible. */
12542 + d0 ^= source1[0];
12543 + d1 ^= source1[1];
12544 + d2 ^= source1[2];
12545 + d3 ^= source1[3];
12546 + d4 ^= source1[4];
12547 + d5 ^= source1[5];
12548 + d6 ^= source1[6];
12549 + d7 ^= source1[7];
12550 + d0 ^= source2[0];
12551 + d1 ^= source2[1];
12552 + d2 ^= source2[2];
12553 + d3 ^= source2[3];
12554 + d4 ^= source2[4];
12555 + d5 ^= source2[5];
12556 + d6 ^= source2[6];
12557 + d7 ^= source2[7];
12558 + d0 ^= source3[0];
12559 + d1 ^= source3[1];
12560 + d2 ^= source3[2];
12561 + d3 ^= source3[3];
12562 + d4 ^= source3[4];
12563 + d5 ^= source3[5];
12564 + d6 ^= source3[6];
12565 + d7 ^= source3[7];
12566 + destp[0] = d0; /* Store the result (in bursts) */
12570 + destp[4] = d4; /* Store the result (in bursts) */
12581 + source4 = (long *) bh_ptr[4]->b_data;
12582 + source3 = (long *) bh_ptr[3]->b_data;
12583 + source2 = (long *) bh_ptr[2]->b_data;
12584 + source1 = (long *) bh_ptr[1]->b_data;
12585 + for (i = lines; i > 0; i--) {
12586 + register long d0, d1, d2, d3, d4, d5, d6, d7;
12587 + d0 = destp[0]; /* Pull the stuff into registers */
12588 + d1 = destp[1]; /* ... in bursts, if possible. */
12595 + d0 ^= source1[0];
12596 + d1 ^= source1[1];
12597 + d2 ^= source1[2];
12598 + d3 ^= source1[3];
12599 + d4 ^= source1[4];
12600 + d5 ^= source1[5];
12601 + d6 ^= source1[6];
12602 + d7 ^= source1[7];
12603 + d0 ^= source2[0];
12604 + d1 ^= source2[1];
12605 + d2 ^= source2[2];
12606 + d3 ^= source2[3];
12607 + d4 ^= source2[4];
12608 + d5 ^= source2[5];
12609 + d6 ^= source2[6];
12610 + d7 ^= source2[7];
12611 + d0 ^= source3[0];
12612 + d1 ^= source3[1];
12613 + d2 ^= source3[2];
12614 + d3 ^= source3[3];
12615 + d4 ^= source3[4];
12616 + d5 ^= source3[5];
12617 + d6 ^= source3[6];
12618 + d7 ^= source3[7];
12619 + d0 ^= source4[0];
12620 + d1 ^= source4[1];
12621 + d2 ^= source4[2];
12622 + d3 ^= source4[3];
12623 + d4 ^= source4[4];
12624 + d5 ^= source4[5];
12625 + d6 ^= source4[6];
12626 + d7 ^= source4[7];
12627 + destp[0] = d0; /* Store the result (in bursts) */
12631 + destp[4] = d4; /* Store the result (in bursts) */
12646 + * (the -6*32 shift factor colors the cache)
12648 +#define SIZE (PAGE_SIZE-6*32)
12650 +static void xor_speed ( struct xor_block_template * func,
12651 + struct buffer_head *b1, struct buffer_head *b2)
12654 + unsigned long now;
12655 + int i, count, max;
12656 + struct buffer_head *bh_ptr[6];
12658 + func->next = xor_functions;
12659 + xor_functions = func;
12664 + * count the number of XORs done during a whole jiffy.
12665 + * calculate the speed of checksumming from this.
12666 + * (we use a 2-page allocation to have guaranteed
12667 + * color L1-cache layout)
12670 + for (i = 0; i < 5; i++) {
12673 + while (jiffies == now) {
12675 + func->xor_block(2,bh_ptr);
12684 + speed = max * (HZ*SIZE/1024);
12685 + func->speed = speed;
12687 + printk( " %-10s: %5d.%03d MB/sec\n", func->name,
12688 + speed / 1000, speed % 1000);
12691 +static inline void pick_fastest_function(void)
12693 + struct xor_block_template *f, *fastest;
12695 + fastest = xor_functions;
12696 + for (f = fastest; f; f = f->next) {
12697 + if (f->speed > fastest->speed)
12700 +#ifdef CONFIG_X86_XMM
12701 + if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
12702 + fastest = &t_xor_block_pIII_kni;
12705 + xor_block = fastest->xor_block;
12706 + printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
12707 + fastest->speed / 1000, fastest->speed % 1000);
12711 +void calibrate_xor_block(void)
12713 + struct buffer_head b1, b2;
12715 + memset(&b1,0,sizeof(b1));
12718 + b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2);
12719 + if (!b1.b_data) {
12720 + pick_fastest_function();
12723 + b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE;
12725 + b1.b_size = SIZE;
12727 + printk(KERN_INFO "raid5: measuring checksumming speed\n");
12729 + sti(); /* should be safe */
12731 +#if defined(__sparc__) && !defined(__sparc_v9__)
12732 + printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n");
12733 + xor_speed(&t_xor_block_SPARC,&b1,&b2);
12736 +#ifdef CONFIG_X86_XMM
12737 + if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
12739 + "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n");
12740 + /* we force the use of the KNI xor block because it
12741 + can write around l2. we may also be able
12742 + to load into the l1 only depending on how
12743 + the cpu deals with a load to a line that is
12744 + being prefetched.
12746 + xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
12748 +#endif /* CONFIG_X86_XMM */
12752 + if (md_cpu_has_mmx()) {
12754 + "raid5: MMX detected, trying high-speed MMX checksum routines\n");
12755 + xor_speed(&t_xor_block_pII_mmx,&b1,&b2);
12756 + xor_speed(&t_xor_block_p5_mmx,&b1,&b2);
12759 +#endif /* __i386__ */
12762 + xor_speed(&t_xor_block_8regs,&b1,&b2);
12763 + xor_speed(&t_xor_block_32regs,&b1,&b2);
12765 + free_pages((unsigned long)b1.b_data,2);
12766 + pick_fastest_function();
12769 +#else /* __sparc_v9__ */
12771 +void calibrate_xor_block(void)
12773 + printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n");
12774 + xor_block = xor_block_VIS;
12777 +#endif /* __sparc_v9__ */
12779 +MD_EXPORT_SYMBOL(xor_block);
12781 diff -ruN linux.orig/include/asm-alpha/md.h linux-2.2.16/include/asm-alpha/md.h
12782 --- linux.orig/include/asm-alpha/md.h Fri May 8 09:17:13 1998
12783 +++ linux-2.2.16/include/asm-alpha/md.h Thu Jan 1 01:00:00 1970
12786 - * md.h: High speed xor_block operation for RAID4/5
12790 -#ifndef __ASM_MD_H
12791 -#define __ASM_MD_H
12793 -/* #define HAVE_ARCH_XORBLOCK */
12795 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
12797 -#endif /* __ASM_MD_H */
12798 diff -ruN linux.orig/include/asm-i386/md.h linux-2.2.16/include/asm-i386/md.h
12799 --- linux.orig/include/asm-i386/md.h Fri May 8 09:17:13 1998
12800 +++ linux-2.2.16/include/asm-i386/md.h Thu Jan 1 01:00:00 1970
12803 - * md.h: High speed xor_block operation for RAID4/5
12807 -#ifndef __ASM_MD_H
12808 -#define __ASM_MD_H
12810 -/* #define HAVE_ARCH_XORBLOCK */
12812 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
12814 -#endif /* __ASM_MD_H */
12815 diff -ruN linux.orig/include/asm-m68k/md.h linux-2.2.16/include/asm-m68k/md.h
12816 --- linux.orig/include/asm-m68k/md.h Fri May 8 09:15:22 1998
12817 +++ linux-2.2.16/include/asm-m68k/md.h Thu Jan 1 01:00:00 1970
12820 - * md.h: High speed xor_block operation for RAID4/5
12824 -#ifndef __ASM_MD_H
12825 -#define __ASM_MD_H
12827 -/* #define HAVE_ARCH_XORBLOCK */
12829 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
12831 -#endif /* __ASM_MD_H */
12832 diff -ruN linux.orig/include/asm-ppc/md.h linux-2.2.16/include/asm-ppc/md.h
12833 --- linux.orig/include/asm-ppc/md.h Wed Oct 27 02:53:42 1999
12834 +++ linux-2.2.16/include/asm-ppc/md.h Thu Jan 1 01:00:00 1970
12837 - * md.h: High speed xor_block operation for RAID4/5
12841 -#ifndef __ASM_MD_H
12842 -#define __ASM_MD_H
12844 -/* #define HAVE_ARCH_XORBLOCK */
12846 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
12848 -#endif /* __ASM_MD_H */
12849 diff -ruN linux.orig/include/asm-sparc/md.h linux-2.2.16/include/asm-sparc/md.h
12850 --- linux.orig/include/asm-sparc/md.h Tue Jan 13 00:15:54 1998
12851 +++ linux-2.2.16/include/asm-sparc/md.h Thu Jan 1 01:00:00 1970
12854 - * md.h: High speed xor_block operation for RAID4/5
12858 -#ifndef __ASM_MD_H
12859 -#define __ASM_MD_H
12861 -/* #define HAVE_ARCH_XORBLOCK */
12863 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
12865 -#endif /* __ASM_MD_H */
12866 diff -ruN linux.orig/include/asm-sparc64/md.h linux-2.2.16/include/asm-sparc64/md.h
12867 --- linux.orig/include/asm-sparc64/md.h Tue Jan 13 00:15:58 1998
12868 +++ linux-2.2.16/include/asm-sparc64/md.h Thu Jan 1 01:00:00 1970
12871 - * md.h: High speed xor_block operation for RAID4/5
12872 - * utilizing the UltraSparc Visual Instruction Set.
12874 - * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
12877 -#ifndef __ASM_MD_H
12878 -#define __ASM_MD_H
12880 -#include <asm/head.h>
12881 -#include <asm/asi.h>
12883 -#define HAVE_ARCH_XORBLOCK
12885 -#define MD_XORBLOCK_ALIGNMENT 64
12887 -/* void __xor_block (char *dest, char *src, long len)
12889 - * while (len--) *dest++ ^= *src++;
12893 - * !(((long)dest | (long)src) & (MD_XORBLOCK_ALIGNMENT - 1)) &&
12894 - * !(len & 127) && len >= 256
12897 -static inline void __xor_block (char *dest, char *src, long len)
12899 - __asm__ __volatile__ ("
12900 - wr %%g0, %3, %%fprs
12901 - wr %%g0, %4, %%asi
12902 - membar #LoadStore|#StoreLoad|#StoreStore
12904 - ldda [%0] %4, %%f0
12905 - ldda [%1] %4, %%f16
12906 -1: ldda [%0 + 64] %%asi, %%f32
12907 - fxor %%f0, %%f16, %%f16
12908 - fxor %%f2, %%f18, %%f18
12909 - fxor %%f4, %%f20, %%f20
12910 - fxor %%f6, %%f22, %%f22
12911 - fxor %%f8, %%f24, %%f24
12912 - fxor %%f10, %%f26, %%f26
12913 - fxor %%f12, %%f28, %%f28
12914 - fxor %%f14, %%f30, %%f30
12915 - stda %%f16, [%0] %4
12916 - ldda [%1 + 64] %%asi, %%f48
12917 - ldda [%0 + 128] %%asi, %%f0
12918 - fxor %%f32, %%f48, %%f48
12919 - fxor %%f34, %%f50, %%f50
12921 - fxor %%f36, %%f52, %%f52
12923 - fxor %%f38, %%f54, %%f54
12924 - subcc %2, 128, %2
12925 - fxor %%f40, %%f56, %%f56
12926 - fxor %%f42, %%f58, %%f58
12927 - fxor %%f44, %%f60, %%f60
12928 - fxor %%f46, %%f62, %%f62
12929 - stda %%f48, [%0 - 64] %%asi
12931 - ldda [%1] %4, %%f16
12932 - ldda [%0 + 64] %%asi, %%f32
12933 - fxor %%f0, %%f16, %%f16
12934 - fxor %%f2, %%f18, %%f18
12935 - fxor %%f4, %%f20, %%f20
12936 - fxor %%f6, %%f22, %%f22
12937 - fxor %%f8, %%f24, %%f24
12938 - fxor %%f10, %%f26, %%f26
12939 - fxor %%f12, %%f28, %%f28
12940 - fxor %%f14, %%f30, %%f30
12941 - stda %%f16, [%0] %4
12942 - ldda [%1 + 64] %%asi, %%f48
12944 - fxor %%f32, %%f48, %%f48
12945 - fxor %%f34, %%f50, %%f50
12946 - fxor %%f36, %%f52, %%f52
12947 - fxor %%f38, %%f54, %%f54
12948 - fxor %%f40, %%f56, %%f56
12949 - fxor %%f42, %%f58, %%f58
12950 - fxor %%f44, %%f60, %%f60
12951 - fxor %%f46, %%f62, %%f62
12952 - stda %%f48, [%0 + 64] %%asi
12953 - membar #Sync|#StoreStore|#StoreLoad
12954 - wr %%g0, 0, %%fprs
12956 - "r" (dest), "r" (src), "r" (len), "i" (FPRS_FEF), "i" (ASI_BLK_P) :
12960 -#endif /* __ASM_MD_H */
12961 Binary files linux.orig/include/linux/.sysctl.h.rej.swp and linux-2.2.16/include/linux/.sysctl.h.rej.swp differ
12962 diff -ruN linux.orig/include/linux/blkdev.h linux-2.2.16/include/linux/blkdev.h
12963 --- linux.orig/include/linux/blkdev.h Wed Jun 7 23:26:44 2000
12964 +++ linux-2.2.16/include/linux/blkdev.h Fri Jun 9 11:37:44 2000
12966 extern void make_request(int major,int rw, struct buffer_head * bh);
12968 /* md needs this function to remap requests */
12969 -extern int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size);
12970 -extern int md_make_request (int minor, int rw, struct buffer_head * bh);
12971 +extern int md_map (kdev_t dev, kdev_t *rdev,
12972 + unsigned long *rsector, unsigned long size);
12973 +extern int md_make_request (struct buffer_head * bh, int rw);
12974 extern int md_error (kdev_t mddev, kdev_t rdev);
12976 extern int * blk_size[MAX_BLKDEV];
12977 diff -ruN linux.orig/include/linux/fs.h linux-2.2.16/include/linux/fs.h
12978 --- linux.orig/include/linux/fs.h Wed Jun 7 23:26:44 2000
12979 +++ linux-2.2.16/include/linux/fs.h Fri Jun 9 11:37:44 2000
12980 @@ -185,6 +185,7 @@
12981 #define BH_Lock 2 /* 1 if the buffer is locked */
12982 #define BH_Req 3 /* 0 if the buffer has been invalidated */
12983 #define BH_Protected 6 /* 1 if the buffer is protected */
12984 +#define BH_LowPrio 7 /* 1 if the buffer is lowprio */
12987 * Try to keep the most commonly used fields in single cache lines (16
12988 @@ -755,6 +756,7 @@
12989 extern void refile_buffer(struct buffer_head * buf);
12990 extern void set_writetime(struct buffer_head * buf, int flag);
12991 extern int try_to_free_buffers(struct page *);
12992 +extern void cache_drop_behind(struct buffer_head *bh);
12994 extern int nr_buffers;
12995 extern long buffermem;
12996 @@ -775,6 +777,25 @@
13000 +extern inline void mark_buffer_highprio(struct buffer_head * bh)
13002 + clear_bit(BH_LowPrio, &bh->b_state);
13005 +extern inline void mark_buffer_lowprio(struct buffer_head * bh)
13008 + * dirty buffers cannot be marked lowprio.
13010 + if (!buffer_dirty(bh))
13011 + set_bit(BH_LowPrio, &bh->b_state);
13014 +static inline int buffer_lowprio(struct buffer_head * bh)
13016 + return test_bit(BH_LowPrio, &bh->b_state);
13019 extern inline void mark_buffer_dirty(struct buffer_head * bh, int flag)
13021 if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
13022 @@ -782,6 +803,23 @@
13023 if (bh->b_list != BUF_DIRTY)
13027 + * if a buffer gets marked dirty then it has to lose
13028 + * its lowprio state.
13030 + mark_buffer_highprio(bh);
13033 +extern inline void mark_buffer_dirty_lowprio(struct buffer_head * bh)
13035 + if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
13036 + if (bh->b_list != BUF_DIRTY)
13037 + refile_buffer(bh);
13039 + * Mark it lowprio only if it was not dirty before!
13041 + set_bit(BH_LowPrio, &bh->b_state);
13045 extern int check_disk_change(kdev_t dev);
13046 @@ -855,6 +893,7 @@
13047 extern struct buffer_head * find_buffer(kdev_t dev, int block, int size);
13048 extern void ll_rw_block(int, int, struct buffer_head * bh[]);
13049 extern int is_read_only(kdev_t);
13050 +extern int is_device_idle(kdev_t);
13051 extern void __brelse(struct buffer_head *);
13052 extern inline void brelse(struct buffer_head *buf)
13054 @@ -870,8 +909,12 @@
13055 extern void set_blocksize(kdev_t dev, int size);
13056 extern unsigned int get_hardblocksize(kdev_t dev);
13057 extern struct buffer_head * bread(kdev_t dev, int block, int size);
13058 +extern struct buffer_head * buffer_ready (kdev_t dev, int block, int size);
13059 +extern void bread_ahead (kdev_t dev, int block, int size);
13060 extern struct buffer_head * breada(kdev_t dev,int block, int size,
13061 unsigned int pos, unsigned int filesize);
13062 +extern struct buffer_head * breada_blocks(kdev_t dev,int block,
13063 + int size, int blocks);
13065 extern int brw_page(int, struct page *, kdev_t, int [], int, int);
13067 diff -ruN linux.orig/include/linux/md.h linux-2.2.16/include/linux/md.h
13068 --- linux.orig/include/linux/md.h Fri May 8 09:17:13 1998
13069 +++ linux-2.2.16/include/linux/md.h Thu Jan 1 01:00:00 1970
13072 - md.h : Multiple Devices driver for Linux
13073 - Copyright (C) 1994-96 Marc ZYNGIER
13074 - <zyngier@ufr-info-p7.ibp.fr> or
13075 - <maz@gloups.fdn.fr>
13077 - This program is free software; you can redistribute it and/or modify
13078 - it under the terms of the GNU General Public License as published by
13079 - the Free Software Foundation; either version 2, or (at your option)
13080 - any later version.
13082 - You should have received a copy of the GNU General Public License
13083 - (for example /usr/src/linux/COPYING); if not, write to the Free
13084 - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13090 -#include <linux/major.h>
13091 -#include <linux/ioctl.h>
13092 -#include <linux/types.h>
13095 - * Different major versions are not compatible.
13096 - * Different minor versions are only downward compatible.
13097 - * Different patchlevel versions are downward and upward compatible.
13099 -#define MD_MAJOR_VERSION 0
13100 -#define MD_MINOR_VERSION 36
13101 -#define MD_PATCHLEVEL_VERSION 6
13103 -#define MD_DEFAULT_DISK_READAHEAD (256 * 1024)
13106 -#define REGISTER_DEV _IO (MD_MAJOR, 1)
13107 -#define START_MD _IO (MD_MAJOR, 2)
13108 -#define STOP_MD _IO (MD_MAJOR, 3)
13109 -#define REGISTER_DEV_NEW _IO (MD_MAJOR, 4)
13113 - Byte 0 : Chunk size factor
13114 - Byte 1 : Fault tolerance count for each physical device
13115 - ( 0 means no fault tolerance,
13116 - 0xFF means always tolerate faults), not used by now.
13117 - Byte 2 : Personality
13118 - Byte 3 : Reserved.
13121 -#define FAULT_SHIFT 8
13122 -#define PERSONALITY_SHIFT 16
13124 -#define FACTOR_MASK 0x000000FFUL
13125 -#define FAULT_MASK 0x0000FF00UL
13126 -#define PERSONALITY_MASK 0x00FF0000UL
13128 -#define MD_RESERVED 0 /* Not used by now */
13129 -#define LINEAR (1UL << PERSONALITY_SHIFT)
13130 -#define STRIPED (2UL << PERSONALITY_SHIFT)
13131 -#define RAID0 STRIPED
13132 -#define RAID1 (3UL << PERSONALITY_SHIFT)
13133 -#define RAID5 (4UL << PERSONALITY_SHIFT)
13134 -#define MAX_PERSONALITY 5
13139 - * The MD superblock maintains some statistics on each MD configuration.
13140 - * Each real device in the MD set contains it near the end of the device.
13141 - * Some of the ideas are copied from the ext2fs implementation.
13143 - * We currently use 4096 bytes as follows:
13145 - * word offset function
13147 - * 0 - 31 Constant generic MD device information.
13148 - * 32 - 63 Generic state information.
13149 - * 64 - 127 Personality specific information.
13150 - * 128 - 511 12 32-words descriptors of the disks in the raid set.
13151 - * 512 - 911 Reserved.
13152 - * 912 - 1023 Disk specific descriptor.
13156 - * If x is the real device size in bytes, we return an apparent size of:
13158 - * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
13160 - * and place the 4kB superblock at offset y.
13162 -#define MD_RESERVED_BYTES (64 * 1024)
13163 -#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
13164 -#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
13166 -#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
13167 -#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
13169 -#define MD_SB_BYTES 4096
13170 -#define MD_SB_WORDS (MD_SB_BYTES / 4)
13171 -#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
13172 -#define MD_SB_SECTORS (MD_SB_BYTES / 512)
13175 - * The following are counted in 32-bit words
13177 -#define MD_SB_GENERIC_OFFSET 0
13178 -#define MD_SB_PERSONALITY_OFFSET 64
13179 -#define MD_SB_DISKS_OFFSET 128
13180 -#define MD_SB_DESCRIPTOR_OFFSET 992
13182 -#define MD_SB_GENERIC_CONSTANT_WORDS 32
13183 -#define MD_SB_GENERIC_STATE_WORDS 32
13184 -#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
13185 -#define MD_SB_PERSONALITY_WORDS 64
13186 -#define MD_SB_DISKS_WORDS 384
13187 -#define MD_SB_DESCRIPTOR_WORDS 32
13188 -#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
13189 -#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
13190 -#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
13193 - * Device "operational" state bits
13195 -#define MD_FAULTY_DEVICE 0 /* Device is faulty / operational */
13196 -#define MD_ACTIVE_DEVICE 1 /* Device is a part or the raid set / spare disk */
13197 -#define MD_SYNC_DEVICE 2 /* Device is in sync with the raid set */
13199 -typedef struct md_device_descriptor_s {
13200 - __u32 number; /* 0 Device number in the entire set */
13201 - __u32 major; /* 1 Device major number */
13202 - __u32 minor; /* 2 Device minor number */
13203 - __u32 raid_disk; /* 3 The role of the device in the raid set */
13204 - __u32 state; /* 4 Operational state */
13205 - __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
13206 -} md_descriptor_t;
13208 -#define MD_SB_MAGIC 0xa92b4efc
13211 - * Superblock state bits
13213 -#define MD_SB_CLEAN 0
13214 -#define MD_SB_ERRORS 1
13216 -typedef struct md_superblock_s {
13219 - * Constant generic information
13221 - __u32 md_magic; /* 0 MD identifier */
13222 - __u32 major_version; /* 1 major version to which the set conforms */
13223 - __u32 minor_version; /* 2 minor version to which the set conforms */
13224 - __u32 patch_version; /* 3 patchlevel version to which the set conforms */
13225 - __u32 gvalid_words; /* 4 Number of non-reserved words in this section */
13226 - __u32 set_magic; /* 5 Raid set identifier */
13227 - __u32 ctime; /* 6 Creation time */
13228 - __u32 level; /* 7 Raid personality (mirroring, raid5, ...) */
13229 - __u32 size; /* 8 Apparent size of each individual disk, in kB */
13230 - __u32 nr_disks; /* 9 Number of total disks in the raid set */
13231 - __u32 raid_disks; /* 10 Number of disks in a fully functional raid set */
13232 - __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 11];
13235 - * Generic state information
13237 - __u32 utime; /* 0 Superblock update time */
13238 - __u32 state; /* 1 State bits (clean, ...) */
13239 - __u32 active_disks; /* 2 Number of currently active disks (some non-faulty disks might not be in sync) */
13240 - __u32 working_disks; /* 3 Number of working disks */
13241 - __u32 failed_disks; /* 4 Number of failed disks */
13242 - __u32 spare_disks; /* 5 Number of spare disks */
13243 - __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 6];
13246 - * Personality information
13248 - __u32 parity_algorithm;
13249 - __u32 chunk_size;
13250 - __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 2];
13253 - * Disks information
13255 - md_descriptor_t disks[MD_SB_DISKS];
13260 - __u32 reserved[MD_SB_RESERVED_WORDS];
13263 - * Active descriptor
13265 - md_descriptor_t descriptor;
13266 -} md_superblock_t;
13270 -#include <linux/mm.h>
13271 -#include <linux/fs.h>
13272 -#include <linux/blkdev.h>
13273 -#include <asm/semaphore.h>
13276 - * Kernel-based reconstruction is mostly working, but still requires
13277 - * some additional work.
13279 -#define SUPPORT_RECONSTRUCTION 0
13281 -#define MAX_REAL 8 /* Max number of physical dev per md dev */
13282 -#define MAX_MD_DEV 4 /* Max number of md dev */
13284 -#define FACTOR(a) ((a)->repartition & FACTOR_MASK)
13285 -#define MAX_FAULT(a) (((a)->repartition & FAULT_MASK)>>8)
13286 -#define PERSONALITY(a) ((a)->repartition & PERSONALITY_MASK)
13288 -#define FACTOR_SHIFT(a) (PAGE_SHIFT + (a) - 10)
13292 - kdev_t dev; /* Device number */
13293 - int size; /* Device size (in blocks) */
13294 - int offset; /* Real device offset (in blocks) in md dev
13295 - (only used in linear mode) */
13296 - struct inode *inode; /* Lock inode */
13297 - md_superblock_t *sb;
13303 -#define SPARE_INACTIVE 0
13304 -#define SPARE_WRITE 1
13305 -#define SPARE_ACTIVE 2
13307 -struct md_personality
13310 - int (*map)(struct md_dev *mddev, kdev_t *rdev,
13311 - unsigned long *rsector, unsigned long size);
13312 - int (*make_request)(struct md_dev *mddev, int rw, struct buffer_head * bh);
13313 - void (*end_request)(struct buffer_head * bh, int uptodate);
13314 - int (*run)(int minor, struct md_dev *mddev);
13315 - int (*stop)(int minor, struct md_dev *mddev);
13316 - int (*status)(char *page, int minor, struct md_dev *mddev);
13317 - int (*ioctl)(struct inode *inode, struct file *file,
13318 - unsigned int cmd, unsigned long arg);
13319 - int max_invalid_dev;
13320 - int (*error_handler)(struct md_dev *mddev, kdev_t dev);
13323 - * Some personalities (RAID-1, RAID-5) can get disks hot-added and
13324 - * hot-removed. Hot removal is different from failure. (failure marks
13325 - * a disk inactive, but the disk is still part of the array)
13327 - int (*hot_add_disk) (struct md_dev *mddev, kdev_t dev);
13328 - int (*hot_remove_disk) (struct md_dev *mddev, kdev_t dev);
13329 - int (*mark_spare) (struct md_dev *mddev, md_descriptor_t *descriptor, int state);
13334 - struct real_dev devices[MAX_REAL];
13335 - struct md_personality *pers;
13336 - md_superblock_t *sb;
13344 -struct md_thread {
13345 - void (*run) (void *data);
13347 - struct wait_queue *wqueue;
13348 - unsigned long flags;
13349 - struct semaphore *sem;
13350 - struct task_struct *tsk;
13353 -#define THREAD_WAKEUP 0
13355 -extern struct md_dev md_dev[MAX_MD_DEV];
13356 -extern int md_size[MAX_MD_DEV];
13357 -extern int md_maxreadahead[MAX_MD_DEV];
13359 -extern char *partition_name (kdev_t dev);
13361 -extern int register_md_personality (int p_num, struct md_personality *p);
13362 -extern int unregister_md_personality (int p_num);
13363 -extern struct md_thread *md_register_thread (void (*run) (void *data), void *data);
13364 -extern void md_unregister_thread (struct md_thread *thread);
13365 -extern void md_wakeup_thread(struct md_thread *thread);
13366 -extern int md_update_sb (int minor);
13367 -extern int md_do_sync(struct md_dev *mddev);
13371 diff -ruN linux.orig/include/linux/raid/hsm.h linux-2.2.16/include/linux/raid/hsm.h
13372 --- linux.orig/include/linux/raid/hsm.h Thu Jan 1 01:00:00 1970
13373 +++ linux-2.2.16/include/linux/raid/hsm.h Fri Jun 9 11:37:44 2000
13378 +#include <linux/raid/md.h>
13381 +#error fix cpu_addr on Alpha first
13384 +#include <linux/raid/hsm_p.h>
13386 +#define index_pv(lv,index) ((lv)->vg->pv_array+(index)->data.phys_nr)
13387 +#define index_dev(lv,index) index_pv((lv),(index))->dev
13388 +#define index_block(lv,index) (index)->data.phys_block
13389 +#define index_child(index) ((lv_lptr_t *)((index)->cpu_addr))
13391 +#define ptr_to_cpuaddr(ptr) ((__u32) (ptr))
13394 +typedef struct pv_bg_desc_s {
13395 + unsigned int free_blocks;
13396 + pv_block_group_t *bg;
13399 +typedef struct pv_s pv_t;
13400 +typedef struct vg_s vg_t;
13401 +typedef struct lv_s lv_t;
13408 + pv_bg_desc_t *bg_array;
13416 + unsigned int max_indices;
13417 + unsigned int free_indices;
13418 + lv_lptr_t root_index;
13426 + pv_t pv_array [MD_SB_DISKS];
13429 + lv_t lv_array [HSM_MAX_LVS_PER_VG];
13435 +#define kdev_to_lv(dev) ((lv_t *) mddev_map[MINOR(dev)].data)
13436 +#define mddev_to_vg(mddev) ((vg_t *) mddev->private)
13440 diff -ruN linux.orig/include/linux/raid/hsm_p.h linux-2.2.16/include/linux/raid/hsm_p.h
13441 --- linux.orig/include/linux/raid/hsm_p.h Thu Jan 1 01:00:00 1970
13442 +++ linux-2.2.16/include/linux/raid/hsm_p.h Fri Jun 9 11:37:44 2000
13447 +#define HSM_BLOCKSIZE 4096
13448 +#define HSM_BLOCKSIZE_WORDS (HSM_BLOCKSIZE/4)
13449 +#define PACKED __attribute__ ((packed))
13452 + * Identifies a block in physical space
13454 +typedef struct phys_idx_s {
13456 + __u32 phys_block;
13458 +} PACKED phys_idx_t;
13461 + * Identifies a block in logical space
13463 +typedef struct log_idx_s {
13467 +} PACKED log_idx_t;
13470 + * Describes one PV
13472 +#define HSM_PV_SB_MAGIC 0xf091ae9fU
13474 +#define HSM_PV_SB_GENERIC_WORDS 32
13475 +#define HSM_PV_SB_RESERVED_WORDS \
13476 + (HSM_BLOCKSIZE_WORDS - HSM_PV_SB_GENERIC_WORDS)
13479 + * On-disk PV identification data, on block 0 in any PV.
13481 +typedef struct pv_sb_s
13483 + __u32 pv_magic; /* 0 */
13485 + __u32 pv_uuid0; /* 1 */
13486 + __u32 pv_uuid1; /* 2 */
13487 + __u32 pv_uuid2; /* 3 */
13488 + __u32 pv_uuid3; /* 4 */
13490 + __u32 pv_major; /* 5 */
13491 + __u32 pv_minor; /* 6 */
13492 + __u32 pv_patch; /* 7 */
13494 + __u32 pv_ctime; /* 8 Creation time */
13496 + __u32 pv_total_size; /* 9 size of this PV, in blocks */
13497 + __u32 pv_first_free; /* 10 first free block */
13498 + __u32 pv_first_used; /* 11 first used block */
13499 + __u32 pv_blocks_left; /* 12 unallocated blocks */
13500 + __u32 pv_bg_size; /* 13 size of a block group, in blocks */
13501 + __u32 pv_block_size; /* 14 size of blocks, in bytes */
13502 + __u32 pv_pptr_size; /* 15 size of block descriptor, in bytes */
13503 + __u32 pv_block_groups; /* 16 number of block groups */
13505 + __u32 __reserved1[HSM_PV_SB_GENERIC_WORDS - 17];
13510 + __u32 __reserved2[HSM_PV_SB_RESERVED_WORDS];
13515 + * this is pretty much arbitrary, but has to be less than ~64
13517 +#define HSM_MAX_LVS_PER_VG 32
13519 +#define HSM_VG_SB_GENERIC_WORDS 32
13521 +#define LV_DESCRIPTOR_WORDS 8
13522 +#define HSM_VG_SB_RESERVED_WORDS (HSM_BLOCKSIZE_WORDS - \
13523 + LV_DESCRIPTOR_WORDS*HSM_MAX_LVS_PER_VG - HSM_VG_SB_GENERIC_WORDS)
13525 +#if (HSM_PV_SB_RESERVED_WORDS < 0)
13526 +#error you messed this one up dude ...
13529 +typedef struct lv_descriptor_s
13531 + __u32 lv_id; /* 0 */
13532 + phys_idx_t lv_root_idx; /* 1 */
13533 + __u16 __reserved; /* 2 */
13534 + __u32 lv_max_indices; /* 3 */
13535 + __u32 lv_free_indices; /* 4 */
13536 + __u32 md_id; /* 5 */
13538 + __u32 reserved[LV_DESCRIPTOR_WORDS - 6];
13540 +} PACKED lv_descriptor_t;
13542 +#define HSM_VG_SB_MAGIC 0x98320d7aU
13544 + * On-disk VG identification data, in block 1 on all PVs
13546 +typedef struct vg_sb_s
13548 + __u32 vg_magic; /* 0 */
13549 + __u32 nr_lvs; /* 1 */
13551 + __u32 __reserved1[HSM_VG_SB_GENERIC_WORDS - 2];
13553 + lv_descriptor_t lv_array [HSM_MAX_LVS_PER_VG];
13557 + __u32 __reserved2[HSM_VG_SB_RESERVED_WORDS];
13562 + * Describes one LV
13565 +#define HSM_LV_SB_MAGIC 0xe182bd8aU
13567 +/* do we need lv_sb_t? */
13569 +typedef struct lv_sb_s
13572 + * On-disk LV identifier
13574 + __u32 lv_magic; /* 0 LV identifier */
13575 + __u32 lv_uuid0; /* 1 */
13576 + __u32 lv_uuid1; /* 2 */
13577 + __u32 lv_uuid2; /* 3 */
13578 + __u32 lv_uuid3; /* 4 */
13580 + __u32 lv_major; /* 5 PV identifier */
13581 + __u32 lv_minor; /* 6 PV identifier */
13582 + __u32 lv_patch; /* 7 PV identifier */
13584 + __u32 ctime; /* 8 Creation time */
13585 + __u32 size; /* 9 size of this LV, in blocks */
13586 + phys_idx_t start; /* 10 position of root index block */
13587 + log_idx_t first_free; /* 11-12 first free index */
13592 + __u32 reserved[HSM_BLOCKSIZE_WORDS-13];
13597 + * Pointer pointing from the physical space, points to
13598 + * the LV owning this block. It also contains various
13599 + * statistics about the physical block.
13601 +typedef struct pv_pptr_s
13607 + log_idx_t predicted;
13608 + __u32 last_referenced;
13619 +} PACKED pv_pptr_t;
13621 +static __inline__ int pv_pptr_free (const pv_pptr_t * pptr)
13623 + return !pptr->u.free.log_id;
13627 +#define DATA_BLOCKS_PER_BG ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1))
13629 +#define TOTAL_BLOCKS_PER_BG (DATA_BLOCKS_PER_BG+1)
13631 + * A table of pointers filling up a single block, managing
13632 + * the next DATA_BLOCKS_PER_BG physical blocks. Such block
13633 + * groups form the physical space of blocks.
13635 +typedef struct pv_block_group_s
13637 + __u8 used_bitmap[(DATA_BLOCKS_PER_BG+7)/8];
13639 + pv_pptr_t blocks[DATA_BLOCKS_PER_BG];
13641 +} PACKED pv_block_group_t;
13644 + * Pointer from the logical space, points to
13645 + * the (PV,block) containing this logical block
13647 +typedef struct lv_lptr_s
13650 + __u16 __reserved;
13652 + __u32 __reserved2;
13654 +} PACKED lv_lptr_t;
13656 +static __inline__ int index_free (const lv_lptr_t * index)
13658 + return !index->data.phys_block;
13661 +static __inline__ int index_present (const lv_lptr_t * index)
13663 + return index->cpu_addr;
13667 +#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t))
13669 + * A table of pointers filling up a single block, managing
13670 + * HSM_LPTRS_PER_BLOCK logical blocks. Such block groups form
13671 + * the logical space of blocks.
13673 +typedef struct lv_index_block_s
13675 + lv_lptr_t blocks[HSM_LPTRS_PER_BLOCK];
13677 +} PACKED lv_index_block_t;
13681 diff -ruN linux.orig/include/linux/raid/linear.h linux-2.2.16/include/linux/raid/linear.h
13682 --- linux.orig/include/linux/raid/linear.h Thu Jan 1 01:00:00 1970
13683 +++ linux-2.2.16/include/linux/raid/linear.h Fri Jun 9 11:37:44 2000
13688 +#include <linux/raid/md.h>
13693 + unsigned int offset;
13696 +typedef struct dev_info dev_info_t;
13698 +struct linear_hash
13700 + dev_info_t *dev0, *dev1;
13703 +struct linear_private_data
13705 + struct linear_hash *hash_table;
13706 + dev_info_t disks[MD_SB_DISKS];
13707 + dev_info_t *smallest;
13712 +typedef struct linear_private_data linear_conf_t;
13714 +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
13717 diff -ruN linux.orig/include/linux/raid/md.h linux-2.2.16/include/linux/raid/md.h
13718 --- linux.orig/include/linux/raid/md.h Thu Jan 1 01:00:00 1970
13719 +++ linux-2.2.16/include/linux/raid/md.h Fri Jun 9 11:37:44 2000
13722 + md.h : Multiple Devices driver for Linux
13723 + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
13724 + Copyright (C) 1994-96 Marc ZYNGIER
13725 + <zyngier@ufr-info-p7.ibp.fr> or
13726 + <maz@gloups.fdn.fr>
13728 + This program is free software; you can redistribute it and/or modify
13729 + it under the terms of the GNU General Public License as published by
13730 + the Free Software Foundation; either version 2, or (at your option)
13731 + any later version.
13733 + You should have received a copy of the GNU General Public License
13734 + (for example /usr/src/linux/COPYING); if not, write to the Free
13735 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13741 +#include <linux/mm.h>
13742 +#include <linux/fs.h>
13743 +#include <linux/blkdev.h>
13744 +#include <asm/semaphore.h>
13745 +#include <linux/major.h>
13746 +#include <linux/ioctl.h>
13747 +#include <linux/types.h>
13748 +#include <asm/bitops.h>
13749 +#include <linux/module.h>
13750 +#include <linux/mm.h>
13751 +#include <linux/hdreg.h>
13752 +#include <linux/sysctl.h>
13753 +#include <linux/fs.h>
13754 +#include <linux/proc_fs.h>
13755 +#include <linux/smp_lock.h>
13756 +#include <linux/delay.h>
13757 +#include <net/checksum.h>
13758 +#include <linux/random.h>
13759 +#include <linux/locks.h>
13760 +#include <asm/io.h>
13762 +#include <linux/raid/md_compatible.h>
13764 + * 'md_p.h' holds the 'physical' layout of RAID devices
13765 + * 'md_u.h' holds the user <=> kernel API
13767 + * 'md_k.h' holds kernel internal definitions
13770 +#include <linux/raid/md_p.h>
13771 +#include <linux/raid/md_u.h>
13772 +#include <linux/raid/md_k.h>
13775 + * Different major versions are not compatible.
13776 + * Different minor versions are only downward compatible.
13777 + * Different patchlevel versions are downward and upward compatible.
13779 +#define MD_MAJOR_VERSION 0
13780 +#define MD_MINOR_VERSION 90
13781 +#define MD_PATCHLEVEL_VERSION 0
13783 +extern int md_size[MAX_MD_DEVS];
13784 +extern struct hd_struct md_hd_struct[MAX_MD_DEVS];
13786 +extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
13787 +extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev);
13788 +extern char * partition_name (kdev_t dev);
13789 +extern int register_md_personality (int p_num, mdk_personality_t *p);
13790 +extern int unregister_md_personality (int p_num);
13791 +extern mdk_thread_t * md_register_thread (void (*run) (void *data),
13792 + void *data, const char *name);
13793 +extern void md_unregister_thread (mdk_thread_t *thread);
13794 +extern void md_wakeup_thread(mdk_thread_t *thread);
13795 +extern void md_interrupt_thread (mdk_thread_t *thread);
13796 +extern int md_update_sb (mddev_t *mddev);
13797 +extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
13798 +extern void md_recover_arrays (void);
13799 +extern int md_check_ordering (mddev_t *mddev);
13800 +extern void autodetect_raid(void);
13801 +extern struct gendisk * find_gendisk (kdev_t dev);
13802 +extern int md_notify_reboot(struct notifier_block *this,
13803 + unsigned long code, void *x);
13804 +#if CONFIG_BLK_DEV_MD
13805 +extern void raid_setup(char *str,int *ints) md__init;
13807 +#ifdef CONFIG_MD_BOOT
13808 +extern void md_setup(char *str,int *ints) md__init;
13811 +extern void md_print_devices (void);
13813 +#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
13817 diff -ruN linux.orig/include/linux/raid/md_compatible.h linux-2.2.16/include/linux/raid/md_compatible.h
13818 --- linux.orig/include/linux/raid/md_compatible.h Thu Jan 1 01:00:00 1970
13819 +++ linux-2.2.16/include/linux/raid/md_compatible.h Fri Jun 9 11:37:44 2000
13823 + md.h : Multiple Devices driver compatibility layer for Linux 2.0/2.2
13824 + Copyright (C) 1998 Ingo Molnar
13826 + This program is free software; you can redistribute it and/or modify
13827 + it under the terms of the GNU General Public License as published by
13828 + the Free Software Foundation; either version 2, or (at your option)
13829 + any later version.
13831 + You should have received a copy of the GNU General Public License
13832 + (for example /usr/src/linux/COPYING); if not, write to the Free
13833 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13836 +#include <linux/version.h>
13838 +#ifndef _MD_COMPATIBLE_H
13839 +#define _MD_COMPATIBLE_H
13841 +#define LinuxVersionCode(v, p, s) (((v)<<16)+((p)<<8)+(s))
13843 +#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
13846 +#define md__get_free_pages(x,y) __get_free_pages(x,y,GFP_KERNEL)
13850 +extern __inline__ int md_cpu_has_mmx(void)
13852 + return x86_capability & 0x00800000;
13857 +#define md_clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
13861 + * someone please suggest a sane compatibility layer for modules
13863 +#define MD_EXPORT_SYMBOL(x)
13866 +static inline unsigned long
13867 +md_copy_from_user(void *to, const void *from, unsigned long n)
13871 + err = verify_area(VERIFY_READ,from,n);
13873 + memcpy_fromfs(to, from, n);
13878 +extern inline unsigned long
13879 +md_copy_to_user(void *to, const void *from, unsigned long n)
13883 + err = verify_area(VERIFY_WRITE,to,n);
13885 + memcpy_tofs(to, from, n);
13890 +#define md_put_user(x,ptr) \
13894 + __err = verify_area(VERIFY_WRITE,ptr,sizeof(*ptr)); \
13896 + put_user(x,ptr); \
13901 +extern inline int md_capable_admin(void)
13907 +#define MD_FILE_TO_INODE(file) ((file)->f_inode)
13910 +extern inline void md_flush_signals (void)
13912 + current->signal = 0;
13916 +#define __S(nr) (1<<((nr)-1))
13917 +extern inline void md_init_signals (void)
13919 + current->exit_signal = SIGCHLD;
13920 + current->blocked = ~(__S(SIGKILL));
13925 +extern inline unsigned long md_signal_pending (struct task_struct * tsk)
13927 + return (tsk->signal & ~tsk->blocked);
13931 +#define md_set_global_readahead(x) read_ahead[MD_MAJOR] = MD_READAHEAD
13934 +#define md_mdelay(n) (\
13935 + {unsigned long msec=(n); while (msec--) udelay(1000);})
13938 +#define MD_SYS_DOWN 0
13939 +#define MD_SYS_HALT 0
13940 +#define MD_SYS_POWER_OFF 0
13943 +#define md_register_reboot_notifier(x)
13946 +extern __inline__ unsigned long
13947 +md_test_and_set_bit(int nr, void * addr)
13949 + unsigned long flags;
13950 + unsigned long oldbit;
13952 + save_flags(flags);
13954 + oldbit = test_bit(nr,addr);
13955 + set_bit(nr,addr);
13956 + restore_flags(flags);
13961 +extern __inline__ unsigned long
13962 +md_test_and_clear_bit(int nr, void * addr)
13964 + unsigned long flags;
13965 + unsigned long oldbit;
13967 + save_flags(flags);
13969 + oldbit = test_bit(nr,addr);
13970 + clear_bit(nr,addr);
13971 + restore_flags(flags);
13976 +#define md_atomic_read(x) (*(volatile int *)(x))
13977 +#define md_atomic_set(x,y) (*(volatile int *)(x) = (y))
13980 +extern __inline__ void md_lock_kernel (void)
13988 +extern __inline__ void md_unlock_kernel (void)
13998 +#define md__initdata
13999 +#define md__initfunc(__arginit) __arginit
14005 +struct md_list_head {
14006 + struct md_list_head *next, *prev;
14009 +#define MD_LIST_HEAD(name) \
14010 + struct md_list_head name = { &name, &name }
14012 +#define MD_INIT_LIST_HEAD(ptr) do { \
14013 + (ptr)->next = (ptr); (ptr)->prev = (ptr); \
14016 +static __inline__ void md__list_add(struct md_list_head * new,
14017 + struct md_list_head * prev,
14018 + struct md_list_head * next)
14020 + next->prev = new;
14021 + new->next = next;
14022 + new->prev = prev;
14023 + prev->next = new;
14026 +static __inline__ void md_list_add(struct md_list_head *new,
14027 + struct md_list_head *head)
14029 + md__list_add(new, head, head->next);
14032 +static __inline__ void md__list_del(struct md_list_head * prev,
14033 + struct md_list_head * next)
14035 + next->prev = prev;
14036 + prev->next = next;
14039 +static __inline__ void md_list_del(struct md_list_head *entry)
14041 + md__list_del(entry->prev, entry->next);
14044 +static __inline__ int md_list_empty(struct md_list_head *head)
14046 + return head->next == head;
14049 +#define md_list_entry(ptr, type, member) \
14050 + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
14054 +static __inline__ signed long md_schedule_timeout(signed long timeout)
14056 + current->timeout = jiffies + timeout;
14062 +#define md_need_resched(tsk) (need_resched)
14065 +typedef struct { int gcc_is_buggy; } md_spinlock_t;
14066 +#define MD_SPIN_LOCK_UNLOCKED (md_spinlock_t) { 0 }
14068 +#define md_spin_lock_irq cli
14069 +#define md_spin_unlock_irq sti
14070 +#define md_spin_unlock_irqrestore(x,flags) restore_flags(flags)
14071 +#define md_spin_lock_irqsave(x,flags) do { save_flags(flags); cli(); } while (0)
14077 +#include <linux/reboot.h>
14078 +#include <linux/vmalloc.h>
14081 +#define md__get_free_pages(x,y) __get_free_pages(x,y)
14085 +extern __inline__ int md_cpu_has_mmx(void)
14087 + return boot_cpu_data.x86_capability & X86_FEATURE_MMX;
14092 +#define md_clear_page(page) clear_page(page)
14095 +#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x)
14098 +#define md_copy_to_user(x,y,z) copy_to_user(x,y,z)
14101 +#define md_copy_from_user(x,y,z) copy_from_user(x,y,z)
14104 +#define md_put_user put_user
14107 +extern inline int md_capable_admin(void)
14109 + return capable(CAP_SYS_ADMIN);
14113 +#define MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode)
14116 +extern inline void md_flush_signals (void)
14118 +	spin_lock(&current->sigmask_lock);
14119 +	flush_signals(current);
14120 +	spin_unlock(&current->sigmask_lock);
14124 +extern inline void md_init_signals (void)
14126 + current->exit_signal = SIGCHLD;
14127 +	siginitsetinv(&current->blocked, sigmask(SIGKILL));
14131 +#define md_signal_pending signal_pending
14134 +extern inline void md_set_global_readahead(int * table)
14136 + max_readahead[MD_MAJOR] = table;
14140 +#define md_mdelay(x) mdelay(x)
14143 +#define MD_SYS_DOWN SYS_DOWN
14144 +#define MD_SYS_HALT SYS_HALT
14145 +#define MD_SYS_POWER_OFF SYS_POWER_OFF
14148 +#define md_register_reboot_notifier register_reboot_notifier
14151 +#define md_test_and_set_bit test_and_set_bit
14154 +#define md_test_and_clear_bit test_and_clear_bit
14157 +#define md_atomic_read atomic_read
14158 +#define md_atomic_set atomic_set
14161 +#define md_lock_kernel lock_kernel
14162 +#define md_unlock_kernel unlock_kernel
14166 +#include <linux/init.h>
14168 +#define md__init __init
14169 +#define md__initdata __initdata
14170 +#define md__initfunc(__arginit) __initfunc(__arginit)
14177 +#define md_list_head list_head
14178 +#define MD_LIST_HEAD(name) LIST_HEAD(name)
14179 +#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr)
14180 +#define md_list_add list_add
14181 +#define md_list_del list_del
14182 +#define md_list_empty list_empty
14184 +#define md_list_entry(ptr, type, member) list_entry(ptr, type, member)
14188 +#define md_schedule_timeout schedule_timeout
14191 +#define md_need_resched(tsk) ((tsk)->need_resched)
14194 +#define md_spinlock_t spinlock_t
14195 +#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED
14197 +#define md_spin_lock_irq spin_lock_irq
14198 +#define md_spin_unlock_irq spin_unlock_irq
14199 +#define md_spin_unlock_irqrestore spin_unlock_irqrestore
14200 +#define md_spin_lock_irqsave spin_lock_irqsave
14206 +#endif _MD_COMPATIBLE_H
14208 diff -ruN linux.orig/include/linux/raid/md_k.h linux-2.2.16/include/linux/raid/md_k.h
14209 --- linux.orig/include/linux/raid/md_k.h Thu Jan 1 01:00:00 1970
14210 +++ linux-2.2.16/include/linux/raid/md_k.h Fri Jun 9 11:37:44 2000
14213 + md_k.h : kernel internal structure of the Linux MD driver
14214 + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
14216 + This program is free software; you can redistribute it and/or modify
14217 + it under the terms of the GNU General Public License as published by
14218 + the Free Software Foundation; either version 2, or (at your option)
14219 + any later version.
14221 + You should have received a copy of the GNU General Public License
14222 + (for example /usr/src/linux/COPYING); if not, write to the Free
14223 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
14229 +#define MD_RESERVED 0UL
14230 +#define LINEAR 1UL
14231 +#define STRIPED 2UL
14232 +#define RAID0 STRIPED
14235 +#define TRANSLUCENT 5UL
14237 +#define MAX_PERSONALITY 7UL
14239 +extern inline int pers_to_level (int pers)
14242 + case HSM: return -3;
14243 + case TRANSLUCENT: return -2;
14244 + case LINEAR: return -1;
14245 + case RAID0: return 0;
14246 + case RAID1: return 1;
14247 + case RAID5: return 5;
14249 + panic("pers_to_level()");
14252 +extern inline int level_to_pers (int level)
14255 + case -3: return HSM;
14256 + case -2: return TRANSLUCENT;
14257 + case -1: return LINEAR;
14258 + case 0: return RAID0;
14259 + case 1: return RAID1;
14261 + case 5: return RAID5;
14263 + return MD_RESERVED;
14266 +typedef struct mddev_s mddev_t;
14267 +typedef struct mdk_rdev_s mdk_rdev_t;
14269 +#if (MINORBITS != 8)
14270 +#error MD doesnt handle bigger kdev yet
14273 +#define MAX_REAL 12 /* Max number of disks per md dev */
14274 +#define MAX_MD_DEVS (1<<MINORBITS) /* Max number of md dev */
14277 + * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
14278 + * the personality. (eg. HSM uses this to identify individual LVs)
14280 +typedef struct dev_mapping_s {
14285 +extern dev_mapping_t mddev_map [MAX_MD_DEVS];
14287 +extern inline mddev_t * kdev_to_mddev (kdev_t dev)
14289 + return mddev_map[MINOR(dev)].mddev;
14293 + * options passed in raidrun:
14296 +#define MAX_CHUNK_SIZE (4096*1024)
14299 + * default readahead
14301 +#define MD_READAHEAD (256 * 512)
14303 +extern inline int disk_faulty(mdp_disk_t * d)
14305 + return d->state & (1 << MD_DISK_FAULTY);
14308 +extern inline int disk_active(mdp_disk_t * d)
14310 + return d->state & (1 << MD_DISK_ACTIVE);
14313 +extern inline int disk_sync(mdp_disk_t * d)
14315 + return d->state & (1 << MD_DISK_SYNC);
14318 +extern inline int disk_spare(mdp_disk_t * d)
14320 + return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
14323 +extern inline int disk_removed(mdp_disk_t * d)
14325 + return d->state & (1 << MD_DISK_REMOVED);
14328 +extern inline void mark_disk_faulty(mdp_disk_t * d)
14330 + d->state |= (1 << MD_DISK_FAULTY);
14333 +extern inline void mark_disk_active(mdp_disk_t * d)
14335 + d->state |= (1 << MD_DISK_ACTIVE);
14338 +extern inline void mark_disk_sync(mdp_disk_t * d)
14340 + d->state |= (1 << MD_DISK_SYNC);
14343 +extern inline void mark_disk_spare(mdp_disk_t * d)
14348 +extern inline void mark_disk_removed(mdp_disk_t * d)
14350 + d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
14353 +extern inline void mark_disk_inactive(mdp_disk_t * d)
14355 + d->state &= ~(1 << MD_DISK_ACTIVE);
14358 +extern inline void mark_disk_nonsync(mdp_disk_t * d)
14360 + d->state &= ~(1 << MD_DISK_SYNC);
14364 + * MD's 'extended' device
14368 + struct md_list_head same_set; /* RAID devices within the same set */
14369 + struct md_list_head all; /* all RAID devices */
14370 + struct md_list_head pending; /* undetected RAID devices */
14372 + kdev_t dev; /* Device number */
14373 + kdev_t old_dev; /* "" when it was last imported */
14374 + int size; /* Device size (in blocks) */
14375 + mddev_t *mddev; /* RAID array if running */
14376 + unsigned long last_events; /* IO event timestamp */
14378 + struct inode *inode; /* Lock inode */
14379 + struct file filp; /* Lock file */
14384 + int faulty; /* if faulty do not issue IO requests */
14385 + int desc_nr; /* descriptor index in the superblock */
14390 + * disk operations in a working array:
14392 +#define DISKOP_SPARE_INACTIVE 0
14393 +#define DISKOP_SPARE_WRITE 1
14394 +#define DISKOP_SPARE_ACTIVE 2
14395 +#define DISKOP_HOT_REMOVE_DISK 3
14396 +#define DISKOP_HOT_ADD_DISK 4
14398 +typedef struct mdk_personality_s mdk_personality_t;
14403 + mdk_personality_t *pers;
14407 + struct md_list_head disks;
14409 + mdu_param_t param;
14411 + unsigned int curr_resync;
14412 + unsigned long resync_start;
14414 + int recovery_running;
14415 + struct semaphore reconfig_sem;
14416 + struct semaphore recovery_sem;
14417 + struct semaphore resync_sem;
14418 + struct md_list_head all_mddevs;
14421 +struct mdk_personality_s
14424 + int (*map)(mddev_t *mddev, kdev_t dev, kdev_t *rdev,
14425 + unsigned long *rsector, unsigned long size);
14426 + int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh);
14427 + void (*end_request)(struct buffer_head * bh, int uptodate);
14428 + int (*run)(mddev_t *mddev);
14429 + int (*stop)(mddev_t *mddev);
14430 + int (*status)(char *page, mddev_t *mddev);
14431 + int (*ioctl)(struct inode *inode, struct file *file,
14432 + unsigned int cmd, unsigned long arg);
14433 + int max_invalid_dev;
14434 + int (*error_handler)(mddev_t *mddev, kdev_t dev);
14437 + * Some personalities (RAID-1, RAID-5) can have disks hot-added and
14438 + * hot-removed. Hot removal is different from failure. (failure marks
14439 + * a disk inactive, but the disk is still part of the array) The interface
14440 + * to such operations is the 'pers->diskop()' function, can be NULL.
14442 + * the diskop function can change the pointer pointing to the incoming
14443 + * descriptor, but must do so very carefully. (currently only
14444 + * SPARE_ACTIVE expects such a change)
14446 + int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
14448 + int (*stop_resync)(mddev_t *mddev);
14449 + int (*restart_resync)(mddev_t *mddev);
14454 + * Currently we index md_array directly, based on the minor
14455 + * number. This will have to change to dynamic allocation
14456 + * once we start supporting partitioning of md devices.
14458 +extern inline int mdidx (mddev_t * mddev)
14460 + return mddev->__minor;
14463 +extern inline kdev_t mddev_to_kdev(mddev_t * mddev)
14465 + return MKDEV(MD_MAJOR, mdidx(mddev));
14468 +extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev);
14469 +extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
14472 + * iterates through some rdev ringlist. It's safe to remove the
14473 + * current 'rdev'. Dont touch 'tmp' though.
14475 +#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \
14477 + for (tmp = head.next; \
14478 + rdev = md_list_entry(tmp, mdk_rdev_t, field), \
14479 + tmp = tmp->next, tmp->prev != &head \
14482 + * iterates through the 'same array disks' ringlist
14484 +#define ITERATE_RDEV(mddev,rdev,tmp) \
14485 + ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
14488 + * Same as above, but assumes that the device has rdev->desc_nr numbered
14489 + * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order.
14491 +#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \
14492 + for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
14496 + * Iterates through all 'RAID managed disks'
14498 +#define ITERATE_RDEV_ALL(rdev,tmp) \
14499 + ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
14502 + * Iterates through 'pending RAID disks'
14504 +#define ITERATE_RDEV_PENDING(rdev,tmp) \
14505 + ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
14508 + * iterates through all used mddevs in the system.
14510 +#define ITERATE_MDDEV(mddev,tmp) \
14512 + for (tmp = all_mddevs.next; \
14513 + mddev = md_list_entry(tmp, mddev_t, all_mddevs), \
14514 + tmp = tmp->next, tmp->prev != &all_mddevs \
14517 +extern inline int lock_mddev (mddev_t * mddev)
14519 + return down_interruptible(&mddev->reconfig_sem);
14522 +extern inline void unlock_mddev (mddev_t * mddev)
14524 + up(&mddev->reconfig_sem);
14527 +#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
14528 + x = y; y = __tmp; } while (0)
14530 +typedef struct mdk_thread_s {
14531 + void (*run) (void *data);
14533 + struct wait_queue *wqueue;
14534 + unsigned long flags;
14535 + struct semaphore *sem;
14536 + struct task_struct *tsk;
14537 + const char *name;
14540 +#define THREAD_WAKEUP 0
14542 +typedef struct dev_name_s {
14543 + struct md_list_head list;
14545 + char name [MAX_DISKNAME_LEN];
14550 diff -ruN linux.orig/include/linux/raid/md_p.h linux-2.2.16/include/linux/raid/md_p.h
14551 --- linux.orig/include/linux/raid/md_p.h Thu Jan 1 01:00:00 1970
14552 +++ linux-2.2.16/include/linux/raid/md_p.h Fri Jun 9 11:37:44 2000
14555 + md_p.h : physical layout of Linux RAID devices
14556 + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
14558 + This program is free software; you can redistribute it and/or modify
14559 + it under the terms of the GNU General Public License as published by
14560 + the Free Software Foundation; either version 2, or (at your option)
14561 + any later version.
14563 + You should have received a copy of the GNU General Public License
14564 + (for example /usr/src/linux/COPYING); if not, write to the Free
14565 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
14572 + * RAID superblock.
14574 + * The RAID superblock maintains some statistics on each RAID configuration.
14575 + * Each real device in the RAID set contains it near the end of the device.
14576 + * Some of the ideas are copied from the ext2fs implementation.
14578 + * We currently use 4096 bytes as follows:
14580 + * word offset function
14582 + * 0 - 31 Constant generic RAID device information.
14583 + * 32 - 63 Generic state information.
14584 + * 64 - 127 Personality specific information.
14585 + * 128 - 511 12 32-words descriptors of the disks in the raid set.
14586 + * 512 - 911 Reserved.
14587 + * 912 - 1023 Disk specific descriptor.
14591 + * If x is the real device size in bytes, we return an apparent size of:
14593 + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
14595 + * and place the 4kB superblock at offset y.
14597 +#define MD_RESERVED_BYTES (64 * 1024)
14598 +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
14599 +#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
14601 +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
14602 +#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
14604 +#define MD_SB_BYTES 4096
14605 +#define MD_SB_WORDS (MD_SB_BYTES / 4)
14606 +#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
14607 +#define MD_SB_SECTORS (MD_SB_BYTES / 512)
14610 + * The following are counted in 32-bit words
14612 +#define MD_SB_GENERIC_OFFSET 0
14613 +#define MD_SB_PERSONALITY_OFFSET 64
14614 +#define MD_SB_DISKS_OFFSET 128
14615 +#define MD_SB_DESCRIPTOR_OFFSET 992
14617 +#define MD_SB_GENERIC_CONSTANT_WORDS 32
14618 +#define MD_SB_GENERIC_STATE_WORDS 32
14619 +#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
14620 +#define MD_SB_PERSONALITY_WORDS 64
14621 +#define MD_SB_DISKS_WORDS 384
14622 +#define MD_SB_DESCRIPTOR_WORDS 32
14623 +#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
14624 +#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
14625 +#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
14628 + * Device "operational" state bits
14630 +#define MD_DISK_FAULTY 0 /* disk is faulty / operational */
14631 +#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */
14632 +#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
14633 +#define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */
14635 +typedef struct mdp_device_descriptor_s {
14636 + __u32 number; /* 0 Device number in the entire set */
14637 + __u32 major; /* 1 Device major number */
14638 + __u32 minor; /* 2 Device minor number */
14639 + __u32 raid_disk; /* 3 The role of the device in the raid set */
14640 + __u32 state; /* 4 Operational state */
14641 + __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
14644 +#define MD_SB_MAGIC 0xa92b4efc
14647 + * Superblock state bits
14649 +#define MD_SB_CLEAN 0
14650 +#define MD_SB_ERRORS 1
14652 +typedef struct mdp_superblock_s {
14654 + * Constant generic information
14656 + __u32 md_magic; /* 0 MD identifier */
14657 + __u32 major_version; /* 1 major version to which the set conforms */
14658 + __u32 minor_version; /* 2 minor version ... */
14659 + __u32 patch_version; /* 3 patchlevel version ... */
14660 + __u32 gvalid_words; /* 4 Number of used words in this section */
14661 + __u32 set_uuid0; /* 5 Raid set identifier */
14662 + __u32 ctime; /* 6 Creation time */
14663 + __u32 level; /* 7 Raid personality */
14664 + __u32 size; /* 8 Apparent size of each individual disk */
14665 + __u32 nr_disks; /* 9 total disks in the raid set */
14666 + __u32 raid_disks; /* 10 disks in a fully functional raid set */
14667 + __u32 md_minor; /* 11 preferred MD minor device number */
14668 + __u32 not_persistent; /* 12 does it have a persistent superblock */
14669 + __u32 set_uuid1; /* 13 Raid set identifier #2 */
14670 + __u32 set_uuid2; /* 14 Raid set identifier #3 */
14671 + __u32 set_uuid3; /* 14 Raid set identifier #4 */
14672 + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
14675 + * Generic state information
14677 + __u32 utime; /* 0 Superblock update time */
14678 + __u32 state; /* 1 State bits (clean, ...) */
14679 + __u32 active_disks; /* 2 Number of currently active disks */
14680 + __u32 working_disks; /* 3 Number of working disks */
14681 + __u32 failed_disks; /* 4 Number of failed disks */
14682 + __u32 spare_disks; /* 5 Number of spare disks */
14683 + __u32 sb_csum; /* 6 checksum of the whole superblock */
14684 + __u64 events; /* 7 number of superblock updates (64-bit!) */
14685 + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9];
14688 + * Personality information
14690 + __u32 layout; /* 0 the array's physical layout */
14691 + __u32 chunk_size; /* 1 chunk size in bytes */
14692 + __u32 root_pv; /* 2 LV root PV */
14693 + __u32 root_block; /* 3 LV root block */
14694 + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
14697 + * Disks information
14699 + mdp_disk_t disks[MD_SB_DISKS];
14704 + __u32 reserved[MD_SB_RESERVED_WORDS];
14707 + * Active descriptor
14709 + mdp_disk_t this_disk;
14715 diff -ruN linux.orig/include/linux/raid/md_u.h linux-2.2.16/include/linux/raid/md_u.h
14716 --- linux.orig/include/linux/raid/md_u.h Thu Jan 1 01:00:00 1970
14717 +++ linux-2.2.16/include/linux/raid/md_u.h Fri Jun 9 11:37:44 2000
14720 + md_u.h : user <=> kernel API between Linux raidtools and RAID drivers
14721 + Copyright (C) 1998 Ingo Molnar
14723 + This program is free software; you can redistribute it and/or modify
14724 + it under the terms of the GNU General Public License as published by
14725 + the Free Software Foundation; either version 2, or (at your option)
14726 + any later version.
14728 + You should have received a copy of the GNU General Public License
14729 + (for example /usr/src/linux/COPYING); if not, write to the Free
14730 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
14739 +#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t)
14740 +#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t)
14741 +#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t)
14742 +#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13)
14744 +/* configuration */
14745 +#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20)
14746 +#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t)
14747 +#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22)
14748 +#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t)
14749 +#define SET_DISK_INFO _IO (MD_MAJOR, 0x24)
14750 +#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25)
14751 +#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26)
14752 +#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27)
14753 +#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28)
14754 +#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29)
14757 +#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t)
14758 +#define START_ARRAY _IO (MD_MAJOR, 0x31)
14759 +#define STOP_ARRAY _IO (MD_MAJOR, 0x32)
14760 +#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33)
14761 +#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34)
14763 +typedef struct mdu_version_s {
14769 +typedef struct mdu_array_info_s {
14771 + * Generic constant information
14773 + int major_version;
14774 + int minor_version;
14775 + int patch_version;
14782 + int not_persistent;
14785 + * Generic state information
14787 + int utime; /* 0 Superblock update time */
14788 + int state; /* 1 State bits (clean, ...) */
14789 + int active_disks; /* 2 Number of currently active disks */
14790 + int working_disks; /* 3 Number of working disks */
14791 + int failed_disks; /* 4 Number of failed disks */
14792 + int spare_disks; /* 5 Number of spare disks */
14795 + * Personality information
14797 + int layout; /* 0 the array's physical layout */
14798 + int chunk_size; /* 1 chunk size in bytes */
14800 +} mdu_array_info_t;
14802 +typedef struct mdu_disk_info_s {
14804 + * configuration/status of one particular disk
14812 +} mdu_disk_info_t;
14814 +typedef struct mdu_start_info_s {
14816 + * configuration/status of one particular disk
14823 +} mdu_start_info_t;
14825 +typedef struct mdu_param_s
14827 + int personality; /* 1,2,3,4 */
14828 + int chunk_size; /* in bytes */
14829 + int max_fault; /* unused for now */
14834 diff -ruN linux.orig/include/linux/raid/raid0.h linux-2.2.16/include/linux/raid/raid0.h
14835 --- linux.orig/include/linux/raid/raid0.h Thu Jan 1 01:00:00 1970
14836 +++ linux-2.2.16/include/linux/raid/raid0.h Fri Jun 9 11:37:44 2000
14841 +#include <linux/raid/md.h>
14845 + int zone_offset; /* Zone offset in md_dev */
14846 + int dev_offset; /* Zone offset in real dev */
14847 + int size; /* Zone size */
14848 + int nb_dev; /* # of devices attached to the zone */
14849 + mdk_rdev_t *dev[MAX_REAL]; /* Devices attached to the zone */
14854 + struct strip_zone *zone0, *zone1;
14857 +struct raid0_private_data
14859 + struct raid0_hash *hash_table; /* Dynamically allocated */
14860 + struct strip_zone *strip_zone; /* This one too */
14861 + int nr_strip_zones;
14862 + struct strip_zone *smallest;
14866 +typedef struct raid0_private_data raid0_conf_t;
14868 +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
14871 diff -ruN linux.orig/include/linux/raid/raid1.h linux-2.2.16/include/linux/raid/raid1.h
14872 --- linux.orig/include/linux/raid/raid1.h Thu Jan 1 01:00:00 1970
14873 +++ linux-2.2.16/include/linux/raid/raid1.h Fri Jun 9 11:37:44 2000
14878 +#include <linux/raid/md.h>
14880 +struct mirror_info {
14897 +struct raid1_private_data {
14899 + struct mirror_info mirrors[MD_SB_DISKS];
14902 + int working_disks;
14904 + unsigned long next_sect;
14906 + mdk_thread_t *thread, *resync_thread;
14907 + int resync_mirrors;
14908 + struct mirror_info *spare;
14911 +typedef struct raid1_private_data raid1_conf_t;
14914 + * this is the only point in the RAID code where we violate
14915 + * C type safety. mddev->private is an 'opaque' pointer.
14917 +#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private)
14920 + * this is our 'private' 'collective' RAID1 buffer head.
14921 + * it contains information about what kind of IO operations were started
14922 + * for this RAID1 operation, and about their status:
14926 + atomic_t remaining; /* 'have we finished' count,
14927 + * used from IRQ handlers
14930 + unsigned long state;
14932 + struct buffer_head *master_bh;
14933 + struct buffer_head *mirror_bh [MD_SB_DISKS];
14934 + struct buffer_head bh_req;
14935 + struct buffer_head *next_retry;
14939 diff -ruN linux.orig/include/linux/raid/raid5.h linux-2.2.16/include/linux/raid/raid5.h
14940 --- linux.orig/include/linux/raid/raid5.h Thu Jan 1 01:00:00 1970
14941 +++ linux-2.2.16/include/linux/raid/raid5.h Fri Jun 9 11:37:44 2000
14946 +#include <linux/raid/md.h>
14947 +#include <linux/raid/xor.h>
14949 +struct disk_info {
14959 +struct stripe_head {
14960 + md_spinlock_t stripe_lock;
14961 + struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
14962 + struct stripe_head *free_next; /* pool of free sh's */
14963 + struct buffer_head *buffer_pool; /* pool of free buffers */
14964 + struct buffer_head *bh_pool; /* pool of free bh's */
14965 + struct raid5_private_data *raid_conf;
14966 + struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
14967 + struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
14968 + struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
14969 + struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
14970 + int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
14971 + int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
14972 + unsigned long sector; /* sector of this row */
14973 + int size; /* buffers size */
14974 + int pd_idx; /* parity disk index */
14975 + atomic_t nr_pending; /* nr of pending cmds */
14976 + unsigned long state; /* state flags */
14977 + int cmd; /* stripe cmd */
14978 + int count; /* nr of waiters */
14979 + int write_method; /* reconstruct-write / read-modify-write */
14980 + int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
14981 + struct wait_queue *wait; /* processes waiting for this stripe */
14987 +#define PHASE_BEGIN 0
14988 +#define PHASE_READ_OLD 1
14989 +#define PHASE_WRITE 2
14990 +#define PHASE_READ 3
14991 +#define PHASE_COMPLETE 4
14996 +#define METHOD_NONE 0
14997 +#define RECONSTRUCT_WRITE 1
14998 +#define READ_MODIFY_WRITE 2
15003 +#define STRIPE_LOCKED 0
15004 +#define STRIPE_ERROR 1
15007 + * Stripe commands
15009 +#define STRIPE_NONE 0
15010 +#define STRIPE_WRITE 1
15011 +#define STRIPE_READ 2
15013 +struct raid5_private_data {
15014 + struct stripe_head **stripe_hashtbl;
15016 + mdk_thread_t *thread, *resync_thread;
15017 + struct disk_info disks[MD_SB_DISKS];
15018 + struct disk_info *spare;
15020 + int chunk_size, level, algorithm;
15021 + int raid_disks, working_disks, failed_disks;
15022 + int sector_count;
15023 + unsigned long next_sector;
15024 + atomic_t nr_handle;
15025 + struct stripe_head *next_free_stripe;
15027 + int resync_parity;
15028 + int max_nr_stripes;
15030 + int nr_hashed_stripes;
15031 + int nr_locked_stripes;
15032 + int nr_pending_stripes;
15033 + int nr_cached_stripes;
15036 + * Free stripes pool
15039 + struct stripe_head *free_sh_list;
15040 + struct wait_queue *wait_for_stripe;
15043 +typedef struct raid5_private_data raid5_conf_t;
15045 +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
15048 + * Our supported algorithms
15050 +#define ALGORITHM_LEFT_ASYMMETRIC 0
15051 +#define ALGORITHM_RIGHT_ASYMMETRIC 1
15052 +#define ALGORITHM_LEFT_SYMMETRIC 2
15053 +#define ALGORITHM_RIGHT_SYMMETRIC 3
15056 diff -ruN linux.orig/include/linux/raid/translucent.h linux-2.2.16/include/linux/raid/translucent.h
15057 --- linux.orig/include/linux/raid/translucent.h Thu Jan 1 01:00:00 1970
15058 +++ linux-2.2.16/include/linux/raid/translucent.h Fri Jun 9 11:37:44 2000
15060 +#ifndef _TRANSLUCENT_H
15061 +#define _TRANSLUCENT_H
15063 +#include <linux/raid/md.h>
15065 +typedef struct dev_info dev_info_t;
15072 +struct translucent_private_data
15074 + dev_info_t disks[MD_SB_DISKS];
15078 +typedef struct translucent_private_data translucent_conf_t;
15080 +#define mddev_to_conf(mddev) ((translucent_conf_t *) mddev->private)
15083 diff -ruN linux.orig/include/linux/raid/xor.h linux-2.2.16/include/linux/raid/xor.h
15084 --- linux.orig/include/linux/raid/xor.h Thu Jan 1 01:00:00 1970
15085 +++ linux-2.2.16/include/linux/raid/xor.h Fri Jun 9 11:37:44 2000
15090 +#include <linux/raid/md.h>
15092 +#define MAX_XOR_BLOCKS 5
15094 +extern void calibrate_xor_block(void);
15095 +extern void (*xor_block)(unsigned int count,
15096 + struct buffer_head **bh_ptr);
15099 diff -ruN linux.orig/include/linux/raid0.h linux-2.2.16/include/linux/raid0.h
15100 --- linux.orig/include/linux/raid0.h Tue Oct 29 14:20:24 1996
15101 +++ linux-2.2.16/include/linux/raid0.h Thu Jan 1 01:00:00 1970
15108 - int zone_offset; /* Zone offset in md_dev */
15109 - int dev_offset; /* Zone offset in real dev */
15110 - int size; /* Zone size */
15111 - int nb_dev; /* Number of devices attached to the zone */
15112 - struct real_dev *dev[MAX_REAL]; /* Devices attached to the zone */
15117 - struct strip_zone *zone0, *zone1;
15122 - struct raid0_hash *hash_table; /* Dynamically allocated */
15123 - struct strip_zone *strip_zone; /* This one too */
15124 - int nr_strip_zones;
15125 - struct strip_zone *smallest;
15130 diff -ruN linux.orig/include/linux/raid1.h linux-2.2.16/include/linux/raid1.h
15131 --- linux.orig/include/linux/raid1.h Fri May 8 09:17:13 1998
15132 +++ linux-2.2.16/include/linux/raid1.h Thu Jan 1 01:00:00 1970
15137 -#include <linux/md.h>
15139 -struct mirror_info {
15154 -struct raid1_data {
15155 - struct md_dev *mddev;
15156 - struct mirror_info mirrors[MD_SB_DISKS]; /* RAID1 devices, 2 to MD_SB_DISKS */
15158 - int working_disks; /* Number of working disks */
15160 - unsigned long next_sect;
15162 - int resync_running;
15166 - * this is our 'private' 'collective' RAID1 buffer head.
15167 - * it contains information about what kind of IO operations were started
15168 - * for this RAID5 operation, and about their status:
15172 - unsigned int remaining;
15174 - unsigned long state;
15175 - struct md_dev *mddev;
15176 - struct buffer_head *master_bh;
15177 - struct buffer_head *mirror_bh [MD_SB_DISKS];
15178 - struct buffer_head bh_req;
15179 - struct buffer_head *next_retry;
15183 diff -ruN linux.orig/include/linux/raid5.h linux-2.2.16/include/linux/raid5.h
15184 --- linux.orig/include/linux/raid5.h Fri May 8 09:17:13 1998
15185 +++ linux-2.2.16/include/linux/raid5.h Thu Jan 1 01:00:00 1970
15191 -#include <linux/md.h>
15192 -#include <asm/atomic.h>
15194 -struct disk_info {
15203 -struct stripe_head {
15204 - struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
15205 - struct stripe_head *free_next; /* pool of free sh's */
15206 - struct buffer_head *buffer_pool; /* pool of free buffers */
15207 - struct buffer_head *bh_pool; /* pool of free bh's */
15208 - struct raid5_data *raid_conf;
15209 - struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
15210 - struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
15211 - struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
15212 - struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
15213 - int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
15214 - int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
15215 - unsigned long sector; /* sector of this row */
15216 - int size; /* buffers size */
15217 - int pd_idx; /* parity disk index */
15218 - int nr_pending; /* nr of pending cmds */
15219 - unsigned long state; /* state flags */
15220 - int cmd; /* stripe cmd */
15221 - int count; /* nr of waiters */
15222 - int write_method; /* reconstruct-write / read-modify-write */
15223 - int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
15224 - struct wait_queue *wait; /* processes waiting for this stripe */
15230 -#define PHASE_BEGIN 0
15231 -#define PHASE_READ_OLD 1
15232 -#define PHASE_WRITE 2
15233 -#define PHASE_READ 3
15234 -#define PHASE_COMPLETE 4
15239 -#define METHOD_NONE 0
15240 -#define RECONSTRUCT_WRITE 1
15241 -#define READ_MODIFY_WRITE 2
15246 -#define STRIPE_LOCKED 0
15247 -#define STRIPE_ERROR 1
15250 - * Stripe commands
15252 -#define STRIPE_NONE 0
15253 -#define STRIPE_WRITE 1
15254 -#define STRIPE_READ 2
15256 -struct raid5_data {
15257 - struct stripe_head **stripe_hashtbl;
15258 - struct md_dev *mddev;
15259 - struct md_thread *thread, *resync_thread;
15260 - struct disk_info disks[MD_SB_DISKS];
15261 - struct disk_info *spare;
15263 - int chunk_size, level, algorithm;
15264 - int raid_disks, working_disks, failed_disks;
15265 - int sector_count;
15266 - unsigned long next_sector;
15267 - atomic_t nr_handle;
15268 - struct stripe_head *next_free_stripe;
15270 - int resync_parity;
15271 - int max_nr_stripes;
15273 - int nr_hashed_stripes;
15274 - int nr_locked_stripes;
15275 - int nr_pending_stripes;
15276 - int nr_cached_stripes;
15279 - * Free stripes pool
15282 - struct stripe_head *free_sh_list;
15283 - struct wait_queue *wait_for_stripe;
15289 - * Our supported algorithms
15291 -#define ALGORITHM_LEFT_ASYMMETRIC 0
15292 -#define ALGORITHM_RIGHT_ASYMMETRIC 1
15293 -#define ALGORITHM_LEFT_SYMMETRIC 2
15294 -#define ALGORITHM_RIGHT_SYMMETRIC 3
15297 diff -ruN linux.orig/include/linux/sysctl.h linux-2.2.16/include/linux/sysctl.h
15298 --- linux.orig/include/linux/sysctl.h Wed Jun 7 23:26:44 2000
15299 +++ linux-2.2.16/include/linux/sysctl.h Fri Jun 9 11:45:55 2000
15300 @@ -430,7 +430,8 @@
15301 /* CTL_DEV names: */
15309 /* /proc/sys/dev/cdrom */
15310 @@ -441,6 +442,11 @@
15313 DEV_CDROM_CHECK_MEDIA=6
15316 +/* /proc/sys/dev/md */
15318 + DEV_MD_SPEED_LIMIT=1
15322 diff -ruN linux.orig/init/main.c linux-2.2.16/init/main.c
15323 --- linux.orig/init/main.c Wed Jun 7 23:26:44 2000
15324 +++ linux-2.2.16/init/main.c Fri Jun 9 11:37:44 2000
15326 #include <linux/utsname.h>
15327 #include <linux/ioport.h>
15328 #include <linux/init.h>
15329 +#include <linux/raid/md.h>
15330 #include <linux/smp_lock.h>
15331 #include <linux/blk.h>
15332 #include <linux/hdreg.h>
15333 @@ -540,7 +541,7 @@
15334 #ifdef CONFIG_BLK_DEV_FD
15337 -#ifdef CONFIG_MD_BOOT
15338 +#if CONFIG_MD_BOOT || CONFIG_AUTODETECT_RAID
15341 #ifdef CONFIG_BLK_DEV_XD
15342 @@ -1042,6 +1043,9 @@
15343 #ifdef CONFIG_MD_BOOT
15344 { "md=", md_setup},
15346 +#if CONFIG_BLK_DEV_MD
15347 + { "raid=", raid_setup},
15349 #ifdef CONFIG_ADBMOUSE
15350 { "adb_buttons=", adb_mouse_setup },
15352 @@ -1580,6 +1584,9 @@
15353 while (pid != wait(&i));
15354 if (MAJOR(real_root_dev) != RAMDISK_MAJOR
15355 || MINOR(real_root_dev) != 0) {
15356 +#ifdef CONFIG_BLK_DEV_MD
15357 + autodetect_raid();
15359 error = change_root(real_root_dev,"/initrd");
15361 printk(KERN_ERR "Change root to /initrd: "