--- /dev/null
+diff -uNr linux-orig/Documentation/Configure.help linux/Documentation/Configure.help
+--- linux-orig/Documentation/Configure.help Fri Dec 8 22:53:50 2000
++++ linux/Documentation/Configure.help Fri Dec 8 22:54:52 2000
+@@ -1015,6 +1015,13 @@
+
+ If unsure, say N.
+
++Autodetect RAID partitions
++CONFIG_AUTODETECT_RAID
++  This feature lets the kernel detect RAID partitions at bootup.
++  An autodetect RAID partition is a normal partition with partition
++  type 0xfd. Use this if you want to boot from RAID devices, or want
++  RAID arrays to be started automatically at boot.
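++  For example, marking /dev/hda1 as type 0xfd (fdisk's "t" command)
++  makes the kernel assemble and start the array it belongs to at
++  boot time, with no need to run raidstart from user space.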
++
+ Linear (append) mode
+ CONFIG_MD_LINEAR
+ If you say Y here, then your multiple devices driver will be able to
+@@ -1093,6 +1100,21 @@
+ Documentation/modules.txt.
+
+ If unsure, say Y.
++
++Translucent Block Device Support (EXPERIMENTAL)
++CONFIG_MD_TRANSLUCENT
++ DO NOT USE THIS STUFF YET!
++
++  Currently there is only a placeholder here, as the implementation
++  is not yet usable.
++
++Hierarchical Storage Management support (EXPERIMENTAL)
++CONFIG_MD_HSM
++ DO NOT USE THIS STUFF YET!
++
++  I have released this so that people can comment on the architecture,
++  but the user-space tools are still unusable, so there is not much
++  you can do with it yet.
+
+ Boot support (linear, striped)
+ CONFIG_MD_BOOT
+diff -uNr linux-orig/arch/i386/defconfig linux/arch/i386/defconfig
+--- linux-orig/arch/i386/defconfig Fri Dec 8 22:53:50 2000
++++ linux/arch/i386/defconfig Fri Dec 8 22:54:52 2000
+@@ -93,7 +93,15 @@
+ #
+ # CONFIG_BLK_DEV_LOOP is not set
+ # CONFIG_BLK_DEV_NBD is not set
+-# CONFIG_BLK_DEV_MD is not set
++CONFIG_BLK_DEV_MD=y
++CONFIG_AUTODETECT_RAID=y
++CONFIG_MD_TRANSLUCENT=y
++CONFIG_MD_LINEAR=y
++CONFIG_MD_STRIPED=y
++CONFIG_MD_MIRRORING=y
++CONFIG_MD_RAID5=y
++CONFIG_MD_BOOT=y
++CONFIG_BLK_DEV_HSM=y
+ # CONFIG_BLK_DEV_RAM is not set
+ # CONFIG_BLK_DEV_XD is not set
+ # CONFIG_BLK_DEV_DAC960 is not set
+diff -uNr linux-orig/arch/sparc/config.in linux/arch/sparc/config.in
+--- linux-orig/arch/sparc/config.in Fri Dec 8 22:53:51 2000
++++ linux/arch/sparc/config.in Fri Dec 8 22:54:52 2000
+@@ -88,10 +88,16 @@
+
+ bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
+ if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
++ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
+ tristate ' Linear (append) mode' CONFIG_MD_LINEAR
+ tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
+ tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
+ tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
++ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
++ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
++fi
++if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
++ bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
+ fi
+
+ tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
+diff -uNr linux-orig/arch/sparc/defconfig linux/arch/sparc/defconfig
+--- linux-orig/arch/sparc/defconfig Fri Dec 8 22:53:51 2000
++++ linux/arch/sparc/defconfig Fri Dec 8 22:54:52 2000
+@@ -89,10 +89,13 @@
+ #
+ CONFIG_BLK_DEV_FD=y
+ CONFIG_BLK_DEV_MD=y
++# CONFIG_AUTODETECT_RAID is not set
+ CONFIG_MD_LINEAR=m
+ CONFIG_MD_STRIPED=m
+ CONFIG_MD_MIRRORING=m
+ CONFIG_MD_RAID5=m
++# CONFIG_MD_TRANSLUCENT is not set
++# CONFIG_MD_HSM is not set
+ CONFIG_BLK_DEV_RAM=y
+ CONFIG_BLK_DEV_RAM_SIZE=4096
+ CONFIG_BLK_DEV_INITRD=y
+diff -uNr linux-orig/arch/sparc64/config.in linux/arch/sparc64/config.in
+--- linux-orig/arch/sparc64/config.in Fri Dec 8 22:53:51 2000
++++ linux/arch/sparc64/config.in Fri Dec 8 22:54:52 2000
+@@ -102,10 +102,16 @@
+
+ bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
+ if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
++ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
+ tristate ' Linear (append) mode' CONFIG_MD_LINEAR
+ tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
+ tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
+ tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
++ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
++ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
++fi
++if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
++ bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
+ fi
+
+ tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
+diff -uNr linux-orig/arch/sparc64/defconfig linux/arch/sparc64/defconfig
+--- linux-orig/arch/sparc64/defconfig Fri Dec 8 22:53:51 2000
++++ linux/arch/sparc64/defconfig Fri Dec 8 22:54:52 2000
+@@ -106,10 +106,13 @@
+ #
+ CONFIG_BLK_DEV_FD=y
+ CONFIG_BLK_DEV_MD=y
++# CONFIG_AUTODETECT_RAID is not set
+ CONFIG_MD_LINEAR=m
+ CONFIG_MD_STRIPED=m
+ CONFIG_MD_MIRRORING=m
+ CONFIG_MD_RAID5=m
++# CONFIG_MD_TRANSLUCENT is not set
++# CONFIG_MD_HSM is not set
+ CONFIG_BLK_DEV_RAM=y
+ CONFIG_BLK_DEV_RAM_SIZE=4096
+ CONFIG_BLK_DEV_INITRD=y
+diff -uNr linux-orig/drivers/block/Config.in linux/drivers/block/Config.in
+--- linux-orig/drivers/block/Config.in Fri Dec 8 22:53:51 2000
++++ linux/drivers/block/Config.in Fri Dec 8 22:54:52 2000
+@@ -103,10 +103,13 @@
+ fi
+ bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
+ if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
++ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
+ tristate ' Linear (append) mode' CONFIG_MD_LINEAR
+ tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
+ tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
+ tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
++ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
++ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
+ fi
+ if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
+ bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
+diff -uNr linux-orig/drivers/block/Makefile linux/drivers/block/Makefile
+--- linux-orig/drivers/block/Makefile Fri Dec 8 22:53:51 2000
++++ linux/drivers/block/Makefile Fri Dec 8 22:54:52 2000
+@@ -294,10 +294,28 @@
+ endif
+
+ ifeq ($(CONFIG_MD_RAID5),y)
++LX_OBJS += xor.o
+ L_OBJS += raid5.o
+ else
+ ifeq ($(CONFIG_MD_RAID5),m)
++ LX_OBJS += xor.o
+ M_OBJS += raid5.o
++ endif
++endif
++
++ifeq ($(CONFIG_MD_TRANSLUCENT),y)
++L_OBJS += translucent.o
++else
++ ifeq ($(CONFIG_MD_TRANSLUCENT),m)
++ M_OBJS += translucent.o
++ endif
++endif
++
++ifeq ($(CONFIG_MD_HSM),y)
++L_OBJS += hsm.o
++else
++ ifeq ($(CONFIG_MD_HSM),m)
++ M_OBJS += hsm.o
+ endif
+ endif
+
+diff -uNr linux-orig/drivers/block/genhd.c linux/drivers/block/genhd.c
+--- linux-orig/drivers/block/genhd.c Fri Dec 8 22:53:56 2000
++++ linux/drivers/block/genhd.c Fri Dec 8 22:54:52 2000
+@@ -28,6 +28,7 @@
+ #include <linux/string.h>
+ #include <linux/blk.h>
+ #include <linux/init.h>
++#include <linux/raid/md.h>
+
+ #ifdef CONFIG_ARCH_S390
+ #include <asm/dasd.h>
+@@ -1731,6 +1732,9 @@
+ else
+ #endif
+ rd_load();
++#endif
++#ifdef CONFIG_BLK_DEV_MD
++ autodetect_raid();
+ #endif
+ #ifdef CONFIG_MD_BOOT
+ md_setup_drive();
+diff -uNr linux-orig/drivers/block/hsm.c linux/drivers/block/hsm.c
+--- linux-orig/drivers/block/hsm.c Wed Dec 31 17:00:00 1969
++++ linux/drivers/block/hsm.c Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,840 @@
++/*
++ hsm.c : HSM RAID driver for Linux
++ Copyright (C) 1998 Ingo Molnar
++
++ HSM mode management functions.
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2, or (at your option)
++ any later version.
++
++ You should have received a copy of the GNU General Public License
++ (for example /usr/src/linux/COPYING); if not, write to the Free
++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++#include <linux/module.h>
++
++#include <linux/raid/md.h>
++#include <linux/malloc.h>
++
++#include <linux/raid/hsm.h>
++#include <linux/blk.h>
++
++#define MAJOR_NR MD_MAJOR
++#define MD_DRIVER
++#define MD_PERSONALITY
++
++
++#define DEBUG_HSM 1
++
++#if DEBUG_HSM
++#define dprintk(x,y...) printk(x,##y)
++#else
++#define dprintk(x,y...) do { } while (0)
++#endif
++
++void print_bh(struct buffer_head *bh)
++{
++ dprintk("bh %p: %lx %lx %x %x %lx %p %lx %p %x %p %x %lx\n", bh,
++ bh->b_blocknr, bh->b_size, bh->b_dev, bh->b_rdev,
++ bh->b_rsector, bh->b_this_page, bh->b_state,
++ bh->b_next_free, bh->b_count, bh->b_data,
++ bh->b_list, bh->b_flushtime
++ );
++}
++
++static int check_bg (pv_t *pv, pv_block_group_t * bg)
++{
++ int i, free = 0;
++
++ dprintk("checking bg ...\n");
++
++ for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
++ if (pv_pptr_free(bg->blocks + i)) {
++ free++;
++ if (test_bit(i, bg->used_bitmap)) {
++ printk("hm, bit %d set?\n", i);
++ }
++ } else {
++ if (!test_bit(i, bg->used_bitmap)) {
++ printk("hm, bit %d not set?\n", i);
++ }
++ }
++ }
++ dprintk("%d free blocks in bg ...\n", free);
++ return free;
++}
++
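++/*
++ * On-disk PV layout, as read below: block 0 holds the PV superblock,
++ * block 1 the VG superblock, and the block groups follow, each one
++ * pv_bg_size blocks long -- hence the "+ 2" in bg_pos.
++ */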
++static void get_bg (pv_t *pv, pv_bg_desc_t *desc, int nr)
++{
++ unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
++ struct buffer_head *bh;
++
++ dprintk("... getting BG at %u ...\n", bg_pos);
++
++ bh = bread (pv->dev, bg_pos, HSM_BLOCKSIZE);
++ if (!bh) {
++ MD_BUG();
++ return;
++ }
++ desc->bg = (pv_block_group_t *) bh->b_data;
++ desc->free_blocks = check_bg(pv, desc->bg);
++}
++
++static int find_free_block (lv_t *lv, pv_t *pv, pv_bg_desc_t *desc, int nr,
++ unsigned int lblock, lv_lptr_t * index)
++{
++ int i;
++
++ for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
++ pv_pptr_t * bptr = desc->bg->blocks + i;
++ if (pv_pptr_free(bptr)) {
++ unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
++
++ if (test_bit(i, desc->bg->used_bitmap)) {
++ MD_BUG();
++ continue;
++ }
++ bptr->u.used.owner.log_id = lv->log_id;
++ bptr->u.used.owner.log_index = lblock;
++ index->data.phys_nr = pv->phys_nr;
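++			/* the data blocks follow the descriptor
++			 * block of the group, hence the "+ 1": */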
++ index->data.phys_block = bg_pos + i + 1;
++ set_bit(i, desc->bg->used_bitmap);
++ desc->free_blocks--;
++ dprintk(".....free blocks left in bg %p: %d\n",
++ desc->bg, desc->free_blocks);
++ return 0;
++ }
++ }
++ return -ENOSPC;
++}
++
++static int __get_free_block (lv_t *lv, pv_t *pv,
++ unsigned int lblock, lv_lptr_t * index)
++{
++ int i;
++
++ dprintk("trying to get free block for lblock %d ...\n", lblock);
++
++ for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
++ pv_bg_desc_t *desc = pv->bg_array + i;
++
++ dprintk("looking at desc #%d (%p)...\n", i, desc->bg);
++ if (!desc->bg)
++ get_bg(pv, desc, i);
++
++ if (desc->bg && desc->free_blocks)
++ return find_free_block(lv, pv, desc, i,
++ lblock, index);
++ }
++ dprintk("hsm: pv %s full!\n", partition_name(pv->dev));
++ return -ENOSPC;
++}
++
++static int get_free_block (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
++{
++ int err;
++
++ if (!lv->free_indices)
++ return -ENOSPC;
++
++	/* fix me: this allocates from the first PV only */
++ err = __get_free_block(lv, lv->vg->pv_array + 0, lblock, index);
++
++ if (err || !index->data.phys_block) {
++ MD_BUG();
++ return -ENOSPC;
++ }
++
++ lv->free_indices--;
++
++ return 0;
++}
++
++/*
++ * fix me: wordsize assumptions ...
++ */
++#define INDEX_BITS 8
++#define INDEX_DEPTH (32/INDEX_BITS)
++#define INDEX_MASK ((1<<INDEX_BITS) - 1)
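++
++/*
++ * (A logical block number is thus a radix-256 path through the index
++ * tree: e.g. lblock 0xAABBCCDD descends via group slots 0xAA, 0xBB
++ * and 0xCC, and slot 0xDD in the last group points at the data block.)
++ */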
++
++static void print_index_list (lv_t *lv, lv_lptr_t *index)
++{
++ lv_lptr_t *tmp;
++ int i;
++
++ dprintk("... block <%u,%u,%x> [.", index->data.phys_nr,
++ index->data.phys_block, index->cpu_addr);
++
++ tmp = index_child(index);
++ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
++ if (index_block(lv, tmp))
++ dprintk("(%d->%d)", i, index_block(lv, tmp));
++ tmp++;
++ }
++ dprintk(".]\n");
++}
++
++static int read_index_group (lv_t *lv, lv_lptr_t *index)
++{
++ lv_lptr_t *index_group, *tmp;
++ struct buffer_head *bh;
++ int i;
++
++ dprintk("reading index group <%s:%d>\n",
++ partition_name(index_dev(lv, index)), index_block(lv, index));
++
++ bh = bread(index_dev(lv, index), index_block(lv, index), HSM_BLOCKSIZE);
++ if (!bh) {
++ MD_BUG();
++ return -EIO;
++ }
++ if (!buffer_uptodate(bh))
++ MD_BUG();
++
++ index_group = (lv_lptr_t *) bh->b_data;
++ tmp = index_group;
++ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
++ if (index_block(lv, tmp)) {
++ dprintk("index group has BLOCK %d, non-present.\n", i);
++ tmp->cpu_addr = 0;
++ }
++ tmp++;
++ }
++ index->cpu_addr = ptr_to_cpuaddr(index_group);
++
++ dprintk("have read index group %p at block %d.\n",
++ index_group, index_block(lv, index));
++ print_index_list(lv, index);
++
++ return 0;
++}
++
++static int alloc_index_group (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
++{
++ struct buffer_head *bh;
++ lv_lptr_t * index_group;
++
++ if (get_free_block(lv, lblock, index))
++ return -ENOSPC;
++
++ dprintk("creating block for index group <%s:%d>\n",
++ partition_name(index_dev(lv, index)), index_block(lv, index));
++
++ bh = getblk(index_dev(lv, index),
++ index_block(lv, index), HSM_BLOCKSIZE);
++
++ index_group = (lv_lptr_t *) bh->b_data;
++ md_clear_page(index_group);
++ mark_buffer_uptodate(bh, 1);
++
++ index->cpu_addr = ptr_to_cpuaddr(index_group);
++
++ dprintk("allocated index group %p at block %d.\n",
++ index_group, index_block(lv, index));
++ return 0;
++}
++
++static lv_lptr_t * alloc_fixed_index (lv_t *lv, unsigned int lblock)
++{
++ lv_lptr_t * index = index_child(&lv->root_index);
++ int idx, l;
++
++ for (l = INDEX_DEPTH-1; l >= 0; l--) {
++ idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
++ index += idx;
++ if (!l)
++ break;
++ if (!index_present(index)) {
++ dprintk("no group, level %u, pos %u\n", l, idx);
++ if (alloc_index_group(lv, lblock, index))
++ return NULL;
++ }
++ index = index_child(index);
++ }
++ if (!index_block(lv,index)) {
++ dprintk("no data, pos %u\n", idx);
++ if (get_free_block(lv, lblock, index))
++ return NULL;
++ return index;
++ }
++ MD_BUG();
++ return index;
++}
++
++static lv_lptr_t * find_index (lv_t *lv, unsigned int lblock)
++{
++ lv_lptr_t * index = index_child(&lv->root_index);
++ int idx, l;
++
++ for (l = INDEX_DEPTH-1; l >= 0; l--) {
++ idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
++ index += idx;
++ if (!l)
++ break;
++ if (index_free(index))
++ return NULL;
++ if (!index_present(index))
++ read_index_group(lv, index);
++ if (!index_present(index)) {
++ MD_BUG();
++ return NULL;
++ }
++ index = index_child(index);
++ }
++ if (!index_block(lv,index))
++ return NULL;
++ return index;
++}
++
++static int read_root_index(lv_t *lv)
++{
++ int err;
++ lv_lptr_t *index = &lv->root_index;
++
++ if (!index_block(lv, index)) {
++ printk("LV has no root index yet, creating.\n");
++
++ err = alloc_index_group (lv, 0, index);
++ if (err) {
++ printk("could not create index group, err:%d\n", err);
++ return err;
++ }
++ lv->vg->vg_sb->lv_array[lv->log_id].lv_root_idx =
++ lv->root_index.data;
++ } else {
++ printk("LV already has a root index.\n");
++ printk("... at <%s:%d>.\n",
++ partition_name(index_dev(lv, index)),
++ index_block(lv, index));
++
++ read_index_group(lv, index);
++ }
++ return 0;
++}
++
++static int init_pv(pv_t *pv)
++{
++ struct buffer_head *bh;
++ pv_sb_t *pv_sb;
++
++ bh = bread (pv->dev, 0, HSM_BLOCKSIZE);
++ if (!bh) {
++ MD_BUG();
++ return -1;
++ }
++
++ pv_sb = (pv_sb_t *) bh->b_data;
++ pv->pv_sb = pv_sb;
++
++ if (pv_sb->pv_magic != HSM_PV_SB_MAGIC) {
++ printk("%s is not a PV, has magic %x instead of %x!\n",
++ partition_name(pv->dev), pv_sb->pv_magic,
++ HSM_PV_SB_MAGIC);
++ return -1;
++ }
++ printk("%s detected as a valid PV (#%d).\n", partition_name(pv->dev),
++ pv->phys_nr);
++ printk("... created under HSM version %d.%d.%d, at %x.\n",
++ pv_sb->pv_major, pv_sb->pv_minor, pv_sb->pv_patch, pv_sb->pv_ctime);
++ printk("... total # of blocks: %d (%d left unallocated).\n",
++ pv_sb->pv_total_size, pv_sb->pv_blocks_left);
++
++ printk("... block size: %d bytes.\n", pv_sb->pv_block_size);
++ printk("... block descriptor size: %d bytes.\n", pv_sb->pv_pptr_size);
++ printk("... block group size: %d blocks.\n", pv_sb->pv_bg_size);
++ printk("... # of block groups: %d.\n", pv_sb->pv_block_groups);
++
++ if (pv_sb->pv_block_groups*sizeof(pv_bg_desc_t) > PAGE_SIZE) {
++ MD_BUG();
++ return 1;
++ }
++ pv->bg_array = (pv_bg_desc_t *)__get_free_page(GFP_KERNEL);
++ if (!pv->bg_array) {
++ MD_BUG();
++ return 1;
++ }
++ memset(pv->bg_array, 0, PAGE_SIZE);
++
++ return 0;
++}
++
++static int free_pv(pv_t *pv)
++{
++ struct buffer_head *bh;
++
++ dprintk("freeing PV %d ...\n", pv->phys_nr);
++
++ if (pv->bg_array) {
++ int i;
++
++ dprintk(".... freeing BGs ...\n");
++ for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
++ unsigned int bg_pos = i * pv->pv_sb->pv_bg_size + 2;
++ pv_bg_desc_t *desc = pv->bg_array + i;
++
++ if (desc->bg) {
++ dprintk(".... freeing BG %d ...\n", i);
++ bh = getblk (pv->dev, bg_pos, HSM_BLOCKSIZE);
++ mark_buffer_dirty(bh, 1);
++ brelse(bh);
++ brelse(bh);
++ }
++ }
++ free_page((unsigned long)pv->bg_array);
++ } else
++ MD_BUG();
++
++ bh = getblk (pv->dev, 0, HSM_BLOCKSIZE);
++ if (!bh) {
++ MD_BUG();
++ return -1;
++ }
++ mark_buffer_dirty(bh, 1);
++ brelse(bh);
++ brelse(bh);
++
++ return 0;
++}
++
++struct semaphore hsm_sem = MUTEX;
++
++#define HSM_SECTORS (HSM_BLOCKSIZE/512)
++
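++/*
++ * hsm_map(): turn a logical sector on an LV into a (physical device,
++ * physical sector) pair. A logical block that is touched for the
++ * first time gets a physical block allocated on the fly.
++ */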
++static int hsm_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
++ unsigned long *rsector, unsigned long bsectors)
++{
++ lv_t *lv = kdev_to_lv(dev);
++ lv_lptr_t *index;
++ unsigned int lblock = *rsector / HSM_SECTORS;
++ unsigned int offset = *rsector % HSM_SECTORS;
++ int err = -EIO;
++
++ if (!lv) {
++ printk("HSM: md%d not a Logical Volume!\n", mdidx(mddev));
++ goto out;
++ }
++ if (offset + bsectors > HSM_SECTORS) {
++ MD_BUG();
++ goto out;
++ }
++ down(&hsm_sem);
++ index = find_index(lv, lblock);
++ if (!index) {
++ printk("no block %u yet ... allocating\n", lblock);
++ index = alloc_fixed_index(lv, lblock);
++ }
++
++ err = 0;
++
++ printk(" %u <%s : %ld(%ld)> -> ", lblock,
++ partition_name(*rdev), *rsector, bsectors);
++
++ *rdev = index_dev(lv, index);
++ *rsector = index_block(lv, index) * HSM_SECTORS + offset;
++
++ printk(" <%s : %ld> %u\n",
++ partition_name(*rdev), *rsector, index_block(lv, index));
++
++ up(&hsm_sem);
++out:
++ return err;
++}
++
++static void free_index (lv_t *lv, lv_lptr_t * index)
++{
++ struct buffer_head *bh;
++
++	printk("trying to get the cached block for index group <%s:%d>\n",
++ partition_name(index_dev(lv, index)), index_block(lv, index));
++
++ bh = getblk(index_dev(lv, index), index_block(lv, index),HSM_BLOCKSIZE);
++
++ printk("....FREEING ");
++ print_index_list(lv, index);
++
++ if (bh) {
++ if (!buffer_uptodate(bh))
++ MD_BUG();
++ if ((lv_lptr_t *)bh->b_data != index_child(index)) {
++ printk("huh? b_data is %p, index content is %p.\n",
++ bh->b_data, index_child(index));
++ } else
++ printk("good, b_data == index content == %p.\n",
++ index_child(index));
++ printk("b_count == %d, writing.\n", bh->b_count);
++ mark_buffer_dirty(bh, 1);
++ brelse(bh);
++ brelse(bh);
++ printk("done.\n");
++ } else {
++ printk("FAILED!\n");
++ }
++ print_index_list(lv, index);
++ index_child(index) = NULL;
++}
++
++static void free_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
++{
++ char dots [3*8];
++ lv_lptr_t * index;
++ int i, nr_dots;
++
++ nr_dots = (INDEX_DEPTH-level)*3;
++ memcpy(dots,"...............",nr_dots);
++ dots[nr_dots] = 0;
++
++ dprintk("%s level %d index group block:\n", dots, level);
++
++
++ index = index_0;
++ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
++ if (index->data.phys_block) {
++ dprintk("%s block <%u,%u,%x>\n", dots,
++ index->data.phys_nr,
++ index->data.phys_block,
++ index->cpu_addr);
++ if (level && index_present(index)) {
++ dprintk("%s==> deeper one level\n", dots);
++ free_index_group(lv, level-1,
++ index_child(index));
++ dprintk("%s freeing index group block %p ...",
++ dots, index_child(index));
++ free_index(lv, index);
++ }
++ }
++ index++;
++ }
++ dprintk("%s DONE: level %d index group block.\n", dots, level);
++}
++
++static void free_lv_indextree (lv_t *lv)
++{
++ dprintk("freeing LV %d ...\n", lv->log_id);
++ dprintk("..root index: %p\n", index_child(&lv->root_index));
++ dprintk("..INDEX TREE:\n");
++ free_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
++ dprintk("..freeing root index %p ...", index_child(&lv->root_index));
++ dprintk("root block <%u,%u,%x>\n", lv->root_index.data.phys_nr,
++ lv->root_index.data.phys_block, lv->root_index.cpu_addr);
++ free_index(lv, &lv->root_index);
++ dprintk("..INDEX TREE done.\n");
++ fsync_dev(lv->vg->pv_array[0].dev); /* fix me */
++ lv->vg->vg_sb->lv_array[lv->log_id].lv_free_indices = lv->free_indices;
++}
++
++static void print_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
++{
++ char dots [3*5];
++ lv_lptr_t * index;
++ int i, nr_dots;
++
++ nr_dots = (INDEX_DEPTH-level)*3;
++ memcpy(dots,"...............",nr_dots);
++ dots[nr_dots] = 0;
++
++ dprintk("%s level %d index group block:\n", dots, level);
++
++
++ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
++ index = index_0 + i;
++ if (index->data.phys_block) {
++ dprintk("%s block <%u,%u,%x>\n", dots,
++ index->data.phys_nr,
++ index->data.phys_block,
++ index->cpu_addr);
++ if (level && index_present(index)) {
++ dprintk("%s==> deeper one level\n", dots);
++ print_index_group(lv, level-1,
++ index_child(index));
++ }
++ }
++ }
++ dprintk("%s DONE: level %d index group block.\n", dots, level);
++}
++
++static void print_lv (lv_t *lv)
++{
++ dprintk("printing LV %d ...\n", lv->log_id);
++ dprintk("..root index: %p\n", index_child(&lv->root_index));
++ dprintk("..INDEX TREE:\n");
++ print_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
++ dprintk("..INDEX TREE done.\n");
++}
++
++static int map_lv (lv_t *lv)
++{
++ kdev_t dev = lv->dev;
++ unsigned int nr = MINOR(dev);
++ mddev_t *mddev = lv->vg->mddev;
++
++ if (MAJOR(dev) != MD_MAJOR) {
++ MD_BUG();
++ return -1;
++ }
++ if (kdev_to_mddev(dev)) {
++ MD_BUG();
++ return -1;
++ }
++ md_hd_struct[nr].start_sect = 0;
++ md_hd_struct[nr].nr_sects = md_size[mdidx(mddev)] << 1;
++ md_size[nr] = md_size[mdidx(mddev)];
++ add_mddev_mapping(mddev, dev, lv);
++
++ return 0;
++}
++
++static int unmap_lv (lv_t *lv)
++{
++ kdev_t dev = lv->dev;
++ unsigned int nr = MINOR(dev);
++
++ if (MAJOR(dev) != MD_MAJOR) {
++ MD_BUG();
++ return -1;
++ }
++ md_hd_struct[nr].start_sect = 0;
++ md_hd_struct[nr].nr_sects = 0;
++ md_size[nr] = 0;
++ del_mddev_mapping(lv->vg->mddev, dev);
++
++ return 0;
++}
++
++static int init_vg (vg_t *vg)
++{
++ int i;
++ lv_t *lv;
++ kdev_t dev;
++ vg_sb_t *vg_sb;
++ struct buffer_head *bh;
++ lv_descriptor_t *lv_desc;
++
++ /*
++ * fix me: read all PVs and compare the SB
++ */
++ dev = vg->pv_array[0].dev;
++ bh = bread (dev, 1, HSM_BLOCKSIZE);
++ if (!bh) {
++ MD_BUG();
++ return -1;
++ }
++
++ vg_sb = (vg_sb_t *) bh->b_data;
++ vg->vg_sb = vg_sb;
++
++ if (vg_sb->vg_magic != HSM_VG_SB_MAGIC) {
++ printk("%s is not a valid VG, has magic %x instead of %x!\n",
++ partition_name(dev), vg_sb->vg_magic,
++ HSM_VG_SB_MAGIC);
++ return -1;
++ }
++
++ vg->nr_lv = 0;
++ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
++ unsigned int id;
++ lv_desc = vg->vg_sb->lv_array + i;
++
++ id = lv_desc->lv_id;
++ if (!id) {
++ printk("... LV desc %d empty\n", i);
++ continue;
++ }
++ if (id >= HSM_MAX_LVS_PER_VG) {
++ MD_BUG();
++ continue;
++ }
++
++ lv = vg->lv_array + id;
++ if (lv->vg) {
++ MD_BUG();
++ continue;
++ }
++ lv->log_id = id;
++ lv->vg = vg;
++ lv->max_indices = lv_desc->lv_max_indices;
++ lv->free_indices = lv_desc->lv_free_indices;
++ lv->root_index.data = lv_desc->lv_root_idx;
++ lv->dev = MKDEV(MD_MAJOR, lv_desc->md_id);
++
++ vg->nr_lv++;
++
++ map_lv(lv);
++ if (read_root_index(lv)) {
++ vg->nr_lv--;
++ unmap_lv(lv);
++ memset(lv, 0, sizeof(*lv));
++ }
++ }
++ if (vg->nr_lv != vg_sb->nr_lvs)
++ MD_BUG();
++
++ return 0;
++}
++
++static int hsm_run (mddev_t *mddev)
++{
++ int i;
++ vg_t *vg;
++ mdk_rdev_t *rdev;
++
++ MOD_INC_USE_COUNT;
++
++ vg = kmalloc (sizeof (*vg), GFP_KERNEL);
++ if (!vg)
++ goto out;
++ memset(vg, 0, sizeof(*vg));
++ mddev->private = vg;
++ vg->mddev = mddev;
++
++ if (md_check_ordering(mddev)) {
++ printk("hsm: disks are not ordered, aborting!\n");
++ goto out;
++ }
++
++ set_blocksize (mddev_to_kdev(mddev), HSM_BLOCKSIZE);
++
++ vg->nr_pv = mddev->nb_dev;
++ ITERATE_RDEV_ORDERED(mddev,rdev,i) {
++ pv_t *pv = vg->pv_array + i;
++
++ pv->dev = rdev->dev;
++ fsync_dev (pv->dev);
++ set_blocksize (pv->dev, HSM_BLOCKSIZE);
++ pv->phys_nr = i;
++ if (init_pv(pv))
++ goto out;
++ }
++
++ init_vg(vg);
++
++ return 0;
++
++out:
++ if (vg) {
++ kfree(vg);
++ mddev->private = NULL;
++ }
++ MOD_DEC_USE_COUNT;
++
++ return 1;
++}
++
++static int hsm_stop (mddev_t *mddev)
++{
++ lv_t *lv;
++ vg_t *vg;
++ int i;
++
++ vg = mddev_to_vg(mddev);
++
++ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
++ lv = vg->lv_array + i;
++ if (!lv->log_id)
++ continue;
++ print_lv(lv);
++ free_lv_indextree(lv);
++ unmap_lv(lv);
++ }
++ for (i = 0; i < vg->nr_pv; i++)
++ free_pv(vg->pv_array + i);
++
++ kfree(vg);
++
++ MOD_DEC_USE_COUNT;
++
++ return 0;
++}
++
++
++static int hsm_status (char *page, mddev_t *mddev)
++{
++ int sz = 0, i;
++ lv_t *lv;
++ vg_t *vg;
++
++ vg = mddev_to_vg(mddev);
++
++ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
++ lv = vg->lv_array + i;
++ if (!lv->log_id)
++ continue;
++ sz += sprintf(page+sz, "<LV%d %d/%d blocks used> ", lv->log_id,
++ lv->max_indices - lv->free_indices, lv->max_indices);
++ }
++ return sz;
++}
++
++
++static mdk_personality_t hsm_personality=
++{
++ "hsm",
++ hsm_map,
++ NULL,
++ NULL,
++ hsm_run,
++ hsm_stop,
++ hsm_status,
++ NULL,
++ 0,
++ NULL,
++ NULL,
++ NULL,
++ NULL
++};
++
++#ifndef MODULE
++
++md__initfunc(void hsm_init (void))
++{
++ register_md_personality (HSM, &hsm_personality);
++}
++
++#else
++
++int init_module (void)
++{
++ return (register_md_personality (HSM, &hsm_personality));
++}
++
++void cleanup_module (void)
++{
++ unregister_md_personality (HSM);
++}
++
++#endif
++
++/*
++ * This Linus-trick catches bugs via the linker: each size check below
++ * is a compile-time constant, so if the assumption holds the call is
++ * optimized away, while a violated assumption leaves a call to an
++ * undefined function and the final link fails.
++ */
++
++extern void __BUG__in__hsm_dot_c_1(void);
++extern void __BUG__in__hsm_dot_c_2(void);
++extern void __BUG__in__hsm_dot_c_3(void);
++extern void __BUG__in__hsm_dot_c_4(void);
++extern void __BUG__in__hsm_dot_c_5(void);
++extern void __BUG__in__hsm_dot_c_6(void);
++extern void __BUG__in__hsm_dot_c_7(void);
++
++void bugcatcher (void)
++{
++ if (sizeof(pv_block_group_t) != HSM_BLOCKSIZE)
++ __BUG__in__hsm_dot_c_1();
++ if (sizeof(lv_index_block_t) != HSM_BLOCKSIZE)
++ __BUG__in__hsm_dot_c_2();
++
++ if (sizeof(pv_sb_t) != HSM_BLOCKSIZE)
++ __BUG__in__hsm_dot_c_4();
++ if (sizeof(lv_sb_t) != HSM_BLOCKSIZE)
++ __BUG__in__hsm_dot_c_3();
++ if (sizeof(vg_sb_t) != HSM_BLOCKSIZE)
++ __BUG__in__hsm_dot_c_6();
++
++ if (sizeof(lv_lptr_t) != 16)
++ __BUG__in__hsm_dot_c_5();
++ if (sizeof(pv_pptr_t) != 16)
++		__BUG__in__hsm_dot_c_7();
++}
++
+diff -uNr linux-orig/drivers/block/linear.c linux/drivers/block/linear.c
+--- linux-orig/drivers/block/linear.c Sat Nov 8 12:39:12 1997
++++ linux/drivers/block/linear.c Fri Dec 8 22:54:52 2000
+@@ -1,4 +1,3 @@
+-
+ /*
+ linear.c : Multiple Devices driver for Linux
+ Copyright (C) 1994-96 Marc ZYNGIER
+@@ -19,186 +18,207 @@
+
+ #include <linux/module.h>
+
+-#include <linux/md.h>
++#include <linux/raid/md.h>
+ #include <linux/malloc.h>
+-#include <linux/init.h>
+
+-#include "linear.h"
++#include <linux/raid/linear.h>
+
+ #define MAJOR_NR MD_MAJOR
+ #define MD_DRIVER
+ #define MD_PERSONALITY
+
+-static int linear_run (int minor, struct md_dev *mddev)
++static int linear_run (mddev_t *mddev)
+ {
+- int cur=0, i, size, dev0_size, nb_zone;
+- struct linear_data *data;
+-
+- MOD_INC_USE_COUNT;
+-
+- mddev->private=kmalloc (sizeof (struct linear_data), GFP_KERNEL);
+- data=(struct linear_data *) mddev->private;
+-
+- /*
+- Find out the smallest device. This was previously done
+- at registry time, but since it violates modularity,
+- I moved it here... Any comment ? ;-)
+- */
+-
+- data->smallest=mddev->devices;
+- for (i=1; i<mddev->nb_dev; i++)
+- if (data->smallest->size > mddev->devices[i].size)
+- data->smallest=mddev->devices+i;
+-
+- nb_zone=data->nr_zones=
+- md_size[minor]/data->smallest->size +
+- (md_size[minor]%data->smallest->size ? 1 : 0);
+-
+- data->hash_table=kmalloc (sizeof (struct linear_hash)*nb_zone, GFP_KERNEL);
+-
+- size=mddev->devices[cur].size;
++ linear_conf_t *conf;
++ struct linear_hash *table;
++ mdk_rdev_t *rdev;
++ int size, i, j, nb_zone;
++ unsigned int curr_offset;
++
++ MOD_INC_USE_COUNT;
++
++ conf = kmalloc (sizeof (*conf), GFP_KERNEL);
++ if (!conf)
++ goto out;
++ mddev->private = conf;
++
++ if (md_check_ordering(mddev)) {
++ printk("linear: disks are not ordered, aborting!\n");
++ goto out;
++ }
++ /*
++ * Find the smallest device.
++ */
++
++ conf->smallest = NULL;
++ curr_offset = 0;
++ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
++ dev_info_t *disk = conf->disks + j;
++
++ disk->dev = rdev->dev;
++ disk->size = rdev->size;
++ disk->offset = curr_offset;
++
++ curr_offset += disk->size;
++
++ if (!conf->smallest || (disk->size < conf->smallest->size))
++ conf->smallest = disk;
++ }
++
++ nb_zone = conf->nr_zones =
++ md_size[mdidx(mddev)] / conf->smallest->size +
++ ((md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
++
++ conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
++ GFP_KERNEL);
++ if (!conf->hash_table)
++ goto out;
++
++ /*
++ * Here we generate the linear hash table
++ */
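++	/*
++	 * (Each slot spans conf->smallest->size blocks; since no member
++	 * device is smaller than that, a slot crosses at most one device
++	 * boundary: dev0 starts the slot, dev1 -- if set -- finishes it.)
++	 */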
++ table = conf->hash_table;
++ i = 0;
++ size = 0;
++ for (j = 0; j < mddev->nb_dev; j++) {
++ dev_info_t *disk = conf->disks + j;
++
++ if (size < 0) {
++ table->dev1 = disk;
++ table++;
++ }
++ size += disk->size;
++
++ while (size) {
++ table->dev0 = disk;
++ size -= conf->smallest->size;
++ if (size < 0)
++ break;
++ table->dev1 = NULL;
++ table++;
++ }
++ }
++ table->dev1 = NULL;
++
++ return 0;
++
++out:
++ if (conf)
++ kfree(conf);
++ MOD_DEC_USE_COUNT;
++ return 1;
++}
++
++static int linear_stop (mddev_t *mddev)
++{
++ linear_conf_t *conf = mddev_to_conf(mddev);
++
++ kfree(conf->hash_table);
++ kfree(conf);
+
+- i=0;
+- while (cur<mddev->nb_dev)
+- {
+- data->hash_table[i].dev0=mddev->devices+cur;
++ MOD_DEC_USE_COUNT;
+
+- if (size>=data->smallest->size) /* If we completely fill the slot */
+- {
+- data->hash_table[i++].dev1=NULL;
+- size-=data->smallest->size;
+-
+- if (!size)
+- {
+- if (++cur==mddev->nb_dev) continue;
+- size=mddev->devices[cur].size;
+- }
+-
+- continue;
+- }
+-
+- if (++cur==mddev->nb_dev) /* Last dev, set dev1 as NULL */
+- {
+- data->hash_table[i].dev1=NULL;
+- continue;
+- }
+-
+- dev0_size=size; /* Here, we use a 2nd dev to fill the slot */
+- size=mddev->devices[cur].size;
+- data->hash_table[i++].dev1=mddev->devices+cur;
+- size-=(data->smallest->size - dev0_size);
+- }
+-
+- return 0;
+-}
+-
+-static int linear_stop (int minor, struct md_dev *mddev)
+-{
+- struct linear_data *data=(struct linear_data *) mddev->private;
+-
+- kfree (data->hash_table);
+- kfree (data);
+-
+- MOD_DEC_USE_COUNT;
+-
+- return 0;
++ return 0;
+ }
+
+
+-static int linear_map (struct md_dev *mddev, kdev_t *rdev,
++static int linear_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
+ unsigned long *rsector, unsigned long size)
+ {
+- struct linear_data *data=(struct linear_data *) mddev->private;
+- struct linear_hash *hash;
+- struct real_dev *tmp_dev;
+- long block;
+-
+- block=*rsector >> 1;
+- hash=data->hash_table+(block/data->smallest->size);
+-
+- if (block >= (hash->dev0->size + hash->dev0->offset))
+- {
+- if (!hash->dev1)
+- {
+- printk ("linear_map : hash->dev1==NULL for block %ld\n", block);
+- return (-1);
+- }
+-
+- tmp_dev=hash->dev1;
+- }
+- else
+- tmp_dev=hash->dev0;
++ linear_conf_t *conf = mddev_to_conf(mddev);
++ struct linear_hash *hash;
++ dev_info_t *tmp_dev;
++ long block;
++
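++	/* find the 1K block and the hash slot (one per
++	 * conf->smallest->size blocks) it falls into: */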
++ block = *rsector >> 1;
++ hash = conf->hash_table + (block / conf->smallest->size);
++
++ if (block >= (hash->dev0->size + hash->dev0->offset))
++ {
++ if (!hash->dev1)
++ {
++ printk ("linear_map : hash->dev1==NULL for block %ld\n",
++ block);
++ return -1;
++ }
++ tmp_dev = hash->dev1;
++ } else
++ tmp_dev = hash->dev0;
+
+- if (block >= (tmp_dev->size + tmp_dev->offset) || block < tmp_dev->offset)
+- printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
+- block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
++ if (block >= (tmp_dev->size + tmp_dev->offset)
++ || block < tmp_dev->offset)
++ printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
++ block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
+
+- *rdev=tmp_dev->dev;
+- *rsector=(block-(tmp_dev->offset)) << 1;
++ *rdev = tmp_dev->dev;
++ *rsector = (block - tmp_dev->offset) << 1;
+
+- return (0);
++ return 0;
+ }
+
+-static int linear_status (char *page, int minor, struct md_dev *mddev)
++static int linear_status (char *page, mddev_t *mddev)
+ {
+- int sz=0;
++ int sz=0;
+
+ #undef MD_DEBUG
+ #ifdef MD_DEBUG
+- int j;
+- struct linear_data *data=(struct linear_data *) mddev->private;
++ int j;
++ linear_conf_t *conf = mddev_to_conf(mddev);
+
+- sz+=sprintf (page+sz, " ");
+- for (j=0; j<data->nr_zones; j++)
+- {
+- sz+=sprintf (page+sz, "[%s",
+- partition_name (data->hash_table[j].dev0->dev));
+-
+- if (data->hash_table[j].dev1)
+- sz+=sprintf (page+sz, "/%s] ",
+- partition_name(data->hash_table[j].dev1->dev));
+- else
+- sz+=sprintf (page+sz, "] ");
+- }
+-
+- sz+=sprintf (page+sz, "\n");
++ sz += sprintf(page+sz, " ");
++ for (j = 0; j < conf->nr_zones; j++)
++ {
++ sz += sprintf(page+sz, "[%s",
++ partition_name(conf->hash_table[j].dev0->dev));
++
++ if (conf->hash_table[j].dev1)
++ sz += sprintf(page+sz, "/%s] ",
++ partition_name(conf->hash_table[j].dev1->dev));
++ else
++ sz += sprintf(page+sz, "] ");
++ }
++ sz += sprintf(page+sz, "\n");
+ #endif
+- sz+=sprintf (page+sz, " %dk rounding", 1<<FACTOR_SHIFT(FACTOR(mddev)));
+- return sz;
++ sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
++ return sz;
+ }
+
+
+-static struct md_personality linear_personality=
++static mdk_personality_t linear_personality=
+ {
+- "linear",
+- linear_map,
+- NULL,
+- NULL,
+- linear_run,
+- linear_stop,
+- linear_status,
+- NULL, /* no ioctls */
+- 0
++ "linear",
++ linear_map,
++ NULL,
++ NULL,
++ linear_run,
++ linear_stop,
++ linear_status,
++ NULL,
++ 0,
++ NULL,
++ NULL,
++ NULL,
++ NULL
+ };
+
+-
+ #ifndef MODULE
+
+-__initfunc(void linear_init (void))
++md__initfunc(void linear_init (void))
+ {
+- register_md_personality (LINEAR, &linear_personality);
++ register_md_personality (LINEAR, &linear_personality);
+ }
+
+ #else
+
+ int init_module (void)
+ {
+- return (register_md_personality (LINEAR, &linear_personality));
++ return (register_md_personality (LINEAR, &linear_personality));
+ }
+
+ void cleanup_module (void)
+ {
+- unregister_md_personality (LINEAR);
++ unregister_md_personality (LINEAR);
+ }
+
+ #endif
++
+diff -uNr linux-orig/drivers/block/linear.h linux/drivers/block/linear.h
+--- linux-orig/drivers/block/linear.h Fri Nov 22 07:07:23 1996
++++ linux/drivers/block/linear.h Fri Dec 8 22:54:52 2000
+@@ -1,16 +0,0 @@
+-#ifndef _LINEAR_H
+-#define _LINEAR_H
+-
+-struct linear_hash
+-{
+- struct real_dev *dev0, *dev1;
+-};
+-
+-struct linear_data
+-{
+- struct linear_hash *hash_table; /* Dynamically allocated */
+- struct real_dev *smallest;
+- int nr_zones;
+-};
+-
+-#endif
+diff -uNr linux-orig/drivers/block/ll_rw_blk.c linux/drivers/block/ll_rw_blk.c
+--- linux-orig/drivers/block/ll_rw_blk.c Fri Dec 8 22:53:51 2000
++++ linux/drivers/block/ll_rw_blk.c Fri Dec 8 22:54:52 2000
+@@ -23,6 +23,7 @@
+ #include <asm/io.h>
+ #include <asm/uaccess.h>
+ #include <linux/blk.h>
++#include <linux/raid/md.h>
+
+ #include <linux/module.h>
+
+@@ -53,6 +54,11 @@
+ spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
+
+ /*
++ * per-major idle-IO detection: md samples these counters during
++ * resync to check whether the IO subsystem is otherwise idle
++ */
++unsigned long io_events[MAX_BLKDEV] = {0, };
++
++/*
+ * used to wait on when there are no free requests
+ */
+ struct wait_queue * wait_for_request;
+@@ -583,6 +589,8 @@
+ return;
+ /* Maybe the above fixes it, and maybe it doesn't boot. Life is interesting */
+ lock_buffer(bh);
++ if (!buffer_lowprio(bh))
++ io_events[major]++;
+
+ if (blk_size[major]) {
+ unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
+@@ -832,7 +840,7 @@
+ bh[i]->b_rsector=bh[i]->b_blocknr*(bh[i]->b_size >> 9);
+ #ifdef CONFIG_BLK_DEV_MD
+ if (major==MD_MAJOR &&
+- md_map (MINOR(bh[i]->b_dev), &bh[i]->b_rdev,
++ md_map (bh[i]->b_dev, &bh[i]->b_rdev,
+ &bh[i]->b_rsector, bh[i]->b_size >> 9)) {
+ printk (KERN_ERR
+ "Bad md_map in ll_rw_block\n");
+@@ -852,7 +860,7 @@
+ set_bit(BH_Req, &bh[i]->b_state);
+ #ifdef CONFIG_BLK_DEV_MD
+ if (MAJOR(bh[i]->b_dev) == MD_MAJOR) {
+- md_make_request(MINOR (bh[i]->b_dev), rw, bh[i]);
++ md_make_request(bh[i], rw);
+ continue;
+ }
+ #endif
+diff -uNr linux-orig/drivers/block/md.c linux/drivers/block/md.c
+--- linux-orig/drivers/block/md.c Mon Sep 4 10:39:16 2000
++++ linux/drivers/block/md.c Fri Dec 8 22:54:52 2000
+@@ -1,21 +1,17 @@
+-
+ /*
+ md.c : Multiple Devices driver for Linux
+- Copyright (C) 1994-96 Marc ZYNGIER
+- <zyngier@ufr-info-p7.ibp.fr> or
+- <maz@gloups.fdn.fr>
++ Copyright (C) 1998, 1999 Ingo Molnar
+
+- A lot of inspiration came from hd.c ...
++ completely rewritten, based on the MD driver code from Marc Zyngier
+
+- kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+- boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
++ Changes:
+
+- RAID-1/RAID-5 extensions by:
+- Ingo Molnar, Miguel de Icaza, Gadi Oxman
++ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
++ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
++ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
++ - kmod support by: Cyrus Durgin
++ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+
+- Changes for kmod by:
+- Cyrus Durgin
+-
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+@@ -26,807 +22,3007 @@
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+-/*
+- * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
+- * the extra system load does not show up that much. Increase it if your
+- * system can take more.
+- */
+-#define SPEED_LIMIT 1024
++#include <linux/raid/md.h>
++#include <linux/raid/xor.h>
+
+-#include <linux/config.h>
+-#include <linux/module.h>
+-#include <linux/version.h>
+-#include <linux/malloc.h>
+-#include <linux/mm.h>
+-#include <linux/md.h>
+-#include <linux/hdreg.h>
+-#include <linux/stat.h>
+-#include <linux/fs.h>
+-#include <linux/proc_fs.h>
+-#include <linux/blkdev.h>
+-#include <linux/genhd.h>
+-#include <linux/smp_lock.h>
+ #ifdef CONFIG_KMOD
+ #include <linux/kmod.h>
+ #endif
+-#include <linux/errno.h>
+-#include <linux/init.h>
+
+ #define __KERNEL_SYSCALLS__
+ #include <linux/unistd.h>
+
++#include <asm/unaligned.h>
++
++extern asmlinkage int sys_sched_yield(void);
++extern asmlinkage int sys_setsid(void);
++
++extern unsigned long io_events[MAX_BLKDEV];
++
+ #define MAJOR_NR MD_MAJOR
+ #define MD_DRIVER
+
+ #include <linux/blk.h>
+-#include <asm/uaccess.h>
+-#include <asm/bitops.h>
+-#include <asm/atomic.h>
+
+ #ifdef CONFIG_MD_BOOT
+-extern kdev_t name_to_kdev_t(char *line) __init;
++extern kdev_t name_to_kdev_t(char *line) md__init;
+ #endif
+
+-static struct hd_struct md_hd_struct[MAX_MD_DEV];
+-static int md_blocksizes[MAX_MD_DEV];
+-int md_maxreadahead[MAX_MD_DEV];
+-#if SUPPORT_RECONSTRUCTION
+-static struct md_thread *md_sync_thread = NULL;
+-#endif /* SUPPORT_RECONSTRUCTION */
++static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, };
++
++/*
++ * these have to be allocated separately because external
++ * subsystems want to have a pre-defined structure
++ */
++struct hd_struct md_hd_struct[MAX_MD_DEVS];
++static int md_blocksizes[MAX_MD_DEVS];
++static int md_maxreadahead[MAX_MD_DEVS];
++static mdk_thread_t *md_recovery_thread = NULL;
+
+-int md_size[MAX_MD_DEV]={0, };
++int md_size[MAX_MD_DEVS] = {0, };
+
+ static void md_geninit (struct gendisk *);
+
+ static struct gendisk md_gendisk=
+ {
+- MD_MAJOR,
+- "md",
+- 0,
+- 1,
+- MAX_MD_DEV,
+- md_geninit,
+- md_hd_struct,
+- md_size,
+- MAX_MD_DEV,
+- NULL,
+- NULL
++ MD_MAJOR,
++ "md",
++ 0,
++ 1,
++ MAX_MD_DEVS,
++ md_geninit,
++ md_hd_struct,
++ md_size,
++ MAX_MD_DEVS,
++ NULL,
++ NULL
+ };
+
+-static struct md_personality *pers[MAX_PERSONALITY]={NULL, };
+-struct md_dev md_dev[MAX_MD_DEV];
+-
+-int md_thread(void * arg);
++/*
++ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
++ * is 100 KB/sec, so the extra system load does not show up that much.
++ * Increase it if you want to have more _guaranteed_ speed. Note that
++ * the RAID driver will use the maximum available bandwidth if the IO
++ * subsystem is idle.
++ *
++ * you can change it via /proc/sys/dev/md/speed-limit
++ */
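++
++/* (for example, "echo 200 >/proc/sys/dev/md/speed-limit" raises the
++ * guaranteed resync speed to 200 KB/sec) */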
+
+-static struct gendisk *find_gendisk (kdev_t dev)
+-{
+- struct gendisk *tmp=gendisk_head;
++static int sysctl_speed_limit = 100;
+
+- while (tmp != NULL)
+- {
+- if (tmp->major==MAJOR(dev))
+- return (tmp);
+-
+- tmp=tmp->next;
+- }
++static struct ctl_table_header *md_table_header;
+
+- return (NULL);
+-}
++static ctl_table md_table[] = {
++ {DEV_MD_SPEED_LIMIT, "speed-limit",
++ &sysctl_speed_limit, sizeof(int), 0644, NULL, &proc_dointvec},
++ {0}
++};
+
+-char *partition_name (kdev_t dev)
+-{
+- static char name[40]; /* This should be long
+- enough for a device name ! */
+- struct gendisk *hd = find_gendisk (dev);
++static ctl_table md_dir_table[] = {
++ {DEV_MD, "md", NULL, 0, 0555, md_table},
++ {0}
++};
+
+- if (!hd)
+- {
+- sprintf (name, "[dev %s]", kdevname(dev));
+- return (name);
+- }
++static ctl_table md_root_table[] = {
++ {CTL_DEV, "dev", NULL, 0, 0555, md_dir_table},
++ {0}
++};
+
+- return disk_name (hd, MINOR(dev), name); /* routine in genhd.c */
++static void md_register_sysctl(void)
++{
++ md_table_header = register_sysctl_table(md_root_table, 1);
+ }
+
+-static int legacy_raid_sb (int minor, int pnum)
++void md_unregister_sysctl(void)
+ {
+- int i, factor;
++ unregister_sysctl_table(md_table_header);
++}
++
++/*
++ * The mapping between kdev and mddev is not necessarily a simple
++ * one! E.g. HSM uses several sub-devices to implement Logical
++ * Volumes. All these sub-devices map to the same mddev.
++ */
++dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, };
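++
++/*
++ * (E.g. an HSM volume group driven through one md minor can export
++ * its Logical Volumes as further md minors; all of them map back to
++ * the same mddev, with ->data pointing at the per-LV structure.)
++ */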
+
+- factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
++void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
++{
++ unsigned int minor = MINOR(dev);
+
+- /*****
+- * do size and offset calculations.
+- */
+- for (i=0; i<md_dev[minor].nb_dev; i++) {
+- md_dev[minor].devices[i].size &= ~(factor - 1);
+- md_size[minor] += md_dev[minor].devices[i].size;
+- md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset +
+- md_dev[minor].devices[i-1].size) : 0;
++ if (MAJOR(dev) != MD_MAJOR) {
++ MD_BUG();
++ return;
+ }
+- if (pnum == RAID0 >> PERSONALITY_SHIFT)
+- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev;
+- return 0;
++ if (mddev_map[minor].mddev != NULL) {
++ MD_BUG();
++ return;
++ }
++ mddev_map[minor].mddev = mddev;
++ mddev_map[minor].data = data;
+ }
+
+-static void free_sb (struct md_dev *mddev)
++void del_mddev_mapping (mddev_t * mddev, kdev_t dev)
+ {
+- int i;
+- struct real_dev *realdev;
++ unsigned int minor = MINOR(dev);
+
+- if (mddev->sb) {
+- free_page((unsigned long) mddev->sb);
+- mddev->sb = NULL;
++ if (MAJOR(dev) != MD_MAJOR) {
++ MD_BUG();
++ return;
+ }
+- for (i = 0; i <mddev->nb_dev; i++) {
+- realdev = mddev->devices + i;
+- if (realdev->sb) {
+- free_page((unsigned long) realdev->sb);
+- realdev->sb = NULL;
+- }
++ if (mddev_map[minor].mddev != mddev) {
++ MD_BUG();
++ return;
+ }
++ mddev_map[minor].mddev = NULL;
++ mddev_map[minor].data = NULL;
+ }
+
+ /*
+- * Check one RAID superblock for generic plausibility
++ * Lets us iterate over all existing md arrays
+ */
++static MD_LIST_HEAD(all_mddevs);
+
+-#define BAD_MAGIC KERN_ERR \
+-"md: %s: invalid raid superblock magic (%x) on block %u\n"
++static mddev_t * alloc_mddev (kdev_t dev)
++{
++ mddev_t * mddev;
+
+-#define OUT_OF_MEM KERN_ALERT \
+-"md: out of memory.\n"
++ if (MAJOR(dev) != MD_MAJOR) {
++ MD_BUG();
++ return 0;
++ }
++ mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
++ if (!mddev)
++ return NULL;
++
++ memset(mddev, 0, sizeof(*mddev));
+
+-#define NO_DEVICE KERN_ERR \
+-"md: disabled device %s\n"
++ mddev->__minor = MINOR(dev);
++ mddev->reconfig_sem = MUTEX;
++ mddev->recovery_sem = MUTEX;
++ mddev->resync_sem = MUTEX;
++ MD_INIT_LIST_HEAD(&mddev->disks);
++ /*
++ * The 'base' mddev is the one with data NULL.
++	 * Personalities can create additional mddevs
++ * if necessary.
++ */
++ add_mddev_mapping(mddev, dev, 0);
++ md_list_add(&mddev->all_mddevs, &all_mddevs);
+
+-#define SUCCESS 0
+-#define FAILURE -1
++ return mddev;
++}
+
+-static int analyze_one_sb (struct real_dev * rdev)
++static void free_mddev (mddev_t *mddev)
+ {
+- int ret = FAILURE;
+- struct buffer_head *bh;
+- kdev_t dev = rdev->dev;
+- md_superblock_t *sb;
++ if (!mddev) {
++ MD_BUG();
++ return;
++ }
+
+ /*
+- * Read the superblock, it's at the end of the disk
++ * Make sure nobody else is using this mddev
++ * (careful, we rely on the global kernel lock here)
+ */
+- rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]);
+- set_blocksize (dev, MD_SB_BYTES);
+- bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
+-
+- if (bh) {
+- sb = (md_superblock_t *) bh->b_data;
+- if (sb->md_magic != MD_SB_MAGIC) {
+- printk (BAD_MAGIC, kdevname(dev),
+- sb->md_magic, rdev->sb_offset);
+- goto abort;
+- }
+- rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
+- if (!rdev->sb) {
+- printk (OUT_OF_MEM);
+- goto abort;
+- }
+- memcpy (rdev->sb, bh->b_data, MD_SB_BYTES);
++ while (md_atomic_read(&mddev->resync_sem.count) != 1)
++ schedule();
++ while (md_atomic_read(&mddev->recovery_sem.count) != 1)
++ schedule();
+
+- rdev->size = sb->size;
+- } else
+- printk (NO_DEVICE,kdevname(rdev->dev));
+- ret = SUCCESS;
+-abort:
+- if (bh)
+- brelse (bh);
+- return ret;
++ del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
++ md_list_del(&mddev->all_mddevs);
++ MD_INIT_LIST_HEAD(&mddev->all_mddevs);
++ kfree(mddev);
+ }
+
+-#undef SUCCESS
+-#undef FAILURE
+-
+-#undef BAD_MAGIC
+-#undef OUT_OF_MEM
+-#undef NO_DEVICE
+
+-/*
+- * Check a full RAID array for plausibility
+- */
++struct gendisk * find_gendisk (kdev_t dev)
++{
++ struct gendisk *tmp = gendisk_head;
+
+-#define INCONSISTENT KERN_ERR \
+-"md: superblock inconsistency -- run ckraid\n"
++ while (tmp != NULL) {
++ if (tmp->major == MAJOR(dev))
++ return (tmp);
++ tmp = tmp->next;
++ }
++ return (NULL);
++}
+
+-#define OUT_OF_DATE KERN_ERR \
+-"md: superblock update time inconsistenty -- using the most recent one\n"
++mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
++{
++ mdk_rdev_t * rdev;
++ struct md_list_head *tmp;
+
+-#define OLD_VERSION KERN_ALERT \
+-"md: %s: unsupported raid array version %d.%d.%d\n"
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->desc_nr == nr)
++ return rdev;
++ }
++ return NULL;
++}
+
+-#define NOT_CLEAN KERN_ERR \
+-"md: %s: raid array is not clean -- run ckraid\n"
++mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
++{
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
+
+-#define NOT_CLEAN_IGNORE KERN_ERR \
+-"md: %s: raid array is not clean -- reconstructing parity\n"
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->dev == dev)
++ return rdev;
++ }
++ return NULL;
++}
+
+-#define UNKNOWN_LEVEL KERN_ERR \
+-"md: %s: unsupported raid level %d\n"
++static MD_LIST_HEAD(device_names);
+
+-static int analyze_sbs (int minor, int pnum)
++char * partition_name (kdev_t dev)
+ {
+- struct md_dev *mddev = md_dev + minor;
+- int i, N = mddev->nb_dev, out_of_date = 0;
+- struct real_dev * disks = mddev->devices;
+- md_superblock_t *sb, *freshest = NULL;
++ struct gendisk *hd;
++ static char nomem [] = "<nomem>";
++ dev_name_t *dname;
++ struct md_list_head *tmp = device_names.next;
+
+- /*
+- * RAID-0 and linear don't use a RAID superblock
+- */
+- if (pnum == RAID0 >> PERSONALITY_SHIFT ||
+- pnum == LINEAR >> PERSONALITY_SHIFT)
+- return legacy_raid_sb (minor, pnum);
++ while (tmp != &device_names) {
++ dname = md_list_entry(tmp, dev_name_t, list);
++ if (dname->dev == dev)
++ return dname->name;
++ tmp = tmp->next;
++ }
++
++ dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
+
++ if (!dname)
++ return nomem;
+ /*
+- * Verify the RAID superblock on each real device
++ * ok, add this new device name to the list
+ */
+- for (i = 0; i < N; i++)
+- if (analyze_one_sb(disks+i))
+- goto abort;
++ hd = find_gendisk (dev);
++
++ if (!hd)
++ sprintf (dname->name, "[dev %s]", kdevname(dev));
++ else
++ disk_name (hd, MINOR(dev), dname->name);
++
++ dname->dev = dev;
++ md_list_add(&dname->list, &device_names);
++
++ return dname->name;
++}
++
++static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
++ int persistent)
++{
++ unsigned int size = 0;
++
++ if (blk_size[MAJOR(dev)])
++ size = blk_size[MAJOR(dev)][MINOR(dev)];
++ if (persistent)
++ size = MD_NEW_SIZE_BLOCKS(size);
++ return size;
++}
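++
++/*
++ * (A persistent superblock lives near the end of the device, so
++ * MD_NEW_SIZE_BLOCKS rounds the raw size down to keep the last
++ * blocks reserved for it.)
++ */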
++
++static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
++{
++ unsigned int size;
++
++ size = calc_dev_sboffset(dev, mddev, persistent);
++ if (!mddev->sb) {
++ MD_BUG();
++ return size;
++ }
++ if (mddev->sb->chunk_size)
++ size &= ~(mddev->sb->chunk_size/1024 - 1);
++ return size;
++}
++
++/*
++ * We check whether all devices are numbered from 0 to nb_dev-1. The
++ * order is guaranteed even after device name changes.
++ *
++ * Some personalities (raid0, linear) use this. Personalities that
++ * provide data have to be able to deal with loss of individual
++ * disks, so they do their checking themselves.
++ */
++int md_check_ordering (mddev_t *mddev)
++{
++ int i, c;
++ mdk_rdev_t *rdev;
++ struct md_list_head *tmp;
+
+ /*
+- * The superblock constant part has to be the same
+- * for all disks in the array.
++ * First, all devices must be fully functional
+ */
+- sb = NULL;
+- for (i = 0; i < N; i++) {
+- if (!disks[i].sb)
+- continue;
+- if (!sb) {
+- sb = disks[i].sb;
+- continue;
+- }
+- if (memcmp(sb,
+- disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) {
+- printk (INCONSISTENT);
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->faulty) {
++ printk("md: md%d's device %s faulty, aborting.\n",
++ mdidx(mddev), partition_name(rdev->dev));
+ goto abort;
+ }
+ }
+
+- /*
+- * OK, we have all disks and the array is ready to run. Let's
+- * find the freshest superblock, that one will be the superblock
+- * that represents the whole array.
+- */
+- if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL)
++ c = 0;
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ c++;
++ }
++ if (c != mddev->nb_dev) {
++ MD_BUG();
+ goto abort;
+- freshest = NULL;
+- for (i = 0; i < N; i++) {
+- if (!disks[i].sb)
+- continue;
+- if (!freshest) {
+- freshest = disks[i].sb;
+- continue;
+- }
+- /*
+- * Find the newest superblock version
+- */
+- if (disks[i].sb->utime != freshest->utime) {
+- out_of_date = 1;
+- if (disks[i].sb->utime > freshest->utime)
+- freshest = disks[i].sb;
+- }
+ }
+- if (out_of_date)
+- printk(OUT_OF_DATE);
+- memcpy (sb, freshest, sizeof(*freshest));
+-
+- /*
+- * Check if we can support this RAID array
+- */
+- if (sb->major_version != MD_MAJOR_VERSION ||
+- sb->minor_version > MD_MINOR_VERSION) {
+-
+- printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)),
+- sb->major_version, sb->minor_version,
+- sb->patch_version);
++ if (mddev->nb_dev != mddev->sb->raid_disks) {
++ printk("md: md%d, array needs %d disks, has %d, aborting.\n",
++ mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
+ goto abort;
+ }
+-
+ /*
+- * We need to add this as a superblock option.
++ * Now the numbering check
+ */
+-#if SUPPORT_RECONSTRUCTION
+- if (sb->state != (1 << MD_SB_CLEAN)) {
+- if (sb->level == 1) {
+- printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
++ for (i = 0; i < mddev->nb_dev; i++) {
++ c = 0;
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->desc_nr == i)
++ c++;
++ }
++ if (c == 0) {
++ printk("md: md%d, missing disk #%d, aborting.\n",
++ mdidx(mddev), i);
+ goto abort;
+- } else
+- printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor)));
+- }
+-#else
+- if (sb->state != (1 << MD_SB_CLEAN)) {
+- printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
+- goto abort;
+- }
+-#endif /* SUPPORT_RECONSTRUCTION */
+-
+- switch (sb->level) {
+- case 1:
+- md_size[minor] = sb->size;
+- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD;
+- break;
+- case 4:
+- case 5:
+- md_size[minor] = sb->size * (sb->raid_disks - 1);
+- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1);
+- break;
+- default:
+- printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)),
+- sb->level);
++ }
++ if (c > 1) {
++ printk("md: md%d, too many disks #%d, aborting.\n",
++ mdidx(mddev), i);
+ goto abort;
++ }
+ }
+ return 0;
+ abort:
+- free_sb(mddev);
+ return 1;
+ }
+
+-#undef INCONSISTENT
+-#undef OUT_OF_DATE
+-#undef OLD_VERSION
+-#undef NOT_CLEAN
+-#undef OLD_LEVEL
+-
+-int md_update_sb(int minor)
++static unsigned int zoned_raid_size (mddev_t *mddev)
+ {
+- struct md_dev *mddev = md_dev + minor;
+- struct buffer_head *bh;
+- md_superblock_t *sb = mddev->sb;
+- struct real_dev *realdev;
+- kdev_t dev;
+- int i;
+- u32 sb_offset;
++ unsigned int mask;
++ mdk_rdev_t * rdev;
++ struct md_list_head *tmp;
+
+- sb->utime = CURRENT_TIME;
+- for (i = 0; i < mddev->nb_dev; i++) {
+- realdev = mddev->devices + i;
+- if (!realdev->sb)
+- continue;
+- dev = realdev->dev;
+- sb_offset = realdev->sb_offset;
+- set_blocksize(dev, MD_SB_BYTES);
+- printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset);
+- bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
+- if (bh) {
+- sb = (md_superblock_t *) bh->b_data;
+- memcpy(sb, mddev->sb, MD_SB_BYTES);
+- memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4);
+- mark_buffer_uptodate(bh, 1);
+- mark_buffer_dirty(bh, 1);
+- ll_rw_block(WRITE, 1, &bh);
+- wait_on_buffer(bh);
+- bforget(bh);
+- fsync_dev(dev);
+- invalidate_buffers(dev);
+- } else
+- printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
++ if (!mddev->sb) {
++ MD_BUG();
++ return -EINVAL;
++ }
++ /*
++ * do size and offset calculations.
++ */
++ mask = ~(mddev->sb->chunk_size/1024 - 1);
++printk("mask %08x\n", mask);
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++printk(" rdev->size: %d\n", rdev->size);
++ rdev->size &= mask;
++printk(" masked rdev->size: %d\n", rdev->size);
++ md_size[mdidx(mddev)] += rdev->size;
++printk(" new md_size: %d\n", md_size[mdidx(mddev)]);
+ }
+ return 0;
+ }
+
+-static int do_md_run (int minor, int repart)
++static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
+ {
+- int pnum, i, min, factor, err;
++ if (disk_active(disk)) {
++ sb->working_disks--;
++ } else {
++ if (disk_spare(disk)) {
++ sb->spare_disks--;
++ sb->working_disks--;
++ } else {
++ sb->failed_disks--;
++ }
++ }
++ sb->nr_disks--;
++ disk->major = 0;
++ disk->minor = 0;
++ mark_disk_removed(disk);
++}
+
+- if (!md_dev[minor].nb_dev)
+- return -EINVAL;
+-
+- if (md_dev[minor].pers)
+- return -EBUSY;
++#define BAD_MAGIC KERN_ERR \
++"md: invalid raid superblock magic on %s\n"
+
+- md_dev[minor].repartition=repart;
+-
+- if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT))
+- >= MAX_PERSONALITY)
+- return -EINVAL;
+-
+- /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
+- if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){
+- for (i = 0; i < md_dev [minor].nb_dev; i++)
+- if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR)
+- return -EINVAL;
+- }
+- if (!pers[pnum])
+- {
+-#ifdef CONFIG_KMOD
+- char module_name[80];
+- sprintf (module_name, "md-personality-%d", pnum);
+- request_module (module_name);
+- if (!pers[pnum])
+-#endif
+- return -EINVAL;
+- }
+-
+- factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
+-
+- for (i=0; i<md_dev[minor].nb_dev; i++)
+- if (md_dev[minor].devices[i].size<min)
+- {
+- printk ("Dev %s smaller than %dk, cannot shrink\n",
+- partition_name (md_dev[minor].devices[i].dev), min);
+- return -EINVAL;
+- }
+-
+- for (i=0; i<md_dev[minor].nb_dev; i++) {
+- fsync_dev(md_dev[minor].devices[i].dev);
+- invalidate_buffers(md_dev[minor].devices[i].dev);
+- }
+-
+- /* Resize devices according to the factor. It is used to align
+- partitions size on a given chunk size. */
+- md_size[minor]=0;
+-
+- /*
+- * Analyze the raid superblock
+- */
+- if (analyze_sbs(minor, pnum))
+- return -EINVAL;
++#define BAD_MINOR KERN_ERR \
++"md: %s: invalid raid minor (%x)\n"
+
+- md_dev[minor].pers=pers[pnum];
+-
+- if ((err=md_dev[minor].pers->run (minor, md_dev+minor)))
+- {
+- md_dev[minor].pers=NULL;
+- free_sb(md_dev + minor);
+- return (err);
+- }
+-
+- if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT)
+- {
+- md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN);
+- md_update_sb(minor);
+- }
+-
+- /* FIXME : We assume here we have blocks
+- that are twice as large as sectors.
+- THIS MAY NOT BE TRUE !!! */
+- md_hd_struct[minor].start_sect=0;
+- md_hd_struct[minor].nr_sects=md_size[minor]<<1;
+-
+- read_ahead[MD_MAJOR] = 128;
+- return (0);
+-}
++#define OUT_OF_MEM KERN_ALERT \
++"md: out of memory.\n"
++
++#define NO_SB KERN_ERR \
++"md: disabled device %s, could not read superblock.\n"
+
+-static int do_md_stop (int minor, struct inode *inode)
++#define BAD_CSUM KERN_WARNING \
++"md: invalid superblock checksum on %s\n"
++
++static int alloc_array_sb (mddev_t * mddev)
+ {
+- int i;
+-
+- if (inode->i_count>1 || md_dev[minor].busy>1) {
+- /*
+- * ioctl : one open channel
+- */
+- printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
+- minor, inode->i_count, md_dev[minor].busy);
+- return -EBUSY;
+- }
+-
+- if (md_dev[minor].pers) {
+- /*
+- * It is safe to call stop here, it only frees private
+- * data. Also, it tells us if a device is unstoppable
+- * (eg. resyncing is in progress)
+- */
+- if (md_dev[minor].pers->stop (minor, md_dev+minor))
+- return -EBUSY;
+- /*
+- * The device won't exist anymore -> flush it now
+- */
+- fsync_dev (inode->i_rdev);
+- invalidate_buffers (inode->i_rdev);
+- if (md_dev[minor].sb) {
+- md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
+- md_update_sb(minor);
+- }
++ if (mddev->sb) {
++ MD_BUG();
++ return 0;
+ }
+-
+- /* Remove locks. */
+- if (md_dev[minor].sb)
+- free_sb(md_dev + minor);
+- for (i=0; i<md_dev[minor].nb_dev; i++)
+- clear_inode (md_dev[minor].devices[i].inode);
+-
+- md_dev[minor].nb_dev=md_size[minor]=0;
+- md_hd_struct[minor].nr_sects=0;
+- md_dev[minor].pers=NULL;
+-
+- read_ahead[MD_MAJOR] = 128;
+-
+- return (0);
++
++ mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
++ if (!mddev->sb)
++ return -ENOMEM;
++ md_clear_page((unsigned long)mddev->sb);
++ return 0;
+ }
+
+-static int do_md_add (int minor, kdev_t dev)
++static int alloc_disk_sb (mdk_rdev_t * rdev)
+ {
+- int i;
+- int hot_add=0;
+- struct real_dev *realdev;
++ if (rdev->sb)
++ MD_BUG();
+
+- if (md_dev[minor].nb_dev==MAX_REAL)
++ rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
++ if (!rdev->sb) {
++ printk (OUT_OF_MEM);
+ return -EINVAL;
++ }
++ md_clear_page((unsigned long)rdev->sb);
+
+- if (!fs_may_mount (dev))
+- return -EBUSY;
++ return 0;
++}
+
+- if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) {
+- printk("md_add(): zero device size, huh, bailing out.\n");
+- return -EINVAL;
++static void free_disk_sb (mdk_rdev_t * rdev)
++{
++ if (rdev->sb) {
++ free_page((unsigned long) rdev->sb);
++ rdev->sb = NULL;
++ rdev->sb_offset = 0;
++ rdev->size = 0;
++ } else {
++ if (!rdev->faulty)
++ MD_BUG();
+ }
++}
+
+- if (md_dev[minor].pers) {
+- /*
+- * The array is already running, hot-add the drive, or
+- * bail out:
+- */
+- if (!md_dev[minor].pers->hot_add_disk)
+- return -EBUSY;
+- else
+- hot_add=1;
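++/*
++ * A faulty rdev never keeps an in-memory superblock copy around,
++ * so drop it here. Interrupts are disabled because disk failures
++ * can be noticed from interrupt context.
++ */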
++static void mark_rdev_faulty (mdk_rdev_t * rdev)
++{
++ unsigned long flags;
++
++ if (!rdev) {
++ MD_BUG();
++ return;
+ }
++ save_flags(flags);
++ cli();
++ free_disk_sb(rdev);
++ rdev->faulty = 1;
++ restore_flags(flags);
++}
++
++static int read_disk_sb (mdk_rdev_t * rdev)
++{
++ int ret = -EINVAL;
++ struct buffer_head *bh = NULL;
++ kdev_t dev = rdev->dev;
++ mdp_super_t *sb;
++ u32 sb_offset;
+
++ if (!rdev->sb) {
++ MD_BUG();
++ goto abort;
++ }
++
+ /*
+- * Careful. We cannot increase nb_dev for a running array.
++ * Calculate the position of the superblock,
++ * it's at the end of the disk
+ */
+- i=md_dev[minor].nb_dev;
+- realdev = &md_dev[minor].devices[i];
+- realdev->dev=dev;
+-
+- /* Lock the device by inserting a dummy inode. This doesn't
+- smell very good, but I need to be consistent with the
+- mount stuff, specially with fs_may_mount. If someone have
+- a better idea, please help ! */
+-
+- realdev->inode=get_empty_inode ();
+- realdev->inode->i_dev=dev; /* don't care about other fields */
+- insert_inode_hash (realdev->inode);
+-
+- /* Sizes are now rounded at run time */
+-
+-/* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/
+-
+- realdev->size=blk_size[MAJOR(dev)][MINOR(dev)];
++ sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
++ rdev->sb_offset = sb_offset;
++ printk("(read) %s's sb offset: %d", partition_name(dev),
++ sb_offset);
++ fsync_dev(dev);
++ set_blocksize (dev, MD_SB_BYTES);
++ bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
+
+- if (hot_add) {
++ if (bh) {
++ sb = (mdp_super_t *) bh->b_data;
++ memcpy (rdev->sb, sb, MD_SB_BYTES);
++ } else {
++ printk (NO_SB,partition_name(rdev->dev));
++ goto abort;
++ }
++ printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events));
++ ret = 0;
++abort:
++ if (bh)
++ brelse (bh);
++ return ret;
++}
++
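++/*
++ * Checksum the whole superblock with the csum field itself zeroed;
++ * the on-disk value is restored before returning, so the superblock
++ * is left unmodified.
++ */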
++static unsigned int calc_sb_csum (mdp_super_t * sb)
++{
++ unsigned int disk_csum, csum;
++
++ disk_csum = sb->sb_csum;
++ sb->sb_csum = 0;
++ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
++ sb->sb_csum = disk_csum;
++ return csum;
++}
++
++/*
++ * Check one RAID superblock for generic plausibility
++ */
++
++static int check_disk_sb (mdk_rdev_t * rdev)
++{
++ mdp_super_t *sb;
++ int ret = -EINVAL;
++
++ sb = rdev->sb;
++ if (!sb) {
++ MD_BUG();
++ goto abort;
++ }
++
++ if (sb->md_magic != MD_SB_MAGIC) {
++ printk (BAD_MAGIC, partition_name(rdev->dev));
++ goto abort;
++ }
++
++ if (sb->md_minor >= MAX_MD_DEVS) {
++ printk (BAD_MINOR, partition_name(rdev->dev),
++ sb->md_minor);
++ goto abort;
++ }
++
++ if (calc_sb_csum(sb) != sb->sb_csum)
++ printk(BAD_CSUM, partition_name(rdev->dev));
++ ret = 0;
++abort:
++ return ret;
++}
++
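++/*
++ * Mask off the partition bits of a device number, so that two
++ * partitions of the same physical disk compare equal.
++ */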
++static kdev_t dev_unit(kdev_t dev)
++{
++ unsigned int mask;
++ struct gendisk *hd = find_gendisk(dev);
++
++ if (!hd)
++ return 0;
++ mask = ~((1 << hd->minor_shift) - 1);
++
++ return MKDEV(MAJOR(dev), MINOR(dev) & mask);
++}
++
++static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
++{
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
++
++ ITERATE_RDEV(mddev,rdev,tmp)
++ if (dev_unit(rdev->dev) == dev_unit(dev))
++ return rdev;
++
++ return NULL;
++}
++
++static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
++{
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
++
++ ITERATE_RDEV(mddev1,rdev,tmp)
++ if (match_dev_unit(mddev2, rdev->dev))
++ return 1;
++
++ return 0;
++}
++
++static MD_LIST_HEAD(all_raid_disks);
++static MD_LIST_HEAD(pending_raid_disks);
++
++static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
++{
++ mdk_rdev_t *same_pdev;
++
++ if (rdev->mddev) {
++ MD_BUG();
++ return;
++ }
++ same_pdev = match_dev_unit(mddev, rdev->dev);
++ if (same_pdev)
++ printk( KERN_WARNING
++"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
++" protection against single-disk failure might be compromised.\n",
++ mdidx(mddev), partition_name(rdev->dev),
++ partition_name(same_pdev->dev));
++
++ md_list_add(&rdev->same_set, &mddev->disks);
++ rdev->mddev = mddev;
++ mddev->nb_dev++;
++ printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
++}
++
++static void unbind_rdev_from_array (mdk_rdev_t * rdev)
++{
++ if (!rdev->mddev) {
++ MD_BUG();
++ return;
++ }
++ md_list_del(&rdev->same_set);
++ MD_INIT_LIST_HEAD(&rdev->same_set);
++ rdev->mddev->nb_dev--;
++ printk("unbind<%s,%d>\n", partition_name(rdev->dev),
++ rdev->mddev->nb_dev);
++ rdev->mddev = NULL;
++}
++
++/*
++ * prevent the device from being mounted, repartitioned or
++ * otherwise reused by a RAID array (or any other kernel
++ * subsystem), by opening the device. [simply getting an
++ * inode is not enough, the SCSI module usage code needs
++ * an explicit open() on the device]
++ */
++static int lock_rdev (mdk_rdev_t *rdev)
++{
++ int err = 0;
++
++ /*
++ * First insert a dummy inode.
++ */
++ if (rdev->inode)
++ MD_BUG();
++ rdev->inode = get_empty_inode();
++ /*
++	 * we don't care about any other fields
++ */
++ rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev;
++ insert_inode_hash(rdev->inode);
++
++ memset(&rdev->filp, 0, sizeof(rdev->filp));
++ rdev->filp.f_mode = 3; /* read write */
++ err = blkdev_open(rdev->inode, &rdev->filp);
++ if (err) {
++ printk("blkdev_open() failed: %d\n", err);
++ clear_inode(rdev->inode);
++ rdev->inode = NULL;
++ }
++ return err;
++}
++
++static void unlock_rdev (mdk_rdev_t *rdev)
++{
++ blkdev_release(rdev->inode);
++ if (!rdev->inode)
++ MD_BUG();
++ clear_inode(rdev->inode);
++ rdev->inode = NULL;
++}
++
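++/*
++ * Fully release an rdev: close the device, free the superblock
++ * copy and unhook the rdev from the global and pending lists.
++ */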
++static void export_rdev (mdk_rdev_t * rdev)
++{
++ printk("export_rdev(%s)\n",partition_name(rdev->dev));
++ if (rdev->mddev)
++ MD_BUG();
++ unlock_rdev(rdev);
++ free_disk_sb(rdev);
++ md_list_del(&rdev->all);
++ MD_INIT_LIST_HEAD(&rdev->all);
++ if (rdev->pending.next != &rdev->pending) {
++ printk("(%s was pending)\n",partition_name(rdev->dev));
++ md_list_del(&rdev->pending);
++ MD_INIT_LIST_HEAD(&rdev->pending);
++ }
++ rdev->dev = 0;
++ rdev->faulty = 0;
++ kfree(rdev);
++}
++
++static void kick_rdev_from_array (mdk_rdev_t * rdev)
++{
++ unbind_rdev_from_array(rdev);
++ export_rdev(rdev);
++}
++
++static void export_array (mddev_t *mddev)
++{
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
++ mdp_super_t *sb = mddev->sb;
++
++ if (mddev->sb) {
++ mddev->sb = NULL;
++ free_page((unsigned long) sb);
++ }
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (!rdev->mddev) {
++ MD_BUG();
++ continue;
++ }
++ kick_rdev_from_array(rdev);
++ }
++ if (mddev->nb_dev)
++ MD_BUG();
++}
++
++#undef BAD_CSUM
++#undef BAD_MAGIC
++#undef OUT_OF_MEM
++#undef NO_SB
++
++static void print_desc(mdp_disk_t *desc)
++{
++ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
++ partition_name(MKDEV(desc->major,desc->minor)),
++ desc->major,desc->minor,desc->raid_disk,desc->state);
++}
++
++static void print_sb(mdp_super_t *sb)
++{
++ int i;
++
++ printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
++ sb->major_version, sb->minor_version, sb->patch_version,
++ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
++ sb->ctime);
++ printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
++ sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
++ sb->layout, sb->chunk_size);
++ printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
++ sb->utime, sb->state, sb->active_disks, sb->working_disks,
++ sb->failed_disks, sb->spare_disks,
++ sb->sb_csum, (unsigned long)get_unaligned(&sb->events));
++
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ mdp_disk_t *desc;
++
++ desc = sb->disks + i;
++ printk(" D %2d: ", i);
++ print_desc(desc);
++ }
++ printk(" THIS: ");
++ print_desc(&sb->this_disk);
++
++}
++
++static void print_rdev(mdk_rdev_t *rdev)
++{
++ printk(" rdev %s: O:%s, SZ:%08d F:%d DN:%d ",
++ partition_name(rdev->dev), partition_name(rdev->old_dev),
++ rdev->size, rdev->faulty, rdev->desc_nr);
++ if (rdev->sb) {
++ printk("rdev superblock:\n");
++ print_sb(rdev->sb);
++ } else
++ printk("no rdev superblock!\n");
++}
++
++void md_print_devices (void)
++{
++ struct md_list_head *tmp, *tmp2;
++ mdk_rdev_t *rdev;
++ mddev_t *mddev;
++
++ printk("\n");
++ printk(" **********************************\n");
++ printk(" * <COMPLETE RAID STATE PRINTOUT> *\n");
++ printk(" **********************************\n");
++ ITERATE_MDDEV(mddev,tmp) {
++ printk("md%d: ", mdidx(mddev));
++
++ ITERATE_RDEV(mddev,rdev,tmp2)
++ printk("<%s>", partition_name(rdev->dev));
++
++ if (mddev->sb) {
++ printk(" array superblock:\n");
++ print_sb(mddev->sb);
++ } else
++ printk(" no array superblock.\n");
++
++ ITERATE_RDEV(mddev,rdev,tmp2)
++ print_rdev(rdev);
++ }
++ printk(" **********************************\n");
++ printk("\n");
++}
++
++static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
++{
++ int ret;
++ mdp_super_t *tmp1, *tmp2;
++
++ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
++ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
++
++ if (!tmp1 || !tmp2) {
++ ret = 0;
++ goto abort;
++ }
++
++ *tmp1 = *sb1;
++ *tmp2 = *sb2;
++
++ /*
++ * nr_disks is not constant
++ */
++ tmp1->nr_disks = 0;
++ tmp2->nr_disks = 0;
++
++ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
++ ret = 0;
++ else
++ ret = 1;
++
++abort:
++ if (tmp1)
++ kfree(tmp1);
++ if (tmp2)
++ kfree(tmp2);
++
++ return ret;
++}
++
++static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
++{
++ if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
++ (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
++ (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
++ (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
++
++ return 1;
++
++ return 0;
++}
++
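++/*
++ * Look up a device in the global list of all imported rdevs
++ * (not just the members of one array).
++ */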
++static mdk_rdev_t * find_rdev_all (kdev_t dev)
++{
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
++
++ tmp = all_raid_disks.next;
++ while (tmp != &all_raid_disks) {
++ rdev = md_list_entry(tmp, mdk_rdev_t, all);
++ if (rdev->dev == dev)
++ return rdev;
++ tmp = tmp->next;
++ }
++ return NULL;
++}
++
++#define GETBLK_FAILED KERN_ERR \
++"md: getblk failed for device %s\n"
++
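++/*
++ * Write one rdev's in-memory superblock to its calculated offset
++ * on the device, re-checking offset and size first in case the
++ * device has been repartitioned or has silently gone offline
++ * since import.
++ */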
++static int write_disk_sb(mdk_rdev_t * rdev)
++{
++ struct buffer_head *bh;
++ kdev_t dev;
++ u32 sb_offset, size;
++ mdp_super_t *sb;
++
++ if (!rdev->sb) {
++ MD_BUG();
++ return -1;
++ }
++ if (rdev->faulty) {
++ MD_BUG();
++ return -1;
++ }
++ if (rdev->sb->md_magic != MD_SB_MAGIC) {
++ MD_BUG();
++ return -1;
++ }
++
++ dev = rdev->dev;
++ sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
++ if (rdev->sb_offset != sb_offset) {
++ printk("%s's sb offset has changed from %d to %d, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
++ goto skip;
++ }
++ /*
++ * If the disk went offline meanwhile and it's just a spare, then
++	 * its size has silently changed to zero, and the MD code does
++ * not yet know that it's faulty.
++ */
++ size = calc_dev_size(dev, rdev->mddev, 1);
++ if (size != rdev->size) {
++ printk("%s's size has changed from %d to %d since import, skipping\n", partition_name(dev), rdev->size, size);
++ goto skip;
++ }
++
++ printk("(write) %s's sb offset: %d\n", partition_name(dev), sb_offset);
++ fsync_dev(dev);
++ set_blocksize(dev, MD_SB_BYTES);
++ bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
++ if (!bh) {
++ printk(GETBLK_FAILED, partition_name(dev));
++ return 1;
++ }
++ memset(bh->b_data,0,bh->b_size);
++ sb = (mdp_super_t *) bh->b_data;
++ memcpy(sb, rdev->sb, MD_SB_BYTES);
++
++ mark_buffer_uptodate(bh, 1);
++ mark_buffer_dirty(bh, 1);
++ ll_rw_block(WRITE, 1, &bh);
++ wait_on_buffer(bh);
++ brelse(bh);
++ fsync_dev(dev);
++skip:
++ return 0;
++}
++#undef GETBLK_FAILED
++
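++/*
++ * Look up this rdev's descriptor in the array superblock and make
++ * it the superblock's this_disk.
++ */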
++static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
++{
++ int i, ok = 0;
++ mdp_disk_t *desc;
++
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ desc = mddev->sb->disks + i;
++#if 0
++ if (disk_faulty(desc)) {
++ if (MKDEV(desc->major,desc->minor) == rdev->dev)
++ ok = 1;
++ continue;
++ }
++#endif
++ if (MKDEV(desc->major,desc->minor) == rdev->dev) {
++ rdev->sb->this_disk = *desc;
++ rdev->desc_nr = desc->number;
++ ok = 1;
++ break;
++ }
++ }
++
++ if (!ok) {
++ MD_BUG();
++ }
++}
++
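++/*
++ * Propagate the master (array) superblock into every non-faulty
++ * rdev's private copy, then recompute each checksum.
++ */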
++static int sync_sbs(mddev_t * mddev)
++{
++ mdk_rdev_t *rdev;
++ mdp_super_t *sb;
++ struct md_list_head *tmp;
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->faulty)
++ continue;
++ sb = rdev->sb;
++ *sb = *mddev->sb;
++ set_this_disk(mddev, rdev);
++ sb->sb_csum = calc_sb_csum(sb);
++ }
++ return 0;
++}
++
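++/*
++ * Bump the superblock event counter and write the result out to
++ * every member disk, retrying a limited number of times on error.
++ */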
++int md_update_sb(mddev_t * mddev)
++{
++ int first, err, count = 100;
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
++ __u64 ev;
++
++repeat:
++ mddev->sb->utime = CURRENT_TIME;
++ ev = get_unaligned(&mddev->sb->events);
++ ++ev;
++ put_unaligned(ev,&mddev->sb->events);
++ if (ev == (__u64)0) {
++ /*
++ * oops, this 64-bit counter should never wrap.
++	 * Either we are somewhere around the year ~1 trillion A.D.,
++	 * assuming 1 reboot per second, or we have a bug:
++ * 1 reboot per second, or we have a bug:
++ */
++ MD_BUG();
++ --ev;
++ put_unaligned(ev,&mddev->sb->events);
++ }
++ sync_sbs(mddev);
++
++ /*
++ * do not write anything to disk if using
++ * nonpersistent superblocks
++ */
++ if (mddev->sb->not_persistent)
++ return 0;
++
++ printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
++ mdidx(mddev));
++
++ first = 1;
++ err = 0;
++ ITERATE_RDEV(mddev,rdev,tmp) {
++		if (first)
++			first = 0;
++		else
++			printk(", ");
++ if (rdev->faulty)
++ printk("(skipping faulty ");
++ printk("%s ", partition_name(rdev->dev));
++ if (!rdev->faulty) {
++ printk("[events: %08lx]",
++ (unsigned long)get_unaligned(&rdev->sb->events));
++ err += write_disk_sb(rdev);
++ } else
++ printk(")\n");
++ }
++ printk(".\n");
++ if (err) {
++ printk("errors occured during superblock update, repeating\n");
++ if (--count)
++ goto repeat;
++ printk("excessive errors occured during superblock update, exiting\n");
++ }
++ return 0;
++}
++
++/*
++ * Import a device. If 'on_disk', then sanity check the superblock
++ *
++ * mark the device faulty if:
++ *
++ * - the device is nonexistent (zero size)
++ * - the device has no valid superblock
++ *
++ * a faulty rdev _never_ has rdev->sb set.
++ */
++static int md_import_device (kdev_t newdev, int on_disk)
++{
++ int err;
++ mdk_rdev_t *rdev;
++ unsigned int size;
++
++ if (find_rdev_all(newdev))
++ return -EEXIST;
++
++ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
++ if (!rdev) {
++ printk("could not alloc mem for %s!\n", partition_name(newdev));
++ return -ENOMEM;
++ }
++ memset(rdev, 0, sizeof(*rdev));
++
++ if (!fs_may_mount(newdev)) {
++ printk("md: can not import %s, has active inodes!\n",
++ partition_name(newdev));
++ err = -EBUSY;
++ goto abort_free;
++ }
++
++ if ((err = alloc_disk_sb(rdev)))
++ goto abort_free;
++
++ rdev->dev = newdev;
++ if (lock_rdev(rdev)) {
++ printk("md: could not lock %s, zero-size? Marking faulty.\n",
++ partition_name(newdev));
++ err = -EINVAL;
++ goto abort_free;
++ }
++ rdev->desc_nr = -1;
++ rdev->faulty = 0;
++
++ size = 0;
++ if (blk_size[MAJOR(newdev)])
++ size = blk_size[MAJOR(newdev)][MINOR(newdev)];
++ if (!size) {
++ printk("md: %s has zero size, marking faulty!\n",
++ partition_name(newdev));
++ err = -EINVAL;
++ goto abort_free;
++ }
++
++ if (on_disk) {
++ if ((err = read_disk_sb(rdev))) {
++ printk("md: could not read %s's sb, not importing!\n",
++ partition_name(newdev));
++ goto abort_free;
++ }
++ if ((err = check_disk_sb(rdev))) {
++ printk("md: %s has invalid sb, not importing!\n",
++ partition_name(newdev));
++ goto abort_free;
++ }
++
++ rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
++ rdev->sb->this_disk.minor);
++ rdev->desc_nr = rdev->sb->this_disk.number;
++ }
++ md_list_add(&rdev->all, &all_raid_disks);
++ MD_INIT_LIST_HEAD(&rdev->pending);
++
++ if (rdev->faulty && rdev->sb)
++ free_disk_sb(rdev);
++ return 0;
++
++abort_free:
++ if (rdev->sb) {
++ if (rdev->inode)
++ unlock_rdev(rdev);
++ free_disk_sb(rdev);
++ }
++ kfree(rdev);
++ return err;
++}
++
++/*
++ * Check a full RAID array for plausibility
++ */
++
++#define INCONSISTENT KERN_ERR \
++"md: fatal superblock inconsistency in %s -- removing from array\n"
++
++#define OUT_OF_DATE KERN_ERR \
++"md: superblock update time inconsistency -- using the most recent one\n"
++
++#define OLD_VERSION KERN_ALERT \
++"md: md%d: unsupported raid array version %d.%d.%d\n"
++
++#define NOT_CLEAN_IGNORE KERN_ERR \
++"md: md%d: raid array is not clean -- starting background reconstruction\n"
++
++#define UNKNOWN_LEVEL KERN_ERR \
++"md: md%d: unsupported raid level %d\n"
++
++static int analyze_sbs (mddev_t * mddev)
++{
++ int out_of_date = 0, i;
++ struct md_list_head *tmp, *tmp2;
++ mdk_rdev_t *rdev, *rdev2, *freshest;
++ mdp_super_t *sb;
++
++ /*
++ * Verify the RAID superblock on each real device
++ */
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->faulty) {
++ MD_BUG();
++ goto abort;
++ }
++ if (!rdev->sb) {
++ MD_BUG();
++ goto abort;
++ }
++ if (check_disk_sb(rdev))
++ goto abort;
++ }
++
++ /*
++ * The superblock constant part has to be the same
++ * for all disks in the array.
++ */
++ sb = NULL;
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (!sb) {
++ sb = rdev->sb;
++ continue;
++ }
++ if (!sb_equal(sb, rdev->sb)) {
++ printk (INCONSISTENT, partition_name(rdev->dev));
++ kick_rdev_from_array(rdev);
++ continue;
++ }
++ }
++
++ /*
++ * OK, we have all disks and the array is ready to run. Let's
++ * find the freshest superblock, that one will be the superblock
++ * that represents the whole array.
++ */
++ if (!mddev->sb)
++ if (alloc_array_sb(mddev))
++ goto abort;
++ sb = mddev->sb;
++ freshest = NULL;
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ __u64 ev1, ev2;
++ /*
++ * if the checksum is invalid, use the superblock
++		 * only as a last resort. (decrease its age by
++ * one event)
++ */
++ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
++ __u64 ev = get_unaligned(&rdev->sb->events);
++ if (ev != (__u64)0) {
++ --ev;
++ put_unaligned(ev,&rdev->sb->events);
++ }
++ }
++
++ printk("%s's event counter: %08lx\n", partition_name(rdev->dev),
++ (unsigned long)get_unaligned(&rdev->sb->events));
++ if (!freshest) {
++ freshest = rdev;
++ continue;
++ }
++ /*
++ * Find the newest superblock version
++ */
++ ev1 = get_unaligned(&rdev->sb->events);
++ ev2 = get_unaligned(&freshest->sb->events);
++ if (ev1 != ev2) {
++ out_of_date = 1;
++ if (ev1 > ev2)
++ freshest = rdev;
++ }
++ }
++ if (out_of_date) {
++ printk(OUT_OF_DATE);
++ printk("freshest: %s\n", partition_name(freshest->dev));
++ }
++ memcpy (sb, freshest->sb, sizeof(*sb));
++
++ /*
++ * at this point we have picked the 'best' superblock
++ * from all available superblocks.
++ * now we validate this superblock and kick out possibly
++ * failed disks.
++ */
++ ITERATE_RDEV(mddev,rdev,tmp) {
++		/*
++		 * Kick all devices that are more than one event behind
++		 * the chosen superblock; being a single event behind is
++		 * tolerated.
++		 */
++ __u64 ev1, ev2;
++ ev1 = get_unaligned(&rdev->sb->events);
++ ev2 = get_unaligned(&sb->events);
++ ++ev1;
++ if (ev1 < ev2) {
++ printk("md: kicking non-fresh %s from array!\n",
++ partition_name(rdev->dev));
++ kick_rdev_from_array(rdev);
++ continue;
++ }
++ }
++
++ /*
++ * Fix up changed device names ... but only if this disk has a
++ * recent update time. Use faulty checksum ones too.
++ */
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ __u64 ev1, ev2, ev3;
++ if (rdev->faulty) { /* REMOVEME */
++ MD_BUG();
++ goto abort;
++ }
++ ev1 = get_unaligned(&rdev->sb->events);
++ ev2 = get_unaligned(&sb->events);
++ ev3 = ev2;
++ --ev3;
++ if ((rdev->dev != rdev->old_dev) &&
++ ((ev1 == ev2) || (ev1 == ev3))) {
++ mdp_disk_t *desc;
++
++ printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev));
++ if (rdev->desc_nr == -1) {
++ MD_BUG();
++ goto abort;
++ }
++ desc = &sb->disks[rdev->desc_nr];
++ if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
++ MD_BUG();
++ goto abort;
++ }
++ desc->major = MAJOR(rdev->dev);
++ desc->minor = MINOR(rdev->dev);
++ desc = &rdev->sb->this_disk;
++ desc->major = MAJOR(rdev->dev);
++ desc->minor = MINOR(rdev->dev);
++ }
++ }
++
++ /*
++ * Remove unavailable and faulty devices ...
++ *
++ * note that if an array becomes completely unrunnable due to
++ * missing devices, we do not write the superblock back, so the
++ * administrator has a chance to fix things up. The removal thus
++ * only happens if it's nonfatal to the contents of the array.
++ */
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ int found;
++ mdp_disk_t *desc;
++ kdev_t dev;
++
++ desc = sb->disks + i;
++ dev = MKDEV(desc->major, desc->minor);
++
++ /*
++ * We kick faulty devices/descriptors immediately.
++ */
++ if (disk_faulty(desc)) {
++ found = 0;
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->desc_nr != desc->number)
++ continue;
++ printk("md%d: kicking faulty %s!\n",
++ mdidx(mddev),partition_name(rdev->dev));
++ kick_rdev_from_array(rdev);
++ found = 1;
++ break;
++ }
++ if (!found) {
++ if (dev == MKDEV(0,0))
++ continue;
++ printk("md%d: removing former faulty %s!\n",
++ mdidx(mddev), partition_name(dev));
++ }
++ remove_descriptor(desc, sb);
++ continue;
++ }
++
++ if (dev == MKDEV(0,0))
++ continue;
++ /*
++ * Is this device present in the rdev ring?
++ */
++ found = 0;
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->desc_nr == desc->number) {
++ found = 1;
++ break;
++ }
++ }
++ if (found)
++ continue;
++
++ printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev));
++ remove_descriptor(desc, sb);
++ }
++
++ /*
++	 * Double check whether all devices mentioned in the
++ * superblock are in the rdev ring.
++ */
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ mdp_disk_t *desc;
++ kdev_t dev;
++
++ desc = sb->disks + i;
++ dev = MKDEV(desc->major, desc->minor);
++
++ if (dev == MKDEV(0,0))
++ continue;
++
++ if (disk_faulty(desc)) {
++ MD_BUG();
++ goto abort;
++ }
++
++ rdev = find_rdev(mddev, dev);
++ if (!rdev) {
++ MD_BUG();
++ goto abort;
++ }
++ }
++
++ /*
++ * Do a final reality check.
++ */
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->desc_nr == -1) {
++ MD_BUG();
++ goto abort;
++ }
++ /*
++ * is the desc_nr unique?
++ */
++ ITERATE_RDEV(mddev,rdev2,tmp2) {
++ if ((rdev2 != rdev) &&
++ (rdev2->desc_nr == rdev->desc_nr)) {
++ MD_BUG();
++ goto abort;
++ }
++ }
++ /*
++ * is the device unique?
++ */
++ ITERATE_RDEV(mddev,rdev2,tmp2) {
++ if ((rdev2 != rdev) &&
++ (rdev2->dev == rdev->dev)) {
++ MD_BUG();
++ goto abort;
++ }
++ }
++ }
++
++ /*
++ * Check if we can support this RAID array
++ */
++ if (sb->major_version != MD_MAJOR_VERSION ||
++ sb->minor_version > MD_MINOR_VERSION) {
++
++ printk (OLD_VERSION, mdidx(mddev), sb->major_version,
++ sb->minor_version, sb->patch_version);
++ goto abort;
++ }
++
++ if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
++ (sb->level == 4) || (sb->level == 5)))
++ printk (NOT_CLEAN_IGNORE, mdidx(mddev));
++
++ return 0;
++abort:
++ return 1;
++}
++
++#undef INCONSISTENT
++#undef OUT_OF_DATE
++#undef OLD_VERSION
++#undef NOT_CLEAN_IGNORE
++
++static int device_size_calculation (mddev_t * mddev)
++{
++ int data_disks = 0, persistent;
++ unsigned int readahead;
++ mdp_super_t *sb = mddev->sb;
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
++
++ /*
++ * Do device size calculation. Bail out if too small.
++ * (we have to do this after having validated chunk_size,
++ * because device size has to be modulo chunk_size)
++ */
++ persistent = !mddev->sb->not_persistent;
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->faulty)
++ continue;
++ if (rdev->size) {
++ MD_BUG();
++ continue;
++ }
++ rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
++ if (rdev->size < sb->chunk_size / 1024) {
++ printk (KERN_WARNING
++ "Dev %s smaller than chunk_size: %dk < %dk\n",
++ partition_name(rdev->dev),
++ rdev->size, sb->chunk_size / 1024);
++ return -EINVAL;
++ }
++ }
++
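++	/*
++	 * Levels -3, -2 and -1 are the HSM, translucent and linear
++	 * personalities. Linear and RAID0 get their size summed up
++	 * by zoned_raid_size(); the other levels use sb->size times
++	 * the number of data disks.
++	 */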
++ switch (sb->level) {
++ case -3:
++ data_disks = 1;
++ break;
++ case -2:
++ data_disks = 1;
++ break;
++ case -1:
++ zoned_raid_size(mddev);
++ data_disks = 1;
++ break;
++ case 0:
++ zoned_raid_size(mddev);
++ data_disks = sb->raid_disks;
++ break;
++ case 1:
++ data_disks = 1;
++ break;
++ case 4:
++ case 5:
++ data_disks = sb->raid_disks-1;
++ break;
++ default:
++ printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
++ goto abort;
++ }
++ if (!md_size[mdidx(mddev)])
++ md_size[mdidx(mddev)] = sb->size * data_disks;
++
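++	/*
++	 * Default to MD_READAHEAD, widen the window to four chunks
++	 * per data disk for the striped levels, and never go below
++	 * one maximum-sized request per data disk. Level -3 (HSM)
++	 * has its readahead switched off instead.
++	 */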
++ readahead = MD_READAHEAD;
++ if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5))
++ readahead = mddev->sb->chunk_size * 4 * data_disks;
++ if (readahead < data_disks * MAX_SECTORS*512*2)
++ readahead = data_disks * MAX_SECTORS*512*2;
++ else {
++ if (sb->level == -3)
++ readahead = 0;
++ }
++ md_maxreadahead[mdidx(mddev)] = readahead;
++
++ printk(KERN_INFO "md%d: max total readahead window set to %dk\n",
++ mdidx(mddev), readahead/1024);
++
++ printk(KERN_INFO
++ "md%d: %d data-disks, max readahead per data-disk: %dk\n",
++ mdidx(mddev), data_disks, readahead/data_disks/1024);
++ return 0;
++abort:
++ return 1;
++}
++
++
++#define TOO_BIG_CHUNKSIZE KERN_ERR \
++"too big chunk_size: %d > %d\n"
++
++#define TOO_SMALL_CHUNKSIZE KERN_ERR \
++"too small chunk_size: %d < %ld\n"
++
++#define BAD_CHUNKSIZE KERN_ERR \
++"no chunksize specified, see 'man raidtab'\n"
++
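++/*
++ * Start an assembled array: analyze and validate the member
++ * superblocks, sanity-check the chunk size, then hand the array
++ * over to its personality's run() method and mark it active.
++ */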
++static int do_md_run (mddev_t * mddev)
++{
++ int pnum, err;
++ int chunk_size;
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
++
++
++ if (!mddev->nb_dev) {
++ MD_BUG();
++ return -EINVAL;
++ }
++
++ if (mddev->pers)
++ return -EBUSY;
++
++ /*
++ * Resize disks to align partitions size on a given
++ * chunk size.
++ */
++ md_size[mdidx(mddev)] = 0;
++
++ /*
++ * Analyze all RAID superblock(s)
++ */
++ if (analyze_sbs(mddev)) {
++ MD_BUG();
++ return -EINVAL;
++ }
++
++ chunk_size = mddev->sb->chunk_size;
++ pnum = level_to_pers(mddev->sb->level);
++
++ mddev->param.chunk_size = chunk_size;
++ mddev->param.personality = pnum;
++
++ if (chunk_size > MAX_CHUNK_SIZE) {
++ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
++ return -EINVAL;
++ }
++ /*
++ * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
++ */
++ if ( (1 << ffz(~chunk_size)) != chunk_size) {
++ MD_BUG();
++ return -EINVAL;
++ }
++ if (chunk_size < PAGE_SIZE) {
++ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
++ return -EINVAL;
++ }
++
++ if (pnum >= MAX_PERSONALITY) {
++ MD_BUG();
++ return -EINVAL;
++ }
++
++ if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
++ /*
++ * 'default chunksize' in the old md code used to
++ * be PAGE_SIZE, baaad.
++		 * we abort here to be on the safe side. We don't
++ * want to continue the bad practice.
++ */
++ printk(BAD_CHUNKSIZE);
++ return -EINVAL;
++ }
++
++ if (!pers[pnum])
++ {
++#ifdef CONFIG_KMOD
++ char module_name[80];
++ sprintf (module_name, "md-personality-%d", pnum);
++ request_module (module_name);
++ if (!pers[pnum])
++#endif
++ return -EINVAL;
++ }
++
++ if (device_size_calculation(mddev))
++ return -EINVAL;
++
++ /*
++ * Drop all container device buffers, from now on
++ * the only valid external interface is through the md
++ * device.
++ */
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->faulty)
++ continue;
++ fsync_dev(rdev->dev);
++ invalidate_buffers(rdev->dev);
++ }
++
++ mddev->pers = pers[pnum];
++
++ err = mddev->pers->run(mddev);
++ if (err) {
++ printk("pers->run() failed ...\n");
++ mddev->pers = NULL;
++ return -EINVAL;
++ }
++
++ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
++ md_update_sb(mddev);
++
++ /*
++ * md_size has units of 1K blocks, which are
++ * twice as large as sectors.
++ */
++ md_hd_struct[mdidx(mddev)].start_sect = 0;
++ md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1;
++
++ read_ahead[MD_MAJOR] = 1024;
++ return (0);
++}
++
++#undef TOO_BIG_CHUNKSIZE
++#undef TOO_SMALL_CHUNKSIZE
++#undef BAD_CHUNKSIZE
++
++#define OUT(x) do { err = (x); goto out; } while (0)
++
++static int restart_array (mddev_t *mddev)
++{
++ int err = 0;
++
++ /*
++ * Complain if it has no devices
++ */
++ if (!mddev->nb_dev)
++ OUT(-ENXIO);
++
++ if (mddev->pers) {
++ if (!mddev->ro)
++ OUT(-EBUSY);
++
++ mddev->ro = 0;
++ set_device_ro(mddev_to_kdev(mddev), 0);
++
++ printk (KERN_INFO
++ "md%d switched to read-write mode.\n", mdidx(mddev));
++ /*
++ * Kick recovery or resync if necessary
++ */
++ md_recover_arrays();
++ if (mddev->pers->restart_resync)
++ mddev->pers->restart_resync(mddev);
++ } else
++ err = -EINVAL;
++
++out:
++ return err;
++}
++
++#define STILL_MOUNTED KERN_WARNING \
++"md: md%d still mounted.\n"
++
++static int do_md_stop (mddev_t * mddev, int ro)
++{
++ int err = 0, resync_interrupted = 0;
++ kdev_t dev = mddev_to_kdev(mddev);
++
++ if (!ro && !fs_may_mount (dev)) {
++ printk (STILL_MOUNTED, mdidx(mddev));
++ OUT(-EBUSY);
++ }
++
++ /*
++ * complain if it's already stopped
++ */
++ if (!mddev->nb_dev)
++ OUT(-ENXIO);
++
++ if (mddev->pers) {
++ /*
++ * It is safe to call stop here, it only frees private
++ * data. Also, it tells us if a device is unstoppable
++ * (eg. resyncing is in progress)
++ */
++ if (mddev->pers->stop_resync)
++ if (mddev->pers->stop_resync(mddev))
++ resync_interrupted = 1;
++
++ if (mddev->recovery_running)
++ md_interrupt_thread(md_recovery_thread);
++
++ /*
++ * This synchronizes with signal delivery to the
++ * resync or reconstruction thread. It also nicely
++ * hangs the process if some reconstruction has not
++ * finished.
++ */
++ down(&mddev->recovery_sem);
++ up(&mddev->recovery_sem);
++
++ /*
++ * sync and invalidate buffers because we cannot kill the
++ * main thread with valid IO transfers still around.
++ * the kernel lock protects us from new requests being
++ * added after invalidate_buffers().
++ */
++ fsync_dev (mddev_to_kdev(mddev));
++ fsync_dev (dev);
++ invalidate_buffers (dev);
++
++ if (ro) {
++ if (mddev->ro)
++ OUT(-ENXIO);
++ mddev->ro = 1;
++ } else {
++ if (mddev->ro)
++ set_device_ro(dev, 0);
++ if (mddev->pers->stop(mddev)) {
++ if (mddev->ro)
++ set_device_ro(dev, 1);
++ OUT(-EBUSY);
++ }
++ if (mddev->ro)
++ mddev->ro = 0;
++ }
++ if (mddev->sb) {
++ /*
++ * mark it clean only if there was no resync
++ * interrupted.
++ */
++ if (!mddev->recovery_running && !resync_interrupted) {
++ printk("marking sb clean...\n");
++ mddev->sb->state |= 1 << MD_SB_CLEAN;
++ }
++ md_update_sb(mddev);
++ }
++ if (ro)
++ set_device_ro(dev, 1);
++ }
++
++ /*
++ * Free resources if final stop
++ */
++ if (!ro) {
++ export_array(mddev);
++ md_size[mdidx(mddev)] = 0;
++ md_hd_struct[mdidx(mddev)].nr_sects = 0;
++ free_mddev(mddev);
++
++ printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
++ } else
++ printk (KERN_INFO
++ "md%d switched to read-only mode.\n", mdidx(mddev));
++out:
++ return err;
++}
++
++#undef OUT
++
++/*
++ * We have to safely support old arrays too.
++ */
++int detect_old_array (mdp_super_t *sb)
++{
++ if (sb->major_version > 0)
++ return 0;
++ if (sb->minor_version >= 90)
++ return 0;
++
++ return -EINVAL;
++}
++
++
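++/*
++ * Try to run one freshly collected array; if do_md_run() fails,
++ * stop the half-assembled array again without writing the
++ * superblocks back.
++ */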
++static void autorun_array (mddev_t *mddev)
++{
++ mdk_rdev_t *rdev;
++ struct md_list_head *tmp;
++ int err;
++
++ if (mddev->disks.prev == &mddev->disks) {
++ MD_BUG();
++ return;
++ }
++
++ printk("running: ");
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ printk("<%s>", partition_name(rdev->dev));
++ }
++ printk("\nnow!\n");
++
++ err = do_md_run (mddev);
++ if (err) {
++ printk("do_md_run() returned %d\n", err);
++ /*
++ * prevent the writeback of an unrunnable array
++ */
++ mddev->sb_dirty = 0;
++ do_md_stop (mddev, 0);
++ }
++}
++
++/*
++ * lets try to run arrays based on all disks that have arrived
++ * until now. (those are in the ->pending list)
++ *
++ * the method: pick the first pending disk, collect all disks with
++ * the same UUID, remove all from the pending list and put them into
++ * the 'same_array' list. Then order this list based on superblock
++ * update time (freshest comes first), kick out 'old' disks and
++ * compare superblocks. If everything's fine then run it.
++ */
++static void autorun_devices (void)
++{
++ struct md_list_head candidates;
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev0, *rdev;
++ mddev_t *mddev;
++ kdev_t md_kdev;
++
++
++ printk("autorun ...\n");
++ while (pending_raid_disks.next != &pending_raid_disks) {
++ rdev0 = md_list_entry(pending_raid_disks.next,
++ mdk_rdev_t, pending);
++
++ printk("considering %s ...\n", partition_name(rdev0->dev));
++ MD_INIT_LIST_HEAD(&candidates);
++ ITERATE_RDEV_PENDING(rdev,tmp) {
++ if (uuid_equal(rdev0, rdev)) {
++ if (!sb_equal(rdev0->sb, rdev->sb)) {
++ printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev));
++ continue;
++ }
++ printk(" adding %s ...\n", partition_name(rdev->dev));
++ md_list_del(&rdev->pending);
++ md_list_add(&rdev->pending, &candidates);
++ }
++ }
+ /*
+- * Check the superblock for consistency.
+- * The personality itself has to check whether it's getting
+- * added with the proper flags. The personality has to be
+- * checked too. ;)
++ * now we have a set of devices, with all of them having
++ * mostly sane superblocks. It's time to allocate the
++ * mddev.
+ */
+- if (analyze_one_sb (realdev))
++ md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
++ mddev = kdev_to_mddev(md_kdev);
++ if (mddev) {
++ printk("md%d already running, cannot run %s\n",
++ mdidx(mddev), partition_name(rdev0->dev));
++ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
++ export_rdev(rdev);
++ continue;
++ }
++ mddev = alloc_mddev(md_kdev);
++ printk("created md%d\n", mdidx(mddev));
++ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
++ bind_rdev_to_array(rdev, mddev);
++ md_list_del(&rdev->pending);
++ MD_INIT_LIST_HEAD(&rdev->pending);
++ }
++ autorun_array(mddev);
++ }
++ printk("... autorun DONE.\n");
++}
++
++/*
++ * import RAID devices based on one partition
++ * if possible, the array gets run as well.
++ */
++
++#define BAD_VERSION KERN_ERR \
++"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
++
++#define OUT_OF_MEM KERN_ALERT \
++"md: out of memory.\n"
++
++#define NO_DEVICE KERN_ERR \
++"md: disabled device %s\n"
++
++#define AUTOADD_FAILED KERN_ERR \
++"md: auto-adding devices to md%d FAILED (error %d).\n"
++
++#define AUTOADD_FAILED_USED KERN_ERR \
++"md: cannot auto-add device %s to md%d, already used.\n"
++
++#define AUTORUN_FAILED KERN_ERR \
++"md: auto-running md%d FAILED (error %d).\n"
++
++#define MDDEV_BUSY KERN_ERR \
++"md: cannot auto-add to md%d, already running.\n"
++
++#define AUTOADDING KERN_INFO \
++"md: auto-adding devices to md%d, based on %s's superblock.\n"
++
++#define AUTORUNNING KERN_INFO \
++"md: auto-running md%d.\n"
++
++static int autostart_array (kdev_t startdev)
++{
++ int err = -EINVAL, i;
++ mdp_super_t *sb = NULL;
++ mdk_rdev_t *start_rdev = NULL, *rdev;
++
++ if (md_import_device(startdev, 1)) {
++ printk("could not import %s!\n", partition_name(startdev));
++ goto abort;
++ }
++
++ start_rdev = find_rdev_all(startdev);
++ if (!start_rdev) {
++ MD_BUG();
++ goto abort;
++ }
++ if (start_rdev->faulty) {
++ printk("can not autostart based on faulty %s!\n",
++ partition_name(startdev));
++ goto abort;
++ }
++ md_list_add(&start_rdev->pending, &pending_raid_disks);
++
++ sb = start_rdev->sb;
++
++ err = detect_old_array(sb);
++ if (err) {
++ printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
++ goto abort;
++ }
++
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ mdp_disk_t *desc;
++ kdev_t dev;
++
++ desc = sb->disks + i;
++ dev = MKDEV(desc->major, desc->minor);
++
++ if (dev == MKDEV(0,0))
++ continue;
++ if (dev == startdev)
++ continue;
++ if (md_import_device(dev, 1)) {
++ printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev));
++ continue;
++ }
++ rdev = find_rdev_all(dev);
++ if (!rdev) {
++ MD_BUG();
++ goto abort;
++ }
++ md_list_add(&rdev->pending, &pending_raid_disks);
++ }
++
++ /*
++	 * FIXME: possibly return error codes from autorun_devices()
++ */
++ autorun_devices();
++ return 0;
++
++abort:
++ if (start_rdev)
++ export_rdev(start_rdev);
++ return err;
++}
++
++#undef BAD_VERSION
++#undef OUT_OF_MEM
++#undef NO_DEVICE
++#undef AUTOADD_FAILED_USED
++#undef AUTOADD_FAILED
++#undef AUTORUN_FAILED
++#undef AUTOADDING
++#undef AUTORUNNING
++
++struct {
++ int set;
++ int noautodetect;
++
++} raid_setup_args md__initdata = { 0, 0 };
++
++/*
++ * Searches all registered partitions for autorun RAID arrays
++ * at boot time.
++ */
++md__initfunc(void autodetect_raid(void))
++{
++#ifdef CONFIG_AUTODETECT_RAID
++ struct gendisk *disk;
++ mdk_rdev_t *rdev;
++ int i;
++
++ if (raid_setup_args.noautodetect) {
++ printk(KERN_INFO "skipping autodetection of RAID arrays\n");
++ return;
++ }
++ printk(KERN_INFO "autodetecting RAID arrays\n");
++
++ for (disk = gendisk_head ; disk ; disk = disk->next) {
++ for (i = 0; i < disk->max_p*disk->max_nr; i++) {
++ kdev_t dev = MKDEV(disk->major,i);
++
++ if (disk->part[i].type == LINUX_OLD_RAID_PARTITION) {
++ printk(KERN_ALERT
++"md: %s's partition type has to be changed from type 0x86 to type 0xfd\n"
++" to maintain interoperability with other OSs! Autodetection support for\n"
++" type 0x86 will be deleted after some migration timeout. Sorry.\n",
++ partition_name(dev));
++ disk->part[i].type = LINUX_RAID_PARTITION;
++ }
++ if (disk->part[i].type != LINUX_RAID_PARTITION)
++ continue;
++
++ if (md_import_device(dev,1)) {
++ printk(KERN_ALERT "could not import %s!\n",
++ partition_name(dev));
++ continue;
++ }
++ /*
++ * Sanity checks:
++ */
++ rdev = find_rdev_all(dev);
++ if (!rdev) {
++ MD_BUG();
++ continue;
++ }
++ if (rdev->faulty) {
++ MD_BUG();
++ continue;
++ }
++ md_list_add(&rdev->pending, &pending_raid_disks);
++ }
++ }
++
++ autorun_devices();
++#endif
++}
++
++static int get_version (void * arg)
++{
++ mdu_version_t ver;
++
++ ver.major = MD_MAJOR_VERSION;
++ ver.minor = MD_MINOR_VERSION;
++ ver.patchlevel = MD_PATCHLEVEL_VERSION;
++
++ if (md_copy_to_user(arg, &ver, sizeof(ver)))
++ return -EFAULT;
++
++ return 0;
++}
++
++#define SET_FROM_SB(x) info.x = mddev->sb->x
++static int get_array_info (mddev_t * mddev, void * arg)
++{
++ mdu_array_info_t info;
++
++ if (!mddev->sb)
++ return -EINVAL;
++
++ SET_FROM_SB(major_version);
++ SET_FROM_SB(minor_version);
++ SET_FROM_SB(patch_version);
++ SET_FROM_SB(ctime);
++ SET_FROM_SB(level);
++ SET_FROM_SB(size);
++ SET_FROM_SB(nr_disks);
++ SET_FROM_SB(raid_disks);
++ SET_FROM_SB(md_minor);
++ SET_FROM_SB(not_persistent);
++
++ SET_FROM_SB(utime);
++ SET_FROM_SB(state);
++ SET_FROM_SB(active_disks);
++ SET_FROM_SB(working_disks);
++ SET_FROM_SB(failed_disks);
++ SET_FROM_SB(spare_disks);
++
++ SET_FROM_SB(layout);
++ SET_FROM_SB(chunk_size);
++
++ if (md_copy_to_user(arg, &info, sizeof(info)))
++ return -EFAULT;
++
++ return 0;
++}
++#undef SET_FROM_SB
++
++#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
++static int get_disk_info (mddev_t * mddev, void * arg)
++{
++ mdu_disk_info_t info;
++ unsigned int nr;
++
++ if (!mddev->sb)
++ return -EINVAL;
++
++ if (md_copy_from_user(&info, arg, sizeof(info)))
++ return -EFAULT;
++
++ nr = info.number;
++ if (nr >= mddev->sb->nr_disks)
++ return -EINVAL;
++
++ SET_FROM_SB(major);
++ SET_FROM_SB(minor);
++ SET_FROM_SB(raid_disk);
++ SET_FROM_SB(state);
++
++ if (md_copy_to_user(arg, &info, sizeof(info)))
++ return -EFAULT;
++
++ return 0;
++}
++#undef SET_FROM_SB
++
++#define SET_SB(x) mddev->sb->disks[nr].x = info.x
++
++static int add_new_disk (mddev_t * mddev, void * arg)
++{
++ int err, size, persistent;
++ mdu_disk_info_t info;
++ mdk_rdev_t *rdev;
++ unsigned int nr;
++ kdev_t dev;
++
++ if (!mddev->sb)
++ return -EINVAL;
++
++ if (md_copy_from_user(&info, arg, sizeof(info)))
++ return -EFAULT;
++
++ nr = info.number;
++ if (nr >= mddev->sb->nr_disks)
++ return -EINVAL;
++
++ dev = MKDEV(info.major,info.minor);
++
++ if (find_rdev_all(dev)) {
++ printk("device %s already used in a RAID array!\n",
++ partition_name(dev));
++ return -EBUSY;
++ }
++
++ SET_SB(number);
++ SET_SB(major);
++ SET_SB(minor);
++ SET_SB(raid_disk);
++ SET_SB(state);
++
++ if ((info.state & (1<<MD_DISK_FAULTY))==0) {
++ err = md_import_device (dev, 0);
++ if (err) {
++ printk("md: error, md_import_device() returned %d\n", err);
++ return -EINVAL;
++ }
++ rdev = find_rdev_all(dev);
++ if (!rdev) {
++ MD_BUG();
+ return -EINVAL;
++ }
++
++ rdev->old_dev = dev;
++ rdev->desc_nr = info.number;
++
++ bind_rdev_to_array(rdev, mddev);
++
++ persistent = !mddev->sb->not_persistent;
++ if (!persistent)
++ printk("nonpersistent superblock ...\n");
++ if (!mddev->sb->chunk_size)
++ printk("no chunksize?\n");
++
++ size = calc_dev_size(dev, mddev, persistent);
++ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
++
++ if (!mddev->sb->size || (mddev->sb->size > size))
++ mddev->sb->size = size;
++ }
++
++ /*
++ * sync all other superblocks with the main superblock
++ */
++ sync_sbs(mddev);
++
++ return 0;
++}
++#undef SET_SB
++
++static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
++{
++ int err;
++ mdk_rdev_t *rdev;
++ mdp_disk_t *disk;
++
++ if (!mddev->pers)
++ return -ENODEV;
++
++ printk("trying to remove %s from md%d ... \n",
++ partition_name(dev), mdidx(mddev));
++
++ if (!mddev->pers->diskop) {
++ printk("md%d: personality does not support diskops!\n",
++ mdidx(mddev));
++ return -EINVAL;
++ }
++
++ rdev = find_rdev(mddev, dev);
++ if (!rdev)
++ return -ENXIO;
++
++ if (rdev->desc_nr == -1) {
++ MD_BUG();
++ return -EINVAL;
++ }
++ disk = &mddev->sb->disks[rdev->desc_nr];
++ if (disk_active(disk))
++ goto busy;
++ if (disk_removed(disk)) {
++ MD_BUG();
++ return -EINVAL;
++ }
++
++ err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
++ if (err == -EBUSY)
++ goto busy;
++ if (err) {
++ MD_BUG();
++ return -EINVAL;
++ }
++
++ remove_descriptor(disk, mddev->sb);
++ kick_rdev_from_array(rdev);
++ mddev->sb_dirty = 1;
++ md_update_sb(mddev);
++
++ return 0;
++busy:
++ printk("cannot remove active disk %s from md%d ... \n",
++ partition_name(dev), mdidx(mddev));
++ return -EBUSY;
++}
++
++static int hot_add_disk (mddev_t * mddev, kdev_t dev)
++{
++ int i, err, persistent;
++ unsigned int size;
++ mdk_rdev_t *rdev;
++ mdp_disk_t *disk;
++
++ if (!mddev->pers)
++ return -ENODEV;
++
++ printk("trying to hot-add %s to md%d ... \n",
++ partition_name(dev), mdidx(mddev));
++
++ if (!mddev->pers->diskop) {
++ printk("md%d: personality does not support diskops!\n",
++ mdidx(mddev));
++ return -EINVAL;
++ }
++
++ persistent = !mddev->sb->not_persistent;
++ size = calc_dev_size(dev, mddev, persistent);
++
++ if (size < mddev->sb->size) {
++ printk("md%d: disk size %d blocks < array size %d\n",
++ mdidx(mddev), size, mddev->sb->size);
++ return -ENOSPC;
++ }
++
++ rdev = find_rdev(mddev, dev);
++ if (rdev)
++ return -EBUSY;
++
++ err = md_import_device (dev, 0);
++ if (err) {
++ printk("md: error, md_import_device() returned %d\n", err);
++ return -EINVAL;
++ }
++ rdev = find_rdev_all(dev);
++ if (!rdev) {
++ MD_BUG();
++ return -EINVAL;
++ }
++ if (rdev->faulty) {
++ printk("md: can not hot-add faulty %s disk to md%d!\n",
++ partition_name(dev), mdidx(mddev));
++ err = -EINVAL;
++ goto abort_export;
++ }
++ bind_rdev_to_array(rdev, mddev);
++
++ /*
++ * The rest should better be atomic, we can have disk failures
++ * noticed in interrupt contexts ...
++ */
++ cli();
++ rdev->old_dev = dev;
++ rdev->size = size;
++ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
++
++ disk = mddev->sb->disks + mddev->sb->raid_disks;
++ for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
++ disk = mddev->sb->disks + i;
++
++ if (!disk->major && !disk->minor)
++ break;
++ if (disk_removed(disk))
++ break;
++ }
++ if (i == MD_SB_DISKS) {
++ sti();
++ printk("md%d: can not hot-add to full array!\n", mdidx(mddev));
++ err = -EBUSY;
++ goto abort_unbind_export;
++ }
++
++ if (disk_removed(disk)) {
+ /*
+- * hot_add has to bump up nb_dev itself
++ * reuse slot
+ */
+- if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) {
+- /*
+- * FIXME: here we should free up the inode and stuff
+- */
+- printk ("FIXME\n");
+- return -EINVAL;
++ if (disk->number != i) {
++ sti();
++ MD_BUG();
++ err = -EINVAL;
++ goto abort_unbind_export;
+ }
+- } else
+- md_dev[minor].nb_dev++;
++ } else {
++ disk->number = i;
++ }
+
+- printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
+- return (0);
++ disk->raid_disk = disk->number;
++ disk->major = MAJOR(dev);
++ disk->minor = MINOR(dev);
++
++ if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
++ sti();
++ MD_BUG();
++ err = -EINVAL;
++ goto abort_unbind_export;
++ }
++
++ mark_disk_spare(disk);
++ mddev->sb->nr_disks++;
++ mddev->sb->spare_disks++;
++ mddev->sb->working_disks++;
++
++ mddev->sb_dirty = 1;
++
++ sti();
++ md_update_sb(mddev);
++
++ /*
++ * Kick recovery, maybe this spare has to be added to the
++ * array immediately.
++ */
++ md_recover_arrays();
++
++ return 0;
++
++abort_unbind_export:
++ unbind_rdev_from_array(rdev);
++
++abort_export:
++ export_rdev(rdev);
++ return err;
++}
++
++#define SET_SB(x) mddev->sb->x = info.x
++static int set_array_info (mddev_t * mddev, void * arg)
++{
++ mdu_array_info_t info;
++
++ if (mddev->sb) {
++ printk("array md%d already has a superblock!\n",
++ mdidx(mddev));
++ return -EBUSY;
++ }
++
++ if (md_copy_from_user(&info, arg, sizeof(info)))
++ return -EFAULT;
++
++ if (alloc_array_sb(mddev))
++ return -ENOMEM;
++
++ mddev->sb->major_version = MD_MAJOR_VERSION;
++ mddev->sb->minor_version = MD_MINOR_VERSION;
++ mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
++ mddev->sb->ctime = CURRENT_TIME;
++
++ SET_SB(level);
++ SET_SB(size);
++ SET_SB(nr_disks);
++ SET_SB(raid_disks);
++ SET_SB(md_minor);
++ SET_SB(not_persistent);
++
++ SET_SB(state);
++ SET_SB(active_disks);
++ SET_SB(working_disks);
++ SET_SB(failed_disks);
++ SET_SB(spare_disks);
++
++ SET_SB(layout);
++ SET_SB(chunk_size);
++
++ mddev->sb->md_magic = MD_SB_MAGIC;
++
++ /*
++ * Generate a 128 bit UUID
++ */
++ get_random_bytes(&mddev->sb->set_uuid0, 4);
++ get_random_bytes(&mddev->sb->set_uuid1, 4);
++ get_random_bytes(&mddev->sb->set_uuid2, 4);
++ get_random_bytes(&mddev->sb->set_uuid3, 4);
++
++ return 0;
++}
++#undef SET_SB
++
++static int set_disk_info (mddev_t * mddev, void * arg)
++{
++ printk("not yet");
++ return -EINVAL;
++}
++
++static int clear_array (mddev_t * mddev)
++{
++ printk("not yet");
++ return -EINVAL;
++}
++
++static int write_raid_info (mddev_t * mddev)
++{
++ printk("not yet");
++ return -EINVAL;
++}
++
++static int protect_array (mddev_t * mddev)
++{
++ printk("not yet");
++ return -EINVAL;
++}
++
++static int unprotect_array (mddev_t * mddev)
++{
++ printk("not yet");
++ return -EINVAL;
++}
++
++static int set_disk_faulty (mddev_t *mddev, kdev_t dev)
++{
++ int ret;
++
++ fsync_dev(mddev_to_kdev(mddev));
++ ret = md_error(mddev_to_kdev(mddev), dev);
++ return ret;
+ }
+
+ static int md_ioctl (struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+ {
+- int minor, err;
+- struct hd_geometry *loc = (struct hd_geometry *) arg;
++ unsigned int minor;
++ int err = 0;
++ struct hd_geometry *loc = (struct hd_geometry *) arg;
++ mddev_t *mddev = NULL;
++ kdev_t dev;
+
+- if (!capable(CAP_SYS_ADMIN))
+- return -EACCES;
++ if (!md_capable_admin())
++ return -EACCES;
+
+- if (((minor=MINOR(inode->i_rdev)) & 0x80) &&
+- (minor & 0x7f) < MAX_PERSONALITY &&
+- pers[minor & 0x7f] &&
+- pers[minor & 0x7f]->ioctl)
+- return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg));
+-
+- if (minor >= MAX_MD_DEV)
+- return -EINVAL;
++ dev = inode->i_rdev;
++ minor = MINOR(dev);
++ if (minor >= MAX_MD_DEVS)
++ return -EINVAL;
+
+- switch (cmd)
+- {
+- case REGISTER_DEV:
+- return do_md_add (minor, to_kdev_t ((dev_t) arg));
++ /*
++ * Commands dealing with the RAID driver but not any
++ * particular array:
++ */
++ switch (cmd)
++ {
++ case RAID_VERSION:
++ err = get_version((void *)arg);
++ goto done;
++
++		case PRINT_RAID_DEBUG:
++			err = 0;
++			md_print_devices();
++			goto done;
++
++ case BLKGETSIZE: /* Return device size */
++ if (!arg) {
++ err = -EINVAL;
++ goto abort;
++ }
++ err = md_put_user(md_hd_struct[minor].nr_sects,
++ (long *) arg);
++ goto done;
+
+- case START_MD:
+- return do_md_run (minor, (int) arg);
++ case BLKFLSBUF:
++ fsync_dev(dev);
++ invalidate_buffers(dev);
++ goto done;
+
+- case STOP_MD:
+- return do_md_stop (minor, inode);
+-
+- case BLKGETSIZE: /* Return device size */
+- if (!arg) return -EINVAL;
+- err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg);
+- if (err)
+- return err;
+- break;
+-
+- case BLKFLSBUF:
+- fsync_dev (inode->i_rdev);
+- invalidate_buffers (inode->i_rdev);
+- break;
+-
+- case BLKRASET:
+- if (arg > 0xff)
+- return -EINVAL;
+- read_ahead[MAJOR(inode->i_rdev)] = arg;
+- return 0;
+-
+- case BLKRAGET:
+- if (!arg) return -EINVAL;
+- err = put_user (read_ahead[MAJOR(inode->i_rdev)], (long *) arg);
+- if (err)
+- return err;
+- break;
+-
+- /* We have a problem here : there is no easy way to give a CHS
+- virtual geometry. We currently pretend that we have a 2 heads
+- 4 sectors (with a BIG number of cylinders...). This drives dosfs
+- just mad... ;-) */
+-
+- case HDIO_GETGEO:
+- if (!loc) return -EINVAL;
+- err = put_user (2, (char *) &loc->heads);
+- if (err)
+- return err;
+- err = put_user (4, (char *) &loc->sectors);
+- if (err)
+- return err;
+- err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders);
+- if (err)
+- return err;
+- err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect,
+- (long *) &loc->start);
+- if (err)
+- return err;
+- break;
+-
+- RO_IOCTLS(inode->i_rdev,arg);
++ case BLKRASET:
++ if (arg > 0xff) {
++ err = -EINVAL;
++ goto abort;
++ }
++ read_ahead[MAJOR(dev)] = arg;
++ goto done;
+
+- default:
+- return -EINVAL;
+- }
++ case BLKRAGET:
++ if (!arg) {
++ err = -EINVAL;
++ goto abort;
++ }
++			err = md_put_user (read_ahead[MAJOR(dev)], (long *) arg);
++			goto done;
++		default:
++			break;
++	}
++
++ /*
++ * Commands creating/starting a new array:
++ */
++
++ mddev = kdev_to_mddev(dev);
++
++ switch (cmd)
++ {
++ case SET_ARRAY_INFO:
++ case START_ARRAY:
++ if (mddev) {
++ printk("array md%d already exists!\n",
++ mdidx(mddev));
++ err = -EEXIST;
++ goto abort;
++ }
++		default:
++			break;
++	}
++
++ switch (cmd)
++ {
++ case SET_ARRAY_INFO:
++ mddev = alloc_mddev(dev);
++ if (!mddev) {
++ err = -ENOMEM;
++ goto abort;
++ }
++ /*
++ * alloc_mddev() should possibly self-lock.
++ */
++ err = lock_mddev(mddev);
++ if (err) {
++ printk("ioctl, reason %d, cmd %d\n", err, cmd);
++ goto abort;
++ }
++ err = set_array_info(mddev, (void *)arg);
++ if (err) {
++ printk("couldnt set array info. %d\n", err);
++ goto abort;
++ }
++ goto done_unlock;
++
++ case START_ARRAY:
++ /*
++ * possibly make it lock the array ...
++ */
++ err = autostart_array((kdev_t)arg);
++ if (err) {
++ printk("autostart %s failed!\n",
++ partition_name((kdev_t)arg));
++ goto abort;
++ }
++ goto done;
++
++		default:
++			break;
++	}
++
++ /*
++ * Commands querying/configuring an existing array:
++ */
++
++ if (!mddev) {
++ err = -ENODEV;
++ goto abort;
++ }
++ err = lock_mddev(mddev);
++ if (err) {
++ printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
++ goto abort;
++ }
++
++ /*
++ * Commands even a read-only array can execute:
++ */
++ switch (cmd)
++ {
++ case GET_ARRAY_INFO:
++ err = get_array_info(mddev, (void *)arg);
++ goto done_unlock;
++
++ case GET_DISK_INFO:
++ err = get_disk_info(mddev, (void *)arg);
++ goto done_unlock;
++
++ case RESTART_ARRAY_RW:
++ err = restart_array(mddev);
++ goto done_unlock;
++
++ case STOP_ARRAY:
++ err = do_md_stop (mddev, 0);
++ goto done_unlock;
++
++ case STOP_ARRAY_RO:
++ err = do_md_stop (mddev, 1);
++ goto done_unlock;
++
++ /*
++ * We have a problem here : there is no easy way to give a CHS
++ * virtual geometry. We currently pretend that we have a 2 heads
++ * 4 sectors (with a BIG number of cylinders...). This drives
++ * dosfs just mad... ;-)
++ */
++ case HDIO_GETGEO:
++ if (!loc) {
++ err = -EINVAL;
++ goto abort_unlock;
++ }
++ err = md_put_user (2, (char *) &loc->heads);
++ if (err)
++ goto abort_unlock;
++ err = md_put_user (4, (char *) &loc->sectors);
++ if (err)
++ goto abort_unlock;
++ err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
++ (short *) &loc->cylinders);
++ if (err)
++ goto abort_unlock;
++ err = md_put_user (md_hd_struct[minor].start_sect,
++ (long *) &loc->start);
++ goto done_unlock;
++ }
+
+- return (0);
++ /*
++ * The remaining ioctls are changing the state of the
++ * superblock, so we do not allow read-only arrays
++ * here:
++ */
++ if (mddev->ro) {
++ err = -EROFS;
++ goto abort_unlock;
++ }
++
++ switch (cmd)
++ {
++ case CLEAR_ARRAY:
++ err = clear_array(mddev);
++ goto done_unlock;
++
++ case ADD_NEW_DISK:
++ err = add_new_disk(mddev, (void *)arg);
++ goto done_unlock;
++
++ case HOT_REMOVE_DISK:
++ err = hot_remove_disk(mddev, (kdev_t)arg);
++ goto done_unlock;
++
++ case HOT_ADD_DISK:
++ err = hot_add_disk(mddev, (kdev_t)arg);
++ goto done_unlock;
++
++ case SET_DISK_INFO:
++ err = set_disk_info(mddev, (void *)arg);
++ goto done_unlock;
++
++ case WRITE_RAID_INFO:
++ err = write_raid_info(mddev);
++ goto done_unlock;
++
++ case UNPROTECT_ARRAY:
++ err = unprotect_array(mddev);
++ goto done_unlock;
++
++ case PROTECT_ARRAY:
++ err = protect_array(mddev);
++ goto done_unlock;
++
++ case SET_DISK_FAULTY:
++ err = set_disk_faulty(mddev, (kdev_t)arg);
++ goto done_unlock;
++
++ case RUN_ARRAY:
++ {
++ mdu_param_t param;
++
++ err = md_copy_from_user(¶m, (mdu_param_t *)arg,
++ sizeof(param));
++ if (err)
++ goto abort_unlock;
++
++ err = do_md_run (mddev);
++ /*
++ * we have to clean up the mess if
++ * the array cannot be run for some
++ * reason ...
++ */
++ if (err) {
++ mddev->sb_dirty = 0;
++ do_md_stop (mddev, 0);
++ }
++ goto done_unlock;
++ }
++
++ default:
++ printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current->comm, current->pid);
++ err = -EINVAL;
++ goto abort_unlock;
++ }
++
++done_unlock:
++abort_unlock:
++ if (mddev)
++ unlock_mddev(mddev);
++ else
++ printk("huh11?\n");
++
++ return err;
++done:
++ if (err)
++ printk("huh12?\n");
++abort:
++ return err;
+ }
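
For reference, the new ioctl interface above is meant to be driven from
user space roughly like this -- a minimal sketch assuming the 0.90
md_u.h definitions (GET_ARRAY_INFO, mdu_array_info_t) shipped with this
patch; the device path and the trimmed error handling are illustrative
only:

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/raid/md_u.h>

	int main(void)
	{
		mdu_array_info_t info;
		int fd = open("/dev/md0", O_RDONLY);

		if (fd < 0)
			return 1;
		if (ioctl(fd, GET_ARRAY_INFO, &info) == 0)
			printf("md0: level %d, %d raid disks, %d active\n",
			       info.level, info.raid_disks, info.active_disks);
		close(fd);
		return 0;
	}
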
+
++
++#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
++
+ static int md_open (struct inode *inode, struct file *file)
+ {
+- int minor=MINOR(inode->i_rdev);
++ /*
++ * Always succeed
++ */
++ return (0);
++}
++
++static void md_release (struct inode *inode, struct file *file)
++{
++ sync_dev(inode->i_rdev);
++}
++
++
++static int md_read (struct inode *inode, struct file *file,
++ char *buf, int count)
++{
++ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
+
+- md_dev[minor].busy++;
+- return (0); /* Always succeed */
++ if (!mddev || !mddev->pers)
++ return -ENXIO;
++
++ return block_read (inode, file, buf, count);
+ }
+
++static int md_write (struct inode *inode, struct file *file,
++ const char *buf, int count)
++{
++ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
++
++ if (!mddev || !mddev->pers)
++ return -ENXIO;
+
+-static int md_release (struct inode *inode, struct file *file)
++ return block_write (inode, file, buf, count);
++}
++
++static struct file_operations md_fops=
+ {
+- int minor=MINOR(inode->i_rdev);
++ NULL,
++ md_read,
++ md_write,
++ NULL,
++ NULL,
++ md_ioctl,
++ NULL,
++ md_open,
++ md_release,
++ block_fsync
++};
++
++#else
+
+- sync_dev (inode->i_rdev);
+- md_dev[minor].busy--;
+- return 0;
++static int md_open (struct inode *inode, struct file *file)
++{
++ /*
++ * Always succeed
++ */
++ return (0);
+ }
+
++static int md_release (struct inode *inode, struct file *file)
++{
++ sync_dev(inode->i_rdev);
++ return 0;
++}
+
+ static ssize_t md_read (struct file *file, char *buf, size_t count,
+ loff_t *ppos)
+ {
+- int minor=MINOR(file->f_dentry->d_inode->i_rdev);
++ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
+
+- if (!md_dev[minor].pers) /* Check if device is being run */
+- return -ENXIO;
++ if (!mddev || !mddev->pers)
++ return -ENXIO;
+
+- return block_read(file, buf, count, ppos);
++ return block_read(file, buf, count, ppos);
+ }
+
+ static ssize_t md_write (struct file *file, const char *buf,
+ size_t count, loff_t *ppos)
+ {
+- int minor=MINOR(file->f_dentry->d_inode->i_rdev);
++ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
+
+- if (!md_dev[minor].pers) /* Check if device is being run */
+- return -ENXIO;
++ if (!mddev || !mddev->pers)
++ return -ENXIO;
+
+- return block_write(file, buf, count, ppos);
++ return block_write(file, buf, count, ppos);
+ }
+
+ static struct file_operations md_fops=
+ {
+- NULL,
+- md_read,
+- md_write,
+- NULL,
+- NULL,
+- md_ioctl,
+- NULL,
+- md_open,
+- NULL,
+- md_release,
+- block_fsync
++ NULL,
++ md_read,
++ md_write,
++ NULL,
++ NULL,
++ md_ioctl,
++ NULL,
++ md_open,
++ NULL,
++ md_release,
++ block_fsync
+ };
+
+-int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size)
++#endif
++
++int md_map (kdev_t dev, kdev_t *rdev,
++ unsigned long *rsector, unsigned long size)
+ {
+- if ((unsigned int) minor >= MAX_MD_DEV)
+- {
+- printk ("Bad md device %d\n", minor);
+- return (-1);
+- }
+-
+- if (!md_dev[minor].pers)
+- {
+- printk ("Oops ! md%d not running, giving up !\n", minor);
+- return (-1);
+- }
++ int err;
++ mddev_t *mddev = kdev_to_mddev(dev);
+
+- return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size));
++ if (!mddev || !mddev->pers) {
++ err = -ENXIO;
++ goto out;
++ }
++
++ err = mddev->pers->map(mddev, dev, rdev, rsector, size);
++out:
++ return err;
+ }
+
+-int md_make_request (int minor, int rw, struct buffer_head * bh)
++int md_make_request (struct buffer_head * bh, int rw)
+ {
+- if (md_dev [minor].pers->make_request) {
+- if (buffer_locked(bh))
+- return 0;
++ int err;
++ mddev_t *mddev = kdev_to_mddev(bh->b_dev);
++
++ if (!mddev || !mddev->pers) {
++ err = -ENXIO;
++ goto out;
++ }
++
++ if (mddev->pers->make_request) {
++ if (buffer_locked(bh)) {
++ err = 0;
++ goto out;
++ }
+ set_bit(BH_Lock, &bh->b_state);
+ if (rw == WRITE || rw == WRITEA) {
+ if (!buffer_dirty(bh)) {
+- bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
+- return 0;
++ bh->b_end_io(bh, buffer_uptodate(bh));
++ err = 0;
++ goto out;
+ }
+ }
+ if (rw == READ || rw == READA) {
+ if (buffer_uptodate(bh)) {
+- bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
+- return 0;
++ bh->b_end_io(bh, buffer_uptodate(bh));
++ err = 0;
++ goto out;
+ }
+ }
+- return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh));
++ err = mddev->pers->make_request(mddev, rw, bh);
+ } else {
+ make_request (MAJOR(bh->b_rdev), rw, bh);
+- return 0;
++ err = 0;
+ }
++out:
++ return err;
+ }
+
+ static void do_md_request (void)
+ {
+- printk ("Got md request, not good...");
+- return;
++ printk(KERN_ALERT "Got md request, not good...");
++ return;
++}
++
++int md_thread(void * arg)
++{
++ mdk_thread_t *thread = arg;
++
++ md_lock_kernel();
++ exit_mm(current);
++ exit_files(current);
++ exit_fs(current);
++
++ /*
++ * Detach thread
++ */
++ sys_setsid();
++ sprintf(current->comm, "%s", thread->name);
++ md_init_signals();
++ md_flush_signals();
++ thread->tsk = current;
++
++ /*
++ * md_thread is a 'system thread', its priority should be very
++ * high. We avoid resource deadlocks individually in each
++ * raid personality. (RAID5 does preallocation) We also use RR and
++ * the very same RT priority as kswapd, thus we will never get
++ * into a priority inversion deadlock.
++ *
++ * we definitely have to have equal or higher priority than
++ * bdflush, otherwise bdflush will deadlock if there are too
++ * many dirty RAID5 blocks.
++ */
++ current->policy = SCHED_OTHER;
++ current->priority = 40;
++
++ up(thread->sem);
++
++ for (;;) {
++ cli();
++ if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
++ if (!thread->run)
++ break;
++ interruptible_sleep_on(&thread->wqueue);
++ }
++ sti();
++ clear_bit(THREAD_WAKEUP, &thread->flags);
++ if (thread->run) {
++ thread->run(thread->data);
++ run_task_queue(&tq_disk);
++ }
++ if (md_signal_pending(current)) {
++ printk("%8s(%d) flushing signals.\n", current->comm,
++ current->pid);
++ md_flush_signals();
++ }
++ }
++ sti();
++ up(thread->sem);
++ return 0;
+ }
+
+-void md_wakeup_thread(struct md_thread *thread)
++void md_wakeup_thread(mdk_thread_t *thread)
+ {
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+ }
+
+-struct md_thread *md_register_thread (void (*run) (void *), void *data)
++mdk_thread_t *md_register_thread (void (*run) (void *),
++ void *data, const char *name)
+ {
+- struct md_thread *thread = (struct md_thread *)
+- kmalloc(sizeof(struct md_thread), GFP_KERNEL);
++ mdk_thread_t *thread;
+ int ret;
+ struct semaphore sem = MUTEX_LOCKED;
+
+- if (!thread) return NULL;
++ thread = (mdk_thread_t *) kmalloc
++ (sizeof(mdk_thread_t), GFP_KERNEL);
++ if (!thread)
++ return NULL;
+
+- memset(thread, 0, sizeof(struct md_thread));
++ memset(thread, 0, sizeof(mdk_thread_t));
+ init_waitqueue(&thread->wqueue);
+
+ thread->sem = &sem;
+ thread->run = run;
+ thread->data = data;
++ thread->name = name;
+ ret = kernel_thread(md_thread, thread, 0);
+ if (ret < 0) {
+ kfree(thread);
+@@ -836,270 +3032,407 @@
+ return thread;
+ }
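
To illustrate how a personality is expected to use this thread API, a
hypothetical sketch (the raidXd names are invented here, not part of
the patch); md_thread() above does the sleeping, so the work function
just handles one batch and returns:

	static mdk_thread_t *raidXd;

	static void raidXd_work (void *data)
	{
		/* data is the mddev_t * passed at registration;
		 * handle one batch of deferred work, then return --
		 * md_thread() re-sleeps until the next THREAD_WAKEUP */
	}

	static int raidX_start (mddev_t *mddev)
	{
		raidXd = md_register_thread(raidXd_work, mddev, "raidXd");
		if (!raidXd)
			return -ENOMEM;
		md_wakeup_thread(raidXd);	/* kick a first pass */
		return 0;
	}

	static void raidX_stop (void)
	{
		/* synchronous: md_unregister_thread() waits on the
		 * semaphore until md_thread() has left its loop */
		md_unregister_thread(raidXd);
	}
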
+
+-void md_unregister_thread (struct md_thread *thread)
++void md_interrupt_thread (mdk_thread_t *thread)
++{
++ if (!thread->tsk) {
++ MD_BUG();
++ return;
++ }
++ printk("interrupting MD-thread pid %d\n", thread->tsk->pid);
++ send_sig(SIGKILL, thread->tsk, 1);
++}
++
++void md_unregister_thread (mdk_thread_t *thread)
+ {
+ struct semaphore sem = MUTEX_LOCKED;
+
+ thread->sem = &sem;
+ thread->run = NULL;
+- if (thread->tsk)
+- printk("Killing md_thread %d %p %s\n",
+- thread->tsk->pid, thread->tsk, thread->tsk->comm);
+- else
+- printk("Aiee. md_thread has 0 tsk\n");
+- send_sig(SIGKILL, thread->tsk, 1);
+- printk("downing on %p\n", &sem);
++ thread->name = NULL;
++ if (!thread->tsk) {
++ MD_BUG();
++ return;
++ }
++ md_interrupt_thread(thread);
+ down(&sem);
+ }
+
+-#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))
+-
+-int md_thread(void * arg)
++void md_recover_arrays (void)
+ {
+- struct md_thread *thread = arg;
+-
+- lock_kernel();
+- exit_mm(current);
+- exit_files(current);
+- exit_fs(current);
+-
+- current->session = 1;
+- current->pgrp = 1;
+- sprintf(current->comm, "md_thread");
+- siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
+- thread->tsk = current;
+- up(thread->sem);
+-
+- for (;;) {
+- cli();
+- if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
+- do {
+- spin_lock(&current->sigmask_lock);
+- flush_signals(current);
+- spin_unlock(&current->sigmask_lock);
+- interruptible_sleep_on(&thread->wqueue);
+- cli();
+- if (test_bit(THREAD_WAKEUP, &thread->flags))
+- break;
+- if (!thread->run) {
+- sti();
+- up(thread->sem);
+- return 0;
+- }
+- } while (signal_pending(current));
+- }
+- sti();
+- clear_bit(THREAD_WAKEUP, &thread->flags);
+- if (thread->run) {
+- thread->run(thread->data);
+- run_task_queue(&tq_disk);
+- }
++ if (!md_recovery_thread) {
++ MD_BUG();
++ return;
+ }
++ md_wakeup_thread(md_recovery_thread);
+ }
+
+-EXPORT_SYMBOL(md_size);
+-EXPORT_SYMBOL(md_maxreadahead);
+-EXPORT_SYMBOL(register_md_personality);
+-EXPORT_SYMBOL(unregister_md_personality);
+-EXPORT_SYMBOL(partition_name);
+-EXPORT_SYMBOL(md_dev);
+-EXPORT_SYMBOL(md_error);
+-EXPORT_SYMBOL(md_register_thread);
+-EXPORT_SYMBOL(md_unregister_thread);
+-EXPORT_SYMBOL(md_update_sb);
+-EXPORT_SYMBOL(md_map);
+-EXPORT_SYMBOL(md_wakeup_thread);
+-EXPORT_SYMBOL(md_do_sync);
+
+-#ifdef CONFIG_PROC_FS
+-static struct proc_dir_entry proc_md = {
+- PROC_MD, 6, "mdstat",
+- S_IFREG | S_IRUGO, 1, 0, 0,
+- 0, &proc_array_inode_operations,
+-};
++int md_error (kdev_t dev, kdev_t rdev)
++{
++ mddev_t *mddev = kdev_to_mddev(dev);
++ mdk_rdev_t * rrdev;
++ int rc;
++
++ if (!mddev) {
++ MD_BUG();
++ return 0;
++ }
++ rrdev = find_rdev(mddev, rdev);
++ mark_rdev_faulty(rrdev);
++ /*
++ * if recovery was running, stop it now.
++ */
++ if (mddev->pers->stop_resync)
++ mddev->pers->stop_resync(mddev);
++ if (mddev->recovery_running)
++ md_interrupt_thread(md_recovery_thread);
++ if (mddev->pers->error_handler) {
++ rc = mddev->pers->error_handler(mddev, rdev);
++ md_recover_arrays();
++ return rc;
++ }
++#if 0
++ /*
++ * Drop all buffers in the failed array.
++ * _not_. This is called from IRQ handlers ...
++ */
++ invalidate_buffers(rdev);
+ #endif
++ return 0;
++}
+
+-static void md_geninit (struct gendisk *gdisk)
++static int status_unused (char * page)
+ {
+- int i;
+-
+- for(i=0;i<MAX_MD_DEV;i++)
+- {
+- md_blocksizes[i] = 1024;
+- md_maxreadahead[i] = MD_DEFAULT_DISK_READAHEAD;
+- md_gendisk.part[i].start_sect=-1; /* avoid partition check */
+- md_gendisk.part[i].nr_sects=0;
+- md_dev[i].pers=NULL;
+- }
++ int sz = 0, i = 0;
++ mdk_rdev_t *rdev;
++ struct md_list_head *tmp;
+
+- blksize_size[MD_MAJOR] = md_blocksizes;
+- max_readahead[MD_MAJOR] = md_maxreadahead;
++ sz += sprintf(page + sz, "unused devices: ");
+
+-#ifdef CONFIG_PROC_FS
+- proc_register(&proc_root, &proc_md);
+-#endif
++ ITERATE_RDEV_ALL(rdev,tmp) {
++ if (!rdev->same_set.next && !rdev->same_set.prev) {
++ /*
++ * The device is not yet used by any array.
++ */
++ i++;
++ sz += sprintf(page + sz, "%s ",
++ partition_name(rdev->dev));
++ }
++ }
++ if (!i)
++ sz += sprintf(page + sz, "<none>");
++
++ sz += sprintf(page + sz, "\n");
++ return sz;
+ }
+
+-int md_error (kdev_t mddev, kdev_t rdev)
++
++static int status_resync (char * page, mddev_t * mddev)
+ {
+- unsigned int minor = MINOR (mddev);
+- int rc;
++ int sz = 0;
++ unsigned int blocksize, max_blocks, resync, res, dt, tt, et;
+
+- if (MAJOR(mddev) != MD_MAJOR || minor > MAX_MD_DEV)
+- panic ("md_error gets unknown device\n");
+- if (!md_dev [minor].pers)
+- panic ("md_error gets an error for an unknown device\n");
+- if (md_dev [minor].pers->error_handler) {
+- rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev);
+-#if SUPPORT_RECONSTRUCTION
+- md_wakeup_thread(md_sync_thread);
+-#endif /* SUPPORT_RECONSTRUCTION */
+- return rc;
+- }
+- return 0;
++ resync = mddev->curr_resync;
++ blocksize = blksize_size[MD_MAJOR][mdidx(mddev)];
++ max_blocks = blk_size[MD_MAJOR][mdidx(mddev)] / (blocksize >> 10);
++
++ /*
++ * Should not happen.
++ */
++ if (!max_blocks) {
++ MD_BUG();
++ return 0;
++ }
++ res = resync*100/max_blocks;
++ if (!mddev->recovery_running)
++ /*
++ * true resync
++ */
++ sz += sprintf(page + sz, " resync=%u%%", res);
++ else
++ /*
++ * recovery ...
++ */
++ sz += sprintf(page + sz, " recovery=%u%%", res);
++
++ /*
++ * We do not want to overflow, so the order of operands and
++ * the * 100 / 100 trick are important. We do a +1 to be
++ * safe against division by zero. We only estimate anyway.
++ *
++ * dt: time until now
++ * tt: total time
++ * et: estimated finish time
++ */
++ dt = ((jiffies - mddev->resync_start) / HZ);
++ tt = (dt * (max_blocks / (resync/100+1)))/100;
++ if (tt > dt)
++ et = tt - dt;
++ else
++ /*
++ * ignore rounding effects near finish time
++ */
++ et = 0;
++
++ sz += sprintf(page + sz, " finish=%u.%umin", et / 60, (et % 60)/6);
++
++ return sz;
+ }
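
The overflow-wary estimate above is easier to follow with concrete
numbers; a stand-alone sketch of the same formula (all values
invented): 250000 of 1000000 blocks done after 600 seconds gives
tt = (600 * (1000000/2501))/100 = 2394s, i.e. about 30 minutes left.

	#include <stdio.h>

	int main(void)
	{
		unsigned int max_blocks = 1000000;	/* array size, blocks */
		unsigned int resync = 250000;		/* blocks done so far */
		unsigned int dt = 600;			/* seconds since resync_start */

		/* same operand order as status_resync(): the inner /100
		 * keeps dt * (...) small, +1 guards against resync == 0 */
		unsigned int tt = (dt * (max_blocks / (resync/100 + 1))) / 100;
		unsigned int et = tt > dt ? tt - dt : 0;

		printf("%u%% done, finish=%u.%umin\n",
		       resync * 100 / max_blocks, et / 60, (et % 60) / 6);
		return 0;
	}
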
+
+ int get_md_status (char *page)
+ {
+- int sz=0, i, j, size;
+-
+- sz+=sprintf( page+sz, "Personalities : ");
+- for (i=0; i<MAX_PERSONALITY; i++)
+- if (pers[i])
+- sz+=sprintf (page+sz, "[%d %s] ", i, pers[i]->name);
+-
+- page[sz-1]='\n';
+-
+- sz+=sprintf (page+sz, "read_ahead ");
+- if (read_ahead[MD_MAJOR]==INT_MAX)
+- sz+=sprintf (page+sz, "not set\n");
+- else
+- sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
++ int sz = 0, j, size;
++ struct md_list_head *tmp, *tmp2;
++ mdk_rdev_t *rdev;
++ mddev_t *mddev;
++
++ sz += sprintf(page + sz, "Personalities : ");
++ for (j = 0; j < MAX_PERSONALITY; j++)
++ if (pers[j])
++ sz += sprintf(page+sz, "[%s] ", pers[j]->name);
++
++ sz += sprintf(page+sz, "\n");
++
++
++ sz += sprintf(page+sz, "read_ahead ");
++ if (read_ahead[MD_MAJOR] == INT_MAX)
++ sz += sprintf(page+sz, "not set\n");
++ else
++ sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
+
+- for (i=0; i<MAX_MD_DEV; i++)
+- {
+- sz+=sprintf (page+sz, "md%d : %sactive", i, md_dev[i].pers ? "" : "in");
+-
+- if (md_dev[i].pers)
+- sz+=sprintf (page+sz, " %s", md_dev[i].pers->name);
++ ITERATE_MDDEV(mddev,tmp) {
++ sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
++ mddev->pers ? "" : "in");
++ if (mddev->pers) {
++ if (mddev->ro)
++ sz += sprintf(page + sz, " (read-only)");
++ sz += sprintf(page + sz, " %s", mddev->pers->name);
++ }
+
+- size=0;
+- for (j=0; j<md_dev[i].nb_dev; j++)
+- {
+- sz+=sprintf (page+sz, " %s",
+- partition_name(md_dev[i].devices[j].dev));
+- size+=md_dev[i].devices[j].size;
+- }
++ size = 0;
++ ITERATE_RDEV(mddev,rdev,tmp2) {
++ sz += sprintf(page + sz, " %s[%d]",
++ partition_name(rdev->dev), rdev->desc_nr);
++ if (rdev->faulty) {
++ sz += sprintf(page + sz, "(F)");
++ continue;
++ }
++ size += rdev->size;
++ }
+
+- if (md_dev[i].nb_dev) {
+- if (md_dev[i].pers)
+- sz+=sprintf (page+sz, " %d blocks", md_size[i]);
+- else
+- sz+=sprintf (page+sz, " %d blocks", size);
+- }
++ if (mddev->nb_dev) {
++ if (mddev->pers)
++ sz += sprintf(page + sz, " %d blocks",
++ md_size[mdidx(mddev)]);
++ else
++ sz += sprintf(page + sz, " %d blocks", size);
++ }
+
+- if (!md_dev[i].pers)
+- {
+- sz+=sprintf (page+sz, "\n");
+- continue;
+- }
++ if (!mddev->pers) {
++ sz += sprintf(page+sz, "\n");
++ continue;
++ }
+
+- if (md_dev[i].pers->max_invalid_dev)
+- sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i));
++ sz += mddev->pers->status (page+sz, mddev);
+
+- sz+=md_dev[i].pers->status (page+sz, i, md_dev+i);
+- sz+=sprintf (page+sz, "\n");
+- }
++ if (mddev->curr_resync)
++ sz += status_resync (page+sz, mddev);
++ else {
++ if (md_atomic_read(&mddev->resync_sem.count) != 1)
++ sz += sprintf(page + sz, " resync=DELAYED");
++ }
++ sz += sprintf(page + sz, "\n");
++ }
++ sz += status_unused (page + sz);
+
+- return (sz);
++ return (sz);
+ }
+
+-int register_md_personality (int p_num, struct md_personality *p)
++int register_md_personality (int pnum, mdk_personality_t *p)
+ {
+- int i=(p_num >> PERSONALITY_SHIFT);
+-
+- if (i >= MAX_PERSONALITY)
+- return -EINVAL;
++ if (pnum >= MAX_PERSONALITY)
++ return -EINVAL;
+
+- if (pers[i])
+- return -EBUSY;
++ if (pers[pnum])
++ return -EBUSY;
+
+- pers[i]=p;
+- printk ("%s personality registered\n", p->name);
+- return 0;
++ pers[pnum] = p;
++ printk(KERN_INFO "%s personality registered\n", p->name);
++ return 0;
+ }
+
+-int unregister_md_personality (int p_num)
++int unregister_md_personality (int pnum)
+ {
+- int i=(p_num >> PERSONALITY_SHIFT);
+-
+- if (i >= MAX_PERSONALITY)
+- return -EINVAL;
++ if (pnum >= MAX_PERSONALITY)
++ return -EINVAL;
+
+- printk ("%s personality unregistered\n", pers[i]->name);
+- pers[i]=NULL;
+- return 0;
++ printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
++ pers[pnum] = NULL;
++ return 0;
+ }
+
+-static md_descriptor_t *get_spare(struct md_dev *mddev)
++static mdp_disk_t *get_spare(mddev_t *mddev)
+ {
+- int i;
+- md_superblock_t *sb = mddev->sb;
+- md_descriptor_t *descriptor;
+- struct real_dev *realdev;
+-
+- for (i = 0; i < mddev->nb_dev; i++) {
+- realdev = &mddev->devices[i];
+- if (!realdev->sb)
++ mdp_super_t *sb = mddev->sb;
++ mdp_disk_t *disk;
++ mdk_rdev_t *rdev;
++ struct md_list_head *tmp;
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->faulty)
++ continue;
++ if (!rdev->sb) {
++ MD_BUG();
+ continue;
+- descriptor = &sb->disks[realdev->sb->descriptor.number];
+- if (descriptor->state & (1 << MD_FAULTY_DEVICE))
++ }
++ disk = &sb->disks[rdev->desc_nr];
++ if (disk_faulty(disk)) {
++ MD_BUG();
+ continue;
+- if (descriptor->state & (1 << MD_ACTIVE_DEVICE))
++ }
++ if (disk_active(disk))
+ continue;
+- return descriptor;
++ return disk;
+ }
+ return NULL;
+ }
+
++static int is_mddev_idle (mddev_t *mddev)
++{
++ mdk_rdev_t * rdev;
++ struct md_list_head *tmp;
++ int idle;
++ unsigned long curr_events;
++
++ idle = 1;
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ curr_events = io_events[MAJOR(rdev->dev)];
++
++ if (curr_events != rdev->last_events) {
++// printk("!I(%d)", curr_events-rdev->last_events);
++ rdev->last_events = curr_events;
++ idle = 0;
++ }
++ }
++ return idle;
++}
++
+ /*
+ * parallel resyncing thread.
+- *
+- * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
+- * - fix read error handing
+ */
+
+-int md_do_sync(struct md_dev *mddev)
++/*
++ * Determine correct block size for this device.
++ */
++unsigned int device_bsize (kdev_t dev)
++{
++ unsigned int i, correct_size;
++
++ correct_size = BLOCK_SIZE;
++ if (blksize_size[MAJOR(dev)]) {
++ i = blksize_size[MAJOR(dev)][MINOR(dev)];
++ if (i)
++ correct_size = i;
++ }
++
++ return correct_size;
++}
++
++static struct wait_queue *resync_wait = (struct wait_queue *)NULL;
++
++#define RA_ORDER (1)
++#define RA_PAGE_SIZE (PAGE_SIZE*(1<<RA_ORDER))
++#define MAX_NR_BLOCKS (RA_PAGE_SIZE/sizeof(struct buffer_head *))
++
++int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
+ {
+- struct buffer_head *bh;
+- int max_blocks, blocksize, curr_bsize, percent=1, j;
+- kdev_t read_disk = MKDEV(MD_MAJOR, mddev - md_dev);
++ mddev_t *mddev2;
++ struct buffer_head **bh;
++ unsigned int max_blocks, blocksize, curr_bsize,
++ i, ii, j, k, chunk, window, nr_blocks, err, serialize;
++ kdev_t read_disk = mddev_to_kdev(mddev);
+ int major = MAJOR(read_disk), minor = MINOR(read_disk);
+ unsigned long starttime;
++ int max_read_errors = 2*MAX_NR_BLOCKS,
++ max_write_errors = 2*MAX_NR_BLOCKS;
++ struct md_list_head *tmp;
++
++retry_alloc:
++ bh = (struct buffer_head **) md__get_free_pages(GFP_KERNEL, RA_ORDER);
++ if (!bh) {
++ printk(KERN_ERR
++ "could not alloc bh array for reconstruction ... retrying!\n");
++ goto retry_alloc;
++ }
++
++ err = down_interruptible(&mddev->resync_sem);
++ if (err)
++ goto out_nolock;
++
++recheck:
++ serialize = 0;
++ ITERATE_MDDEV(mddev2,tmp) {
++ if (mddev2 == mddev)
++ continue;
++ if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
++ printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2));
++ serialize = 1;
++ break;
++ }
++ }
++ if (serialize) {
++ interruptible_sleep_on(&resync_wait);
++ if (md_signal_pending(current)) {
++ md_flush_signals();
++ err = -EINTR;
++ goto out;
++ }
++ goto recheck;
++ }
++
++ mddev->curr_resync = 1;
+
+- blocksize = blksize_size[major][minor];
++ blocksize = device_bsize(read_disk);
+ max_blocks = blk_size[major][minor] / (blocksize >> 10);
+
+- printk("... resync log\n");
+- printk(" .... mddev->nb_dev: %d\n", mddev->nb_dev);
+- printk(" .... raid array: %s\n", kdevname(read_disk));
+- printk(" .... max_blocks: %d blocksize: %d\n", max_blocks, blocksize);
+- printk("md: syncing RAID array %s\n", kdevname(read_disk));
++ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
++ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec.\n",
++ sysctl_speed_limit);
++ printk(KERN_INFO "md: using maximum available idle IO bandwith for reconstruction.\n");
++
++ /*
++ * Resync has low priority.
++ */
++ current->priority = 1;
++
++ is_mddev_idle(mddev); /* this also initializes IO event counters */
++ starttime = jiffies;
++ mddev->resync_start = starttime;
+
+- mddev->busy++;
++ /*
++ * Tune reconstruction:
++ */
++ window = md_maxreadahead[mdidx(mddev)]/1024;
++ nr_blocks = window / (blocksize >> 10);
++ if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
++ nr_blocks = MAX_NR_BLOCKS;
++ printk(KERN_INFO "md: using %dk window.\n",window);
+
+- starttime=jiffies;
+- for (j = 0; j < max_blocks; j++) {
++ for (j = 0; j < max_blocks; j += nr_blocks) {
+
++ if (j)
++ mddev->curr_resync = j;
+ /*
+ * Be careful. When someone mounts a non-'blocksize' filesystem
+ * then we get the blocksize changed right under us. Go deal
+ * with it transparently, recalculate 'blocksize', 'j' and
+ * 'max_blocks':
+ */
+- curr_bsize = blksize_size[major][minor];
++ curr_bsize = device_bsize(read_disk);
+ if (curr_bsize != blocksize) {
+- diff_blocksize:
++ printk(KERN_INFO "md%d: blocksize changed\n",
++ mdidx(mddev));
++retry_read:
+ if (curr_bsize > blocksize)
+ /*
+ * this is safe, rounds downwards.
+@@ -1109,114 +3442,384 @@
+ j *= blocksize/curr_bsize;
+
+ blocksize = curr_bsize;
++ nr_blocks = window / (blocksize >> 10);
++ if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
++ nr_blocks = MAX_NR_BLOCKS;
+ max_blocks = blk_size[major][minor] / (blocksize >> 10);
+- }
+- if ((bh = breada (read_disk, j, blocksize, j * blocksize,
+- max_blocks * blocksize)) != NULL) {
+- mark_buffer_dirty(bh, 1);
+- brelse(bh);
+- } else {
++ printk("nr_blocks changed to %d (blocksize %d, j %d, max_blocks %d)\n",
++ nr_blocks, blocksize, j, max_blocks);
+ /*
+- * FIXME: Ugly, but set_blocksize() isnt safe ...
++ * We will retry the current block-group
+ */
+- curr_bsize = blksize_size[major][minor];
+- if (curr_bsize != blocksize)
+- goto diff_blocksize;
++ }
+
+- /*
+- * It's a real read problem. FIXME, handle this
+- * a better way.
+- */
+- printk ( KERN_ALERT
+- "read error, stopping reconstruction.\n");
+- mddev->busy--;
+- return 1;
++ /*
++ * Cleanup routines expect this
++ */
++ for (k = 0; k < nr_blocks; k++)
++ bh[k] = NULL;
++
++ chunk = nr_blocks;
++ if (chunk > max_blocks-j)
++ chunk = max_blocks-j;
++
++ /*
++ * request buffer heads ...
++ */
++ for (i = 0; i < chunk; i++) {
++ bh[i] = getblk (read_disk, j+i, blocksize);
++ if (!bh[i])
++ goto read_error;
++ if (!buffer_dirty(bh[i]))
++ mark_buffer_lowprio(bh[i]);
+ }
+
+ /*
+- * Let's sleep some if we are faster than our speed limit:
++ * read buffer heads ...
+ */
+- while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT)
+- {
+- current->state = TASK_INTERRUPTIBLE;
+- schedule_timeout(1);
++ ll_rw_block (READ, chunk, bh);
++ run_task_queue(&tq_disk);
++
++ /*
++ * verify that all of them are OK ...
++ */
++ for (i = 0; i < chunk; i++) {
++ ii = chunk-i-1;
++ wait_on_buffer(bh[ii]);
++ if (!buffer_uptodate(bh[ii]))
++ goto read_error;
++ }
++
++retry_write:
++ for (i = 0; i < chunk; i++)
++ mark_buffer_dirty_lowprio(bh[i]);
++
++ ll_rw_block(WRITE, chunk, bh);
++ run_task_queue(&tq_disk);
++
++ for (i = 0; i < chunk; i++) {
++ ii = chunk-i-1;
++ wait_on_buffer(bh[ii]);
++
++ if (spare && disk_faulty(spare)) {
++ for (k = 0; k < chunk; k++)
++ brelse(bh[k]);
++ printk(" <SPARE FAILED!>\n ");
++ err = -EIO;
++ goto out;
++ }
++
++ if (!buffer_uptodate(bh[ii])) {
++ curr_bsize = device_bsize(read_disk);
++ if (curr_bsize != blocksize) {
++ printk(KERN_INFO
++ "md%d: blocksize changed during write\n",
++ mdidx(mddev));
++ for (k = 0; k < chunk; k++)
++ if (bh[k]) {
++ if (buffer_lowprio(bh[k]))
++ mark_buffer_clean(bh[k]);
++ brelse(bh[k]);
++ }
++ goto retry_read;
++ }
++ printk(" BAD WRITE %8d>\n", j);
++ /*
++ * Ouch, write error, retry or bail out.
++ */
++ if (max_write_errors) {
++ max_write_errors--;
++ printk ( KERN_WARNING "md%d: write error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
++ goto retry_write;
++ }
++ printk ( KERN_ALERT
++ "too many write errors, stopping reconstruction.\n");
++ for (k = 0; k < chunk; k++)
++ if (bh[k]) {
++ if (buffer_lowprio(bh[k]))
++ mark_buffer_clean(bh[k]);
++ brelse(bh[k]);
++ }
++ err = -EIO;
++ goto out;
++ }
+ }
+
+ /*
+- * FIXME: put this status bar thing into /proc
++ * This is the normal 'everything went OK' case:
++ * do the 'free-behind' logic, we surely don't need
++ * this buffer if it was the only user.
+ */
+- if (!(j%(max_blocks/100))) {
+- if (!(percent%10))
+- printk (" %03d%% done.\n",percent);
++ for (i = 0; i < chunk; i++)
++ if (buffer_dirty(bh[i]))
++ brelse(bh[i]);
+ else
+- printk (".");
+- percent++;
++ bforget(bh[i]);
++
++
++ if (md_signal_pending(current)) {
++ /*
++ * got a signal, exit.
++ */
++ mddev->curr_resync = 0;
++ printk("md_do_sync() got signal ... exiting\n");
++ md_flush_signals();
++ err = -EINTR;
++ goto out;
+ }
++
++ /*
++ * this loop exits only when either we are slower than
++ * the 'hard' speed limit, or the system was IO-idle for
++ * a jiffy.
++ * the system might be non-idle CPU-wise, but we only care
++ * about not overloading the IO subsystem. (things like an
++ * e2fsck being done on the RAID array should execute fast)
++ */
++repeat:
++ if (md_need_resched(current))
++ schedule();
++
++ if ((blocksize/1024)*j/((jiffies-starttime)/HZ + 1) + 1
++ > sysctl_speed_limit) {
++ current->priority = 1;
++
++ if (!is_mddev_idle(mddev)) {
++ current->state = TASK_INTERRUPTIBLE;
++ md_schedule_timeout(HZ/2);
++ if (!md_signal_pending(current))
++ goto repeat;
++ }
++ } else
++ current->priority = 40;
+ }
+ fsync_dev(read_disk);
+- printk("md: %s: sync done.\n", kdevname(read_disk));
+- mddev->busy--;
+- return 0;
++ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
++ err = 0;
++ /*
++ * this also signals 'finished resyncing' to md_stop
++ */
++out:
++ up(&mddev->resync_sem);
++out_nolock:
++ free_pages((unsigned long)bh, RA_ORDER);
++ mddev->curr_resync = 0;
++ wake_up(&resync_wait);
++ return err;
++
++read_error:
++ /*
++ * set_blocksize() might change the blocksize. This
++ * should not happen often, but it happens when eg.
++ * someone mounts a filesystem that has non-1k
++ * blocksize. set_blocksize() doesn't touch our
++ * buffer, but to avoid aliasing problems we change
++ * our internal blocksize too and retry the read.
++ */
++ curr_bsize = device_bsize(read_disk);
++ if (curr_bsize != blocksize) {
++ printk(KERN_INFO "md%d: blocksize changed during read\n",
++ mdidx(mddev));
++ for (k = 0; k < chunk; k++)
++ if (bh[k]) {
++ if (buffer_lowprio(bh[k]))
++ mark_buffer_clean(bh[k]);
++ brelse(bh[k]);
++ }
++ goto retry_read;
++ }
++
++ /*
++ * It's a real read problem. We retry and bail out
++ * only if it's excessive.
++ */
++ if (max_read_errors) {
++ max_read_errors--;
++ printk ( KERN_WARNING "md%d: read error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
++ for (k = 0; k < chunk; k++)
++ if (bh[k]) {
++ if (buffer_lowprio(bh[k]))
++ mark_buffer_clean(bh[k]);
++ brelse(bh[k]);
++ }
++ goto retry_read;
++ }
++ printk ( KERN_ALERT "too many read errors, stopping reconstruction.\n");
++ for (k = 0; k < chunk; k++)
++ if (bh[k]) {
++ if (buffer_lowprio(bh[k]))
++ mark_buffer_clean(bh[k]);
++ brelse(bh[k]);
++ }
++ err = -EIO;
++ goto out;
+ }
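
The throttling in the repeat: section boils down to one expression:
achieved KB/sec so far versus the sysctl floor. A stand-alone sketch
with invented numbers (30000 1K blocks in 200s is 150 KB/sec, above a
100 KB/sec floor, so the thread would yield whenever the disks are
busy):

	#include <stdio.h>

	int main(void)
	{
		unsigned int blocksize = 1024;		/* bytes */
		unsigned int j = 30000;			/* blocks synced so far */
		unsigned int elapsed = 200;		/* (jiffies-starttime)/HZ */
		unsigned int speed_limit = 100;		/* sysctl floor, KB/sec */
		unsigned int kb_per_sec = (blocksize/1024) * j / (elapsed + 1) + 1;

		if (kb_per_sec > speed_limit)
			printf("%u KB/sec: above the floor, throttle if disks are busy\n",
			       kb_per_sec);
		else
			printf("%u KB/sec: below the guaranteed minimum, run flat out\n",
			       kb_per_sec);
		return 0;
	}
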
+
++#undef MAX_NR_BLOCKS
++
+ /*
+- * This is a kernel thread which: syncs a spare disk with the active array
++ * This is a kernel thread which syncs a spare disk with the active array
+ *
+ * the amount of foolproofing might seem to be a tad excessive, but an
+ * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
+ * of my root partition with the first 0.5 gigs of my /home partition ... so
+ * i'm a bit nervous ;)
+ */
+-void mdsyncd (void *data)
++void md_do_recovery (void *data)
+ {
+- int i;
+- struct md_dev *mddev;
+- md_superblock_t *sb;
+- md_descriptor_t *spare;
++ int err;
++ mddev_t *mddev;
++ mdp_super_t *sb;
++ mdp_disk_t *spare;
+ unsigned long flags;
++ struct md_list_head *tmp;
+
+- for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) {
+- if ((sb = mddev->sb) == NULL)
++ printk(KERN_INFO "md: recovery thread got woken up ...\n");
++restart:
++ ITERATE_MDDEV(mddev,tmp) {
++ sb = mddev->sb;
++ if (!sb)
++ continue;
++ if (mddev->recovery_running)
+ continue;
+ if (sb->active_disks == sb->raid_disks)
+ continue;
+- if (!sb->spare_disks)
++ if (!sb->spare_disks) {
++ printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
+ continue;
++ }
++ /*
++ * now here we get the spare and resync it.
++ */
+ if ((spare = get_spare(mddev)) == NULL)
+ continue;
+- if (!mddev->pers->mark_spare)
++ printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
++ if (!mddev->pers->diskop)
+ continue;
+- if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE))
++ if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
+ continue;
+- if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) {
+- mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE);
++ down(&mddev->recovery_sem);
++ mddev->recovery_running = 1;
++ err = md_do_sync(mddev, spare);
++ if (err == -EIO) {
++ printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
++ if (!disk_faulty(spare)) {
++ mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
++ mark_disk_faulty(spare);
++ mark_disk_nonsync(spare);
++ mark_disk_inactive(spare);
++ sb->spare_disks--;
++ sb->working_disks--;
++ sb->failed_disks++;
++ }
++ } else
++ if (disk_faulty(spare))
++ mddev->pers->diskop(mddev, &spare,
++ DISKOP_SPARE_INACTIVE);
++ if (err == -EINTR) {
++ /*
++ * Recovery got interrupted ...
++ * signal back that we have finished using the array.
++ */
++ mddev->pers->diskop(mddev, &spare,
++ DISKOP_SPARE_INACTIVE);
++ up(&mddev->recovery_sem);
++ mddev->recovery_running = 0;
+ continue;
++ } else {
++ mddev->recovery_running = 0;
++ up(&mddev->recovery_sem);
+ }
+ save_flags(flags);
+ cli();
+- mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE);
+- spare->state |= (1 << MD_SYNC_DEVICE);
+- spare->state |= (1 << MD_ACTIVE_DEVICE);
+- sb->spare_disks--;
+- sb->active_disks++;
+- mddev->sb_dirty = 1;
+- md_update_sb(mddev - md_dev);
++ if (!disk_faulty(spare)) {
++ /*
++ * the SPARE_ACTIVE diskop possibly changes the
++ * pointer too
++ */
++ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
++ mark_disk_sync(spare);
++ mark_disk_active(spare);
++ sb->active_disks++;
++ sb->spare_disks--;
++ }
+ restore_flags(flags);
++ mddev->sb_dirty = 1;
++ md_update_sb(mddev);
++ goto restart;
+ }
++ printk(KERN_INFO "md: recovery thread finished ...\n");
+
+ }
+
++int md_notify_reboot(struct notifier_block *this,
++ unsigned long code, void *x)
++{
++ struct md_list_head *tmp;
++ mddev_t *mddev;
++
++ if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
++ || (code == MD_SYS_POWER_OFF)) {
++
++ printk(KERN_INFO "stopping all md devices.\n");
++
++ ITERATE_MDDEV(mddev,tmp)
++ do_md_stop (mddev, 1);
++ /*
++ * certain more exotic SCSI devices are known to be
++ * volatile wrt too early system reboots. While the
++ * right place to handle this issue is the given
++ * driver, we do want to have a safe RAID driver ...
++ */
++ md_mdelay(1000*1);
++ }
++ return NOTIFY_DONE;
++}
++
++struct notifier_block md_notifier = {
++ md_notify_reboot,
++ NULL,
++ 0
++};
++
++md__initfunc(void raid_setup(char *str, int *ints))
++{
++ char tmpline[100];
++ int len, pos, nr, i;
++
++ len = strlen(str) + 1;
++ nr = 0;
++ pos = 0;
++
++ for (i = 0; i < len; i++) {
++ char c = str[i];
++
++ if (c == ',' || !c) {
++ tmpline[pos] = 0;
++ if (!strcmp(tmpline,"noautodetect"))
++ raid_setup_args.noautodetect = 1;
++ nr++;
++ pos = 0;
++ continue;
++ }
++ tmpline[pos] = c;
++ pos++;
++ }
++ raid_setup_args.set = 1;
++ return;
++}
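
raid_setup() is the handler for a kernel command line option
(presumably wired up as "raid=" in init/main.c elsewhere in this
patch); the only keyword it parses so far disables 0xfd partition
autodetection, e.g. in /etc/lilo.conf:

	append = "raid=noautodetect"
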
++
+ #ifdef CONFIG_MD_BOOT
+ struct {
+ int set;
+ int ints[100];
+ char str[100];
+-} md_setup_args __initdata = {
++} md_setup_args md__initdata = {
+ 0,{0},{0}
+ };
+
+ /* called from init/main.c */
+-__initfunc(void md_setup(char *str,int *ints))
++md__initfunc(void md_setup(char *str,int *ints))
+ {
+ int i;
+ for(i=0;i<=ints[0];i++) {
+@@ -1228,21 +3831,24 @@
+ return;
+ }
+
+-__initfunc(void do_md_setup(char *str,int *ints))
++md__initfunc(void do_md_setup(char *str,int *ints))
+ {
+- int minor, pers, factor, fault;
++#if 0
++ int minor, pers, chunk_size, fault;
+ kdev_t dev;
+ int i=1;
+
++ printk("i plan to phase this out --mingo\n");
++
+ if(ints[0] < 4) {
+- printk ("md: Too few Arguments (%d).\n", ints[0]);
++ printk (KERN_WARNING "md: Too few Arguments (%d).\n", ints[0]);
+ return;
+ }
+
+ minor=ints[i++];
+
+- if (minor >= MAX_MD_DEV) {
+- printk ("md: Minor device number too high.\n");
++ if ((unsigned int)minor >= MAX_MD_DEVS) {
++ printk (KERN_WARNING "md: Minor device number too high.\n");
+ return;
+ }
+
+@@ -1252,18 +3858,20 @@
+ case -1:
+ #ifdef CONFIG_MD_LINEAR
+ pers = LINEAR;
+- printk ("md: Setting up md%d as linear device.\n",minor);
++ printk (KERN_INFO "md: Setting up md%d as linear device.\n",
++ minor);
+ #else
+- printk ("md: Linear mode not configured."
++ printk (KERN_WARNING "md: Linear mode not configured."
+ "Recompile the kernel with linear mode enabled!\n");
+ #endif
+ break;
+ case 0:
+ pers = STRIPED;
+ #ifdef CONFIG_MD_STRIPED
+- printk ("md: Setting up md%d as a striped device.\n",minor);
++ printk (KERN_INFO "md: Setting up md%d as a striped device.\n",
++ minor);
+ #else
+- printk ("md: Striped mode not configured."
++ printk (KERN_WARNING "md: Striped mode not configured."
+ "Recompile the kernel with striped mode enabled!\n");
+ #endif
+ break;
+@@ -1278,79 +3886,145 @@
+ break;
+ */
+ default:
+- printk ("md: Unknown or not supported raid level %d.\n", ints[--i]);
++ printk (KERN_WARNING "md: Unknown or not supported raid level %d.\n", ints[--i]);
+ return;
+ }
+
+- if(pers) {
++ if (pers) {
+
+- factor=ints[i++]; /* Chunksize */
+- fault =ints[i++]; /* Faultlevel */
++ chunk_size = ints[i++]; /* Chunksize */
++ fault = ints[i++]; /* Faultlevel */
+
+- pers=pers | factor | (fault << FAULT_SHIFT);
++ pers = pers | chunk_size | (fault << FAULT_SHIFT);
+
+- while( str && (dev = name_to_kdev_t(str))) {
+- do_md_add (minor, dev);
+- if((str = strchr (str, ',')) != NULL)
+- str++;
+- }
++ while( str && (dev = name_to_kdev_t(str))) {
++ do_md_add (minor, dev);
++ if((str = strchr (str, ',')) != NULL)
++ str++;
++ }
+
+- do_md_run (minor, pers);
+- printk ("md: Loading md%d.\n",minor);
++ do_md_run (minor, pers);
++ printk (KERN_INFO "md: Loading md%d.\n",minor);
+ }
+-
++#endif
+ }
+ #endif
+
++void hsm_init (void);
++void translucent_init (void);
+ void linear_init (void);
+ void raid0_init (void);
+ void raid1_init (void);
+ void raid5_init (void);
+
+-__initfunc(int md_init (void))
++md__initfunc(int md_init (void))
+ {
+- printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
+- MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION,
+- MAX_MD_DEV, MAX_REAL);
+-
+- if (register_blkdev (MD_MAJOR, "md", &md_fops))
+- {
+- printk ("Unable to get major %d for md\n", MD_MAJOR);
+- return (-1);
+- }
+-
+- blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST;
+- blk_dev[MD_MAJOR].current_request=NULL;
+- read_ahead[MD_MAJOR]=INT_MAX;
+- memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev));
+- md_gendisk.next=gendisk_head;
+-
+- gendisk_head=&md_gendisk;
+-
+-#if SUPPORT_RECONSTRUCTION
+- if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL)
+- printk("md: bug: md_sync_thread == NULL\n");
+-#endif /* SUPPORT_RECONSTRUCTION */
++ static char * name = "mdrecoveryd";
++
++ printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n",
++ MD_MAJOR_VERSION, MD_MINOR_VERSION,
++ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL);
++
++ if (register_blkdev (MD_MAJOR, "md", &md_fops))
++ {
++ printk (KERN_ALERT "Unable to get major %d for md\n", MD_MAJOR);
++ return (-1);
++ }
++
++ blk_dev[MD_MAJOR].request_fn = DEVICE_REQUEST;
++ blk_dev[MD_MAJOR].current_request = NULL;
++ read_ahead[MD_MAJOR] = INT_MAX;
++ md_gendisk.next = gendisk_head;
++
++ gendisk_head = &md_gendisk;
++
++ md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
++ if (!md_recovery_thread)
++ printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n");
+
++ md_register_reboot_notifier(&md_notifier);
++ md_register_sysctl();
++
++#ifdef CONFIG_MD_HSM
++ hsm_init ();
++#endif
++#ifdef CONFIG_MD_TRANSLUCENT
++ translucent_init ();
++#endif
+ #ifdef CONFIG_MD_LINEAR
+- linear_init ();
++ linear_init ();
+ #endif
+ #ifdef CONFIG_MD_STRIPED
+- raid0_init ();
++ raid0_init ();
+ #endif
+ #ifdef CONFIG_MD_MIRRORING
+- raid1_init ();
++ raid1_init ();
+ #endif
+ #ifdef CONFIG_MD_RAID5
+- raid5_init ();
++ raid5_init ();
++#endif
++#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE)
++ /*
++ * pick a XOR routine, runtime.
++ */
++ calibrate_xor_block();
+ #endif
+- return (0);
++
++ return (0);
+ }
+
+ #ifdef CONFIG_MD_BOOT
+-__initfunc(void md_setup_drive(void))
++md__initfunc(void md_setup_drive(void))
+ {
+ if(md_setup_args.set)
+ do_md_setup(md_setup_args.str, md_setup_args.ints);
+ }
+ #endif
++
++MD_EXPORT_SYMBOL(md_size);
++MD_EXPORT_SYMBOL(register_md_personality);
++MD_EXPORT_SYMBOL(unregister_md_personality);
++MD_EXPORT_SYMBOL(partition_name);
++MD_EXPORT_SYMBOL(md_error);
++MD_EXPORT_SYMBOL(md_recover_arrays);
++MD_EXPORT_SYMBOL(md_register_thread);
++MD_EXPORT_SYMBOL(md_unregister_thread);
++MD_EXPORT_SYMBOL(md_update_sb);
++MD_EXPORT_SYMBOL(md_map);
++MD_EXPORT_SYMBOL(md_wakeup_thread);
++MD_EXPORT_SYMBOL(md_do_sync);
++MD_EXPORT_SYMBOL(md_print_devices);
++MD_EXPORT_SYMBOL(find_rdev_nr);
++MD_EXPORT_SYMBOL(md_check_ordering);
++MD_EXPORT_SYMBOL(md_interrupt_thread);
++MD_EXPORT_SYMBOL(mddev_map);
++
++#ifdef CONFIG_PROC_FS
++static struct proc_dir_entry proc_md = {
++ PROC_MD, 6, "mdstat",
++ S_IFREG | S_IRUGO, 1, 0, 0,
++ 0, &proc_array_inode_operations,
++};
++#endif
++
++static void md_geninit (struct gendisk *gdisk)
++{
++ int i;
++
++ for(i = 0; i < MAX_MD_DEVS; i++) {
++ md_blocksizes[i] = 1024;
++ md_maxreadahead[i] = MD_READAHEAD;
++ md_gendisk.part[i].start_sect = -1; /* avoid partition check */
++ md_gendisk.part[i].nr_sects = 0;
++ }
++
++ printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
++
++ blksize_size[MD_MAJOR] = md_blocksizes;
++ md_set_global_readahead(md_maxreadahead);
++
++#ifdef CONFIG_PROC_FS
++ proc_register(&proc_root, &proc_md);
++#endif
++}
++
+diff -uNr linux-orig/drivers/block/raid0.c linux/drivers/block/raid0.c
+--- linux-orig/drivers/block/raid0.c Mon Sep 4 10:39:16 2000
++++ linux/drivers/block/raid0.c Fri Dec 8 22:54:52 2000
+@@ -1,4 +1,3 @@
+-
+ /*
+ raid0.c : Multiple Devices driver for Linux
+ Copyright (C) 1994-96 Marc ZYNGIER
+@@ -18,146 +17,201 @@
+ */
+
+ #include <linux/module.h>
+-#include <linux/md.h>
+-#include <linux/raid0.h>
+-#include <linux/vmalloc.h>
++#include <linux/raid/raid0.h>
+
+ #define MAJOR_NR MD_MAJOR
+ #define MD_DRIVER
+ #define MD_PERSONALITY
+
+-static int create_strip_zones (int minor, struct md_dev *mddev)
++static int create_strip_zones (mddev_t *mddev)
+ {
+- int i, j, c=0;
+- int current_offset=0;
+- struct real_dev *smallest_by_zone;
+- struct raid0_data *data=(struct raid0_data *) mddev->private;
+-
+- data->nr_strip_zones=1;
+-
+- for (i=1; i<mddev->nb_dev; i++)
+- {
+- for (j=0; j<i; j++)
+- if (mddev->devices[i].size==mddev->devices[j].size)
+- {
+- c=1;
+- break;
+- }
+-
+- if (!c)
+- data->nr_strip_zones++;
+-
+- c=0;
+- }
+-
+- if ((data->strip_zone=vmalloc(sizeof(struct strip_zone)*data->nr_strip_zones)) == NULL)
+- return 1;
+-
+- data->smallest=NULL;
+-
+- for (i=0; i<data->nr_strip_zones; i++)
+- {
+- data->strip_zone[i].dev_offset=current_offset;
+- smallest_by_zone=NULL;
+- c=0;
+-
+- for (j=0; j<mddev->nb_dev; j++)
+- if (mddev->devices[j].size>current_offset)
+- {
+- data->strip_zone[i].dev[c++]=mddev->devices+j;
+- if (!smallest_by_zone ||
+- smallest_by_zone->size > mddev->devices[j].size)
+- smallest_by_zone=mddev->devices+j;
+- }
+-
+- data->strip_zone[i].nb_dev=c;
+- data->strip_zone[i].size=(smallest_by_zone->size-current_offset)*c;
+-
+- if (!data->smallest ||
+- data->smallest->size > data->strip_zone[i].size)
+- data->smallest=data->strip_zone+i;
+-
+- data->strip_zone[i].zone_offset=i ? (data->strip_zone[i-1].zone_offset+
+- data->strip_zone[i-1].size) : 0;
+- current_offset=smallest_by_zone->size;
+- }
+- return 0;
++ int i, c, j, j1, j2;
++ int current_offset, curr_zone_offset;
++ raid0_conf_t *conf = mddev_to_conf(mddev);
++ mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
++
++ /*
++ * The number of 'same size groups'
++ */
++ conf->nr_strip_zones = 0;
++
++ ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
++ printk("raid0: looking at %s\n", partition_name(rdev1->dev));
++ c = 0;
++ ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
++ printk("raid0: comparing %s(%d) with %s(%d)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size);
++ if (rdev2 == rdev1) {
++ printk("raid0: END\n");
++ break;
++ }
++ if (rdev2->size == rdev1->size)
++ {
++ /*
++ * Not unique, dont count it as a new
++ * group
++ */
++ printk("raid0: EQUAL\n");
++ c = 1;
++ break;
++ }
++ printk("raid0: NOT EQUAL\n");
++ }
++ if (!c) {
++ printk("raid0: ==> UNIQUE\n");
++ conf->nr_strip_zones++;
++ printk("raid0: %d zones\n", conf->nr_strip_zones);
++ }
++ }
++ printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
++
++ conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
++ conf->nr_strip_zones);
++ if (!conf->strip_zone)
++ return 1;
++
++
++ conf->smallest = NULL;
++ current_offset = 0;
++ curr_zone_offset = 0;
++
++ for (i = 0; i < conf->nr_strip_zones; i++)
++ {
++ struct strip_zone *zone = conf->strip_zone + i;
++
++ printk("zone %d\n", i);
++ zone->dev_offset = current_offset;
++ smallest = NULL;
++ c = 0;
++
++ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
++
++ printk(" checking %s ...", partition_name(rdev->dev));
++ if (rdev->size > current_offset)
++ {
++ printk(" contained as device %d\n", c);
++ zone->dev[c] = rdev;
++ c++;
++ if (!smallest || (rdev->size <smallest->size)) {
++ smallest = rdev;
++ printk(" (%d) is smallest!.\n", rdev->size);
++ }
++ } else
++ printk(" nope.\n");
++ }
++
++ zone->nb_dev = c;
++ zone->size = (smallest->size - current_offset) * c;
++ printk(" zone->nb_dev: %d, size: %d\n",zone->nb_dev,zone->size);
++
++ if (!conf->smallest || (zone->size < conf->smallest->size))
++ conf->smallest = zone;
++
++ zone->zone_offset = curr_zone_offset;
++ curr_zone_offset += zone->size;
++
++ current_offset = smallest->size;
++ printk("current zone offset: %d\n", current_offset);
++ }
++ printk("done.\n");
++ return 0;
+ }
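
A stand-alone walk-through of the zone arithmetic above, for three
hypothetical members of 100, 200 and 250 blocks (sizes ascending, as
md_check_ordering() in raid0_run() requires). It prints three zones:
300 blocks striped across all three disks, then 200 across two, then
50 on the largest disk alone:

	#include <stdio.h>

	int main(void)
	{
		int size[3] = { 100, 200, 250 };	/* invented member sizes */
		int nb_dev = 3, current_offset = 0, curr_zone_offset = 0;
		int i, j, c, smallest;

		for (i = 0; i < nb_dev; i++) {
			smallest = 0;
			c = 0;
			for (j = 0; j < nb_dev; j++)
				if (size[j] > current_offset) {
					c++;
					if (!smallest || size[j] < smallest)
						smallest = size[j];
				}
			if (!c)
				break;
			printf("zone %d: %d devs, dev_offset %d, zone_offset %d, size %d\n",
			       i, c, current_offset, curr_zone_offset,
			       (smallest - current_offset) * c);
			curr_zone_offset += (smallest - current_offset) * c;
			current_offset = smallest;
		}
		return 0;
	}
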
+
+-static int raid0_run (int minor, struct md_dev *mddev)
++static int raid0_run (mddev_t *mddev)
+ {
+- int cur=0, i=0, size, zone0_size, nb_zone;
+- struct raid0_data *data;
+-
+- MOD_INC_USE_COUNT;
++ int cur=0, i=0, size, zone0_size, nb_zone;
++ raid0_conf_t *conf;
+
+- if ((mddev->private=vmalloc (sizeof (struct raid0_data))) == NULL) return 1;
+- data=(struct raid0_data *) mddev->private;
+-
+- if (create_strip_zones (minor, mddev))
+- {
+- vfree(data);
+- return 1;
+- }
+-
+- nb_zone=data->nr_zones=
+- md_size[minor]/data->smallest->size +
+- (md_size[minor]%data->smallest->size ? 1 : 0);
+-
+- printk ("raid0 : Allocating %ld bytes for hash.\n",(long)sizeof(struct raid0_hash)*nb_zone);
+- if ((data->hash_table=vmalloc (sizeof (struct raid0_hash)*nb_zone)) == NULL)
+- {
+- vfree(data->strip_zone);
+- vfree(data);
+- return 1;
+- }
+- size=data->strip_zone[cur].size;
+-
+- i=0;
+- while (cur<data->nr_strip_zones)
+- {
+- data->hash_table[i].zone0=data->strip_zone+cur;
+-
+- if (size>=data->smallest->size)/* If we completely fill the slot */
+- {
+- data->hash_table[i++].zone1=NULL;
+- size-=data->smallest->size;
+-
+- if (!size)
+- {
+- if (++cur==data->nr_strip_zones) continue;
+- size=data->strip_zone[cur].size;
+- }
+-
+- continue;
+- }
+-
+- if (++cur==data->nr_strip_zones) /* Last dev, set unit1 as NULL */
+- {
+- data->hash_table[i].zone1=NULL;
+- continue;
+- }
+-
+- zone0_size=size; /* Here, we use a 2nd dev to fill the slot */
+- size=data->strip_zone[cur].size;
+- data->hash_table[i++].zone1=data->strip_zone+cur;
+- size-=(data->smallest->size - zone0_size);
+- }
++ MOD_INC_USE_COUNT;
+
+- return (0);
++ conf = vmalloc(sizeof (raid0_conf_t));
++ if (!conf)
++ goto out;
++ mddev->private = (void *)conf;
++
++ if (md_check_ordering(mddev)) {
++ printk("raid0: disks are not ordered, aborting!\n");
++ goto out_free_conf;
++ }
++
++ if (create_strip_zones (mddev))
++ goto out_free_conf;
++
++ printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]);
++ printk("raid0 : conf->smallest->size is %d blocks.\n", conf->smallest->size);
++ nb_zone = md_size[mdidx(mddev)]/conf->smallest->size +
++ (md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
++ printk("raid0 : nb_zone is %d.\n", nb_zone);
++ conf->nr_zones = nb_zone;
++
++ printk("raid0 : Allocating %d bytes for hash.\n",
++ sizeof(struct raid0_hash)*nb_zone);
++
++ conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
++ if (!conf->hash_table)
++ goto out_free_zone_conf;
++ size = conf->strip_zone[cur].size;
++
++ i = 0;
++ while (cur < conf->nr_strip_zones) {
++ conf->hash_table[i].zone0 = conf->strip_zone + cur;
++
++ /*
++ * If we completely fill the slot
++ */
++ if (size >= conf->smallest->size) {
++ conf->hash_table[i++].zone1 = NULL;
++ size -= conf->smallest->size;
++
++ if (!size) {
++ if (++cur == conf->nr_strip_zones)
++ continue;
++ size = conf->strip_zone[cur].size;
++ }
++ continue;
++ }
++ if (++cur == conf->nr_strip_zones) {
++ /*
++ * Last dev, set unit1 as NULL
++ */
++ conf->hash_table[i].zone1=NULL;
++ continue;
++ }
++
++ /*
++ * Here we use a 2nd dev to fill the slot
++ */
++ zone0_size = size;
++ size = conf->strip_zone[cur].size;
++ conf->hash_table[i++].zone1 = conf->strip_zone + cur;
++ size -= (conf->smallest->size - zone0_size);
++ }
++ return 0;
++
++out_free_zone_conf:
++ vfree(conf->strip_zone);
++ conf->strip_zone = NULL;
++
++out_free_conf:
++ vfree(conf);
++ mddev->private = NULL;
++out:
++ MOD_DEC_USE_COUNT;
++ return 1;
+ }
+
+-
+-static int raid0_stop (int minor, struct md_dev *mddev)
++static int raid0_stop (mddev_t *mddev)
+ {
+- struct raid0_data *data=(struct raid0_data *) mddev->private;
++ raid0_conf_t *conf = mddev_to_conf(mddev);
+
+- vfree (data->hash_table);
+- vfree (data->strip_zone);
+- vfree (data);
++ vfree (conf->hash_table);
++ conf->hash_table = NULL;
++ vfree (conf->strip_zone);
++ conf->strip_zone = NULL;
++ vfree (conf);
++ mddev->private = NULL;
+
+- MOD_DEC_USE_COUNT;
+- return 0;
++ MOD_DEC_USE_COUNT;
++ return 0;
+ }
+
+ /*
+@@ -167,135 +221,140 @@
+ * Of course, those facts may not be valid anymore (and surely won't...)
+ * Hey guys, there's some work out there ;-)
+ */
+-static int raid0_map (struct md_dev *mddev, kdev_t *rdev,
++static int raid0_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
+ unsigned long *rsector, unsigned long size)
+ {
+- struct raid0_data *data=(struct raid0_data *) mddev->private;
+- static struct raid0_hash *hash;
+- struct strip_zone *zone;
+- struct real_dev *tmp_dev;
+- int blk_in_chunk, factor, chunk, chunk_size;
+- long block, rblock;
+-
+- factor=FACTOR(mddev);
+- chunk_size=(1UL << FACTOR_SHIFT(factor));
+- block=*rsector >> 1;
+- hash=data->hash_table+(block/data->smallest->size);
+-
+- if (hash - data->hash_table > data->nr_zones)
+- {
+- printk(KERN_DEBUG "raid0_map: invalid block %li\n", block);
+- return -1;
+- }
+-
+- /* Sanity check */
+- if ((chunk_size*2)<(*rsector % (chunk_size*2))+size)
+- {
+- printk ("raid0_convert : can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
+- return (-1);
+- }
+-
+- if (block >= (hash->zone0->size +
+- hash->zone0->zone_offset))
+- {
+- if (!hash->zone1)
+- {
+- printk ("raid0_convert : hash->zone1==NULL for block %ld\n", block);
+- return (-1);
+- }
+-
+- zone=hash->zone1;
+- }
+- else
+- zone=hash->zone0;
++ raid0_conf_t *conf = mddev_to_conf(mddev);
++ struct raid0_hash *hash;
++ struct strip_zone *zone;
++ mdk_rdev_t *tmp_dev;
++ int blk_in_chunk, chunksize_bits, chunk, chunk_size;
++ long block, rblock;
++
++ chunk_size = mddev->param.chunk_size >> 10;
++ chunksize_bits = ffz(~chunk_size);
++ block = *rsector >> 1;
++ hash = conf->hash_table + block / conf->smallest->size;
++
++ if (hash - conf->hash_table > conf->nr_zones) {
++ printk(KERN_DEBUG "raid0_map: invalid block %lu\n", block);
++ return -1;
++ }
++
++ /* Sanity check */
++ if ((chunk_size * 2) < (*rsector % (chunk_size * 2)) + size)
++ goto bad_map;
++
++ if (!hash)
++ goto bad_hash;
++
++ if (!hash->zone0)
++ goto bad_zone0;
++
++ if (block >= (hash->zone0->size + hash->zone0->zone_offset)) {
++ if (!hash->zone1)
++ goto bad_zone1;
++ zone = hash->zone1;
++ } else
++ zone = hash->zone0;
+
+- blk_in_chunk=block & (chunk_size -1);
+- chunk=(block - zone->zone_offset) / (zone->nb_dev<<FACTOR_SHIFT(factor));
+- tmp_dev=zone->dev[(block >> FACTOR_SHIFT(factor)) % zone->nb_dev];
+- rblock=(chunk << FACTOR_SHIFT(factor)) + blk_in_chunk + zone->dev_offset;
++ blk_in_chunk = block & (chunk_size -1);
++ chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
++ tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev];
++ rblock = (chunk << chunksize_bits) + blk_in_chunk + zone->dev_offset;
+
+- *rdev=tmp_dev->dev;
+- *rsector=rblock<<1;
++ *rdev = tmp_dev->dev;
++ *rsector = rblock << 1;
+
+- return (0);
++ return 0;
++
++bad_map:
++ printk ("raid0_map bug: can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
++ return -1;
++bad_hash:
++ printk("raid0_map bug: hash==NULL for block %ld\n", block);
++ return -1;
++bad_zone0:
++ printk ("raid0_map bug: hash->zone0==NULL for block %ld\n", block);
++ return -1;
++bad_zone1:
++ printk ("raid0_map bug: hash->zone1==NULL for block %ld\n", block);
++ return -1;
+ }
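
The mapping arithmetic is compact enough to check by hand; a
stand-alone sketch with invented numbers, where the zone starts at
offset 0 so zone-relative and absolute block numbers coincide
(chunksize_bits is what ffz(~chunk_size) yields for a power of two):

	#include <stdio.h>

	int main(void)
	{
		unsigned int chunk_size = 32;	/* blocks per chunk, example */
		unsigned int bits = 5;		/* ffz(~32) == 5 */
		unsigned int nb_dev = 3;	/* devices in this zone */
		long block = 1000;		/* example block in the zone */

		int blk_in_chunk = block & (chunk_size - 1);
		long chunk = block / (nb_dev << bits);
		int dev = (block >> bits) % nb_dev;
		long rblock = (chunk << bits) + blk_in_chunk;

		printf("block %ld -> member %d, block %ld (plus dev_offset)\n",
		       block, dev, rblock);
		return 0;
	}

Block 1000 lands in chunk row 10, on member disk 1, at member block
328, to which the real code then adds zone->dev_offset.
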
+
+
+-static int raid0_status (char *page, int minor, struct md_dev *mddev)
++static int raid0_status (char *page, mddev_t *mddev)
+ {
+- int sz=0;
++ int sz = 0;
+ #undef MD_DEBUG
+ #ifdef MD_DEBUG
+- int j, k;
+- struct raid0_data *data=(struct raid0_data *) mddev->private;
++ int j, k;
++ raid0_conf_t *conf = mddev_to_conf(mddev);
+
+- sz+=sprintf (page+sz, " ");
+- for (j=0; j<data->nr_zones; j++)
+- {
+- sz+=sprintf (page+sz, "[z%d",
+- data->hash_table[j].zone0-data->strip_zone);
+- if (data->hash_table[j].zone1)
+- sz+=sprintf (page+sz, "/z%d] ",
+- data->hash_table[j].zone1-data->strip_zone);
+- else
+- sz+=sprintf (page+sz, "] ");
+- }
++ sz += sprintf(page + sz, " ");
++ for (j = 0; j < conf->nr_zones; j++) {
++ sz += sprintf(page + sz, "[z%d",
++ conf->hash_table[j].zone0 - conf->strip_zone);
++ if (conf->hash_table[j].zone1)
++ sz += sprintf(page+sz, "/z%d] ",
++ conf->hash_table[j].zone1 - conf->strip_zone);
++ else
++ sz += sprintf(page+sz, "] ");
++ }
+
+- sz+=sprintf (page+sz, "\n");
++ sz += sprintf(page + sz, "\n");
+
+- for (j=0; j<data->nr_strip_zones; j++)
+- {
+- sz+=sprintf (page+sz, " z%d=[", j);
+- for (k=0; k<data->strip_zone[j].nb_dev; k++)
+- sz+=sprintf (page+sz, "%s/",
+- partition_name(data->strip_zone[j].dev[k]->dev));
+- sz--;
+- sz+=sprintf (page+sz, "] zo=%d do=%d s=%d\n",
+- data->strip_zone[j].zone_offset,
+- data->strip_zone[j].dev_offset,
+- data->strip_zone[j].size);
+- }
++ for (j = 0; j < conf->nr_strip_zones; j++) {
++ sz += sprintf(page + sz, " z%d=[", j);
++ for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
++ sz += sprintf (page+sz, "%s/", partition_name(
++ conf->strip_zone[j].dev[k]->dev));
++ sz--;
++ sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
++ conf->strip_zone[j].zone_offset,
++ conf->strip_zone[j].dev_offset,
++ conf->strip_zone[j].size);
++ }
+ #endif
+- sz+=sprintf (page+sz, " %dk chunks", 1<<FACTOR_SHIFT(FACTOR(mddev)));
+- return sz;
++ sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
++ return sz;
+ }
+
+-
+-static struct md_personality raid0_personality=
++static mdk_personality_t raid0_personality=
+ {
+- "raid0",
+- raid0_map,
+- NULL, /* no special make_request */
+- NULL, /* no special end_request */
+- raid0_run,
+- raid0_stop,
+- raid0_status,
+- NULL, /* no ioctls */
+- 0,
+- NULL, /* no error_handler */
+- NULL, /* hot_add_disk */
+- NULL, /* hot_remove_disk */
+- NULL /* mark_spare */
++ "raid0",
++ raid0_map,
++ NULL, /* no special make_request */
++ NULL, /* no special end_request */
++ raid0_run,
++ raid0_stop,
++ raid0_status,
++ NULL, /* no ioctls */
++ 0,
++ NULL, /* no error_handler */
++ NULL, /* no diskop */
++ NULL, /* no stop resync */
++ NULL /* no restart resync */
+ };
+
+-
+ #ifndef MODULE
+
+ void raid0_init (void)
+ {
+- register_md_personality (RAID0, &raid0_personality);
++ register_md_personality (RAID0, &raid0_personality);
+ }
+
+ #else
+
+ int init_module (void)
+ {
+- return (register_md_personality (RAID0, &raid0_personality));
++ return (register_md_personality (RAID0, &raid0_personality));
+ }
+
+ void cleanup_module (void)
+ {
+- unregister_md_personality (RAID0);
++ unregister_md_personality (RAID0);
+ }
+
+ #endif
++
+diff -uNr linux-orig/drivers/block/raid1.c linux/drivers/block/raid1.c
+--- linux-orig/drivers/block/raid1.c Fri Dec 8 22:53:51 2000
++++ linux/drivers/block/raid1.c Fri Dec 8 22:54:52 2000
+@@ -1,6 +1,6 @@
+-/************************************************************************
++/*
+ * raid1.c : Multiple Devices driver for Linux
+- * Copyright (C) 1996 Ingo Molnar, Miguel de Icaza, Gadi Oxman
++ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+ * RAID-1 management functions.
+ *
+@@ -15,50 +15,52 @@
+ */
+
+ #include <linux/module.h>
+-#include <linux/locks.h>
+ #include <linux/malloc.h>
+-#include <linux/md.h>
+-#include <linux/raid1.h>
+-#include <asm/bitops.h>
++#include <linux/raid/raid1.h>
+ #include <asm/atomic.h>
+
+ #define MAJOR_NR MD_MAJOR
+ #define MD_DRIVER
+ #define MD_PERSONALITY
+
+-/*
+- * The following can be used to debug the driver
+- */
+-/*#define RAID1_DEBUG*/
+-#ifdef RAID1_DEBUG
+-#define PRINTK(x) do { printk x; } while (0);
+-#else
+-#define PRINTK(x) do { ; } while (0);
+-#endif
++#define MAX_LINEAR_SECTORS 128
+
+ #define MAX(a,b) ((a) > (b) ? (a) : (b))
+ #define MIN(a,b) ((a) < (b) ? (a) : (b))
+
+-static struct md_personality raid1_personality;
+-static struct md_thread *raid1_thread = NULL;
++static mdk_personality_t raid1_personality;
+ struct buffer_head *raid1_retry_list = NULL;
+
+-static int __raid1_map (struct md_dev *mddev, kdev_t *rdev,
++static void * raid1_kmalloc (int size)
++{
++ void * ptr;
++ /*
++ * here we would rather be fault tolerant than nice: there
++ * are a couple of places in the RAID code where we simply
++ * cannot afford to fail an allocation because there is no
++ * failure return path (eg. make_request())
++ */
++ while (!(ptr = kmalloc (size, GFP_KERNEL)))
++ printk ("raid1: out of memory, retrying...\n");
++
++ memset(ptr, 0, size);
++ return ptr;
++}
++
++static int __raid1_map (mddev_t *mddev, kdev_t *rdev,
+ unsigned long *rsector, unsigned long size)
+ {
+- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
+- int i, n = raid_conf->raid_disks;
++ raid1_conf_t *conf = mddev_to_conf(mddev);
++ int i, disks = MD_SB_DISKS;
+
+ /*
+ * Later we do read balancing on the read side
+ * now we use the first available disk.
+ */
+
+- PRINTK(("raid1_map().\n"));
+-
+- for (i=0; i<n; i++) {
+- if (raid_conf->mirrors[i].operational) {
+- *rdev = raid_conf->mirrors[i].dev;
++ for (i = 0; i < disks; i++) {
++ if (conf->mirrors[i].operational) {
++ *rdev = conf->mirrors[i].dev;
+ return (0);
+ }
+ }
+@@ -67,29 +69,29 @@
+ return (-1);
+ }
+
+-static int raid1_map (struct md_dev *mddev, kdev_t *rdev,
++static int raid1_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
+ unsigned long *rsector, unsigned long size)
+ {
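++ /*
++ * mapping is a no-op here: the mirror is selected in
++ * raid1_make_request() for normal I/O and in __raid1_map()
++ * on the retry path (see raid1d()).
++ */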
+ return 0;
+ }
+
+-void raid1_reschedule_retry (struct buffer_head *bh)
++static void raid1_reschedule_retry (struct buffer_head *bh)
+ {
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
+-
+- PRINTK(("raid1_reschedule_retry().\n"));
++ mddev_t *mddev = r1_bh->mddev;
++ raid1_conf_t *conf = mddev_to_conf(mddev);
+
+ r1_bh->next_retry = raid1_retry_list;
+ raid1_retry_list = bh;
+- md_wakeup_thread(raid1_thread);
++ md_wakeup_thread(conf->thread);
+ }
+
+ /*
+- * raid1_end_buffer_io() is called when we have finished servicing a mirrored
++ * raid1_end_bh_io() is called when we have finished servicing a mirrored
+ * operation and are ready to return a success/failure code to the buffer
+ * cache layer.
+ */
+-static inline void raid1_end_buffer_io(struct raid1_bh *r1_bh, int uptodate)
++static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
+ {
+ struct buffer_head *bh = r1_bh->master_bh;
+
+@@ -97,8 +99,6 @@
+ kfree(r1_bh);
+ }
+
+-int raid1_one_error=0;
+-
+ void raid1_end_request (struct buffer_head *bh, int uptodate)
+ {
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
+@@ -106,12 +106,7 @@
+
+ save_flags(flags);
+ cli();
+- PRINTK(("raid1_end_request().\n"));
+
+- if (raid1_one_error) {
+- raid1_one_error=0;
+- uptodate=0;
+- }
+ /*
+ * this branch is our 'one mirror IO has finished' event handler:
+ */
+@@ -136,15 +131,11 @@
+ */
+
+ if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
+-
+- PRINTK(("raid1_end_request(), read branch.\n"));
+-
+ /*
+ * we have only one buffer_head on the read side
+ */
+ if (uptodate) {
+- PRINTK(("raid1_end_request(), read branch, uptodate.\n"));
+- raid1_end_buffer_io(r1_bh, uptodate);
++ raid1_end_bh_io(r1_bh, uptodate);
+ restore_flags(flags);
+ return;
+ }
+@@ -152,71 +143,56 @@
+ * oops, read error:
+ */
+ printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
+- kdevname(bh->b_dev), bh->b_blocknr);
+- raid1_reschedule_retry (bh);
++ partition_name(bh->b_dev), bh->b_blocknr);
++ raid1_reschedule_retry(bh);
+ restore_flags(flags);
+ return;
+ }
+
+ /*
+- * WRITE or WRITEA.
+- */
+- PRINTK(("raid1_end_request(), write branch.\n"));
+-
+- /*
++ * WRITE:
++ *
+ * Let's see if all mirrored write operations have finished
+- * already [we have irqs off, so we can decrease]:
++ * already.
+ */
+
+- if (!--r1_bh->remaining) {
+- struct md_dev *mddev = r1_bh->mddev;
+- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
+- int i, n = raid_conf->raid_disks;
++ if (atomic_dec_and_test(&r1_bh->remaining)) {
++ int i, disks = MD_SB_DISKS;
+
+- PRINTK(("raid1_end_request(), remaining == 0.\n"));
++ for ( i = 0; i < disks; i++)
++ if (r1_bh->mirror_bh[i])
++ kfree(r1_bh->mirror_bh[i]);
+
+- for ( i=0; i<n; i++)
+- if (r1_bh->mirror_bh[i]) kfree(r1_bh->mirror_bh[i]);
+-
+- raid1_end_buffer_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
++ raid1_end_bh_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
+ }
+- else PRINTK(("raid1_end_request(), remaining == %u.\n", r1_bh->remaining));
+ restore_flags(flags);
+ }
+
+-/* This routine checks if the undelying device is an md device and in that
+- * case it maps the blocks before putting the request on the queue
++/*
++ * This routine checks if the underlying device is an md device
++ * and in that case it maps the blocks before putting the
++ * request on the queue
+ */
+-static inline void
+-map_and_make_request (int rw, struct buffer_head *bh)
++static void map_and_make_request (int rw, struct buffer_head *bh)
+ {
+ if (MAJOR (bh->b_rdev) == MD_MAJOR)
+- md_map (MINOR (bh->b_rdev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
++ md_map (bh->b_rdev, &bh->b_rdev,
++ &bh->b_rsector, bh->b_size >> 9);
+ clear_bit(BH_Lock, &bh->b_state);
+ make_request (MAJOR (bh->b_rdev), rw, bh);
+ }
+
+-static int
+-raid1_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
++static int raid1_make_request (mddev_t *mddev, int rw,
++ struct buffer_head * bh)
+ {
+-
+- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
++ raid1_conf_t *conf = mddev_to_conf(mddev);
+ struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req;
+ struct raid1_bh * r1_bh;
+- int n = raid_conf->raid_disks, i, sum_bhs = 0, switch_disks = 0, sectors;
++ int disks = MD_SB_DISKS;
++ int i, sum_bhs = 0, switch_disks = 0, sectors, lowprio = 0;
+ struct mirror_info *mirror;
+
+- PRINTK(("raid1_make_request().\n"));
+-
+- while (!( /* FIXME: now we are rather fault tolerant than nice */
+- r1_bh = kmalloc (sizeof (struct raid1_bh), GFP_BUFFER)
+- ) )
+- {
+- printk ("raid1_make_request(#1): out of memory\n");
+- current->policy |= SCHED_YIELD;
+- schedule();
+- }
+- memset (r1_bh, 0, sizeof (struct raid1_bh));
++ r1_bh = raid1_kmalloc (sizeof (struct raid1_bh));
+
+ /*
+ * make_request() can abort the operation when READA or WRITEA are being
+@@ -227,43 +203,65 @@
+ if (rw == READA) rw = READ;
+ if (rw == WRITEA) rw = WRITE;
+
+- if (rw == WRITE || rw == WRITEA)
+- mark_buffer_clean(bh); /* Too early ? */
++ if (rw == WRITE) {
++ /*
++ * Too early ?
++ */
++ mark_buffer_clean(bh);
++ /*
++ * not too early. we _first_ clean the bh, then we start
++ * the IO, then when the IO has finished, we unlock the
++ * bh and mark it uptodate. This way we do not miss the
++ * case when the bh got dirty again during the IO.
++ */
++ }
++
++ /*
++ * special flag for 'lowprio' reconstruction requests ...
++ */
++ if (buffer_lowprio(bh))
++ lowprio = 1;
+
+ /*
+- * i think the read and write branch should be separated completely, since we want
+- * to do read balancing on the read side for example. Comments? :) --mingo
++ * i think the read and write branch should be separated completely,
++ * since we want to do read balancing on the read side for example.
++ * Comments? :) --mingo
+ */
+
+ r1_bh->master_bh=bh;
+ r1_bh->mddev=mddev;
+ r1_bh->cmd = rw;
+
+- if (rw==READ || rw==READA) {
+- int last_used = raid_conf->last_used;
+- PRINTK(("raid1_make_request(), read branch.\n"));
+- mirror = raid_conf->mirrors + last_used;
++ if (rw==READ) {
++ int last_used = conf->last_used;
++
++ /*
++ * read balancing logic:
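++ * sequential reads keep using the same mirror until
++ * sect_limit sectors have been read from it; any seek or
++ * reaching the limit switches to the next mirror in the ring.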
++ */
++ mirror = conf->mirrors + last_used;
+ bh->b_rdev = mirror->dev;
+ sectors = bh->b_size >> 9;
+- if (bh->b_blocknr * sectors == raid_conf->next_sect) {
+- raid_conf->sect_count += sectors;
+- if (raid_conf->sect_count >= mirror->sect_limit)
++
++ if (bh->b_blocknr * sectors == conf->next_sect) {
++ conf->sect_count += sectors;
++ if (conf->sect_count >= mirror->sect_limit)
+ switch_disks = 1;
+ } else
+ switch_disks = 1;
+- raid_conf->next_sect = (bh->b_blocknr + 1) * sectors;
+- if (switch_disks) {
+- PRINTK(("read-balancing: switching %d -> %d (%d sectors)\n", last_used, mirror->next, raid_conf->sect_count));
+- raid_conf->sect_count = 0;
+- last_used = raid_conf->last_used = mirror->next;
++ conf->next_sect = (bh->b_blocknr + 1) * sectors;
++ /*
++ * Do not switch disks if full resync is in progress ...
++ */
++ if (switch_disks && !conf->resync_mirrors) {
++ conf->sect_count = 0;
++ last_used = conf->last_used = mirror->next;
+ /*
+- * Do not switch to write-only disks ... resyncing
+- * is in progress
++ * Do not switch to write-only disks ...
++ * reconstruction is in progress
+ */
+- while (raid_conf->mirrors[last_used].write_only)
+- raid_conf->last_used = raid_conf->mirrors[last_used].next;
++ while (conf->mirrors[last_used].write_only)
++ conf->last_used = conf->mirrors[last_used].next;
+ }
+- PRINTK (("raid1 read queue: %d %d\n", MAJOR (bh->b_rdev), MINOR (bh->b_rdev)));
+ bh_req = &r1_bh->bh_req;
+ memcpy(bh_req, bh, sizeof(*bh));
+ bh_req->b_end_io = raid1_end_request;
+@@ -273,13 +271,12 @@
+ }
+
+ /*
+- * WRITE or WRITEA.
++ * WRITE:
+ */
+- PRINTK(("raid1_make_request(n=%d), write branch.\n",n));
+
+- for (i = 0; i < n; i++) {
++ for (i = 0; i < disks; i++) {
+
+- if (!raid_conf->mirrors [i].operational) {
++ if (!conf->mirrors[i].operational) {
+ /*
+ * the r1_bh->mirror_bh[i] pointer remains NULL
+ */
+@@ -287,89 +284,91 @@
+ continue;
+ }
+
++ /*
++ * special case for reconstruction ...
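++ * (a lowprio resync write skips the mirror we are
++ * currently reading from -- that one is up to date
++ * by definition)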
++ */
++ if (lowprio && (i == conf->last_used)) {
++ mirror_bh[i] = NULL;
++ continue;
++ }
++
++ /*
++ * We should use a private pool (size depending on NR_REQUEST),
++ * to avoid writes filling up the memory with bhs
++ *
++ * Such pools are much faster than kmalloc anyway (so we waste
++ * almost nothing by not using the master bh when writing and
++ * win a lot of cleanness) but for now we are cool enough. --mingo
++ *
++ * It's safe to sleep here: buffer heads cannot be used in a
++ * shared manner in the write branch. Look how we lock the buffer
++ * at the beginning of this function to grok the difference ;)
++ */
++ mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head));
++ /*
++ * prepare mirrored bh (fields ordered for max mem throughput):
++ */
++ mirror_bh[i]->b_blocknr = bh->b_blocknr;
++ mirror_bh[i]->b_dev = bh->b_dev;
++ mirror_bh[i]->b_rdev = conf->mirrors[i].dev;
++ mirror_bh[i]->b_rsector = bh->b_rsector;
++ mirror_bh[i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
++ if (lowprio)
++ mirror_bh[i]->b_state |= (1<<BH_LowPrio);
++
++ mirror_bh[i]->b_count = 1;
++ mirror_bh[i]->b_size = bh->b_size;
++ mirror_bh[i]->b_data = bh->b_data;
++ mirror_bh[i]->b_list = BUF_LOCKED;
++ mirror_bh[i]->b_end_io = raid1_end_request;
++ mirror_bh[i]->b_dev_id = r1_bh;
++
++ r1_bh->mirror_bh[i] = mirror_bh[i];
++ sum_bhs++;
++ }
++
++ md_atomic_set(&r1_bh->remaining, sum_bhs);
++
+ /*
+- * We should use a private pool (size depending on NR_REQUEST),
+- * to avoid writes filling up the memory with bhs
+- *
+- * Such pools are much faster than kmalloc anyways (so we waste almost
+- * nothing by not using the master bh when writing and win alot of cleanness)
+- *
+- * but for now we are cool enough. --mingo
+- *
+- * It's safe to sleep here, buffer heads cannot be used in a shared
+- * manner in the write branch. Look how we lock the buffer at the beginning
+- * of this function to grok the difference ;)
+- */
+- while (!( /* FIXME: now we are rather fault tolerant than nice */
+- mirror_bh[i] = kmalloc (sizeof (struct buffer_head), GFP_BUFFER)
+- ) )
+- {
+- printk ("raid1_make_request(#2): out of memory\n");
+- current->policy |= SCHED_YIELD;
+- schedule();
+- }
+- memset (mirror_bh[i], 0, sizeof (struct buffer_head));
+-
+- /*
+- * prepare mirrored bh (fields ordered for max mem throughput):
+- */
+- mirror_bh [i]->b_blocknr = bh->b_blocknr;
+- mirror_bh [i]->b_dev = bh->b_dev;
+- mirror_bh [i]->b_rdev = raid_conf->mirrors [i].dev;
+- mirror_bh [i]->b_rsector = bh->b_rsector;
+- mirror_bh [i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
+- mirror_bh [i]->b_count = 1;
+- mirror_bh [i]->b_size = bh->b_size;
+- mirror_bh [i]->b_data = bh->b_data;
+- mirror_bh [i]->b_list = BUF_LOCKED;
+- mirror_bh [i]->b_end_io = raid1_end_request;
+- mirror_bh [i]->b_dev_id = r1_bh;
+-
+- r1_bh->mirror_bh[i] = mirror_bh[i];
+- sum_bhs++;
+- }
+-
+- r1_bh->remaining = sum_bhs;
+-
+- PRINTK(("raid1_make_request(), write branch, sum_bhs=%d.\n",sum_bhs));
+-
+- /*
+- * We have to be a bit careful about the semaphore above, thats why we
+- * start the requests separately. Since kmalloc() could fail, sleep and
+- * make_request() can sleep too, this is the safer solution. Imagine,
+- * end_request decreasing the semaphore before we could have set it up ...
+- * We could play tricks with the semaphore (presetting it and correcting
+- * at the end if sum_bhs is not 'n' but we have to do end_request by hand
+- * if all requests finish until we had a chance to set up the semaphore
+- * correctly ... lots of races).
+- */
+- for (i = 0; i < n; i++)
+- if (mirror_bh [i] != NULL)
+- map_and_make_request (rw, mirror_bh [i]);
++ * We have to be a bit careful about the semaphore above, that's
++ * why we start the requests separately. Since kmalloc() could
++ * fail, sleep and make_request() can sleep too, this is the
++ * safer solution. Imagine, end_request decreasing the semaphore
++ * before we could have set it up ... We could play tricks with
++ * the semaphore (presetting it and correcting at the end if
++ * sum_bhs is not 'n' but we have to do end_request by hand if
++ * all requests finish before we had a chance to set up the
++ * semaphore correctly ... lots of races).
++ */
++ for (i = 0; i < disks; i++)
++ if (mirror_bh[i])
++ map_and_make_request(rw, mirror_bh[i]);
+
+ return (0);
+ }
+
+-static int raid1_status (char *page, int minor, struct md_dev *mddev)
++static int raid1_status (char *page, mddev_t *mddev)
+ {
+- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
++ raid1_conf_t *conf = mddev_to_conf(mddev);
+ int sz = 0, i;
+
+- sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
+- for (i = 0; i < raid_conf->raid_disks; i++)
+- sz += sprintf (page+sz, "%s", raid_conf->mirrors [i].operational ? "U" : "_");
++ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
++ conf->working_disks);
++ for (i = 0; i < conf->raid_disks; i++)
++ sz += sprintf (page+sz, "%s",
++ conf->mirrors[i].operational ? "U" : "_");
+ sz += sprintf (page+sz, "]");
+ return sz;
+ }
+
+-static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index)
++static void unlink_disk (raid1_conf_t *conf, int target)
+ {
+- int disks = raid_conf->raid_disks;
+- int j;
++ int disks = MD_SB_DISKS;
++ int i;
+
+- for (j = 0; j < disks; j++)
+- if (raid_conf->mirrors [j].next == failed_index)
+- raid_conf->mirrors [j].next = raid_conf->mirrors [failed_index].next;
++ for (i = 0; i < disks; i++)
++ if (conf->mirrors[i].next == target)
++ conf->mirrors[i].next = conf->mirrors[target].next;
+ }
+
+ #define LAST_DISK KERN_ALERT \
+@@ -388,48 +387,53 @@
+ #define ALREADY_SYNCING KERN_INFO \
+ "raid1: syncing already in progress.\n"
+
+-static int raid1_error (struct md_dev *mddev, kdev_t dev)
++static void mark_disk_bad (mddev_t *mddev, int failed)
+ {
+- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
+- struct mirror_info *mirror;
+- md_superblock_t *sb = mddev->sb;
+- int disks = raid_conf->raid_disks;
+- int i;
++ raid1_conf_t *conf = mddev_to_conf(mddev);
++ struct mirror_info *mirror = conf->mirrors+failed;
++ mdp_super_t *sb = mddev->sb;
++
++ mirror->operational = 0;
++ unlink_disk(conf, failed);
++ mark_disk_faulty(sb->disks+mirror->number);
++ mark_disk_nonsync(sb->disks+mirror->number);
++ mark_disk_inactive(sb->disks+mirror->number);
++ sb->active_disks--;
++ sb->working_disks--;
++ sb->failed_disks++;
++ mddev->sb_dirty = 1;
++ md_wakeup_thread(conf->thread);
++ conf->working_disks--;
++ printk (DISK_FAILED, partition_name (mirror->dev),
++ conf->working_disks);
++}
+
+- PRINTK(("raid1_error called\n"));
++static int raid1_error (mddev_t *mddev, kdev_t dev)
++{
++ raid1_conf_t *conf = mddev_to_conf(mddev);
++ struct mirror_info * mirrors = conf->mirrors;
++ int disks = MD_SB_DISKS;
++ int i;
+
+- if (raid_conf->working_disks == 1) {
++ if (conf->working_disks == 1) {
+ /*
+ * Uh oh, we can do nothing if this is our last disk, but
+ * first check if this is a queued request for a device
+ * which has just failed.
+ */
+- for (i = 0, mirror = raid_conf->mirrors; i < disks;
+- i++, mirror++)
+- if (mirror->dev == dev && !mirror->operational)
++ for (i = 0; i < disks; i++) {
++ if (mirrors[i].dev==dev && !mirrors[i].operational)
+ return 0;
++ }
+ printk (LAST_DISK);
+ } else {
+- /* Mark disk as unusable */
+- for (i = 0, mirror = raid_conf->mirrors; i < disks;
+- i++, mirror++) {
+- if (mirror->dev == dev && mirror->operational){
+- mirror->operational = 0;
+- raid1_fix_links (raid_conf, i);
+- sb->disks[mirror->number].state |=
+- (1 << MD_FAULTY_DEVICE);
+- sb->disks[mirror->number].state &=
+- ~(1 << MD_SYNC_DEVICE);
+- sb->disks[mirror->number].state &=
+- ~(1 << MD_ACTIVE_DEVICE);
+- sb->active_disks--;
+- sb->working_disks--;
+- sb->failed_disks++;
+- mddev->sb_dirty = 1;
+- md_wakeup_thread(raid1_thread);
+- raid_conf->working_disks--;
+- printk (DISK_FAILED, kdevname (dev),
+- raid_conf->working_disks);
++ /*
++ * Mark disk as unusable
++ */
++ for (i = 0; i < disks; i++) {
++ if (mirrors[i].dev==dev && mirrors[i].operational) {
++ mark_disk_bad (mddev, i);
++ break;
+ }
+ }
+ }
+@@ -442,219 +446,396 @@
+ #undef START_SYNCING
+
+ /*
+- * This is the personality-specific hot-addition routine
++ * Insert the spare disk into the drive-ring
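++ * (it is spliced in after the first mirror that is
++ * operational and not write-only, so the read balancer
++ * can reach it)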
+ */
++static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror)
++{
++ int j, next;
++ int disks = MD_SB_DISKS;
++ struct mirror_info *p = conf->mirrors;
++
++ for (j = 0; j < disks; j++, p++)
++ if (p->operational && !p->write_only) {
++ next = p->next;
++ p->next = mirror->raid_disk;
++ mirror->next = next;
++ return;
++ }
+
+-#define NO_SUPERBLOCK KERN_ERR \
+-"raid1: cannot hot-add disk to the array with no RAID superblock\n"
++ printk("raid1: bug: no read-operational devices\n");
++}
+
+-#define WRONG_LEVEL KERN_ERR \
+-"raid1: hot-add: level of disk is not RAID-1\n"
++static void print_raid1_conf (raid1_conf_t *conf)
++{
++ int i;
++ struct mirror_info *tmp;
+
+-#define HOT_ADD_SUCCEEDED KERN_INFO \
+-"raid1: device %s hot-added\n"
++ printk("RAID1 conf printout:\n");
++ if (!conf) {
++ printk("(conf==NULL)\n");
++ return;
++ }
++ printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
++ conf->raid_disks, conf->nr_disks);
+
+-static int raid1_hot_add_disk (struct md_dev *mddev, kdev_t dev)
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ tmp = conf->mirrors + i;
++ printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
++ i, tmp->spare,tmp->operational,
++ tmp->number,tmp->raid_disk,tmp->used_slot,
++ partition_name(tmp->dev));
++ }
++}
++
++static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
+ {
++ int err = 0;
++ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
++ raid1_conf_t *conf = mddev->private;
++ struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
+ unsigned long flags;
+- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
+- struct mirror_info *mirror;
+- md_superblock_t *sb = mddev->sb;
+- struct real_dev * realdev;
+- int n;
++ mdp_super_t *sb = mddev->sb;
++ mdp_disk_t *failed_desc, *spare_desc, *added_desc;
++
++ save_flags(flags);
++ cli();
+
++ print_raid1_conf(conf);
+ /*
+- * The device has its superblock already read and it was found
+- * to be consistent for generic RAID usage. Now we check whether
+- * it's usable for RAID-1 hot addition.
++ * find the disk ...
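++ * (the first switch only locates the slot indices
++ * involved, the second switch at the end performs the
++ * actual state change)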
+ */
++ switch (state) {
+
+- n = mddev->nb_dev++;
+- realdev = &mddev->devices[n];
+- if (!realdev->sb) {
+- printk (NO_SUPERBLOCK);
+- return -EINVAL;
+- }
+- if (realdev->sb->level != 1) {
+- printk (WRONG_LEVEL);
+- return -EINVAL;
++ case DISKOP_SPARE_ACTIVE:
++
++ /*
++ * Find the failed disk within the RAID1 configuration ...
++ * (this can only be in the first conf->working_disks part)
++ */
++ for (i = 0; i < conf->raid_disks; i++) {
++ tmp = conf->mirrors + i;
++ if ((!tmp->operational && !tmp->spare) ||
++ !tmp->used_slot) {
++ failed_disk = i;
++ break;
++ }
++ }
++ /*
++ * When we activate a spare disk we _must_ have a disk in
++ * the lower (active) part of the array to replace.
++ */
++ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ /* fall through */
++
++ case DISKOP_SPARE_WRITE:
++ case DISKOP_SPARE_INACTIVE:
++
++ /*
++ * Find the spare disk ... (can only be in the 'high'
++ * area of the array)
++ */
++ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
++ tmp = conf->mirrors + i;
++ if (tmp->spare && tmp->number == (*d)->number) {
++ spare_disk = i;
++ break;
++ }
++ }
++ if (spare_disk == -1) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ break;
++
++ case DISKOP_HOT_REMOVE_DISK:
++
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ tmp = conf->mirrors + i;
++ if (tmp->used_slot && (tmp->number == (*d)->number)) {
++ if (tmp->operational) {
++ err = -EBUSY;
++ goto abort;
++ }
++ removed_disk = i;
++ break;
++ }
++ }
++ if (removed_disk == -1) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ break;
++
++ case DISKOP_HOT_ADD_DISK:
++
++ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
++ tmp = conf->mirrors + i;
++ if (!tmp->used_slot) {
++ added_disk = i;
++ break;
++ }
++ }
++ if (added_disk == -1) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ break;
+ }
+- /* FIXME: are there other things left we could sanity-check? */
+
++ switch (state) {
+ /*
+- * We have to disable interrupts, as our RAID-1 state is used
+- * from irq handlers as well.
++ * Switch the spare disk to write-only mode:
+ */
+- save_flags(flags);
+- cli();
++ case DISKOP_SPARE_WRITE:
++ sdisk = conf->mirrors + spare_disk;
++ sdisk->operational = 1;
++ sdisk->write_only = 1;
++ break;
++ /*
++ * Deactivate a spare disk:
++ */
++ case DISKOP_SPARE_INACTIVE:
++ sdisk = conf->mirrors + spare_disk;
++ sdisk->operational = 0;
++ sdisk->write_only = 0;
++ break;
++ /*
++ * Activate (mark read-write) the (now sync) spare disk,
++ * which means we switch it's 'raid position' (->raid_disk)
++ * with the failed disk. (only the first 'conf->nr_disks'
++ * slots are used for 'real' disks and we must preserve this
++ * property)
++ */
++ case DISKOP_SPARE_ACTIVE:
+
+- raid_conf->raid_disks++;
+- mirror = raid_conf->mirrors+n;
++ sdisk = conf->mirrors + spare_disk;
++ fdisk = conf->mirrors + failed_disk;
+
+- mirror->number=n;
+- mirror->raid_disk=n;
+- mirror->dev=dev;
+- mirror->next=0; /* FIXME */
+- mirror->sect_limit=128;
+-
+- mirror->operational=0;
+- mirror->spare=1;
+- mirror->write_only=0;
+-
+- sb->disks[n].state |= (1 << MD_FAULTY_DEVICE);
+- sb->disks[n].state &= ~(1 << MD_SYNC_DEVICE);
+- sb->disks[n].state &= ~(1 << MD_ACTIVE_DEVICE);
+- sb->nr_disks++;
+- sb->spare_disks++;
++ spare_desc = &sb->disks[sdisk->number];
++ failed_desc = &sb->disks[fdisk->number];
+
+- restore_flags(flags);
++ if (spare_desc != *d) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
+
+- md_update_sb(MINOR(dev));
++ if (spare_desc->raid_disk != sdisk->raid_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ if (sdisk->raid_disk != spare_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
+
+- printk (HOT_ADD_SUCCEEDED, kdevname(realdev->dev));
++ if (failed_desc->raid_disk != fdisk->raid_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
+
+- return 0;
+-}
++ if (fdisk->raid_disk != failed_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
+
+-#undef NO_SUPERBLOCK
+-#undef WRONG_LEVEL
+-#undef HOT_ADD_SUCCEEDED
++ /*
++ * do the switch finally
++ */
++ xchg_values(*spare_desc, *failed_desc);
++ xchg_values(*fdisk, *sdisk);
+
+-/*
+- * Insert the spare disk into the drive-ring
+- */
+-static void add_ring(struct raid1_data *raid_conf, struct mirror_info *mirror)
+-{
+- int j, next;
+- struct mirror_info *p = raid_conf->mirrors;
++ /*
++ * (careful, 'failed' and 'spare' are switched from now on)
++ *
++ * we want to preserve linear numbering and we want to
++ * give the proper raid_disk number to the now activated
++ * disk. (this means we switch back these values)
++ */
++
++ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
++ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
++ xchg_values(spare_desc->number, failed_desc->number);
++ xchg_values(sdisk->number, fdisk->number);
+
+- for (j = 0; j < raid_conf->raid_disks; j++, p++)
+- if (p->operational && !p->write_only) {
+- next = p->next;
+- p->next = mirror->raid_disk;
+- mirror->next = next;
+- return;
+- }
+- printk("raid1: bug: no read-operational devices\n");
+-}
++ *d = failed_desc;
+
+-static int raid1_mark_spare(struct md_dev *mddev, md_descriptor_t *spare,
+- int state)
+-{
+- int i = 0, failed_disk = -1;
+- struct raid1_data *raid_conf = mddev->private;
+- struct mirror_info *mirror = raid_conf->mirrors;
+- md_descriptor_t *descriptor;
+- unsigned long flags;
++ if (sdisk->dev == MKDEV(0,0))
++ sdisk->used_slot = 0;
++ /*
++ * this really activates the spare.
++ */
++ fdisk->spare = 0;
++ fdisk->write_only = 0;
++ link_disk(conf, fdisk);
+
+- for (i = 0; i < MD_SB_DISKS; i++, mirror++) {
+- if (mirror->spare && mirror->number == spare->number)
+- goto found;
+- }
+- return 1;
+-found:
+- for (i = 0, mirror = raid_conf->mirrors; i < raid_conf->raid_disks;
+- i++, mirror++)
+- if (!mirror->operational)
+- failed_disk = i;
++ /*
++ * if we activate a spare, we definitely replace a
++ * non-operational disk slot in the 'low' area of
++ * the disk array.
++ */
+
+- save_flags(flags);
+- cli();
+- switch (state) {
+- case SPARE_WRITE:
+- mirror->operational = 1;
+- mirror->write_only = 1;
+- raid_conf->raid_disks = MAX(raid_conf->raid_disks,
+- mirror->raid_disk + 1);
+- break;
+- case SPARE_INACTIVE:
+- mirror->operational = 0;
+- mirror->write_only = 0;
+- break;
+- case SPARE_ACTIVE:
+- mirror->spare = 0;
+- mirror->write_only = 0;
+- raid_conf->working_disks++;
+- add_ring(raid_conf, mirror);
+-
+- if (failed_disk != -1) {
+- descriptor = &mddev->sb->disks[raid_conf->mirrors[failed_disk].number];
+- i = spare->raid_disk;
+- spare->raid_disk = descriptor->raid_disk;
+- descriptor->raid_disk = i;
+- }
+- break;
+- default:
+- printk("raid1_mark_spare: bug: state == %d\n", state);
+- restore_flags(flags);
+- return 1;
++ conf->working_disks++;
++
++ break;
++
++ case DISKOP_HOT_REMOVE_DISK:
++ rdisk = conf->mirrors + removed_disk;
++
++ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ rdisk->dev = MKDEV(0,0);
++ rdisk->used_slot = 0;
++ conf->nr_disks--;
++ break;
++
++ case DISKOP_HOT_ADD_DISK:
++ adisk = conf->mirrors + added_disk;
++ added_desc = *d;
++
++ if (added_disk != added_desc->number) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ adisk->number = added_desc->number;
++ adisk->raid_disk = added_desc->raid_disk;
++ adisk->dev = MKDEV(added_desc->major,added_desc->minor);
++
++ adisk->operational = 0;
++ adisk->write_only = 0;
++ adisk->spare = 1;
++ adisk->used_slot = 1;
++ conf->nr_disks++;
++
++ break;
++
++ default:
++ MD_BUG();
++ err = 1;
++ goto abort;
+ }
++abort:
+ restore_flags(flags);
+- return 0;
++ print_raid1_conf(conf);
++ return err;
+ }
+
++
++#define IO_ERROR KERN_ALERT \
++"raid1: %s: unrecoverable I/O read error for block %lu\n"
++
++#define REDIRECT_SECTOR KERN_ERR \
++"raid1: %s: redirecting sector %lu to another mirror\n"
++
+ /*
+ * This is a kernel thread which:
+ *
+ * 1. Retries failed read operations on working mirrors.
+ * 2. Updates the raid superblock when problems are encountered.
+ */
+-void raid1d (void *data)
++static void raid1d (void *data)
+ {
+ struct buffer_head *bh;
+ kdev_t dev;
+ unsigned long flags;
+- struct raid1_bh * r1_bh;
+- struct md_dev *mddev;
++ struct raid1_bh *r1_bh;
++ mddev_t *mddev;
+
+- PRINTK(("raid1d() active\n"));
+- save_flags(flags);
+- cli();
+ while (raid1_retry_list) {
++ save_flags(flags);
++ cli();
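++ /*
++ * the retry list is also modified from interrupt
++ * context (raid1_reschedule_retry), so unlink the
++ * head entry atomically.
++ */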
+ bh = raid1_retry_list;
+ r1_bh = (struct raid1_bh *)(bh->b_dev_id);
+ raid1_retry_list = r1_bh->next_retry;
+ restore_flags(flags);
+
+- mddev = md_dev + MINOR(bh->b_dev);
++ mddev = kdev_to_mddev(bh->b_dev);
+ if (mddev->sb_dirty) {
+- printk("dirty sb detected, updating.\n");
++ printk(KERN_INFO "dirty sb detected, updating.\n");
+ mddev->sb_dirty = 0;
+- md_update_sb(MINOR(bh->b_dev));
++ md_update_sb(mddev);
+ }
+ dev = bh->b_rdev;
+- __raid1_map (md_dev + MINOR(bh->b_dev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
++ __raid1_map (mddev, &bh->b_rdev, &bh->b_rsector,
++ bh->b_size >> 9);
+ if (bh->b_rdev == dev) {
+- printk (KERN_ALERT
+- "raid1: %s: unrecoverable I/O read error for block %lu\n",
+- kdevname(bh->b_dev), bh->b_blocknr);
+- raid1_end_buffer_io(r1_bh, 0);
++ printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
++ raid1_end_bh_io(r1_bh, 0);
+ } else {
+- printk (KERN_ERR "raid1: %s: redirecting sector %lu to another mirror\n",
+- kdevname(bh->b_dev), bh->b_blocknr);
++ printk (REDIRECT_SECTOR,
++ partition_name(bh->b_dev), bh->b_blocknr);
+ map_and_make_request (r1_bh->cmd, bh);
+ }
+- cli();
+ }
+- restore_flags(flags);
++}
++#undef IO_ERROR
++#undef REDIRECT_SECTOR
++
++/*
++ * Private kernel thread to reconstruct mirrors after an unclean
++ * shutdown.
++ */
++static void raid1syncd (void *data)
++{
++ raid1_conf_t *conf = data;
++ mddev_t *mddev = conf->mddev;
++
++ if (!conf->resync_mirrors)
++ return;
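++ /*
++ * resync_mirrors == 2 means the resync was interrupted
++ * by raid1_stop_resync() -- do not restart it here,
++ * it will be retried on the next array start.
++ */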
++ if (conf->resync_mirrors == 2)
++ return;
++ down(&mddev->recovery_sem);
++ if (md_do_sync(mddev, NULL)) {
++ up(&mddev->recovery_sem);
++ return;
++ }
++ /*
++ * Only if everything went Ok.
++ */
++ conf->resync_mirrors = 0;
++ up(&mddev->recovery_sem);
+ }
+
++
+ /*
+ * This will catch the scenario in which one of the mirrors was
+ * mounted as a normal device rather than as a part of a raid set.
++ *
++ * check_consistency is very personality-dependent; e.g. RAID5 cannot
++ * do this check and uses another method.
+ */
+-static int __check_consistency (struct md_dev *mddev, int row)
++static int __check_consistency (mddev_t *mddev, int row)
+ {
+- struct raid1_data *raid_conf = mddev->private;
++ raid1_conf_t *conf = mddev_to_conf(mddev);
++ int disks = MD_SB_DISKS;
+ kdev_t dev;
+ struct buffer_head *bh = NULL;
+ int i, rc = 0;
+ char *buffer = NULL;
+
+- for (i = 0; i < raid_conf->raid_disks; i++) {
+- if (!raid_conf->mirrors[i].operational)
++ for (i = 0; i < disks; i++) {
++ printk("(checking disk %d)\n",i);
++ if (!conf->mirrors[i].operational)
+ continue;
+- dev = raid_conf->mirrors[i].dev;
++ printk("(really checking disk %d)\n",i);
++ dev = conf->mirrors[i].dev;
+ set_blocksize(dev, 4096);
+ if ((bh = bread(dev, row / 4, 4096)) == NULL)
+ break;
+@@ -683,167 +864,342 @@
+ return rc;
+ }
+
+-static int check_consistency (struct md_dev *mddev)
++static int check_consistency (mddev_t *mddev)
+ {
+- int size = mddev->sb->size;
+- int row;
++ if (__check_consistency(mddev, 0))
++/*
++ * we do not do this currently, as it's perfectly possible to
++ * have an inconsistent array when it's freshly created. Only
++ * newly written data has to be consistent.
++ */
++ return 0;
+
+- for (row = 0; row < size; row += size / 8)
+- if (__check_consistency(mddev, row))
+- return 1;
+ return 0;
+ }
+
+-static int raid1_run (int minor, struct md_dev *mddev)
++#define INVALID_LEVEL KERN_WARNING \
++"raid1: md%d: raid level not set to mirroring (%d)\n"
++
++#define NO_SB KERN_ERR \
++"raid1: disabled mirror %s (couldn't access raid superblock)\n"
++
++#define ERRORS KERN_ERR \
++"raid1: disabled mirror %s (errors detected)\n"
++
++#define NOT_IN_SYNC KERN_ERR \
++"raid1: disabled mirror %s (not in sync)\n"
++
++#define INCONSISTENT KERN_ERR \
++"raid1: disabled mirror %s (inconsistent descriptor)\n"
++
++#define ALREADY_RUNNING KERN_ERR \
++"raid1: disabled mirror %s (mirror %d already operational)\n"
++
++#define OPERATIONAL KERN_INFO \
++"raid1: device %s operational as mirror %d\n"
++
++#define MEM_ERROR KERN_ERR \
++"raid1: couldn't allocate memory for md%d\n"
++
++#define SPARE KERN_INFO \
++"raid1: spare disk %s\n"
++
++#define NONE_OPERATIONAL KERN_ERR \
++"raid1: no operational mirrors for md%d\n"
++
++#define RUNNING_CKRAID KERN_ERR \
++"raid1: detected mirror differences -- running resync\n"
++
++#define ARRAY_IS_ACTIVE KERN_INFO \
++"raid1: raid set md%d active with %d out of %d mirrors\n"
++
++#define THREAD_ERROR KERN_ERR \
++"raid1: couldn't allocate thread for md%d\n"
++
++#define START_RESYNC KERN_WARNING \
++"raid1: raid set md%d not clean; reconstructing mirrors\n"
++
++static int raid1_run (mddev_t *mddev)
+ {
+- struct raid1_data *raid_conf;
+- int i, j, raid_disk;
+- md_superblock_t *sb = mddev->sb;
+- md_descriptor_t *descriptor;
+- struct real_dev *realdev;
++ raid1_conf_t *conf;
++ int i, j, disk_idx;
++ struct mirror_info *disk;
++ mdp_super_t *sb = mddev->sb;
++ mdp_disk_t *descriptor;
++ mdk_rdev_t *rdev;
++ struct md_list_head *tmp;
++ int start_recovery = 0;
+
+ MOD_INC_USE_COUNT;
+
+ if (sb->level != 1) {
+- printk("raid1: %s: raid level not set to mirroring (%d)\n",
+- kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
+- MOD_DEC_USE_COUNT;
+- return -EIO;
+- }
+- /****
+- * copy the now verified devices into our private RAID1 bookkeeping
+- * area. [whatever we allocate in raid1_run(), should be freed in
+- * raid1_stop()]
++ printk(INVALID_LEVEL, mdidx(mddev), sb->level);
++ goto out;
++ }
++ /*
++ * copy the already verified devices into our private RAID1
++ * bookkeeping area. [whatever we allocate in raid1_run(),
++ * should be freed in raid1_stop()]
+ */
+
+- while (!( /* FIXME: now we are rather fault tolerant than nice */
+- mddev->private = kmalloc (sizeof (struct raid1_data), GFP_KERNEL)
+- ) )
+- {
+- printk ("raid1_run(): out of memory\n");
+- current->policy |= SCHED_YIELD;
+- schedule();
+- }
+- raid_conf = mddev->private;
+- memset(raid_conf, 0, sizeof(*raid_conf));
+-
+- PRINTK(("raid1_run(%d) called.\n", minor));
+-
+- for (i = 0; i < mddev->nb_dev; i++) {
+- realdev = &mddev->devices[i];
+- if (!realdev->sb) {
+- printk(KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
++ conf = raid1_kmalloc(sizeof(raid1_conf_t));
++ mddev->private = conf;
++ if (!conf) {
++ printk(MEM_ERROR, mdidx(mddev));
++ goto out;
++ }
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->faulty) {
++ printk(ERRORS, partition_name(rdev->dev));
++ } else {
++ if (!rdev->sb) {
++ MD_BUG();
++ continue;
++ }
++ }
++ if (rdev->desc_nr == -1) {
++ MD_BUG();
+ continue;
+ }
+-
+- /*
+- * This is important -- we are using the descriptor on
+- * the disk only to get a pointer to the descriptor on
+- * the main superblock, which might be more recent.
+- */
+- descriptor = &sb->disks[realdev->sb->descriptor.number];
+- if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
+- printk(KERN_ERR "raid1: disabled mirror %s (errors detected)\n", kdevname(realdev->dev));
++ descriptor = &sb->disks[rdev->desc_nr];
++ disk_idx = descriptor->raid_disk;
++ disk = conf->mirrors + disk_idx;
++
++ if (disk_faulty(descriptor)) {
++ disk->number = descriptor->number;
++ disk->raid_disk = disk_idx;
++ disk->dev = rdev->dev;
++ disk->sect_limit = MAX_LINEAR_SECTORS;
++ disk->operational = 0;
++ disk->write_only = 0;
++ disk->spare = 0;
++ disk->used_slot = 1;
+ continue;
+ }
+- if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
+- if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
+- printk(KERN_ERR "raid1: disabled mirror %s (not in sync)\n", kdevname(realdev->dev));
++ if (disk_active(descriptor)) {
++ if (!disk_sync(descriptor)) {
++ printk(NOT_IN_SYNC,
++ partition_name(rdev->dev));
+ continue;
+ }
+- raid_disk = descriptor->raid_disk;
+- if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
+- printk(KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n", kdevname(realdev->dev));
++ if ((descriptor->number > MD_SB_DISKS) ||
++ (disk_idx > sb->raid_disks)) {
++
++ printk(INCONSISTENT,
++ partition_name(rdev->dev));
+ continue;
+ }
+- if (raid_conf->mirrors[raid_disk].operational) {
+- printk(KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n", kdevname(realdev->dev), raid_disk);
++ if (disk->operational) {
++ printk(ALREADY_RUNNING,
++ partition_name(rdev->dev),
++ disk_idx);
+ continue;
+ }
+- printk(KERN_INFO "raid1: device %s operational as mirror %d\n", kdevname(realdev->dev), raid_disk);
+- raid_conf->mirrors[raid_disk].number = descriptor->number;
+- raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
+- raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
+- raid_conf->mirrors[raid_disk].operational = 1;
+- raid_conf->mirrors[raid_disk].sect_limit = 128;
+- raid_conf->working_disks++;
++ printk(OPERATIONAL, partition_name(rdev->dev),
++ disk_idx);
++ disk->number = descriptor->number;
++ disk->raid_disk = disk_idx;
++ disk->dev = rdev->dev;
++ disk->sect_limit = MAX_LINEAR_SECTORS;
++ disk->operational = 1;
++ disk->write_only = 0;
++ disk->spare = 0;
++ disk->used_slot = 1;
++ conf->working_disks++;
+ } else {
+ /*
+ * Must be a spare disk ..
+ */
+- printk(KERN_INFO "raid1: spare disk %s\n", kdevname(realdev->dev));
+- raid_disk = descriptor->raid_disk;
+- raid_conf->mirrors[raid_disk].number = descriptor->number;
+- raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
+- raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
+- raid_conf->mirrors[raid_disk].sect_limit = 128;
+-
+- raid_conf->mirrors[raid_disk].operational = 0;
+- raid_conf->mirrors[raid_disk].write_only = 0;
+- raid_conf->mirrors[raid_disk].spare = 1;
+- }
+- }
+- if (!raid_conf->working_disks) {
+- printk(KERN_ERR "raid1: no operational mirrors for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
+- kfree(raid_conf);
+- mddev->private = NULL;
+- MOD_DEC_USE_COUNT;
+- return -EIO;
+- }
+-
+- raid_conf->raid_disks = sb->raid_disks;
+- raid_conf->mddev = mddev;
+-
+- for (j = 0; !raid_conf->mirrors[j].operational; j++);
+- raid_conf->last_used = j;
+- for (i = raid_conf->raid_disks - 1; i >= 0; i--) {
+- if (raid_conf->mirrors[i].operational) {
+- PRINTK(("raid_conf->mirrors[%d].next == %d\n", i, j));
+- raid_conf->mirrors[i].next = j;
++ printk(SPARE, partition_name(rdev->dev));
++ disk->number = descriptor->number;
++ disk->raid_disk = disk_idx;
++ disk->dev = rdev->dev;
++ disk->sect_limit = MAX_LINEAR_SECTORS;
++ disk->operational = 0;
++ disk->write_only = 0;
++ disk->spare = 1;
++ disk->used_slot = 1;
++ }
++ }
++ if (!conf->working_disks) {
++ printk(NONE_OPERATIONAL, mdidx(mddev));
++ goto out_free_conf;
++ }
++
++ conf->raid_disks = sb->raid_disks;
++ conf->nr_disks = sb->nr_disks;
++ conf->mddev = mddev;
++
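++ /*
++ * second pass: fill in mirror slots for failed disks that
++ * are in the superblock but no longer attached as rdevs,
++ * so the mirror table matches the superblock.
++ */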
++ for (i = 0; i < MD_SB_DISKS; i++) {
++
++ descriptor = sb->disks+i;
++ disk_idx = descriptor->raid_disk;
++ disk = conf->mirrors + disk_idx;
++
++ if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
++ !disk->used_slot) {
++
++ disk->number = descriptor->number;
++ disk->raid_disk = disk_idx;
++ disk->dev = MKDEV(0,0);
++
++ disk->operational = 0;
++ disk->write_only = 0;
++ disk->spare = 0;
++ disk->used_slot = 1;
++ }
++ }
++
++ /*
++ * find the first working one and use it as a starting point
++ * to read balancing.
++ */
++ for (j = 0; !conf->mirrors[j].operational; j++)
++ /* nothing */;
++ conf->last_used = j;
++
++ /*
++ * initialize the 'working disks' list.
++ */
++ for (i = conf->raid_disks - 1; i >= 0; i--) {
++ if (conf->mirrors[i].operational) {
++ conf->mirrors[i].next = j;
+ j = i;
+ }
+ }
+
+- if (check_consistency(mddev)) {
+- printk(KERN_ERR "raid1: detected mirror differences -- run ckraid\n");
+- sb->state |= 1 << MD_SB_ERRORS;
+- kfree(raid_conf);
+- mddev->private = NULL;
+- MOD_DEC_USE_COUNT;
+- return -EIO;
++ if (conf->working_disks != sb->raid_disks) {
++ printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
++ start_recovery = 1;
+ }
+
++ if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
++ /*
++ * we do sanity checks even if the device says
++ * it's clean ...
++ */
++ if (check_consistency(mddev)) {
++ printk(RUNNING_CKRAID);
++ sb->state &= ~(1 << MD_SB_CLEAN);
++ }
++ }
++
++ {
++ const char * name = "raid1d";
++
++ conf->thread = md_register_thread(raid1d, conf, name);
++ if (!conf->thread) {
++ printk(THREAD_ERROR, mdidx(mddev));
++ goto out_free_conf;
++ }
++ }
++
++ if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
++ const char * name = "raid1syncd";
++
++ conf->resync_thread = md_register_thread(raid1syncd, conf,name);
++ if (!conf->resync_thread) {
++ printk(THREAD_ERROR, mdidx(mddev));
++ goto out_free_conf;
++ }
++
++ printk(START_RESYNC, mdidx(mddev));
++ conf->resync_mirrors = 1;
++ md_wakeup_thread(conf->resync_thread);
++ }
++
+ /*
+ * Regenerate the "device is in sync with the raid set" bit for
+ * each device.
+ */
+- for (i = 0; i < sb->nr_disks ; i++) {
+- sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ mark_disk_nonsync(sb->disks+i);
+ for (j = 0; j < sb->raid_disks; j++) {
+- if (!raid_conf->mirrors[j].operational)
++ if (!conf->mirrors[j].operational)
+ continue;
+- if (sb->disks[i].number == raid_conf->mirrors[j].number)
+- sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
++ if (sb->disks[i].number == conf->mirrors[j].number)
++ mark_disk_sync(sb->disks+i);
+ }
+ }
+- sb->active_disks = raid_conf->working_disks;
++ sb->active_disks = conf->working_disks;
+
+- printk("raid1: raid set %s active with %d out of %d mirrors\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks);
+- /* Ok, everything is just fine now */
+- return (0);
++ if (start_recovery)
++ md_recover_arrays();
++
++
++ printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
++ /*
++ * Ok, everything is just fine now
++ */
++ return 0;
++
++out_free_conf:
++ kfree(conf);
++ mddev->private = NULL;
++out:
++ MOD_DEC_USE_COUNT;
++ return -EIO;
++}
++
++#undef INVALID_LEVEL
++#undef NO_SB
++#undef ERRORS
++#undef NOT_IN_SYNC
++#undef INCONSISTENT
++#undef ALREADY_RUNNING
++#undef OPERATIONAL
++#undef SPARE
++#undef NONE_OPERATIONAL
++#undef RUNNING_CKRAID
++#undef ARRAY_IS_ACTIVE
++
++static int raid1_stop_resync (mddev_t *mddev)
++{
++ raid1_conf_t *conf = mddev_to_conf(mddev);
++
++ if (conf->resync_thread) {
++ if (conf->resync_mirrors) {
++ conf->resync_mirrors = 2;
++ md_interrupt_thread(conf->resync_thread);
++ printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
++ return 1;
++ }
++ return 0;
++ }
++ return 0;
++}
++
++static int raid1_restart_resync (mddev_t *mddev)
++{
++ raid1_conf_t *conf = mddev_to_conf(mddev);
++
++ if (conf->resync_mirrors) {
++ if (!conf->resync_thread) {
++ MD_BUG();
++ return 0;
++ }
++ conf->resync_mirrors = 1;
++ md_wakeup_thread(conf->resync_thread);
++ return 1;
++ }
++ return 0;
+ }
+
+-static int raid1_stop (int minor, struct md_dev *mddev)
++static int raid1_stop (mddev_t *mddev)
+ {
+- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
++ raid1_conf_t *conf = mddev_to_conf(mddev);
+
+- kfree (raid_conf);
++ md_unregister_thread(conf->thread);
++ if (conf->resync_thread)
++ md_unregister_thread(conf->resync_thread);
++ kfree(conf);
+ mddev->private = NULL;
+ MOD_DEC_USE_COUNT;
+ return 0;
+ }
+
+-static struct md_personality raid1_personality=
++static mdk_personality_t raid1_personality=
+ {
+ "raid1",
+ raid1_map,
+@@ -855,15 +1211,13 @@
+ NULL, /* no ioctls */
+ 0,
+ raid1_error,
+- raid1_hot_add_disk,
+- /* raid1_hot_remove_drive */ NULL,
+- raid1_mark_spare
++ raid1_diskop,
++ raid1_stop_resync,
++ raid1_restart_resync
+ };
+
+ int raid1_init (void)
+ {
+- if ((raid1_thread = md_register_thread(raid1d, NULL)) == NULL)
+- return -EBUSY;
+ return register_md_personality (RAID1, &raid1_personality);
+ }
+
+@@ -875,7 +1229,6 @@
+
+ void cleanup_module (void)
+ {
+- md_unregister_thread (raid1_thread);
+ unregister_md_personality (RAID1);
+ }
+ #endif
+diff -uNr linux-orig/drivers/block/raid5.c linux/drivers/block/raid5.c
+--- linux-orig/drivers/block/raid5.c Fri May 8 00:17:13 1998
++++ linux/drivers/block/raid5.c Fri Dec 8 22:54:52 2000
+@@ -1,4 +1,4 @@
+-/*****************************************************************************
++/*
+ * raid5.c : Multiple Devices driver for Linux
+ * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+@@ -14,16 +14,15 @@
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
++
+ #include <linux/module.h>
+ #include <linux/locks.h>
+ #include <linux/malloc.h>
+-#include <linux/md.h>
+-#include <linux/raid5.h>
++#include <linux/raid/raid5.h>
+ #include <asm/bitops.h>
+ #include <asm/atomic.h>
+-#include <asm/md.h>
+
+-static struct md_personality raid5_personality;
++static mdk_personality_t raid5_personality;
+
+ /*
+ * Stripe cache
+@@ -33,7 +32,7 @@
+ #define HASH_PAGES_ORDER 0
+ #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
+ #define HASH_MASK (NR_HASH - 1)
+-#define stripe_hash(raid_conf, sect, size) ((raid_conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
++#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
+
+ /*
+ * The following can be used to debug the driver
+@@ -46,6 +45,8 @@
+ #define PRINTK(x) do { ; } while (0)
+ #endif
+
++static void print_raid5_conf (raid5_conf_t *conf);
++
+ static inline int stripe_locked(struct stripe_head *sh)
+ {
+ return test_bit(STRIPE_LOCKED, &sh->state);
+@@ -61,32 +62,32 @@
+ */
+ static inline void lock_stripe(struct stripe_head *sh)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- if (!test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
++ raid5_conf_t *conf = sh->raid_conf;
++ if (!md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
+ PRINTK(("locking stripe %lu\n", sh->sector));
+- raid_conf->nr_locked_stripes++;
++ conf->nr_locked_stripes++;
+ }
+ }
+
+ static inline void unlock_stripe(struct stripe_head *sh)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- if (test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
++ raid5_conf_t *conf = sh->raid_conf;
++ if (md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
+ PRINTK(("unlocking stripe %lu\n", sh->sector));
+- raid_conf->nr_locked_stripes--;
++ conf->nr_locked_stripes--;
+ wake_up(&sh->wait);
+ }
+ }
+
+ static inline void finish_stripe(struct stripe_head *sh)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
++ raid5_conf_t *conf = sh->raid_conf;
+ unlock_stripe(sh);
+ sh->cmd = STRIPE_NONE;
+ sh->phase = PHASE_COMPLETE;
+- raid_conf->nr_pending_stripes--;
+- raid_conf->nr_cached_stripes++;
+- wake_up(&raid_conf->wait_for_stripe);
++ conf->nr_pending_stripes--;
++ conf->nr_cached_stripes++;
++ wake_up(&conf->wait_for_stripe);
+ }
+
+ void __wait_on_stripe(struct stripe_head *sh)
+@@ -114,7 +115,7 @@
+ __wait_on_stripe(sh);
+ }
+
+-static inline void remove_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
++static inline void remove_hash(raid5_conf_t *conf, struct stripe_head *sh)
+ {
+ PRINTK(("remove_hash(), stripe %lu\n", sh->sector));
+
+@@ -123,21 +124,22 @@
+ sh->hash_next->hash_pprev = sh->hash_pprev;
+ *sh->hash_pprev = sh->hash_next;
+ sh->hash_pprev = NULL;
+- raid_conf->nr_hashed_stripes--;
++ conf->nr_hashed_stripes--;
+ }
+ }
+
+-static inline void insert_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
++static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
+ {
+- struct stripe_head **shp = &stripe_hash(raid_conf, sh->sector, sh->size);
++ struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size);
+
+- PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", sh->sector, raid_conf->nr_hashed_stripes));
++ PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n",
++ sh->sector, conf->nr_hashed_stripes));
+
+ if ((sh->hash_next = *shp) != NULL)
+ (*shp)->hash_pprev = &sh->hash_next;
+ *shp = sh;
+ sh->hash_pprev = shp;
+- raid_conf->nr_hashed_stripes++;
++ conf->nr_hashed_stripes++;
+ }
+
+ static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size)
+@@ -145,13 +147,15 @@
+ struct buffer_head *bh;
+ unsigned long flags;
+
+- save_flags(flags);
+- cli();
+- if ((bh = sh->buffer_pool) == NULL)
+- return NULL;
++ md_spin_lock_irqsave(&sh->stripe_lock, flags);
++ bh = sh->buffer_pool;
++ if (!bh)
++ goto out_unlock;
+ sh->buffer_pool = bh->b_next;
+ bh->b_size = b_size;
+- restore_flags(flags);
++out_unlock:
++ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
++
+ return bh;
+ }
+
+@@ -160,12 +164,14 @@
+ struct buffer_head *bh;
+ unsigned long flags;
+
+- save_flags(flags);
+- cli();
+- if ((bh = sh->bh_pool) == NULL)
+- return NULL;
++ md_spin_lock_irqsave(&sh->stripe_lock, flags);
++ bh = sh->bh_pool;
++ if (!bh)
++ goto out_unlock;
+ sh->bh_pool = bh->b_next;
+- restore_flags(flags);
++out_unlock:
++ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
++
+ return bh;
+ }
+
+@@ -173,54 +179,52 @@
+ {
+ unsigned long flags;
+
+- save_flags(flags);
+- cli();
++ md_spin_lock_irqsave(&sh->stripe_lock, flags);
+ bh->b_next = sh->buffer_pool;
+ sh->buffer_pool = bh;
+- restore_flags(flags);
++ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
+ }
+
+ static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh)
+ {
+ unsigned long flags;
+
+- save_flags(flags);
+- cli();
++ md_spin_lock_irqsave(&sh->stripe_lock, flags);
+ bh->b_next = sh->bh_pool;
+ sh->bh_pool = bh;
+- restore_flags(flags);
++ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
+ }
+
+-static struct stripe_head *get_free_stripe(struct raid5_data *raid_conf)
++static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
+ {
+ struct stripe_head *sh;
+ unsigned long flags;
+
+ save_flags(flags);
+ cli();
+- if ((sh = raid_conf->free_sh_list) == NULL) {
++ if ((sh = conf->free_sh_list) == NULL) {
+ restore_flags(flags);
+ return NULL;
+ }
+- raid_conf->free_sh_list = sh->free_next;
+- raid_conf->nr_free_sh--;
+- if (!raid_conf->nr_free_sh && raid_conf->free_sh_list)
++ conf->free_sh_list = sh->free_next;
++ conf->nr_free_sh--;
++ if (!conf->nr_free_sh && conf->free_sh_list)
+ printk ("raid5: bug: free_sh_list != NULL, nr_free_sh == 0\n");
+ restore_flags(flags);
+- if (sh->hash_pprev || sh->nr_pending || sh->count)
++ if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || sh->count)
+ printk("get_free_stripe(): bug\n");
+ return sh;
+ }
+
+-static void put_free_stripe(struct raid5_data *raid_conf, struct stripe_head *sh)
++static void put_free_stripe(raid5_conf_t *conf, struct stripe_head *sh)
+ {
+ unsigned long flags;
+
+ save_flags(flags);
+ cli();
+- sh->free_next = raid_conf->free_sh_list;
+- raid_conf->free_sh_list = sh;
+- raid_conf->nr_free_sh++;
++ sh->free_next = conf->free_sh_list;
++ conf->free_sh_list = sh;
++ conf->nr_free_sh++;
+ restore_flags(flags);
+ }
+
+@@ -324,8 +328,8 @@
+
+ static void kfree_stripe(struct stripe_head *sh)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- int disks = raid_conf->raid_disks, j;
++ raid5_conf_t *conf = sh->raid_conf;
++ int disks = conf->raid_disks, j;
+
+ PRINTK(("kfree_stripe called, stripe %lu\n", sh->sector));
+ if (sh->phase != PHASE_COMPLETE || stripe_locked(sh) || sh->count) {
+@@ -338,19 +342,19 @@
+ if (sh->bh_new[j] || sh->bh_copy[j])
+ printk("raid5: bug: sector %lu, new %p, copy %p\n", sh->sector, sh->bh_new[j], sh->bh_copy[j]);
+ }
+- remove_hash(raid_conf, sh);
+- put_free_stripe(raid_conf, sh);
++ remove_hash(conf, sh);
++ put_free_stripe(conf, sh);
+ }
+
+-static int shrink_stripe_cache(struct raid5_data *raid_conf, int nr)
++static int shrink_stripe_cache(raid5_conf_t *conf, int nr)
+ {
+ struct stripe_head *sh;
+ int i, count = 0;
+
+- PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, raid_conf->nr_hashed_stripes, raid_conf->clock));
++ PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, conf->nr_hashed_stripes, conf->clock));
+ for (i = 0; i < NR_HASH; i++) {
+ repeat:
+- sh = raid_conf->stripe_hashtbl[(i + raid_conf->clock) & HASH_MASK];
++ sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK];
+ for (; sh; sh = sh->hash_next) {
+ if (sh->phase != PHASE_COMPLETE)
+ continue;
+@@ -360,30 +364,30 @@
+ continue;
+ kfree_stripe(sh);
+ if (++count == nr) {
+- PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
+- raid_conf->clock = (i + raid_conf->clock) & HASH_MASK;
++ PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
++ conf->clock = (i + conf->clock) & HASH_MASK;
+ return nr;
+ }
+ goto repeat;
+ }
+ }
+- PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
++ PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
+ return count;
+ }
+
+-static struct stripe_head *find_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
++static struct stripe_head *find_stripe(raid5_conf_t *conf, unsigned long sector, int size)
+ {
+ struct stripe_head *sh;
+
+- if (raid_conf->buffer_size != size) {
+- PRINTK(("switching size, %d --> %d\n", raid_conf->buffer_size, size));
+- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
+- raid_conf->buffer_size = size;
++ if (conf->buffer_size != size) {
++ PRINTK(("switching size, %d --> %d\n", conf->buffer_size, size));
++ shrink_stripe_cache(conf, conf->max_nr_stripes);
++ conf->buffer_size = size;
+ }
+
+ PRINTK(("find_stripe, sector %lu\n", sector));
+- for (sh = stripe_hash(raid_conf, sector, size); sh; sh = sh->hash_next)
+- if (sh->sector == sector && sh->raid_conf == raid_conf) {
++ for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next)
++ if (sh->sector == sector && sh->raid_conf == conf) {
+ if (sh->size == size) {
+ PRINTK(("found stripe %lu\n", sector));
+ return sh;
+@@ -397,7 +401,7 @@
+ return NULL;
+ }
+
+-static int grow_stripes(struct raid5_data *raid_conf, int num, int priority)
++static int grow_stripes(raid5_conf_t *conf, int num, int priority)
+ {
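++ /*
++ * preallocate 'num' stripe_heads plus their buffer and
++ * bh pools and put them on the free list.
++ */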
+ struct stripe_head *sh;
+
+@@ -405,62 +409,64 @@
+ if ((sh = kmalloc(sizeof(struct stripe_head), priority)) == NULL)
+ return 1;
+ memset(sh, 0, sizeof(*sh));
+- if (grow_buffers(sh, 2 * raid_conf->raid_disks, PAGE_SIZE, priority)) {
+- shrink_buffers(sh, 2 * raid_conf->raid_disks);
++ sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED;
++
++ if (grow_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) {
++ shrink_buffers(sh, 2 * conf->raid_disks);
+ kfree(sh);
+ return 1;
+ }
+- if (grow_bh(sh, raid_conf->raid_disks, priority)) {
+- shrink_buffers(sh, 2 * raid_conf->raid_disks);
+- shrink_bh(sh, raid_conf->raid_disks);
++ if (grow_bh(sh, conf->raid_disks, priority)) {
++ shrink_buffers(sh, 2 * conf->raid_disks);
++ shrink_bh(sh, conf->raid_disks);
+ kfree(sh);
+ return 1;
+ }
+- put_free_stripe(raid_conf, sh);
+- raid_conf->nr_stripes++;
++ put_free_stripe(conf, sh);
++ conf->nr_stripes++;
+ }
+ return 0;
+ }
+
+-static void shrink_stripes(struct raid5_data *raid_conf, int num)
++static void shrink_stripes(raid5_conf_t *conf, int num)
+ {
+ struct stripe_head *sh;
+
+ while (num--) {
+- sh = get_free_stripe(raid_conf);
++ sh = get_free_stripe(conf);
+ if (!sh)
+ break;
+- shrink_buffers(sh, raid_conf->raid_disks * 2);
+- shrink_bh(sh, raid_conf->raid_disks);
++ shrink_buffers(sh, conf->raid_disks * 2);
++ shrink_bh(sh, conf->raid_disks);
+ kfree(sh);
+- raid_conf->nr_stripes--;
++ conf->nr_stripes--;
+ }
+ }
+
+-static struct stripe_head *kmalloc_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
++static struct stripe_head *kmalloc_stripe(raid5_conf_t *conf, unsigned long sector, int size)
+ {
+ struct stripe_head *sh = NULL, *tmp;
+ struct buffer_head *buffer_pool, *bh_pool;
+
+ PRINTK(("kmalloc_stripe called\n"));
+
+- while ((sh = get_free_stripe(raid_conf)) == NULL) {
+- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes / 8);
+- if ((sh = get_free_stripe(raid_conf)) != NULL)
++ while ((sh = get_free_stripe(conf)) == NULL) {
++ shrink_stripe_cache(conf, conf->max_nr_stripes / 8);
++ if ((sh = get_free_stripe(conf)) != NULL)
+ break;
+- if (!raid_conf->nr_pending_stripes)
++ if (!conf->nr_pending_stripes)
+ printk("raid5: bug: nr_free_sh == 0, nr_pending_stripes == 0\n");
+- md_wakeup_thread(raid_conf->thread);
++ md_wakeup_thread(conf->thread);
+ PRINTK(("waiting for some stripes to complete\n"));
+- sleep_on(&raid_conf->wait_for_stripe);
++ sleep_on(&conf->wait_for_stripe);
+ }
+
+ /*
+ * The above might have slept, so perhaps another process
+ * already created the stripe for us..
+ */
+- if ((tmp = find_stripe(raid_conf, sector, size)) != NULL) {
+- put_free_stripe(raid_conf, sh);
++ if ((tmp = find_stripe(conf, sector, size)) != NULL) {
++ put_free_stripe(conf, sh);
+ wait_on_stripe(tmp);
+ return tmp;
+ }
+@@ -472,25 +478,25 @@
+ sh->bh_pool = bh_pool;
+ sh->phase = PHASE_COMPLETE;
+ sh->cmd = STRIPE_NONE;
+- sh->raid_conf = raid_conf;
++ sh->raid_conf = conf;
+ sh->sector = sector;
+ sh->size = size;
+- raid_conf->nr_cached_stripes++;
+- insert_hash(raid_conf, sh);
++ conf->nr_cached_stripes++;
++ insert_hash(conf, sh);
+ } else printk("raid5: bug: kmalloc_stripe() == NULL\n");
+ return sh;
+ }
+
+-static struct stripe_head *get_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
++static struct stripe_head *get_stripe(raid5_conf_t *conf, unsigned long sector, int size)
+ {
+ struct stripe_head *sh;
+
+ PRINTK(("get_stripe, sector %lu\n", sector));
+- sh = find_stripe(raid_conf, sector, size);
++ sh = find_stripe(conf, sector, size);
+ if (sh)
+ wait_on_stripe(sh);
+ else
+- sh = kmalloc_stripe(raid_conf, sector, size);
++ sh = kmalloc_stripe(conf, sector, size);
+ return sh;
+ }
+
+@@ -523,7 +529,7 @@
+ bh->b_end_io(bh, uptodate);
+ if (!uptodate)
+ printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for "
+- "block %lu\n", kdevname(bh->b_dev), bh->b_blocknr);
++ "block %lu\n", partition_name(bh->b_dev), bh->b_blocknr);
+ }
+
+ static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate)
+@@ -537,36 +543,35 @@
+ static void raid5_end_request (struct buffer_head * bh, int uptodate)
+ {
+ struct stripe_head *sh = bh->b_dev_id;
+- struct raid5_data *raid_conf = sh->raid_conf;
+- int disks = raid_conf->raid_disks, i;
++ raid5_conf_t *conf = sh->raid_conf;
++ int disks = conf->raid_disks, i;
+ unsigned long flags;
+
+ PRINTK(("end_request %lu, nr_pending %d\n", sh->sector, sh->nr_pending));
+- save_flags(flags);
+- cli();
++ md_spin_lock_irqsave(&sh->stripe_lock, flags);
+ raid5_mark_buffer_uptodate(bh, uptodate);
+- --sh->nr_pending;
+- if (!sh->nr_pending) {
+- md_wakeup_thread(raid_conf->thread);
+- atomic_inc(&raid_conf->nr_handle);
++ if (atomic_dec_and_test(&sh->nr_pending)) {
++ md_wakeup_thread(conf->thread);
++ atomic_inc(&conf->nr_handle);
+ }
+- if (!uptodate)
++ if (!uptodate) {
+ md_error(bh->b_dev, bh->b_rdev);
+- if (raid_conf->failed_disks) {
++ }
++ if (conf->failed_disks) {
+ for (i = 0; i < disks; i++) {
+- if (raid_conf->disks[i].operational)
++ if (conf->disks[i].operational)
+ continue;
+ if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i])
+ continue;
+- if (bh->b_rdev != raid_conf->disks[i].dev)
++ if (bh->b_rdev != conf->disks[i].dev)
+ continue;
+ set_bit(STRIPE_ERROR, &sh->state);
+ }
+ }
+- restore_flags(flags);
++ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
+ }
+
+-static int raid5_map (struct md_dev *mddev, kdev_t *rdev,
++static int raid5_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
+ unsigned long *rsector, unsigned long size)
+ {
+ /* No complex mapping used: the core of the work is done in the
+@@ -577,11 +582,10 @@
+
+ static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- struct md_dev *mddev = raid_conf->mddev;
+- int minor = (int) (mddev - md_dev);
++ raid5_conf_t *conf = sh->raid_conf;
++ mddev_t *mddev = conf->mddev;
+ char *b_data;
+- kdev_t dev = MKDEV(MD_MAJOR, minor);
++ kdev_t dev = mddev_to_kdev(mddev);
+ int block = sh->sector / (sh->size >> 9);
+
+ b_data = ((volatile struct buffer_head *) bh)->b_data;
+@@ -589,7 +593,7 @@
+ init_buffer(bh, dev, block, raid5_end_request, sh);
+ ((volatile struct buffer_head *) bh)->b_data = b_data;
+
+- bh->b_rdev = raid_conf->disks[i].dev;
++ bh->b_rdev = conf->disks[i].dev;
+ bh->b_rsector = sh->sector;
+
+ bh->b_state = (1 << BH_Req);
+@@ -597,33 +601,62 @@
+ bh->b_list = BUF_LOCKED;
+ }
+
+-static int raid5_error (struct md_dev *mddev, kdev_t dev)
++static int raid5_error (mddev_t *mddev, kdev_t dev)
+ {
+- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
+- md_superblock_t *sb = mddev->sb;
++ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
++ mdp_super_t *sb = mddev->sb;
+ struct disk_info *disk;
+ int i;
+
+ PRINTK(("raid5_error called\n"));
+- raid_conf->resync_parity = 0;
+- for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
++ conf->resync_parity = 0;
++ for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
+ if (disk->dev == dev && disk->operational) {
+ disk->operational = 0;
+- sb->disks[disk->number].state |= (1 << MD_FAULTY_DEVICE);
+- sb->disks[disk->number].state &= ~(1 << MD_SYNC_DEVICE);
+- sb->disks[disk->number].state &= ~(1 << MD_ACTIVE_DEVICE);
++ mark_disk_faulty(sb->disks+disk->number);
++ mark_disk_nonsync(sb->disks+disk->number);
++ mark_disk_inactive(sb->disks+disk->number);
+ sb->active_disks--;
+ sb->working_disks--;
+ sb->failed_disks++;
+ mddev->sb_dirty = 1;
+- raid_conf->working_disks--;
+- raid_conf->failed_disks++;
+- md_wakeup_thread(raid_conf->thread);
++ conf->working_disks--;
++ conf->failed_disks++;
++ md_wakeup_thread(conf->thread);
+ printk (KERN_ALERT
+- "RAID5: Disk failure on %s, disabling device."
+- "Operation continuing on %d devices\n",
+- kdevname (dev), raid_conf->working_disks);
++ "raid5: Disk failure on %s, disabling device."
++ " Operation continuing on %d devices\n",
++ partition_name (dev), conf->working_disks);
++ return -EIO;
+ }
++ }
++ /*
++ * handle errors in spares (during reconstruction)
++ */
++ if (conf->spare) {
++ disk = conf->spare;
++ if (disk->dev == dev) {
++ printk (KERN_ALERT
++ "raid5: Disk failure on spare %s\n",
++ partition_name (dev));
++ if (!conf->spare->operational) {
++ MD_BUG();
++ return -EIO;
++ }
++ disk->operational = 0;
++ disk->write_only = 0;
++ conf->spare = NULL;
++ mark_disk_faulty(sb->disks+disk->number);
++ mark_disk_nonsync(sb->disks+disk->number);
++ mark_disk_inactive(sb->disks+disk->number);
++ sb->spare_disks--;
++ sb->working_disks--;
++ sb->failed_disks++;
++
++ return -EIO;
++ }
++ }
++ MD_BUG();
+ return 0;
+ }
+
+@@ -634,12 +667,12 @@
+ static inline unsigned long
+ raid5_compute_sector (int r_sector, unsigned int raid_disks, unsigned int data_disks,
+ unsigned int * dd_idx, unsigned int * pd_idx,
+- struct raid5_data *raid_conf)
++ raid5_conf_t *conf)
+ {
+ unsigned int stripe;
+ int chunk_number, chunk_offset;
+ unsigned long new_sector;
+- int sectors_per_chunk = raid_conf->chunk_size >> 9;
++ int sectors_per_chunk = conf->chunk_size >> 9;
+
+ /* First compute the information on this sector */
+
+@@ -662,9 +695,9 @@
+ /*
+ * Select the parity disk based on the user selected algorithm.
+ */
+- if (raid_conf->level == 4)
++ if (conf->level == 4)
+ *pd_idx = data_disks;
+- else switch (raid_conf->algorithm) {
++ else switch (conf->algorithm) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ *pd_idx = data_disks - stripe % raid_disks;
+ if (*dd_idx >= *pd_idx)
+@@ -684,7 +717,7 @@
+ *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+ break;
+ default:
+- printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
++ printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
+ }
+
+ /*
+@@ -705,16 +738,16 @@
+
+ static unsigned long compute_blocknr(struct stripe_head *sh, int i)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- int raid_disks = raid_conf->raid_disks, data_disks = raid_disks - 1;
++ raid5_conf_t *conf = sh->raid_conf;
++ int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
+ unsigned long new_sector = sh->sector, check;
+- int sectors_per_chunk = raid_conf->chunk_size >> 9;
++ int sectors_per_chunk = conf->chunk_size >> 9;
+ unsigned long stripe = new_sector / sectors_per_chunk;
+ int chunk_offset = new_sector % sectors_per_chunk;
+ int chunk_number, dummy1, dummy2, dd_idx = i;
+ unsigned long r_sector, blocknr;
+
+- switch (raid_conf->algorithm) {
++ switch (conf->algorithm) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ if (i > sh->pd_idx)
+@@ -727,14 +760,14 @@
+ i -= (sh->pd_idx + 1);
+ break;
+ default:
+- printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
++ printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
+ }
+
+ chunk_number = stripe * data_disks + i;
+ r_sector = chunk_number * sectors_per_chunk + chunk_offset;
+ blocknr = r_sector / (sh->size >> 9);
+
+- check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, raid_conf);
++ check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
+ if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
+ printk("compute_blocknr: map not correct\n");
+ return 0;
+@@ -742,36 +775,11 @@
+ return blocknr;
+ }
+
+-#ifdef HAVE_ARCH_XORBLOCK
+-static void xor_block(struct buffer_head *dest, struct buffer_head *source)
+-{
+- __xor_block((char *) dest->b_data, (char *) source->b_data, dest->b_size);
+-}
+-#else
+-static void xor_block(struct buffer_head *dest, struct buffer_head *source)
+-{
+- long lines = dest->b_size / (sizeof (long)) / 8, i;
+- long *destp = (long *) dest->b_data, *sourcep = (long *) source->b_data;
+-
+- for (i = lines; i > 0; i--) {
+- *(destp + 0) ^= *(sourcep + 0);
+- *(destp + 1) ^= *(sourcep + 1);
+- *(destp + 2) ^= *(sourcep + 2);
+- *(destp + 3) ^= *(sourcep + 3);
+- *(destp + 4) ^= *(sourcep + 4);
+- *(destp + 5) ^= *(sourcep + 5);
+- *(destp + 6) ^= *(sourcep + 6);
+- *(destp + 7) ^= *(sourcep + 7);
+- destp += 8;
+- sourcep += 8;
+- }
+-}
+-#endif
+-
+ static void compute_block(struct stripe_head *sh, int dd_idx)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- int i, disks = raid_conf->raid_disks;
++ raid5_conf_t *conf = sh->raid_conf;
++ int i, count, disks = conf->raid_disks;
++ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
+
+ PRINTK(("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx));
+
+@@ -780,69 +788,100 @@
+ raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx);
+
+ memset(sh->bh_old[dd_idx]->b_data, 0, sh->size);
++ bh_ptr[0] = sh->bh_old[dd_idx];
++ count = 1;
+ for (i = 0; i < disks; i++) {
+ if (i == dd_idx)
+ continue;
+ if (sh->bh_old[i]) {
+- xor_block(sh->bh_old[dd_idx], sh->bh_old[i]);
+- continue;
+- } else
++ bh_ptr[count++] = sh->bh_old[i];
++ } else {
+ printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
++ }
++ if (count == MAX_XOR_BLOCKS) {
++ xor_block(count, &bh_ptr[0]);
++ count = 1;
++ }
++ }
++ if(count != 1) {
++ xor_block(count, &bh_ptr[0]);
+ }
+ raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1);
+ }
+
+ static void compute_parity(struct stripe_head *sh, int method)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- int i, pd_idx = sh->pd_idx, disks = raid_conf->raid_disks;
++ raid5_conf_t *conf = sh->raid_conf;
++ int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, lowprio, count;
++ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
+
+ PRINTK(("compute_parity, stripe %lu, method %d\n", sh->sector, method));
++ lowprio = 1;
+ for (i = 0; i < disks; i++) {
+ if (i == pd_idx || !sh->bh_new[i])
+ continue;
+ if (!sh->bh_copy[i])
+ sh->bh_copy[i] = raid5_kmalloc_buffer(sh, sh->size);
+ raid5_build_block(sh, sh->bh_copy[i], i);
++ if (!buffer_lowprio(sh->bh_new[i]))
++ lowprio = 0;
++ else
++ mark_buffer_lowprio(sh->bh_copy[i]);
+ mark_buffer_clean(sh->bh_new[i]);
+ memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size);
+ }
+ if (sh->bh_copy[pd_idx] == NULL)
+ sh->bh_copy[pd_idx] = raid5_kmalloc_buffer(sh, sh->size);
+ raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx);
++ if (lowprio)
++ mark_buffer_lowprio(sh->bh_copy[pd_idx]);
+
+ if (method == RECONSTRUCT_WRITE) {
+ memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size);
++ bh_ptr[0] = sh->bh_copy[pd_idx];
++ count = 1;
+ for (i = 0; i < disks; i++) {
+ if (i == sh->pd_idx)
+ continue;
+ if (sh->bh_new[i]) {
+- xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
+- continue;
++ bh_ptr[count++] = sh->bh_copy[i];
++ } else if (sh->bh_old[i]) {
++ bh_ptr[count++] = sh->bh_old[i];
+ }
+- if (sh->bh_old[i]) {
+- xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
+- continue;
++ if (count == MAX_XOR_BLOCKS) {
++ xor_block(count, &bh_ptr[0]);
++ count = 1;
+ }
+ }
++ if (count != 1) {
++ xor_block(count, &bh_ptr[0]);
++ }
+ } else if (method == READ_MODIFY_WRITE) {
+ memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size);
++ bh_ptr[0] = sh->bh_copy[pd_idx];
++ count = 1;
+ for (i = 0; i < disks; i++) {
+ if (i == sh->pd_idx)
+ continue;
+ if (sh->bh_new[i] && sh->bh_old[i]) {
+- xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
+- xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
+- continue;
++ bh_ptr[count++] = sh->bh_copy[i];
++ bh_ptr[count++] = sh->bh_old[i];
++ }
++ if (count >= (MAX_XOR_BLOCKS - 1)) {
++ xor_block(count, &bh_ptr[0]);
++ count = 1;
+ }
+ }
++ if (count != 1) {
++ xor_block(count, &bh_ptr[0]);
++ }
+ }
+ raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1);
+ }
+
+ static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
++ raid5_conf_t *conf = sh->raid_conf;
+ struct buffer_head *bh_req;
+
+ if (sh->bh_new[dd_idx]) {
+@@ -860,19 +899,22 @@
+ if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) {
+ sh->phase = PHASE_BEGIN;
+ sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE;
+- raid_conf->nr_pending_stripes++;
+- atomic_inc(&raid_conf->nr_handle);
++ conf->nr_pending_stripes++;
++ atomic_inc(&conf->nr_handle);
+ }
+ sh->bh_new[dd_idx] = bh;
+ sh->bh_req[dd_idx] = bh_req;
+ sh->cmd_new[dd_idx] = rw;
+ sh->new[dd_idx] = 1;
++
++ if (buffer_lowprio(bh))
++ mark_buffer_lowprio(bh_req);
+ }
+
+ static void complete_stripe(struct stripe_head *sh)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- int disks = raid_conf->raid_disks;
++ raid5_conf_t *conf = sh->raid_conf;
++ int disks = conf->raid_disks;
+ int i, new = 0;
+
+ PRINTK(("complete_stripe %lu\n", sh->sector));
+@@ -909,6 +951,22 @@
+ }
+ }
+
++
++static int is_stripe_lowprio(struct stripe_head *sh, int disks)
++{
++ int i, lowprio = 1;
++
++ for (i = 0; i < disks; i++) {
++ if (sh->bh_new[i])
++ if (!buffer_lowprio(sh->bh_new[i]))
++ lowprio = 0;
++ if (sh->bh_old[i])
++ if (!buffer_lowprio(sh->bh_old[i]))
++ lowprio = 0;
++ }
++ return lowprio;
++}
++
+ /*
+ * handle_stripe() is our main logic routine. Note that:
+ *
+@@ -919,28 +977,27 @@
+ * 2. We should be careful to set sh->nr_pending whenever we sleep,
+ * to prevent re-entry of handle_stripe() for the same sh.
+ *
+- * 3. raid_conf->failed_disks and disk->operational can be changed
++ * 3. conf->failed_disks and disk->operational can be changed
+ * from an interrupt. This complicates things a bit, but it allows
+ * us to stop issuing requests for a failed drive as soon as possible.
+ */
+ static void handle_stripe(struct stripe_head *sh)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- struct md_dev *mddev = raid_conf->mddev;
+- int minor = (int) (mddev - md_dev);
++ raid5_conf_t *conf = sh->raid_conf;
++ mddev_t *mddev = conf->mddev;
+ struct buffer_head *bh;
+- int disks = raid_conf->raid_disks;
+- int i, nr = 0, nr_read = 0, nr_write = 0;
++ int disks = conf->raid_disks;
++ int i, nr = 0, nr_read = 0, nr_write = 0, lowprio;
+ int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0, parity = 0;
+ int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
+ int reading = 0, nr_writing = 0;
+ int method1 = INT_MAX, method2 = INT_MAX;
+ int block;
+ unsigned long flags;
+- int operational[MD_SB_DISKS], failed_disks = raid_conf->failed_disks;
++ int operational[MD_SB_DISKS], failed_disks = conf->failed_disks;
+
+ PRINTK(("handle_stripe(), stripe %lu\n", sh->sector));
+- if (sh->nr_pending) {
++ if (md_atomic_read(&sh->nr_pending)) {
+ printk("handle_stripe(), stripe %lu, io still pending\n", sh->sector);
+ return;
+ }
+@@ -949,9 +1006,9 @@
+ return;
+ }
+
+- atomic_dec(&raid_conf->nr_handle);
++ atomic_dec(&conf->nr_handle);
+
+- if (test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
++ if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
+ printk("raid5: restarting stripe %lu\n", sh->sector);
+ sh->phase = PHASE_BEGIN;
+ }
+@@ -969,11 +1026,11 @@
+ save_flags(flags);
+ cli();
+ for (i = 0; i < disks; i++) {
+- operational[i] = raid_conf->disks[i].operational;
+- if (i == sh->pd_idx && raid_conf->resync_parity)
++ operational[i] = conf->disks[i].operational;
++ if (i == sh->pd_idx && conf->resync_parity)
+ operational[i] = 0;
+ }
+- failed_disks = raid_conf->failed_disks;
++ failed_disks = conf->failed_disks;
+ restore_flags(flags);
+
+ if (failed_disks > 1) {
+@@ -1017,7 +1074,7 @@
+ }
+
+ if (nr_write && nr_read)
+- printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
++ printk("raid5: bug, nr_write ==`%d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
+
+ if (nr_write) {
+ /*
+@@ -1030,7 +1087,7 @@
+ if (sh->bh_new[i])
+ continue;
+ block = (int) compute_blocknr(sh, i);
+- bh = find_buffer(MKDEV(MD_MAJOR, minor), block, sh->size);
++ bh = find_buffer(mddev_to_kdev(mddev), block, sh->size);
+ if (bh && bh->b_count == 0 && buffer_dirty(bh) && !buffer_locked(bh)) {
+ PRINTK(("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block));
+ add_stripe_bh(sh, bh, i, WRITE);
+@@ -1064,21 +1121,22 @@
+
+ if (!method1 || !method2) {
+ lock_stripe(sh);
+- sh->nr_pending++;
++ lowprio = is_stripe_lowprio(sh, disks);
++ atomic_inc(&sh->nr_pending);
+ sh->phase = PHASE_WRITE;
+ compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
+ for (i = 0; i < disks; i++) {
+- if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
++ if (!operational[i] && !conf->spare && !conf->resync_parity)
+ continue;
+ if (i == sh->pd_idx || sh->bh_new[i])
+ nr_writing++;
+ }
+
+- sh->nr_pending = nr_writing;
+- PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, sh->nr_pending));
++ md_atomic_set(&sh->nr_pending, nr_writing);
++ PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
+
+ for (i = 0; i < disks; i++) {
+- if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
++ if (!operational[i] && !conf->spare && !conf->resync_parity)
+ continue;
+ bh = sh->bh_copy[i];
+ if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL)))
+@@ -1089,18 +1147,30 @@
+ bh->b_state |= (1<<BH_Dirty);
+ PRINTK(("making request for buffer %d\n", i));
+ clear_bit(BH_Lock, &bh->b_state);
+- if (!operational[i] && !raid_conf->resync_parity) {
+- bh->b_rdev = raid_conf->spare->dev;
+- make_request(MAJOR(raid_conf->spare->dev), WRITE, bh);
+- } else
+- make_request(MAJOR(raid_conf->disks[i].dev), WRITE, bh);
++ if (!operational[i] && !conf->resync_parity) {
++ bh->b_rdev = conf->spare->dev;
++ make_request(MAJOR(conf->spare->dev), WRITE, bh);
++ } else {
++#if 0
++ make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
++#else
++ if (!lowprio || (i==sh->pd_idx))
++ make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
++ else {
++ mark_buffer_clean(bh);
++ raid5_end_request(bh,1);
++ sh->new[i] = 0;
++ }
++#endif
++ }
+ }
+ }
+ return;
+ }
+
+ lock_stripe(sh);
+- sh->nr_pending++;
++ lowprio = is_stripe_lowprio(sh, disks);
++ atomic_inc(&sh->nr_pending);
+ if (method1 < method2) {
+ sh->write_method = RECONSTRUCT_WRITE;
+ for (i = 0; i < disks; i++) {
+@@ -1110,6 +1180,8 @@
+ continue;
+ sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
+ raid5_build_block(sh, sh->bh_old[i], i);
++ if (lowprio)
++ mark_buffer_lowprio(sh->bh_old[i]);
+ reading++;
+ }
+ } else {
+@@ -1121,19 +1193,21 @@
+ continue;
+ sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
+ raid5_build_block(sh, sh->bh_old[i], i);
++ if (lowprio)
++ mark_buffer_lowprio(sh->bh_old[i]);
+ reading++;
+ }
+ }
+ sh->phase = PHASE_READ_OLD;
+- sh->nr_pending = reading;
+- PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, sh->nr_pending));
++ md_atomic_set(&sh->nr_pending, reading);
++ PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)));
+ for (i = 0; i < disks; i++) {
+ if (!sh->bh_old[i])
+ continue;
+ if (buffer_uptodate(sh->bh_old[i]))
+ continue;
+ clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
+- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
++ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
+ }
+ } else {
+ /*
+@@ -1141,7 +1215,8 @@
+ */
+ method1 = nr_read - nr_cache_overwrite;
+ lock_stripe(sh);
+- sh->nr_pending++;
++ lowprio = is_stripe_lowprio(sh,disks);
++ atomic_inc(&sh->nr_pending);
+
+ PRINTK(("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1));
+ if (!method1 || (method1 == 1 && nr_cache == disks - 1)) {
+@@ -1149,18 +1224,22 @@
+ for (i = 0; i < disks; i++) {
+ if (!sh->bh_new[i])
+ continue;
+- if (!sh->bh_old[i])
++ if (!sh->bh_old[i]) {
+ compute_block(sh, i);
++ if (lowprio)
++ mark_buffer_lowprio
++ (sh->bh_old[i]);
++ }
+ memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
+ }
+- sh->nr_pending--;
++ atomic_dec(&sh->nr_pending);
+ complete_stripe(sh);
+ return;
+ }
+ if (nr_failed_overwrite) {
+ sh->phase = PHASE_READ_OLD;
+- sh->nr_pending = (disks - 1) - nr_cache;
+- PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, sh->nr_pending));
++ md_atomic_set(&sh->nr_pending, (disks - 1) - nr_cache);
++ PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
+ for (i = 0; i < disks; i++) {
+ if (sh->bh_old[i])
+ continue;
+@@ -1168,13 +1247,16 @@
+ continue;
+ sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
+ raid5_build_block(sh, sh->bh_old[i], i);
++ if (lowprio)
++ mark_buffer_lowprio(sh->bh_old[i]);
+ clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
+- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
++ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
+ }
+ } else {
+ sh->phase = PHASE_READ;
+- sh->nr_pending = nr_read - nr_cache_overwrite;
+- PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, sh->nr_pending));
++ md_atomic_set(&sh->nr_pending,
++ nr_read - nr_cache_overwrite);
++ PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
+ for (i = 0; i < disks; i++) {
+ if (!sh->bh_new[i])
+ continue;
+@@ -1182,16 +1264,16 @@
+ memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
+ continue;
+ }
+- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_req[i]);
++ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_req[i]);
+ }
+ }
+ }
+ }
+
+-static int raid5_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
++static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
+ {
+- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
+- const unsigned int raid_disks = raid_conf->raid_disks;
++ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
++ const unsigned int raid_disks = conf->raid_disks;
+ const unsigned int data_disks = raid_disks - 1;
+ unsigned int dd_idx, pd_idx;
+ unsigned long new_sector;
+@@ -1202,15 +1284,15 @@
+ if (rw == WRITEA) rw = WRITE;
+
+ new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks,
+- &dd_idx, &pd_idx, raid_conf);
++ &dd_idx, &pd_idx, conf);
+
+ PRINTK(("raid5_make_request, sector %lu\n", new_sector));
+ repeat:
+- sh = get_stripe(raid_conf, new_sector, bh->b_size);
++ sh = get_stripe(conf, new_sector, bh->b_size);
+ if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) {
+ PRINTK(("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd));
+ lock_stripe(sh);
+- if (!sh->nr_pending)
++ if (!md_atomic_read(&sh->nr_pending))
+ handle_stripe(sh);
+ goto repeat;
+ }
+@@ -1221,24 +1303,24 @@
+ printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector);
+ printk("raid5: bh %p, bh_new %p\n", bh, sh->bh_new[dd_idx]);
+ lock_stripe(sh);
+- md_wakeup_thread(raid_conf->thread);
++ md_wakeup_thread(conf->thread);
+ wait_on_stripe(sh);
+ goto repeat;
+ }
+ add_stripe_bh(sh, bh, dd_idx, rw);
+
+- md_wakeup_thread(raid_conf->thread);
++ md_wakeup_thread(conf->thread);
+ return 0;
+ }
+
+ static void unplug_devices(struct stripe_head *sh)
+ {
+ #if 0
+- struct raid5_data *raid_conf = sh->raid_conf;
++ raid5_conf_t *conf = sh->raid_conf;
+ int i;
+
+- for (i = 0; i < raid_conf->raid_disks; i++)
+- unplug_device(blk_dev + MAJOR(raid_conf->disks[i].dev));
++ for (i = 0; i < conf->raid_disks; i++)
++ unplug_device(blk_dev + MAJOR(conf->disks[i].dev));
+ #endif
+ }
+
+@@ -1252,8 +1334,8 @@
+ static void raid5d (void *data)
+ {
+ struct stripe_head *sh;
+- struct raid5_data *raid_conf = data;
+- struct md_dev *mddev = raid_conf->mddev;
++ raid5_conf_t *conf = data;
++ mddev_t *mddev = conf->mddev;
+ int i, handled = 0, unplug = 0;
+ unsigned long flags;
+
+@@ -1261,47 +1343,47 @@
+
+ if (mddev->sb_dirty) {
+ mddev->sb_dirty = 0;
+- md_update_sb((int) (mddev - md_dev));
++ md_update_sb(mddev);
+ }
+ for (i = 0; i < NR_HASH; i++) {
+ repeat:
+- sh = raid_conf->stripe_hashtbl[i];
++ sh = conf->stripe_hashtbl[i];
+ for (; sh; sh = sh->hash_next) {
+- if (sh->raid_conf != raid_conf)
++ if (sh->raid_conf != conf)
+ continue;
+ if (sh->phase == PHASE_COMPLETE)
+ continue;
+- if (sh->nr_pending)
++ if (md_atomic_read(&sh->nr_pending))
+ continue;
+- if (sh->sector == raid_conf->next_sector) {
+- raid_conf->sector_count += (sh->size >> 9);
+- if (raid_conf->sector_count >= 128)
++ if (sh->sector == conf->next_sector) {
++ conf->sector_count += (sh->size >> 9);
++ if (conf->sector_count >= 128)
+ unplug = 1;
+ } else
+ unplug = 1;
+ if (unplug) {
+- PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, raid_conf->sector_count));
++ PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, conf->sector_count));
+ unplug_devices(sh);
+ unplug = 0;
+- raid_conf->sector_count = 0;
++ conf->sector_count = 0;
+ }
+- raid_conf->next_sector = sh->sector + (sh->size >> 9);
++ conf->next_sector = sh->sector + (sh->size >> 9);
+ handled++;
+ handle_stripe(sh);
+ goto repeat;
+ }
+ }
+- if (raid_conf) {
+- PRINTK(("%d stripes handled, nr_handle %d\n", handled, atomic_read(&raid_conf->nr_handle)));
++ if (conf) {
++ PRINTK(("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle)));
+ save_flags(flags);
+ cli();
+- if (!atomic_read(&raid_conf->nr_handle))
+- clear_bit(THREAD_WAKEUP, &raid_conf->thread->flags);
++ if (!md_atomic_read(&conf->nr_handle))
++ clear_bit(THREAD_WAKEUP, &conf->thread->flags);
++ restore_flags(flags);
+ }
+ PRINTK(("--- raid5d inactive\n"));
+ }
+
+-#if SUPPORT_RECONSTRUCTION
+ /*
+ * Private kernel thread for parity reconstruction after an unclean
+ * shutdown. Reconstruction on spare drives in case of a failed drive
+@@ -1309,44 +1391,64 @@
+ */
+ static void raid5syncd (void *data)
+ {
+- struct raid5_data *raid_conf = data;
+- struct md_dev *mddev = raid_conf->mddev;
++ raid5_conf_t *conf = data;
++ mddev_t *mddev = conf->mddev;
+
+- if (!raid_conf->resync_parity)
++ if (!conf->resync_parity)
++ return;
++ if (conf->resync_parity == 2)
++ return;
++ down(&mddev->recovery_sem);
++ if (md_do_sync(mddev,NULL)) {
++ up(&mddev->recovery_sem);
++ printk("raid5: resync aborted!\n");
+ return;
+- md_do_sync(mddev);
+- raid_conf->resync_parity = 0;
++ }
++ conf->resync_parity = 0;
++ up(&mddev->recovery_sem);
++ printk("raid5: resync finished.\n");
+ }
+-#endif /* SUPPORT_RECONSTRUCTION */
+
+-static int __check_consistency (struct md_dev *mddev, int row)
++static int __check_consistency (mddev_t *mddev, int row)
+ {
+- struct raid5_data *raid_conf = mddev->private;
++ raid5_conf_t *conf = mddev->private;
+ kdev_t dev;
+ struct buffer_head *bh[MD_SB_DISKS], tmp;
+- int i, rc = 0, nr = 0;
++ int i, rc = 0, nr = 0, count;
++ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
+
+- if (raid_conf->working_disks != raid_conf->raid_disks)
++ if (conf->working_disks != conf->raid_disks)
+ return 0;
+ tmp.b_size = 4096;
+ if ((tmp.b_data = (char *) get_free_page(GFP_KERNEL)) == NULL)
+ return 0;
++ md_clear_page((unsigned long)tmp.b_data);
+ memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
+- for (i = 0; i < raid_conf->raid_disks; i++) {
+- dev = raid_conf->disks[i].dev;
++ for (i = 0; i < conf->raid_disks; i++) {
++ dev = conf->disks[i].dev;
+ set_blocksize(dev, 4096);
+ if ((bh[i] = bread(dev, row / 4, 4096)) == NULL)
+ break;
+ nr++;
+ }
+- if (nr == raid_conf->raid_disks) {
+- for (i = 1; i < nr; i++)
+- xor_block(&tmp, bh[i]);
++ if (nr == conf->raid_disks) {
++ bh_ptr[0] = &tmp;
++ count = 1;
++ for (i = 1; i < nr; i++) {
++ bh_ptr[count++] = bh[i];
++ if (count == MAX_XOR_BLOCKS) {
++ xor_block(count, &bh_ptr[0]);
++ count = 1;
++ }
++ }
++ if (count != 1) {
++ xor_block(count, &bh_ptr[0]);
++ }
+ if (memcmp(tmp.b_data, bh[0]->b_data, 4096))
+ rc = 1;
+ }
+- for (i = 0; i < raid_conf->raid_disks; i++) {
+- dev = raid_conf->disks[i].dev;
++ for (i = 0; i < conf->raid_disks; i++) {
++ dev = conf->disks[i].dev;
+ if (bh[i]) {
+ bforget(bh[i]);
+ bh[i] = NULL;
+@@ -1358,285 +1460,607 @@
+ return rc;
+ }
+
+-static int check_consistency (struct md_dev *mddev)
++static int check_consistency (mddev_t *mddev)
+ {
+- int size = mddev->sb->size;
+- int row;
++ if (__check_consistency(mddev, 0))
++/*
++ * We are not checking this currently, as it's legitimate to have
++ * an inconsistent array at creation time.
++ */
++ return 0;
+
+- for (row = 0; row < size; row += size / 8)
+- if (__check_consistency(mddev, row))
+- return 1;
+ return 0;
+ }
+
+-static int raid5_run (int minor, struct md_dev *mddev)
++static int raid5_run (mddev_t *mddev)
+ {
+- struct raid5_data *raid_conf;
++ raid5_conf_t *conf;
+ int i, j, raid_disk, memory;
+- md_superblock_t *sb = mddev->sb;
+- md_descriptor_t *descriptor;
+- struct real_dev *realdev;
++ mdp_super_t *sb = mddev->sb;
++ mdp_disk_t *desc;
++ mdk_rdev_t *rdev;
++ struct disk_info *disk;
++ struct md_list_head *tmp;
++ int start_recovery = 0;
+
+ MOD_INC_USE_COUNT;
+
+ if (sb->level != 5 && sb->level != 4) {
+- printk("raid5: %s: raid level not set to 4/5 (%d)\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
++ printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
+ MOD_DEC_USE_COUNT;
+ return -EIO;
+ }
+
+- mddev->private = kmalloc (sizeof (struct raid5_data), GFP_KERNEL);
+- if ((raid_conf = mddev->private) == NULL)
++ mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
++ if ((conf = mddev->private) == NULL)
+ goto abort;
+- memset (raid_conf, 0, sizeof (*raid_conf));
+- raid_conf->mddev = mddev;
++ memset (conf, 0, sizeof (*conf));
++ conf->mddev = mddev;
+
+- if ((raid_conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
++ if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
+ goto abort;
+- memset(raid_conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
++ memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
+
+- init_waitqueue(&raid_conf->wait_for_stripe);
+- PRINTK(("raid5_run(%d) called.\n", minor));
+-
+- for (i = 0; i < mddev->nb_dev; i++) {
+- realdev = &mddev->devices[i];
+- if (!realdev->sb) {
+- printk(KERN_ERR "raid5: disabled device %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
+- continue;
+- }
++ init_waitqueue(&conf->wait_for_stripe);
++ PRINTK(("raid5_run(md%d) called.\n", mdidx(mddev)));
+
++ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * This is important -- we are using the descriptor on
+ * the disk only to get a pointer to the descriptor on
+ * the main superblock, which might be more recent.
+ */
+- descriptor = &sb->disks[realdev->sb->descriptor.number];
+- if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
+- printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", kdevname(realdev->dev));
++ desc = sb->disks + rdev->desc_nr;
++ raid_disk = desc->raid_disk;
++ disk = conf->disks + raid_disk;
++
++ if (disk_faulty(desc)) {
++ printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
++ if (!rdev->faulty) {
++ MD_BUG();
++ goto abort;
++ }
++ disk->number = desc->number;
++ disk->raid_disk = raid_disk;
++ disk->dev = rdev->dev;
++
++ disk->operational = 0;
++ disk->write_only = 0;
++ disk->spare = 0;
++ disk->used_slot = 1;
+ continue;
+ }
+- if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
+- if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
+- printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", kdevname(realdev->dev));
+- continue;
++ if (disk_active(desc)) {
++ if (!disk_sync(desc)) {
++ printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
++ MD_BUG();
++ goto abort;
+ }
+- raid_disk = descriptor->raid_disk;
+- if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
+- printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", kdevname(realdev->dev));
++ if (raid_disk > sb->raid_disks) {
++ printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
+ continue;
+ }
+- if (raid_conf->disks[raid_disk].operational) {
+- printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", kdevname(realdev->dev), raid_disk);
++ if (disk->operational) {
++ printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
+ continue;
+ }
+- printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", kdevname(realdev->dev), raid_disk);
++ printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
+
+- raid_conf->disks[raid_disk].number = descriptor->number;
+- raid_conf->disks[raid_disk].raid_disk = raid_disk;
+- raid_conf->disks[raid_disk].dev = mddev->devices[i].dev;
+- raid_conf->disks[raid_disk].operational = 1;
++ disk->number = desc->number;
++ disk->raid_disk = raid_disk;
++ disk->dev = rdev->dev;
++ disk->operational = 1;
++ disk->used_slot = 1;
+
+- raid_conf->working_disks++;
++ conf->working_disks++;
+ } else {
+ /*
+ * Must be a spare disk ..
+ */
+- printk(KERN_INFO "raid5: spare disk %s\n", kdevname(realdev->dev));
+- raid_disk = descriptor->raid_disk;
+- raid_conf->disks[raid_disk].number = descriptor->number;
+- raid_conf->disks[raid_disk].raid_disk = raid_disk;
+- raid_conf->disks[raid_disk].dev = mddev->devices [i].dev;
+-
+- raid_conf->disks[raid_disk].operational = 0;
+- raid_conf->disks[raid_disk].write_only = 0;
+- raid_conf->disks[raid_disk].spare = 1;
+- }
+- }
+- raid_conf->raid_disks = sb->raid_disks;
+- raid_conf->failed_disks = raid_conf->raid_disks - raid_conf->working_disks;
+- raid_conf->mddev = mddev;
+- raid_conf->chunk_size = sb->chunk_size;
+- raid_conf->level = sb->level;
+- raid_conf->algorithm = sb->parity_algorithm;
+- raid_conf->max_nr_stripes = NR_STRIPES;
++ printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
++ disk->number = desc->number;
++ disk->raid_disk = raid_disk;
++ disk->dev = rdev->dev;
+
+- if (raid_conf->working_disks != sb->raid_disks && sb->state != (1 << MD_SB_CLEAN)) {
+- printk(KERN_ALERT "raid5: raid set %s not clean and not all disks are operational -- run ckraid\n", kdevname(MKDEV(MD_MAJOR, minor)));
+- goto abort;
++ disk->operational = 0;
++ disk->write_only = 0;
++ disk->spare = 1;
++ disk->used_slot = 1;
++ }
+ }
+- if (!raid_conf->chunk_size || raid_conf->chunk_size % 4) {
+- printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", raid_conf->chunk_size, kdevname(MKDEV(MD_MAJOR, minor)));
++
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ desc = sb->disks + i;
++ raid_disk = desc->raid_disk;
++ disk = conf->disks + raid_disk;
++
++ if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
++ !conf->disks[raid_disk].used_slot) {
++
++ disk->number = desc->number;
++ disk->raid_disk = raid_disk;
++ disk->dev = MKDEV(0,0);
++
++ disk->operational = 0;
++ disk->write_only = 0;
++ disk->spare = 0;
++ disk->used_slot = 1;
++ }
++ }
++
++ conf->raid_disks = sb->raid_disks;
++ /*
++ * 0 for a fully functional array, 1 for a degraded array.
++ */
++ conf->failed_disks = conf->raid_disks - conf->working_disks;
++ conf->mddev = mddev;
++ conf->chunk_size = sb->chunk_size;
++ conf->level = sb->level;
++ conf->algorithm = sb->layout;
++ conf->max_nr_stripes = NR_STRIPES;
++
++#if 0
++ for (i = 0; i < conf->raid_disks; i++) {
++ if (!conf->disks[i].used_slot) {
++ MD_BUG();
++ goto abort;
++ }
++ }
++#endif
++ if (!conf->chunk_size || conf->chunk_size % 4) {
++ printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
+ goto abort;
+ }
+- if (raid_conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
+- printk(KERN_ERR "raid5: unsupported parity algorithm %d for %s\n", raid_conf->algorithm, kdevname(MKDEV(MD_MAJOR, minor)));
++ if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
++ printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
+ goto abort;
+ }
+- if (raid_conf->failed_disks > 1) {
+- printk(KERN_ERR "raid5: not enough operational devices for %s (%d/%d failed)\n", kdevname(MKDEV(MD_MAJOR, minor)), raid_conf->failed_disks, raid_conf->raid_disks);
++ if (conf->failed_disks > 1) {
++ printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
+ goto abort;
+ }
+
+- if ((sb->state & (1 << MD_SB_CLEAN)) && check_consistency(mddev)) {
+- printk(KERN_ERR "raid5: detected raid-5 xor inconsistenty -- run ckraid\n");
+- sb->state |= 1 << MD_SB_ERRORS;
+- goto abort;
++ if (conf->working_disks != sb->raid_disks) {
++ printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
++ start_recovery = 1;
+ }
+
+- if ((raid_conf->thread = md_register_thread(raid5d, raid_conf)) == NULL) {
+- printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
+- goto abort;
++ if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) &&
++ check_consistency(mddev)) {
++ printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n");
++ sb->state &= ~(1 << MD_SB_CLEAN);
+ }
+
+-#if SUPPORT_RECONSTRUCTION
+- if ((raid_conf->resync_thread = md_register_thread(raid5syncd, raid_conf)) == NULL) {
+- printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
+- goto abort;
++ {
++ const char * name = "raid5d";
++
++ conf->thread = md_register_thread(raid5d, conf, name);
++ if (!conf->thread) {
++ printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
++ goto abort;
++ }
+ }
+-#endif /* SUPPORT_RECONSTRUCTION */
+
+- memory = raid_conf->max_nr_stripes * (sizeof(struct stripe_head) +
+- raid_conf->raid_disks * (sizeof(struct buffer_head) +
++ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
++ conf->raid_disks * (sizeof(struct buffer_head) +
+ 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
+- if (grow_stripes(raid_conf, raid_conf->max_nr_stripes, GFP_KERNEL)) {
++ if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
+ printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
+- shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
++ shrink_stripes(conf, conf->max_nr_stripes);
+ goto abort;
+ } else
+- printk(KERN_INFO "raid5: allocated %dkB for %s\n", memory, kdevname(MKDEV(MD_MAJOR, minor)));
++ printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
+
+ /*
+ * Regenerate the "device is in sync with the raid set" bit for
+ * each device.
+ */
+- for (i = 0; i < sb->nr_disks ; i++) {
+- sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
++ for (i = 0; i < MD_SB_DISKS ; i++) {
++ mark_disk_nonsync(sb->disks + i);
+ for (j = 0; j < sb->raid_disks; j++) {
+- if (!raid_conf->disks[j].operational)
++ if (!conf->disks[j].operational)
+ continue;
+- if (sb->disks[i].number == raid_conf->disks[j].number)
+- sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
++ if (sb->disks[i].number == conf->disks[j].number)
++ mark_disk_sync(sb->disks + i);
+ }
+ }
+- sb->active_disks = raid_conf->working_disks;
++ sb->active_disks = conf->working_disks;
+
+ if (sb->active_disks == sb->raid_disks)
+- printk("raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
++ printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
+ else
+- printk(KERN_ALERT "raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
++ printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
++
++ if (!start_recovery && ((sb->state & (1 << MD_SB_CLEAN))==0)) {
++ const char * name = "raid5syncd";
++
++ conf->resync_thread = md_register_thread(raid5syncd, conf,name);
++ if (!conf->resync_thread) {
++ printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
++ goto abort;
++ }
+
+- if ((sb->state & (1 << MD_SB_CLEAN)) == 0) {
+- printk("raid5: raid set %s not clean; re-constructing parity\n", kdevname(MKDEV(MD_MAJOR, minor)));
+- raid_conf->resync_parity = 1;
+-#if SUPPORT_RECONSTRUCTION
+- md_wakeup_thread(raid_conf->resync_thread);
+-#endif /* SUPPORT_RECONSTRUCTION */
++ printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
++ conf->resync_parity = 1;
++ md_wakeup_thread(conf->resync_thread);
+ }
+
++ print_raid5_conf(conf);
++ if (start_recovery)
++ md_recover_arrays();
++ print_raid5_conf(conf);
++
+ /* Ok, everything is just fine now */
+ return (0);
+ abort:
+- if (raid_conf) {
+- if (raid_conf->stripe_hashtbl)
+- free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
+- kfree(raid_conf);
++ if (conf) {
++ print_raid5_conf(conf);
++ if (conf->stripe_hashtbl)
++ free_pages((unsigned long) conf->stripe_hashtbl,
++ HASH_PAGES_ORDER);
++ kfree(conf);
+ }
+ mddev->private = NULL;
+- printk(KERN_ALERT "raid5: failed to run raid set %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
++ printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
+ MOD_DEC_USE_COUNT;
+ return -EIO;
+ }
+
+-static int raid5_stop (int minor, struct md_dev *mddev)
++static int raid5_stop_resync (mddev_t *mddev)
++{
++ raid5_conf_t *conf = mddev_to_conf(mddev);
++ mdk_thread_t *thread = conf->resync_thread;
++
++ if (thread) {
++ if (conf->resync_parity) {
++ conf->resync_parity = 2;
++ md_interrupt_thread(thread);
++ printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
++ return 1;
++ }
++ return 0;
++ }
++ return 0;
++}
++
++static int raid5_restart_resync (mddev_t *mddev)
+ {
+- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
++ raid5_conf_t *conf = mddev_to_conf(mddev);
+
+- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
+- shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
+- md_unregister_thread(raid_conf->thread);
+-#if SUPPORT_RECONSTRUCTION
+- md_unregister_thread(raid_conf->resync_thread);
+-#endif /* SUPPORT_RECONSTRUCTION */
+- free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
+- kfree(raid_conf);
++ if (conf->resync_parity) {
++ if (!conf->resync_thread) {
++ MD_BUG();
++ return 0;
++ }
++ printk("raid5: waking up raid5resync.\n");
++ conf->resync_parity = 1;
++ md_wakeup_thread(conf->resync_thread);
++ return 1;
++ } else
++ printk("raid5: no restart-resync needed.\n");
++ return 0;
++}
++
++
++static int raid5_stop (mddev_t *mddev)
++{
++ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
++
++ shrink_stripe_cache(conf, conf->max_nr_stripes);
++ shrink_stripes(conf, conf->max_nr_stripes);
++ md_unregister_thread(conf->thread);
++ if (conf->resync_thread)
++ md_unregister_thread(conf->resync_thread);
++ free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
++ kfree(conf);
+ mddev->private = NULL;
+ MOD_DEC_USE_COUNT;
+ return 0;
+ }
+
+-static int raid5_status (char *page, int minor, struct md_dev *mddev)
++static int raid5_status (char *page, mddev_t *mddev)
+ {
+- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
+- md_superblock_t *sb = mddev->sb;
++ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
++ mdp_super_t *sb = mddev->sb;
+ int sz = 0, i;
+
+- sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->parity_algorithm);
+- sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
+- for (i = 0; i < raid_conf->raid_disks; i++)
+- sz += sprintf (page+sz, "%s", raid_conf->disks[i].operational ? "U" : "_");
++ sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
++ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
++ for (i = 0; i < conf->raid_disks; i++)
++ sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
+ sz += sprintf (page+sz, "]");
+ return sz;
+ }
+
+-static int raid5_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, int state)
++static void print_raid5_conf (raid5_conf_t *conf)
++{
++ int i;
++ struct disk_info *tmp;
++
++ printk("RAID5 conf printout:\n");
++ if (!conf) {
++ printk("(conf==NULL)\n");
++ return;
++ }
++ printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
++ conf->working_disks, conf->failed_disks);
++
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ tmp = conf->disks + i;
++ printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
++ i, tmp->spare,tmp->operational,
++ tmp->number,tmp->raid_disk,tmp->used_slot,
++ partition_name(tmp->dev));
++ }
++}
++
++static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
+ {
+- int i = 0, failed_disk = -1;
+- struct raid5_data *raid_conf = mddev->private;
+- struct disk_info *disk = raid_conf->disks;
++ int err = 0;
++ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
++ raid5_conf_t *conf = mddev->private;
++ struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
+ unsigned long flags;
+- md_superblock_t *sb = mddev->sb;
+- md_descriptor_t *descriptor;
++ mdp_super_t *sb = mddev->sb;
++ mdp_disk_t *failed_desc, *spare_desc, *added_desc;
+
+- for (i = 0; i < MD_SB_DISKS; i++, disk++) {
+- if (disk->spare && disk->number == spare->number)
+- goto found;
+- }
+- return 1;
+-found:
+- for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
+- if (!disk->operational)
+- failed_disk = i;
+- if (failed_disk == -1)
+- return 1;
+ save_flags(flags);
+ cli();
++
++ print_raid5_conf(conf);
++ /*
++ * find the disk ...
++ */
+ switch (state) {
+- case SPARE_WRITE:
+- disk->operational = 1;
+- disk->write_only = 1;
+- raid_conf->spare = disk;
+- break;
+- case SPARE_INACTIVE:
+- disk->operational = 0;
+- disk->write_only = 0;
+- raid_conf->spare = NULL;
+- break;
+- case SPARE_ACTIVE:
+- disk->spare = 0;
+- disk->write_only = 0;
+
+- descriptor = &sb->disks[raid_conf->disks[failed_disk].number];
+- i = spare->raid_disk;
+- disk->raid_disk = spare->raid_disk = descriptor->raid_disk;
+- if (disk->raid_disk != failed_disk)
+- printk("raid5: disk->raid_disk != failed_disk");
+- descriptor->raid_disk = i;
+-
+- raid_conf->spare = NULL;
+- raid_conf->working_disks++;
+- raid_conf->failed_disks--;
+- raid_conf->disks[failed_disk] = *disk;
+- break;
+- default:
+- printk("raid5_mark_spare: bug: state == %d\n", state);
+- restore_flags(flags);
+- return 1;
++ case DISKOP_SPARE_ACTIVE:
++
++ /*
++ * Find the failed disk within the RAID5 configuration ...
++ * (this can only be in the first conf->raid_disks part)
++ */
++ for (i = 0; i < conf->raid_disks; i++) {
++ tmp = conf->disks + i;
++ if ((!tmp->operational && !tmp->spare) ||
++ !tmp->used_slot) {
++ failed_disk = i;
++ break;
++ }
++ }
++ /*
++ * When we activate a spare disk we _must_ have a disk in
++ * the lower (active) part of the array to replace.
++ */
++ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ /* fall through */
++
++ case DISKOP_SPARE_WRITE:
++ case DISKOP_SPARE_INACTIVE:
++
++ /*
++ * Find the spare disk ... (can only be in the 'high'
++ * area of the array)
++ */
++ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
++ tmp = conf->disks + i;
++ if (tmp->spare && tmp->number == (*d)->number) {
++ spare_disk = i;
++ break;
++ }
++ }
++ if (spare_disk == -1) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ break;
++
++ case DISKOP_HOT_REMOVE_DISK:
++
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ tmp = conf->disks + i;
++ if (tmp->used_slot && (tmp->number == (*d)->number)) {
++ if (tmp->operational) {
++ err = -EBUSY;
++ goto abort;
++ }
++ removed_disk = i;
++ break;
++ }
++ }
++ if (removed_disk == -1) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ break;
++
++ case DISKOP_HOT_ADD_DISK:
++
++ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
++ tmp = conf->disks + i;
++ if (!tmp->used_slot) {
++ added_disk = i;
++ break;
++ }
++ }
++ if (added_disk == -1) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ break;
++ }
++
++ switch (state) {
++ /*
++ * Switch the spare disk to write-only mode:
++ */
++ case DISKOP_SPARE_WRITE:
++ if (conf->spare) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ sdisk = conf->disks + spare_disk;
++ sdisk->operational = 1;
++ sdisk->write_only = 1;
++ conf->spare = sdisk;
++ break;
++ /*
++ * Deactivate a spare disk:
++ */
++ case DISKOP_SPARE_INACTIVE:
++ sdisk = conf->disks + spare_disk;
++ sdisk->operational = 0;
++ sdisk->write_only = 0;
++ /*
++ * Was the spare being resynced?
++ */
++ if (conf->spare == sdisk)
++ conf->spare = NULL;
++ break;
++ /*
++ * Activate (mark read-write) the (now sync) spare disk,
++ * which means we switch its 'raid position' (->raid_disk)
++ * with the failed disk. (only the first 'conf->raid_disks'
++ * slots are used for 'real' disks and we must preserve this
++ * property)
++ */
++ case DISKOP_SPARE_ACTIVE:
++ if (!conf->spare) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ sdisk = conf->disks + spare_disk;
++ fdisk = conf->disks + failed_disk;
++
++ spare_desc = &sb->disks[sdisk->number];
++ failed_desc = &sb->disks[fdisk->number];
++
++ if (spare_desc != *d) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ if (spare_desc->raid_disk != sdisk->raid_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ if (sdisk->raid_disk != spare_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ if (failed_desc->raid_disk != fdisk->raid_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ if (fdisk->raid_disk != failed_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ /*
++ * do the switch finally
++ */
++ xchg_values(*spare_desc, *failed_desc);
++ xchg_values(*fdisk, *sdisk);
++
++ /*
++ * (careful, 'failed' and 'spare' are switched from now on)
++ *
++ * we want to preserve linear numbering and we want to
++ * give the proper raid_disk number to the now activated
++ * disk. (this means we switch back these values)
++ */
++
++ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
++ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
++ xchg_values(spare_desc->number, failed_desc->number);
++ xchg_values(sdisk->number, fdisk->number);
++
++ *d = failed_desc;
++
++ if (sdisk->dev == MKDEV(0,0))
++ sdisk->used_slot = 0;
++
++ /*
++ * this really activates the spare.
++ */
++ fdisk->spare = 0;
++ fdisk->write_only = 0;
++
++ /*
++ * if we activate a spare, we definitely replace a
++ * non-operational disk slot in the 'low' area of
++ * the disk array.
++ */
++ conf->failed_disks--;
++ conf->working_disks++;
++ conf->spare = NULL;
++
++ break;
++
++ case DISKOP_HOT_REMOVE_DISK:
++ rdisk = conf->disks + removed_disk;
++
++ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ rdisk->dev = MKDEV(0,0);
++ rdisk->used_slot = 0;
++
++ break;
++
++ case DISKOP_HOT_ADD_DISK:
++ adisk = conf->disks + added_disk;
++ added_desc = *d;
++
++ if (added_disk != added_desc->number) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ adisk->number = added_desc->number;
++ adisk->raid_disk = added_desc->raid_disk;
++ adisk->dev = MKDEV(added_desc->major,added_desc->minor);
++
++ adisk->operational = 0;
++ adisk->write_only = 0;
++ adisk->spare = 1;
++ adisk->used_slot = 1;
++
++
++ break;
++
++ default:
++ MD_BUG();
++ err = 1;
++ goto abort;
+ }
++abort:
+ restore_flags(flags);
+- return 0;
++ print_raid5_conf(conf);
++ return err;
+ }
+
+-static struct md_personality raid5_personality=
++static mdk_personality_t raid5_personality=
+ {
+ "raid5",
+ raid5_map,
+@@ -1648,14 +2072,19 @@
+ NULL, /* no ioctls */
+ 0,
+ raid5_error,
+- /* raid5_hot_add_disk, */ NULL,
+- /* raid1_hot_remove_drive */ NULL,
+- raid5_mark_spare
++ raid5_diskop,
++ raid5_stop_resync,
++ raid5_restart_resync
+ };
+
+ int raid5_init (void)
+ {
+- return register_md_personality (RAID5, &raid5_personality);
++ int err;
++
++ err = register_md_personality (RAID5, &raid5_personality);
++ if (err)
++ return err;
++ return 0;
+ }
+
+ #ifdef MODULE
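+A note on the batched xor_block() calling convention that the raid5.c
+changes above switch to: instead of xoring one source buffer into the
+destination per call, callers now collect up to MAX_XOR_BLOCKS buffer
+heads (destination first) and flush them in a single call. A minimal
+sketch of the accumulate-and-flush loop, assuming dest, src[] and
+nr_srcs stand in for the stripe buffers, with MAX_XOR_BLOCKS and
+xor_block() as declared in the new xor.c below:
+
+	struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
+	int i, count = 1;
+
+	bh_ptr[0] = dest;	/* slot 0 is always the destination */
+	for (i = 0; i < nr_srcs; i++) {
+		bh_ptr[count++] = src[i];
+		if (count == MAX_XOR_BLOCKS) {
+			/* xor the collected sources into slot 0 */
+			xor_block(count, &bh_ptr[0]);
+			count = 1;	/* keep dest, restart the batch */
+		}
+	}
+	if (count != 1)		/* flush any partial batch */
+		xor_block(count, &bh_ptr[0]);
+
+The READ_MODIFY_WRITE branch of compute_parity() flushes one slot early
+(count >= MAX_XOR_BLOCKS - 1) because it can append two buffers per
+iteration.
+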
+diff -uNr linux-orig/drivers/block/translucent.c linux/drivers/block/translucent.c
+--- linux-orig/drivers/block/translucent.c Wed Dec 31 17:00:00 1969
++++ linux/drivers/block/translucent.c Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,136 @@
++/*
++ translucent.c : Translucent RAID driver for Linux
++ Copyright (C) 1998 Ingo Molnar
++
++ Translucent mode management functions.
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2, or (at your option)
++ any later version.
++
++ You should have received a copy of the GNU General Public License
++ (for example /usr/src/linux/COPYING); if not, write to the Free
++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++#include <linux/module.h>
++
++#include <linux/raid/md.h>
++#include <linux/malloc.h>
++
++#include <linux/raid/translucent.h>
++
++#define MAJOR_NR MD_MAJOR
++#define MD_DRIVER
++#define MD_PERSONALITY
++
++static int translucent_run (mddev_t *mddev)
++{
++ translucent_conf_t *conf;
++ mdk_rdev_t *rdev;
++ int i;
++
++ MOD_INC_USE_COUNT;
++
++ conf = kmalloc (sizeof (*conf), GFP_KERNEL);
++ if (!conf)
++ goto out;
++ mddev->private = conf;
++
++ if (mddev->nb_dev != 2) {
++ printk("translucent: this mode needs 2 disks, aborting!\n");
++ goto out;
++ }
++
++ if (md_check_ordering(mddev)) {
++ printk("translucent: disks are not ordered, aborting!\n");
++ goto out;
++ }
++
++ ITERATE_RDEV_ORDERED(mddev,rdev,i) {
++ dev_info_t *disk = conf->disks + i;
++
++ disk->dev = rdev->dev;
++ disk->size = rdev->size;
++ }
++
++ return 0;
++
++out:
++ if (conf)
++ kfree(conf);
++
++ MOD_DEC_USE_COUNT;
++ return 1;
++}
++
++static int translucent_stop (mddev_t *mddev)
++{
++ translucent_conf_t *conf = mddev_to_conf(mddev);
++
++ kfree(conf);
++
++ MOD_DEC_USE_COUNT;
++
++ return 0;
++}
++
++
++static int translucent_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
++ unsigned long *rsector, unsigned long size)
++{
++ translucent_conf_t *conf = mddev_to_conf(mddev);
++
++ *rdev = conf->disks[0].dev;
++
++ return 0;
++}
++
++static int translucent_status (char *page, mddev_t *mddev)
++{
++ int sz = 0;
++
++ sz += sprintf(page+sz, " %d%% full", 10);
++ return sz;
++}
++
++
++static mdk_personality_t translucent_personality=
++{
++ "translucent",
++ translucent_map,
++ NULL,
++ NULL,
++ translucent_run,
++ translucent_stop,
++ translucent_status,
++ NULL,
++ 0,
++ NULL,
++ NULL,
++ NULL,
++ NULL
++};
++
++#ifndef MODULE
++
++md__initfunc(void translucent_init (void))
++{
++ register_md_personality (TRANSLUCENT, &translucent_personality);
++}
++
++#else
++
++int init_module (void)
++{
++ return (register_md_personality (TRANSLUCENT, &translucent_personality));
++}
++
++void cleanup_module (void)
++{
++ unregister_md_personality (TRANSLUCENT);
++}
++
++#endif
++
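+Since the mdk_personality_t initializer above is positional (this code
+base predates C99 designated initializers), the NULL slots are easy to
+misread. Below is an annotated copy of the translucent table; the field
+names are inferred from the way raid5_personality fills the same slots
+earlier in this patch, so treat them as an assumption rather than a
+quote from the md headers:
+
+	static mdk_personality_t translucent_personality =
+	{
+		"translucent",		/* name */
+		translucent_map,	/* map */
+		NULL,			/* make_request */
+		NULL,			/* end_request */
+		translucent_run,	/* run */
+		translucent_stop,	/* stop */
+		translucent_status,	/* status */
+		NULL,			/* ioctl */
+		0,			/* max_invalid_dev */
+		NULL,			/* error_handler */
+		NULL,			/* diskop */
+		NULL,			/* stop_resync */
+		NULL			/* restart_resync */
+	};
+
+Only map, run, stop and status are implemented, which matches the
+placeholder status of this personality: translucent_map() currently
+routes every request to disks[0] and never touches the second disk.
+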
+diff -uNr linux-orig/drivers/block/xor.c linux/drivers/block/xor.c
+--- linux-orig/drivers/block/xor.c Wed Dec 31 17:00:00 1969
++++ linux/drivers/block/xor.c Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,1894 @@
++/*
++ * xor.c : Multiple Devices driver for Linux
++ *
++ * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek
++ *
++ *
++ * optimized RAID-5 checksumming functions.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2, or (at your option)
++ * any later version.
++ *
++ * You should have received a copy of the GNU General Public License
++ * (for example /usr/src/linux/COPYING); if not, write to the Free
++ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++ */
++#include <linux/module.h>
++#include <linux/raid/md.h>
++#ifdef __sparc_v9__
++#include <asm/head.h>
++#include <asm/asi.h>
++#include <asm/visasm.h>
++#endif
++
++/*
++ * we use the 'XOR function template' to register multiple xor
++ * functions at runtime. The kernel measures their speed at bootup
++ * and decides which one to use. (compile-time registration is
++ * not enough, as certain CPU features like MMX can only be
++ * detected at runtime)
++ *
++ * this architecture makes it pretty easy to add new routines
++ * that are faster on certain CPUs, without killing other CPU's
++ * 'native' routine. Although the current routines are belived
++ * to be the physically fastest ones on all CPUs tested, but
++ * feel free to prove me wrong and add yet another routine =B-)
++ * --mingo
++ */
++
++#define MAX_XOR_BLOCKS 5
++
++#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr)
++
++typedef void (*xor_block_t) XOR_ARGS;
++xor_block_t xor_block = NULL;
++
++#ifndef __sparc_v9__
++
++struct xor_block_template;
++
++struct xor_block_template {
++ char * name;
++ xor_block_t xor_block;
++ int speed;
++ struct xor_block_template * next;
++};
++
++struct xor_block_template * xor_functions = NULL;
++
++#define XORBLOCK_TEMPLATE(x) \
++static void xor_block_##x XOR_ARGS; \
++static struct xor_block_template t_xor_block_##x = \
++ { #x, xor_block_##x, 0, NULL }; \
++static void xor_block_##x XOR_ARGS
++
++#ifdef __i386__
++
++#ifdef CONFIG_X86_XMM
++/*
++ * Cache avoiding checksumming functions utilizing KNI instructions
++ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
++ */
++
++XORBLOCK_TEMPLATE(pIII_kni)
++{
++ char xmm_save[16*4];
++ int cr0;
++ int lines = (bh_ptr[0]->b_size>>8);
++
++ __asm__ __volatile__ (
++ "movl %%cr0,%0 ;\n\t"
++ "clts ;\n\t"
++ "movups %%xmm0,(%1) ;\n\t"
++ "movups %%xmm1,0x10(%1) ;\n\t"
++ "movups %%xmm2,0x20(%1) ;\n\t"
++ "movups %%xmm3,0x30(%1) ;\n\t"
++ : "=r" (cr0)
++ : "r" (xmm_save)
++ : "memory" );
++
++#define OFFS(x) "8*("#x"*2)"
++#define PF0(x) \
++ " prefetcht0 "OFFS(x)"(%1) ;\n"
++#define LD(x,y) \
++ " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
++#define ST(x,y) \
++ " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
++#define PF1(x) \
++ " prefetchnta "OFFS(x)"(%2) ;\n"
++#define PF2(x) \
++ " prefetchnta "OFFS(x)"(%3) ;\n"
++#define PF3(x) \
++ " prefetchnta "OFFS(x)"(%4) ;\n"
++#define PF4(x) \
++ " prefetchnta "OFFS(x)"(%5) ;\n"
++#define PF5(x) \
++ " prefetchnta "OFFS(x)"(%6) ;\n"
++#define XO1(x,y) \
++ " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
++#define XO2(x,y) \
++ " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
++#define XO3(x,y) \
++ " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
++#define XO4(x,y) \
++ " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
++#define XO5(x,y) \
++ " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
++
++ switch(count) {
++ case 2:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ LD(i,0) \
++ LD(i+1,1) \
++ PF1(i) \
++ PF1(i+2) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ PF0(i+4) \
++ PF0(i+6) \
++ XO1(i,0) \
++ XO1(i+1,1) \
++ XO1(i+2,2) \
++ XO1(i+3,3) \
++ ST(i,0) \
++ ST(i+1,1) \
++ ST(i+2,2) \
++ ST(i+3,3) \
++
++
++ PF0(0)
++ PF0(2)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $256, %1 ;\n"
++ " addl $256, %2 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data)
++ : "memory" );
++ break;
++ case 3:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ PF1(i) \
++ PF1(i+2) \
++ LD(i,0) \
++ LD(i+1,1) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ PF2(i) \
++ PF2(i+2) \
++ PF0(i+4) \
++ PF0(i+6) \
++ XO1(i,0) \
++ XO1(i+1,1) \
++ XO1(i+2,2) \
++ XO1(i+3,3) \
++ XO2(i,0) \
++ XO2(i+1,1) \
++ XO2(i+2,2) \
++ XO2(i+3,3) \
++ ST(i,0) \
++ ST(i+1,1) \
++ ST(i+2,2) \
++ ST(i+3,3) \
++
++
++ PF0(0)
++ PF0(2)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $256, %1 ;\n"
++ " addl $256, %2 ;\n"
++ " addl $256, %3 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data)
++ : "memory" );
++ break;
++ case 4:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ PF1(i) \
++ PF1(i+2) \
++ LD(i,0) \
++ LD(i+1,1) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ PF2(i) \
++ PF2(i+2) \
++ XO1(i,0) \
++ XO1(i+1,1) \
++ XO1(i+2,2) \
++ XO1(i+3,3) \
++ PF3(i) \
++ PF3(i+2) \
++ PF0(i+4) \
++ PF0(i+6) \
++ XO2(i,0) \
++ XO2(i+1,1) \
++ XO2(i+2,2) \
++ XO2(i+3,3) \
++ XO3(i,0) \
++ XO3(i+1,1) \
++ XO3(i+2,2) \
++ XO3(i+3,3) \
++ ST(i,0) \
++ ST(i+1,1) \
++ ST(i+2,2) \
++ ST(i+3,3) \
++
++
++ PF0(0)
++ PF0(2)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $256, %1 ;\n"
++ " addl $256, %2 ;\n"
++ " addl $256, %3 ;\n"
++ " addl $256, %4 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data),
++ "r" (bh_ptr[3]->b_data)
++ : "memory" );
++ break;
++ case 5:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ PF1(i) \
++ PF1(i+2) \
++ LD(i,0) \
++ LD(i+1,1) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ PF2(i) \
++ PF2(i+2) \
++ XO1(i,0) \
++ XO1(i+1,1) \
++ XO1(i+2,2) \
++ XO1(i+3,3) \
++ PF3(i) \
++ PF3(i+2) \
++ XO2(i,0) \
++ XO2(i+1,1) \
++ XO2(i+2,2) \
++ XO2(i+3,3) \
++ PF4(i) \
++ PF4(i+2) \
++ PF0(i+4) \
++ PF0(i+6) \
++ XO3(i,0) \
++ XO3(i+1,1) \
++ XO3(i+2,2) \
++ XO3(i+3,3) \
++ XO4(i,0) \
++ XO4(i+1,1) \
++ XO4(i+2,2) \
++ XO4(i+3,3) \
++ ST(i,0) \
++ ST(i+1,1) \
++ ST(i+2,2) \
++ ST(i+3,3) \
++
++
++ PF0(0)
++ PF0(2)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $256, %1 ;\n"
++ " addl $256, %2 ;\n"
++ " addl $256, %3 ;\n"
++ " addl $256, %4 ;\n"
++ " addl $256, %5 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data),
++ "r" (bh_ptr[3]->b_data),
++ "r" (bh_ptr[4]->b_data)
++ : "memory");
++ break;
++ }
++
++ __asm__ __volatile__ (
++ "sfence ;\n\t"
++ "movups (%1),%%xmm0 ;\n\t"
++ "movups 0x10(%1),%%xmm1 ;\n\t"
++ "movups 0x20(%1),%%xmm2 ;\n\t"
++ "movups 0x30(%1),%%xmm3 ;\n\t"
++ "movl %0,%%cr0 ;\n\t"
++ :
++ : "r" (cr0), "r" (xmm_save)
++ : "memory" );
++}
++
++#undef OFFS
++#undef LD
++#undef ST
++#undef PF0
++#undef PF1
++#undef PF2
++#undef PF3
++#undef PF4
++#undef PF5
++#undef XO1
++#undef XO2
++#undef XO3
++#undef XO4
++#undef XO5
++#undef BLOCK
++
++#endif /* CONFIG_X86_XMM */
++
++/*
++ * high-speed RAID5 checksumming functions utilizing MMX instructions
++ * Copyright (C) 1998 Ingo Molnar
++ */
++XORBLOCK_TEMPLATE(pII_mmx)
++{
++ char fpu_save[108];
++ int lines = (bh_ptr[0]->b_size>>7);
++
++ if (!(current->flags & PF_USEDFPU))
++ __asm__ __volatile__ ( " clts;\n");
++
++ __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
++
++#define LD(x,y) \
++ " movq 8*("#x")(%1), %%mm"#y" ;\n"
++#define ST(x,y) \
++ " movq %%mm"#y", 8*("#x")(%1) ;\n"
++#define XO1(x,y) \
++ " pxor 8*("#x")(%2), %%mm"#y" ;\n"
++#define XO2(x,y) \
++ " pxor 8*("#x")(%3), %%mm"#y" ;\n"
++#define XO3(x,y) \
++ " pxor 8*("#x")(%4), %%mm"#y" ;\n"
++#define XO4(x,y) \
++ " pxor 8*("#x")(%5), %%mm"#y" ;\n"
++
++ switch(count) {
++ case 2:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ LD(i,0) \
++ LD(i+1,1) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ XO1(i,0) \
++ ST(i,0) \
++ XO1(i+1,1) \
++ ST(i+1,1) \
++ XO1(i+2,2) \
++ ST(i+2,2) \
++ XO1(i+3,3) \
++ ST(i+3,3)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $128, %1 ;\n"
++ " addl $128, %2 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data)
++ : "memory");
++ break;
++ case 3:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ LD(i,0) \
++ LD(i+1,1) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ XO1(i,0) \
++ XO1(i+1,1) \
++ XO1(i+2,2) \
++ XO1(i+3,3) \
++ XO2(i,0) \
++ ST(i,0) \
++ XO2(i+1,1) \
++ ST(i+1,1) \
++ XO2(i+2,2) \
++ ST(i+2,2) \
++ XO2(i+3,3) \
++ ST(i+3,3)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $128, %1 ;\n"
++ " addl $128, %2 ;\n"
++ " addl $128, %3 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data)
++ : "memory");
++ break;
++ case 4:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ LD(i,0) \
++ LD(i+1,1) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ XO1(i,0) \
++ XO1(i+1,1) \
++ XO1(i+2,2) \
++ XO1(i+3,3) \
++ XO2(i,0) \
++ XO2(i+1,1) \
++ XO2(i+2,2) \
++ XO2(i+3,3) \
++ XO3(i,0) \
++ ST(i,0) \
++ XO3(i+1,1) \
++ ST(i+1,1) \
++ XO3(i+2,2) \
++ ST(i+2,2) \
++ XO3(i+3,3) \
++ ST(i+3,3)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $128, %1 ;\n"
++ " addl $128, %2 ;\n"
++ " addl $128, %3 ;\n"
++ " addl $128, %4 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data),
++ "r" (bh_ptr[3]->b_data)
++ : "memory");
++ break;
++ case 5:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ LD(i,0) \
++ LD(i+1,1) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ XO1(i,0) \
++ XO1(i+1,1) \
++ XO1(i+2,2) \
++ XO1(i+3,3) \
++ XO2(i,0) \
++ XO2(i+1,1) \
++ XO2(i+2,2) \
++ XO2(i+3,3) \
++ XO3(i,0) \
++ XO3(i+1,1) \
++ XO3(i+2,2) \
++ XO3(i+3,3) \
++ XO4(i,0) \
++ ST(i,0) \
++ XO4(i+1,1) \
++ ST(i+1,1) \
++ XO4(i+2,2) \
++ ST(i+2,2) \
++ XO4(i+3,3) \
++ ST(i+3,3)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $128, %1 ;\n"
++ " addl $128, %2 ;\n"
++ " addl $128, %3 ;\n"
++ " addl $128, %4 ;\n"
++ " addl $128, %5 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data),
++ "r" (bh_ptr[3]->b_data),
++ "r" (bh_ptr[4]->b_data)
++ : "memory");
++ break;
++ }
++
++ __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
++
++ if (!(current->flags & PF_USEDFPU))
++ stts();
++}
++
++#undef LD
++#undef XO1
++#undef XO2
++#undef XO3
++#undef XO4
++#undef ST
++#undef BLOCK
++
++XORBLOCK_TEMPLATE(p5_mmx)
++{
++ char fpu_save[108];
++ int lines = (bh_ptr[0]->b_size>>6);
++
++ if (!(current->flags & PF_USEDFPU))
++ __asm__ __volatile__ ( " clts;\n");
++
++ __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
++
++ switch(count) {
++ case 2:
++ __asm__ __volatile__ (
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++ " movq (%1), %%mm0 ;\n"
++ " movq 8(%1), %%mm1 ;\n"
++ " pxor (%2), %%mm0 ;\n"
++ " movq 16(%1), %%mm2 ;\n"
++ " movq %%mm0, (%1) ;\n"
++ " pxor 8(%2), %%mm1 ;\n"
++ " movq 24(%1), %%mm3 ;\n"
++ " movq %%mm1, 8(%1) ;\n"
++ " pxor 16(%2), %%mm2 ;\n"
++ " movq 32(%1), %%mm4 ;\n"
++ " movq %%mm2, 16(%1) ;\n"
++ " pxor 24(%2), %%mm3 ;\n"
++ " movq 40(%1), %%mm5 ;\n"
++ " movq %%mm3, 24(%1) ;\n"
++ " pxor 32(%2), %%mm4 ;\n"
++ " movq 48(%1), %%mm6 ;\n"
++ " movq %%mm4, 32(%1) ;\n"
++ " pxor 40(%2), %%mm5 ;\n"
++ " movq 56(%1), %%mm7 ;\n"
++ " movq %%mm5, 40(%1) ;\n"
++ " pxor 48(%2), %%mm6 ;\n"
++ " pxor 56(%2), %%mm7 ;\n"
++ " movq %%mm6, 48(%1) ;\n"
++ " movq %%mm7, 56(%1) ;\n"
++
++ " addl $64, %1 ;\n"
++ " addl $64, %2 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data)
++ : "memory" );
++ break;
++ case 3:
++ __asm__ __volatile__ (
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++ " movq (%1), %%mm0 ;\n"
++ " movq 8(%1), %%mm1 ;\n"
++ " pxor (%2), %%mm0 ;\n"
++ " movq 16(%1), %%mm2 ;\n"
++ " pxor 8(%2), %%mm1 ;\n"
++ " pxor (%3), %%mm0 ;\n"
++ " pxor 16(%2), %%mm2 ;\n"
++ " movq %%mm0, (%1) ;\n"
++ " pxor 8(%3), %%mm1 ;\n"
++ " pxor 16(%3), %%mm2 ;\n"
++ " movq 24(%1), %%mm3 ;\n"
++ " movq %%mm1, 8(%1) ;\n"
++ " movq 32(%1), %%mm4 ;\n"
++ " movq 40(%1), %%mm5 ;\n"
++ " pxor 24(%2), %%mm3 ;\n"
++ " movq %%mm2, 16(%1) ;\n"
++ " pxor 32(%2), %%mm4 ;\n"
++ " pxor 24(%3), %%mm3 ;\n"
++ " pxor 40(%2), %%mm5 ;\n"
++ " movq %%mm3, 24(%1) ;\n"
++ " pxor 32(%3), %%mm4 ;\n"
++ " pxor 40(%3), %%mm5 ;\n"
++ " movq 48(%1), %%mm6 ;\n"
++ " movq %%mm4, 32(%1) ;\n"
++ " movq 56(%1), %%mm7 ;\n"
++ " pxor 48(%2), %%mm6 ;\n"
++ " movq %%mm5, 40(%1) ;\n"
++ " pxor 56(%2), %%mm7 ;\n"
++ " pxor 48(%3), %%mm6 ;\n"
++ " pxor 56(%3), %%mm7 ;\n"
++ " movq %%mm6, 48(%1) ;\n"
++ " movq %%mm7, 56(%1) ;\n"
++
++ " addl $64, %1 ;\n"
++ " addl $64, %2 ;\n"
++ " addl $64, %3 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data)
++ : "memory" );
++ break;
++ case 4:
++ __asm__ __volatile__ (
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++ " movq (%1), %%mm0 ;\n"
++ " movq 8(%1), %%mm1 ;\n"
++ " pxor (%2), %%mm0 ;\n"
++ " movq 16(%1), %%mm2 ;\n"
++ " pxor 8(%2), %%mm1 ;\n"
++ " pxor (%3), %%mm0 ;\n"
++ " pxor 16(%2), %%mm2 ;\n"
++ " pxor 8(%3), %%mm1 ;\n"
++ " pxor (%4), %%mm0 ;\n"
++ " movq 24(%1), %%mm3 ;\n"
++ " pxor 16(%3), %%mm2 ;\n"
++ " pxor 8(%4), %%mm1 ;\n"
++ " movq %%mm0, (%1) ;\n"
++ " movq 32(%1), %%mm4 ;\n"
++ " pxor 24(%2), %%mm3 ;\n"
++ " pxor 16(%4), %%mm2 ;\n"
++ " movq %%mm1, 8(%1) ;\n"
++ " movq 40(%1), %%mm5 ;\n"
++ " pxor 32(%2), %%mm4 ;\n"
++ " pxor 24(%3), %%mm3 ;\n"
++ " movq %%mm2, 16(%1) ;\n"
++ " pxor 40(%2), %%mm5 ;\n"
++ " pxor 32(%3), %%mm4 ;\n"
++ " pxor 24(%4), %%mm3 ;\n"
++ " movq %%mm3, 24(%1) ;\n"
++ " movq 56(%1), %%mm7 ;\n"
++ " movq 48(%1), %%mm6 ;\n"
++ " pxor 40(%3), %%mm5 ;\n"
++ " pxor 32(%4), %%mm4 ;\n"
++ " pxor 48(%2), %%mm6 ;\n"
++ " movq %%mm4, 32(%1) ;\n"
++ " pxor 56(%2), %%mm7 ;\n"
++ " pxor 40(%4), %%mm5 ;\n"
++ " pxor 48(%3), %%mm6 ;\n"
++ " pxor 56(%3), %%mm7 ;\n"
++ " movq %%mm5, 40(%1) ;\n"
++ " pxor 48(%4), %%mm6 ;\n"
++ " pxor 56(%4), %%mm7 ;\n"
++ " movq %%mm6, 48(%1) ;\n"
++ " movq %%mm7, 56(%1) ;\n"
++
++ " addl $64, %1 ;\n"
++ " addl $64, %2 ;\n"
++ " addl $64, %3 ;\n"
++ " addl $64, %4 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data),
++ "r" (bh_ptr[3]->b_data)
++ : "memory" );
++ break;
++ case 5:
++ __asm__ __volatile__ (
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++ " movq (%1), %%mm0 ;\n"
++ " movq 8(%1), %%mm1 ;\n"
++ " pxor (%2), %%mm0 ;\n"
++ " pxor 8(%2), %%mm1 ;\n"
++ " movq 16(%1), %%mm2 ;\n"
++ " pxor (%3), %%mm0 ;\n"
++ " pxor 8(%3), %%mm1 ;\n"
++ " pxor 16(%2), %%mm2 ;\n"
++ " pxor (%4), %%mm0 ;\n"
++ " pxor 8(%4), %%mm1 ;\n"
++ " pxor 16(%3), %%mm2 ;\n"
++ " movq 24(%1), %%mm3 ;\n"
++ " pxor (%5), %%mm0 ;\n"
++ " pxor 8(%5), %%mm1 ;\n"
++ " movq %%mm0, (%1) ;\n"
++ " pxor 16(%4), %%mm2 ;\n"
++ " pxor 24(%2), %%mm3 ;\n"
++ " movq %%mm1, 8(%1) ;\n"
++ " pxor 16(%5), %%mm2 ;\n"
++ " pxor 24(%3), %%mm3 ;\n"
++ " movq 32(%1), %%mm4 ;\n"
++ " movq %%mm2, 16(%1) ;\n"
++ " pxor 24(%4), %%mm3 ;\n"
++ " pxor 32(%2), %%mm4 ;\n"
++ " movq 40(%1), %%mm5 ;\n"
++ " pxor 24(%5), %%mm3 ;\n"
++ " pxor 32(%3), %%mm4 ;\n"
++ " pxor 40(%2), %%mm5 ;\n"
++ " movq %%mm3, 24(%1) ;\n"
++ " pxor 32(%4), %%mm4 ;\n"
++ " pxor 40(%3), %%mm5 ;\n"
++ " movq 48(%1), %%mm6 ;\n"
++ " movq 56(%1), %%mm7 ;\n"
++ " pxor 32(%5), %%mm4 ;\n"
++ " pxor 40(%4), %%mm5 ;\n"
++ " pxor 48(%2), %%mm6 ;\n"
++ " pxor 56(%2), %%mm7 ;\n"
++ " movq %%mm4, 32(%1) ;\n"
++ " pxor 48(%3), %%mm6 ;\n"
++ " pxor 56(%3), %%mm7 ;\n"
++ " pxor 40(%5), %%mm5 ;\n"
++ " pxor 48(%4), %%mm6 ;\n"
++ " pxor 56(%4), %%mm7 ;\n"
++ " movq %%mm5, 40(%1) ;\n"
++ " pxor 48(%5), %%mm6 ;\n"
++ " pxor 56(%5), %%mm7 ;\n"
++ " movq %%mm6, 48(%1) ;\n"
++ " movq %%mm7, 56(%1) ;\n"
++
++ " addl $64, %1 ;\n"
++ " addl $64, %2 ;\n"
++ " addl $64, %3 ;\n"
++ " addl $64, %4 ;\n"
++ " addl $64, %5 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data),
++ "r" (bh_ptr[3]->b_data),
++ "r" (bh_ptr[4]->b_data)
++ : "memory" );
++ break;
++ }
++
++ __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
++
++ if (!(current->flags & PF_USEDFPU))
++ stts();
++}
++#endif /* __i386__ */
++#endif /* !__sparc_v9__ */
++
++#ifdef __sparc_v9__
++/*
++ * High speed xor_block operation for RAID4/5 utilizing the
++ * UltraSparc Visual Instruction Set.
++ *
++ * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
++ *
++ * Requirements:
++ * !(((long)dest | (long)sourceN) & (64 - 1)) &&
++ * !(len & 127) && len >= 256
++ *
++ * It is done in pure assembly, as otherwise gcc makes it
++ * a non-leaf function, which is not what we want.
++ * Also, we don't measure the speeds as on other architectures,
++ * as the measuring routine does not take into account cold caches
++ * and the fact that xor_block_VIS bypasses the caches.
++ * xor_block_32regs might be 5% faster for count 2 if caches are hot
++ * and things are just right (for count 3 VIS is about as fast as
++ * 32regs with hot caches, and for counts 4 and 5 VIS is always faster
++ * by a good margin), but I think it is better not to pollute the
++ * caches. Actually, if I only cared about speed with hot caches, I
++ * could write a hybrid VIS/integer routine which would always do two
++ * 64B blocks in VIS and two in the IEUs, but I really care more about
++ * caches.
++ */
++extern void *VISenter(void);
++extern void xor_block_VIS XOR_ARGS;
++
++void __xor_block_VIS(void)
++{
++__asm__ ("
++ .globl xor_block_VIS
++xor_block_VIS:
++ ldx [%%o1 + 0], %%o4
++ ldx [%%o1 + 8], %%o3
++ ldx [%%o4 + %1], %%g5
++ ldx [%%o4 + %0], %%o4
++ ldx [%%o3 + %0], %%o3
++ rd %%fprs, %%o5
++ andcc %%o5, %2, %%g0
++ be,pt %%icc, 297f
++ sethi %%hi(%5), %%g1
++ jmpl %%g1 + %%lo(%5), %%g7
++ add %%g7, 8, %%g7
++297: wr %%g0, %4, %%fprs
++ membar #LoadStore|#StoreLoad|#StoreStore
++ sub %%g5, 64, %%g5
++ ldda [%%o4] %3, %%f0
++ ldda [%%o3] %3, %%f16
++ cmp %%o0, 4
++ bgeu,pt %%xcc, 10f
++ cmp %%o0, 3
++ be,pn %%xcc, 13f
++ mov -64, %%g1
++ sub %%g5, 64, %%g5
++ rd %%asi, %%g1
++ wr %%g0, %3, %%asi
++
++2: ldda [%%o4 + 64] %%asi, %%f32
++ fxor %%f0, %%f16, %%f16
++ fxor %%f2, %%f18, %%f18
++ fxor %%f4, %%f20, %%f20
++ fxor %%f6, %%f22, %%f22
++ fxor %%f8, %%f24, %%f24
++ fxor %%f10, %%f26, %%f26
++ fxor %%f12, %%f28, %%f28
++ fxor %%f14, %%f30, %%f30
++ stda %%f16, [%%o4] %3
++ ldda [%%o3 + 64] %%asi, %%f48
++ ldda [%%o4 + 128] %%asi, %%f0
++ fxor %%f32, %%f48, %%f48
++ fxor %%f34, %%f50, %%f50
++ add %%o4, 128, %%o4
++ fxor %%f36, %%f52, %%f52
++ add %%o3, 128, %%o3
++ fxor %%f38, %%f54, %%f54
++ subcc %%g5, 128, %%g5
++ fxor %%f40, %%f56, %%f56
++ fxor %%f42, %%f58, %%f58
++ fxor %%f44, %%f60, %%f60
++ fxor %%f46, %%f62, %%f62
++ stda %%f48, [%%o4 - 64] %%asi
++ bne,pt %%xcc, 2b
++ ldda [%%o3] %3, %%f16
++
++ ldda [%%o4 + 64] %%asi, %%f32
++ fxor %%f0, %%f16, %%f16
++ fxor %%f2, %%f18, %%f18
++ fxor %%f4, %%f20, %%f20
++ fxor %%f6, %%f22, %%f22
++ fxor %%f8, %%f24, %%f24
++ fxor %%f10, %%f26, %%f26
++ fxor %%f12, %%f28, %%f28
++ fxor %%f14, %%f30, %%f30
++ stda %%f16, [%%o4] %3
++ ldda [%%o3 + 64] %%asi, %%f48
++ membar #Sync
++ fxor %%f32, %%f48, %%f48
++ fxor %%f34, %%f50, %%f50
++ fxor %%f36, %%f52, %%f52
++ fxor %%f38, %%f54, %%f54
++ fxor %%f40, %%f56, %%f56
++ fxor %%f42, %%f58, %%f58
++ fxor %%f44, %%f60, %%f60
++ fxor %%f46, %%f62, %%f62
++ stda %%f48, [%%o4 + 64] %%asi
++ membar #Sync|#StoreStore|#StoreLoad
++ wr %%g0, 0, %%fprs
++ retl
++ wr %%g1, %%g0, %%asi
++
++13: ldx [%%o1 + 16], %%o2
++ ldx [%%o2 + %0], %%o2
++
++3: ldda [%%o2] %3, %%f32
++ fxor %%f0, %%f16, %%f48
++ fxor %%f2, %%f18, %%f50
++ add %%o4, 64, %%o4
++ fxor %%f4, %%f20, %%f52
++ fxor %%f6, %%f22, %%f54
++ add %%o3, 64, %%o3
++ fxor %%f8, %%f24, %%f56
++ fxor %%f10, %%f26, %%f58
++ fxor %%f12, %%f28, %%f60
++ fxor %%f14, %%f30, %%f62
++ ldda [%%o4] %3, %%f0
++ fxor %%f48, %%f32, %%f48
++ fxor %%f50, %%f34, %%f50
++ fxor %%f52, %%f36, %%f52
++ fxor %%f54, %%f38, %%f54
++ add %%o2, 64, %%o2
++ fxor %%f56, %%f40, %%f56
++ fxor %%f58, %%f42, %%f58
++ subcc %%g5, 64, %%g5
++ fxor %%f60, %%f44, %%f60
++ fxor %%f62, %%f46, %%f62
++ stda %%f48, [%%o4 + %%g1] %3
++ bne,pt %%xcc, 3b
++ ldda [%%o3] %3, %%f16
++
++ ldda [%%o2] %3, %%f32
++ fxor %%f0, %%f16, %%f48
++ fxor %%f2, %%f18, %%f50
++ fxor %%f4, %%f20, %%f52
++ fxor %%f6, %%f22, %%f54
++ fxor %%f8, %%f24, %%f56
++ fxor %%f10, %%f26, %%f58
++ fxor %%f12, %%f28, %%f60
++ fxor %%f14, %%f30, %%f62
++ membar #Sync
++ fxor %%f48, %%f32, %%f48
++ fxor %%f50, %%f34, %%f50
++ fxor %%f52, %%f36, %%f52
++ fxor %%f54, %%f38, %%f54
++ fxor %%f56, %%f40, %%f56
++ fxor %%f58, %%f42, %%f58
++ fxor %%f60, %%f44, %%f60
++ fxor %%f62, %%f46, %%f62
++ stda %%f48, [%%o4] %3
++ membar #Sync|#StoreStore|#StoreLoad
++ retl
++ wr %%g0, 0, %%fprs
++
++10: cmp %%o0, 5
++ be,pt %%xcc, 15f
++ mov -64, %%g1
++
++14: ldx [%%o1 + 16], %%o2
++ ldx [%%o1 + 24], %%o0
++ ldx [%%o2 + %0], %%o2
++ ldx [%%o0 + %0], %%o0
++
++4: ldda [%%o2] %3, %%f32
++ fxor %%f0, %%f16, %%f16
++ fxor %%f2, %%f18, %%f18
++ add %%o4, 64, %%o4
++ fxor %%f4, %%f20, %%f20
++ fxor %%f6, %%f22, %%f22
++ add %%o3, 64, %%o3
++ fxor %%f8, %%f24, %%f24
++ fxor %%f10, %%f26, %%f26
++ fxor %%f12, %%f28, %%f28
++ fxor %%f14, %%f30, %%f30
++ ldda [%%o0] %3, %%f48
++ fxor %%f16, %%f32, %%f32
++ fxor %%f18, %%f34, %%f34
++ fxor %%f20, %%f36, %%f36
++ fxor %%f22, %%f38, %%f38
++ add %%o2, 64, %%o2
++ fxor %%f24, %%f40, %%f40
++ fxor %%f26, %%f42, %%f42
++ fxor %%f28, %%f44, %%f44
++ fxor %%f30, %%f46, %%f46
++ ldda [%%o4] %3, %%f0
++ fxor %%f32, %%f48, %%f48
++ fxor %%f34, %%f50, %%f50
++ fxor %%f36, %%f52, %%f52
++ add %%o0, 64, %%o0
++ fxor %%f38, %%f54, %%f54
++ fxor %%f40, %%f56, %%f56
++ fxor %%f42, %%f58, %%f58
++ subcc %%g5, 64, %%g5
++ fxor %%f44, %%f60, %%f60
++ fxor %%f46, %%f62, %%f62
++ stda %%f48, [%%o4 + %%g1] %3
++ bne,pt %%xcc, 4b
++ ldda [%%o3] %3, %%f16
++
++ ldda [%%o2] %3, %%f32
++ fxor %%f0, %%f16, %%f16
++ fxor %%f2, %%f18, %%f18
++ fxor %%f4, %%f20, %%f20
++ fxor %%f6, %%f22, %%f22
++ fxor %%f8, %%f24, %%f24
++ fxor %%f10, %%f26, %%f26
++ fxor %%f12, %%f28, %%f28
++ fxor %%f14, %%f30, %%f30
++ ldda [%%o0] %3, %%f48
++ fxor %%f16, %%f32, %%f32
++ fxor %%f18, %%f34, %%f34
++ fxor %%f20, %%f36, %%f36
++ fxor %%f22, %%f38, %%f38
++ fxor %%f24, %%f40, %%f40
++ fxor %%f26, %%f42, %%f42
++ fxor %%f28, %%f44, %%f44
++ fxor %%f30, %%f46, %%f46
++ membar #Sync
++ fxor %%f32, %%f48, %%f48
++ fxor %%f34, %%f50, %%f50
++ fxor %%f36, %%f52, %%f52
++ fxor %%f38, %%f54, %%f54
++ fxor %%f40, %%f56, %%f56
++ fxor %%f42, %%f58, %%f58
++ fxor %%f44, %%f60, %%f60
++ fxor %%f46, %%f62, %%f62
++ stda %%f48, [%%o4] %3
++ membar #Sync|#StoreStore|#StoreLoad
++ retl
++ wr %%g0, 0, %%fprs
++
++15: ldx [%%o1 + 16], %%o2
++ ldx [%%o1 + 24], %%o0
++ ldx [%%o1 + 32], %%o1
++ ldx [%%o2 + %0], %%o2
++ ldx [%%o0 + %0], %%o0
++ ldx [%%o1 + %0], %%o1
++
++5: ldda [%%o2] %3, %%f32
++ fxor %%f0, %%f16, %%f48
++ fxor %%f2, %%f18, %%f50
++ add %%o4, 64, %%o4
++ fxor %%f4, %%f20, %%f52
++ fxor %%f6, %%f22, %%f54
++ add %%o3, 64, %%o3
++ fxor %%f8, %%f24, %%f56
++ fxor %%f10, %%f26, %%f58
++ fxor %%f12, %%f28, %%f60
++ fxor %%f14, %%f30, %%f62
++ ldda [%%o0] %3, %%f16
++ fxor %%f48, %%f32, %%f48
++ fxor %%f50, %%f34, %%f50
++ fxor %%f52, %%f36, %%f52
++ fxor %%f54, %%f38, %%f54
++ add %%o2, 64, %%o2
++ fxor %%f56, %%f40, %%f56
++ fxor %%f58, %%f42, %%f58
++ fxor %%f60, %%f44, %%f60
++ fxor %%f62, %%f46, %%f62
++ ldda [%%o1] %3, %%f32
++ fxor %%f48, %%f16, %%f48
++ fxor %%f50, %%f18, %%f50
++ add %%o0, 64, %%o0
++ fxor %%f52, %%f20, %%f52
++ fxor %%f54, %%f22, %%f54
++ add %%o1, 64, %%o1
++ fxor %%f56, %%f24, %%f56
++ fxor %%f58, %%f26, %%f58
++ fxor %%f60, %%f28, %%f60
++ fxor %%f62, %%f30, %%f62
++ ldda [%%o4] %3, %%f0
++ fxor %%f48, %%f32, %%f48
++ fxor %%f50, %%f34, %%f50
++ fxor %%f52, %%f36, %%f52
++ fxor %%f54, %%f38, %%f54
++ fxor %%f56, %%f40, %%f56
++ fxor %%f58, %%f42, %%f58
++ subcc %%g5, 64, %%g5
++ fxor %%f60, %%f44, %%f60
++ fxor %%f62, %%f46, %%f62
++ stda %%f48, [%%o4 + %%g1] %3
++ bne,pt %%xcc, 5b
++ ldda [%%o3] %3, %%f16
++
++ ldda [%%o2] %3, %%f32
++ fxor %%f0, %%f16, %%f48
++ fxor %%f2, %%f18, %%f50
++ fxor %%f4, %%f20, %%f52
++ fxor %%f6, %%f22, %%f54
++ fxor %%f8, %%f24, %%f56
++ fxor %%f10, %%f26, %%f58
++ fxor %%f12, %%f28, %%f60
++ fxor %%f14, %%f30, %%f62
++ ldda [%%o0] %3, %%f16
++ fxor %%f48, %%f32, %%f48
++ fxor %%f50, %%f34, %%f50
++ fxor %%f52, %%f36, %%f52
++ fxor %%f54, %%f38, %%f54
++ fxor %%f56, %%f40, %%f56
++ fxor %%f58, %%f42, %%f58
++ fxor %%f60, %%f44, %%f60
++ fxor %%f62, %%f46, %%f62
++ ldda [%%o1] %3, %%f32
++ fxor %%f48, %%f16, %%f48
++ fxor %%f50, %%f18, %%f50
++ fxor %%f52, %%f20, %%f52
++ fxor %%f54, %%f22, %%f54
++ fxor %%f56, %%f24, %%f56
++ fxor %%f58, %%f26, %%f58
++ fxor %%f60, %%f28, %%f60
++ fxor %%f62, %%f30, %%f62
++ membar #Sync
++ fxor %%f48, %%f32, %%f48
++ fxor %%f50, %%f34, %%f50
++ fxor %%f52, %%f36, %%f52
++ fxor %%f54, %%f38, %%f54
++ fxor %%f56, %%f40, %%f56
++ fxor %%f58, %%f42, %%f58
++ fxor %%f60, %%f44, %%f60
++ fxor %%f62, %%f46, %%f62
++ stda %%f48, [%%o4] %3
++ membar #Sync|#StoreStore|#StoreLoad
++ retl
++ wr %%g0, 0, %%fprs
++ " : :
++ "i" (&((struct buffer_head *)0)->b_data),
++ "i" (&((struct buffer_head *)0)->b_size),
++ "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P),
++ "i" (FPRS_FEF), "i" (VISenter));
++}
++#endif /* __sparc_v9__ */
++
++#if defined(__sparc__) && !defined(__sparc_v9__)
++/*
++ * High speed xor_block operation for RAID4/5 utilizing the
++ * ldd/std SPARC instructions.
++ *
++ * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
++ *
++ */
++
++XORBLOCK_TEMPLATE(SPARC)
++{
++ int size = bh_ptr[0]->b_size;
++ int lines = size / (sizeof (long)) / 8, i;
++ long *destp = (long *) bh_ptr[0]->b_data;
++ long *source1 = (long *) bh_ptr[1]->b_data;
++ long *source2, *source3, *source4;
++
++ switch (count) {
++ case 2:
++ for (i = lines; i > 0; i--) {
++ __asm__ __volatile__("
++ ldd [%0 + 0x00], %%g2
++ ldd [%0 + 0x08], %%g4
++ ldd [%0 + 0x10], %%o0
++ ldd [%0 + 0x18], %%o2
++ ldd [%1 + 0x00], %%o4
++ ldd [%1 + 0x08], %%l0
++ ldd [%1 + 0x10], %%l2
++ ldd [%1 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ std %%g2, [%0 + 0x00]
++ std %%g4, [%0 + 0x08]
++ std %%o0, [%0 + 0x10]
++ std %%o2, [%0 + 0x18]
++ " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0",
++ "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5");
++ destp += 8;
++ source1 += 8;
++ }
++ break;
++ case 3:
++ source2 = (long *) bh_ptr[2]->b_data;
++ for (i = lines; i > 0; i--) {
++ __asm__ __volatile__("
++ ldd [%0 + 0x00], %%g2
++ ldd [%0 + 0x08], %%g4
++ ldd [%0 + 0x10], %%o0
++ ldd [%0 + 0x18], %%o2
++ ldd [%1 + 0x00], %%o4
++ ldd [%1 + 0x08], %%l0
++ ldd [%1 + 0x10], %%l2
++ ldd [%1 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ ldd [%2 + 0x00], %%o4
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ ldd [%2 + 0x08], %%l0
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ ldd [%2 + 0x10], %%l2
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ ldd [%2 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ std %%g2, [%0 + 0x00]
++ std %%g4, [%0 + 0x08]
++ std %%o0, [%0 + 0x10]
++ std %%o2, [%0 + 0x18]
++ " : : "r" (destp), "r" (source1), "r" (source2)
++ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
++ "l0", "l1", "l2", "l3", "l4", "l5");
++ destp += 8;
++ source1 += 8;
++ source2 += 8;
++ }
++ break;
++ case 4:
++ source2 = (long *) bh_ptr[2]->b_data;
++ source3 = (long *) bh_ptr[3]->b_data;
++ for (i = lines; i > 0; i--) {
++ __asm__ __volatile__("
++ ldd [%0 + 0x00], %%g2
++ ldd [%0 + 0x08], %%g4
++ ldd [%0 + 0x10], %%o0
++ ldd [%0 + 0x18], %%o2
++ ldd [%1 + 0x00], %%o4
++ ldd [%1 + 0x08], %%l0
++ ldd [%1 + 0x10], %%l2
++ ldd [%1 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ ldd [%2 + 0x00], %%o4
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ ldd [%2 + 0x08], %%l0
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ ldd [%2 + 0x10], %%l2
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ ldd [%2 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ ldd [%3 + 0x00], %%o4
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ ldd [%3 + 0x08], %%l0
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ ldd [%3 + 0x10], %%l2
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ ldd [%3 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ std %%g2, [%0 + 0x00]
++ std %%g4, [%0 + 0x08]
++ std %%o0, [%0 + 0x10]
++ std %%o2, [%0 + 0x18]
++ " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3)
++ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
++ "l0", "l1", "l2", "l3", "l4", "l5");
++ destp += 8;
++ source1 += 8;
++ source2 += 8;
++ source3 += 8;
++ }
++ break;
++ case 5:
++ source2 = (long *) bh_ptr[2]->b_data;
++ source3 = (long *) bh_ptr[3]->b_data;
++ source4 = (long *) bh_ptr[4]->b_data;
++ for (i = lines; i > 0; i--) {
++ __asm__ __volatile__("
++ ldd [%0 + 0x00], %%g2
++ ldd [%0 + 0x08], %%g4
++ ldd [%0 + 0x10], %%o0
++ ldd [%0 + 0x18], %%o2
++ ldd [%1 + 0x00], %%o4
++ ldd [%1 + 0x08], %%l0
++ ldd [%1 + 0x10], %%l2
++ ldd [%1 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ ldd [%2 + 0x00], %%o4
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ ldd [%2 + 0x08], %%l0
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ ldd [%2 + 0x10], %%l2
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ ldd [%2 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ ldd [%3 + 0x00], %%o4
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ ldd [%3 + 0x08], %%l0
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ ldd [%3 + 0x10], %%l2
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ ldd [%3 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ ldd [%4 + 0x00], %%o4
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ ldd [%4 + 0x08], %%l0
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ ldd [%4 + 0x10], %%l2
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ ldd [%4 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ std %%g2, [%0 + 0x00]
++ std %%g4, [%0 + 0x08]
++ std %%o0, [%0 + 0x10]
++ std %%o2, [%0 + 0x18]
++ " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4)
++ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
++ "l0", "l1", "l2", "l3", "l4", "l5");
++ destp += 8;
++ source1 += 8;
++ source2 += 8;
++ source3 += 8;
++ source4 += 8;
++ }
++ break;
++ }
++}
++#endif /* __sparc_v[78]__ */
++
++#ifndef __sparc_v9__
++
++/*
++ * this one works reasonably on any x86 CPU
++ * (send me an assembly version for inclusion if you can make it faster)
++ *
++ * this one is just as fast as a version written in pure assembly
++ * on x86. the reason for this separate version is that the
++ * fast open-coded xor routine "32regs" produces suboptimal code
++ * on x86, due to the lack of registers.
++ */
++XORBLOCK_TEMPLATE(8regs)
++{
++ int len = bh_ptr[0]->b_size;
++ long *destp = (long *) bh_ptr[0]->b_data;
++ long *source1, *source2, *source3, *source4;
++ long lines = len / (sizeof (long)) / 8, i;
++
++ switch(count) {
++ case 2:
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ *(destp + 0) ^= *(source1 + 0);
++ *(destp + 1) ^= *(source1 + 1);
++ *(destp + 2) ^= *(source1 + 2);
++ *(destp + 3) ^= *(source1 + 3);
++ *(destp + 4) ^= *(source1 + 4);
++ *(destp + 5) ^= *(source1 + 5);
++ *(destp + 6) ^= *(source1 + 6);
++ *(destp + 7) ^= *(source1 + 7);
++ source1 += 8;
++ destp += 8;
++ }
++ break;
++ case 3:
++ source2 = (long *) bh_ptr[2]->b_data;
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ *(destp + 0) ^= *(source1 + 0);
++ *(destp + 0) ^= *(source2 + 0);
++ *(destp + 1) ^= *(source1 + 1);
++ *(destp + 1) ^= *(source2 + 1);
++ *(destp + 2) ^= *(source1 + 2);
++ *(destp + 2) ^= *(source2 + 2);
++ *(destp + 3) ^= *(source1 + 3);
++ *(destp + 3) ^= *(source2 + 3);
++ *(destp + 4) ^= *(source1 + 4);
++ *(destp + 4) ^= *(source2 + 4);
++ *(destp + 5) ^= *(source1 + 5);
++ *(destp + 5) ^= *(source2 + 5);
++ *(destp + 6) ^= *(source1 + 6);
++ *(destp + 6) ^= *(source2 + 6);
++ *(destp + 7) ^= *(source1 + 7);
++ *(destp + 7) ^= *(source2 + 7);
++ source1 += 8;
++ source2 += 8;
++ destp += 8;
++ }
++ break;
++ case 4:
++ source3 = (long *) bh_ptr[3]->b_data;
++ source2 = (long *) bh_ptr[2]->b_data;
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ *(destp + 0) ^= *(source1 + 0);
++ *(destp + 0) ^= *(source2 + 0);
++ *(destp + 0) ^= *(source3 + 0);
++ *(destp + 1) ^= *(source1 + 1);
++ *(destp + 1) ^= *(source2 + 1);
++ *(destp + 1) ^= *(source3 + 1);
++ *(destp + 2) ^= *(source1 + 2);
++ *(destp + 2) ^= *(source2 + 2);
++ *(destp + 2) ^= *(source3 + 2);
++ *(destp + 3) ^= *(source1 + 3);
++ *(destp + 3) ^= *(source2 + 3);
++ *(destp + 3) ^= *(source3 + 3);
++ *(destp + 4) ^= *(source1 + 4);
++ *(destp + 4) ^= *(source2 + 4);
++ *(destp + 4) ^= *(source3 + 4);
++ *(destp + 5) ^= *(source1 + 5);
++ *(destp + 5) ^= *(source2 + 5);
++ *(destp + 5) ^= *(source3 + 5);
++ *(destp + 6) ^= *(source1 + 6);
++ *(destp + 6) ^= *(source2 + 6);
++ *(destp + 6) ^= *(source3 + 6);
++ *(destp + 7) ^= *(source1 + 7);
++ *(destp + 7) ^= *(source2 + 7);
++ *(destp + 7) ^= *(source3 + 7);
++ source1 += 8;
++ source2 += 8;
++ source3 += 8;
++ destp += 8;
++ }
++ break;
++ case 5:
++ source4 = (long *) bh_ptr[4]->b_data;
++ source3 = (long *) bh_ptr[3]->b_data;
++ source2 = (long *) bh_ptr[2]->b_data;
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ *(destp + 0) ^= *(source1 + 0);
++ *(destp + 0) ^= *(source2 + 0);
++ *(destp + 0) ^= *(source3 + 0);
++ *(destp + 0) ^= *(source4 + 0);
++ *(destp + 1) ^= *(source1 + 1);
++ *(destp + 1) ^= *(source2 + 1);
++ *(destp + 1) ^= *(source3 + 1);
++ *(destp + 1) ^= *(source4 + 1);
++ *(destp + 2) ^= *(source1 + 2);
++ *(destp + 2) ^= *(source2 + 2);
++ *(destp + 2) ^= *(source3 + 2);
++ *(destp + 2) ^= *(source4 + 2);
++ *(destp + 3) ^= *(source1 + 3);
++ *(destp + 3) ^= *(source2 + 3);
++ *(destp + 3) ^= *(source3 + 3);
++ *(destp + 3) ^= *(source4 + 3);
++ *(destp + 4) ^= *(source1 + 4);
++ *(destp + 4) ^= *(source2 + 4);
++ *(destp + 4) ^= *(source3 + 4);
++ *(destp + 4) ^= *(source4 + 4);
++ *(destp + 5) ^= *(source1 + 5);
++ *(destp + 5) ^= *(source2 + 5);
++ *(destp + 5) ^= *(source3 + 5);
++ *(destp + 5) ^= *(source4 + 5);
++ *(destp + 6) ^= *(source1 + 6);
++ *(destp + 6) ^= *(source2 + 6);
++ *(destp + 6) ^= *(source3 + 6);
++ *(destp + 6) ^= *(source4 + 6);
++ *(destp + 7) ^= *(source1 + 7);
++ *(destp + 7) ^= *(source2 + 7);
++ *(destp + 7) ^= *(source3 + 7);
++ *(destp + 7) ^= *(source4 + 7);
++ source1 += 8;
++ source2 += 8;
++ source3 += 8;
++ source4 += 8;
++ destp += 8;
++ }
++ break;
++ }
++}
++
++/*
++ * platform independent RAID5 checksum calculation; this should
++ * be very fast on any platform that has a decent number of
++ * registers (32 or more).
++ */
++XORBLOCK_TEMPLATE(32regs)
++{
++ int size = bh_ptr[0]->b_size;
++ int lines = size / (sizeof (long)) / 8, i;
++ long *destp = (long *) bh_ptr[0]->b_data;
++ long *source1, *source2, *source3, *source4;
++
++ /* LOTS of registers available...
++ We do explicit loop-unrolling here for code which
++ favours RISC machines. In fact this is almost direct
++ RISC assembly on Alpha and SPARC :-) */
++
++
++ switch(count) {
++ case 2:
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ register long d0, d1, d2, d3, d4, d5, d6, d7;
++ d0 = destp[0]; /* Pull the stuff into registers */
++ d1 = destp[1]; /* ... in bursts, if possible. */
++ d2 = destp[2];
++ d3 = destp[3];
++ d4 = destp[4];
++ d5 = destp[5];
++ d6 = destp[6];
++ d7 = destp[7];
++ d0 ^= source1[0];
++ d1 ^= source1[1];
++ d2 ^= source1[2];
++ d3 ^= source1[3];
++ d4 ^= source1[4];
++ d5 ^= source1[5];
++ d6 ^= source1[6];
++ d7 ^= source1[7];
++ destp[0] = d0; /* Store the result (in bursts) */
++ destp[1] = d1;
++ destp[2] = d2;
++ destp[3] = d3;
++ destp[4] = d4; /* Store the result (in bursts) */
++ destp[5] = d5;
++ destp[6] = d6;
++ destp[7] = d7;
++ source1 += 8;
++ destp += 8;
++ }
++ break;
++ case 3:
++ source2 = (long *) bh_ptr[2]->b_data;
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ register long d0, d1, d2, d3, d4, d5, d6, d7;
++ d0 = destp[0]; /* Pull the stuff into registers */
++ d1 = destp[1]; /* ... in bursts, if possible. */
++ d2 = destp[2];
++ d3 = destp[3];
++ d4 = destp[4];
++ d5 = destp[5];
++ d6 = destp[6];
++ d7 = destp[7];
++ d0 ^= source1[0];
++ d1 ^= source1[1];
++ d2 ^= source1[2];
++ d3 ^= source1[3];
++ d4 ^= source1[4];
++ d5 ^= source1[5];
++ d6 ^= source1[6];
++ d7 ^= source1[7];
++ d0 ^= source2[0];
++ d1 ^= source2[1];
++ d2 ^= source2[2];
++ d3 ^= source2[3];
++ d4 ^= source2[4];
++ d5 ^= source2[5];
++ d6 ^= source2[6];
++ d7 ^= source2[7];
++ destp[0] = d0; /* Store the result (in bursts) */
++ destp[1] = d1;
++ destp[2] = d2;
++ destp[3] = d3;
++ destp[4] = d4; /* Store the result (in bursts) */
++ destp[5] = d5;
++ destp[6] = d6;
++ destp[7] = d7;
++ source1 += 8;
++ source2 += 8;
++ destp += 8;
++ }
++ break;
++ case 4:
++ source3 = (long *) bh_ptr[3]->b_data;
++ source2 = (long *) bh_ptr[2]->b_data;
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ register long d0, d1, d2, d3, d4, d5, d6, d7;
++ d0 = destp[0]; /* Pull the stuff into registers */
++ d1 = destp[1]; /* ... in bursts, if possible. */
++ d2 = destp[2];
++ d3 = destp[3];
++ d4 = destp[4];
++ d5 = destp[5];
++ d6 = destp[6];
++ d7 = destp[7];
++ d0 ^= source1[0];
++ d1 ^= source1[1];
++ d2 ^= source1[2];
++ d3 ^= source1[3];
++ d4 ^= source1[4];
++ d5 ^= source1[5];
++ d6 ^= source1[6];
++ d7 ^= source1[7];
++ d0 ^= source2[0];
++ d1 ^= source2[1];
++ d2 ^= source2[2];
++ d3 ^= source2[3];
++ d4 ^= source2[4];
++ d5 ^= source2[5];
++ d6 ^= source2[6];
++ d7 ^= source2[7];
++ d0 ^= source3[0];
++ d1 ^= source3[1];
++ d2 ^= source3[2];
++ d3 ^= source3[3];
++ d4 ^= source3[4];
++ d5 ^= source3[5];
++ d6 ^= source3[6];
++ d7 ^= source3[7];
++ destp[0] = d0; /* Store the result (in bursts) */
++ destp[1] = d1;
++ destp[2] = d2;
++ destp[3] = d3;
++ destp[4] = d4; /* Store the result (in bursts) */
++ destp[5] = d5;
++ destp[6] = d6;
++ destp[7] = d7;
++ source1 += 8;
++ source2 += 8;
++ source3 += 8;
++ destp += 8;
++ }
++ break;
++ case 5:
++ source4 = (long *) bh_ptr[4]->b_data;
++ source3 = (long *) bh_ptr[3]->b_data;
++ source2 = (long *) bh_ptr[2]->b_data;
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ register long d0, d1, d2, d3, d4, d5, d6, d7;
++ d0 = destp[0]; /* Pull the stuff into registers */
++ d1 = destp[1]; /* ... in bursts, if possible. */
++ d2 = destp[2];
++ d3 = destp[3];
++ d4 = destp[4];
++ d5 = destp[5];
++ d6 = destp[6];
++ d7 = destp[7];
++ d0 ^= source1[0];
++ d1 ^= source1[1];
++ d2 ^= source1[2];
++ d3 ^= source1[3];
++ d4 ^= source1[4];
++ d5 ^= source1[5];
++ d6 ^= source1[6];
++ d7 ^= source1[7];
++ d0 ^= source2[0];
++ d1 ^= source2[1];
++ d2 ^= source2[2];
++ d3 ^= source2[3];
++ d4 ^= source2[4];
++ d5 ^= source2[5];
++ d6 ^= source2[6];
++ d7 ^= source2[7];
++ d0 ^= source3[0];
++ d1 ^= source3[1];
++ d2 ^= source3[2];
++ d3 ^= source3[3];
++ d4 ^= source3[4];
++ d5 ^= source3[5];
++ d6 ^= source3[6];
++ d7 ^= source3[7];
++ d0 ^= source4[0];
++ d1 ^= source4[1];
++ d2 ^= source4[2];
++ d3 ^= source4[3];
++ d4 ^= source4[4];
++ d5 ^= source4[5];
++ d6 ^= source4[6];
++ d7 ^= source4[7];
++ destp[0] = d0; /* Store the result (in bursts) */
++ destp[1] = d1;
++ destp[2] = d2;
++ destp[3] = d3;
++ destp[4] = d4; /* Store the result (in bursts) */
++ destp[5] = d5;
++ destp[6] = d6;
++ destp[7] = d7;
++ source1 += 8;
++ source2 += 8;
++ source3 += 8;
++ source4 += 8;
++ destp += 8;
++ }
++ break;
++ }
++}
++
++/*
++ * (the -6*32 shift factor colors the cache)
++ */
++#define SIZE (PAGE_SIZE-6*32)
++
++static void xor_speed ( struct xor_block_template * func,
++ struct buffer_head *b1, struct buffer_head *b2)
++{
++ int speed;
++ unsigned long now;
++ int i, count, max;
++ struct buffer_head *bh_ptr[6];
++
++ func->next = xor_functions;
++ xor_functions = func;
++ bh_ptr[0] = b1;
++ bh_ptr[1] = b2;
++
++ /*
++ * count the number of XORs done during a whole jiffy and
++ * calculate the checksumming speed from that.
++ * (we use an order-2, i.e. four-page, allocation to get a
++ * guaranteed L1-cache coloring)
++ */
++ max = 0;
++ for (i = 0; i < 5; i++) {
++ now = jiffies;
++ count = 0;
++ while (jiffies == now) {
++ mb();
++ func->xor_block(2,bh_ptr);
++ mb();
++ count++;
++ mb();
++ }
++ if (count > max)
++ max = count;
++ }
++
++ speed = max * (HZ*SIZE/1024);
++ func->speed = speed;
++
++ printk( " %-10s: %5d.%03d MB/sec\n", func->name,
++ speed / 1000, speed % 1000);
++}
++
++static inline void pick_fastest_function(void)
++{
++ struct xor_block_template *f, *fastest;
++
++ fastest = xor_functions;
++ for (f = fastest; f; f = f->next) {
++ if (f->speed > fastest->speed)
++ fastest = f;
++ }
++#ifdef CONFIG_X86_XMM
++ if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
++ fastest = &t_xor_block_pIII_kni;
++ }
++#endif
++ xor_block = fastest->xor_block;
++ printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
++ fastest->speed / 1000, fastest->speed % 1000);
++}
++
++
++void calibrate_xor_block(void)
++{
++ struct buffer_head b1, b2;
++
++ memset(&b1,0,sizeof(b1));
++ b2 = b1;
++
++ b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2);
++ if (!b1.b_data) {
++ pick_fastest_function();
++ return;
++ }
++ b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE;
++
++ b1.b_size = SIZE;
++
++ printk(KERN_INFO "raid5: measuring checksumming speed\n");
++
++ sti(); /* should be safe */
++
++#if defined(__sparc__) && !defined(__sparc_v9__)
++ printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n");
++ xor_speed(&t_xor_block_SPARC,&b1,&b2);
++#endif
++
++#ifdef CONFIG_X86_XMM
++ if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
++ printk(KERN_INFO
++ "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n");
++ /* we force the use of the KNI xor block because it
++ can write around the L2 cache. we may also be able
++ to load into the L1 cache only, depending on how
++ the cpu deals with a load to a line that is
++ being prefetched.
++ */
++ xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
++ }
++#endif /* CONFIG_X86_XMM */
++
++#ifdef __i386__
++
++ if (md_cpu_has_mmx()) {
++ printk(KERN_INFO
++ "raid5: MMX detected, trying high-speed MMX checksum routines\n");
++ xor_speed(&t_xor_block_pII_mmx,&b1,&b2);
++ xor_speed(&t_xor_block_p5_mmx,&b1,&b2);
++ }
++
++#endif /* __i386__ */
++
++
++ xor_speed(&t_xor_block_8regs,&b1,&b2);
++ xor_speed(&t_xor_block_32regs,&b1,&b2);
++
++ free_pages((unsigned long)b1.b_data,2);
++ pick_fastest_function();
++}
++
++#else /* __sparc_v9__ */
++
++void calibrate_xor_block(void)
++{
++ printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n");
++ xor_block = xor_block_VIS;
++}
++
++#endif /* __sparc_v9__ */
++
++MD_EXPORT_SYMBOL(xor_block);
++
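The calibration logic above reduces to a simple pattern: register each
candidate routine, run it in fixed timer-tick windows while counting
iterations, keep the best count as its speed, and point the xor_block
function pointer at the winner. Below is a minimal user-space sketch of
the same pattern; all names are hypothetical and clock() stands in for
jiffies:

    #include <stdio.h>
    #include <time.h>

    #define WORDS 4096

    static void xor_basic(unsigned long *d, const unsigned long *s, int n)
    {
            int i;

            for (i = 0; i < n; i++)
                    d[i] ^= s[i];
    }

    static void xor_unrolled(unsigned long *d, const unsigned long *s, int n)
    {
            int i;

            for (i = 0; i < n; i += 4) {    /* assumes n % 4 == 0 */
                    d[i]     ^= s[i];
                    d[i + 1] ^= s[i + 1];
                    d[i + 2] ^= s[i + 2];
                    d[i + 3] ^= s[i + 3];
            }
    }

    struct candidate {
            const char *name;
            void (*fn)(unsigned long *, const unsigned long *, int);
            long count;
    };

    int main(void)
    {
            static unsigned long dst[WORDS], src[WORDS];
            struct candidate c[] = {
                    { "basic",    xor_basic,    0 },
                    { "unrolled", xor_unrolled, 0 },
            };
            int i, best = 0, n = sizeof(c) / sizeof(c[0]);

            for (i = 0; i < n; i++) {
                    clock_t end = clock() + CLOCKS_PER_SEC / 10;
                    long count = 0;

                    while (clock() < end) {         /* fixed window */
                            c[i].fn(dst, src, WORDS);
                            count++;
                    }
                    c[i].count = count;
                    if (count > c[best].count)
                            best = i;
            }
            printf("using fastest function: %s\n", c[best].name);
            return 0;
    }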
+diff -uNr linux-orig/include/linux/blkdev.h linux/include/linux/blkdev.h
+--- linux-orig/include/linux/blkdev.h Fri Dec 8 22:53:55 2000
++++ linux/include/linux/blkdev.h Fri Dec 8 22:54:52 2000
+@@ -91,8 +91,9 @@
+ extern void make_request(int major,int rw, struct buffer_head * bh);
+
+ /* md needs this function to remap requests */
+-extern int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size);
+-extern int md_make_request (int minor, int rw, struct buffer_head * bh);
++extern int md_map (kdev_t dev, kdev_t *rdev,
++ unsigned long *rsector, unsigned long size);
++extern int md_make_request (struct buffer_head * bh, int rw);
+ extern int md_error (kdev_t mddev, kdev_t rdev);
+
+ extern int * blk_size[MAX_BLKDEV];
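The prototype change above moves md_map() from a minor-number interface
to a kdev_t one; the contract is unchanged: the caller passes a device
and sector on the md device, and md_map() rewrites both in place to
name the underlying real disk. A hedged sketch of a caller follows; the
wrapper name and the sectors-based size unit are assumptions.

    /* Hypothetical caller illustrating the md_map() contract. */
    static int remap_if_md(kdev_t dev, unsigned long sector,
                           unsigned long size,
                           kdev_t *rdev, unsigned long *rsector)
    {
            *rdev = dev;            /* default: identity mapping */
            *rsector = sector;

            if (MAJOR(dev) != MD_MAJOR)
                    return 0;       /* not an md device, nothing to do */

            return md_map(dev, rdev, rsector, size);
    }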
+diff -uNr linux-orig/include/linux/fs.h linux/include/linux/fs.h
+--- linux-orig/include/linux/fs.h Fri Dec 8 22:53:55 2000
++++ linux/include/linux/fs.h Fri Dec 8 22:54:52 2000
+@@ -185,6 +185,7 @@
+ #define BH_Lock 2 /* 1 if the buffer is locked */
+ #define BH_Req 3 /* 0 if the buffer has been invalidated */
+ #define BH_Protected 6 /* 1 if the buffer is protected */
++#define BH_LowPrio 7 /* 1 if the buffer is lowprio */
+
+ /*
+ * Try to keep the most commonly used fields in single cache lines (16
+@@ -778,6 +779,7 @@
+ extern void refile_buffer(struct buffer_head * buf);
+ extern void set_writetime(struct buffer_head * buf, int flag);
+ extern int try_to_free_buffers(struct page *, int wait);
++extern void cache_drop_behind(struct buffer_head *bh);
+
+ extern int nr_buffers;
+ extern long buffermem;
+@@ -798,6 +800,25 @@
+ }
+ }
+
++extern inline void mark_buffer_highprio(struct buffer_head * bh)
++{
++ clear_bit(BH_LowPrio, &bh->b_state);
++}
++
++extern inline void mark_buffer_lowprio(struct buffer_head * bh)
++{
++ /*
++ * dirty buffers cannot be marked lowprio.
++ */
++ if (!buffer_dirty(bh))
++ set_bit(BH_LowPrio, &bh->b_state);
++}
++
++static inline int buffer_lowprio(struct buffer_head * bh)
++{
++ return test_bit(BH_LowPrio, &bh->b_state);
++}
++
+ extern inline void mark_buffer_dirty(struct buffer_head * bh, int flag)
+ {
+ if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
+@@ -805,6 +826,23 @@
+ if (bh->b_list != BUF_DIRTY)
+ refile_buffer(bh);
+ }
++ /*
++ * if a buffer gets marked dirty then it has to lose
++ * its lowprio state.
++ */
++ mark_buffer_highprio(bh);
++}
++
++extern inline void mark_buffer_dirty_lowprio(struct buffer_head * bh)
++{
++ if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
++ if (bh->b_list != BUF_DIRTY)
++ refile_buffer(bh);
++ /*
++ * Mark it lowprio only if it was not dirty before!
++ */
++ set_bit(BH_LowPrio, &bh->b_state);
++ }
+ }
+
+ extern int check_disk_change(kdev_t dev);
+@@ -882,6 +920,7 @@
+ extern struct buffer_head * find_buffer(kdev_t dev, int block, int size);
+ extern void ll_rw_block(int, int, struct buffer_head * bh[]);
+ extern int is_read_only(kdev_t);
++extern int is_device_idle(kdev_t);
+ extern void __brelse(struct buffer_head *);
+ extern inline void brelse(struct buffer_head *buf)
+ {
+@@ -897,8 +936,12 @@
+ extern void set_blocksize(kdev_t dev, int size);
+ extern unsigned int get_hardblocksize(kdev_t dev);
+ extern struct buffer_head * bread(kdev_t dev, int block, int size);
++extern struct buffer_head * buffer_ready (kdev_t dev, int block, int size);
++extern void bread_ahead (kdev_t dev, int block, int size);
+ extern struct buffer_head * breada(kdev_t dev,int block, int size,
+ unsigned int pos, unsigned int filesize);
++extern struct buffer_head * breada_blocks(kdev_t dev,int block,
++ int size, int blocks);
+
+ extern int brw_page(int, struct page *, kdev_t, int [], int, int);
+
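The BH_LowPrio machinery above encodes one invariant: a buffer may be
low priority only while clean, or when it was dirtied by an explicitly
background write; any normal dirtying promotes it back to high
priority. A hedged usage sketch (the helper name and the
background-write framing are illustrative, not from the patch):

    /* Buffers dirtied here are tagged BH_LowPrio so the I/O layer can
     * favour foreground traffic. If the buffer was already dirty from a
     * normal write, the test_and_set inside mark_buffer_dirty_lowprio()
     * leaves it at high priority. */
    static void write_block_in_background(struct buffer_head *bh)
    {
            mark_buffer_dirty_lowprio(bh);
            ll_rw_block(WRITE, 1, &bh);     /* queue as low-priority I/O */
    }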
+diff -uNr linux-orig/include/linux/md.h linux/include/linux/md.h
+--- linux-orig/include/linux/md.h Fri May 8 00:17:13 1998
++++ linux/include/linux/md.h Fri Dec 8 22:54:52 2000
+@@ -1,300 +0,0 @@
+-/*
+- md.h : Multiple Devices driver for Linux
+- Copyright (C) 1994-96 Marc ZYNGIER
+- <zyngier@ufr-info-p7.ibp.fr> or
+- <maz@gloups.fdn.fr>
+-
+- This program is free software; you can redistribute it and/or modify
+- it under the terms of the GNU General Public License as published by
+- the Free Software Foundation; either version 2, or (at your option)
+- any later version.
+-
+- You should have received a copy of the GNU General Public License
+- (for example /usr/src/linux/COPYING); if not, write to the Free
+- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+-*/
+-
+-#ifndef _MD_H
+-#define _MD_H
+-
+-#include <linux/major.h>
+-#include <linux/ioctl.h>
+-#include <linux/types.h>
+-
+-/*
+- * Different major versions are not compatible.
+- * Different minor versions are only downward compatible.
+- * Different patchlevel versions are downward and upward compatible.
+- */
+-#define MD_MAJOR_VERSION 0
+-#define MD_MINOR_VERSION 36
+-#define MD_PATCHLEVEL_VERSION 6
+-
+-#define MD_DEFAULT_DISK_READAHEAD (256 * 1024)
+-
+-/* ioctls */
+-#define REGISTER_DEV _IO (MD_MAJOR, 1)
+-#define START_MD _IO (MD_MAJOR, 2)
+-#define STOP_MD _IO (MD_MAJOR, 3)
+-#define REGISTER_DEV_NEW _IO (MD_MAJOR, 4)
+-
+-/*
+- personalities :
+- Byte 0 : Chunk size factor
+- Byte 1 : Fault tolerance count for each physical device
+- ( 0 means no fault tolerance,
+- 0xFF means always tolerate faults), not used by now.
+- Byte 2 : Personality
+- Byte 3 : Reserved.
+- */
+-
+-#define FAULT_SHIFT 8
+-#define PERSONALITY_SHIFT 16
+-
+-#define FACTOR_MASK 0x000000FFUL
+-#define FAULT_MASK 0x0000FF00UL
+-#define PERSONALITY_MASK 0x00FF0000UL
+-
+-#define MD_RESERVED 0 /* Not used by now */
+-#define LINEAR (1UL << PERSONALITY_SHIFT)
+-#define STRIPED (2UL << PERSONALITY_SHIFT)
+-#define RAID0 STRIPED
+-#define RAID1 (3UL << PERSONALITY_SHIFT)
+-#define RAID5 (4UL << PERSONALITY_SHIFT)
+-#define MAX_PERSONALITY 5
+-
+-/*
+- * MD superblock.
+- *
+- * The MD superblock maintains some statistics on each MD configuration.
+- * Each real device in the MD set contains it near the end of the device.
+- * Some of the ideas are copied from the ext2fs implementation.
+- *
+- * We currently use 4096 bytes as follows:
+- *
+- * word offset function
+- *
+- * 0 - 31 Constant generic MD device information.
+- * 32 - 63 Generic state information.
+- * 64 - 127 Personality specific information.
+- * 128 - 511 12 32-words descriptors of the disks in the raid set.
+- * 512 - 911 Reserved.
+- * 912 - 1023 Disk specific descriptor.
+- */
+-
+-/*
+- * If x is the real device size in bytes, we return an apparent size of:
+- *
+- * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
+- *
+- * and place the 4kB superblock at offset y.
+- */
+-#define MD_RESERVED_BYTES (64 * 1024)
+-#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
+-#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
+-
+-#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
+-#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
+-
+-#define MD_SB_BYTES 4096
+-#define MD_SB_WORDS (MD_SB_BYTES / 4)
+-#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
+-#define MD_SB_SECTORS (MD_SB_BYTES / 512)
+-
+-/*
+- * The following are counted in 32-bit words
+- */
+-#define MD_SB_GENERIC_OFFSET 0
+-#define MD_SB_PERSONALITY_OFFSET 64
+-#define MD_SB_DISKS_OFFSET 128
+-#define MD_SB_DESCRIPTOR_OFFSET 992
+-
+-#define MD_SB_GENERIC_CONSTANT_WORDS 32
+-#define MD_SB_GENERIC_STATE_WORDS 32
+-#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
+-#define MD_SB_PERSONALITY_WORDS 64
+-#define MD_SB_DISKS_WORDS 384
+-#define MD_SB_DESCRIPTOR_WORDS 32
+-#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
+-#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
+-#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
+-
+-/*
+- * Device "operational" state bits
+- */
+-#define MD_FAULTY_DEVICE 0 /* Device is faulty / operational */
+-#define MD_ACTIVE_DEVICE 1 /* Device is a part or the raid set / spare disk */
+-#define MD_SYNC_DEVICE 2 /* Device is in sync with the raid set */
+-
+-typedef struct md_device_descriptor_s {
+- __u32 number; /* 0 Device number in the entire set */
+- __u32 major; /* 1 Device major number */
+- __u32 minor; /* 2 Device minor number */
+- __u32 raid_disk; /* 3 The role of the device in the raid set */
+- __u32 state; /* 4 Operational state */
+- __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
+-} md_descriptor_t;
+-
+-#define MD_SB_MAGIC 0xa92b4efc
+-
+-/*
+- * Superblock state bits
+- */
+-#define MD_SB_CLEAN 0
+-#define MD_SB_ERRORS 1
+-
+-typedef struct md_superblock_s {
+-
+- /*
+- * Constant generic information
+- */
+- __u32 md_magic; /* 0 MD identifier */
+- __u32 major_version; /* 1 major version to which the set conforms */
+- __u32 minor_version; /* 2 minor version to which the set conforms */
+- __u32 patch_version; /* 3 patchlevel version to which the set conforms */
+- __u32 gvalid_words; /* 4 Number of non-reserved words in this section */
+- __u32 set_magic; /* 5 Raid set identifier */
+- __u32 ctime; /* 6 Creation time */
+- __u32 level; /* 7 Raid personality (mirroring, raid5, ...) */
+- __u32 size; /* 8 Apparent size of each individual disk, in kB */
+- __u32 nr_disks; /* 9 Number of total disks in the raid set */
+- __u32 raid_disks; /* 10 Number of disks in a fully functional raid set */
+- __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 11];
+-
+- /*
+- * Generic state information
+- */
+- __u32 utime; /* 0 Superblock update time */
+- __u32 state; /* 1 State bits (clean, ...) */
+- __u32 active_disks; /* 2 Number of currently active disks (some non-faulty disks might not be in sync) */
+- __u32 working_disks; /* 3 Number of working disks */
+- __u32 failed_disks; /* 4 Number of failed disks */
+- __u32 spare_disks; /* 5 Number of spare disks */
+- __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 6];
+-
+- /*
+- * Personality information
+- */
+- __u32 parity_algorithm;
+- __u32 chunk_size;
+- __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 2];
+-
+- /*
+- * Disks information
+- */
+- md_descriptor_t disks[MD_SB_DISKS];
+-
+- /*
+- * Reserved
+- */
+- __u32 reserved[MD_SB_RESERVED_WORDS];
+-
+- /*
+- * Active descriptor
+- */
+- md_descriptor_t descriptor;
+-} md_superblock_t;
+-
+-#ifdef __KERNEL__
+-
+-#include <linux/mm.h>
+-#include <linux/fs.h>
+-#include <linux/blkdev.h>
+-#include <asm/semaphore.h>
+-
+-/*
+- * Kernel-based reconstruction is mostly working, but still requires
+- * some additional work.
+- */
+-#define SUPPORT_RECONSTRUCTION 0
+-
+-#define MAX_REAL 8 /* Max number of physical dev per md dev */
+-#define MAX_MD_DEV 4 /* Max number of md dev */
+-
+-#define FACTOR(a) ((a)->repartition & FACTOR_MASK)
+-#define MAX_FAULT(a) (((a)->repartition & FAULT_MASK)>>8)
+-#define PERSONALITY(a) ((a)->repartition & PERSONALITY_MASK)
+-
+-#define FACTOR_SHIFT(a) (PAGE_SHIFT + (a) - 10)
+-
+-struct real_dev
+-{
+- kdev_t dev; /* Device number */
+- int size; /* Device size (in blocks) */
+- int offset; /* Real device offset (in blocks) in md dev
+- (only used in linear mode) */
+- struct inode *inode; /* Lock inode */
+- md_superblock_t *sb;
+- u32 sb_offset;
+-};
+-
+-struct md_dev;
+-
+-#define SPARE_INACTIVE 0
+-#define SPARE_WRITE 1
+-#define SPARE_ACTIVE 2
+-
+-struct md_personality
+-{
+- char *name;
+- int (*map)(struct md_dev *mddev, kdev_t *rdev,
+- unsigned long *rsector, unsigned long size);
+- int (*make_request)(struct md_dev *mddev, int rw, struct buffer_head * bh);
+- void (*end_request)(struct buffer_head * bh, int uptodate);
+- int (*run)(int minor, struct md_dev *mddev);
+- int (*stop)(int minor, struct md_dev *mddev);
+- int (*status)(char *page, int minor, struct md_dev *mddev);
+- int (*ioctl)(struct inode *inode, struct file *file,
+- unsigned int cmd, unsigned long arg);
+- int max_invalid_dev;
+- int (*error_handler)(struct md_dev *mddev, kdev_t dev);
+-
+-/*
+- * Some personalities (RAID-1, RAID-5) can get disks hot-added and
+- * hot-removed. Hot removal is different from failure. (failure marks
+- * a disk inactive, but the disk is still part of the array)
+- */
+- int (*hot_add_disk) (struct md_dev *mddev, kdev_t dev);
+- int (*hot_remove_disk) (struct md_dev *mddev, kdev_t dev);
+- int (*mark_spare) (struct md_dev *mddev, md_descriptor_t *descriptor, int state);
+-};
+-
+-struct md_dev
+-{
+- struct real_dev devices[MAX_REAL];
+- struct md_personality *pers;
+- md_superblock_t *sb;
+- int sb_dirty;
+- int repartition;
+- int busy;
+- int nb_dev;
+- void *private;
+-};
+-
+-struct md_thread {
+- void (*run) (void *data);
+- void *data;
+- struct wait_queue *wqueue;
+- unsigned long flags;
+- struct semaphore *sem;
+- struct task_struct *tsk;
+-};
+-
+-#define THREAD_WAKEUP 0
+-
+-extern struct md_dev md_dev[MAX_MD_DEV];
+-extern int md_size[MAX_MD_DEV];
+-extern int md_maxreadahead[MAX_MD_DEV];
+-
+-extern char *partition_name (kdev_t dev);
+-
+-extern int register_md_personality (int p_num, struct md_personality *p);
+-extern int unregister_md_personality (int p_num);
+-extern struct md_thread *md_register_thread (void (*run) (void *data), void *data);
+-extern void md_unregister_thread (struct md_thread *thread);
+-extern void md_wakeup_thread(struct md_thread *thread);
+-extern int md_update_sb (int minor);
+-extern int md_do_sync(struct md_dev *mddev);
+-
+-#endif __KERNEL__
+-#endif _MD_H
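The superblock placement documented in the removed header deserves a
worked example, since the replacement on-disk format keeps the same
64K-aligned scheme. With 128 reserved sectors, a hypothetical
10000-sector component device works out as follows:

    /* Placement macros from the header removed above; the
     * 10000-sector device size is hypothetical. */
    #define MD_RESERVED_BYTES       (64 * 1024)
    #define MD_RESERVED_SECTORS     (MD_RESERVED_BYTES / 512)   /* 128 */
    #define MD_NEW_SIZE_SECTORS(x) \
            (((x) & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)

    /*
     * MD_NEW_SIZE_SECTORS(10000) == (10000 & ~127) - 128
     *                            == 9984 - 128
     *                            == 9856
     *
     * so the 4kB superblock lands at sector 9856 and the array sees
     * an apparent component size of 9856 sectors.
     */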
+diff -uNr linux-orig/include/linux/raid/hsm.h linux/include/linux/raid/hsm.h
+--- linux-orig/include/linux/raid/hsm.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/hsm.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,65 @@
++#ifndef _HSM_H
++#define _HSM_H
++
++#include <linux/raid/md.h>
++
++#if __alpha__
++#error fix cpu_addr on Alpha first
++#endif
++
++#include <linux/raid/hsm_p.h>
++
++#define index_pv(lv,index) ((lv)->vg->pv_array+(index)->data.phys_nr)
++#define index_dev(lv,index) index_pv((lv),(index))->dev
++#define index_block(lv,index) (index)->data.phys_block
++#define index_child(index) ((lv_lptr_t *)((index)->cpu_addr))
++
++#define ptr_to_cpuaddr(ptr) ((__u32) (ptr))
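++/*
++ * note: 'cpu_addr' in an lv_lptr_t caches the in-core address of the
++ * child index block in a __u32 (hence the Alpha #error above: the
++ * scheme assumes kernel pointers fit in 32 bits), and index_child()
++ * casts it back to an lv_lptr_t pointer.
++ */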
++
++
++typedef struct pv_bg_desc_s {
++ unsigned int free_blocks;
++ pv_block_group_t *bg;
++} pv_bg_desc_t;
++
++typedef struct pv_s pv_t;
++typedef struct vg_s vg_t;
++typedef struct lv_s lv_t;
++
++struct pv_s
++{
++ int phys_nr;
++ kdev_t dev;
++ pv_sb_t *pv_sb;
++ pv_bg_desc_t *bg_array;
++};
++
++struct lv_s
++{
++ int log_id;
++ vg_t *vg;
++
++ unsigned int max_indices;
++ unsigned int free_indices;
++ lv_lptr_t root_index;
++
++ kdev_t dev;
++};
++
++struct vg_s
++{
++ int nr_pv;
++ pv_t pv_array [MD_SB_DISKS];
++
++ int nr_lv;
++ lv_t lv_array [HSM_MAX_LVS_PER_VG];
++
++ vg_sb_t *vg_sb;
++ mddev_t *mddev;
++};
++
++#define kdev_to_lv(dev) ((lv_t *) mddev_map[MINOR(dev)].data)
++#define mddev_to_vg(mddev) ((vg_t *) mddev->private)
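++/*
++ * each LV appears as its own md minor: kdev_to_lv() goes from the
++ * device number of an incoming request to the LV via the per-minor
++ * mddev_map 'data' slot, while mddev_to_vg() recovers the volume
++ * group from the array's opaque 'private' pointer.
++ */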
++
++#endif
++
+diff -uNr linux-orig/include/linux/raid/hsm_p.h linux/include/linux/raid/hsm_p.h
+--- linux-orig/include/linux/raid/hsm_p.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/hsm_p.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,237 @@
++#ifndef _HSM_P_H
++#define _HSM_P_H
++
++#define HSM_BLOCKSIZE 4096
++#define HSM_BLOCKSIZE_WORDS (HSM_BLOCKSIZE/4)
++#define PACKED __attribute__ ((packed))
++
++/*
++ * Identifies a block in physical space
++ */
++typedef struct phys_idx_s {
++ __u16 phys_nr;
++ __u32 phys_block;
++
++} PACKED phys_idx_t;
++
++/*
++ * Identifies a block in logical space
++ */
++typedef struct log_idx_s {
++ __u16 log_id;
++ __u32 log_index;
++
++} PACKED log_idx_t;
++
++/*
++ * Describes one PV
++ */
++#define HSM_PV_SB_MAGIC 0xf091ae9fU
++
++#define HSM_PV_SB_GENERIC_WORDS 32
++#define HSM_PV_SB_RESERVED_WORDS \
++ (HSM_BLOCKSIZE_WORDS - HSM_PV_SB_GENERIC_WORDS)
++
++/*
++ * On-disk PV identification data, on block 0 in any PV.
++ */
++typedef struct pv_sb_s
++{
++ __u32 pv_magic; /* 0 */
++
++ __u32 pv_uuid0; /* 1 */
++ __u32 pv_uuid1; /* 2 */
++ __u32 pv_uuid2; /* 3 */
++ __u32 pv_uuid3; /* 4 */
++
++ __u32 pv_major; /* 5 */
++ __u32 pv_minor; /* 6 */
++ __u32 pv_patch; /* 7 */
++
++ __u32 pv_ctime; /* 8 Creation time */
++
++ __u32 pv_total_size; /* 9 size of this PV, in blocks */
++ __u32 pv_first_free; /* 10 first free block */
++ __u32 pv_first_used; /* 11 first used block */
++ __u32 pv_blocks_left; /* 12 unallocated blocks */
++ __u32 pv_bg_size; /* 13 size of a block group, in blocks */
++ __u32 pv_block_size; /* 14 size of blocks, in bytes */
++ __u32 pv_pptr_size; /* 15 size of block descriptor, in bytes */
++ __u32 pv_block_groups; /* 16 number of block groups */
++
++ __u32 __reserved1[HSM_PV_SB_GENERIC_WORDS - 17];
++
++ /*
++ * Reserved
++ */
++ __u32 __reserved2[HSM_PV_SB_RESERVED_WORDS];
++
++} PACKED pv_sb_t;
++
++/*
++ * this is pretty much arbitrary, but has to be less than ~64
++ */
++#define HSM_MAX_LVS_PER_VG 32
++
++#define HSM_VG_SB_GENERIC_WORDS 32
++
++#define LV_DESCRIPTOR_WORDS 8
++#define HSM_VG_SB_RESERVED_WORDS (HSM_BLOCKSIZE_WORDS - \
++ LV_DESCRIPTOR_WORDS*HSM_MAX_LVS_PER_VG - HSM_VG_SB_GENERIC_WORDS)
++
++#if (HSM_VG_SB_RESERVED_WORDS < 0)
++#error you messed this one up dude ...
++#endif
++
++typedef struct lv_descriptor_s
++{
++ __u32 lv_id; /* 0 */
++ phys_idx_t lv_root_idx; /* 1 */
++ __u16 __reserved; /* 2 */
++ __u32 lv_max_indices; /* 3 */
++ __u32 lv_free_indices; /* 4 */
++ __u32 md_id; /* 5 */
++
++ __u32 reserved[LV_DESCRIPTOR_WORDS - 6];
++
++} PACKED lv_descriptor_t;
++
++#define HSM_VG_SB_MAGIC 0x98320d7aU
++/*
++ * On-disk VG identification data, in block 1 on all PVs
++ */
++typedef struct vg_sb_s
++{
++ __u32 vg_magic; /* 0 */
++ __u32 nr_lvs; /* 1 */
++
++ __u32 __reserved1[HSM_VG_SB_GENERIC_WORDS - 2];
++
++ lv_descriptor_t lv_array [HSM_MAX_LVS_PER_VG];
++ /*
++ * Reserved
++ */
++ __u32 __reserved2[HSM_VG_SB_RESERVED_WORDS];
++
++} PACKED vg_sb_t;
++
++/*
++ * Describes one LV
++ */
++
++#define HSM_LV_SB_MAGIC 0xe182bd8aU
++
++/* do we need lv_sb_t? */
++
++typedef struct lv_sb_s
++{
++ /*
++ * On-disk LV identifier
++ */
++ __u32 lv_magic; /* 0 LV identifier */
++ __u32 lv_uuid0; /* 1 */
++ __u32 lv_uuid1; /* 2 */
++ __u32 lv_uuid2; /* 3 */
++ __u32 lv_uuid3; /* 4 */
++
++ __u32 lv_major; /* 5 PV identifier */
++ __u32 lv_minor; /* 6 PV identifier */
++ __u32 lv_patch; /* 7 PV identifier */
++
++ __u32 ctime; /* 8 Creation time */
++ __u32 size; /* 9 size of this LV, in blocks */
++ phys_idx_t start; /* 10 position of root index block */
++ log_idx_t first_free; /* 11-12 first free index */
++
++ /*
++ * Reserved
++ */
++ __u32 reserved[HSM_BLOCKSIZE_WORDS-13];
++
++} PACKED lv_sb_t;
++
++/*
++ * Pointer from the physical space, points to
++ * the LV owning this block. It also contains various
++ * statistics about the physical block.
++ */
++typedef struct pv_pptr_s
++{
++ union {
++ /* case 1 */
++ struct {
++ log_idx_t owner;
++ log_idx_t predicted;
++ __u32 last_referenced;
++ } used;
++ /* case 2 */
++ struct {
++ __u16 log_id;
++ __u16 __unused1;
++ __u32 next_free;
++ __u32 __unused2;
++ __u32 __unused3;
++ } free;
++ } u;
++} PACKED pv_pptr_t;
++
++static __inline__ int pv_pptr_free (const pv_pptr_t * pptr)
++{
++ return !pptr->u.free.log_id;
++}
++
++
++#define DATA_BLOCKS_PER_BG ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1))
++
++#define TOTAL_BLOCKS_PER_BG (DATA_BLOCKS_PER_BG+1)
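++/*
++ * the constants above balance the bitmap against the pointer table:
++ * every data block costs one pv_pptr_t (8*sizeof() bits) plus one
++ * bitmap bit, and one descriptor block holds HSM_BLOCKSIZE*8 bits in
++ * total. with 4096 byte blocks and 16 byte pointers this works out
++ * to 32768/129 == 254 data blocks per group; a 32 byte bitmap plus
++ * 254*16 == 4064 bytes of pointers then fill the descriptor block
++ * exactly.
++ */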
++/*
++ * A table of pointers filling up a single block, managing
++ * the next DATA_BLOCKS_PER_BG physical blocks. Such block
++ * groups form the physical space of blocks.
++ */
++typedef struct pv_block_group_s
++{
++ __u8 used_bitmap[(DATA_BLOCKS_PER_BG+7)/8];
++
++ pv_pptr_t blocks[DATA_BLOCKS_PER_BG];
++
++} PACKED pv_block_group_t;
++
++/*
++ * Pointer from the logical space, points to
++ * the (PV,block) containing this logical block
++ */
++typedef struct lv_lptr_s
++{
++ phys_idx_t data;
++ __u16 __reserved;
++ __u32 cpu_addr;
++ __u32 __reserved2;
++
++} PACKED lv_lptr_t;
++
++static __inline__ int index_free (const lv_lptr_t * index)
++{
++ return !index->data.phys_block;
++}
++
++static __inline__ int index_present (const lv_lptr_t * index)
++{
++ return index->cpu_addr;
++}
++
++
++#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t))
++/*
++ * A table of pointers filling up a single block, managing
++ * HSM_LPTRS_PER_BLOCK logical blocks. Such block groups form
++ * the logical space of blocks.
++ */
++typedef struct lv_index_block_s
++{
++ lv_lptr_t blocks[HSM_LPTRS_PER_BLOCK];
++
++} PACKED lv_index_block_t;
++
++#endif
++
+diff -uNr linux-orig/include/linux/raid/linear.h linux/include/linux/raid/linear.h
+--- linux-orig/include/linux/raid/linear.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/linear.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,32 @@
++#ifndef _LINEAR_H
++#define _LINEAR_H
++
++#include <linux/raid/md.h>
++
++struct dev_info {
++ kdev_t dev;
++ int size;
++ unsigned int offset;
++};
++
++typedef struct dev_info dev_info_t;
++
++struct linear_hash
++{
++ dev_info_t *dev0, *dev1;
++};
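++/*
++ * (sketch of the idea: the hash table is indexed by a coarse offset,
++ * in units of the smallest member's size, so that each entry covers
++ * at most two devices; map() can then pick 'dev0' or 'dev1' directly
++ * instead of walking all disks. 'dev1' stays NULL when 'dev0' alone
++ * covers the span.)
++ */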
++
++struct linear_private_data
++{
++ struct linear_hash *hash_table;
++ dev_info_t disks[MD_SB_DISKS];
++ dev_info_t *smallest;
++ int nr_zones;
++};
++
++
++typedef struct linear_private_data linear_conf_t;
++
++#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
++
++#endif
+diff -uNr linux-orig/include/linux/raid/md.h linux/include/linux/raid/md.h
+--- linux-orig/include/linux/raid/md.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/md.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,96 @@
++/*
++ md.h : Multiple Devices driver for Linux
++ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
++ Copyright (C) 1994-96 Marc ZYNGIER
++ <zyngier@ufr-info-p7.ibp.fr> or
++ <maz@gloups.fdn.fr>
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2, or (at your option)
++ any later version.
++
++ You should have received a copy of the GNU General Public License
++ (for example /usr/src/linux/COPYING); if not, write to the Free
++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++#ifndef _MD_H
++#define _MD_H
++
++#include <linux/mm.h>
++#include <linux/fs.h>
++#include <linux/blkdev.h>
++#include <asm/semaphore.h>
++#include <linux/major.h>
++#include <linux/ioctl.h>
++#include <linux/types.h>
++#include <asm/bitops.h>
++#include <linux/module.h>
++#include <linux/hdreg.h>
++#include <linux/sysctl.h>
++#include <linux/proc_fs.h>
++#include <linux/smp_lock.h>
++#include <linux/delay.h>
++#include <net/checksum.h>
++#include <linux/random.h>
++#include <linux/locks.h>
++#include <asm/io.h>
++
++#include <linux/raid/md_compatible.h>
++/*
++ * 'md_p.h' holds the 'physical' layout of RAID devices
++ * 'md_u.h' holds the user <=> kernel API
++ *
++ * 'md_k.h' holds kernel internal definitions
++ */
++
++#include <linux/raid/md_p.h>
++#include <linux/raid/md_u.h>
++#include <linux/raid/md_k.h>
++
++/*
++ * Different major versions are not compatible.
++ * Different minor versions are only downward compatible.
++ * Different patchlevel versions are downward and upward compatible.
++ */
++#define MD_MAJOR_VERSION 0
++#define MD_MINOR_VERSION 90
++#define MD_PATCHLEVEL_VERSION 0
++
++extern int md_size[MAX_MD_DEVS];
++extern struct hd_struct md_hd_struct[MAX_MD_DEVS];
++
++extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
++extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev);
++extern char * partition_name (kdev_t dev);
++extern int register_md_personality (int p_num, mdk_personality_t *p);
++extern int unregister_md_personality (int p_num);
++extern mdk_thread_t * md_register_thread (void (*run) (void *data),
++ void *data, const char *name);
++extern void md_unregister_thread (mdk_thread_t *thread);
++extern void md_wakeup_thread(mdk_thread_t *thread);
++extern void md_interrupt_thread (mdk_thread_t *thread);
++extern int md_update_sb (mddev_t *mddev);
++extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
++extern void md_recover_arrays (void);
++extern int md_check_ordering (mddev_t *mddev);
++extern void autodetect_raid(void);
++extern struct gendisk * find_gendisk (kdev_t dev);
++extern int md_notify_reboot(struct notifier_block *this,
++ unsigned long code, void *x);
++#if CONFIG_BLK_DEV_MD
++extern void raid_setup(char *str,int *ints) md__init;
++#endif
++#ifdef CONFIG_MD_BOOT
++extern void md_setup(char *str,int *ints) md__init;
++#endif
++
++extern void md_print_devices (void);
++
++#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
++
++#endif /* _MD_H */
++
+diff -uNr linux-orig/include/linux/raid/md_compatible.h linux/include/linux/raid/md_compatible.h
+--- linux-orig/include/linux/raid/md_compatible.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/md_compatible.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,387 @@
++
++/*
++ md.h : Multiple Devices driver compatibility layer for Linux 2.0/2.2
++ Copyright (C) 1998 Ingo Molnar
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2, or (at your option)
++ any later version.
++
++ You should have received a copy of the GNU General Public License
++ (for example /usr/src/linux/COPYING); if not, write to the Free
++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++#include <linux/version.h>
++
++#ifndef _MD_COMPATIBLE_H
++#define _MD_COMPATIBLE_H
++
++#define LinuxVersionCode(v, p, s) (((v)<<16)+((p)<<8)+(s))
++
++#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
++
++/* 000 */
++#define md__get_free_pages(x,y) __get_free_pages(x,y,GFP_KERNEL)
++
++#ifdef __i386__
++/* 001 */
++extern __inline__ int md_cpu_has_mmx(void)
++{
++ return x86_capability & 0x00800000;
++}
++#endif
++
++/* 002 */
++#define md_clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
++
++/* 003 */
++/*
++ * someone please suggest a sane compatibility layer for modules
++ */
++#define MD_EXPORT_SYMBOL(x)
++
++/* 004 */
++static inline unsigned long
++md_copy_from_user(void *to, const void *from, unsigned long n)
++{
++ int err;
++
++ err = verify_area(VERIFY_READ,from,n);
++ if (!err)
++ memcpy_fromfs(to, from, n);
++ return err;
++}
++
++/* 005 */
++extern inline unsigned long
++md_copy_to_user(void *to, const void *from, unsigned long n)
++{
++ int err;
++
++ err = verify_area(VERIFY_WRITE,to,n);
++ if (!err)
++ memcpy_tofs(to, from, n);
++ return err;
++}
++
++/* 006 */
++#define md_put_user(x,ptr) \
++({ \
++ int __err; \
++ \
++ __err = verify_area(VERIFY_WRITE,ptr,sizeof(*ptr)); \
++ if (!__err) \
++ put_user(x,ptr); \
++ __err; \
++})
++
++/* 007 */
++extern inline int md_capable_admin(void)
++{
++ return suser();
++}
++
++/* 008 */
++#define MD_FILE_TO_INODE(file) ((file)->f_inode)
++
++/* 009 */
++extern inline void md_flush_signals (void)
++{
++ current->signal = 0;
++}
++
++/* 010 */
++#define __S(nr) (1<<((nr)-1))
++extern inline void md_init_signals (void)
++{
++ current->exit_signal = SIGCHLD;
++ current->blocked = ~(__S(SIGKILL));
++}
++#undef __S
++
++/* 011 */
++extern inline unsigned long md_signal_pending (struct task_struct * tsk)
++{
++ return (tsk->signal & ~tsk->blocked);
++}
++
++/* 012 */
++#define md_set_global_readahead(x) read_ahead[MD_MAJOR] = MD_READAHEAD
++
++/* 013 */
++#define md_mdelay(n) (\
++ {unsigned long msec=(n); while (msec--) udelay(1000);})
++
++/* 014 */
++#define MD_SYS_DOWN 0
++#define MD_SYS_HALT 0
++#define MD_SYS_POWER_OFF 0
++
++/* 015 */
++#define md_register_reboot_notifier(x)
++
++/* 016 */
++extern __inline__ unsigned long
++md_test_and_set_bit(int nr, void * addr)
++{
++ unsigned long flags;
++ unsigned long oldbit;
++
++ save_flags(flags);
++ cli();
++ oldbit = test_bit(nr,addr);
++ set_bit(nr,addr);
++ restore_flags(flags);
++ return oldbit;
++}
++
++/* 017 */
++extern __inline__ unsigned long
++md_test_and_clear_bit(int nr, void * addr)
++{
++ unsigned long flags;
++ unsigned long oldbit;
++
++ save_flags(flags);
++ cli();
++ oldbit = test_bit(nr,addr);
++ clear_bit(nr,addr);
++ restore_flags(flags);
++ return oldbit;
++}
++
++/* 018 */
++#define md_atomic_read(x) (*(volatile int *)(x))
++#define md_atomic_set(x,y) (*(volatile int *)(x) = (y))
++
++/* 019 */
++extern __inline__ void md_lock_kernel (void)
++{
++#if __SMP__
++ lock_kernel();
++ syscall_count++;
++#endif
++}
++
++extern __inline__ void md_unlock_kernel (void)
++{
++#if __SMP__
++ syscall_count--;
++ unlock_kernel();
++#endif
++}
++/* 020 */
++
++#define md__init
++#define md__initdata
++#define md__initfunc(__arginit) __arginit
++
++/* 021 */
++
++/* 022 */
++
++struct md_list_head {
++ struct md_list_head *next, *prev;
++};
++
++#define MD_LIST_HEAD(name) \
++ struct md_list_head name = { &name, &name }
++
++#define MD_INIT_LIST_HEAD(ptr) do { \
++ (ptr)->next = (ptr); (ptr)->prev = (ptr); \
++} while (0)
++
++static __inline__ void md__list_add(struct md_list_head * new,
++ struct md_list_head * prev,
++ struct md_list_head * next)
++{
++ next->prev = new;
++ new->next = next;
++ new->prev = prev;
++ prev->next = new;
++}
++
++static __inline__ void md_list_add(struct md_list_head *new,
++ struct md_list_head *head)
++{
++ md__list_add(new, head, head->next);
++}
++
++static __inline__ void md__list_del(struct md_list_head * prev,
++ struct md_list_head * next)
++{
++ next->prev = prev;
++ prev->next = next;
++}
++
++static __inline__ void md_list_del(struct md_list_head *entry)
++{
++ md__list_del(entry->prev, entry->next);
++}
++
++static __inline__ int md_list_empty(struct md_list_head *head)
++{
++ return head->next == head;
++}
++
++#define md_list_entry(ptr, type, member) \
++ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
++
++/* 023 */
++
++static __inline__ signed long md_schedule_timeout(signed long timeout)
++{
++ current->timeout = jiffies + timeout;
++ schedule();
++ return 0;
++}
++
++/* 024 */
++#define md_need_resched(tsk) (need_resched)
++
++/* 025 */
++typedef struct { int gcc_is_buggy; } md_spinlock_t;
++#define MD_SPIN_LOCK_UNLOCKED (md_spinlock_t) { 0 }
++
++#define md_spin_lock_irq cli
++#define md_spin_unlock_irq sti
++#define md_spin_unlock_irqrestore(x,flags) restore_flags(flags)
++#define md_spin_lock_irqsave(x,flags) do { save_flags(flags); cli(); } while (0)
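++/*
++ * i.e. on 2.0 the "spinlock" degenerates into global interrupt
++ * disabling; a sketch of what the macros above expand to:
++ *
++ *	md_spin_lock_irqsave(&lock, flags);
++ *		=> do { save_flags(flags); cli(); } while (0)
++ *
++ * the lock argument itself is ignored (UP semantics only).
++ */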
++
++/* END */
++
++#else
++
++#include <linux/reboot.h>
++#include <linux/vmalloc.h>
++
++/* 000 */
++#define md__get_free_pages(x,y) __get_free_pages(x,y)
++
++#ifdef __i386__
++/* 001 */
++extern __inline__ int md_cpu_has_mmx(void)
++{
++ return boot_cpu_data.x86_capability & X86_FEATURE_MMX;
++}
++#endif
++
++/* 002 */
++#define md_clear_page(page) clear_page(page)
++
++/* 003 */
++#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x)
++
++/* 004 */
++#define md_copy_to_user(x,y,z) copy_to_user(x,y,z)
++
++/* 005 */
++#define md_copy_from_user(x,y,z) copy_from_user(x,y,z)
++
++/* 006 */
++#define md_put_user put_user
++
++/* 007 */
++extern inline int md_capable_admin(void)
++{
++ return capable(CAP_SYS_ADMIN);
++}
++
++/* 008 */
++#define MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode)
++
++/* 009 */
++extern inline void md_flush_signals (void)
++{
++ spin_lock(&current->sigmask_lock);
++ flush_signals(current);
++ spin_unlock(&current->sigmask_lock);
++}
++
++/* 010 */
++extern inline void md_init_signals (void)
++{
++ current->exit_signal = SIGCHLD;
++ siginitsetinv(&current->blocked, sigmask(SIGKILL));
++}
++
++/* 011 */
++#define md_signal_pending signal_pending
++
++/* 012 */
++extern inline void md_set_global_readahead(int * table)
++{
++ max_readahead[MD_MAJOR] = table;
++}
++
++/* 013 */
++#define md_mdelay(x) mdelay(x)
++
++/* 014 */
++#define MD_SYS_DOWN SYS_DOWN
++#define MD_SYS_HALT SYS_HALT
++#define MD_SYS_POWER_OFF SYS_POWER_OFF
++
++/* 015 */
++#define md_register_reboot_notifier register_reboot_notifier
++
++/* 016 */
++#define md_test_and_set_bit test_and_set_bit
++
++/* 017 */
++#define md_test_and_clear_bit test_and_clear_bit
++
++/* 018 */
++#define md_atomic_read atomic_read
++#define md_atomic_set atomic_set
++
++/* 019 */
++#define md_lock_kernel lock_kernel
++#define md_unlock_kernel unlock_kernel
++
++/* 020 */
++
++#include <linux/init.h>
++
++#define md__init __init
++#define md__initdata __initdata
++#define md__initfunc(__arginit) __initfunc(__arginit)
++
++/* 021 */
++
++
++/* 022 */
++
++#define md_list_head list_head
++#define MD_LIST_HEAD(name) LIST_HEAD(name)
++#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr)
++#define md_list_add list_add
++#define md_list_del list_del
++#define md_list_empty list_empty
++
++#define md_list_entry(ptr, type, member) list_entry(ptr, type, member)
++
++/* 023 */
++
++#define md_schedule_timeout schedule_timeout
++
++/* 024 */
++#define md_need_resched(tsk) ((tsk)->need_resched)
++
++/* 025 */
++#define md_spinlock_t spinlock_t
++#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED
++
++#define md_spin_lock_irq spin_lock_irq
++#define md_spin_unlock_irq spin_unlock_irq
++#define md_spin_unlock_irqrestore spin_unlock_irqrestore
++#define md_spin_lock_irqsave spin_lock_irqsave
++
++/* END */
++
++#endif
++
++#endif /* _MD_COMPATIBLE_H */
++
+diff -uNr linux-orig/include/linux/raid/md_k.h linux/include/linux/raid/md_k.h
+--- linux-orig/include/linux/raid/md_k.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/md_k.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,338 @@
++/*
++ md_k.h : kernel internal structure of the Linux MD driver
++ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2, or (at your option)
++ any later version.
++
++ You should have received a copy of the GNU General Public License
++ (for example /usr/src/linux/COPYING); if not, write to the Free
++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++#ifndef _MD_K_H
++#define _MD_K_H
++
++#define MD_RESERVED 0UL
++#define LINEAR 1UL
++#define STRIPED 2UL
++#define RAID0 STRIPED
++#define RAID1 3UL
++#define RAID5 4UL
++#define TRANSLUCENT 5UL
++#define HSM 6UL
++#define MAX_PERSONALITY 7UL
++
++extern inline int pers_to_level (int pers)
++{
++ switch (pers) {
++ case HSM: return -3;
++ case TRANSLUCENT: return -2;
++ case LINEAR: return -1;
++ case RAID0: return 0;
++ case RAID1: return 1;
++ case RAID5: return 5;
++ }
++ panic("pers_to_level()");
++}
++
++extern inline int level_to_pers (int level)
++{
++ switch (level) {
++ case -3: return HSM;
++ case -2: return TRANSLUCENT;
++ case -1: return LINEAR;
++ case 0: return RAID0;
++ case 1: return RAID1;
++ case 4:
++ case 5: return RAID5;
++ }
++ return MD_RESERVED;
++}
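++/*
++ * e.g. the on-disk sb->level field stores these values, so a linear
++ * array has sb->level == -1 and level_to_pers(-1) == LINEAR, while
++ * pers_to_level(LINEAR) converts back when the superblock is written.
++ */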
++
++typedef struct mddev_s mddev_t;
++typedef struct mdk_rdev_s mdk_rdev_t;
++
++#if (MINORBITS != 8)
++#error MD does not handle bigger kdev yet
++#endif
++
++#define MAX_REAL 12 /* Max number of disks per md dev */
++#define MAX_MD_DEVS (1<<MINORBITS) /* Max number of md dev */
++
++/*
++ * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
++ * the personality. (e.g. HSM uses this to identify individual LVs)
++ */
++typedef struct dev_mapping_s {
++ mddev_t *mddev;
++ void *data;
++} dev_mapping_t;
++
++extern dev_mapping_t mddev_map [MAX_MD_DEVS];
++
++extern inline mddev_t * kdev_to_mddev (kdev_t dev)
++{
++ return mddev_map[MINOR(dev)].mddev;
++}
++
++/*
++ * options passed in raidrun:
++ */
++
++#define MAX_CHUNK_SIZE (4096*1024)
++
++/*
++ * default readahead
++ */
++#define MD_READAHEAD (256 * 512)
++
++extern inline int disk_faulty(mdp_disk_t * d)
++{
++ return d->state & (1 << MD_DISK_FAULTY);
++}
++
++extern inline int disk_active(mdp_disk_t * d)
++{
++ return d->state & (1 << MD_DISK_ACTIVE);
++}
++
++extern inline int disk_sync(mdp_disk_t * d)
++{
++ return d->state & (1 << MD_DISK_SYNC);
++}
++
++extern inline int disk_spare(mdp_disk_t * d)
++{
++ return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
++}
++
++extern inline int disk_removed(mdp_disk_t * d)
++{
++ return d->state & (1 << MD_DISK_REMOVED);
++}
++
++extern inline void mark_disk_faulty(mdp_disk_t * d)
++{
++ d->state |= (1 << MD_DISK_FAULTY);
++}
++
++extern inline void mark_disk_active(mdp_disk_t * d)
++{
++ d->state |= (1 << MD_DISK_ACTIVE);
++}
++
++extern inline void mark_disk_sync(mdp_disk_t * d)
++{
++ d->state |= (1 << MD_DISK_SYNC);
++}
++
++extern inline void mark_disk_spare(mdp_disk_t * d)
++{
++ d->state = 0;
++}
++
++extern inline void mark_disk_removed(mdp_disk_t * d)
++{
++ d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
++}
++
++extern inline void mark_disk_inactive(mdp_disk_t * d)
++{
++ d->state &= ~(1 << MD_DISK_ACTIVE);
++}
++
++extern inline void mark_disk_nonsync(mdp_disk_t * d)
++{
++ d->state &= ~(1 << MD_DISK_SYNC);
++}
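++/*
++ * e.g. a fresh spare has state == 0 (so disk_spare() above is true),
++ * a fully synced member has the ACTIVE and SYNC bits set, and a
++ * removed disk carries FAULTY|REMOVED, as written by the helpers
++ * above.
++ */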
++
++/*
++ * MD's 'extended' device
++ */
++struct mdk_rdev_s
++{
++ struct md_list_head same_set; /* RAID devices within the same set */
++ struct md_list_head all; /* all RAID devices */
++ struct md_list_head pending; /* undetected RAID devices */
++
++ kdev_t dev; /* Device number */
++ kdev_t old_dev; /* "" when it was last imported */
++ int size; /* Device size (in blocks) */
++ mddev_t *mddev; /* RAID array if running */
++ unsigned long last_events; /* IO event timestamp */
++
++ struct inode *inode; /* Lock inode */
++ struct file filp; /* Lock file */
++
++ mdp_super_t *sb;
++ int sb_offset;
++
++ int faulty; /* if faulty do not issue IO requests */
++ int desc_nr; /* descriptor index in the superblock */
++};
++
++
++/*
++ * disk operations in a working array:
++ */
++#define DISKOP_SPARE_INACTIVE 0
++#define DISKOP_SPARE_WRITE 1
++#define DISKOP_SPARE_ACTIVE 2
++#define DISKOP_HOT_REMOVE_DISK 3
++#define DISKOP_HOT_ADD_DISK 4
++
++typedef struct mdk_personality_s mdk_personality_t;
++
++struct mddev_s
++{
++ void *private;
++ mdk_personality_t *pers;
++ int __minor;
++ mdp_super_t *sb;
++ int nb_dev;
++ struct md_list_head disks;
++ int sb_dirty;
++ mdu_param_t param;
++ int ro;
++ unsigned int curr_resync;
++ unsigned long resync_start;
++ char *name;
++ int recovery_running;
++ struct semaphore reconfig_sem;
++ struct semaphore recovery_sem;
++ struct semaphore resync_sem;
++ struct md_list_head all_mddevs;
++};
++
++struct mdk_personality_s
++{
++ char *name;
++ int (*map)(mddev_t *mddev, kdev_t dev, kdev_t *rdev,
++ unsigned long *rsector, unsigned long size);
++ int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh);
++ void (*end_request)(struct buffer_head * bh, int uptodate);
++ int (*run)(mddev_t *mddev);
++ int (*stop)(mddev_t *mddev);
++ int (*status)(char *page, mddev_t *mddev);
++ int (*ioctl)(struct inode *inode, struct file *file,
++ unsigned int cmd, unsigned long arg);
++ int max_invalid_dev;
++ int (*error_handler)(mddev_t *mddev, kdev_t dev);
++
++/*
++ * Some personalities (RAID-1, RAID-5) can have disks hot-added and
++ * hot-removed. Hot removal is different from failure: failure marks
++ * a disk inactive, but the disk is still part of the array. The
++ * interface to such operations is the 'pers->diskop()' function,
++ * which can be NULL.
++ *
++ * the diskop function can change the pointer pointing to the incoming
++ * descriptor, but must do so very carefully. (currently only
++ * SPARE_ACTIVE expects such a change)
++ */
++ int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
++
++ int (*stop_resync)(mddev_t *mddev);
++ int (*restart_resync)(mddev_t *mddev);
++};
++
++
++/*
++ * Currently we index md_array directly, based on the minor
++ * number. This will have to change to dynamic allocation
++ * once we start supporting partitioning of md devices.
++ */
++extern inline int mdidx (mddev_t * mddev)
++{
++ return mddev->__minor;
++}
++
++extern inline kdev_t mddev_to_kdev(mddev_t * mddev)
++{
++ return MKDEV(MD_MAJOR, mdidx(mddev));
++}
++
++extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev);
++extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
++
++/*
++ * iterates through some rdev ringlist. It's safe to remove the
++ * current 'rdev'. Don't touch 'tmp' though.
++ */
++#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \
++ \
++ for (tmp = head.next; \
++ rdev = md_list_entry(tmp, mdk_rdev_t, field), \
++ tmp = tmp->next, tmp->prev != &head \
++ ; )
++/*
++ * iterates through the 'same array disks' ringlist
++ */
++#define ITERATE_RDEV(mddev,rdev,tmp) \
++ ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
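++/*
++ * usage sketch, e.g. printing every member of an array:
++ *
++ *	struct md_list_head *tmp;
++ *	mdk_rdev_t *rdev;
++ *
++ *	ITERATE_RDEV(mddev,rdev,tmp)
++ *		printk("<%s>", partition_name(rdev->dev));
++ */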
++
++/*
++ * Same as above, but assumes that rdev->desc_nr is numbered from 0 to
++ * mddev->nb_dev-1, and iterates through the rdevs in ascending order.
++ */
++#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \
++ for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
++
++
++/*
++ * Iterates through all 'RAID managed disks'
++ */
++#define ITERATE_RDEV_ALL(rdev,tmp) \
++ ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
++
++/*
++ * Iterates through 'pending RAID disks'
++ */
++#define ITERATE_RDEV_PENDING(rdev,tmp) \
++ ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
++
++/*
++ * iterates through all used mddevs in the system.
++ */
++#define ITERATE_MDDEV(mddev,tmp) \
++ \
++ for (tmp = all_mddevs.next; \
++ mddev = md_list_entry(tmp, mddev_t, all_mddevs), \
++ tmp = tmp->next, tmp->prev != &all_mddevs \
++ ; )
++
++extern inline int lock_mddev (mddev_t * mddev)
++{
++ return down_interruptible(&mddev->reconfig_sem);
++}
++
++extern inline void unlock_mddev (mddev_t * mddev)
++{
++ up(&mddev->reconfig_sem);
++}
++
++#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
++ x = y; y = __tmp; } while (0)
++
++typedef struct mdk_thread_s {
++ void (*run) (void *data);
++ void *data;
++ struct wait_queue *wqueue;
++ unsigned long flags;
++ struct semaphore *sem;
++ struct task_struct *tsk;
++ const char *name;
++} mdk_thread_t;
++
++#define THREAD_WAKEUP 0
++
++typedef struct dev_name_s {
++ struct md_list_head list;
++ kdev_t dev;
++ char name [MAX_DISKNAME_LEN];
++} dev_name_t;
++
++#endif /* _MD_K_H */
++
+diff -uNr linux-orig/include/linux/raid/md_p.h linux/include/linux/raid/md_p.h
+--- linux-orig/include/linux/raid/md_p.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/md_p.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,161 @@
++/*
++ md_p.h : physical layout of Linux RAID devices
++ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2, or (at your option)
++ any later version.
++
++ You should have received a copy of the GNU General Public License
++ (for example /usr/src/linux/COPYING); if not, write to the Free
++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++#ifndef _MD_P_H
++#define _MD_P_H
++
++/*
++ * RAID superblock.
++ *
++ * The RAID superblock maintains some statistics on each RAID configuration.
++ * Each real device in the RAID set contains it near the end of the device.
++ * Some of the ideas are copied from the ext2fs implementation.
++ *
++ * We currently use 4096 bytes as follows:
++ *
++ * word offset function
++ *
++ * 0 - 31 Constant generic RAID device information.
++ * 32 - 63 Generic state information.
++ * 64 - 127 Personality specific information.
++ * 128 - 511 12 32-word descriptors of the disks in the raid set.
++ * 512 - 991 Reserved.
++ * 992 - 1023 Disk specific descriptor.
++ */
++
++/*
++ * If x is the real device size in bytes, we return an apparent size of:
++ *
++ * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
++ *
++ * and place the 4kB superblock at offset y.
++ */
++#define MD_RESERVED_BYTES (64 * 1024)
++#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
++#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
++
++#define MD_NEW_SIZE_SECTORS(x) (((x) & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
++#define MD_NEW_SIZE_BLOCKS(x) (((x) & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
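++/*
++ * worked example (sizes picked for illustration only): for a device
++ * of 4120001 sectors, rounding down to a multiple of
++ * MD_RESERVED_SECTORS (128) gives 4119936, minus one reserved area:
++ *
++ *	MD_NEW_SIZE_SECTORS(4120001) == 4119808
++ *
++ * so the array uses sectors 0..4119807 for data and writes the 4kB
++ * superblock at sector 4119808 (MD_SB_SECTORS == 8 sectors long).
++ */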
++
++#define MD_SB_BYTES 4096
++#define MD_SB_WORDS (MD_SB_BYTES / 4)
++#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
++#define MD_SB_SECTORS (MD_SB_BYTES / 512)
++
++/*
++ * The following are counted in 32-bit words
++ */
++#define MD_SB_GENERIC_OFFSET 0
++#define MD_SB_PERSONALITY_OFFSET 64
++#define MD_SB_DISKS_OFFSET 128
++#define MD_SB_DESCRIPTOR_OFFSET 992
++
++#define MD_SB_GENERIC_CONSTANT_WORDS 32
++#define MD_SB_GENERIC_STATE_WORDS 32
++#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
++#define MD_SB_PERSONALITY_WORDS 64
++#define MD_SB_DISKS_WORDS 384
++#define MD_SB_DESCRIPTOR_WORDS 32
++#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
++#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
++#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
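++/*
++ * cross-check of the layout above: 64 generic + 64 personality +
++ * 384 disk descriptor + 480 reserved + 32 active descriptor words
++ * == 1024 words == 4096 bytes, and MD_SB_DISKS == 384/32 == 12
++ * descriptors.
++ */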
++
++/*
++ * Device "operational" state bits
++ */
++#define MD_DISK_FAULTY 0 /* disk is faulty / operational */
++#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */
++#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
++#define MD_DISK_REMOVED 3 /* disk has been removed from the raid set */
++
++typedef struct mdp_device_descriptor_s {
++ __u32 number; /* 0 Device number in the entire set */
++ __u32 major; /* 1 Device major number */
++ __u32 minor; /* 2 Device minor number */
++ __u32 raid_disk; /* 3 The role of the device in the raid set */
++ __u32 state; /* 4 Operational state */
++ __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
++} mdp_disk_t;
++
++#define MD_SB_MAGIC 0xa92b4efc
++
++/*
++ * Superblock state bits
++ */
++#define MD_SB_CLEAN 0
++#define MD_SB_ERRORS 1
++
++typedef struct mdp_superblock_s {
++ /*
++ * Constant generic information
++ */
++ __u32 md_magic; /* 0 MD identifier */
++ __u32 major_version; /* 1 major version to which the set conforms */
++ __u32 minor_version; /* 2 minor version ... */
++ __u32 patch_version; /* 3 patchlevel version ... */
++ __u32 gvalid_words; /* 4 Number of used words in this section */
++ __u32 set_uuid0; /* 5 Raid set identifier */
++ __u32 ctime; /* 6 Creation time */
++ __u32 level; /* 7 Raid personality */
++ __u32 size; /* 8 Apparent size of each individual disk */
++ __u32 nr_disks; /* 9 total disks in the raid set */
++ __u32 raid_disks; /* 10 disks in a fully functional raid set */
++ __u32 md_minor; /* 11 preferred MD minor device number */
++ __u32 not_persistent; /* 12 does it have a persistent superblock */
++ __u32 set_uuid1; /* 13 Raid set identifier #2 */
++ __u32 set_uuid2; /* 14 Raid set identifier #3 */
++ __u32 set_uuid3; /* 15 Raid set identifier #4 */
++ __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
++
++ /*
++ * Generic state information
++ */
++ __u32 utime; /* 0 Superblock update time */
++ __u32 state; /* 1 State bits (clean, ...) */
++ __u32 active_disks; /* 2 Number of currently active disks */
++ __u32 working_disks; /* 3 Number of working disks */
++ __u32 failed_disks; /* 4 Number of failed disks */
++ __u32 spare_disks; /* 5 Number of spare disks */
++ __u32 sb_csum; /* 6 checksum of the whole superblock */
++ __u64 events; /* 7 number of superblock updates (64-bit!) */
++ __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9];
++
++ /*
++ * Personality information
++ */
++ __u32 layout; /* 0 the array's physical layout */
++ __u32 chunk_size; /* 1 chunk size in bytes */
++ __u32 root_pv; /* 2 LV root PV */
++ __u32 root_block; /* 3 LV root block */
++ __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
++
++ /*
++ * Disks information
++ */
++ mdp_disk_t disks[MD_SB_DISKS];
++
++ /*
++ * Reserved
++ */
++ __u32 reserved[MD_SB_RESERVED_WORDS];
++
++ /*
++ * Active descriptor
++ */
++ mdp_disk_t this_disk;
++
++} mdp_super_t;
++
++#endif /* _MD_P_H */
++
+diff -uNr linux-orig/include/linux/raid/md_u.h linux/include/linux/raid/md_u.h
+--- linux-orig/include/linux/raid/md_u.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/md_u.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,115 @@
++/*
++ md_u.h : user <=> kernel API between Linux raidtools and RAID drivers
++ Copyright (C) 1998 Ingo Molnar
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2, or (at your option)
++ any later version.
++
++ You should have received a copy of the GNU General Public License
++ (for example /usr/src/linux/COPYING); if not, write to the Free
++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++#ifndef _MD_U_H
++#define _MD_U_H
++
++/* ioctls */
++
++/* status */
++#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t)
++#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t)
++#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t)
++#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13)
++
++/* configuration */
++#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20)
++#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t)
++#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22)
++#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t)
++#define SET_DISK_INFO _IO (MD_MAJOR, 0x24)
++#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25)
++#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26)
++#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27)
++#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28)
++#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29)
++
++/* usage */
++#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t)
++#define START_ARRAY _IO (MD_MAJOR, 0x31)
++#define STOP_ARRAY _IO (MD_MAJOR, 0x32)
++#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33)
++#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34)
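++/*
++ * usage sketch from user space (illustrative only; 'fd' and 'ver'
++ * are just local names of the caller):
++ *
++ *	int fd = open("/dev/md0", O_RDONLY);
++ *	mdu_version_t ver;
++ *
++ *	if (ioctl(fd, RAID_VERSION, &ver) == 0)
++ *		printf("md driver %d.%d.%d\n",
++ *			ver.major, ver.minor, ver.patchlevel);
++ */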
++
++typedef struct mdu_version_s {
++ int major;
++ int minor;
++ int patchlevel;
++} mdu_version_t;
++
++typedef struct mdu_array_info_s {
++ /*
++ * Generic constant information
++ */
++ int major_version;
++ int minor_version;
++ int patch_version;
++ int ctime;
++ int level;
++ int size;
++ int nr_disks;
++ int raid_disks;
++ int md_minor;
++ int not_persistent;
++
++ /*
++ * Generic state information
++ */
++ int utime; /* 0 Superblock update time */
++ int state; /* 1 State bits (clean, ...) */
++ int active_disks; /* 2 Number of currently active disks */
++ int working_disks; /* 3 Number of working disks */
++ int failed_disks; /* 4 Number of failed disks */
++ int spare_disks; /* 5 Number of spare disks */
++
++ /*
++ * Personality information
++ */
++ int layout; /* 0 the array's physical layout */
++ int chunk_size; /* 1 chunk size in bytes */
++
++} mdu_array_info_t;
++
++typedef struct mdu_disk_info_s {
++ /*
++ * configuration/status of one particular disk
++ */
++ int number;
++ int major;
++ int minor;
++ int raid_disk;
++ int state;
++
++} mdu_disk_info_t;
++
++typedef struct mdu_start_info_s {
++ /*
++ * configuration/status of one particular disk
++ */
++ int major;
++ int minor;
++ int raid_disk;
++ int state;
++
++} mdu_start_info_t;
++
++typedef struct mdu_param_s
++{
++ int personality; /* 1,2,3,4 */
++ int chunk_size; /* in bytes */
++ int max_fault; /* unused for now */
++} mdu_param_t;
++
++#endif /* _MD_U_H */
++
+diff -uNr linux-orig/include/linux/raid/raid0.h linux/include/linux/raid/raid0.h
+--- linux-orig/include/linux/raid/raid0.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/raid0.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,33 @@
++#ifndef _RAID0_H
++#define _RAID0_H
++
++#include <linux/raid/md.h>
++
++struct strip_zone
++{
++ int zone_offset; /* Zone offset in md_dev */
++ int dev_offset; /* Zone offset in real dev */
++ int size; /* Zone size */
++ int nb_dev; /* # of devices attached to the zone */
++ mdk_rdev_t *dev[MAX_REAL]; /* Devices attached to the zone */
++};
++
++struct raid0_hash
++{
++ struct strip_zone *zone0, *zone1;
++};
++
++struct raid0_private_data
++{
++ struct raid0_hash *hash_table; /* Dynamically allocated */
++ struct strip_zone *strip_zone; /* This one too */
++ int nr_strip_zones;
++ struct strip_zone *smallest;
++ int nr_zones;
++};
++
++typedef struct raid0_private_data raid0_conf_t;
++
++#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
++
++#endif
+diff -uNr linux-orig/include/linux/raid/raid1.h linux/include/linux/raid/raid1.h
+--- linux-orig/include/linux/raid/raid1.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/raid1.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,64 @@
++#ifndef _RAID1_H
++#define _RAID1_H
++
++#include <linux/raid/md.h>
++
++struct mirror_info {
++ int number;
++ int raid_disk;
++ kdev_t dev;
++ int next;
++ int sect_limit;
++
++ /*
++ * State bits:
++ */
++ int operational;
++ int write_only;
++ int spare;
++
++ int used_slot;
++};
++
++struct raid1_private_data {
++ mddev_t *mddev;
++ struct mirror_info mirrors[MD_SB_DISKS];
++ int nr_disks;
++ int raid_disks;
++ int working_disks;
++ int last_used;
++ unsigned long next_sect;
++ int sect_count;
++ mdk_thread_t *thread, *resync_thread;
++ int resync_mirrors;
++ struct mirror_info *spare;
++};
++
++typedef struct raid1_private_data raid1_conf_t;
++
++/*
++ * this is the only point in the RAID code where we violate
++ * C type safety. mddev->private is an 'opaque' pointer.
++ */
++#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private)
++
++/*
++ * this is our 'private' 'collective' RAID1 buffer head.
++ * it contains information about what kind of IO operations were started
++ * for this RAID1 operation, and about their status:
++ */
++
++struct raid1_bh {
++ atomic_t remaining; /* 'have we finished' count,
++ * used from IRQ handlers
++ */
++ int cmd;
++ unsigned long state;
++ mddev_t *mddev;
++ struct buffer_head *master_bh;
++ struct buffer_head *mirror_bh [MD_SB_DISKS];
++ struct buffer_head bh_req;
++ struct buffer_head *next_retry;
++};
++
++#endif
+diff -uNr linux-orig/include/linux/raid/raid5.h linux/include/linux/raid/raid5.h
+--- linux-orig/include/linux/raid/raid5.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/raid5.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,113 @@
++#ifndef _RAID5_H
++#define _RAID5_H
++
++#include <linux/raid/md.h>
++#include <linux/raid/xor.h>
++
++struct disk_info {
++ kdev_t dev;
++ int operational;
++ int number;
++ int raid_disk;
++ int write_only;
++ int spare;
++ int used_slot;
++};
++
++struct stripe_head {
++ md_spinlock_t stripe_lock;
++ struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
++ struct stripe_head *free_next; /* pool of free sh's */
++ struct buffer_head *buffer_pool; /* pool of free buffers */
++ struct buffer_head *bh_pool; /* pool of free bh's */
++ struct raid5_private_data *raid_conf;
++ struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
++ struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
++ struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
++ struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
++ int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
++ int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
++ unsigned long sector; /* sector of this row */
++ int size; /* buffers size */
++ int pd_idx; /* parity disk index */
++ atomic_t nr_pending; /* nr of pending cmds */
++ unsigned long state; /* state flags */
++ int cmd; /* stripe cmd */
++ int count; /* nr of waiters */
++ int write_method; /* reconstruct-write / read-modify-write */
++ int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
++ struct wait_queue *wait; /* processes waiting for this stripe */
++};
++
++/*
++ * Phase
++ */
++#define PHASE_BEGIN 0
++#define PHASE_READ_OLD 1
++#define PHASE_WRITE 2
++#define PHASE_READ 3
++#define PHASE_COMPLETE 4
++
++/*
++ * Write method
++ */
++#define METHOD_NONE 0
++#define RECONSTRUCT_WRITE 1
++#define READ_MODIFY_WRITE 2
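++/*
++ * e.g. on a 4+1 disk array a single-block update may either read the
++ * old data and parity blocks and recompute the delta
++ * (READ_MODIFY_WRITE, two reads), or read the three untouched data
++ * blocks and rebuild parity from scratch (RECONSTRUCT_WRITE, three
++ * reads); the stripe handling code chooses between them per stripe.
++ */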
++
++/*
++ * Stripe state
++ */
++#define STRIPE_LOCKED 0
++#define STRIPE_ERROR 1
++
++/*
++ * Stripe commands
++ */
++#define STRIPE_NONE 0
++#define STRIPE_WRITE 1
++#define STRIPE_READ 2
++
++struct raid5_private_data {
++ struct stripe_head **stripe_hashtbl;
++ mddev_t *mddev;
++ mdk_thread_t *thread, *resync_thread;
++ struct disk_info disks[MD_SB_DISKS];
++ struct disk_info *spare;
++ int buffer_size;
++ int chunk_size, level, algorithm;
++ int raid_disks, working_disks, failed_disks;
++ int sector_count;
++ unsigned long next_sector;
++ atomic_t nr_handle;
++ struct stripe_head *next_free_stripe;
++ int nr_stripes;
++ int resync_parity;
++ int max_nr_stripes;
++ int clock;
++ int nr_hashed_stripes;
++ int nr_locked_stripes;
++ int nr_pending_stripes;
++ int nr_cached_stripes;
++
++ /*
++ * Free stripes pool
++ */
++ int nr_free_sh;
++ struct stripe_head *free_sh_list;
++ struct wait_queue *wait_for_stripe;
++};
++
++typedef struct raid5_private_data raid5_conf_t;
++
++#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
++
++/*
++ * Our supported algorithms
++ */
++#define ALGORITHM_LEFT_ASYMMETRIC 0
++#define ALGORITHM_RIGHT_ASYMMETRIC 1
++#define ALGORITHM_LEFT_SYMMETRIC 2
++#define ALGORITHM_RIGHT_SYMMETRIC 3
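++/*
++ * "left"/"right" select whether the parity disk index decreases or
++ * increases from one stripe to the next; the "symmetric" variants
++ * additionally start each stripe's data right after the parity disk
++ * (wrapping around), which spreads sequential reads over all disks.
++ */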
++
++#endif
+diff -uNr linux-orig/include/linux/raid/translucent.h linux/include/linux/raid/translucent.h
+--- linux-orig/include/linux/raid/translucent.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/translucent.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,23 @@
++#ifndef _TRANSLUCENT_H
++#define _TRANSLUCENT_H
++
++#include <linux/raid/md.h>
++
++typedef struct dev_info dev_info_t;
++
++struct dev_info {
++ kdev_t dev;
++ int size;
++};
++
++struct translucent_private_data
++{
++ dev_info_t disks[MD_SB_DISKS];
++};
++
++
++typedef struct translucent_private_data translucent_conf_t;
++
++#define mddev_to_conf(mddev) ((translucent_conf_t *) mddev->private)
++
++#endif
+diff -uNr linux-orig/include/linux/raid/xor.h linux/include/linux/raid/xor.h
+--- linux-orig/include/linux/raid/xor.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/xor.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,12 @@
++#ifndef _XOR_H
++#define _XOR_H
++
++#include <linux/raid/md.h>
++
++#define MAX_XOR_BLOCKS 5
++
++extern void calibrate_xor_block(void);
++extern void (*xor_block)(unsigned int count,
++ struct buffer_head **bh_ptr);
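++/*
++ * convention (sketch): bh_ptr[0] is the destination, bh_ptr[1] up to
++ * bh_ptr[count-1] are xor-ed into it; calibrate_xor_block() runs once
++ * at startup to pick the fastest routine. e.g., with dest_bh/src_bh
++ * standing for whatever buffers the caller owns:
++ *
++ *	struct buffer_head *bh_ptr[2] = { dest_bh, src_bh };
++ *	xor_block(2, bh_ptr);
++ */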
++
++#endif
+diff -uNr linux-orig/include/linux/raid0.h linux/include/linux/raid0.h
+--- linux-orig/include/linux/raid0.h Tue Oct 29 06:20:24 1996
++++ linux/include/linux/raid0.h Fri Dec 8 22:54:52 2000
+@@ -1,27 +0,0 @@
+-#ifndef _RAID0_H
+-#define _RAID0_H
+-
+-struct strip_zone
+-{
+- int zone_offset; /* Zone offset in md_dev */
+- int dev_offset; /* Zone offset in real dev */
+- int size; /* Zone size */
+- int nb_dev; /* Number of devices attached to the zone */
+- struct real_dev *dev[MAX_REAL]; /* Devices attached to the zone */
+-};
+-
+-struct raid0_hash
+-{
+- struct strip_zone *zone0, *zone1;
+-};
+-
+-struct raid0_data
+-{
+- struct raid0_hash *hash_table; /* Dynamically allocated */
+- struct strip_zone *strip_zone; /* This one too */
+- int nr_strip_zones;
+- struct strip_zone *smallest;
+- int nr_zones;
+-};
+-
+-#endif
+diff -uNr linux-orig/include/linux/raid1.h linux/include/linux/raid1.h
+--- linux-orig/include/linux/raid1.h Fri May 8 00:17:13 1998
++++ linux/include/linux/raid1.h Fri Dec 8 22:54:52 2000
+@@ -1,49 +0,0 @@
+-#ifndef _RAID1_H
+-#define _RAID1_H
+-
+-#include <linux/md.h>
+-
+-struct mirror_info {
+- int number;
+- int raid_disk;
+- kdev_t dev;
+- int next;
+- int sect_limit;
+-
+- /*
+- * State bits:
+- */
+- int operational;
+- int write_only;
+- int spare;
+-};
+-
+-struct raid1_data {
+- struct md_dev *mddev;
+- struct mirror_info mirrors[MD_SB_DISKS]; /* RAID1 devices, 2 to MD_SB_DISKS */
+- int raid_disks;
+- int working_disks; /* Number of working disks */
+- int last_used;
+- unsigned long next_sect;
+- int sect_count;
+- int resync_running;
+-};
+-
+-/*
+- * this is our 'private' 'collective' RAID1 buffer head.
+- * it contains information about what kind of IO operations were started
+- * for this RAID5 operation, and about their status:
+- */
+-
+-struct raid1_bh {
+- unsigned int remaining;
+- int cmd;
+- unsigned long state;
+- struct md_dev *mddev;
+- struct buffer_head *master_bh;
+- struct buffer_head *mirror_bh [MD_SB_DISKS];
+- struct buffer_head bh_req;
+- struct buffer_head *next_retry;
+-};
+-
+-#endif
+diff -uNr linux-orig/include/linux/raid5.h linux/include/linux/raid5.h
+--- linux-orig/include/linux/raid5.h Fri May 8 00:17:13 1998
++++ linux/include/linux/raid5.h Fri Dec 8 22:54:52 2000
+@@ -1,110 +0,0 @@
+-#ifndef _RAID5_H
+-#define _RAID5_H
+-
+-#ifdef __KERNEL__
+-#include <linux/md.h>
+-#include <asm/atomic.h>
+-
+-struct disk_info {
+- kdev_t dev;
+- int operational;
+- int number;
+- int raid_disk;
+- int write_only;
+- int spare;
+-};
+-
+-struct stripe_head {
+- struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
+- struct stripe_head *free_next; /* pool of free sh's */
+- struct buffer_head *buffer_pool; /* pool of free buffers */
+- struct buffer_head *bh_pool; /* pool of free bh's */
+- struct raid5_data *raid_conf;
+- struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
+- struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
+- struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
+- struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
+- int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
+- int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
+- unsigned long sector; /* sector of this row */
+- int size; /* buffers size */
+- int pd_idx; /* parity disk index */
+- int nr_pending; /* nr of pending cmds */
+- unsigned long state; /* state flags */
+- int cmd; /* stripe cmd */
+- int count; /* nr of waiters */
+- int write_method; /* reconstruct-write / read-modify-write */
+- int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
+- struct wait_queue *wait; /* processes waiting for this stripe */
+-};
+-
+-/*
+- * Phase
+- */
+-#define PHASE_BEGIN 0
+-#define PHASE_READ_OLD 1
+-#define PHASE_WRITE 2
+-#define PHASE_READ 3
+-#define PHASE_COMPLETE 4
+-
+-/*
+- * Write method
+- */
+-#define METHOD_NONE 0
+-#define RECONSTRUCT_WRITE 1
+-#define READ_MODIFY_WRITE 2
+-
+-/*
+- * Stripe state
+- */
+-#define STRIPE_LOCKED 0
+-#define STRIPE_ERROR 1
+-
+-/*
+- * Stripe commands
+- */
+-#define STRIPE_NONE 0
+-#define STRIPE_WRITE 1
+-#define STRIPE_READ 2
+-
+-struct raid5_data {
+- struct stripe_head **stripe_hashtbl;
+- struct md_dev *mddev;
+- struct md_thread *thread, *resync_thread;
+- struct disk_info disks[MD_SB_DISKS];
+- struct disk_info *spare;
+- int buffer_size;
+- int chunk_size, level, algorithm;
+- int raid_disks, working_disks, failed_disks;
+- int sector_count;
+- unsigned long next_sector;
+- atomic_t nr_handle;
+- struct stripe_head *next_free_stripe;
+- int nr_stripes;
+- int resync_parity;
+- int max_nr_stripes;
+- int clock;
+- int nr_hashed_stripes;
+- int nr_locked_stripes;
+- int nr_pending_stripes;
+- int nr_cached_stripes;
+-
+- /*
+- * Free stripes pool
+- */
+- int nr_free_sh;
+- struct stripe_head *free_sh_list;
+- struct wait_queue *wait_for_stripe;
+-};
+-
+-#endif
+-
+-/*
+- * Our supported algorithms
+- */
+-#define ALGORITHM_LEFT_ASYMMETRIC 0
+-#define ALGORITHM_RIGHT_ASYMMETRIC 1
+-#define ALGORITHM_LEFT_SYMMETRIC 2
+-#define ALGORITHM_RIGHT_SYMMETRIC 3
+-
+-#endif
+diff -uNr linux-orig/include/linux/sysctl.h linux/include/linux/sysctl.h
+--- linux-orig/include/linux/sysctl.h Fri Dec 8 22:53:55 2000
++++ linux/include/linux/sysctl.h Fri Dec 8 22:57:02 2000
+@@ -435,6 +435,7 @@
+ enum {
+ DEV_CDROM=1,
+ DEV_HWMON=2,
++ DEV_MD=4,
+ DEV_MAC_HID=5
+ };
+
+@@ -446,6 +447,11 @@
+ DEV_CDROM_DEBUG=4,
+ DEV_CDROM_LOCK=5,
+ DEV_CDROM_CHECK_MEDIA=6
++};
++
++/* /proc/sys/dev/md */
++enum {
++ DEV_MD_SPEED_LIMIT=1
+ };
+
+ /* /proc/sys/dev/mac_hid */
+diff -uNr linux-orig/init/main.c linux/init/main.c
+--- linux-orig/init/main.c Fri Dec 8 22:53:56 2000
++++ linux/init/main.c Fri Dec 8 22:54:52 2000
+@@ -19,6 +19,7 @@
+ #include <linux/utsname.h>
+ #include <linux/ioport.h>
+ #include <linux/init.h>
++#include <linux/raid/md.h>
+ #include <linux/smp_lock.h>
+ #include <linux/blk.h>
+ #include <linux/hdreg.h>
+@@ -550,7 +551,7 @@
+ #ifdef CONFIG_BLK_DEV_FD
+ { "fd", 0x0200 },
+ #endif
+-#ifdef CONFIG_MD_BOOT
++#if CONFIG_MD_BOOT || CONFIG_AUTODETECT_RAID
+ { "md", 0x0900 },
+ #endif
+ #ifdef CONFIG_BLK_DEV_XD
+@@ -1058,6 +1059,9 @@
+ #ifdef CONFIG_MD_BOOT
+ { "md=", md_setup},
+ #endif
++#if CONFIG_BLK_DEV_MD
++ { "raid=", raid_setup},
++#endif
+ #ifdef CONFIG_ADBMOUSE
+ { "adb_buttons=", adb_mouse_setup },
+ #endif
+@@ -1637,6 +1641,9 @@
+ while (pid != wait(&i));
+ if (MAJOR(real_root_dev) != RAMDISK_MAJOR
+ || MINOR(real_root_dev) != 0) {
++#ifdef CONFIG_BLK_DEV_MD
++ autodetect_raid();
++#endif
+ error = change_root(real_root_dev,"/initrd");
+ if (error)
+ printk(KERN_ERR "Change root to /initrd: "