--- /dev/null
+diff -uNr linux-orig/Documentation/Configure.help linux/Documentation/Configure.help
+--- linux-orig/Documentation/Configure.help Fri Dec 8 22:53:50 2000
++++ linux/Documentation/Configure.help Fri Dec 8 22:54:52 2000
+@@ -1015,6 +1015,13 @@
+
+ If unsure, say N.
+
++Autodetect RAID partitions
++CONFIG_AUTODETECT_RAID
++  This feature lets the kernel detect RAID partitions at bootup.
++  An autodetect RAID partition is a normal partition with partition
++  type 0xfd. Use this if you want to boot from RAID devices, or want
++  RAID arrays to be started automatically at boot.
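++  For example, marking /dev/hda1 as type 0xfd (fdisk's "t" command)
++  makes the kernel assemble and start the array it belongs to at
++  boot time, with no need to run raidstart from user space.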
++
+ Linear (append) mode
+ CONFIG_MD_LINEAR
+ If you say Y here, then your multiple devices driver will be able to
+@@ -1093,6 +1100,21 @@
+ Documentation/modules.txt.
+
+ If unsure, say Y.
++
++Translucent Block Device Support (EXPERIMENTAL)
++CONFIG_MD_TRANSLUCENT
++ DO NOT USE THIS STUFF YET!
++
++  Currently there is only a placeholder here, as the implementation
++  is not yet usable.
++
++Hierarchical Storage Management support (EXPERIMENTAL)
++CONFIG_MD_HSM
++ DO NOT USE THIS STUFF YET!
++
++  I have released this so that people can comment on the architecture,
++  but the user-space tools are still unusable, so there is not much
++  you can do with it yet.
+
+ Boot support (linear, striped)
+ CONFIG_MD_BOOT
+diff -uNr linux-orig/arch/i386/defconfig linux/arch/i386/defconfig
+--- linux-orig/arch/i386/defconfig Fri Dec 8 22:53:50 2000
++++ linux/arch/i386/defconfig Fri Dec 8 22:54:52 2000
+@@ -93,7 +93,15 @@
+ #
+ # CONFIG_BLK_DEV_LOOP is not set
+ # CONFIG_BLK_DEV_NBD is not set
+-# CONFIG_BLK_DEV_MD is not set
++CONFIG_BLK_DEV_MD=y
++CONFIG_AUTODETECT_RAID=y
++CONFIG_MD_TRANSLUCENT=y
++CONFIG_MD_LINEAR=y
++CONFIG_MD_STRIPED=y
++CONFIG_MD_MIRRORING=y
++CONFIG_MD_RAID5=y
++CONFIG_MD_BOOT=y
++CONFIG_BLK_DEV_HSM=y
+ # CONFIG_BLK_DEV_RAM is not set
+ # CONFIG_BLK_DEV_XD is not set
+ # CONFIG_BLK_DEV_DAC960 is not set
+diff -uNr linux-orig/arch/sparc/config.in linux/arch/sparc/config.in
+--- linux-orig/arch/sparc/config.in Fri Dec 8 22:53:51 2000
++++ linux/arch/sparc/config.in Fri Dec 8 22:54:52 2000
+@@ -88,10 +88,16 @@
+
+ bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
+ if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
++ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
+ tristate ' Linear (append) mode' CONFIG_MD_LINEAR
+ tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
+ tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
+ tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
++ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
++ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
++fi
++if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
++ bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
+ fi
+
+ tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
+diff -uNr linux-orig/arch/sparc/defconfig linux/arch/sparc/defconfig
+--- linux-orig/arch/sparc/defconfig Fri Dec 8 22:53:51 2000
++++ linux/arch/sparc/defconfig Fri Dec 8 22:54:52 2000
+@@ -89,10 +89,13 @@
+ #
+ CONFIG_BLK_DEV_FD=y
+ CONFIG_BLK_DEV_MD=y
++# CONFIG_AUTODETECT_RAID is not set
+ CONFIG_MD_LINEAR=m
+ CONFIG_MD_STRIPED=m
+ CONFIG_MD_MIRRORING=m
+ CONFIG_MD_RAID5=m
++# CONFIG_MD_TRANSLUCENT is not set
++# CONFIG_MD_HSM is not set
+ CONFIG_BLK_DEV_RAM=y
+ CONFIG_BLK_DEV_RAM_SIZE=4096
+ CONFIG_BLK_DEV_INITRD=y
+diff -uNr linux-orig/arch/sparc64/config.in linux/arch/sparc64/config.in
+--- linux-orig/arch/sparc64/config.in Fri Dec 8 22:53:51 2000
++++ linux/arch/sparc64/config.in Fri Dec 8 22:54:52 2000
+@@ -102,10 +102,16 @@
+
+ bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
+ if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
++ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
+ tristate ' Linear (append) mode' CONFIG_MD_LINEAR
+ tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
+ tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
+ tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
++ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
++ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
++fi
++if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
++ bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
+ fi
+
+ tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
+diff -uNr linux-orig/arch/sparc64/defconfig linux/arch/sparc64/defconfig
+--- linux-orig/arch/sparc64/defconfig Fri Dec 8 22:53:51 2000
++++ linux/arch/sparc64/defconfig Fri Dec 8 22:54:52 2000
+@@ -106,10 +106,13 @@
+ #
+ CONFIG_BLK_DEV_FD=y
+ CONFIG_BLK_DEV_MD=y
++# CONFIG_AUTODETECT_RAID is not set
+ CONFIG_MD_LINEAR=m
+ CONFIG_MD_STRIPED=m
+ CONFIG_MD_MIRRORING=m
+ CONFIG_MD_RAID5=m
++# CONFIG_MD_TRANSLUCENT is not set
++# CONFIG_MD_HSM is not set
+ CONFIG_BLK_DEV_RAM=y
+ CONFIG_BLK_DEV_RAM_SIZE=4096
+ CONFIG_BLK_DEV_INITRD=y
+diff -uNr linux-orig/drivers/block/Config.in linux/drivers/block/Config.in
+--- linux-orig/drivers/block/Config.in Fri Dec 8 22:53:51 2000
++++ linux/drivers/block/Config.in Fri Dec 8 22:54:52 2000
+@@ -103,10 +103,13 @@
+ fi
+ bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
+ if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
++ bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
+ tristate ' Linear (append) mode' CONFIG_MD_LINEAR
+ tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
+ tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
+ tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
++ tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
++ tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
+ fi
+ if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
+ bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
+diff -uNr linux-orig/drivers/block/Makefile linux/drivers/block/Makefile
+--- linux-orig/drivers/block/Makefile Fri Dec 8 22:53:51 2000
++++ linux/drivers/block/Makefile Fri Dec 8 22:54:52 2000
+@@ -294,10 +294,28 @@
+ endif
+
+ ifeq ($(CONFIG_MD_RAID5),y)
++LX_OBJS += xor.o
+ L_OBJS += raid5.o
+ else
+ ifeq ($(CONFIG_MD_RAID5),m)
++ LX_OBJS += xor.o
+ M_OBJS += raid5.o
++ endif
++endif
++
++ifeq ($(CONFIG_MD_TRANSLUCENT),y)
++L_OBJS += translucent.o
++else
++ ifeq ($(CONFIG_MD_TRANSLUCENT),m)
++ M_OBJS += translucent.o
++ endif
++endif
++
++ifeq ($(CONFIG_MD_HSM),y)
++L_OBJS += hsm.o
++else
++ ifeq ($(CONFIG_MD_HSM),m)
++ M_OBJS += hsm.o
+ endif
+ endif
+
+diff -uNr linux-orig/drivers/block/genhd.c linux/drivers/block/genhd.c
+--- linux-orig/drivers/block/genhd.c Fri Dec 8 22:53:56 2000
++++ linux/drivers/block/genhd.c Fri Dec 8 22:54:52 2000
+@@ -28,6 +28,7 @@
+ #include <linux/string.h>
+ #include <linux/blk.h>
+ #include <linux/init.h>
++#include <linux/raid/md.h>
+
+ #ifdef CONFIG_ARCH_S390
+ #include <asm/dasd.h>
+@@ -1731,6 +1732,9 @@
+ else
+ #endif
+ rd_load();
++#endif
++#ifdef CONFIG_BLK_DEV_MD
++ autodetect_raid();
+ #endif
+ #ifdef CONFIG_MD_BOOT
+ md_setup_drive();
+diff -uNr linux-orig/drivers/block/hsm.c linux/drivers/block/hsm.c
+--- linux-orig/drivers/block/hsm.c Wed Dec 31 17:00:00 1969
++++ linux/drivers/block/hsm.c Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,840 @@
++/*
++ hsm.c : HSM RAID driver for Linux
++ Copyright (C) 1998 Ingo Molnar
++
++ HSM mode management functions.
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2, or (at your option)
++ any later version.
++
++ You should have received a copy of the GNU General Public License
++ (for example /usr/src/linux/COPYING); if not, write to the Free
++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++#include <linux/module.h>
++
++#include <linux/raid/md.h>
++#include <linux/malloc.h>
++
++#include <linux/raid/hsm.h>
++#include <linux/blk.h>
++
++#define MAJOR_NR MD_MAJOR
++#define MD_DRIVER
++#define MD_PERSONALITY
++
++
++#define DEBUG_HSM 1
++
++#if DEBUG_HSM
++#define dprintk(x,y...) printk(x,##y)
++#else
++#define dprintk(x,y...) do { } while (0)
++#endif
++
++void print_bh(struct buffer_head *bh)
++{
++ dprintk("bh %p: %lx %lx %x %x %lx %p %lx %p %x %p %x %lx\n", bh,
++ bh->b_blocknr, bh->b_size, bh->b_dev, bh->b_rdev,
++ bh->b_rsector, bh->b_this_page, bh->b_state,
++ bh->b_next_free, bh->b_count, bh->b_data,
++ bh->b_list, bh->b_flushtime
++ );
++}
++
++static int check_bg (pv_t *pv, pv_block_group_t * bg)
++{
++ int i, free = 0;
++
++ dprintk("checking bg ...\n");
++
++ for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
++ if (pv_pptr_free(bg->blocks + i)) {
++ free++;
++ if (test_bit(i, bg->used_bitmap)) {
++ printk("hm, bit %d set?\n", i);
++ }
++ } else {
++ if (!test_bit(i, bg->used_bitmap)) {
++ printk("hm, bit %d not set?\n", i);
++ }
++ }
++ }
++ dprintk("%d free blocks in bg ...\n", free);
++ return free;
++}
++
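++/*
++ * On-disk PV layout, as read below: block 0 holds the PV superblock,
++ * block 1 the VG superblock, and the block groups follow, each one
++ * pv_bg_size blocks long -- hence the "+ 2" in bg_pos.
++ */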
++static void get_bg (pv_t *pv, pv_bg_desc_t *desc, int nr)
++{
++ unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
++ struct buffer_head *bh;
++
++ dprintk("... getting BG at %u ...\n", bg_pos);
++
++ bh = bread (pv->dev, bg_pos, HSM_BLOCKSIZE);
++ if (!bh) {
++ MD_BUG();
++ return;
++ }
++ desc->bg = (pv_block_group_t *) bh->b_data;
++ desc->free_blocks = check_bg(pv, desc->bg);
++}
++
++static int find_free_block (lv_t *lv, pv_t *pv, pv_bg_desc_t *desc, int nr,
++ unsigned int lblock, lv_lptr_t * index)
++{
++ int i;
++
++ for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
++ pv_pptr_t * bptr = desc->bg->blocks + i;
++ if (pv_pptr_free(bptr)) {
++ unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
++
++ if (test_bit(i, desc->bg->used_bitmap)) {
++ MD_BUG();
++ continue;
++ }
++ bptr->u.used.owner.log_id = lv->log_id;
++ bptr->u.used.owner.log_index = lblock;
++ index->data.phys_nr = pv->phys_nr;
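++			/* the data blocks follow the descriptor
++			 * block of the group, hence the "+ 1": */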
++ index->data.phys_block = bg_pos + i + 1;
++ set_bit(i, desc->bg->used_bitmap);
++ desc->free_blocks--;
++ dprintk(".....free blocks left in bg %p: %d\n",
++ desc->bg, desc->free_blocks);
++ return 0;
++ }
++ }
++ return -ENOSPC;
++}
++
++static int __get_free_block (lv_t *lv, pv_t *pv,
++ unsigned int lblock, lv_lptr_t * index)
++{
++ int i;
++
++ dprintk("trying to get free block for lblock %d ...\n", lblock);
++
++ for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
++ pv_bg_desc_t *desc = pv->bg_array + i;
++
++ dprintk("looking at desc #%d (%p)...\n", i, desc->bg);
++ if (!desc->bg)
++ get_bg(pv, desc, i);
++
++ if (desc->bg && desc->free_blocks)
++ return find_free_block(lv, pv, desc, i,
++ lblock, index);
++ }
++ dprintk("hsm: pv %s full!\n", partition_name(pv->dev));
++ return -ENOSPC;
++}
++
++static int get_free_block (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
++{
++ int err;
++
++ if (!lv->free_indices)
++ return -ENOSPC;
++
++	/* fix me: this allocates from the first PV only */
++ err = __get_free_block(lv, lv->vg->pv_array + 0, lblock, index);
++
++ if (err || !index->data.phys_block) {
++ MD_BUG();
++ return -ENOSPC;
++ }
++
++ lv->free_indices--;
++
++ return 0;
++}
++
++/*
++ * fix me: wordsize assumptions ...
++ */
++#define INDEX_BITS 8
++#define INDEX_DEPTH (32/INDEX_BITS)
++#define INDEX_MASK ((1<<INDEX_BITS) - 1)
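++
++/*
++ * (A logical block number is thus a radix-256 path through the index
++ * tree: e.g. lblock 0xAABBCCDD descends via group slots 0xAA, 0xBB
++ * and 0xCC, and slot 0xDD in the last group points at the data block.)
++ */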
++
++static void print_index_list (lv_t *lv, lv_lptr_t *index)
++{
++ lv_lptr_t *tmp;
++ int i;
++
++ dprintk("... block <%u,%u,%x> [.", index->data.phys_nr,
++ index->data.phys_block, index->cpu_addr);
++
++ tmp = index_child(index);
++ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
++ if (index_block(lv, tmp))
++ dprintk("(%d->%d)", i, index_block(lv, tmp));
++ tmp++;
++ }
++ dprintk(".]\n");
++}
++
++static int read_index_group (lv_t *lv, lv_lptr_t *index)
++{
++ lv_lptr_t *index_group, *tmp;
++ struct buffer_head *bh;
++ int i;
++
++ dprintk("reading index group <%s:%d>\n",
++ partition_name(index_dev(lv, index)), index_block(lv, index));
++
++ bh = bread(index_dev(lv, index), index_block(lv, index), HSM_BLOCKSIZE);
++ if (!bh) {
++ MD_BUG();
++ return -EIO;
++ }
++ if (!buffer_uptodate(bh))
++ MD_BUG();
++
++ index_group = (lv_lptr_t *) bh->b_data;
++ tmp = index_group;
++ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
++ if (index_block(lv, tmp)) {
++ dprintk("index group has BLOCK %d, non-present.\n", i);
++ tmp->cpu_addr = 0;
++ }
++ tmp++;
++ }
++ index->cpu_addr = ptr_to_cpuaddr(index_group);
++
++ dprintk("have read index group %p at block %d.\n",
++ index_group, index_block(lv, index));
++ print_index_list(lv, index);
++
++ return 0;
++}
++
++static int alloc_index_group (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
++{
++ struct buffer_head *bh;
++ lv_lptr_t * index_group;
++
++ if (get_free_block(lv, lblock, index))
++ return -ENOSPC;
++
++ dprintk("creating block for index group <%s:%d>\n",
++ partition_name(index_dev(lv, index)), index_block(lv, index));
++
++ bh = getblk(index_dev(lv, index),
++ index_block(lv, index), HSM_BLOCKSIZE);
++
++ index_group = (lv_lptr_t *) bh->b_data;
++ md_clear_page(index_group);
++ mark_buffer_uptodate(bh, 1);
++
++ index->cpu_addr = ptr_to_cpuaddr(index_group);
++
++ dprintk("allocated index group %p at block %d.\n",
++ index_group, index_block(lv, index));
++ return 0;
++}
++
++static lv_lptr_t * alloc_fixed_index (lv_t *lv, unsigned int lblock)
++{
++ lv_lptr_t * index = index_child(&lv->root_index);
++ int idx, l;
++
++ for (l = INDEX_DEPTH-1; l >= 0; l--) {
++ idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
++ index += idx;
++ if (!l)
++ break;
++ if (!index_present(index)) {
++ dprintk("no group, level %u, pos %u\n", l, idx);
++ if (alloc_index_group(lv, lblock, index))
++ return NULL;
++ }
++ index = index_child(index);
++ }
++ if (!index_block(lv,index)) {
++ dprintk("no data, pos %u\n", idx);
++ if (get_free_block(lv, lblock, index))
++ return NULL;
++ return index;
++ }
++ MD_BUG();
++ return index;
++}
++
++static lv_lptr_t * find_index (lv_t *lv, unsigned int lblock)
++{
++ lv_lptr_t * index = index_child(&lv->root_index);
++ int idx, l;
++
++ for (l = INDEX_DEPTH-1; l >= 0; l--) {
++ idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
++ index += idx;
++ if (!l)
++ break;
++ if (index_free(index))
++ return NULL;
++ if (!index_present(index))
++ read_index_group(lv, index);
++ if (!index_present(index)) {
++ MD_BUG();
++ return NULL;
++ }
++ index = index_child(index);
++ }
++ if (!index_block(lv,index))
++ return NULL;
++ return index;
++}
++
++static int read_root_index(lv_t *lv)
++{
++ int err;
++ lv_lptr_t *index = &lv->root_index;
++
++ if (!index_block(lv, index)) {
++ printk("LV has no root index yet, creating.\n");
++
++ err = alloc_index_group (lv, 0, index);
++ if (err) {
++ printk("could not create index group, err:%d\n", err);
++ return err;
++ }
++ lv->vg->vg_sb->lv_array[lv->log_id].lv_root_idx =
++ lv->root_index.data;
++ } else {
++ printk("LV already has a root index.\n");
++ printk("... at <%s:%d>.\n",
++ partition_name(index_dev(lv, index)),
++ index_block(lv, index));
++
++ read_index_group(lv, index);
++ }
++ return 0;
++}
++
++static int init_pv(pv_t *pv)
++{
++ struct buffer_head *bh;
++ pv_sb_t *pv_sb;
++
++ bh = bread (pv->dev, 0, HSM_BLOCKSIZE);
++ if (!bh) {
++ MD_BUG();
++ return -1;
++ }
++
++ pv_sb = (pv_sb_t *) bh->b_data;
++ pv->pv_sb = pv_sb;
++
++ if (pv_sb->pv_magic != HSM_PV_SB_MAGIC) {
++ printk("%s is not a PV, has magic %x instead of %x!\n",
++ partition_name(pv->dev), pv_sb->pv_magic,
++ HSM_PV_SB_MAGIC);
++ return -1;
++ }
++ printk("%s detected as a valid PV (#%d).\n", partition_name(pv->dev),
++ pv->phys_nr);
++ printk("... created under HSM version %d.%d.%d, at %x.\n",
++ pv_sb->pv_major, pv_sb->pv_minor, pv_sb->pv_patch, pv_sb->pv_ctime);
++ printk("... total # of blocks: %d (%d left unallocated).\n",
++ pv_sb->pv_total_size, pv_sb->pv_blocks_left);
++
++ printk("... block size: %d bytes.\n", pv_sb->pv_block_size);
++ printk("... block descriptor size: %d bytes.\n", pv_sb->pv_pptr_size);
++ printk("... block group size: %d blocks.\n", pv_sb->pv_bg_size);
++ printk("... # of block groups: %d.\n", pv_sb->pv_block_groups);
++
++ if (pv_sb->pv_block_groups*sizeof(pv_bg_desc_t) > PAGE_SIZE) {
++ MD_BUG();
++ return 1;
++ }
++ pv->bg_array = (pv_bg_desc_t *)__get_free_page(GFP_KERNEL);
++ if (!pv->bg_array) {
++ MD_BUG();
++ return 1;
++ }
++ memset(pv->bg_array, 0, PAGE_SIZE);
++
++ return 0;
++}
++
++static int free_pv(pv_t *pv)
++{
++ struct buffer_head *bh;
++
++ dprintk("freeing PV %d ...\n", pv->phys_nr);
++
++ if (pv->bg_array) {
++ int i;
++
++ dprintk(".... freeing BGs ...\n");
++ for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
++ unsigned int bg_pos = i * pv->pv_sb->pv_bg_size + 2;
++ pv_bg_desc_t *desc = pv->bg_array + i;
++
++ if (desc->bg) {
++ dprintk(".... freeing BG %d ...\n", i);
++ bh = getblk (pv->dev, bg_pos, HSM_BLOCKSIZE);
++ mark_buffer_dirty(bh, 1);
++ brelse(bh);
++ brelse(bh);
++ }
++ }
++ free_page((unsigned long)pv->bg_array);
++ } else
++ MD_BUG();
++
++ bh = getblk (pv->dev, 0, HSM_BLOCKSIZE);
++ if (!bh) {
++ MD_BUG();
++ return -1;
++ }
++ mark_buffer_dirty(bh, 1);
++ brelse(bh);
++ brelse(bh);
++
++ return 0;
++}
++
++struct semaphore hsm_sem = MUTEX;
++
++#define HSM_SECTORS (HSM_BLOCKSIZE/512)
++
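++/*
++ * hsm_map(): turn a logical sector on an LV into a (physical device,
++ * physical sector) pair. A logical block that is touched for the
++ * first time gets a physical block allocated on the fly.
++ */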
++static int hsm_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
++ unsigned long *rsector, unsigned long bsectors)
++{
++ lv_t *lv = kdev_to_lv(dev);
++ lv_lptr_t *index;
++ unsigned int lblock = *rsector / HSM_SECTORS;
++ unsigned int offset = *rsector % HSM_SECTORS;
++ int err = -EIO;
++
++ if (!lv) {
++ printk("HSM: md%d not a Logical Volume!\n", mdidx(mddev));
++ goto out;
++ }
++ if (offset + bsectors > HSM_SECTORS) {
++ MD_BUG();
++ goto out;
++ }
++ down(&hsm_sem);
++ index = find_index(lv, lblock);
++ if (!index) {
++ printk("no block %u yet ... allocating\n", lblock);
++ index = alloc_fixed_index(lv, lblock);
++ }
++
++ err = 0;
++
++ printk(" %u <%s : %ld(%ld)> -> ", lblock,
++ partition_name(*rdev), *rsector, bsectors);
++
++ *rdev = index_dev(lv, index);
++ *rsector = index_block(lv, index) * HSM_SECTORS + offset;
++
++ printk(" <%s : %ld> %u\n",
++ partition_name(*rdev), *rsector, index_block(lv, index));
++
++ up(&hsm_sem);
++out:
++ return err;
++}
++
++static void free_index (lv_t *lv, lv_lptr_t * index)
++{
++ struct buffer_head *bh;
++
++	printk("trying to get the cached block for index group <%s:%d>\n",
++ partition_name(index_dev(lv, index)), index_block(lv, index));
++
++ bh = getblk(index_dev(lv, index), index_block(lv, index),HSM_BLOCKSIZE);
++
++ printk("....FREEING ");
++ print_index_list(lv, index);
++
++ if (bh) {
++ if (!buffer_uptodate(bh))
++ MD_BUG();
++ if ((lv_lptr_t *)bh->b_data != index_child(index)) {
++ printk("huh? b_data is %p, index content is %p.\n",
++ bh->b_data, index_child(index));
++ } else
++ printk("good, b_data == index content == %p.\n",
++ index_child(index));
++ printk("b_count == %d, writing.\n", bh->b_count);
++ mark_buffer_dirty(bh, 1);
++ brelse(bh);
++ brelse(bh);
++ printk("done.\n");
++ } else {
++ printk("FAILED!\n");
++ }
++ print_index_list(lv, index);
++ index_child(index) = NULL;
++}
++
++static void free_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
++{
++ char dots [3*8];
++ lv_lptr_t * index;
++ int i, nr_dots;
++
++ nr_dots = (INDEX_DEPTH-level)*3;
++ memcpy(dots,"...............",nr_dots);
++ dots[nr_dots] = 0;
++
++ dprintk("%s level %d index group block:\n", dots, level);
++
++
++ index = index_0;
++ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
++ if (index->data.phys_block) {
++ dprintk("%s block <%u,%u,%x>\n", dots,
++ index->data.phys_nr,
++ index->data.phys_block,
++ index->cpu_addr);
++ if (level && index_present(index)) {
++ dprintk("%s==> deeper one level\n", dots);
++ free_index_group(lv, level-1,
++ index_child(index));
++ dprintk("%s freeing index group block %p ...",
++ dots, index_child(index));
++ free_index(lv, index);
++ }
++ }
++ index++;
++ }
++ dprintk("%s DONE: level %d index group block.\n", dots, level);
++}
++
++static void free_lv_indextree (lv_t *lv)
++{
++ dprintk("freeing LV %d ...\n", lv->log_id);
++ dprintk("..root index: %p\n", index_child(&lv->root_index));
++ dprintk("..INDEX TREE:\n");
++ free_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
++ dprintk("..freeing root index %p ...", index_child(&lv->root_index));
++ dprintk("root block <%u,%u,%x>\n", lv->root_index.data.phys_nr,
++ lv->root_index.data.phys_block, lv->root_index.cpu_addr);
++ free_index(lv, &lv->root_index);
++ dprintk("..INDEX TREE done.\n");
++ fsync_dev(lv->vg->pv_array[0].dev); /* fix me */
++ lv->vg->vg_sb->lv_array[lv->log_id].lv_free_indices = lv->free_indices;
++}
++
++static void print_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
++{
++ char dots [3*5];
++ lv_lptr_t * index;
++ int i, nr_dots;
++
++ nr_dots = (INDEX_DEPTH-level)*3;
++ memcpy(dots,"...............",nr_dots);
++ dots[nr_dots] = 0;
++
++ dprintk("%s level %d index group block:\n", dots, level);
++
++
++ for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
++ index = index_0 + i;
++ if (index->data.phys_block) {
++ dprintk("%s block <%u,%u,%x>\n", dots,
++ index->data.phys_nr,
++ index->data.phys_block,
++ index->cpu_addr);
++ if (level && index_present(index)) {
++ dprintk("%s==> deeper one level\n", dots);
++ print_index_group(lv, level-1,
++ index_child(index));
++ }
++ }
++ }
++ dprintk("%s DONE: level %d index group block.\n", dots, level);
++}
++
++static void print_lv (lv_t *lv)
++{
++ dprintk("printing LV %d ...\n", lv->log_id);
++ dprintk("..root index: %p\n", index_child(&lv->root_index));
++ dprintk("..INDEX TREE:\n");
++ print_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
++ dprintk("..INDEX TREE done.\n");
++}
++
++static int map_lv (lv_t *lv)
++{
++ kdev_t dev = lv->dev;
++ unsigned int nr = MINOR(dev);
++ mddev_t *mddev = lv->vg->mddev;
++
++ if (MAJOR(dev) != MD_MAJOR) {
++ MD_BUG();
++ return -1;
++ }
++ if (kdev_to_mddev(dev)) {
++ MD_BUG();
++ return -1;
++ }
++ md_hd_struct[nr].start_sect = 0;
++ md_hd_struct[nr].nr_sects = md_size[mdidx(mddev)] << 1;
++ md_size[nr] = md_size[mdidx(mddev)];
++ add_mddev_mapping(mddev, dev, lv);
++
++ return 0;
++}
++
++static int unmap_lv (lv_t *lv)
++{
++ kdev_t dev = lv->dev;
++ unsigned int nr = MINOR(dev);
++
++ if (MAJOR(dev) != MD_MAJOR) {
++ MD_BUG();
++ return -1;
++ }
++ md_hd_struct[nr].start_sect = 0;
++ md_hd_struct[nr].nr_sects = 0;
++ md_size[nr] = 0;
++ del_mddev_mapping(lv->vg->mddev, dev);
++
++ return 0;
++}
++
++static int init_vg (vg_t *vg)
++{
++ int i;
++ lv_t *lv;
++ kdev_t dev;
++ vg_sb_t *vg_sb;
++ struct buffer_head *bh;
++ lv_descriptor_t *lv_desc;
++
++ /*
++ * fix me: read all PVs and compare the SB
++ */
++ dev = vg->pv_array[0].dev;
++ bh = bread (dev, 1, HSM_BLOCKSIZE);
++ if (!bh) {
++ MD_BUG();
++ return -1;
++ }
++
++ vg_sb = (vg_sb_t *) bh->b_data;
++ vg->vg_sb = vg_sb;
++
++ if (vg_sb->vg_magic != HSM_VG_SB_MAGIC) {
++ printk("%s is not a valid VG, has magic %x instead of %x!\n",
++ partition_name(dev), vg_sb->vg_magic,
++ HSM_VG_SB_MAGIC);
++ return -1;
++ }
++
++ vg->nr_lv = 0;
++ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
++ unsigned int id;
++ lv_desc = vg->vg_sb->lv_array + i;
++
++ id = lv_desc->lv_id;
++ if (!id) {
++ printk("... LV desc %d empty\n", i);
++ continue;
++ }
++ if (id >= HSM_MAX_LVS_PER_VG) {
++ MD_BUG();
++ continue;
++ }
++
++ lv = vg->lv_array + id;
++ if (lv->vg) {
++ MD_BUG();
++ continue;
++ }
++ lv->log_id = id;
++ lv->vg = vg;
++ lv->max_indices = lv_desc->lv_max_indices;
++ lv->free_indices = lv_desc->lv_free_indices;
++ lv->root_index.data = lv_desc->lv_root_idx;
++ lv->dev = MKDEV(MD_MAJOR, lv_desc->md_id);
++
++ vg->nr_lv++;
++
++ map_lv(lv);
++ if (read_root_index(lv)) {
++ vg->nr_lv--;
++ unmap_lv(lv);
++ memset(lv, 0, sizeof(*lv));
++ }
++ }
++ if (vg->nr_lv != vg_sb->nr_lvs)
++ MD_BUG();
++
++ return 0;
++}
++
++static int hsm_run (mddev_t *mddev)
++{
++ int i;
++ vg_t *vg;
++ mdk_rdev_t *rdev;
++
++ MOD_INC_USE_COUNT;
++
++ vg = kmalloc (sizeof (*vg), GFP_KERNEL);
++ if (!vg)
++ goto out;
++ memset(vg, 0, sizeof(*vg));
++ mddev->private = vg;
++ vg->mddev = mddev;
++
++ if (md_check_ordering(mddev)) {
++ printk("hsm: disks are not ordered, aborting!\n");
++ goto out;
++ }
++
++ set_blocksize (mddev_to_kdev(mddev), HSM_BLOCKSIZE);
++
++ vg->nr_pv = mddev->nb_dev;
++ ITERATE_RDEV_ORDERED(mddev,rdev,i) {
++ pv_t *pv = vg->pv_array + i;
++
++ pv->dev = rdev->dev;
++ fsync_dev (pv->dev);
++ set_blocksize (pv->dev, HSM_BLOCKSIZE);
++ pv->phys_nr = i;
++ if (init_pv(pv))
++ goto out;
++ }
++
++ init_vg(vg);
++
++ return 0;
++
++out:
++ if (vg) {
++ kfree(vg);
++ mddev->private = NULL;
++ }
++ MOD_DEC_USE_COUNT;
++
++ return 1;
++}
++
++static int hsm_stop (mddev_t *mddev)
++{
++ lv_t *lv;
++ vg_t *vg;
++ int i;
++
++ vg = mddev_to_vg(mddev);
++
++ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
++ lv = vg->lv_array + i;
++ if (!lv->log_id)
++ continue;
++ print_lv(lv);
++ free_lv_indextree(lv);
++ unmap_lv(lv);
++ }
++ for (i = 0; i < vg->nr_pv; i++)
++ free_pv(vg->pv_array + i);
++
++ kfree(vg);
++
++ MOD_DEC_USE_COUNT;
++
++ return 0;
++}
++
++
++static int hsm_status (char *page, mddev_t *mddev)
++{
++ int sz = 0, i;
++ lv_t *lv;
++ vg_t *vg;
++
++ vg = mddev_to_vg(mddev);
++
++ for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
++ lv = vg->lv_array + i;
++ if (!lv->log_id)
++ continue;
++ sz += sprintf(page+sz, "<LV%d %d/%d blocks used> ", lv->log_id,
++ lv->max_indices - lv->free_indices, lv->max_indices);
++ }
++ return sz;
++}
++
++
++static mdk_personality_t hsm_personality=
++{
++ "hsm",
++ hsm_map,
++ NULL,
++ NULL,
++ hsm_run,
++ hsm_stop,
++ hsm_status,
++ NULL,
++ 0,
++ NULL,
++ NULL,
++ NULL,
++ NULL
++};
++
++#ifndef MODULE
++
++md__initfunc(void hsm_init (void))
++{
++ register_md_personality (HSM, &hsm_personality);
++}
++
++#else
++
++int init_module (void)
++{
++ return (register_md_personality (HSM, &hsm_personality));
++}
++
++void cleanup_module (void)
++{
++ unregister_md_personality (HSM);
++}
++
++#endif
++
++/*
++ * This Linus-trick catches bugs via the linker: each size check below
++ * is a compile-time constant, so if the assumption holds the call is
++ * optimized away, while a violated assumption leaves a call to an
++ * undefined function and the final link fails.
++ */
++
++extern void __BUG__in__hsm_dot_c_1(void);
++extern void __BUG__in__hsm_dot_c_2(void);
++extern void __BUG__in__hsm_dot_c_3(void);
++extern void __BUG__in__hsm_dot_c_4(void);
++extern void __BUG__in__hsm_dot_c_5(void);
++extern void __BUG__in__hsm_dot_c_6(void);
++extern void __BUG__in__hsm_dot_c_7(void);
++
++void bugcatcher (void)
++{
++ if (sizeof(pv_block_group_t) != HSM_BLOCKSIZE)
++ __BUG__in__hsm_dot_c_1();
++ if (sizeof(lv_index_block_t) != HSM_BLOCKSIZE)
++ __BUG__in__hsm_dot_c_2();
++
++ if (sizeof(pv_sb_t) != HSM_BLOCKSIZE)
++ __BUG__in__hsm_dot_c_4();
++ if (sizeof(lv_sb_t) != HSM_BLOCKSIZE)
++ __BUG__in__hsm_dot_c_3();
++ if (sizeof(vg_sb_t) != HSM_BLOCKSIZE)
++ __BUG__in__hsm_dot_c_6();
++
++ if (sizeof(lv_lptr_t) != 16)
++ __BUG__in__hsm_dot_c_5();
++ if (sizeof(pv_pptr_t) != 16)
++		__BUG__in__hsm_dot_c_7();
++}
++
+diff -uNr linux-orig/drivers/block/linear.c linux/drivers/block/linear.c
+--- linux-orig/drivers/block/linear.c Sat Nov 8 12:39:12 1997
++++ linux/drivers/block/linear.c Fri Dec 8 22:54:52 2000
+@@ -1,4 +1,3 @@
+-
+ /*
+ linear.c : Multiple Devices driver for Linux
+ Copyright (C) 1994-96 Marc ZYNGIER
+@@ -19,186 +18,207 @@
+
+ #include <linux/module.h>
+
+-#include <linux/md.h>
++#include <linux/raid/md.h>
+ #include <linux/malloc.h>
+-#include <linux/init.h>
+
+-#include "linear.h"
++#include <linux/raid/linear.h>
+
+ #define MAJOR_NR MD_MAJOR
+ #define MD_DRIVER
+ #define MD_PERSONALITY
+
+-static int linear_run (int minor, struct md_dev *mddev)
++static int linear_run (mddev_t *mddev)
+ {
+- int cur=0, i, size, dev0_size, nb_zone;
+- struct linear_data *data;
+-
+- MOD_INC_USE_COUNT;
+-
+- mddev->private=kmalloc (sizeof (struct linear_data), GFP_KERNEL);
+- data=(struct linear_data *) mddev->private;
+-
+- /*
+- Find out the smallest device. This was previously done
+- at registry time, but since it violates modularity,
+- I moved it here... Any comment ? ;-)
+- */
+-
+- data->smallest=mddev->devices;
+- for (i=1; i<mddev->nb_dev; i++)
+- if (data->smallest->size > mddev->devices[i].size)
+- data->smallest=mddev->devices+i;
+-
+- nb_zone=data->nr_zones=
+- md_size[minor]/data->smallest->size +
+- (md_size[minor]%data->smallest->size ? 1 : 0);
+-
+- data->hash_table=kmalloc (sizeof (struct linear_hash)*nb_zone, GFP_KERNEL);
+-
+- size=mddev->devices[cur].size;
++ linear_conf_t *conf;
++ struct linear_hash *table;
++ mdk_rdev_t *rdev;
++ int size, i, j, nb_zone;
++ unsigned int curr_offset;
++
++ MOD_INC_USE_COUNT;
++
++ conf = kmalloc (sizeof (*conf), GFP_KERNEL);
++ if (!conf)
++ goto out;
++ mddev->private = conf;
++
++ if (md_check_ordering(mddev)) {
++ printk("linear: disks are not ordered, aborting!\n");
++ goto out;
++ }
++ /*
++ * Find the smallest device.
++ */
++
++ conf->smallest = NULL;
++ curr_offset = 0;
++ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
++ dev_info_t *disk = conf->disks + j;
++
++ disk->dev = rdev->dev;
++ disk->size = rdev->size;
++ disk->offset = curr_offset;
++
++ curr_offset += disk->size;
++
++ if (!conf->smallest || (disk->size < conf->smallest->size))
++ conf->smallest = disk;
++ }
++
++ nb_zone = conf->nr_zones =
++ md_size[mdidx(mddev)] / conf->smallest->size +
++ ((md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
++
++ conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
++ GFP_KERNEL);
++ if (!conf->hash_table)
++ goto out;
++
++ /*
++ * Here we generate the linear hash table
++ */
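++	/*
++	 * (Each slot spans conf->smallest->size blocks; since no member
++	 * device is smaller than that, a slot crosses at most one device
++	 * boundary: dev0 starts the slot, dev1 -- if set -- finishes it.)
++	 */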
++ table = conf->hash_table;
++ i = 0;
++ size = 0;
++ for (j = 0; j < mddev->nb_dev; j++) {
++ dev_info_t *disk = conf->disks + j;
++
++ if (size < 0) {
++ table->dev1 = disk;
++ table++;
++ }
++ size += disk->size;
++
++ while (size) {
++ table->dev0 = disk;
++ size -= conf->smallest->size;
++ if (size < 0)
++ break;
++ table->dev1 = NULL;
++ table++;
++ }
++ }
++ table->dev1 = NULL;
++
++ return 0;
++
++out:
++ if (conf)
++ kfree(conf);
++ MOD_DEC_USE_COUNT;
++ return 1;
++}
++
++static int linear_stop (mddev_t *mddev)
++{
++ linear_conf_t *conf = mddev_to_conf(mddev);
++
++ kfree(conf->hash_table);
++ kfree(conf);
+
+- i=0;
+- while (cur<mddev->nb_dev)
+- {
+- data->hash_table[i].dev0=mddev->devices+cur;
++ MOD_DEC_USE_COUNT;
+
+- if (size>=data->smallest->size) /* If we completely fill the slot */
+- {
+- data->hash_table[i++].dev1=NULL;
+- size-=data->smallest->size;
+-
+- if (!size)
+- {
+- if (++cur==mddev->nb_dev) continue;
+- size=mddev->devices[cur].size;
+- }
+-
+- continue;
+- }
+-
+- if (++cur==mddev->nb_dev) /* Last dev, set dev1 as NULL */
+- {
+- data->hash_table[i].dev1=NULL;
+- continue;
+- }
+-
+- dev0_size=size; /* Here, we use a 2nd dev to fill the slot */
+- size=mddev->devices[cur].size;
+- data->hash_table[i++].dev1=mddev->devices+cur;
+- size-=(data->smallest->size - dev0_size);
+- }
+-
+- return 0;
+-}
+-
+-static int linear_stop (int minor, struct md_dev *mddev)
+-{
+- struct linear_data *data=(struct linear_data *) mddev->private;
+-
+- kfree (data->hash_table);
+- kfree (data);
+-
+- MOD_DEC_USE_COUNT;
+-
+- return 0;
++ return 0;
+ }
+
+
+-static int linear_map (struct md_dev *mddev, kdev_t *rdev,
++static int linear_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
+ unsigned long *rsector, unsigned long size)
+ {
+- struct linear_data *data=(struct linear_data *) mddev->private;
+- struct linear_hash *hash;
+- struct real_dev *tmp_dev;
+- long block;
+-
+- block=*rsector >> 1;
+- hash=data->hash_table+(block/data->smallest->size);
+-
+- if (block >= (hash->dev0->size + hash->dev0->offset))
+- {
+- if (!hash->dev1)
+- {
+- printk ("linear_map : hash->dev1==NULL for block %ld\n", block);
+- return (-1);
+- }
+-
+- tmp_dev=hash->dev1;
+- }
+- else
+- tmp_dev=hash->dev0;
++ linear_conf_t *conf = mddev_to_conf(mddev);
++ struct linear_hash *hash;
++ dev_info_t *tmp_dev;
++ long block;
++
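++	/* find the 1K block and the hash slot (one per
++	 * conf->smallest->size blocks) it falls into: */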
++ block = *rsector >> 1;
++ hash = conf->hash_table + (block / conf->smallest->size);
++
++ if (block >= (hash->dev0->size + hash->dev0->offset))
++ {
++ if (!hash->dev1)
++ {
++ printk ("linear_map : hash->dev1==NULL for block %ld\n",
++ block);
++ return -1;
++ }
++ tmp_dev = hash->dev1;
++ } else
++ tmp_dev = hash->dev0;
+
+- if (block >= (tmp_dev->size + tmp_dev->offset) || block < tmp_dev->offset)
+- printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
+- block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
++ if (block >= (tmp_dev->size + tmp_dev->offset)
++ || block < tmp_dev->offset)
++ printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
++ block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
+
+- *rdev=tmp_dev->dev;
+- *rsector=(block-(tmp_dev->offset)) << 1;
++ *rdev = tmp_dev->dev;
++ *rsector = (block - tmp_dev->offset) << 1;
+
+- return (0);
++ return 0;
+ }
+
+-static int linear_status (char *page, int minor, struct md_dev *mddev)
++static int linear_status (char *page, mddev_t *mddev)
+ {
+- int sz=0;
++ int sz=0;
+
+ #undef MD_DEBUG
+ #ifdef MD_DEBUG
+- int j;
+- struct linear_data *data=(struct linear_data *) mddev->private;
++ int j;
++ linear_conf_t *conf = mddev_to_conf(mddev);
+
+- sz+=sprintf (page+sz, " ");
+- for (j=0; j<data->nr_zones; j++)
+- {
+- sz+=sprintf (page+sz, "[%s",
+- partition_name (data->hash_table[j].dev0->dev));
+-
+- if (data->hash_table[j].dev1)
+- sz+=sprintf (page+sz, "/%s] ",
+- partition_name(data->hash_table[j].dev1->dev));
+- else
+- sz+=sprintf (page+sz, "] ");
+- }
+-
+- sz+=sprintf (page+sz, "\n");
++ sz += sprintf(page+sz, " ");
++ for (j = 0; j < conf->nr_zones; j++)
++ {
++ sz += sprintf(page+sz, "[%s",
++ partition_name(conf->hash_table[j].dev0->dev));
++
++ if (conf->hash_table[j].dev1)
++ sz += sprintf(page+sz, "/%s] ",
++ partition_name(conf->hash_table[j].dev1->dev));
++ else
++ sz += sprintf(page+sz, "] ");
++ }
++ sz += sprintf(page+sz, "\n");
+ #endif
+- sz+=sprintf (page+sz, " %dk rounding", 1<<FACTOR_SHIFT(FACTOR(mddev)));
+- return sz;
++ sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
++ return sz;
+ }
+
+
+-static struct md_personality linear_personality=
++static mdk_personality_t linear_personality=
+ {
+- "linear",
+- linear_map,
+- NULL,
+- NULL,
+- linear_run,
+- linear_stop,
+- linear_status,
+- NULL, /* no ioctls */
+- 0
++ "linear",
++ linear_map,
++ NULL,
++ NULL,
++ linear_run,
++ linear_stop,
++ linear_status,
++ NULL,
++ 0,
++ NULL,
++ NULL,
++ NULL,
++ NULL
+ };
+
+-
+ #ifndef MODULE
+
+-__initfunc(void linear_init (void))
++md__initfunc(void linear_init (void))
+ {
+- register_md_personality (LINEAR, &linear_personality);
++ register_md_personality (LINEAR, &linear_personality);
+ }
+
+ #else
+
+ int init_module (void)
+ {
+- return (register_md_personality (LINEAR, &linear_personality));
++ return (register_md_personality (LINEAR, &linear_personality));
+ }
+
+ void cleanup_module (void)
+ {
+- unregister_md_personality (LINEAR);
++ unregister_md_personality (LINEAR);
+ }
+
+ #endif
++
+diff -uNr linux-orig/drivers/block/linear.h linux/drivers/block/linear.h
+--- linux-orig/drivers/block/linear.h Fri Nov 22 07:07:23 1996
++++ linux/drivers/block/linear.h Fri Dec 8 22:54:52 2000
+@@ -1,16 +0,0 @@
+-#ifndef _LINEAR_H
+-#define _LINEAR_H
+-
+-struct linear_hash
+-{
+- struct real_dev *dev0, *dev1;
+-};
+-
+-struct linear_data
+-{
+- struct linear_hash *hash_table; /* Dynamically allocated */
+- struct real_dev *smallest;
+- int nr_zones;
+-};
+-
+-#endif
+diff -uNr linux-orig/drivers/block/ll_rw_blk.c linux/drivers/block/ll_rw_blk.c
+--- linux-orig/drivers/block/ll_rw_blk.c Fri Dec 8 22:53:51 2000
++++ linux/drivers/block/ll_rw_blk.c Fri Dec 8 22:54:52 2000
+@@ -23,6 +23,7 @@
+ #include <asm/io.h>
+ #include <asm/uaccess.h>
+ #include <linux/blk.h>
++#include <linux/raid/md.h>
+
+ #include <linux/module.h>
+
+@@ -53,6 +54,11 @@
+ spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
+
+ /*
++ * per-major idle-IO detection: md samples these counters during
++ * resync to check whether the IO subsystem is otherwise idle
++ */
++unsigned long io_events[MAX_BLKDEV] = {0, };
++
++/*
+ * used to wait on when there are no free requests
+ */
+ struct wait_queue * wait_for_request;
+@@ -583,6 +589,8 @@
+ return;
+ /* Maybe the above fixes it, and maybe it doesn't boot. Life is interesting */
+ lock_buffer(bh);
++ if (!buffer_lowprio(bh))
++ io_events[major]++;
+
+ if (blk_size[major]) {
+ unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
+@@ -832,7 +840,7 @@
+ bh[i]->b_rsector=bh[i]->b_blocknr*(bh[i]->b_size >> 9);
+ #ifdef CONFIG_BLK_DEV_MD
+ if (major==MD_MAJOR &&
+- md_map (MINOR(bh[i]->b_dev), &bh[i]->b_rdev,
++ md_map (bh[i]->b_dev, &bh[i]->b_rdev,
+ &bh[i]->b_rsector, bh[i]->b_size >> 9)) {
+ printk (KERN_ERR
+ "Bad md_map in ll_rw_block\n");
+@@ -852,7 +860,7 @@
+ set_bit(BH_Req, &bh[i]->b_state);
+ #ifdef CONFIG_BLK_DEV_MD
+ if (MAJOR(bh[i]->b_dev) == MD_MAJOR) {
+- md_make_request(MINOR (bh[i]->b_dev), rw, bh[i]);
++ md_make_request(bh[i], rw);
+ continue;
+ }
+ #endif
+diff -uNr linux-orig/drivers/block/md.c linux/drivers/block/md.c
+--- linux-orig/drivers/block/md.c Mon Sep 4 10:39:16 2000
++++ linux/drivers/block/md.c Fri Dec 8 22:54:52 2000
+@@ -1,21 +1,17 @@
+-
+ /*
+ md.c : Multiple Devices driver for Linux
+- Copyright (C) 1994-96 Marc ZYNGIER
+- <zyngier@ufr-info-p7.ibp.fr> or
+- <maz@gloups.fdn.fr>
++ Copyright (C) 1998, 1999 Ingo Molnar
+
+- A lot of inspiration came from hd.c ...
++ completely rewritten, based on the MD driver code from Marc Zyngier
+
+- kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+- boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
++ Changes:
+
+- RAID-1/RAID-5 extensions by:
+- Ingo Molnar, Miguel de Icaza, Gadi Oxman
++ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
++ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
++ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
++ - kmod support by: Cyrus Durgin
++ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+
+- Changes for kmod by:
+- Cyrus Durgin
+-
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+@@ -26,807 +22,3007 @@
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+-/*
+- * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
+- * the extra system load does not show up that much. Increase it if your
+- * system can take more.
+- */
+-#define SPEED_LIMIT 1024
++#include <linux/raid/md.h>
++#include <linux/raid/xor.h>
+
+-#include <linux/config.h>
+-#include <linux/module.h>
+-#include <linux/version.h>
+-#include <linux/malloc.h>
+-#include <linux/mm.h>
+-#include <linux/md.h>
+-#include <linux/hdreg.h>
+-#include <linux/stat.h>
+-#include <linux/fs.h>
+-#include <linux/proc_fs.h>
+-#include <linux/blkdev.h>
+-#include <linux/genhd.h>
+-#include <linux/smp_lock.h>
+ #ifdef CONFIG_KMOD
+ #include <linux/kmod.h>
+ #endif
+-#include <linux/errno.h>
+-#include <linux/init.h>
+
+ #define __KERNEL_SYSCALLS__
+ #include <linux/unistd.h>
+
++#include <asm/unaligned.h>
++
++extern asmlinkage int sys_sched_yield(void);
++extern asmlinkage int sys_setsid(void);
++
++extern unsigned long io_events[MAX_BLKDEV];
++
+ #define MAJOR_NR MD_MAJOR
+ #define MD_DRIVER
+
+ #include <linux/blk.h>
+-#include <asm/uaccess.h>
+-#include <asm/bitops.h>
+-#include <asm/atomic.h>
+
+ #ifdef CONFIG_MD_BOOT
+-extern kdev_t name_to_kdev_t(char *line) __init;
++extern kdev_t name_to_kdev_t(char *line) md__init;
+ #endif
+
+-static struct hd_struct md_hd_struct[MAX_MD_DEV];
+-static int md_blocksizes[MAX_MD_DEV];
+-int md_maxreadahead[MAX_MD_DEV];
+-#if SUPPORT_RECONSTRUCTION
+-static struct md_thread *md_sync_thread = NULL;
+-#endif /* SUPPORT_RECONSTRUCTION */
++static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, };
++
++/*
++ * these have to be allocated separately because external
++ * subsystems want to have a pre-defined structure
++ */
++struct hd_struct md_hd_struct[MAX_MD_DEVS];
++static int md_blocksizes[MAX_MD_DEVS];
++static int md_maxreadahead[MAX_MD_DEVS];
++static mdk_thread_t *md_recovery_thread = NULL;
+
+-int md_size[MAX_MD_DEV]={0, };
++int md_size[MAX_MD_DEVS] = {0, };
+
+ static void md_geninit (struct gendisk *);
+
+ static struct gendisk md_gendisk=
+ {
+- MD_MAJOR,
+- "md",
+- 0,
+- 1,
+- MAX_MD_DEV,
+- md_geninit,
+- md_hd_struct,
+- md_size,
+- MAX_MD_DEV,
+- NULL,
+- NULL
++ MD_MAJOR,
++ "md",
++ 0,
++ 1,
++ MAX_MD_DEVS,
++ md_geninit,
++ md_hd_struct,
++ md_size,
++ MAX_MD_DEVS,
++ NULL,
++ NULL
+ };
+
+-static struct md_personality *pers[MAX_PERSONALITY]={NULL, };
+-struct md_dev md_dev[MAX_MD_DEV];
+-
+-int md_thread(void * arg);
++/*
++ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
++ * is 100 KB/sec, so the extra system load does not show up that much.
++ * Increase it if you want to have more _guaranteed_ speed. Note that
++ * the RAID driver will use the maximum available bandwidth if the IO
++ * subsystem is idle.
++ *
++ * you can change it via /proc/sys/dev/md/speed-limit
++ */
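++
++/* (for example, "echo 200 >/proc/sys/dev/md/speed-limit" raises the
++ * guaranteed resync speed to 200 KB/sec) */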
+
+-static struct gendisk *find_gendisk (kdev_t dev)
+-{
+- struct gendisk *tmp=gendisk_head;
++static int sysctl_speed_limit = 100;
+
+- while (tmp != NULL)
+- {
+- if (tmp->major==MAJOR(dev))
+- return (tmp);
+-
+- tmp=tmp->next;
+- }
++static struct ctl_table_header *md_table_header;
+
+- return (NULL);
+-}
++static ctl_table md_table[] = {
++ {DEV_MD_SPEED_LIMIT, "speed-limit",
++ &sysctl_speed_limit, sizeof(int), 0644, NULL, &proc_dointvec},
++ {0}
++};
+
+-char *partition_name (kdev_t dev)
+-{
+- static char name[40]; /* This should be long
+- enough for a device name ! */
+- struct gendisk *hd = find_gendisk (dev);
++static ctl_table md_dir_table[] = {
++ {DEV_MD, "md", NULL, 0, 0555, md_table},
++ {0}
++};
+
+- if (!hd)
+- {
+- sprintf (name, "[dev %s]", kdevname(dev));
+- return (name);
+- }
++static ctl_table md_root_table[] = {
++ {CTL_DEV, "dev", NULL, 0, 0555, md_dir_table},
++ {0}
++};
+
+- return disk_name (hd, MINOR(dev), name); /* routine in genhd.c */
++static void md_register_sysctl(void)
++{
++ md_table_header = register_sysctl_table(md_root_table, 1);
+ }
+
+-static int legacy_raid_sb (int minor, int pnum)
++void md_unregister_sysctl(void)
+ {
+- int i, factor;
++ unregister_sysctl_table(md_table_header);
++}
++
++/*
++ * The mapping between kdev and mddev is not necessarily a simple
++ * one! E.g. HSM uses several sub-devices to implement Logical
++ * Volumes. All these sub-devices map to the same mddev.
++ */
++dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, };
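++
++/*
++ * (E.g. an HSM volume group driven through one md minor can export
++ * its Logical Volumes as further md minors; all of them map back to
++ * the same mddev, with ->data pointing at the per-LV structure.)
++ */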
+
+- factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
++void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
++{
++ unsigned int minor = MINOR(dev);
+
+- /*****
+- * do size and offset calculations.
+- */
+- for (i=0; i<md_dev[minor].nb_dev; i++) {
+- md_dev[minor].devices[i].size &= ~(factor - 1);
+- md_size[minor] += md_dev[minor].devices[i].size;
+- md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset +
+- md_dev[minor].devices[i-1].size) : 0;
++ if (MAJOR(dev) != MD_MAJOR) {
++ MD_BUG();
++ return;
+ }
+- if (pnum == RAID0 >> PERSONALITY_SHIFT)
+- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev;
+- return 0;
++ if (mddev_map[minor].mddev != NULL) {
++ MD_BUG();
++ return;
++ }
++ mddev_map[minor].mddev = mddev;
++ mddev_map[minor].data = data;
+ }
+
+-static void free_sb (struct md_dev *mddev)
++void del_mddev_mapping (mddev_t * mddev, kdev_t dev)
+ {
+- int i;
+- struct real_dev *realdev;
++ unsigned int minor = MINOR(dev);
+
+- if (mddev->sb) {
+- free_page((unsigned long) mddev->sb);
+- mddev->sb = NULL;
++ if (MAJOR(dev) != MD_MAJOR) {
++ MD_BUG();
++ return;
+ }
+- for (i = 0; i <mddev->nb_dev; i++) {
+- realdev = mddev->devices + i;
+- if (realdev->sb) {
+- free_page((unsigned long) realdev->sb);
+- realdev->sb = NULL;
+- }
++ if (mddev_map[minor].mddev != mddev) {
++ MD_BUG();
++ return;
+ }
++ mddev_map[minor].mddev = NULL;
++ mddev_map[minor].data = NULL;
+ }
+
+ /*
+- * Check one RAID superblock for generic plausibility
++ * Lets us iterate over all existing md arrays
+ */
++static MD_LIST_HEAD(all_mddevs);
+
+-#define BAD_MAGIC KERN_ERR \
+-"md: %s: invalid raid superblock magic (%x) on block %u\n"
++static mddev_t * alloc_mddev (kdev_t dev)
++{
++ mddev_t * mddev;
+
+-#define OUT_OF_MEM KERN_ALERT \
+-"md: out of memory.\n"
++ if (MAJOR(dev) != MD_MAJOR) {
++ MD_BUG();
++ return 0;
++ }
++ mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
++ if (!mddev)
++ return NULL;
++
++ memset(mddev, 0, sizeof(*mddev));
+
+-#define NO_DEVICE KERN_ERR \
+-"md: disabled device %s\n"
++ mddev->__minor = MINOR(dev);
++ mddev->reconfig_sem = MUTEX;
++ mddev->recovery_sem = MUTEX;
++ mddev->resync_sem = MUTEX;
++ MD_INIT_LIST_HEAD(&mddev->disks);
++ /*
++ * The 'base' mddev is the one with data NULL.
++	 * Personalities can create additional mddevs
++ * if necessary.
++ */
++ add_mddev_mapping(mddev, dev, 0);
++ md_list_add(&mddev->all_mddevs, &all_mddevs);
+
+-#define SUCCESS 0
+-#define FAILURE -1
++ return mddev;
++}
+
+-static int analyze_one_sb (struct real_dev * rdev)
++static void free_mddev (mddev_t *mddev)
+ {
+- int ret = FAILURE;
+- struct buffer_head *bh;
+- kdev_t dev = rdev->dev;
+- md_superblock_t *sb;
++ if (!mddev) {
++ MD_BUG();
++ return;
++ }
+
+ /*
+- * Read the superblock, it's at the end of the disk
++ * Make sure nobody else is using this mddev
++ * (careful, we rely on the global kernel lock here)
+ */
+- rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]);
+- set_blocksize (dev, MD_SB_BYTES);
+- bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
+-
+- if (bh) {
+- sb = (md_superblock_t *) bh->b_data;
+- if (sb->md_magic != MD_SB_MAGIC) {
+- printk (BAD_MAGIC, kdevname(dev),
+- sb->md_magic, rdev->sb_offset);
+- goto abort;
+- }
+- rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
+- if (!rdev->sb) {
+- printk (OUT_OF_MEM);
+- goto abort;
+- }
+- memcpy (rdev->sb, bh->b_data, MD_SB_BYTES);
++ while (md_atomic_read(&mddev->resync_sem.count) != 1)
++ schedule();
++ while (md_atomic_read(&mddev->recovery_sem.count) != 1)
++ schedule();
+
+- rdev->size = sb->size;
+- } else
+- printk (NO_DEVICE,kdevname(rdev->dev));
+- ret = SUCCESS;
+-abort:
+- if (bh)
+- brelse (bh);
+- return ret;
++ del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
++ md_list_del(&mddev->all_mddevs);
++ MD_INIT_LIST_HEAD(&mddev->all_mddevs);
++ kfree(mddev);
+ }
+
+-#undef SUCCESS
+-#undef FAILURE
+-
+-#undef BAD_MAGIC
+-#undef OUT_OF_MEM
+-#undef NO_DEVICE
+
+-/*
+- * Check a full RAID array for plausibility
+- */
++struct gendisk * find_gendisk (kdev_t dev)
++{
++ struct gendisk *tmp = gendisk_head;
+
+-#define INCONSISTENT KERN_ERR \
+-"md: superblock inconsistency -- run ckraid\n"
++ while (tmp != NULL) {
++ if (tmp->major == MAJOR(dev))
++ return (tmp);
++ tmp = tmp->next;
++ }
++ return (NULL);
++}
+
+-#define OUT_OF_DATE KERN_ERR \
+-"md: superblock update time inconsistenty -- using the most recent one\n"
++mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
++{
++ mdk_rdev_t * rdev;
++ struct md_list_head *tmp;
+
+-#define OLD_VERSION KERN_ALERT \
+-"md: %s: unsupported raid array version %d.%d.%d\n"
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->desc_nr == nr)
++ return rdev;
++ }
++ return NULL;
++}
+
+-#define NOT_CLEAN KERN_ERR \
+-"md: %s: raid array is not clean -- run ckraid\n"
++mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
++{
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
+
+-#define NOT_CLEAN_IGNORE KERN_ERR \
+-"md: %s: raid array is not clean -- reconstructing parity\n"
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->dev == dev)
++ return rdev;
++ }
++ return NULL;
++}
+
+-#define UNKNOWN_LEVEL KERN_ERR \
+-"md: %s: unsupported raid level %d\n"
++static MD_LIST_HEAD(device_names);
+
+-static int analyze_sbs (int minor, int pnum)
++char * partition_name (kdev_t dev)
+ {
+- struct md_dev *mddev = md_dev + minor;
+- int i, N = mddev->nb_dev, out_of_date = 0;
+- struct real_dev * disks = mddev->devices;
+- md_superblock_t *sb, *freshest = NULL;
++ struct gendisk *hd;
++ static char nomem [] = "<nomem>";
++ dev_name_t *dname;
++ struct md_list_head *tmp = device_names.next;
+
+- /*
+- * RAID-0 and linear don't use a RAID superblock
+- */
+- if (pnum == RAID0 >> PERSONALITY_SHIFT ||
+- pnum == LINEAR >> PERSONALITY_SHIFT)
+- return legacy_raid_sb (minor, pnum);
++ while (tmp != &device_names) {
++ dname = md_list_entry(tmp, dev_name_t, list);
++ if (dname->dev == dev)
++ return dname->name;
++ tmp = tmp->next;
++ }
++
++ dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
+
++ if (!dname)
++ return nomem;
+ /*
+- * Verify the RAID superblock on each real device
++ * ok, add this new device name to the list
+ */
+- for (i = 0; i < N; i++)
+- if (analyze_one_sb(disks+i))
+- goto abort;
++ hd = find_gendisk (dev);
++
++ if (!hd)
++ sprintf (dname->name, "[dev %s]", kdevname(dev));
++ else
++ disk_name (hd, MINOR(dev), dname->name);
++
++ dname->dev = dev;
++ md_list_add(&dname->list, &device_names);
++
++ return dname->name;
++}
++
++static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
++ int persistent)
++{
++ unsigned int size = 0;
++
++ if (blk_size[MAJOR(dev)])
++ size = blk_size[MAJOR(dev)][MINOR(dev)];
++ if (persistent)
++ size = MD_NEW_SIZE_BLOCKS(size);
++ return size;
++}
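++
++/*
++ * (A persistent superblock lives near the end of the device, so
++ * MD_NEW_SIZE_BLOCKS rounds the raw size down to keep the last
++ * blocks reserved for it.)
++ */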
++
++static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
++{
++ unsigned int size;
++
++ size = calc_dev_sboffset(dev, mddev, persistent);
++ if (!mddev->sb) {
++ MD_BUG();
++ return size;
++ }
++ if (mddev->sb->chunk_size)
++ size &= ~(mddev->sb->chunk_size/1024 - 1);
++ return size;
++}
++
++/*
++ * We check whether all devices are numbered from 0 to nb_dev-1. The
++ * order is guaranteed even after device name changes.
++ *
++ * Some personalities (raid0, linear) use this. Personalities that
++ * provide data have to be able to deal with loss of individual
++ * disks, so they do their checking themselves.
++ */
++int md_check_ordering (mddev_t *mddev)
++{
++ int i, c;
++ mdk_rdev_t *rdev;
++ struct md_list_head *tmp;
+
+ /*
+- * The superblock constant part has to be the same
+- * for all disks in the array.
++ * First, all devices must be fully functional
+ */
+- sb = NULL;
+- for (i = 0; i < N; i++) {
+- if (!disks[i].sb)
+- continue;
+- if (!sb) {
+- sb = disks[i].sb;
+- continue;
+- }
+- if (memcmp(sb,
+- disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) {
+- printk (INCONSISTENT);
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->faulty) {
++ printk("md: md%d's device %s faulty, aborting.\n",
++ mdidx(mddev), partition_name(rdev->dev));
+ goto abort;
+ }
+ }
+
+- /*
+- * OK, we have all disks and the array is ready to run. Let's
+- * find the freshest superblock, that one will be the superblock
+- * that represents the whole array.
+- */
+- if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL)
++ c = 0;
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ c++;
++ }
++ if (c != mddev->nb_dev) {
++ MD_BUG();
+ goto abort;
+- freshest = NULL;
+- for (i = 0; i < N; i++) {
+- if (!disks[i].sb)
+- continue;
+- if (!freshest) {
+- freshest = disks[i].sb;
+- continue;
+- }
+- /*
+- * Find the newest superblock version
+- */
+- if (disks[i].sb->utime != freshest->utime) {
+- out_of_date = 1;
+- if (disks[i].sb->utime > freshest->utime)
+- freshest = disks[i].sb;
+- }
+ }
+- if (out_of_date)
+- printk(OUT_OF_DATE);
+- memcpy (sb, freshest, sizeof(*freshest));
+-
+- /*
+- * Check if we can support this RAID array
+- */
+- if (sb->major_version != MD_MAJOR_VERSION ||
+- sb->minor_version > MD_MINOR_VERSION) {
+-
+- printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)),
+- sb->major_version, sb->minor_version,
+- sb->patch_version);
++ if (mddev->nb_dev != mddev->sb->raid_disks) {
++ printk("md: md%d, array needs %d disks, has %d, aborting.\n",
++ mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
+ goto abort;
+ }
+-
+ /*
+- * We need to add this as a superblock option.
++ * Now the numbering check
+ */
+-#if SUPPORT_RECONSTRUCTION
+- if (sb->state != (1 << MD_SB_CLEAN)) {
+- if (sb->level == 1) {
+- printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
++ for (i = 0; i < mddev->nb_dev; i++) {
++ c = 0;
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->desc_nr == i)
++ c++;
++ }
++ if (c == 0) {
++ printk("md: md%d, missing disk #%d, aborting.\n",
++ mdidx(mddev), i);
+ goto abort;
+- } else
+- printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor)));
+- }
+-#else
+- if (sb->state != (1 << MD_SB_CLEAN)) {
+- printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
+- goto abort;
+- }
+-#endif /* SUPPORT_RECONSTRUCTION */
+-
+- switch (sb->level) {
+- case 1:
+- md_size[minor] = sb->size;
+- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD;
+- break;
+- case 4:
+- case 5:
+- md_size[minor] = sb->size * (sb->raid_disks - 1);
+- md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1);
+- break;
+- default:
+- printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)),
+- sb->level);
++ }
++ if (c > 1) {
++ printk("md: md%d, too many disks #%d, aborting.\n",
++ mdidx(mddev), i);
+ goto abort;
++ }
+ }
+ return 0;
+ abort:
+- free_sb(mddev);
+ return 1;
+ }
+
+-#undef INCONSISTENT
+-#undef OUT_OF_DATE
+-#undef OLD_VERSION
+-#undef NOT_CLEAN
+-#undef OLD_LEVEL
+-
+-int md_update_sb(int minor)
++static unsigned int zoned_raid_size (mddev_t *mddev)
+ {
+- struct md_dev *mddev = md_dev + minor;
+- struct buffer_head *bh;
+- md_superblock_t *sb = mddev->sb;
+- struct real_dev *realdev;
+- kdev_t dev;
+- int i;
+- u32 sb_offset;
++ unsigned int mask;
++ mdk_rdev_t * rdev;
++ struct md_list_head *tmp;
+
+- sb->utime = CURRENT_TIME;
+- for (i = 0; i < mddev->nb_dev; i++) {
+- realdev = mddev->devices + i;
+- if (!realdev->sb)
+- continue;
+- dev = realdev->dev;
+- sb_offset = realdev->sb_offset;
+- set_blocksize(dev, MD_SB_BYTES);
+- printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset);
+- bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
+- if (bh) {
+- sb = (md_superblock_t *) bh->b_data;
+- memcpy(sb, mddev->sb, MD_SB_BYTES);
+- memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4);
+- mark_buffer_uptodate(bh, 1);
+- mark_buffer_dirty(bh, 1);
+- ll_rw_block(WRITE, 1, &bh);
+- wait_on_buffer(bh);
+- bforget(bh);
+- fsync_dev(dev);
+- invalidate_buffers(dev);
+- } else
+- printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
++ if (!mddev->sb) {
++ MD_BUG();
++ return -EINVAL;
++ }
++ /*
++ * do size and offset calculations.
++ */
++ mask = ~(mddev->sb->chunk_size/1024 - 1);
++printk("mask %08x\n", mask);
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++printk(" rdev->size: %d\n", rdev->size);
++ rdev->size &= mask;
++printk(" masked rdev->size: %d\n", rdev->size);
++ md_size[mdidx(mddev)] += rdev->size;
++printk(" new md_size: %d\n", md_size[mdidx(mddev)]);
+ }
+ return 0;
+ }
+
+-static int do_md_run (int minor, int repart)
++static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
+ {
+- int pnum, i, min, factor, err;
++ if (disk_active(disk)) {
++ sb->working_disks--;
++ } else {
++ if (disk_spare(disk)) {
++ sb->spare_disks--;
++ sb->working_disks--;
++ } else {
++ sb->failed_disks--;
++ }
++ }
++ sb->nr_disks--;
++ disk->major = 0;
++ disk->minor = 0;
++ mark_disk_removed(disk);
++}
+
+- if (!md_dev[minor].nb_dev)
+- return -EINVAL;
+-
+- if (md_dev[minor].pers)
+- return -EBUSY;
++#define BAD_MAGIC KERN_ERR \
++"md: invalid raid superblock magic on %s\n"
+
+- md_dev[minor].repartition=repart;
+-
+- if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT))
+- >= MAX_PERSONALITY)
+- return -EINVAL;
+-
+- /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
+- if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){
+- for (i = 0; i < md_dev [minor].nb_dev; i++)
+- if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR)
+- return -EINVAL;
+- }
+- if (!pers[pnum])
+- {
+-#ifdef CONFIG_KMOD
+- char module_name[80];
+- sprintf (module_name, "md-personality-%d", pnum);
+- request_module (module_name);
+- if (!pers[pnum])
+-#endif
+- return -EINVAL;
+- }
+-
+- factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
+-
+- for (i=0; i<md_dev[minor].nb_dev; i++)
+- if (md_dev[minor].devices[i].size<min)
+- {
+- printk ("Dev %s smaller than %dk, cannot shrink\n",
+- partition_name (md_dev[minor].devices[i].dev), min);
+- return -EINVAL;
+- }
+-
+- for (i=0; i<md_dev[minor].nb_dev; i++) {
+- fsync_dev(md_dev[minor].devices[i].dev);
+- invalidate_buffers(md_dev[minor].devices[i].dev);
+- }
+-
+- /* Resize devices according to the factor. It is used to align
+- partitions size on a given chunk size. */
+- md_size[minor]=0;
+-
+- /*
+- * Analyze the raid superblock
+- */
+- if (analyze_sbs(minor, pnum))
+- return -EINVAL;
++#define BAD_MINOR KERN_ERR \
++"md: %s: invalid raid minor (%x)\n"
+
+- md_dev[minor].pers=pers[pnum];
+-
+- if ((err=md_dev[minor].pers->run (minor, md_dev+minor)))
+- {
+- md_dev[minor].pers=NULL;
+- free_sb(md_dev + minor);
+- return (err);
+- }
+-
+- if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT)
+- {
+- md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN);
+- md_update_sb(minor);
+- }
+-
+- /* FIXME : We assume here we have blocks
+- that are twice as large as sectors.
+- THIS MAY NOT BE TRUE !!! */
+- md_hd_struct[minor].start_sect=0;
+- md_hd_struct[minor].nr_sects=md_size[minor]<<1;
+-
+- read_ahead[MD_MAJOR] = 128;
+- return (0);
+-}
++#define OUT_OF_MEM KERN_ALERT \
++"md: out of memory.\n"
++
++#define NO_SB KERN_ERR \
++"md: disabled device %s, could not read superblock.\n"
+
+-static int do_md_stop (int minor, struct inode *inode)
++#define BAD_CSUM KERN_WARNING \
++"md: invalid superblock checksum on %s\n"
++
++static int alloc_array_sb (mddev_t * mddev)
+ {
+- int i;
+-
+- if (inode->i_count>1 || md_dev[minor].busy>1) {
+- /*
+- * ioctl : one open channel
+- */
+- printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
+- minor, inode->i_count, md_dev[minor].busy);
+- return -EBUSY;
+- }
+-
+- if (md_dev[minor].pers) {
+- /*
+- * It is safe to call stop here, it only frees private
+- * data. Also, it tells us if a device is unstoppable
+- * (eg. resyncing is in progress)
+- */
+- if (md_dev[minor].pers->stop (minor, md_dev+minor))
+- return -EBUSY;
+- /*
+- * The device won't exist anymore -> flush it now
+- */
+- fsync_dev (inode->i_rdev);
+- invalidate_buffers (inode->i_rdev);
+- if (md_dev[minor].sb) {
+- md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
+- md_update_sb(minor);
+- }
++ if (mddev->sb) {
++ MD_BUG();
++ return 0;
+ }
+-
+- /* Remove locks. */
+- if (md_dev[minor].sb)
+- free_sb(md_dev + minor);
+- for (i=0; i<md_dev[minor].nb_dev; i++)
+- clear_inode (md_dev[minor].devices[i].inode);
+-
+- md_dev[minor].nb_dev=md_size[minor]=0;
+- md_hd_struct[minor].nr_sects=0;
+- md_dev[minor].pers=NULL;
+-
+- read_ahead[MD_MAJOR] = 128;
+-
+- return (0);
++
++ mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
++ if (!mddev->sb)
++ return -ENOMEM;
++ md_clear_page((unsigned long)mddev->sb);
++ return 0;
+ }
+
+-static int do_md_add (int minor, kdev_t dev)
++static int alloc_disk_sb (mdk_rdev_t * rdev)
+ {
+- int i;
+- int hot_add=0;
+- struct real_dev *realdev;
++ if (rdev->sb)
++ MD_BUG();
+
+- if (md_dev[minor].nb_dev==MAX_REAL)
++ rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
++ if (!rdev->sb) {
++ printk (OUT_OF_MEM);
+ return -EINVAL;
++ }
++ md_clear_page((unsigned long)rdev->sb);
+
+- if (!fs_may_mount (dev))
+- return -EBUSY;
++ return 0;
++}
+
+- if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) {
+- printk("md_add(): zero device size, huh, bailing out.\n");
+- return -EINVAL;
++static void free_disk_sb (mdk_rdev_t * rdev)
++{
++ if (rdev->sb) {
++ free_page((unsigned long) rdev->sb);
++ rdev->sb = NULL;
++ rdev->sb_offset = 0;
++ rdev->size = 0;
++ } else {
++ if (!rdev->faulty)
++ MD_BUG();
+ }
++}
+
+- if (md_dev[minor].pers) {
+- /*
+- * The array is already running, hot-add the drive, or
+- * bail out:
+- */
+- if (!md_dev[minor].pers->hot_add_disk)
+- return -EBUSY;
+- else
+- hot_add=1;
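++/*
++ * A faulty rdev never keeps an in-memory superblock copy around,
++ * so drop it here. Interrupts are disabled because disk failures
++ * can be noticed from interrupt context.
++ */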
++static void mark_rdev_faulty (mdk_rdev_t * rdev)
++{
++ unsigned long flags;
++
++ if (!rdev) {
++ MD_BUG();
++ return;
+ }
++ save_flags(flags);
++ cli();
++ free_disk_sb(rdev);
++ rdev->faulty = 1;
++ restore_flags(flags);
++}
++
++static int read_disk_sb (mdk_rdev_t * rdev)
++{
++ int ret = -EINVAL;
++ struct buffer_head *bh = NULL;
++ kdev_t dev = rdev->dev;
++ mdp_super_t *sb;
++ u32 sb_offset;
+
++ if (!rdev->sb) {
++ MD_BUG();
++ goto abort;
++ }
++
+ /*
+- * Careful. We cannot increase nb_dev for a running array.
++ * Calculate the position of the superblock,
++ * it's at the end of the disk
+ */
+- i=md_dev[minor].nb_dev;
+- realdev = &md_dev[minor].devices[i];
+- realdev->dev=dev;
+-
+- /* Lock the device by inserting a dummy inode. This doesn't
+- smell very good, but I need to be consistent with the
+- mount stuff, specially with fs_may_mount. If someone have
+- a better idea, please help ! */
+-
+- realdev->inode=get_empty_inode ();
+- realdev->inode->i_dev=dev; /* don't care about other fields */
+- insert_inode_hash (realdev->inode);
+-
+- /* Sizes are now rounded at run time */
+-
+-/* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/
+-
+- realdev->size=blk_size[MAJOR(dev)][MINOR(dev)];
++ sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
++ rdev->sb_offset = sb_offset;
++ printk("(read) %s's sb offset: %d", partition_name(dev),
++ sb_offset);
++ fsync_dev(dev);
++ set_blocksize (dev, MD_SB_BYTES);
++ bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
+
+- if (hot_add) {
++ if (bh) {
++ sb = (mdp_super_t *) bh->b_data;
++ memcpy (rdev->sb, sb, MD_SB_BYTES);
++ } else {
++ printk (NO_SB,partition_name(rdev->dev));
++ goto abort;
++ }
++ printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events));
++ ret = 0;
++abort:
++ if (bh)
++ brelse (bh);
++ return ret;
++}
++
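++/*
++ * Checksum the whole superblock with the csum field itself zeroed;
++ * the on-disk value is restored before returning, so the superblock
++ * is left unmodified.
++ */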
++static unsigned int calc_sb_csum (mdp_super_t * sb)
++{
++ unsigned int disk_csum, csum;
++
++ disk_csum = sb->sb_csum;
++ sb->sb_csum = 0;
++ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
++ sb->sb_csum = disk_csum;
++ return csum;
++}
++
++/*
++ * Check one RAID superblock for generic plausibility
++ */
++
++static int check_disk_sb (mdk_rdev_t * rdev)
++{
++ mdp_super_t *sb;
++ int ret = -EINVAL;
++
++ sb = rdev->sb;
++ if (!sb) {
++ MD_BUG();
++ goto abort;
++ }
++
++ if (sb->md_magic != MD_SB_MAGIC) {
++ printk (BAD_MAGIC, partition_name(rdev->dev));
++ goto abort;
++ }
++
++ if (sb->md_minor >= MAX_MD_DEVS) {
++ printk (BAD_MINOR, partition_name(rdev->dev),
++ sb->md_minor);
++ goto abort;
++ }
++
++ if (calc_sb_csum(sb) != sb->sb_csum)
++ printk(BAD_CSUM, partition_name(rdev->dev));
++ ret = 0;
++abort:
++ return ret;
++}
++
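++/*
++ * Mask off the partition bits of a device number, so that two
++ * partitions of the same physical disk compare equal.
++ */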
++static kdev_t dev_unit(kdev_t dev)
++{
++ unsigned int mask;
++ struct gendisk *hd = find_gendisk(dev);
++
++ if (!hd)
++ return 0;
++ mask = ~((1 << hd->minor_shift) - 1);
++
++ return MKDEV(MAJOR(dev), MINOR(dev) & mask);
++}
++
++static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
++{
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
++
++ ITERATE_RDEV(mddev,rdev,tmp)
++ if (dev_unit(rdev->dev) == dev_unit(dev))
++ return rdev;
++
++ return NULL;
++}
++
++static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
++{
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
++
++ ITERATE_RDEV(mddev1,rdev,tmp)
++ if (match_dev_unit(mddev2, rdev->dev))
++ return 1;
++
++ return 0;
++}
++
++static MD_LIST_HEAD(all_raid_disks);
++static MD_LIST_HEAD(pending_raid_disks);
++
++static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
++{
++ mdk_rdev_t *same_pdev;
++
++ if (rdev->mddev) {
++ MD_BUG();
++ return;
++ }
++ same_pdev = match_dev_unit(mddev, rdev->dev);
++ if (same_pdev)
++ printk( KERN_WARNING
++"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
++" protection against single-disk failure might be compromised.\n",
++ mdidx(mddev), partition_name(rdev->dev),
++ partition_name(same_pdev->dev));
++
++ md_list_add(&rdev->same_set, &mddev->disks);
++ rdev->mddev = mddev;
++ mddev->nb_dev++;
++ printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
++}
++
++static void unbind_rdev_from_array (mdk_rdev_t * rdev)
++{
++ if (!rdev->mddev) {
++ MD_BUG();
++ return;
++ }
++ md_list_del(&rdev->same_set);
++ MD_INIT_LIST_HEAD(&rdev->same_set);
++ rdev->mddev->nb_dev--;
++ printk("unbind<%s,%d>\n", partition_name(rdev->dev),
++ rdev->mddev->nb_dev);
++ rdev->mddev = NULL;
++}
++
++/*
++ * prevent the device from being mounted, repartitioned or
++ * otherwise reused by a RAID array (or any other kernel
++ * subsystem), by opening the device. [simply getting an
++ * inode is not enough, the SCSI module usage code needs
++ * an explicit open() on the device]
++ */
++static int lock_rdev (mdk_rdev_t *rdev)
++{
++ int err = 0;
++
++ /*
++ * First insert a dummy inode.
++ */
++ if (rdev->inode)
++ MD_BUG();
++ rdev->inode = get_empty_inode();
++ /*
++	 * we don't care about any other fields
++ */
++ rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev;
++ insert_inode_hash(rdev->inode);
++
++ memset(&rdev->filp, 0, sizeof(rdev->filp));
++ rdev->filp.f_mode = 3; /* read write */
++ err = blkdev_open(rdev->inode, &rdev->filp);
++ if (err) {
++ printk("blkdev_open() failed: %d\n", err);
++ clear_inode(rdev->inode);
++ rdev->inode = NULL;
++ }
++ return err;
++}
++
++static void unlock_rdev (mdk_rdev_t *rdev)
++{
++ blkdev_release(rdev->inode);
++ if (!rdev->inode)
++ MD_BUG();
++ clear_inode(rdev->inode);
++ rdev->inode = NULL;
++}
++
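++/*
++ * Fully release an rdev: close the device, free the superblock
++ * copy and unhook the rdev from the global and pending lists.
++ */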
++static void export_rdev (mdk_rdev_t * rdev)
++{
++ printk("export_rdev(%s)\n",partition_name(rdev->dev));
++ if (rdev->mddev)
++ MD_BUG();
++ unlock_rdev(rdev);
++ free_disk_sb(rdev);
++ md_list_del(&rdev->all);
++ MD_INIT_LIST_HEAD(&rdev->all);
++ if (rdev->pending.next != &rdev->pending) {
++ printk("(%s was pending)\n",partition_name(rdev->dev));
++ md_list_del(&rdev->pending);
++ MD_INIT_LIST_HEAD(&rdev->pending);
++ }
++ rdev->dev = 0;
++ rdev->faulty = 0;
++ kfree(rdev);
++}
++
++static void kick_rdev_from_array (mdk_rdev_t * rdev)
++{
++ unbind_rdev_from_array(rdev);
++ export_rdev(rdev);
++}
++
++static void export_array (mddev_t *mddev)
++{
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
++ mdp_super_t *sb = mddev->sb;
++
++ if (mddev->sb) {
++ mddev->sb = NULL;
++ free_page((unsigned long) sb);
++ }
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (!rdev->mddev) {
++ MD_BUG();
++ continue;
++ }
++ kick_rdev_from_array(rdev);
++ }
++ if (mddev->nb_dev)
++ MD_BUG();
++}
++
++#undef BAD_CSUM
++#undef BAD_MAGIC
++#undef OUT_OF_MEM
++#undef NO_SB
++
++static void print_desc(mdp_disk_t *desc)
++{
++ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
++ partition_name(MKDEV(desc->major,desc->minor)),
++ desc->major,desc->minor,desc->raid_disk,desc->state);
++}
++
++static void print_sb(mdp_super_t *sb)
++{
++ int i;
++
++ printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
++ sb->major_version, sb->minor_version, sb->patch_version,
++ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
++ sb->ctime);
++ printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
++ sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
++ sb->layout, sb->chunk_size);
++ printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
++ sb->utime, sb->state, sb->active_disks, sb->working_disks,
++ sb->failed_disks, sb->spare_disks,
++ sb->sb_csum, (unsigned long)get_unaligned(&sb->events));
++
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ mdp_disk_t *desc;
++
++ desc = sb->disks + i;
++ printk(" D %2d: ", i);
++ print_desc(desc);
++ }
++ printk(" THIS: ");
++ print_desc(&sb->this_disk);
++
++}
++
++static void print_rdev(mdk_rdev_t *rdev)
++{
++ printk(" rdev %s: O:%s, SZ:%08d F:%d DN:%d ",
++ partition_name(rdev->dev), partition_name(rdev->old_dev),
++ rdev->size, rdev->faulty, rdev->desc_nr);
++ if (rdev->sb) {
++ printk("rdev superblock:\n");
++ print_sb(rdev->sb);
++ } else
++ printk("no rdev superblock!\n");
++}
++
++void md_print_devices (void)
++{
++ struct md_list_head *tmp, *tmp2;
++ mdk_rdev_t *rdev;
++ mddev_t *mddev;
++
++ printk("\n");
++ printk(" **********************************\n");
++ printk(" * <COMPLETE RAID STATE PRINTOUT> *\n");
++ printk(" **********************************\n");
++ ITERATE_MDDEV(mddev,tmp) {
++ printk("md%d: ", mdidx(mddev));
++
++ ITERATE_RDEV(mddev,rdev,tmp2)
++ printk("<%s>", partition_name(rdev->dev));
++
++ if (mddev->sb) {
++ printk(" array superblock:\n");
++ print_sb(mddev->sb);
++ } else
++ printk(" no array superblock.\n");
++
++ ITERATE_RDEV(mddev,rdev,tmp2)
++ print_rdev(rdev);
++ }
++ printk(" **********************************\n");
++ printk("\n");
++}
++
++static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
++{
++ int ret;
++ mdp_super_t *tmp1, *tmp2;
++
++ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
++ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
++
++ if (!tmp1 || !tmp2) {
++ ret = 0;
++ goto abort;
++ }
++
++ *tmp1 = *sb1;
++ *tmp2 = *sb2;
++
++ /*
++ * nr_disks is not constant
++ */
++ tmp1->nr_disks = 0;
++ tmp2->nr_disks = 0;
++
++ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
++ ret = 0;
++ else
++ ret = 1;
++
++abort:
++ if (tmp1)
++ kfree(tmp1);
++ if (tmp2)
++ kfree(tmp2);
++
++ return ret;
++}
++
++static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
++{
++ if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
++ (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
++ (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
++ (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
++
++ return 1;
++
++ return 0;
++}
++
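++/*
++ * Look up a device in the global list of all imported rdevs
++ * (not just the members of one array).
++ */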
++static mdk_rdev_t * find_rdev_all (kdev_t dev)
++{
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
++
++ tmp = all_raid_disks.next;
++ while (tmp != &all_raid_disks) {
++ rdev = md_list_entry(tmp, mdk_rdev_t, all);
++ if (rdev->dev == dev)
++ return rdev;
++ tmp = tmp->next;
++ }
++ return NULL;
++}
++
++#define GETBLK_FAILED KERN_ERR \
++"md: getblk failed for device %s\n"
++
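++/*
++ * Write one rdev's in-memory superblock to its calculated offset
++ * on the device, re-checking offset and size first in case the
++ * device has been repartitioned or has silently gone offline
++ * since import.
++ */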
++static int write_disk_sb(mdk_rdev_t * rdev)
++{
++ struct buffer_head *bh;
++ kdev_t dev;
++ u32 sb_offset, size;
++ mdp_super_t *sb;
++
++ if (!rdev->sb) {
++ MD_BUG();
++ return -1;
++ }
++ if (rdev->faulty) {
++ MD_BUG();
++ return -1;
++ }
++ if (rdev->sb->md_magic != MD_SB_MAGIC) {
++ MD_BUG();
++ return -1;
++ }
++
++ dev = rdev->dev;
++ sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
++ if (rdev->sb_offset != sb_offset) {
++ printk("%s's sb offset has changed from %d to %d, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
++ goto skip;
++ }
++ /*
++ * If the disk went offline meanwhile and it's just a spare, then
++	 * its size has silently changed to zero, and the MD code does
++ * not yet know that it's faulty.
++ */
++ size = calc_dev_size(dev, rdev->mddev, 1);
++ if (size != rdev->size) {
++ printk("%s's size has changed from %d to %d since import, skipping\n", partition_name(dev), rdev->size, size);
++ goto skip;
++ }
++
++ printk("(write) %s's sb offset: %d\n", partition_name(dev), sb_offset);
++ fsync_dev(dev);
++ set_blocksize(dev, MD_SB_BYTES);
++ bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
++ if (!bh) {
++ printk(GETBLK_FAILED, partition_name(dev));
++ return 1;
++ }
++ memset(bh->b_data,0,bh->b_size);
++ sb = (mdp_super_t *) bh->b_data;
++ memcpy(sb, rdev->sb, MD_SB_BYTES);
++
++ mark_buffer_uptodate(bh, 1);
++ mark_buffer_dirty(bh, 1);
++ ll_rw_block(WRITE, 1, &bh);
++ wait_on_buffer(bh);
++ brelse(bh);
++ fsync_dev(dev);
++skip:
++ return 0;
++}
++#undef GETBLK_FAILED
++
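++/*
++ * Look up this rdev's descriptor in the array superblock and make
++ * it the superblock's this_disk.
++ */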
++static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
++{
++ int i, ok = 0;
++ mdp_disk_t *desc;
++
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ desc = mddev->sb->disks + i;
++#if 0
++ if (disk_faulty(desc)) {
++ if (MKDEV(desc->major,desc->minor) == rdev->dev)
++ ok = 1;
++ continue;
++ }
++#endif
++ if (MKDEV(desc->major,desc->minor) == rdev->dev) {
++ rdev->sb->this_disk = *desc;
++ rdev->desc_nr = desc->number;
++ ok = 1;
++ break;
++ }
++ }
++
++ if (!ok) {
++ MD_BUG();
++ }
++}
++
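++/*
++ * Propagate the master (array) superblock into every non-faulty
++ * rdev's private copy, then recompute each checksum.
++ */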
++static int sync_sbs(mddev_t * mddev)
++{
++ mdk_rdev_t *rdev;
++ mdp_super_t *sb;
++ struct md_list_head *tmp;
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->faulty)
++ continue;
++ sb = rdev->sb;
++ *sb = *mddev->sb;
++ set_this_disk(mddev, rdev);
++ sb->sb_csum = calc_sb_csum(sb);
++ }
++ return 0;
++}
++
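++/*
++ * Bump the superblock event counter and write the result out to
++ * every member disk, retrying a limited number of times on error.
++ */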
++int md_update_sb(mddev_t * mddev)
++{
++ int first, err, count = 100;
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
++ __u64 ev;
++
++repeat:
++ mddev->sb->utime = CURRENT_TIME;
++ ev = get_unaligned(&mddev->sb->events);
++ ++ev;
++ put_unaligned(ev,&mddev->sb->events);
++ if (ev == (__u64)0) {
++ /*
++ * oops, this 64-bit counter should never wrap.
++	 * Either we are somewhere around the year ~1 trillion A.D.,
++	 * assuming 1 reboot per second, or we have a bug:
++ * 1 reboot per second, or we have a bug:
++ */
++ MD_BUG();
++ --ev;
++ put_unaligned(ev,&mddev->sb->events);
++ }
++ sync_sbs(mddev);
++
++ /*
++ * do not write anything to disk if using
++ * nonpersistent superblocks
++ */
++ if (mddev->sb->not_persistent)
++ return 0;
++
++ printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
++ mdidx(mddev));
++
++ first = 1;
++ err = 0;
++ ITERATE_RDEV(mddev,rdev,tmp) {
++		if (first)
++			first = 0;
++		else
++			printk(", ");
++ if (rdev->faulty)
++ printk("(skipping faulty ");
++ printk("%s ", partition_name(rdev->dev));
++ if (!rdev->faulty) {
++ printk("[events: %08lx]",
++ (unsigned long)get_unaligned(&rdev->sb->events));
++ err += write_disk_sb(rdev);
++ } else
++ printk(")\n");
++ }
++ printk(".\n");
++ if (err) {
++ printk("errors occured during superblock update, repeating\n");
++ if (--count)
++ goto repeat;
++ printk("excessive errors occured during superblock update, exiting\n");
++ }
++ return 0;
++}
++
++/*
++ * Import a device. If 'on_disk', then sanity check the superblock
++ *
++ * mark the device faulty if:
++ *
++ * - the device is nonexistent (zero size)
++ * - the device has no valid superblock
++ *
++ * a faulty rdev _never_ has rdev->sb set.
++ */
++static int md_import_device (kdev_t newdev, int on_disk)
++{
++ int err;
++ mdk_rdev_t *rdev;
++ unsigned int size;
++
++ if (find_rdev_all(newdev))
++ return -EEXIST;
++
++ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
++ if (!rdev) {
++ printk("could not alloc mem for %s!\n", partition_name(newdev));
++ return -ENOMEM;
++ }
++ memset(rdev, 0, sizeof(*rdev));
++
++ if (!fs_may_mount(newdev)) {
++ printk("md: can not import %s, has active inodes!\n",
++ partition_name(newdev));
++ err = -EBUSY;
++ goto abort_free;
++ }
++
++ if ((err = alloc_disk_sb(rdev)))
++ goto abort_free;
++
++ rdev->dev = newdev;
++ if (lock_rdev(rdev)) {
++ printk("md: could not lock %s, zero-size? Marking faulty.\n",
++ partition_name(newdev));
++ err = -EINVAL;
++ goto abort_free;
++ }
++ rdev->desc_nr = -1;
++ rdev->faulty = 0;
++
++ size = 0;
++ if (blk_size[MAJOR(newdev)])
++ size = blk_size[MAJOR(newdev)][MINOR(newdev)];
++ if (!size) {
++ printk("md: %s has zero size, marking faulty!\n",
++ partition_name(newdev));
++ err = -EINVAL;
++ goto abort_free;
++ }
++
++ if (on_disk) {
++ if ((err = read_disk_sb(rdev))) {
++ printk("md: could not read %s's sb, not importing!\n",
++ partition_name(newdev));
++ goto abort_free;
++ }
++ if ((err = check_disk_sb(rdev))) {
++ printk("md: %s has invalid sb, not importing!\n",
++ partition_name(newdev));
++ goto abort_free;
++ }
++
++ rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
++ rdev->sb->this_disk.minor);
++ rdev->desc_nr = rdev->sb->this_disk.number;
++ }
++ md_list_add(&rdev->all, &all_raid_disks);
++ MD_INIT_LIST_HEAD(&rdev->pending);
++
++ if (rdev->faulty && rdev->sb)
++ free_disk_sb(rdev);
++ return 0;
++
++abort_free:
++ if (rdev->sb) {
++ if (rdev->inode)
++ unlock_rdev(rdev);
++ free_disk_sb(rdev);
++ }
++ kfree(rdev);
++ return err;
++}
++
++/*
++ * Check a full RAID array for plausibility
++ */
++
++#define INCONSISTENT KERN_ERR \
++"md: fatal superblock inconsistency in %s -- removing from array\n"
++
++#define OUT_OF_DATE KERN_ERR \
++"md: superblock update time inconsistency -- using the most recent one\n"
++
++#define OLD_VERSION KERN_ALERT \
++"md: md%d: unsupported raid array version %d.%d.%d\n"
++
++#define NOT_CLEAN_IGNORE KERN_ERR \
++"md: md%d: raid array is not clean -- starting background reconstruction\n"
++
++#define UNKNOWN_LEVEL KERN_ERR \
++"md: md%d: unsupported raid level %d\n"
++
++static int analyze_sbs (mddev_t * mddev)
++{
++ int out_of_date = 0, i;
++ struct md_list_head *tmp, *tmp2;
++ mdk_rdev_t *rdev, *rdev2, *freshest;
++ mdp_super_t *sb;
++
++ /*
++ * Verify the RAID superblock on each real device
++ */
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->faulty) {
++ MD_BUG();
++ goto abort;
++ }
++ if (!rdev->sb) {
++ MD_BUG();
++ goto abort;
++ }
++ if (check_disk_sb(rdev))
++ goto abort;
++ }
++
++ /*
++ * The superblock constant part has to be the same
++ * for all disks in the array.
++ */
++ sb = NULL;
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (!sb) {
++ sb = rdev->sb;
++ continue;
++ }
++ if (!sb_equal(sb, rdev->sb)) {
++ printk (INCONSISTENT, partition_name(rdev->dev));
++ kick_rdev_from_array(rdev);
++ continue;
++ }
++ }
++
++ /*
++ * OK, we have all disks and the array is ready to run. Let's
++ * find the freshest superblock, that one will be the superblock
++ * that represents the whole array.
++ */
++ if (!mddev->sb)
++ if (alloc_array_sb(mddev))
++ goto abort;
++ sb = mddev->sb;
++ freshest = NULL;
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ __u64 ev1, ev2;
++ /*
++ * if the checksum is invalid, use the superblock
++		 * only as a last resort. (decrease its age by
++ * one event)
++ */
++ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
++ __u64 ev = get_unaligned(&rdev->sb->events);
++ if (ev != (__u64)0) {
++ --ev;
++ put_unaligned(ev,&rdev->sb->events);
++ }
++ }
++
++ printk("%s's event counter: %08lx\n", partition_name(rdev->dev),
++ (unsigned long)get_unaligned(&rdev->sb->events));
++ if (!freshest) {
++ freshest = rdev;
++ continue;
++ }
++ /*
++ * Find the newest superblock version
++ */
++ ev1 = get_unaligned(&rdev->sb->events);
++ ev2 = get_unaligned(&freshest->sb->events);
++ if (ev1 != ev2) {
++ out_of_date = 1;
++ if (ev1 > ev2)
++ freshest = rdev;
++ }
++ }
++ if (out_of_date) {
++ printk(OUT_OF_DATE);
++ printk("freshest: %s\n", partition_name(freshest->dev));
++ }
++ memcpy (sb, freshest->sb, sizeof(*sb));
++
++ /*
++ * at this point we have picked the 'best' superblock
++ * from all available superblocks.
++ * now we validate this superblock and kick out possibly
++ * failed disks.
++ */
++ ITERATE_RDEV(mddev,rdev,tmp) {
++		/*
++		 * Kick all devices that are more than one event behind
++		 * the chosen superblock; being a single event behind is
++		 * tolerated.
++		 */
++ __u64 ev1, ev2;
++ ev1 = get_unaligned(&rdev->sb->events);
++ ev2 = get_unaligned(&sb->events);
++ ++ev1;
++ if (ev1 < ev2) {
++ printk("md: kicking non-fresh %s from array!\n",
++ partition_name(rdev->dev));
++ kick_rdev_from_array(rdev);
++ continue;
++ }
++ }
++
++ /*
++ * Fix up changed device names ... but only if this disk has a
++ * recent update time. Use faulty checksum ones too.
++ */
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ __u64 ev1, ev2, ev3;
++ if (rdev->faulty) { /* REMOVEME */
++ MD_BUG();
++ goto abort;
++ }
++ ev1 = get_unaligned(&rdev->sb->events);
++ ev2 = get_unaligned(&sb->events);
++ ev3 = ev2;
++ --ev3;
++ if ((rdev->dev != rdev->old_dev) &&
++ ((ev1 == ev2) || (ev1 == ev3))) {
++ mdp_disk_t *desc;
++
++ printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev));
++ if (rdev->desc_nr == -1) {
++ MD_BUG();
++ goto abort;
++ }
++ desc = &sb->disks[rdev->desc_nr];
++ if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
++ MD_BUG();
++ goto abort;
++ }
++ desc->major = MAJOR(rdev->dev);
++ desc->minor = MINOR(rdev->dev);
++ desc = &rdev->sb->this_disk;
++ desc->major = MAJOR(rdev->dev);
++ desc->minor = MINOR(rdev->dev);
++ }
++ }
++
++ /*
++ * Remove unavailable and faulty devices ...
++ *
++ * note that if an array becomes completely unrunnable due to
++ * missing devices, we do not write the superblock back, so the
++ * administrator has a chance to fix things up. The removal thus
++ * only happens if it's nonfatal to the contents of the array.
++ */
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ int found;
++ mdp_disk_t *desc;
++ kdev_t dev;
++
++ desc = sb->disks + i;
++ dev = MKDEV(desc->major, desc->minor);
++
++ /*
++ * We kick faulty devices/descriptors immediately.
++ */
++ if (disk_faulty(desc)) {
++ found = 0;
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->desc_nr != desc->number)
++ continue;
++ printk("md%d: kicking faulty %s!\n",
++ mdidx(mddev),partition_name(rdev->dev));
++ kick_rdev_from_array(rdev);
++ found = 1;
++ break;
++ }
++ if (!found) {
++ if (dev == MKDEV(0,0))
++ continue;
++ printk("md%d: removing former faulty %s!\n",
++ mdidx(mddev), partition_name(dev));
++ }
++ remove_descriptor(desc, sb);
++ continue;
++ }
++
++ if (dev == MKDEV(0,0))
++ continue;
++ /*
++ * Is this device present in the rdev ring?
++ */
++ found = 0;
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->desc_nr == desc->number) {
++ found = 1;
++ break;
++ }
++ }
++ if (found)
++ continue;
++
++ printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev));
++ remove_descriptor(desc, sb);
++ }
++
++ /*
++	 * Double check whether all devices mentioned in the
++ * superblock are in the rdev ring.
++ */
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ mdp_disk_t *desc;
++ kdev_t dev;
++
++ desc = sb->disks + i;
++ dev = MKDEV(desc->major, desc->minor);
++
++ if (dev == MKDEV(0,0))
++ continue;
++
++ if (disk_faulty(desc)) {
++ MD_BUG();
++ goto abort;
++ }
++
++ rdev = find_rdev(mddev, dev);
++ if (!rdev) {
++ MD_BUG();
++ goto abort;
++ }
++ }
++
++ /*
++ * Do a final reality check.
++ */
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->desc_nr == -1) {
++ MD_BUG();
++ goto abort;
++ }
++ /*
++ * is the desc_nr unique?
++ */
++ ITERATE_RDEV(mddev,rdev2,tmp2) {
++ if ((rdev2 != rdev) &&
++ (rdev2->desc_nr == rdev->desc_nr)) {
++ MD_BUG();
++ goto abort;
++ }
++ }
++ /*
++ * is the device unique?
++ */
++ ITERATE_RDEV(mddev,rdev2,tmp2) {
++ if ((rdev2 != rdev) &&
++ (rdev2->dev == rdev->dev)) {
++ MD_BUG();
++ goto abort;
++ }
++ }
++ }
++
++ /*
++ * Check if we can support this RAID array
++ */
++ if (sb->major_version != MD_MAJOR_VERSION ||
++ sb->minor_version > MD_MINOR_VERSION) {
++
++ printk (OLD_VERSION, mdidx(mddev), sb->major_version,
++ sb->minor_version, sb->patch_version);
++ goto abort;
++ }
++
++ if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
++ (sb->level == 4) || (sb->level == 5)))
++ printk (NOT_CLEAN_IGNORE, mdidx(mddev));
++
++ return 0;
++abort:
++ return 1;
++}
++
++#undef INCONSISTENT
++#undef OUT_OF_DATE
++#undef OLD_VERSION
++#undef NOT_CLEAN_IGNORE
++
++static int device_size_calculation (mddev_t * mddev)
++{
++ int data_disks = 0, persistent;
++ unsigned int readahead;
++ mdp_super_t *sb = mddev->sb;
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
++
++ /*
++ * Do device size calculation. Bail out if too small.
++ * (we have to do this after having validated chunk_size,
++ * because device size has to be modulo chunk_size)
++ */
++ persistent = !mddev->sb->not_persistent;
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->faulty)
++ continue;
++ if (rdev->size) {
++ MD_BUG();
++ continue;
++ }
++ rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
++ if (rdev->size < sb->chunk_size / 1024) {
++ printk (KERN_WARNING
++ "Dev %s smaller than chunk_size: %dk < %dk\n",
++ partition_name(rdev->dev),
++ rdev->size, sb->chunk_size / 1024);
++ return -EINVAL;
++ }
++ }
++
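++	/*
++	 * Levels -3, -2 and -1 are the HSM, translucent and linear
++	 * personalities. Linear and RAID0 get their size summed up
++	 * by zoned_raid_size(); the other levels use sb->size times
++	 * the number of data disks.
++	 */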
++ switch (sb->level) {
++ case -3:
++ data_disks = 1;
++ break;
++ case -2:
++ data_disks = 1;
++ break;
++ case -1:
++ zoned_raid_size(mddev);
++ data_disks = 1;
++ break;
++ case 0:
++ zoned_raid_size(mddev);
++ data_disks = sb->raid_disks;
++ break;
++ case 1:
++ data_disks = 1;
++ break;
++ case 4:
++ case 5:
++ data_disks = sb->raid_disks-1;
++ break;
++ default:
++ printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
++ goto abort;
++ }
++ if (!md_size[mdidx(mddev)])
++ md_size[mdidx(mddev)] = sb->size * data_disks;
++
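++	/*
++	 * Default to MD_READAHEAD, widen the window to four chunks
++	 * per data disk for the striped levels, and never go below
++	 * one maximum-sized request per data disk. Level -3 (HSM)
++	 * has its readahead switched off instead.
++	 */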
++ readahead = MD_READAHEAD;
++ if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5))
++ readahead = mddev->sb->chunk_size * 4 * data_disks;
++ if (readahead < data_disks * MAX_SECTORS*512*2)
++ readahead = data_disks * MAX_SECTORS*512*2;
++ else {
++ if (sb->level == -3)
++ readahead = 0;
++ }
++ md_maxreadahead[mdidx(mddev)] = readahead;
++
++ printk(KERN_INFO "md%d: max total readahead window set to %dk\n",
++ mdidx(mddev), readahead/1024);
++
++ printk(KERN_INFO
++ "md%d: %d data-disks, max readahead per data-disk: %dk\n",
++ mdidx(mddev), data_disks, readahead/data_disks/1024);
++ return 0;
++abort:
++ return 1;
++}
++
++
++#define TOO_BIG_CHUNKSIZE KERN_ERR \
++"too big chunk_size: %d > %d\n"
++
++#define TOO_SMALL_CHUNKSIZE KERN_ERR \
++"too small chunk_size: %d < %ld\n"
++
++#define BAD_CHUNKSIZE KERN_ERR \
++"no chunksize specified, see 'man raidtab'\n"
++
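++/*
++ * Start an assembled array: analyze and validate the member
++ * superblocks, sanity-check the chunk size, then hand the array
++ * over to its personality's run() method and mark it active.
++ */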
++static int do_md_run (mddev_t * mddev)
++{
++ int pnum, err;
++ int chunk_size;
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev;
++
++
++ if (!mddev->nb_dev) {
++ MD_BUG();
++ return -EINVAL;
++ }
++
++ if (mddev->pers)
++ return -EBUSY;
++
++ /*
++ * Resize disks to align partitions size on a given
++ * chunk size.
++ */
++ md_size[mdidx(mddev)] = 0;
++
++ /*
++ * Analyze all RAID superblock(s)
++ */
++ if (analyze_sbs(mddev)) {
++ MD_BUG();
++ return -EINVAL;
++ }
++
++ chunk_size = mddev->sb->chunk_size;
++ pnum = level_to_pers(mddev->sb->level);
++
++ mddev->param.chunk_size = chunk_size;
++ mddev->param.personality = pnum;
++
++ if (chunk_size > MAX_CHUNK_SIZE) {
++ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
++ return -EINVAL;
++ }
++ /*
++ * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
++ */
++ if ( (1 << ffz(~chunk_size)) != chunk_size) {
++ MD_BUG();
++ return -EINVAL;
++ }
++ if (chunk_size < PAGE_SIZE) {
++ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
++ return -EINVAL;
++ }
++
++ if (pnum >= MAX_PERSONALITY) {
++ MD_BUG();
++ return -EINVAL;
++ }
++
++ if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
++ /*
++ * 'default chunksize' in the old md code used to
++ * be PAGE_SIZE, baaad.
++		 * we abort here to be on the safe side. We don't
++ * want to continue the bad practice.
++ */
++ printk(BAD_CHUNKSIZE);
++ return -EINVAL;
++ }
++
++ if (!pers[pnum])
++ {
++#ifdef CONFIG_KMOD
++ char module_name[80];
++ sprintf (module_name, "md-personality-%d", pnum);
++ request_module (module_name);
++ if (!pers[pnum])
++#endif
++ return -EINVAL;
++ }
++
++ if (device_size_calculation(mddev))
++ return -EINVAL;
++
++ /*
++ * Drop all container device buffers, from now on
++ * the only valid external interface is through the md
++ * device.
++ */
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->faulty)
++ continue;
++ fsync_dev(rdev->dev);
++ invalidate_buffers(rdev->dev);
++ }
++
++ mddev->pers = pers[pnum];
++
++ err = mddev->pers->run(mddev);
++ if (err) {
++ printk("pers->run() failed ...\n");
++ mddev->pers = NULL;
++ return -EINVAL;
++ }
++
++ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
++ md_update_sb(mddev);
++
++ /*
++ * md_size has units of 1K blocks, which are
++ * twice as large as sectors.
++ */
++ md_hd_struct[mdidx(mddev)].start_sect = 0;
++ md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1;
++
++ read_ahead[MD_MAJOR] = 1024;
++ return (0);
++}
++
++#undef TOO_BIG_CHUNKSIZE
++#undef TOO_SMALL_CHUNKSIZE
++#undef BAD_CHUNKSIZE
++
++#define OUT(x) do { err = (x); goto out; } while (0)
++
++static int restart_array (mddev_t *mddev)
++{
++ int err = 0;
++
++ /*
++ * Complain if it has no devices
++ */
++ if (!mddev->nb_dev)
++ OUT(-ENXIO);
++
++ if (mddev->pers) {
++ if (!mddev->ro)
++ OUT(-EBUSY);
++
++ mddev->ro = 0;
++ set_device_ro(mddev_to_kdev(mddev), 0);
++
++ printk (KERN_INFO
++ "md%d switched to read-write mode.\n", mdidx(mddev));
++ /*
++ * Kick recovery or resync if necessary
++ */
++ md_recover_arrays();
++ if (mddev->pers->restart_resync)
++ mddev->pers->restart_resync(mddev);
++ } else
++ err = -EINVAL;
++
++out:
++ return err;
++}
++
++#define STILL_MOUNTED KERN_WARNING \
++"md: md%d still mounted.\n"
++
++static int do_md_stop (mddev_t * mddev, int ro)
++{
++ int err = 0, resync_interrupted = 0;
++ kdev_t dev = mddev_to_kdev(mddev);
++
++ if (!ro && !fs_may_mount (dev)) {
++ printk (STILL_MOUNTED, mdidx(mddev));
++ OUT(-EBUSY);
++ }
++
++ /*
++ * complain if it's already stopped
++ */
++ if (!mddev->nb_dev)
++ OUT(-ENXIO);
++
++ if (mddev->pers) {
++ /*
++ * It is safe to call stop here, it only frees private
++ * data. Also, it tells us if a device is unstoppable
++ * (eg. resyncing is in progress)
++ */
++ if (mddev->pers->stop_resync)
++ if (mddev->pers->stop_resync(mddev))
++ resync_interrupted = 1;
++
++ if (mddev->recovery_running)
++ md_interrupt_thread(md_recovery_thread);
++
++ /*
++ * This synchronizes with signal delivery to the
++ * resync or reconstruction thread. It also nicely
++ * hangs the process if some reconstruction has not
++ * finished.
++ */
++ down(&mddev->recovery_sem);
++ up(&mddev->recovery_sem);
++
++ /*
++ * sync and invalidate buffers because we cannot kill the
++ * main thread with valid IO transfers still around.
++ * the kernel lock protects us from new requests being
++ * added after invalidate_buffers().
++ */
++ fsync_dev (mddev_to_kdev(mddev));
++ fsync_dev (dev);
++ invalidate_buffers (dev);
++
++ if (ro) {
++ if (mddev->ro)
++ OUT(-ENXIO);
++ mddev->ro = 1;
++ } else {
++ if (mddev->ro)
++ set_device_ro(dev, 0);
++ if (mddev->pers->stop(mddev)) {
++ if (mddev->ro)
++ set_device_ro(dev, 1);
++ OUT(-EBUSY);
++ }
++ if (mddev->ro)
++ mddev->ro = 0;
++ }
++ if (mddev->sb) {
++ /*
++ * mark it clean only if there was no resync
++ * interrupted.
++ */
++ if (!mddev->recovery_running && !resync_interrupted) {
++ printk("marking sb clean...\n");
++ mddev->sb->state |= 1 << MD_SB_CLEAN;
++ }
++ md_update_sb(mddev);
++ }
++ if (ro)
++ set_device_ro(dev, 1);
++ }
++
++ /*
++ * Free resources if final stop
++ */
++ if (!ro) {
++ export_array(mddev);
++ md_size[mdidx(mddev)] = 0;
++ md_hd_struct[mdidx(mddev)].nr_sects = 0;
++ free_mddev(mddev);
++
++ printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
++ } else
++ printk (KERN_INFO
++ "md%d switched to read-only mode.\n", mdidx(mddev));
++out:
++ return err;
++}
++
++#undef OUT
++
++/*
++ * We have to safely support old arrays too.
++ */
++int detect_old_array (mdp_super_t *sb)
++{
++ if (sb->major_version > 0)
++ return 0;
++ if (sb->minor_version >= 90)
++ return 0;
++
++ return -EINVAL;
++}
++
++
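++/*
++ * Try to run one freshly collected array; if do_md_run() fails,
++ * stop the half-assembled array again without writing the
++ * superblocks back.
++ */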
++static void autorun_array (mddev_t *mddev)
++{
++ mdk_rdev_t *rdev;
++ struct md_list_head *tmp;
++ int err;
++
++ if (mddev->disks.prev == &mddev->disks) {
++ MD_BUG();
++ return;
++ }
++
++ printk("running: ");
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ printk("<%s>", partition_name(rdev->dev));
++ }
++ printk("\nnow!\n");
++
++ err = do_md_run (mddev);
++ if (err) {
++ printk("do_md_run() returned %d\n", err);
++ /*
++ * prevent the writeback of an unrunnable array
++ */
++ mddev->sb_dirty = 0;
++ do_md_stop (mddev, 0);
++ }
++}
++
++/*
++ * lets try to run arrays based on all disks that have arrived
++ * until now. (those are in the ->pending list)
++ *
++ * the method: pick the first pending disk, collect all disks with
++ * the same UUID, remove all from the pending list and put them into
++ * the 'same_array' list. Then order this list based on superblock
++ * update time (freshest comes first), kick out 'old' disks and
++ * compare superblocks. If everything's fine then run it.
++ */
++static void autorun_devices (void)
++{
++ struct md_list_head candidates;
++ struct md_list_head *tmp;
++ mdk_rdev_t *rdev0, *rdev;
++ mddev_t *mddev;
++ kdev_t md_kdev;
++
++
++ printk("autorun ...\n");
++ while (pending_raid_disks.next != &pending_raid_disks) {
++ rdev0 = md_list_entry(pending_raid_disks.next,
++ mdk_rdev_t, pending);
++
++ printk("considering %s ...\n", partition_name(rdev0->dev));
++ MD_INIT_LIST_HEAD(&candidates);
++ ITERATE_RDEV_PENDING(rdev,tmp) {
++ if (uuid_equal(rdev0, rdev)) {
++ if (!sb_equal(rdev0->sb, rdev->sb)) {
++ printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev));
++ continue;
++ }
++ printk(" adding %s ...\n", partition_name(rdev->dev));
++ md_list_del(&rdev->pending);
++ md_list_add(&rdev->pending, &candidates);
++ }
++ }
+ /*
+- * Check the superblock for consistency.
+- * The personality itself has to check whether it's getting
+- * added with the proper flags. The personality has to be
+- * checked too. ;)
++ * now we have a set of devices, with all of them having
++ * mostly sane superblocks. It's time to allocate the
++ * mddev.
+ */
+- if (analyze_one_sb (realdev))
++ md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
++ mddev = kdev_to_mddev(md_kdev);
++ if (mddev) {
++ printk("md%d already running, cannot run %s\n",
++ mdidx(mddev), partition_name(rdev0->dev));
++ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
++ export_rdev(rdev);
++ continue;
++ }
++ mddev = alloc_mddev(md_kdev);
++ printk("created md%d\n", mdidx(mddev));
++ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
++ bind_rdev_to_array(rdev, mddev);
++ md_list_del(&rdev->pending);
++ MD_INIT_LIST_HEAD(&rdev->pending);
++ }
++ autorun_array(mddev);
++ }
++ printk("... autorun DONE.\n");
++}
++
++/*
++ * import RAID devices based on one partition
++ * if possible, the array gets run as well.
++ */
++
++#define BAD_VERSION KERN_ERR \
++"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
++
++#define OUT_OF_MEM KERN_ALERT \
++"md: out of memory.\n"
++
++#define NO_DEVICE KERN_ERR \
++"md: disabled device %s\n"
++
++#define AUTOADD_FAILED KERN_ERR \
++"md: auto-adding devices to md%d FAILED (error %d).\n"
++
++#define AUTOADD_FAILED_USED KERN_ERR \
++"md: cannot auto-add device %s to md%d, already used.\n"
++
++#define AUTORUN_FAILED KERN_ERR \
++"md: auto-running md%d FAILED (error %d).\n"
++
++#define MDDEV_BUSY KERN_ERR \
++"md: cannot auto-add to md%d, already running.\n"
++
++#define AUTOADDING KERN_INFO \
++"md: auto-adding devices to md%d, based on %s's superblock.\n"
++
++#define AUTORUNNING KERN_INFO \
++"md: auto-running md%d.\n"
++
++static int autostart_array (kdev_t startdev)
++{
++ int err = -EINVAL, i;
++ mdp_super_t *sb = NULL;
++ mdk_rdev_t *start_rdev = NULL, *rdev;
++
++ if (md_import_device(startdev, 1)) {
++ printk("could not import %s!\n", partition_name(startdev));
++ goto abort;
++ }
++
++ start_rdev = find_rdev_all(startdev);
++ if (!start_rdev) {
++ MD_BUG();
++ goto abort;
++ }
++ if (start_rdev->faulty) {
++ printk("can not autostart based on faulty %s!\n",
++ partition_name(startdev));
++ goto abort;
++ }
++ md_list_add(&start_rdev->pending, &pending_raid_disks);
++
++ sb = start_rdev->sb;
++
++ err = detect_old_array(sb);
++ if (err) {
++ printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
++ goto abort;
++ }
++
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ mdp_disk_t *desc;
++ kdev_t dev;
++
++ desc = sb->disks + i;
++ dev = MKDEV(desc->major, desc->minor);
++
++ if (dev == MKDEV(0,0))
++ continue;
++ if (dev == startdev)
++ continue;
++ if (md_import_device(dev, 1)) {
++ printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev));
++ continue;
++ }
++ rdev = find_rdev_all(dev);
++ if (!rdev) {
++ MD_BUG();
++ goto abort;
++ }
++ md_list_add(&rdev->pending, &pending_raid_disks);
++ }
++
++ /*
++	 * FIXME: possibly return error codes from autorun_devices()
++ */
++ autorun_devices();
++ return 0;
++
++abort:
++ if (start_rdev)
++ export_rdev(start_rdev);
++ return err;
++}
++
++#undef BAD_VERSION
++#undef OUT_OF_MEM
++#undef NO_DEVICE
++#undef AUTOADD_FAILED_USED
++#undef AUTOADD_FAILED
++#undef AUTORUN_FAILED
++#undef AUTOADDING
++#undef AUTORUNNING
++
++struct {
++ int set;
++ int noautodetect;
++
++} raid_setup_args md__initdata = { 0, 0 };
++
++/*
++ * Searches all registered partitions for autorun RAID arrays
++ * at boot time.
++ */
++md__initfunc(void autodetect_raid(void))
++{
++#ifdef CONFIG_AUTODETECT_RAID
++ struct gendisk *disk;
++ mdk_rdev_t *rdev;
++ int i;
++
++ if (raid_setup_args.noautodetect) {
++ printk(KERN_INFO "skipping autodetection of RAID arrays\n");
++ return;
++ }
++ printk(KERN_INFO "autodetecting RAID arrays\n");
++
++ for (disk = gendisk_head ; disk ; disk = disk->next) {
++ for (i = 0; i < disk->max_p*disk->max_nr; i++) {
++ kdev_t dev = MKDEV(disk->major,i);
++
++ if (disk->part[i].type == LINUX_OLD_RAID_PARTITION) {
++ printk(KERN_ALERT
++"md: %s's partition type has to be changed from type 0x86 to type 0xfd\n"
++" to maintain interoperability with other OSs! Autodetection support for\n"
++" type 0x86 will be deleted after some migration timeout. Sorry.\n",
++ partition_name(dev));
++ disk->part[i].type = LINUX_RAID_PARTITION;
++ }
++ if (disk->part[i].type != LINUX_RAID_PARTITION)
++ continue;
++
++ if (md_import_device(dev,1)) {
++ printk(KERN_ALERT "could not import %s!\n",
++ partition_name(dev));
++ continue;
++ }
++ /*
++ * Sanity checks:
++ */
++ rdev = find_rdev_all(dev);
++ if (!rdev) {
++ MD_BUG();
++ continue;
++ }
++ if (rdev->faulty) {
++ MD_BUG();
++ continue;
++ }
++ md_list_add(&rdev->pending, &pending_raid_disks);
++ }
++ }
++
++ autorun_devices();
++#endif
++}
++
++static int get_version (void * arg)
++{
++ mdu_version_t ver;
++
++ ver.major = MD_MAJOR_VERSION;
++ ver.minor = MD_MINOR_VERSION;
++ ver.patchlevel = MD_PATCHLEVEL_VERSION;
++
++ if (md_copy_to_user(arg, &ver, sizeof(ver)))
++ return -EFAULT;
++
++ return 0;
++}
++
++#define SET_FROM_SB(x) info.x = mddev->sb->x
++static int get_array_info (mddev_t * mddev, void * arg)
++{
++ mdu_array_info_t info;
++
++ if (!mddev->sb)
++ return -EINVAL;
++
++ SET_FROM_SB(major_version);
++ SET_FROM_SB(minor_version);
++ SET_FROM_SB(patch_version);
++ SET_FROM_SB(ctime);
++ SET_FROM_SB(level);
++ SET_FROM_SB(size);
++ SET_FROM_SB(nr_disks);
++ SET_FROM_SB(raid_disks);
++ SET_FROM_SB(md_minor);
++ SET_FROM_SB(not_persistent);
++
++ SET_FROM_SB(utime);
++ SET_FROM_SB(state);
++ SET_FROM_SB(active_disks);
++ SET_FROM_SB(working_disks);
++ SET_FROM_SB(failed_disks);
++ SET_FROM_SB(spare_disks);
++
++ SET_FROM_SB(layout);
++ SET_FROM_SB(chunk_size);
++
++ if (md_copy_to_user(arg, &info, sizeof(info)))
++ return -EFAULT;
++
++ return 0;
++}
++#undef SET_FROM_SB
++
++#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
++static int get_disk_info (mddev_t * mddev, void * arg)
++{
++ mdu_disk_info_t info;
++ unsigned int nr;
++
++ if (!mddev->sb)
++ return -EINVAL;
++
++ if (md_copy_from_user(&info, arg, sizeof(info)))
++ return -EFAULT;
++
++ nr = info.number;
++ if (nr >= mddev->sb->nr_disks)
++ return -EINVAL;
++
++ SET_FROM_SB(major);
++ SET_FROM_SB(minor);
++ SET_FROM_SB(raid_disk);
++ SET_FROM_SB(state);
++
++ if (md_copy_to_user(arg, &info, sizeof(info)))
++ return -EFAULT;
++
++ return 0;
++}
++#undef SET_FROM_SB
++
++#define SET_SB(x) mddev->sb->disks[nr].x = info.x
++
++static int add_new_disk (mddev_t * mddev, void * arg)
++{
++ int err, size, persistent;
++ mdu_disk_info_t info;
++ mdk_rdev_t *rdev;
++ unsigned int nr;
++ kdev_t dev;
++
++ if (!mddev->sb)
++ return -EINVAL;
++
++ if (md_copy_from_user(&info, arg, sizeof(info)))
++ return -EFAULT;
++
++ nr = info.number;
++ if (nr >= mddev->sb->nr_disks)
++ return -EINVAL;
++
++ dev = MKDEV(info.major,info.minor);
++
++ if (find_rdev_all(dev)) {
++ printk("device %s already used in a RAID array!\n",
++ partition_name(dev));
++ return -EBUSY;
++ }
++
++ SET_SB(number);
++ SET_SB(major);
++ SET_SB(minor);
++ SET_SB(raid_disk);
++ SET_SB(state);
++
++ if ((info.state & (1<<MD_DISK_FAULTY))==0) {
++ err = md_import_device (dev, 0);
++ if (err) {
++ printk("md: error, md_import_device() returned %d\n", err);
++ return -EINVAL;
++ }
++ rdev = find_rdev_all(dev);
++ if (!rdev) {
++ MD_BUG();
+ return -EINVAL;
++ }
++
++ rdev->old_dev = dev;
++ rdev->desc_nr = info.number;
++
++ bind_rdev_to_array(rdev, mddev);
++
++ persistent = !mddev->sb->not_persistent;
++ if (!persistent)
++ printk("nonpersistent superblock ...\n");
++ if (!mddev->sb->chunk_size)
++ printk("no chunksize?\n");
++
++ size = calc_dev_size(dev, mddev, persistent);
++ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
++
++ if (!mddev->sb->size || (mddev->sb->size > size))
++ mddev->sb->size = size;
++ }
++
++ /*
++ * sync all other superblocks with the main superblock
++ */
++ sync_sbs(mddev);
++
++ return 0;
++}
++#undef SET_SB
++
++static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
++{
++ int err;
++ mdk_rdev_t *rdev;
++ mdp_disk_t *disk;
++
++ if (!mddev->pers)
++ return -ENODEV;
++
++ printk("trying to remove %s from md%d ... \n",
++ partition_name(dev), mdidx(mddev));
++
++ if (!mddev->pers->diskop) {
++ printk("md%d: personality does not support diskops!\n",
++ mdidx(mddev));
++ return -EINVAL;
++ }
++
++ rdev = find_rdev(mddev, dev);
++ if (!rdev)
++ return -ENXIO;
++
++ if (rdev->desc_nr == -1) {
++ MD_BUG();
++ return -EINVAL;
++ }
++ disk = &mddev->sb->disks[rdev->desc_nr];
++ if (disk_active(disk))
++ goto busy;
++ if (disk_removed(disk)) {
++ MD_BUG();
++ return -EINVAL;
++ }
++
++ err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
++ if (err == -EBUSY)
++ goto busy;
++ if (err) {
++ MD_BUG();
++ return -EINVAL;
++ }
++
++ remove_descriptor(disk, mddev->sb);
++ kick_rdev_from_array(rdev);
++ mddev->sb_dirty = 1;
++ md_update_sb(mddev);
++
++ return 0;
++busy:
++ printk("cannot remove active disk %s from md%d ... \n",
++ partition_name(dev), mdidx(mddev));
++ return -EBUSY;
++}
++
++static int hot_add_disk (mddev_t * mddev, kdev_t dev)
++{
++ int i, err, persistent;
++ unsigned int size;
++ mdk_rdev_t *rdev;
++ mdp_disk_t *disk;
++
++ if (!mddev->pers)
++ return -ENODEV;
++
++ printk("trying to hot-add %s to md%d ... \n",
++ partition_name(dev), mdidx(mddev));
++
++ if (!mddev->pers->diskop) {
++ printk("md%d: personality does not support diskops!\n",
++ mdidx(mddev));
++ return -EINVAL;
++ }
++
++ persistent = !mddev->sb->not_persistent;
++ size = calc_dev_size(dev, mddev, persistent);
++
++ if (size < mddev->sb->size) {
++ printk("md%d: disk size %d blocks < array size %d\n",
++ mdidx(mddev), size, mddev->sb->size);
++ return -ENOSPC;
++ }
++
++ rdev = find_rdev(mddev, dev);
++ if (rdev)
++ return -EBUSY;
++
++ err = md_import_device (dev, 0);
++ if (err) {
++ printk("md: error, md_import_device() returned %d\n", err);
++ return -EINVAL;
++ }
++ rdev = find_rdev_all(dev);
++ if (!rdev) {
++ MD_BUG();
++ return -EINVAL;
++ }
++ if (rdev->faulty) {
++ printk("md: can not hot-add faulty %s disk to md%d!\n",
++ partition_name(dev), mdidx(mddev));
++ err = -EINVAL;
++ goto abort_export;
++ }
++ bind_rdev_to_array(rdev, mddev);
++
++ /*
++ * The rest should better be atomic, we can have disk failures
++ * noticed in interrupt contexts ...
++ */
++ cli();
++ rdev->old_dev = dev;
++ rdev->size = size;
++ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
++
++ disk = mddev->sb->disks + mddev->sb->raid_disks;
++ for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
++ disk = mddev->sb->disks + i;
++
++ if (!disk->major && !disk->minor)
++ break;
++ if (disk_removed(disk))
++ break;
++ }
++ if (i == MD_SB_DISKS) {
++ sti();
++ printk("md%d: can not hot-add to full array!\n", mdidx(mddev));
++ err = -EBUSY;
++ goto abort_unbind_export;
++ }
++
++ if (disk_removed(disk)) {
+ /*
+- * hot_add has to bump up nb_dev itself
++ * reuse slot
+ */
+- if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) {
+- /*
+- * FIXME: here we should free up the inode and stuff
+- */
+- printk ("FIXME\n");
+- return -EINVAL;
++ if (disk->number != i) {
++ sti();
++ MD_BUG();
++ err = -EINVAL;
++ goto abort_unbind_export;
+ }
+- } else
+- md_dev[minor].nb_dev++;
++ } else {
++ disk->number = i;
++ }
+
+- printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
+- return (0);
++ disk->raid_disk = disk->number;
++ disk->major = MAJOR(dev);
++ disk->minor = MINOR(dev);
++
++ if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
++ sti();
++ MD_BUG();
++ err = -EINVAL;
++ goto abort_unbind_export;
++ }
++
++ mark_disk_spare(disk);
++ mddev->sb->nr_disks++;
++ mddev->sb->spare_disks++;
++ mddev->sb->working_disks++;
++
++ mddev->sb_dirty = 1;
++
++ sti();
++ md_update_sb(mddev);
++
++ /*
++ * Kick recovery, maybe this spare has to be added to the
++ * array immediately.
++ */
++ md_recover_arrays();
++
++ return 0;
++
++abort_unbind_export:
++ unbind_rdev_from_array(rdev);
++
++abort_export:
++ export_rdev(rdev);
++ return err;
++}
++
++#define SET_SB(x) mddev->sb->x = info.x
++static int set_array_info (mddev_t * mddev, void * arg)
++{
++ mdu_array_info_t info;
++
++ if (mddev->sb) {
++ printk("array md%d already has a superblock!\n",
++ mdidx(mddev));
++ return -EBUSY;
++ }
++
++ if (md_copy_from_user(&info, arg, sizeof(info)))
++ return -EFAULT;
++
++ if (alloc_array_sb(mddev))
++ return -ENOMEM;
++
++ mddev->sb->major_version = MD_MAJOR_VERSION;
++ mddev->sb->minor_version = MD_MINOR_VERSION;
++ mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
++ mddev->sb->ctime = CURRENT_TIME;
++
++ SET_SB(level);
++ SET_SB(size);
++ SET_SB(nr_disks);
++ SET_SB(raid_disks);
++ SET_SB(md_minor);
++ SET_SB(not_persistent);
++
++ SET_SB(state);
++ SET_SB(active_disks);
++ SET_SB(working_disks);
++ SET_SB(failed_disks);
++ SET_SB(spare_disks);
++
++ SET_SB(layout);
++ SET_SB(chunk_size);
++
++ mddev->sb->md_magic = MD_SB_MAGIC;
++
++ /*
++ * Generate a 128 bit UUID
++ */
++ get_random_bytes(&mddev->sb->set_uuid0, 4);
++ get_random_bytes(&mddev->sb->set_uuid1, 4);
++ get_random_bytes(&mddev->sb->set_uuid2, 4);
++ get_random_bytes(&mddev->sb->set_uuid3, 4);
++
++ return 0;
++}
++#undef SET_SB
++
++static int set_disk_info (mddev_t * mddev, void * arg)
++{
++ printk("not yet");
++ return -EINVAL;
++}
++
++static int clear_array (mddev_t * mddev)
++{
++ printk("not yet");
++ return -EINVAL;
++}
++
++static int write_raid_info (mddev_t * mddev)
++{
++ printk("not yet");
++ return -EINVAL;
++}
++
++static int protect_array (mddev_t * mddev)
++{
++ printk("not yet");
++ return -EINVAL;
++}
++
++static int unprotect_array (mddev_t * mddev)
++{
++ printk("not yet");
++ return -EINVAL;
++}
++
++static int set_disk_faulty (mddev_t *mddev, kdev_t dev)
++{
++ int ret;
++
++ fsync_dev(mddev_to_kdev(mddev));
++ ret = md_error(mddev_to_kdev(mddev), dev);
++ return ret;
+ }
+
+ static int md_ioctl (struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+ {
+- int minor, err;
+- struct hd_geometry *loc = (struct hd_geometry *) arg;
++ unsigned int minor;
++ int err = 0;
++ struct hd_geometry *loc = (struct hd_geometry *) arg;
++ mddev_t *mddev = NULL;
++ kdev_t dev;
+
+- if (!capable(CAP_SYS_ADMIN))
+- return -EACCES;
++ if (!md_capable_admin())
++ return -EACCES;
+
+- if (((minor=MINOR(inode->i_rdev)) & 0x80) &&
+- (minor & 0x7f) < MAX_PERSONALITY &&
+- pers[minor & 0x7f] &&
+- pers[minor & 0x7f]->ioctl)
+- return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg));
+-
+- if (minor >= MAX_MD_DEV)
+- return -EINVAL;
++ dev = inode->i_rdev;
++ minor = MINOR(dev);
++ if (minor >= MAX_MD_DEVS)
++ return -EINVAL;
+
+- switch (cmd)
+- {
+- case REGISTER_DEV:
+- return do_md_add (minor, to_kdev_t ((dev_t) arg));
++ /*
++ * Commands dealing with the RAID driver but not any
++ * particular array:
++ */
++ switch (cmd)
++ {
++ case RAID_VERSION:
++ err = get_version((void *)arg);
++ goto done;
++
++		case PRINT_RAID_DEBUG:
++			err = 0;
++			md_print_devices();
++			goto done;
++
++ case BLKGETSIZE: /* Return device size */
++ if (!arg) {
++ err = -EINVAL;
++ goto abort;
++ }
++ err = md_put_user(md_hd_struct[minor].nr_sects,
++ (long *) arg);
++ goto done;
+
+- case START_MD:
+- return do_md_run (minor, (int) arg);
++ case BLKFLSBUF:
++ fsync_dev(dev);
++ invalidate_buffers(dev);
++ goto done;
+
+- case STOP_MD:
+- return do_md_stop (minor, inode);
+-
+- case BLKGETSIZE: /* Return device size */
+- if (!arg) return -EINVAL;
+- err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg);
+- if (err)
+- return err;
+- break;
+-
+- case BLKFLSBUF:
+- fsync_dev (inode->i_rdev);
+- invalidate_buffers (inode->i_rdev);
+- break;
+-
+- case BLKRASET:
+- if (arg > 0xff)
+- return -EINVAL;
+- read_ahead[MAJOR(inode->i_rdev)] = arg;
+- return 0;
+-
+- case BLKRAGET:
+- if (!arg) return -EINVAL;
+- err = put_user (read_ahead[MAJOR(inode->i_rdev)], (long *) arg);
+- if (err)
+- return err;
+- break;
+-
+- /* We have a problem here : there is no easy way to give a CHS
+- virtual geometry. We currently pretend that we have a 2 heads
+- 4 sectors (with a BIG number of cylinders...). This drives dosfs
+- just mad... ;-) */
+-
+- case HDIO_GETGEO:
+- if (!loc) return -EINVAL;
+- err = put_user (2, (char *) &loc->heads);
+- if (err)
+- return err;
+- err = put_user (4, (char *) &loc->sectors);
+- if (err)
+- return err;
+- err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders);
+- if (err)
+- return err;
+- err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect,
+- (long *) &loc->start);
+- if (err)
+- return err;
+- break;
+-
+- RO_IOCTLS(inode->i_rdev,arg);
++ case BLKRASET:
++ if (arg > 0xff) {
++ err = -EINVAL;
++ goto abort;
++ }
++ read_ahead[MAJOR(dev)] = arg;
++ goto done;
+
+- default:
+- return -EINVAL;
+- }
++ case BLKRAGET:
++ if (!arg) {
++ err = -EINVAL;
++ goto abort;
++ }
++			err = md_put_user (read_ahead[MAJOR(dev)], (long *) arg);
++			goto done;
++		default:
++			break;
++	}
++
++ /*
++ * Commands creating/starting a new array:
++ */
++
++ mddev = kdev_to_mddev(dev);
++
++ switch (cmd)
++ {
++ case SET_ARRAY_INFO:
++ case START_ARRAY:
++ if (mddev) {
++ printk("array md%d already exists!\n",
++ mdidx(mddev));
++ err = -EEXIST;
++ goto abort;
++ }
++		default:
++			break;
++	}
++
++ switch (cmd)
++ {
++ case SET_ARRAY_INFO:
++ mddev = alloc_mddev(dev);
++ if (!mddev) {
++ err = -ENOMEM;
++ goto abort;
++ }
++ /*
++ * alloc_mddev() should possibly self-lock.
++ */
++ err = lock_mddev(mddev);
++ if (err) {
++ printk("ioctl, reason %d, cmd %d\n", err, cmd);
++ goto abort;
++ }
++ err = set_array_info(mddev, (void *)arg);
++ if (err) {
++ printk("couldnt set array info. %d\n", err);
++ goto abort;
++ }
++ goto done_unlock;
++
++ case START_ARRAY:
++ /*
++ * possibly make it lock the array ...
++ */
++ err = autostart_array((kdev_t)arg);
++ if (err) {
++ printk("autostart %s failed!\n",
++ partition_name((kdev_t)arg));
++ goto abort;
++ }
++ goto done;
++
++		default:
++			break;
++	}
++
++ /*
++ * Commands querying/configuring an existing array:
++ */
++
++ if (!mddev) {
++ err = -ENODEV;
++ goto abort;
++ }
++ err = lock_mddev(mddev);
++ if (err) {
++ printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
++ goto abort;
++ }
++
++ /*
++ * Commands even a read-only array can execute:
++ */
++ switch (cmd)
++ {
++ case GET_ARRAY_INFO:
++ err = get_array_info(mddev, (void *)arg);
++ goto done_unlock;
++
++ case GET_DISK_INFO:
++ err = get_disk_info(mddev, (void *)arg);
++ goto done_unlock;
++
++ case RESTART_ARRAY_RW:
++ err = restart_array(mddev);
++ goto done_unlock;
++
++ case STOP_ARRAY:
++ err = do_md_stop (mddev, 0);
++ goto done_unlock;
++
++ case STOP_ARRAY_RO:
++ err = do_md_stop (mddev, 1);
++ goto done_unlock;
++
++ /*
++ * We have a problem here : there is no easy way to give a CHS
++ * virtual geometry. We currently pretend that we have a 2 heads
++ * 4 sectors (with a BIG number of cylinders...). This drives
++ * dosfs just mad... ;-)
++ */
++ case HDIO_GETGEO:
++ if (!loc) {
++ err = -EINVAL;
++ goto abort_unlock;
++ }
++ err = md_put_user (2, (char *) &loc->heads);
++ if (err)
++ goto abort_unlock;
++ err = md_put_user (4, (char *) &loc->sectors);
++ if (err)
++ goto abort_unlock;
++ err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
++ (short *) &loc->cylinders);
++ if (err)
++ goto abort_unlock;
++ err = md_put_user (md_hd_struct[minor].start_sect,
++ (long *) &loc->start);
++ goto done_unlock;
++ }
+
+- return (0);
++ /*
++ * The remaining ioctls are changing the state of the
++ * superblock, so we do not allow read-only arrays
++ * here:
++ */
++ if (mddev->ro) {
++ err = -EROFS;
++ goto abort_unlock;
++ }
++
++ switch (cmd)
++ {
++ case CLEAR_ARRAY:
++ err = clear_array(mddev);
++ goto done_unlock;
++
++ case ADD_NEW_DISK:
++ err = add_new_disk(mddev, (void *)arg);
++ goto done_unlock;
++
++ case HOT_REMOVE_DISK:
++ err = hot_remove_disk(mddev, (kdev_t)arg);
++ goto done_unlock;
++
++ case HOT_ADD_DISK:
++ err = hot_add_disk(mddev, (kdev_t)arg);
++ goto done_unlock;
++
++ case SET_DISK_INFO:
++ err = set_disk_info(mddev, (void *)arg);
++ goto done_unlock;
++
++ case WRITE_RAID_INFO:
++ err = write_raid_info(mddev);
++ goto done_unlock;
++
++ case UNPROTECT_ARRAY:
++ err = unprotect_array(mddev);
++ goto done_unlock;
++
++ case PROTECT_ARRAY:
++ err = protect_array(mddev);
++ goto done_unlock;
++
++ case SET_DISK_FAULTY:
++ err = set_disk_faulty(mddev, (kdev_t)arg);
++ goto done_unlock;
++
++ case RUN_ARRAY:
++ {
++ mdu_param_t param;
++
++ err = md_copy_from_user(¶m, (mdu_param_t *)arg,
++ sizeof(param));
++ if (err)
++ goto abort_unlock;
++
++ err = do_md_run (mddev);
++ /*
++ * we have to clean up the mess if
++ * the array cannot be run for some
++ * reason ...
++ */
++ if (err) {
++ mddev->sb_dirty = 0;
++ do_md_stop (mddev, 0);
++ }
++ goto done_unlock;
++ }
++
++ default:
++ printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current->comm, current->pid);
++ err = -EINVAL;
++ goto abort_unlock;
++ }
++
++done_unlock:
++abort_unlock:
++ if (mddev)
++ unlock_mddev(mddev);
++ else
++ printk("huh11?\n");
++
++ return err;
++done:
++ if (err)
++ printk("huh12?\n");
++abort:
++ return err;
+ }
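
For reference, the new ioctl interface above is meant to be driven from
user space roughly like this -- a minimal sketch assuming the 0.90
md_u.h definitions (GET_ARRAY_INFO, mdu_array_info_t) shipped with this
patch; the device path and the trimmed error handling are illustrative
only:

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/raid/md_u.h>

	int main(void)
	{
		mdu_array_info_t info;
		int fd = open("/dev/md0", O_RDONLY);

		if (fd < 0)
			return 1;
		if (ioctl(fd, GET_ARRAY_INFO, &info) == 0)
			printf("md0: level %d, %d raid disks, %d active\n",
			       info.level, info.raid_disks, info.active_disks);
		close(fd);
		return 0;
	}
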
+
++
++#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
++
+ static int md_open (struct inode *inode, struct file *file)
+ {
+- int minor=MINOR(inode->i_rdev);
++ /*
++ * Always succeed
++ */
++ return (0);
++}
++
++static void md_release (struct inode *inode, struct file *file)
++{
++ sync_dev(inode->i_rdev);
++}
++
++
++static int md_read (struct inode *inode, struct file *file,
++ char *buf, int count)
++{
++ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
+
+- md_dev[minor].busy++;
+- return (0); /* Always succeed */
++ if (!mddev || !mddev->pers)
++ return -ENXIO;
++
++ return block_read (inode, file, buf, count);
+ }
+
++static int md_write (struct inode *inode, struct file *file,
++ const char *buf, int count)
++{
++ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
++
++ if (!mddev || !mddev->pers)
++ return -ENXIO;
+
+-static int md_release (struct inode *inode, struct file *file)
++ return block_write (inode, file, buf, count);
++}
++
++static struct file_operations md_fops=
+ {
+- int minor=MINOR(inode->i_rdev);
++ NULL,
++ md_read,
++ md_write,
++ NULL,
++ NULL,
++ md_ioctl,
++ NULL,
++ md_open,
++ md_release,
++ block_fsync
++};
++
++#else
+
+- sync_dev (inode->i_rdev);
+- md_dev[minor].busy--;
+- return 0;
++static int md_open (struct inode *inode, struct file *file)
++{
++ /*
++ * Always succeed
++ */
++ return (0);
+ }
+
++static int md_release (struct inode *inode, struct file *file)
++{
++ sync_dev(inode->i_rdev);
++ return 0;
++}
+
+ static ssize_t md_read (struct file *file, char *buf, size_t count,
+ loff_t *ppos)
+ {
+- int minor=MINOR(file->f_dentry->d_inode->i_rdev);
++ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
+
+- if (!md_dev[minor].pers) /* Check if device is being run */
+- return -ENXIO;
++ if (!mddev || !mddev->pers)
++ return -ENXIO;
+
+- return block_read(file, buf, count, ppos);
++ return block_read(file, buf, count, ppos);
+ }
+
+ static ssize_t md_write (struct file *file, const char *buf,
+ size_t count, loff_t *ppos)
+ {
+- int minor=MINOR(file->f_dentry->d_inode->i_rdev);
++ mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
+
+- if (!md_dev[minor].pers) /* Check if device is being run */
+- return -ENXIO;
++ if (!mddev || !mddev->pers)
++ return -ENXIO;
+
+- return block_write(file, buf, count, ppos);
++ return block_write(file, buf, count, ppos);
+ }
+
+ static struct file_operations md_fops=
+ {
+- NULL,
+- md_read,
+- md_write,
+- NULL,
+- NULL,
+- md_ioctl,
+- NULL,
+- md_open,
+- NULL,
+- md_release,
+- block_fsync
++ NULL,
++ md_read,
++ md_write,
++ NULL,
++ NULL,
++ md_ioctl,
++ NULL,
++ md_open,
++ NULL,
++ md_release,
++ block_fsync
+ };
+
+-int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size)
++#endif
++
++int md_map (kdev_t dev, kdev_t *rdev,
++ unsigned long *rsector, unsigned long size)
+ {
+- if ((unsigned int) minor >= MAX_MD_DEV)
+- {
+- printk ("Bad md device %d\n", minor);
+- return (-1);
+- }
+-
+- if (!md_dev[minor].pers)
+- {
+- printk ("Oops ! md%d not running, giving up !\n", minor);
+- return (-1);
+- }
++ int err;
++ mddev_t *mddev = kdev_to_mddev(dev);
+
+- return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size));
++ if (!mddev || !mddev->pers) {
++ err = -ENXIO;
++ goto out;
++ }
++
++ err = mddev->pers->map(mddev, dev, rdev, rsector, size);
++out:
++ return err;
+ }
+
+-int md_make_request (int minor, int rw, struct buffer_head * bh)
++int md_make_request (struct buffer_head * bh, int rw)
+ {
+- if (md_dev [minor].pers->make_request) {
+- if (buffer_locked(bh))
+- return 0;
++ int err;
++ mddev_t *mddev = kdev_to_mddev(bh->b_dev);
++
++ if (!mddev || !mddev->pers) {
++ err = -ENXIO;
++ goto out;
++ }
++
++ if (mddev->pers->make_request) {
++ if (buffer_locked(bh)) {
++ err = 0;
++ goto out;
++ }
+ set_bit(BH_Lock, &bh->b_state);
+ if (rw == WRITE || rw == WRITEA) {
+ if (!buffer_dirty(bh)) {
+- bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
+- return 0;
++ bh->b_end_io(bh, buffer_uptodate(bh));
++ err = 0;
++ goto out;
+ }
+ }
+ if (rw == READ || rw == READA) {
+ if (buffer_uptodate(bh)) {
+- bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
+- return 0;
++ bh->b_end_io(bh, buffer_uptodate(bh));
++ err = 0;
++ goto out;
+ }
+ }
+- return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh));
++ err = mddev->pers->make_request(mddev, rw, bh);
+ } else {
+ make_request (MAJOR(bh->b_rdev), rw, bh);
+- return 0;
++ err = 0;
+ }
++out:
++ return err;
+ }
+
+ static void do_md_request (void)
+ {
+- printk ("Got md request, not good...");
+- return;
++ printk(KERN_ALERT "Got md request, not good...");
++ return;
++}
++
++int md_thread(void * arg)
++{
++ mdk_thread_t *thread = arg;
++
++ md_lock_kernel();
++ exit_mm(current);
++ exit_files(current);
++ exit_fs(current);
++
++ /*
++ * Detach thread
++ */
++ sys_setsid();
++ sprintf(current->comm, "%s", thread->name);
++ md_init_signals();
++ md_flush_signals();
++ thread->tsk = current;
++
++ /*
++ * md_thread is a 'system thread', its priority should be very
++ * high. We avoid resource deadlocks individually in each
++ * raid personality. (RAID5 does preallocation) We also use RR and
++ * the very same RT priority as kswapd, thus we will never get
++ * into a priority inversion deadlock.
++ *
++ * we definitely have to have equal or higher priority than
++ * bdflush, otherwise bdflush will deadlock if there are too
++ * many dirty RAID5 blocks.
++ */
++ current->policy = SCHED_OTHER;
++ current->priority = 40;
++
++ up(thread->sem);
++
++ for (;;) {
++ cli();
++ if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
++ if (!thread->run)
++ break;
++ interruptible_sleep_on(&thread->wqueue);
++ }
++ sti();
++ clear_bit(THREAD_WAKEUP, &thread->flags);
++ if (thread->run) {
++ thread->run(thread->data);
++ run_task_queue(&tq_disk);
++ }
++ if (md_signal_pending(current)) {
++ printk("%8s(%d) flushing signals.\n", current->comm,
++ current->pid);
++ md_flush_signals();
++ }
++ }
++ sti();
++ up(thread->sem);
++ return 0;
+ }
+
+-void md_wakeup_thread(struct md_thread *thread)
++void md_wakeup_thread(mdk_thread_t *thread)
+ {
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+ }
+
+-struct md_thread *md_register_thread (void (*run) (void *), void *data)
++mdk_thread_t *md_register_thread (void (*run) (void *),
++ void *data, const char *name)
+ {
+- struct md_thread *thread = (struct md_thread *)
+- kmalloc(sizeof(struct md_thread), GFP_KERNEL);
++ mdk_thread_t *thread;
+ int ret;
+ struct semaphore sem = MUTEX_LOCKED;
+
+- if (!thread) return NULL;
++ thread = (mdk_thread_t *) kmalloc
++ (sizeof(mdk_thread_t), GFP_KERNEL);
++ if (!thread)
++ return NULL;
+
+- memset(thread, 0, sizeof(struct md_thread));
++ memset(thread, 0, sizeof(mdk_thread_t));
+ init_waitqueue(&thread->wqueue);
+
+ thread->sem = &sem;
+ thread->run = run;
+ thread->data = data;
++ thread->name = name;
+ ret = kernel_thread(md_thread, thread, 0);
+ if (ret < 0) {
+ kfree(thread);
+@@ -836,270 +3032,407 @@
+ return thread;
+ }
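
To illustrate how a personality is expected to use this thread API, a
hypothetical sketch (the raidXd names are invented here, not part of
the patch); md_thread() above does the sleeping, so the work function
just handles one batch and returns:

	static mdk_thread_t *raidXd;

	static void raidXd_work (void *data)
	{
		/* data is the mddev_t * passed at registration;
		 * handle one batch of deferred work, then return --
		 * md_thread() re-sleeps until the next THREAD_WAKEUP */
	}

	static int raidX_start (mddev_t *mddev)
	{
		raidXd = md_register_thread(raidXd_work, mddev, "raidXd");
		if (!raidXd)
			return -ENOMEM;
		md_wakeup_thread(raidXd);	/* kick a first pass */
		return 0;
	}

	static void raidX_stop (void)
	{
		/* synchronous: md_unregister_thread() waits on the
		 * semaphore until md_thread() has left its loop */
		md_unregister_thread(raidXd);
	}
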
+
+-void md_unregister_thread (struct md_thread *thread)
++void md_interrupt_thread (mdk_thread_t *thread)
++{
++ if (!thread->tsk) {
++ MD_BUG();
++ return;
++ }
++ printk("interrupting MD-thread pid %d\n", thread->tsk->pid);
++ send_sig(SIGKILL, thread->tsk, 1);
++}
++
++void md_unregister_thread (mdk_thread_t *thread)
+ {
+ struct semaphore sem = MUTEX_LOCKED;
+
+ thread->sem = &sem;
+ thread->run = NULL;
+- if (thread->tsk)
+- printk("Killing md_thread %d %p %s\n",
+- thread->tsk->pid, thread->tsk, thread->tsk->comm);
+- else
+- printk("Aiee. md_thread has 0 tsk\n");
+- send_sig(SIGKILL, thread->tsk, 1);
+- printk("downing on %p\n", &sem);
++ thread->name = NULL;
++ if (!thread->tsk) {
++ MD_BUG();
++ return;
++ }
++ md_interrupt_thread(thread);
+ down(&sem);
+ }
+
+-#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))
+-
+-int md_thread(void * arg)
++void md_recover_arrays (void)
+ {
+- struct md_thread *thread = arg;
+-
+- lock_kernel();
+- exit_mm(current);
+- exit_files(current);
+- exit_fs(current);
+-
+- current->session = 1;
+- current->pgrp = 1;
+- sprintf(current->comm, "md_thread");
+- siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
+- thread->tsk = current;
+- up(thread->sem);
+-
+- for (;;) {
+- cli();
+- if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
+- do {
+- spin_lock(&current->sigmask_lock);
+- flush_signals(current);
+- spin_unlock(&current->sigmask_lock);
+- interruptible_sleep_on(&thread->wqueue);
+- cli();
+- if (test_bit(THREAD_WAKEUP, &thread->flags))
+- break;
+- if (!thread->run) {
+- sti();
+- up(thread->sem);
+- return 0;
+- }
+- } while (signal_pending(current));
+- }
+- sti();
+- clear_bit(THREAD_WAKEUP, &thread->flags);
+- if (thread->run) {
+- thread->run(thread->data);
+- run_task_queue(&tq_disk);
+- }
++ if (!md_recovery_thread) {
++ MD_BUG();
++ return;
+ }
++ md_wakeup_thread(md_recovery_thread);
+ }
+
+-EXPORT_SYMBOL(md_size);
+-EXPORT_SYMBOL(md_maxreadahead);
+-EXPORT_SYMBOL(register_md_personality);
+-EXPORT_SYMBOL(unregister_md_personality);
+-EXPORT_SYMBOL(partition_name);
+-EXPORT_SYMBOL(md_dev);
+-EXPORT_SYMBOL(md_error);
+-EXPORT_SYMBOL(md_register_thread);
+-EXPORT_SYMBOL(md_unregister_thread);
+-EXPORT_SYMBOL(md_update_sb);
+-EXPORT_SYMBOL(md_map);
+-EXPORT_SYMBOL(md_wakeup_thread);
+-EXPORT_SYMBOL(md_do_sync);
+
+-#ifdef CONFIG_PROC_FS
+-static struct proc_dir_entry proc_md = {
+- PROC_MD, 6, "mdstat",
+- S_IFREG | S_IRUGO, 1, 0, 0,
+- 0, &proc_array_inode_operations,
+-};
++int md_error (kdev_t dev, kdev_t rdev)
++{
++ mddev_t *mddev = kdev_to_mddev(dev);
++ mdk_rdev_t * rrdev;
++ int rc;
++
++ if (!mddev) {
++ MD_BUG();
++ return 0;
++ }
++ rrdev = find_rdev(mddev, rdev);
++ mark_rdev_faulty(rrdev);
++ /*
++ * if recovery was running, stop it now.
++ */
++ if (mddev->pers->stop_resync)
++ mddev->pers->stop_resync(mddev);
++ if (mddev->recovery_running)
++ md_interrupt_thread(md_recovery_thread);
++ if (mddev->pers->error_handler) {
++ rc = mddev->pers->error_handler(mddev, rdev);
++ md_recover_arrays();
++ return rc;
++ }
++#if 0
++ /*
++ * Drop all buffers in the failed array.
++ * _not_. This is called from IRQ handlers ...
++ */
++ invalidate_buffers(rdev);
+ #endif
++ return 0;
++}
+
+-static void md_geninit (struct gendisk *gdisk)
++static int status_unused (char * page)
+ {
+- int i;
+-
+- for(i=0;i<MAX_MD_DEV;i++)
+- {
+- md_blocksizes[i] = 1024;
+- md_maxreadahead[i] = MD_DEFAULT_DISK_READAHEAD;
+- md_gendisk.part[i].start_sect=-1; /* avoid partition check */
+- md_gendisk.part[i].nr_sects=0;
+- md_dev[i].pers=NULL;
+- }
++ int sz = 0, i = 0;
++ mdk_rdev_t *rdev;
++ struct md_list_head *tmp;
+
+- blksize_size[MD_MAJOR] = md_blocksizes;
+- max_readahead[MD_MAJOR] = md_maxreadahead;
++ sz += sprintf(page + sz, "unused devices: ");
+
+-#ifdef CONFIG_PROC_FS
+- proc_register(&proc_root, &proc_md);
+-#endif
++ ITERATE_RDEV_ALL(rdev,tmp) {
++ if (!rdev->same_set.next && !rdev->same_set.prev) {
++ /*
++ * The device is not yet used by any array.
++ */
++ i++;
++ sz += sprintf(page + sz, "%s ",
++ partition_name(rdev->dev));
++ }
++ }
++ if (!i)
++ sz += sprintf(page + sz, "<none>");
++
++ sz += sprintf(page + sz, "\n");
++ return sz;
+ }
+
+-int md_error (kdev_t mddev, kdev_t rdev)
++
++static int status_resync (char * page, mddev_t * mddev)
+ {
+- unsigned int minor = MINOR (mddev);
+- int rc;
++ int sz = 0;
++ unsigned int blocksize, max_blocks, resync, res, dt, tt, et;
+
+- if (MAJOR(mddev) != MD_MAJOR || minor > MAX_MD_DEV)
+- panic ("md_error gets unknown device\n");
+- if (!md_dev [minor].pers)
+- panic ("md_error gets an error for an unknown device\n");
+- if (md_dev [minor].pers->error_handler) {
+- rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev);
+-#if SUPPORT_RECONSTRUCTION
+- md_wakeup_thread(md_sync_thread);
+-#endif /* SUPPORT_RECONSTRUCTION */
+- return rc;
+- }
+- return 0;
++ resync = mddev->curr_resync;
++ blocksize = blksize_size[MD_MAJOR][mdidx(mddev)];
++ max_blocks = blk_size[MD_MAJOR][mdidx(mddev)] / (blocksize >> 10);
++
++ /*
++ * Should not happen.
++ */
++ if (!max_blocks) {
++ MD_BUG();
++ return 0;
++ }
++ res = resync*100/max_blocks;
++ if (!mddev->recovery_running)
++ /*
++ * true resync
++ */
++ sz += sprintf(page + sz, " resync=%u%%", res);
++ else
++ /*
++ * recovery ...
++ */
++ sz += sprintf(page + sz, " recovery=%u%%", res);
++
++ /*
++ * We do not want to overflow, so the order of operands and
++ * the * 100 / 100 trick are important. We do a +1 to be
++ * safe against division by zero. We only estimate anyway.
++ *
++ * dt: time until now
++ * tt: total time
++ * et: estimated finish time
++ */
++ dt = ((jiffies - mddev->resync_start) / HZ);
++ tt = (dt * (max_blocks / (resync/100+1)))/100;
++ if (tt > dt)
++ et = tt - dt;
++ else
++ /*
++ * ignore rounding effects near finish time
++ */
++ et = 0;
++
++ sz += sprintf(page + sz, " finish=%u.%umin", et / 60, (et % 60)/6);
++
++ return sz;
+ }
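
The overflow-wary estimate above is easier to follow with concrete
numbers; a stand-alone sketch of the same formula (all values
invented): 250000 of 1000000 blocks done after 600 seconds gives
tt = (600 * (1000000/2501))/100 = 2394s, i.e. about 30 minutes left.

	#include <stdio.h>

	int main(void)
	{
		unsigned int max_blocks = 1000000;	/* array size, blocks */
		unsigned int resync = 250000;		/* blocks done so far */
		unsigned int dt = 600;			/* seconds since resync_start */

		/* same operand order as status_resync(): the inner /100
		 * keeps dt * (...) small, +1 guards against resync == 0 */
		unsigned int tt = (dt * (max_blocks / (resync/100 + 1))) / 100;
		unsigned int et = tt > dt ? tt - dt : 0;

		printf("%u%% done, finish=%u.%umin\n",
		       resync * 100 / max_blocks, et / 60, (et % 60) / 6);
		return 0;
	}
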
+
+ int get_md_status (char *page)
+ {
+- int sz=0, i, j, size;
+-
+- sz+=sprintf( page+sz, "Personalities : ");
+- for (i=0; i<MAX_PERSONALITY; i++)
+- if (pers[i])
+- sz+=sprintf (page+sz, "[%d %s] ", i, pers[i]->name);
+-
+- page[sz-1]='\n';
+-
+- sz+=sprintf (page+sz, "read_ahead ");
+- if (read_ahead[MD_MAJOR]==INT_MAX)
+- sz+=sprintf (page+sz, "not set\n");
+- else
+- sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
++ int sz = 0, j, size;
++ struct md_list_head *tmp, *tmp2;
++ mdk_rdev_t *rdev;
++ mddev_t *mddev;
++
++ sz += sprintf(page + sz, "Personalities : ");
++ for (j = 0; j < MAX_PERSONALITY; j++)
++ if (pers[j])
++ sz += sprintf(page+sz, "[%s] ", pers[j]->name);
++
++ sz += sprintf(page+sz, "\n");
++
++
++ sz += sprintf(page+sz, "read_ahead ");
++ if (read_ahead[MD_MAJOR] == INT_MAX)
++ sz += sprintf(page+sz, "not set\n");
++ else
++ sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
+
+- for (i=0; i<MAX_MD_DEV; i++)
+- {
+- sz+=sprintf (page+sz, "md%d : %sactive", i, md_dev[i].pers ? "" : "in");
+-
+- if (md_dev[i].pers)
+- sz+=sprintf (page+sz, " %s", md_dev[i].pers->name);
++ ITERATE_MDDEV(mddev,tmp) {
++ sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
++ mddev->pers ? "" : "in");
++ if (mddev->pers) {
++ if (mddev->ro)
++ sz += sprintf(page + sz, " (read-only)");
++ sz += sprintf(page + sz, " %s", mddev->pers->name);
++ }
+
+- size=0;
+- for (j=0; j<md_dev[i].nb_dev; j++)
+- {
+- sz+=sprintf (page+sz, " %s",
+- partition_name(md_dev[i].devices[j].dev));
+- size+=md_dev[i].devices[j].size;
+- }
++ size = 0;
++ ITERATE_RDEV(mddev,rdev,tmp2) {
++ sz += sprintf(page + sz, " %s[%d]",
++ partition_name(rdev->dev), rdev->desc_nr);
++ if (rdev->faulty) {
++ sz += sprintf(page + sz, "(F)");
++ continue;
++ }
++ size += rdev->size;
++ }
+
+- if (md_dev[i].nb_dev) {
+- if (md_dev[i].pers)
+- sz+=sprintf (page+sz, " %d blocks", md_size[i]);
+- else
+- sz+=sprintf (page+sz, " %d blocks", size);
+- }
++ if (mddev->nb_dev) {
++ if (mddev->pers)
++ sz += sprintf(page + sz, " %d blocks",
++ md_size[mdidx(mddev)]);
++ else
++ sz += sprintf(page + sz, " %d blocks", size);
++ }
+
+- if (!md_dev[i].pers)
+- {
+- sz+=sprintf (page+sz, "\n");
+- continue;
+- }
++ if (!mddev->pers) {
++ sz += sprintf(page+sz, "\n");
++ continue;
++ }
+
+- if (md_dev[i].pers->max_invalid_dev)
+- sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i));
++ sz += mddev->pers->status (page+sz, mddev);
+
+- sz+=md_dev[i].pers->status (page+sz, i, md_dev+i);
+- sz+=sprintf (page+sz, "\n");
+- }
++ if (mddev->curr_resync)
++ sz += status_resync (page+sz, mddev);
++ else {
++ if (md_atomic_read(&mddev->resync_sem.count) != 1)
++ sz += sprintf(page + sz, " resync=DELAYED");
++ }
++ sz += sprintf(page + sz, "\n");
++ }
++ sz += status_unused (page + sz);
+
+- return (sz);
++ return (sz);
+ }
+
+-int register_md_personality (int p_num, struct md_personality *p)
++int register_md_personality (int pnum, mdk_personality_t *p)
+ {
+- int i=(p_num >> PERSONALITY_SHIFT);
+-
+- if (i >= MAX_PERSONALITY)
+- return -EINVAL;
++ if (pnum >= MAX_PERSONALITY)
++ return -EINVAL;
+
+- if (pers[i])
+- return -EBUSY;
++ if (pers[pnum])
++ return -EBUSY;
+
+- pers[i]=p;
+- printk ("%s personality registered\n", p->name);
+- return 0;
++ pers[pnum] = p;
++ printk(KERN_INFO "%s personality registered\n", p->name);
++ return 0;
+ }
+
+-int unregister_md_personality (int p_num)
++int unregister_md_personality (int pnum)
+ {
+- int i=(p_num >> PERSONALITY_SHIFT);
+-
+- if (i >= MAX_PERSONALITY)
+- return -EINVAL;
++ if (pnum >= MAX_PERSONALITY)
++ return -EINVAL;
+
+- printk ("%s personality unregistered\n", pers[i]->name);
+- pers[i]=NULL;
+- return 0;
++ printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
++ pers[pnum] = NULL;
++ return 0;
+ }
+
+-static md_descriptor_t *get_spare(struct md_dev *mddev)
++static mdp_disk_t *get_spare(mddev_t *mddev)
+ {
+- int i;
+- md_superblock_t *sb = mddev->sb;
+- md_descriptor_t *descriptor;
+- struct real_dev *realdev;
+-
+- for (i = 0; i < mddev->nb_dev; i++) {
+- realdev = &mddev->devices[i];
+- if (!realdev->sb)
++ mdp_super_t *sb = mddev->sb;
++ mdp_disk_t *disk;
++ mdk_rdev_t *rdev;
++ struct md_list_head *tmp;
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->faulty)
++ continue;
++ if (!rdev->sb) {
++ MD_BUG();
+ continue;
+- descriptor = &sb->disks[realdev->sb->descriptor.number];
+- if (descriptor->state & (1 << MD_FAULTY_DEVICE))
++ }
++ disk = &sb->disks[rdev->desc_nr];
++ if (disk_faulty(disk)) {
++ MD_BUG();
+ continue;
+- if (descriptor->state & (1 << MD_ACTIVE_DEVICE))
++ }
++ if (disk_active(disk))
+ continue;
+- return descriptor;
++ return disk;
+ }
+ return NULL;
+ }
+
++static int is_mddev_idle (mddev_t *mddev)
++{
++ mdk_rdev_t * rdev;
++ struct md_list_head *tmp;
++ int idle;
++ unsigned long curr_events;
++
++ idle = 1;
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ curr_events = io_events[MAJOR(rdev->dev)];
++
++ if (curr_events != rdev->last_events) {
++// printk("!I(%d)", curr_events-rdev->last_events);
++ rdev->last_events = curr_events;
++ idle = 0;
++ }
++ }
++ return idle;
++}
++
+ /*
+ * parallel resyncing thread.
+- *
+- * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
+- * - fix read error handing
+ */
+
+-int md_do_sync(struct md_dev *mddev)
++/*
++ * Determine correct block size for this device.
++ */
++unsigned int device_bsize (kdev_t dev)
++{
++ unsigned int i, correct_size;
++
++ correct_size = BLOCK_SIZE;
++ if (blksize_size[MAJOR(dev)]) {
++ i = blksize_size[MAJOR(dev)][MINOR(dev)];
++ if (i)
++ correct_size = i;
++ }
++
++ return correct_size;
++}
++
++static struct wait_queue *resync_wait = (struct wait_queue *)NULL;
++
++#define RA_ORDER (1)
++#define RA_PAGE_SIZE (PAGE_SIZE*(1<<RA_ORDER))
++#define MAX_NR_BLOCKS (RA_PAGE_SIZE/sizeof(struct buffer_head *))
++
++int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
+ {
+- struct buffer_head *bh;
+- int max_blocks, blocksize, curr_bsize, percent=1, j;
+- kdev_t read_disk = MKDEV(MD_MAJOR, mddev - md_dev);
++ mddev_t *mddev2;
++ struct buffer_head **bh;
++ unsigned int max_blocks, blocksize, curr_bsize,
++ i, ii, j, k, chunk, window, nr_blocks, err, serialize;
++ kdev_t read_disk = mddev_to_kdev(mddev);
+ int major = MAJOR(read_disk), minor = MINOR(read_disk);
+ unsigned long starttime;
++ int max_read_errors = 2*MAX_NR_BLOCKS,
++ max_write_errors = 2*MAX_NR_BLOCKS;
++ struct md_list_head *tmp;
++
++retry_alloc:
++ bh = (struct buffer_head **) md__get_free_pages(GFP_KERNEL, RA_ORDER);
++ if (!bh) {
++ printk(KERN_ERR
++ "could not alloc bh array for reconstruction ... retrying!\n");
++ goto retry_alloc;
++ }
++
++ err = down_interruptible(&mddev->resync_sem);
++ if (err)
++ goto out_nolock;
++
++recheck:
++ serialize = 0;
++ ITERATE_MDDEV(mddev2,tmp) {
++ if (mddev2 == mddev)
++ continue;
++ if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
++ printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2));
++ serialize = 1;
++ break;
++ }
++ }
++ if (serialize) {
++ interruptible_sleep_on(&resync_wait);
++ if (md_signal_pending(current)) {
++ md_flush_signals();
++ err = -EINTR;
++ goto out;
++ }
++ goto recheck;
++ }
++
++ mddev->curr_resync = 1;
+
+- blocksize = blksize_size[major][minor];
++ blocksize = device_bsize(read_disk);
+ max_blocks = blk_size[major][minor] / (blocksize >> 10);
+
+- printk("... resync log\n");
+- printk(" .... mddev->nb_dev: %d\n", mddev->nb_dev);
+- printk(" .... raid array: %s\n", kdevname(read_disk));
+- printk(" .... max_blocks: %d blocksize: %d\n", max_blocks, blocksize);
+- printk("md: syncing RAID array %s\n", kdevname(read_disk));
++ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
++ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec.\n",
++ sysctl_speed_limit);
++ printk(KERN_INFO "md: using maximum available idle IO bandwith for reconstruction.\n");
++
++ /*
++ * Resync has low priority.
++ */
++ current->priority = 1;
++
++ is_mddev_idle(mddev); /* this also initializes IO event counters */
++ starttime = jiffies;
++ mddev->resync_start = starttime;
+
+- mddev->busy++;
++ /*
++ * Tune reconstruction:
++ */
++ window = md_maxreadahead[mdidx(mddev)]/1024;
++ nr_blocks = window / (blocksize >> 10);
++ if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
++ nr_blocks = MAX_NR_BLOCKS;
++ printk(KERN_INFO "md: using %dk window.\n",window);
+
+- starttime=jiffies;
+- for (j = 0; j < max_blocks; j++) {
++ for (j = 0; j < max_blocks; j += nr_blocks) {
+
++ if (j)
++ mddev->curr_resync = j;
+ /*
+ * Be careful. When someone mounts a non-'blocksize' filesystem
+ * then we get the blocksize changed right under us. Go deal
+ * with it transparently, recalculate 'blocksize', 'j' and
+ * 'max_blocks':
+ */
+- curr_bsize = blksize_size[major][minor];
++ curr_bsize = device_bsize(read_disk);
+ if (curr_bsize != blocksize) {
+- diff_blocksize:
++ printk(KERN_INFO "md%d: blocksize changed\n",
++ mdidx(mddev));
++retry_read:
+ if (curr_bsize > blocksize)
+ /*
+ * this is safe, rounds downwards.
+@@ -1109,114 +3442,384 @@
+ j *= blocksize/curr_bsize;
+
+ blocksize = curr_bsize;
++ nr_blocks = window / (blocksize >> 10);
++ if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
++ nr_blocks = MAX_NR_BLOCKS;
+ max_blocks = blk_size[major][minor] / (blocksize >> 10);
+- }
+- if ((bh = breada (read_disk, j, blocksize, j * blocksize,
+- max_blocks * blocksize)) != NULL) {
+- mark_buffer_dirty(bh, 1);
+- brelse(bh);
+- } else {
++ printk("nr_blocks changed to %d (blocksize %d, j %d, max_blocks %d)\n",
++ nr_blocks, blocksize, j, max_blocks);
+ /*
+- * FIXME: Ugly, but set_blocksize() isnt safe ...
++ * We will retry the current block-group
+ */
+- curr_bsize = blksize_size[major][minor];
+- if (curr_bsize != blocksize)
+- goto diff_blocksize;
++ }
+
+- /*
+- * It's a real read problem. FIXME, handle this
+- * a better way.
+- */
+- printk ( KERN_ALERT
+- "read error, stopping reconstruction.\n");
+- mddev->busy--;
+- return 1;
++ /*
++ * Cleanup routines expect this
++ */
++ for (k = 0; k < nr_blocks; k++)
++ bh[k] = NULL;
++
++ chunk = nr_blocks;
++ if (chunk > max_blocks-j)
++ chunk = max_blocks-j;
++
++ /*
++ * request buffer heads ...
++ */
++ for (i = 0; i < chunk; i++) {
++ bh[i] = getblk (read_disk, j+i, blocksize);
++ if (!bh[i])
++ goto read_error;
++ if (!buffer_dirty(bh[i]))
++ mark_buffer_lowprio(bh[i]);
+ }
+
+ /*
+- * Let's sleep some if we are faster than our speed limit:
++ * read buffer heads ...
+ */
+- while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT)
+- {
+- current->state = TASK_INTERRUPTIBLE;
+- schedule_timeout(1);
++ ll_rw_block (READ, chunk, bh);
++ run_task_queue(&tq_disk);
++
++ /*
++ * verify that all of them are OK ...
++ */
++ for (i = 0; i < chunk; i++) {
++ ii = chunk-i-1;
++ wait_on_buffer(bh[ii]);
++ if (!buffer_uptodate(bh[ii]))
++ goto read_error;
++ }
++
++retry_write:
++ for (i = 0; i < chunk; i++)
++ mark_buffer_dirty_lowprio(bh[i]);
++
++ ll_rw_block(WRITE, chunk, bh);
++ run_task_queue(&tq_disk);
++
++ for (i = 0; i < chunk; i++) {
++ ii = chunk-i-1;
++ wait_on_buffer(bh[ii]);
++
++ if (spare && disk_faulty(spare)) {
++ for (k = 0; k < chunk; k++)
++ brelse(bh[k]);
++ printk(" <SPARE FAILED!>\n ");
++ err = -EIO;
++ goto out;
++ }
++
++ if (!buffer_uptodate(bh[ii])) {
++ curr_bsize = device_bsize(read_disk);
++ if (curr_bsize != blocksize) {
++ printk(KERN_INFO
++ "md%d: blocksize changed during write\n",
++ mdidx(mddev));
++ for (k = 0; k < chunk; k++)
++ if (bh[k]) {
++ if (buffer_lowprio(bh[k]))
++ mark_buffer_clean(bh[k]);
++ brelse(bh[k]);
++ }
++ goto retry_read;
++ }
++ printk(" BAD WRITE %8d>\n", j);
++ /*
++ * Ouch, write error, retry or bail out.
++ */
++ if (max_write_errors) {
++ max_write_errors--;
++ printk ( KERN_WARNING "md%d: write error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
++ goto retry_write;
++ }
++ printk ( KERN_ALERT
++ "too many write errors, stopping reconstruction.\n");
++ for (k = 0; k < chunk; k++)
++ if (bh[k]) {
++ if (buffer_lowprio(bh[k]))
++ mark_buffer_clean(bh[k]);
++ brelse(bh[k]);
++ }
++ err = -EIO;
++ goto out;
++ }
+ }
+
+ /*
+- * FIXME: put this status bar thing into /proc
++ * This is the normal 'everything went OK' case:
++ * do the 'free-behind' logic, we surely don't need
++ * this buffer if it was the only user.
+ */
+- if (!(j%(max_blocks/100))) {
+- if (!(percent%10))
+- printk (" %03d%% done.\n",percent);
++ for (i = 0; i < chunk; i++)
++ if (buffer_dirty(bh[i]))
++ brelse(bh[i]);
+ else
+- printk (".");
+- percent++;
++ bforget(bh[i]);
++
++
++ if (md_signal_pending(current)) {
++ /*
++ * got a signal, exit.
++ */
++ mddev->curr_resync = 0;
++ printk("md_do_sync() got signal ... exiting\n");
++ md_flush_signals();
++ err = -EINTR;
++ goto out;
+ }
++
++ /*
++ * this loop exits only when either we are slower than
++ * the 'hard' speed limit, or the system was IO-idle for
++ * a jiffy.
++ * the system might be non-idle CPU-wise, but we only care
++ * about not overloading the IO subsystem. (things like an
++ * e2fsck being done on the RAID array should execute fast)
++ */
++repeat:
++ if (md_need_resched(current))
++ schedule();
++
++ if ((blocksize/1024)*j/((jiffies-starttime)/HZ + 1) + 1
++ > sysctl_speed_limit) {
++ current->priority = 1;
++
++ if (!is_mddev_idle(mddev)) {
++ current->state = TASK_INTERRUPTIBLE;
++ md_schedule_timeout(HZ/2);
++ if (!md_signal_pending(current))
++ goto repeat;
++ }
++ } else
++ current->priority = 40;
+ }
+ fsync_dev(read_disk);
+- printk("md: %s: sync done.\n", kdevname(read_disk));
+- mddev->busy--;
+- return 0;
++ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
++ err = 0;
++ /*
++ * this also signals 'finished resyncing' to md_stop
++ */
++out:
++ up(&mddev->resync_sem);
++out_nolock:
++ free_pages((unsigned long)bh, RA_ORDER);
++ mddev->curr_resync = 0;
++ wake_up(&resync_wait);
++ return err;
++
++read_error:
++ /*
++ * set_blocksize() might change the blocksize. This
++ * should not happen often, but it happens when eg.
++ * someone mounts a filesystem that has non-1k
++ * blocksize. set_blocksize() doesn't touch our
++ * buffer, but to avoid aliasing problems we change
++ * our internal blocksize too and retry the read.
++ */
++ curr_bsize = device_bsize(read_disk);
++ if (curr_bsize != blocksize) {
++ printk(KERN_INFO "md%d: blocksize changed during read\n",
++ mdidx(mddev));
++ for (k = 0; k < chunk; k++)
++ if (bh[k]) {
++ if (buffer_lowprio(bh[k]))
++ mark_buffer_clean(bh[k]);
++ brelse(bh[k]);
++ }
++ goto retry_read;
++ }
++
++ /*
++ * It's a real read problem. We retry and bail out
++ * only if it's excessive.
++ */
++ if (max_read_errors) {
++ max_read_errors--;
++ printk ( KERN_WARNING "md%d: read error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
++ for (k = 0; k < chunk; k++)
++ if (bh[k]) {
++ if (buffer_lowprio(bh[k]))
++ mark_buffer_clean(bh[k]);
++ brelse(bh[k]);
++ }
++ goto retry_read;
++ }
++ printk ( KERN_ALERT "too many read errors, stopping reconstruction.\n");
++ for (k = 0; k < chunk; k++)
++ if (bh[k]) {
++ if (buffer_lowprio(bh[k]))
++ mark_buffer_clean(bh[k]);
++ brelse(bh[k]);
++ }
++ err = -EIO;
++ goto out;
+ }
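
The throttling in the repeat: section boils down to one expression:
achieved KB/sec so far versus the sysctl floor. A stand-alone sketch
with invented numbers (30000 1K blocks in 200s is 150 KB/sec, above a
100 KB/sec floor, so the thread would yield whenever the disks are
busy):

	#include <stdio.h>

	int main(void)
	{
		unsigned int blocksize = 1024;		/* bytes */
		unsigned int j = 30000;			/* blocks synced so far */
		unsigned int elapsed = 200;		/* (jiffies-starttime)/HZ */
		unsigned int speed_limit = 100;		/* sysctl floor, KB/sec */
		unsigned int kb_per_sec = (blocksize/1024) * j / (elapsed + 1) + 1;

		if (kb_per_sec > speed_limit)
			printf("%u KB/sec: above the floor, throttle if disks are busy\n",
			       kb_per_sec);
		else
			printf("%u KB/sec: below the guaranteed minimum, run flat out\n",
			       kb_per_sec);
		return 0;
	}
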
+
++#undef MAX_NR_BLOCKS
++
+ /*
+- * This is a kernel thread which: syncs a spare disk with the active array
++ * This is a kernel thread which syncs a spare disk with the active array
+ *
+ * the amount of foolproofing might seem to be a tad excessive, but an
+ * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
+ * of my root partition with the first 0.5 gigs of my /home partition ... so
+ * i'm a bit nervous ;)
+ */
+-void mdsyncd (void *data)
++void md_do_recovery (void *data)
+ {
+- int i;
+- struct md_dev *mddev;
+- md_superblock_t *sb;
+- md_descriptor_t *spare;
++ int err;
++ mddev_t *mddev;
++ mdp_super_t *sb;
++ mdp_disk_t *spare;
+ unsigned long flags;
++ struct md_list_head *tmp;
+
+- for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) {
+- if ((sb = mddev->sb) == NULL)
++ printk(KERN_INFO "md: recovery thread got woken up ...\n");
++restart:
++ ITERATE_MDDEV(mddev,tmp) {
++ sb = mddev->sb;
++ if (!sb)
++ continue;
++ if (mddev->recovery_running)
+ continue;
+ if (sb->active_disks == sb->raid_disks)
+ continue;
+- if (!sb->spare_disks)
++ if (!sb->spare_disks) {
++ printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
+ continue;
++ }
++ /*
++ * now here we get the spare and resync it.
++ */
+ if ((spare = get_spare(mddev)) == NULL)
+ continue;
+- if (!mddev->pers->mark_spare)
++ printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
++ if (!mddev->pers->diskop)
+ continue;
+- if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE))
++ if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
+ continue;
+- if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) {
+- mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE);
++ down(&mddev->recovery_sem);
++ mddev->recovery_running = 1;
++ err = md_do_sync(mddev, spare);
++ if (err == -EIO) {
++ printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
++ if (!disk_faulty(spare)) {
++ mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
++ mark_disk_faulty(spare);
++ mark_disk_nonsync(spare);
++ mark_disk_inactive(spare);
++ sb->spare_disks--;
++ sb->working_disks--;
++ sb->failed_disks++;
++ }
++ } else
++ if (disk_faulty(spare))
++ mddev->pers->diskop(mddev, &spare,
++ DISKOP_SPARE_INACTIVE);
++ if (err == -EINTR) {
++ /*
++ * Recovery got interrupted ...
++ * signal back that we have finished using the array.
++ */
++ mddev->pers->diskop(mddev, &spare,
++ DISKOP_SPARE_INACTIVE);
++ up(&mddev->recovery_sem);
++ mddev->recovery_running = 0;
+ continue;
++ } else {
++ mddev->recovery_running = 0;
++ up(&mddev->recovery_sem);
+ }
+ save_flags(flags);
+ cli();
+- mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE);
+- spare->state |= (1 << MD_SYNC_DEVICE);
+- spare->state |= (1 << MD_ACTIVE_DEVICE);
+- sb->spare_disks--;
+- sb->active_disks++;
+- mddev->sb_dirty = 1;
+- md_update_sb(mddev - md_dev);
++ if (!disk_faulty(spare)) {
++ /*
++ * the SPARE_ACTIVE diskop possibly changes the
++ * pointer too
++ */
++ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
++ mark_disk_sync(spare);
++ mark_disk_active(spare);
++ sb->active_disks++;
++ sb->spare_disks--;
++ }
+ restore_flags(flags);
++ mddev->sb_dirty = 1;
++ md_update_sb(mddev);
++ goto restart;
+ }
++ printk(KERN_INFO "md: recovery thread finished ...\n");
+
+ }
+
++int md_notify_reboot(struct notifier_block *this,
++ unsigned long code, void *x)
++{
++ struct md_list_head *tmp;
++ mddev_t *mddev;
++
++ if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
++ || (code == MD_SYS_POWER_OFF)) {
++
++ printk(KERN_INFO "stopping all md devices.\n");
++
++ ITERATE_MDDEV(mddev,tmp)
++ do_md_stop (mddev, 1);
++ /*
++ * certain more exotic SCSI devices are known to be
++ * volatile wrt too early system reboots. While the
++ * right place to handle this issue is the given
++ * driver, we do want to have a safe RAID driver ...
++ */
++ md_mdelay(1000*1);
++ }
++ return NOTIFY_DONE;
++}
++
++struct notifier_block md_notifier = {
++ md_notify_reboot,
++ NULL,
++ 0
++};
++
++md__initfunc(void raid_setup(char *str, int *ints))
++{
++ char tmpline[100];
++ int len, pos, nr, i;
++
++ len = strlen(str) + 1;
++ nr = 0;
++ pos = 0;
++
++ for (i = 0; i < len; i++) {
++ char c = str[i];
++
++ if (c == ',' || !c) {
++ tmpline[pos] = 0;
++ if (!strcmp(tmpline,"noautodetect"))
++ raid_setup_args.noautodetect = 1;
++ nr++;
++ pos = 0;
++ continue;
++ }
++ tmpline[pos] = c;
++ pos++;
++ }
++ raid_setup_args.set = 1;
++ return;
++}
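
raid_setup() is the handler for a kernel command line option
(presumably wired up as "raid=" in init/main.c elsewhere in this
patch); the only keyword it parses so far disables 0xfd partition
autodetection, e.g. in /etc/lilo.conf:

	append = "raid=noautodetect"
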
++
+ #ifdef CONFIG_MD_BOOT
+ struct {
+ int set;
+ int ints[100];
+ char str[100];
+-} md_setup_args __initdata = {
++} md_setup_args md__initdata = {
+ 0,{0},{0}
+ };
+
+ /* called from init/main.c */
+-__initfunc(void md_setup(char *str,int *ints))
++md__initfunc(void md_setup(char *str,int *ints))
+ {
+ int i;
+ for(i=0;i<=ints[0];i++) {
+@@ -1228,21 +3831,24 @@
+ return;
+ }
+
+-__initfunc(void do_md_setup(char *str,int *ints))
++md__initfunc(void do_md_setup(char *str,int *ints))
+ {
+- int minor, pers, factor, fault;
++#if 0
++ int minor, pers, chunk_size, fault;
+ kdev_t dev;
+ int i=1;
+
++ printk("i plan to phase this out --mingo\n");
++
+ if(ints[0] < 4) {
+- printk ("md: Too few Arguments (%d).\n", ints[0]);
++ printk (KERN_WARNING "md: Too few Arguments (%d).\n", ints[0]);
+ return;
+ }
+
+ minor=ints[i++];
+
+- if (minor >= MAX_MD_DEV) {
+- printk ("md: Minor device number too high.\n");
++ if ((unsigned int)minor >= MAX_MD_DEVS) {
++ printk (KERN_WARNING "md: Minor device number too high.\n");
+ return;
+ }
+
+@@ -1252,18 +3858,20 @@
+ case -1:
+ #ifdef CONFIG_MD_LINEAR
+ pers = LINEAR;
+- printk ("md: Setting up md%d as linear device.\n",minor);
++ printk (KERN_INFO "md: Setting up md%d as linear device.\n",
++ minor);
+ #else
+- printk ("md: Linear mode not configured."
++ printk (KERN_WARNING "md: Linear mode not configured."
+ "Recompile the kernel with linear mode enabled!\n");
+ #endif
+ break;
+ case 0:
+ pers = STRIPED;
+ #ifdef CONFIG_MD_STRIPED
+- printk ("md: Setting up md%d as a striped device.\n",minor);
++ printk (KERN_INFO "md: Setting up md%d as a striped device.\n",
++ minor);
+ #else
+- printk ("md: Striped mode not configured."
++ printk (KERN_WARNING "md: Striped mode not configured."
+ "Recompile the kernel with striped mode enabled!\n");
+ #endif
+ break;
+@@ -1278,79 +3886,145 @@
+ break;
+ */
+ default:
+- printk ("md: Unknown or not supported raid level %d.\n", ints[--i]);
++ printk (KERN_WARNING "md: Unknown or not supported raid level %d.\n", ints[--i]);
+ return;
+ }
+
+- if(pers) {
++ if (pers) {
+
+- factor=ints[i++]; /* Chunksize */
+- fault =ints[i++]; /* Faultlevel */
++ chunk_size = ints[i++]; /* Chunksize */
++ fault = ints[i++]; /* Faultlevel */
+
+- pers=pers | factor | (fault << FAULT_SHIFT);
++ pers = pers | chunk_size | (fault << FAULT_SHIFT);
+
+- while( str && (dev = name_to_kdev_t(str))) {
+- do_md_add (minor, dev);
+- if((str = strchr (str, ',')) != NULL)
+- str++;
+- }
++ while( str && (dev = name_to_kdev_t(str))) {
++ do_md_add (minor, dev);
++ if((str = strchr (str, ',')) != NULL)
++ str++;
++ }
+
+- do_md_run (minor, pers);
+- printk ("md: Loading md%d.\n",minor);
++ do_md_run (minor, pers);
++ printk (KERN_INFO "md: Loading md%d.\n",minor);
+ }
+-
++#endif
+ }
+ #endif
+
++void hsm_init (void);
++void translucent_init (void);
+ void linear_init (void);
+ void raid0_init (void);
+ void raid1_init (void);
+ void raid5_init (void);
+
+-__initfunc(int md_init (void))
++md__initfunc(int md_init (void))
+ {
+- printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
+- MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION,
+- MAX_MD_DEV, MAX_REAL);
+-
+- if (register_blkdev (MD_MAJOR, "md", &md_fops))
+- {
+- printk ("Unable to get major %d for md\n", MD_MAJOR);
+- return (-1);
+- }
+-
+- blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST;
+- blk_dev[MD_MAJOR].current_request=NULL;
+- read_ahead[MD_MAJOR]=INT_MAX;
+- memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev));
+- md_gendisk.next=gendisk_head;
+-
+- gendisk_head=&md_gendisk;
+-
+-#if SUPPORT_RECONSTRUCTION
+- if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL)
+- printk("md: bug: md_sync_thread == NULL\n");
+-#endif /* SUPPORT_RECONSTRUCTION */
++ static char * name = "mdrecoveryd";
++
++ printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n",
++ MD_MAJOR_VERSION, MD_MINOR_VERSION,
++ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL);
++
++ if (register_blkdev (MD_MAJOR, "md", &md_fops))
++ {
++ printk (KERN_ALERT "Unable to get major %d for md\n", MD_MAJOR);
++ return (-1);
++ }
++
++ blk_dev[MD_MAJOR].request_fn = DEVICE_REQUEST;
++ blk_dev[MD_MAJOR].current_request = NULL;
++ read_ahead[MD_MAJOR] = INT_MAX;
++ md_gendisk.next = gendisk_head;
++
++ gendisk_head = &md_gendisk;
++
++ md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
++ if (!md_recovery_thread)
++ printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n");
+
++ md_register_reboot_notifier(&md_notifier);
++ md_register_sysctl();
++
++#ifdef CONFIG_MD_HSM
++ hsm_init ();
++#endif
++#ifdef CONFIG_MD_TRANSLUCENT
++ translucent_init ();
++#endif
+ #ifdef CONFIG_MD_LINEAR
+- linear_init ();
++ linear_init ();
+ #endif
+ #ifdef CONFIG_MD_STRIPED
+- raid0_init ();
++ raid0_init ();
+ #endif
+ #ifdef CONFIG_MD_MIRRORING
+- raid1_init ();
++ raid1_init ();
+ #endif
+ #ifdef CONFIG_MD_RAID5
+- raid5_init ();
++ raid5_init ();
++#endif
++#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE)
++ /*
++ * pick a XOR routine, runtime.
++ */
++ calibrate_xor_block();
+ #endif
+- return (0);
++
++ return (0);
+ }
+
+ #ifdef CONFIG_MD_BOOT
+-__initfunc(void md_setup_drive(void))
++md__initfunc(void md_setup_drive(void))
+ {
+ if(md_setup_args.set)
+ do_md_setup(md_setup_args.str, md_setup_args.ints);
+ }
+ #endif
++
++MD_EXPORT_SYMBOL(md_size);
++MD_EXPORT_SYMBOL(register_md_personality);
++MD_EXPORT_SYMBOL(unregister_md_personality);
++MD_EXPORT_SYMBOL(partition_name);
++MD_EXPORT_SYMBOL(md_error);
++MD_EXPORT_SYMBOL(md_recover_arrays);
++MD_EXPORT_SYMBOL(md_register_thread);
++MD_EXPORT_SYMBOL(md_unregister_thread);
++MD_EXPORT_SYMBOL(md_update_sb);
++MD_EXPORT_SYMBOL(md_map);
++MD_EXPORT_SYMBOL(md_wakeup_thread);
++MD_EXPORT_SYMBOL(md_do_sync);
++MD_EXPORT_SYMBOL(md_print_devices);
++MD_EXPORT_SYMBOL(find_rdev_nr);
++MD_EXPORT_SYMBOL(md_check_ordering);
++MD_EXPORT_SYMBOL(md_interrupt_thread);
++MD_EXPORT_SYMBOL(mddev_map);
++
++#ifdef CONFIG_PROC_FS
++static struct proc_dir_entry proc_md = {
++ PROC_MD, 6, "mdstat",
++ S_IFREG | S_IRUGO, 1, 0, 0,
++ 0, &proc_array_inode_operations,
++};
++#endif
++
++static void md_geninit (struct gendisk *gdisk)
++{
++ int i;
++
++ for(i = 0; i < MAX_MD_DEVS; i++) {
++ md_blocksizes[i] = 1024;
++ md_maxreadahead[i] = MD_READAHEAD;
++ md_gendisk.part[i].start_sect = -1; /* avoid partition check */
++ md_gendisk.part[i].nr_sects = 0;
++ }
++
++ printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
++
++ blksize_size[MD_MAJOR] = md_blocksizes;
++ md_set_global_readahead(md_maxreadahead);
++
++#ifdef CONFIG_PROC_FS
++ proc_register(&proc_root, &proc_md);
++#endif
++}
++
+diff -uNr linux-orig/drivers/block/raid0.c linux/drivers/block/raid0.c
+--- linux-orig/drivers/block/raid0.c Mon Sep 4 10:39:16 2000
++++ linux/drivers/block/raid0.c Fri Dec 8 22:54:52 2000
+@@ -1,4 +1,3 @@
+-
+ /*
+ raid0.c : Multiple Devices driver for Linux
+ Copyright (C) 1994-96 Marc ZYNGIER
+@@ -18,146 +17,201 @@
+ */
+
+ #include <linux/module.h>
+-#include <linux/md.h>
+-#include <linux/raid0.h>
+-#include <linux/vmalloc.h>
++#include <linux/raid/raid0.h>
+
+ #define MAJOR_NR MD_MAJOR
+ #define MD_DRIVER
+ #define MD_PERSONALITY
+
+-static int create_strip_zones (int minor, struct md_dev *mddev)
++static int create_strip_zones (mddev_t *mddev)
+ {
+- int i, j, c=0;
+- int current_offset=0;
+- struct real_dev *smallest_by_zone;
+- struct raid0_data *data=(struct raid0_data *) mddev->private;
+-
+- data->nr_strip_zones=1;
+-
+- for (i=1; i<mddev->nb_dev; i++)
+- {
+- for (j=0; j<i; j++)
+- if (mddev->devices[i].size==mddev->devices[j].size)
+- {
+- c=1;
+- break;
+- }
+-
+- if (!c)
+- data->nr_strip_zones++;
+-
+- c=0;
+- }
+-
+- if ((data->strip_zone=vmalloc(sizeof(struct strip_zone)*data->nr_strip_zones)) == NULL)
+- return 1;
+-
+- data->smallest=NULL;
+-
+- for (i=0; i<data->nr_strip_zones; i++)
+- {
+- data->strip_zone[i].dev_offset=current_offset;
+- smallest_by_zone=NULL;
+- c=0;
+-
+- for (j=0; j<mddev->nb_dev; j++)
+- if (mddev->devices[j].size>current_offset)
+- {
+- data->strip_zone[i].dev[c++]=mddev->devices+j;
+- if (!smallest_by_zone ||
+- smallest_by_zone->size > mddev->devices[j].size)
+- smallest_by_zone=mddev->devices+j;
+- }
+-
+- data->strip_zone[i].nb_dev=c;
+- data->strip_zone[i].size=(smallest_by_zone->size-current_offset)*c;
+-
+- if (!data->smallest ||
+- data->smallest->size > data->strip_zone[i].size)
+- data->smallest=data->strip_zone+i;
+-
+- data->strip_zone[i].zone_offset=i ? (data->strip_zone[i-1].zone_offset+
+- data->strip_zone[i-1].size) : 0;
+- current_offset=smallest_by_zone->size;
+- }
+- return 0;
++ int i, c, j, j1, j2;
++ int current_offset, curr_zone_offset;
++ raid0_conf_t *conf = mddev_to_conf(mddev);
++ mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
++
++ /*
++ * The number of 'same size groups'
++ */
++ conf->nr_strip_zones = 0;
++
++ ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
++ printk("raid0: looking at %s\n", partition_name(rdev1->dev));
++ c = 0;
++ ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
++ printk("raid0: comparing %s(%d) with %s(%d)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size);
++ if (rdev2 == rdev1) {
++ printk("raid0: END\n");
++ break;
++ }
++ if (rdev2->size == rdev1->size)
++ {
++ /*
++ * Not unique, dont count it as a new
++ * group
++ */
++ printk("raid0: EQUAL\n");
++ c = 1;
++ break;
++ }
++ printk("raid0: NOT EQUAL\n");
++ }
++ if (!c) {
++ printk("raid0: ==> UNIQUE\n");
++ conf->nr_strip_zones++;
++ printk("raid0: %d zones\n", conf->nr_strip_zones);
++ }
++ }
++ printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
++
++ conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
++ conf->nr_strip_zones);
++ if (!conf->strip_zone)
++ return 1;
++
++
++ conf->smallest = NULL;
++ current_offset = 0;
++ curr_zone_offset = 0;
++
++ for (i = 0; i < conf->nr_strip_zones; i++)
++ {
++ struct strip_zone *zone = conf->strip_zone + i;
++
++ printk("zone %d\n", i);
++ zone->dev_offset = current_offset;
++ smallest = NULL;
++ c = 0;
++
++ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
++
++ printk(" checking %s ...", partition_name(rdev->dev));
++ if (rdev->size > current_offset)
++ {
++ printk(" contained as device %d\n", c);
++ zone->dev[c] = rdev;
++ c++;
++ if (!smallest || (rdev->size <smallest->size)) {
++ smallest = rdev;
++ printk(" (%d) is smallest!.\n", rdev->size);
++ }
++ } else
++ printk(" nope.\n");
++ }
++
++ zone->nb_dev = c;
++ zone->size = (smallest->size - current_offset) * c;
++ printk(" zone->nb_dev: %d, size: %d\n",zone->nb_dev,zone->size);
++
++ if (!conf->smallest || (zone->size < conf->smallest->size))
++ conf->smallest = zone;
++
++ zone->zone_offset = curr_zone_offset;
++ curr_zone_offset += zone->size;
++
++ current_offset = smallest->size;
++ printk("current zone offset: %d\n", current_offset);
++ }
++ printk("done.\n");
++ return 0;
+ }
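
A stand-alone walk-through of the zone arithmetic above, for three
hypothetical members of 100, 200 and 250 blocks (sizes ascending, as
md_check_ordering() in raid0_run() requires). It prints three zones:
300 blocks striped across all three disks, then 200 across two, then
50 on the largest disk alone:

	#include <stdio.h>

	int main(void)
	{
		int size[3] = { 100, 200, 250 };	/* invented member sizes */
		int nb_dev = 3, current_offset = 0, curr_zone_offset = 0;
		int i, j, c, smallest;

		for (i = 0; i < nb_dev; i++) {
			smallest = 0;
			c = 0;
			for (j = 0; j < nb_dev; j++)
				if (size[j] > current_offset) {
					c++;
					if (!smallest || size[j] < smallest)
						smallest = size[j];
				}
			if (!c)
				break;
			printf("zone %d: %d devs, dev_offset %d, zone_offset %d, size %d\n",
			       i, c, current_offset, curr_zone_offset,
			       (smallest - current_offset) * c);
			curr_zone_offset += (smallest - current_offset) * c;
			current_offset = smallest;
		}
		return 0;
	}
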
+
+-static int raid0_run (int minor, struct md_dev *mddev)
++static int raid0_run (mddev_t *mddev)
+ {
+- int cur=0, i=0, size, zone0_size, nb_zone;
+- struct raid0_data *data;
+-
+- MOD_INC_USE_COUNT;
++ int cur=0, i=0, size, zone0_size, nb_zone;
++ raid0_conf_t *conf;
+
+- if ((mddev->private=vmalloc (sizeof (struct raid0_data))) == NULL) return 1;
+- data=(struct raid0_data *) mddev->private;
+-
+- if (create_strip_zones (minor, mddev))
+- {
+- vfree(data);
+- return 1;
+- }
+-
+- nb_zone=data->nr_zones=
+- md_size[minor]/data->smallest->size +
+- (md_size[minor]%data->smallest->size ? 1 : 0);
+-
+- printk ("raid0 : Allocating %ld bytes for hash.\n",(long)sizeof(struct raid0_hash)*nb_zone);
+- if ((data->hash_table=vmalloc (sizeof (struct raid0_hash)*nb_zone)) == NULL)
+- {
+- vfree(data->strip_zone);
+- vfree(data);
+- return 1;
+- }
+- size=data->strip_zone[cur].size;
+-
+- i=0;
+- while (cur<data->nr_strip_zones)
+- {
+- data->hash_table[i].zone0=data->strip_zone+cur;
+-
+- if (size>=data->smallest->size)/* If we completely fill the slot */
+- {
+- data->hash_table[i++].zone1=NULL;
+- size-=data->smallest->size;
+-
+- if (!size)
+- {
+- if (++cur==data->nr_strip_zones) continue;
+- size=data->strip_zone[cur].size;
+- }
+-
+- continue;
+- }
+-
+- if (++cur==data->nr_strip_zones) /* Last dev, set unit1 as NULL */
+- {
+- data->hash_table[i].zone1=NULL;
+- continue;
+- }
+-
+- zone0_size=size; /* Here, we use a 2nd dev to fill the slot */
+- size=data->strip_zone[cur].size;
+- data->hash_table[i++].zone1=data->strip_zone+cur;
+- size-=(data->smallest->size - zone0_size);
+- }
++ MOD_INC_USE_COUNT;
+
+- return (0);
++ conf = vmalloc(sizeof (raid0_conf_t));
++ if (!conf)
++ goto out;
++ mddev->private = (void *)conf;
++
++ if (md_check_ordering(mddev)) {
++ printk("raid0: disks are not ordered, aborting!\n");
++ goto out_free_conf;
++ }
++
++ if (create_strip_zones (mddev))
++ goto out_free_conf;
++
++ printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]);
++ printk("raid0 : conf->smallest->size is %d blocks.\n", conf->smallest->size);
++ nb_zone = md_size[mdidx(mddev)]/conf->smallest->size +
++ (md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
++ printk("raid0 : nb_zone is %d.\n", nb_zone);
++ conf->nr_zones = nb_zone;
++
++ printk("raid0 : Allocating %d bytes for hash.\n",
++ sizeof(struct raid0_hash)*nb_zone);
++
++ conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
++ if (!conf->hash_table)
++ goto out_free_zone_conf;
++ size = conf->strip_zone[cur].size;
++
++ i = 0;
++ while (cur < conf->nr_strip_zones) {
++ conf->hash_table[i].zone0 = conf->strip_zone + cur;
++
++ /*
++ * If we completely fill the slot
++ */
++ if (size >= conf->smallest->size) {
++ conf->hash_table[i++].zone1 = NULL;
++ size -= conf->smallest->size;
++
++ if (!size) {
++ if (++cur == conf->nr_strip_zones)
++ continue;
++ size = conf->strip_zone[cur].size;
++ }
++ continue;
++ }
++ if (++cur == conf->nr_strip_zones) {
++ /*
++ * Last dev, set unit1 as NULL
++ */
++ conf->hash_table[i].zone1=NULL;
++ continue;
++ }
++
++ /*
++ * Here we use a 2nd dev to fill the slot
++ */
++ zone0_size = size;
++ size = conf->strip_zone[cur].size;
++ conf->hash_table[i++].zone1 = conf->strip_zone + cur;
++ size -= (conf->smallest->size - zone0_size);
++ }
++ return 0;
++
++out_free_zone_conf:
++ vfree(conf->strip_zone);
++ conf->strip_zone = NULL;
++
++out_free_conf:
++ vfree(conf);
++ mddev->private = NULL;
++out:
++ MOD_DEC_USE_COUNT;
++ return 1;
+ }
+
+-
+-static int raid0_stop (int minor, struct md_dev *mddev)
++static int raid0_stop (mddev_t *mddev)
+ {
+- struct raid0_data *data=(struct raid0_data *) mddev->private;
++ raid0_conf_t *conf = mddev_to_conf(mddev);
+
+- vfree (data->hash_table);
+- vfree (data->strip_zone);
+- vfree (data);
++ vfree (conf->hash_table);
++ conf->hash_table = NULL;
++ vfree (conf->strip_zone);
++ conf->strip_zone = NULL;
++ vfree (conf);
++ mddev->private = NULL;
+
+- MOD_DEC_USE_COUNT;
+- return 0;
++ MOD_DEC_USE_COUNT;
++ return 0;
+ }
+
+ /*
+@@ -167,135 +221,140 @@
+ * Of course, those facts may not be valid anymore (and surely won't...)
+ * Hey guys, there's some work out there ;-)
+ */
+-static int raid0_map (struct md_dev *mddev, kdev_t *rdev,
++static int raid0_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
+ unsigned long *rsector, unsigned long size)
+ {
+- struct raid0_data *data=(struct raid0_data *) mddev->private;
+- static struct raid0_hash *hash;
+- struct strip_zone *zone;
+- struct real_dev *tmp_dev;
+- int blk_in_chunk, factor, chunk, chunk_size;
+- long block, rblock;
+-
+- factor=FACTOR(mddev);
+- chunk_size=(1UL << FACTOR_SHIFT(factor));
+- block=*rsector >> 1;
+- hash=data->hash_table+(block/data->smallest->size);
+-
+- if (hash - data->hash_table > data->nr_zones)
+- {
+- printk(KERN_DEBUG "raid0_map: invalid block %li\n", block);
+- return -1;
+- }
+-
+- /* Sanity check */
+- if ((chunk_size*2)<(*rsector % (chunk_size*2))+size)
+- {
+- printk ("raid0_convert : can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
+- return (-1);
+- }
+-
+- if (block >= (hash->zone0->size +
+- hash->zone0->zone_offset))
+- {
+- if (!hash->zone1)
+- {
+- printk ("raid0_convert : hash->zone1==NULL for block %ld\n", block);
+- return (-1);
+- }
+-
+- zone=hash->zone1;
+- }
+- else
+- zone=hash->zone0;
++ raid0_conf_t *conf = mddev_to_conf(mddev);
++ struct raid0_hash *hash;
++ struct strip_zone *zone;
++ mdk_rdev_t *tmp_dev;
++ int blk_in_chunk, chunksize_bits, chunk, chunk_size;
++ long block, rblock;
++
++ chunk_size = mddev->param.chunk_size >> 10;
++ chunksize_bits = ffz(~chunk_size);
++ block = *rsector >> 1;
++ hash = conf->hash_table + block / conf->smallest->size;
++
++ if (hash - conf->hash_table > conf->nr_zones) {
++ printk(KERN_DEBUG "raid0_map: invalid block %lu\n", block);
++ return -1;
++ }
++
++ /* Sanity check */
++ if ((chunk_size * 2) < (*rsector % (chunk_size * 2)) + size)
++ goto bad_map;
++
++ if (!hash)
++ goto bad_hash;
++
++ if (!hash->zone0)
++ goto bad_zone0;
++
++ if (block >= (hash->zone0->size + hash->zone0->zone_offset)) {
++ if (!hash->zone1)
++ goto bad_zone1;
++ zone = hash->zone1;
++ } else
++ zone = hash->zone0;
+
+- blk_in_chunk=block & (chunk_size -1);
+- chunk=(block - zone->zone_offset) / (zone->nb_dev<<FACTOR_SHIFT(factor));
+- tmp_dev=zone->dev[(block >> FACTOR_SHIFT(factor)) % zone->nb_dev];
+- rblock=(chunk << FACTOR_SHIFT(factor)) + blk_in_chunk + zone->dev_offset;
++ blk_in_chunk = block & (chunk_size -1);
++ chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
++ tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev];
++ rblock = (chunk << chunksize_bits) + blk_in_chunk + zone->dev_offset;
+
+- *rdev=tmp_dev->dev;
+- *rsector=rblock<<1;
++ *rdev = tmp_dev->dev;
++ *rsector = rblock << 1;
+
+- return (0);
++ return 0;
++
++bad_map:
++ printk ("raid0_map bug: can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
++ return -1;
++bad_hash:
++ printk("raid0_map bug: hash==NULL for block %ld\n", block);
++ return -1;
++bad_zone0:
++ printk ("raid0_map bug: hash->zone0==NULL for block %ld\n", block);
++ return -1;
++bad_zone1:
++ printk ("raid0_map bug: hash->zone1==NULL for block %ld\n", block);
++ return -1;
+ }
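
The mapping arithmetic is compact enough to check by hand; a
stand-alone sketch with invented numbers, where the zone starts at
offset 0 so zone-relative and absolute block numbers coincide
(chunksize_bits is what ffz(~chunk_size) yields for a power of two):

	#include <stdio.h>

	int main(void)
	{
		unsigned int chunk_size = 32;	/* blocks per chunk, example */
		unsigned int bits = 5;		/* ffz(~32) == 5 */
		unsigned int nb_dev = 3;	/* devices in this zone */
		long block = 1000;		/* example block in the zone */

		int blk_in_chunk = block & (chunk_size - 1);
		long chunk = block / (nb_dev << bits);
		int dev = (block >> bits) % nb_dev;
		long rblock = (chunk << bits) + blk_in_chunk;

		printf("block %ld -> member %d, block %ld (plus dev_offset)\n",
		       block, dev, rblock);
		return 0;
	}

Block 1000 lands in chunk row 10, on member disk 1, at member block
328, to which the real code then adds zone->dev_offset.
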
+
+
+-static int raid0_status (char *page, int minor, struct md_dev *mddev)
++static int raid0_status (char *page, mddev_t *mddev)
+ {
+- int sz=0;
++ int sz = 0;
+ #undef MD_DEBUG
+ #ifdef MD_DEBUG
+- int j, k;
+- struct raid0_data *data=(struct raid0_data *) mddev->private;
++ int j, k;
++ raid0_conf_t *conf = mddev_to_conf(mddev);
+
+- sz+=sprintf (page+sz, " ");
+- for (j=0; j<data->nr_zones; j++)
+- {
+- sz+=sprintf (page+sz, "[z%d",
+- data->hash_table[j].zone0-data->strip_zone);
+- if (data->hash_table[j].zone1)
+- sz+=sprintf (page+sz, "/z%d] ",
+- data->hash_table[j].zone1-data->strip_zone);
+- else
+- sz+=sprintf (page+sz, "] ");
+- }
++ sz += sprintf(page + sz, " ");
++ for (j = 0; j < conf->nr_zones; j++) {
++ sz += sprintf(page + sz, "[z%d",
++ conf->hash_table[j].zone0 - conf->strip_zone);
++ if (conf->hash_table[j].zone1)
++ sz += sprintf(page+sz, "/z%d] ",
++ conf->hash_table[j].zone1 - conf->strip_zone);
++ else
++ sz += sprintf(page+sz, "] ");
++ }
+
+- sz+=sprintf (page+sz, "\n");
++ sz += sprintf(page + sz, "\n");
+
+- for (j=0; j<data->nr_strip_zones; j++)
+- {
+- sz+=sprintf (page+sz, " z%d=[", j);
+- for (k=0; k<data->strip_zone[j].nb_dev; k++)
+- sz+=sprintf (page+sz, "%s/",
+- partition_name(data->strip_zone[j].dev[k]->dev));
+- sz--;
+- sz+=sprintf (page+sz, "] zo=%d do=%d s=%d\n",
+- data->strip_zone[j].zone_offset,
+- data->strip_zone[j].dev_offset,
+- data->strip_zone[j].size);
+- }
++ for (j = 0; j < conf->nr_strip_zones; j++) {
++ sz += sprintf(page + sz, " z%d=[", j);
++ for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
++ sz += sprintf (page+sz, "%s/", partition_name(
++ conf->strip_zone[j].dev[k]->dev));
++ sz--;
++ sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
++ conf->strip_zone[j].zone_offset,
++ conf->strip_zone[j].dev_offset,
++ conf->strip_zone[j].size);
++ }
+ #endif
+- sz+=sprintf (page+sz, " %dk chunks", 1<<FACTOR_SHIFT(FACTOR(mddev)));
+- return sz;
++ sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
++ return sz;
+ }
+
+-
+-static struct md_personality raid0_personality=
++static mdk_personality_t raid0_personality=
+ {
+- "raid0",
+- raid0_map,
+- NULL, /* no special make_request */
+- NULL, /* no special end_request */
+- raid0_run,
+- raid0_stop,
+- raid0_status,
+- NULL, /* no ioctls */
+- 0,
+- NULL, /* no error_handler */
+- NULL, /* hot_add_disk */
+- NULL, /* hot_remove_disk */
+- NULL /* mark_spare */
++ "raid0",
++ raid0_map,
++ NULL, /* no special make_request */
++ NULL, /* no special end_request */
++ raid0_run,
++ raid0_stop,
++ raid0_status,
++ NULL, /* no ioctls */
++ 0,
++ NULL, /* no error_handler */
++ NULL, /* no diskop */
++ NULL, /* no stop resync */
++ NULL /* no restart resync */
+ };
+
+-
+ #ifndef MODULE
+
+ void raid0_init (void)
+ {
+- register_md_personality (RAID0, &raid0_personality);
++ register_md_personality (RAID0, &raid0_personality);
+ }
+
+ #else
+
+ int init_module (void)
+ {
+- return (register_md_personality (RAID0, &raid0_personality));
++ return (register_md_personality (RAID0, &raid0_personality));
+ }
+
+ void cleanup_module (void)
+ {
+- unregister_md_personality (RAID0);
++ unregister_md_personality (RAID0);
+ }
+
+ #endif
++
+diff -uNr linux-orig/drivers/block/raid1.c linux/drivers/block/raid1.c
+--- linux-orig/drivers/block/raid1.c Fri Dec 8 22:53:51 2000
++++ linux/drivers/block/raid1.c Fri Dec 8 22:54:52 2000
+@@ -1,6 +1,6 @@
+-/************************************************************************
++/*
+ * raid1.c : Multiple Devices driver for Linux
+- * Copyright (C) 1996 Ingo Molnar, Miguel de Icaza, Gadi Oxman
++ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+ * RAID-1 management functions.
+ *
+@@ -15,50 +15,52 @@
+ */
+
+ #include <linux/module.h>
+-#include <linux/locks.h>
+ #include <linux/malloc.h>
+-#include <linux/md.h>
+-#include <linux/raid1.h>
+-#include <asm/bitops.h>
++#include <linux/raid/raid1.h>
+ #include <asm/atomic.h>
+
+ #define MAJOR_NR MD_MAJOR
+ #define MD_DRIVER
+ #define MD_PERSONALITY
+
+-/*
+- * The following can be used to debug the driver
+- */
+-/*#define RAID1_DEBUG*/
+-#ifdef RAID1_DEBUG
+-#define PRINTK(x) do { printk x; } while (0);
+-#else
+-#define PRINTK(x) do { ; } while (0);
+-#endif
++#define MAX_LINEAR_SECTORS 128
+
+ #define MAX(a,b) ((a) > (b) ? (a) : (b))
+ #define MIN(a,b) ((a) < (b) ? (a) : (b))
+
+-static struct md_personality raid1_personality;
+-static struct md_thread *raid1_thread = NULL;
++static mdk_personality_t raid1_personality;
+ struct buffer_head *raid1_retry_list = NULL;
+
+-static int __raid1_map (struct md_dev *mddev, kdev_t *rdev,
++static void * raid1_kmalloc (int size)
++{
++ void * ptr;
++ /*
++ * here we would rather be fault tolerant than nice: there
++ * are a couple of places in the RAID code where we simply
++ * cannot afford to fail an allocation because there is no
++ * failure return path (eg. make_request())
++ */
++ while (!(ptr = kmalloc (size, GFP_KERNEL)))
++ printk ("raid1: out of memory, retrying...\n");
++
++ memset(ptr, 0, size);
++ return ptr;
++}
++
++static int __raid1_map (mddev_t *mddev, kdev_t *rdev,
+ unsigned long *rsector, unsigned long size)
+ {
+- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
+- int i, n = raid_conf->raid_disks;
++ raid1_conf_t *conf = mddev_to_conf(mddev);
++ int i, disks = MD_SB_DISKS;
+
+ /*
+ * Later we do read balancing on the read side
+ * now we use the first available disk.
+ */
+
+- PRINTK(("raid1_map().\n"));
+-
+- for (i=0; i<n; i++) {
+- if (raid_conf->mirrors[i].operational) {
+- *rdev = raid_conf->mirrors[i].dev;
++ for (i = 0; i < disks; i++) {
++ if (conf->mirrors[i].operational) {
++ *rdev = conf->mirrors[i].dev;
+ return (0);
+ }
+ }
+@@ -67,29 +69,29 @@
+ return (-1);
+ }
+
+-static int raid1_map (struct md_dev *mddev, kdev_t *rdev,
++static int raid1_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
+ unsigned long *rsector, unsigned long size)
+ {
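++ /*
++ * mapping is a no-op here: the mirror is selected in
++ * raid1_make_request() for normal I/O and in __raid1_map()
++ * on the retry path (see raid1d()).
++ */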
+ return 0;
+ }
+
+-void raid1_reschedule_retry (struct buffer_head *bh)
++static void raid1_reschedule_retry (struct buffer_head *bh)
+ {
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
+-
+- PRINTK(("raid1_reschedule_retry().\n"));
++ mddev_t *mddev = r1_bh->mddev;
++ raid1_conf_t *conf = mddev_to_conf(mddev);
+
+ r1_bh->next_retry = raid1_retry_list;
+ raid1_retry_list = bh;
+- md_wakeup_thread(raid1_thread);
++ md_wakeup_thread(conf->thread);
+ }
+
+ /*
+- * raid1_end_buffer_io() is called when we have finished servicing a mirrored
++ * raid1_end_bh_io() is called when we have finished servicing a mirrored
+ * operation and are ready to return a success/failure code to the buffer
+ * cache layer.
+ */
+-static inline void raid1_end_buffer_io(struct raid1_bh *r1_bh, int uptodate)
++static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
+ {
+ struct buffer_head *bh = r1_bh->master_bh;
+
+@@ -97,8 +99,6 @@
+ kfree(r1_bh);
+ }
+
+-int raid1_one_error=0;
+-
+ void raid1_end_request (struct buffer_head *bh, int uptodate)
+ {
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
+@@ -106,12 +106,7 @@
+
+ save_flags(flags);
+ cli();
+- PRINTK(("raid1_end_request().\n"));
+
+- if (raid1_one_error) {
+- raid1_one_error=0;
+- uptodate=0;
+- }
+ /*
+ * this branch is our 'one mirror IO has finished' event handler:
+ */
+@@ -136,15 +131,11 @@
+ */
+
+ if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
+-
+- PRINTK(("raid1_end_request(), read branch.\n"));
+-
+ /*
+ * we have only one buffer_head on the read side
+ */
+ if (uptodate) {
+- PRINTK(("raid1_end_request(), read branch, uptodate.\n"));
+- raid1_end_buffer_io(r1_bh, uptodate);
++ raid1_end_bh_io(r1_bh, uptodate);
+ restore_flags(flags);
+ return;
+ }
+@@ -152,71 +143,56 @@
+ * oops, read error:
+ */
+ printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
+- kdevname(bh->b_dev), bh->b_blocknr);
+- raid1_reschedule_retry (bh);
++ partition_name(bh->b_dev), bh->b_blocknr);
++ raid1_reschedule_retry(bh);
+ restore_flags(flags);
+ return;
+ }
+
+ /*
+- * WRITE or WRITEA.
+- */
+- PRINTK(("raid1_end_request(), write branch.\n"));
+-
+- /*
++ * WRITE:
++ *
+ * Let's see if all mirrored write operations have finished
+- * already [we have irqs off, so we can decrease]:
++ * already.
+ */
+
+- if (!--r1_bh->remaining) {
+- struct md_dev *mddev = r1_bh->mddev;
+- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
+- int i, n = raid_conf->raid_disks;
++ if (atomic_dec_and_test(&r1_bh->remaining)) {
++ int i, disks = MD_SB_DISKS;
+
+- PRINTK(("raid1_end_request(), remaining == 0.\n"));
++ for ( i = 0; i < disks; i++)
++ if (r1_bh->mirror_bh[i])
++ kfree(r1_bh->mirror_bh[i]);
+
+- for ( i=0; i<n; i++)
+- if (r1_bh->mirror_bh[i]) kfree(r1_bh->mirror_bh[i]);
+-
+- raid1_end_buffer_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
++ raid1_end_bh_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
+ }
+- else PRINTK(("raid1_end_request(), remaining == %u.\n", r1_bh->remaining));
+ restore_flags(flags);
+ }
+
+-/* This routine checks if the undelying device is an md device and in that
+- * case it maps the blocks before putting the request on the queue
++/*
++ * This routine checks if the underlying device is an md device
++ * and in that case it maps the blocks before putting the
++ * request on the queue
+ */
+-static inline void
+-map_and_make_request (int rw, struct buffer_head *bh)
++static void map_and_make_request (int rw, struct buffer_head *bh)
+ {
+ if (MAJOR (bh->b_rdev) == MD_MAJOR)
+- md_map (MINOR (bh->b_rdev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
++ md_map (bh->b_rdev, &bh->b_rdev,
++ &bh->b_rsector, bh->b_size >> 9);
+ clear_bit(BH_Lock, &bh->b_state);
+ make_request (MAJOR (bh->b_rdev), rw, bh);
+ }
+
+-static int
+-raid1_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
++static int raid1_make_request (mddev_t *mddev, int rw,
++ struct buffer_head * bh)
+ {
+-
+- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
++ raid1_conf_t *conf = mddev_to_conf(mddev);
+ struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req;
+ struct raid1_bh * r1_bh;
+- int n = raid_conf->raid_disks, i, sum_bhs = 0, switch_disks = 0, sectors;
++ int disks = MD_SB_DISKS;
++ int i, sum_bhs = 0, switch_disks = 0, sectors, lowprio = 0;
+ struct mirror_info *mirror;
+
+- PRINTK(("raid1_make_request().\n"));
+-
+- while (!( /* FIXME: now we are rather fault tolerant than nice */
+- r1_bh = kmalloc (sizeof (struct raid1_bh), GFP_BUFFER)
+- ) )
+- {
+- printk ("raid1_make_request(#1): out of memory\n");
+- current->policy |= SCHED_YIELD;
+- schedule();
+- }
+- memset (r1_bh, 0, sizeof (struct raid1_bh));
++ r1_bh = raid1_kmalloc (sizeof (struct raid1_bh));
+
+ /*
+ * make_request() can abort the operation when READA or WRITEA are being
+@@ -227,43 +203,65 @@
+ if (rw == READA) rw = READ;
+ if (rw == WRITEA) rw = WRITE;
+
+- if (rw == WRITE || rw == WRITEA)
+- mark_buffer_clean(bh); /* Too early ? */
++ if (rw == WRITE) {
++ /*
++ * Too early ?
++ */
++ mark_buffer_clean(bh);
++ /*
++ * not too early. we _first_ clean the bh, then we start
++ * the IO, then when the IO has finished, we unlock the
++ * bh and mark it uptodate. This way we do not miss the
++ * case when the bh got dirty again during the IO.
++ */
++ }
++
++ /*
++ * special flag for 'lowprio' reconstruction requests ...
++ */
++ if (buffer_lowprio(bh))
++ lowprio = 1;
+
+ /*
+- * i think the read and write branch should be separated completely, since we want
+- * to do read balancing on the read side for example. Comments? :) --mingo
++ * i think the read and write branch should be separated completely,
++ * since we want to do read balancing on the read side for example.
++ * Comments? :) --mingo
+ */
+
+ r1_bh->master_bh=bh;
+ r1_bh->mddev=mddev;
+ r1_bh->cmd = rw;
+
+- if (rw==READ || rw==READA) {
+- int last_used = raid_conf->last_used;
+- PRINTK(("raid1_make_request(), read branch.\n"));
+- mirror = raid_conf->mirrors + last_used;
++ if (rw==READ) {
++ int last_used = conf->last_used;
++
++ /*
++ * read balancing logic:
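++ * sequential reads keep using the same mirror until
++ * sect_limit sectors have been read from it; any seek or
++ * reaching the limit switches to the next mirror in the ring.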
++ */
++ mirror = conf->mirrors + last_used;
+ bh->b_rdev = mirror->dev;
+ sectors = bh->b_size >> 9;
+- if (bh->b_blocknr * sectors == raid_conf->next_sect) {
+- raid_conf->sect_count += sectors;
+- if (raid_conf->sect_count >= mirror->sect_limit)
++
++ if (bh->b_blocknr * sectors == conf->next_sect) {
++ conf->sect_count += sectors;
++ if (conf->sect_count >= mirror->sect_limit)
+ switch_disks = 1;
+ } else
+ switch_disks = 1;
+- raid_conf->next_sect = (bh->b_blocknr + 1) * sectors;
+- if (switch_disks) {
+- PRINTK(("read-balancing: switching %d -> %d (%d sectors)\n", last_used, mirror->next, raid_conf->sect_count));
+- raid_conf->sect_count = 0;
+- last_used = raid_conf->last_used = mirror->next;
++ conf->next_sect = (bh->b_blocknr + 1) * sectors;
++ /*
++ * Do not switch disks if full resync is in progress ...
++ */
++ if (switch_disks && !conf->resync_mirrors) {
++ conf->sect_count = 0;
++ last_used = conf->last_used = mirror->next;
+ /*
+- * Do not switch to write-only disks ... resyncing
+- * is in progress
++ * Do not switch to write-only disks ...
++ * reconstruction is in progress
+ */
+- while (raid_conf->mirrors[last_used].write_only)
+- raid_conf->last_used = raid_conf->mirrors[last_used].next;
++ while (conf->mirrors[last_used].write_only)
++ conf->last_used = conf->mirrors[last_used].next;
+ }
+- PRINTK (("raid1 read queue: %d %d\n", MAJOR (bh->b_rdev), MINOR (bh->b_rdev)));
+ bh_req = &r1_bh->bh_req;
+ memcpy(bh_req, bh, sizeof(*bh));
+ bh_req->b_end_io = raid1_end_request;
+@@ -273,13 +271,12 @@
+ }
+
+ /*
+- * WRITE or WRITEA.
++ * WRITE:
+ */
+- PRINTK(("raid1_make_request(n=%d), write branch.\n",n));
+
+- for (i = 0; i < n; i++) {
++ for (i = 0; i < disks; i++) {
+
+- if (!raid_conf->mirrors [i].operational) {
++ if (!conf->mirrors[i].operational) {
+ /*
+ * the r1_bh->mirror_bh[i] pointer remains NULL
+ */
+@@ -287,89 +284,91 @@
+ continue;
+ }
+
++ /*
++ * special case for reconstruction ...
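++ * (a lowprio resync write skips the mirror we are
++ * currently reading from -- that one is up to date
++ * by definition)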
++ */
++ if (lowprio && (i == conf->last_used)) {
++ mirror_bh[i] = NULL;
++ continue;
++ }
++
++ /*
++ * We should use a private pool (size depending on NR_REQUEST),
++ * to avoid writes filling up the memory with bhs
++ *
++ * Such pools are much faster than kmalloc anyway (so we waste
++ * almost nothing by not using the master bh when writing and
++ * win a lot of cleanness) but for now we are cool enough. --mingo
++ *
++ * It's safe to sleep here: buffer heads cannot be used in a
++ * shared manner in the write branch. Look how we lock the buffer
++ * at the beginning of this function to grok the difference ;)
++ */
++ mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head));
++ /*
++ * prepare mirrored bh (fields ordered for max mem throughput):
++ */
++ mirror_bh[i]->b_blocknr = bh->b_blocknr;
++ mirror_bh[i]->b_dev = bh->b_dev;
++ mirror_bh[i]->b_rdev = conf->mirrors[i].dev;
++ mirror_bh[i]->b_rsector = bh->b_rsector;
++ mirror_bh[i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
++ if (lowprio)
++ mirror_bh[i]->b_state |= (1<<BH_LowPrio);
++
++ mirror_bh[i]->b_count = 1;
++ mirror_bh[i]->b_size = bh->b_size;
++ mirror_bh[i]->b_data = bh->b_data;
++ mirror_bh[i]->b_list = BUF_LOCKED;
++ mirror_bh[i]->b_end_io = raid1_end_request;
++ mirror_bh[i]->b_dev_id = r1_bh;
++
++ r1_bh->mirror_bh[i] = mirror_bh[i];
++ sum_bhs++;
++ }
++
++ md_atomic_set(&r1_bh->remaining, sum_bhs);
++
+ /*
+- * We should use a private pool (size depending on NR_REQUEST),
+- * to avoid writes filling up the memory with bhs
+- *
+- * Such pools are much faster than kmalloc anyways (so we waste almost
+- * nothing by not using the master bh when writing and win alot of cleanness)
+- *
+- * but for now we are cool enough. --mingo
+- *
+- * It's safe to sleep here, buffer heads cannot be used in a shared
+- * manner in the write branch. Look how we lock the buffer at the beginning
+- * of this function to grok the difference ;)
+- */
+- while (!( /* FIXME: now we are rather fault tolerant than nice */
+- mirror_bh[i] = kmalloc (sizeof (struct buffer_head), GFP_BUFFER)
+- ) )
+- {
+- printk ("raid1_make_request(#2): out of memory\n");
+- current->policy |= SCHED_YIELD;
+- schedule();
+- }
+- memset (mirror_bh[i], 0, sizeof (struct buffer_head));
+-
+- /*
+- * prepare mirrored bh (fields ordered for max mem throughput):
+- */
+- mirror_bh [i]->b_blocknr = bh->b_blocknr;
+- mirror_bh [i]->b_dev = bh->b_dev;
+- mirror_bh [i]->b_rdev = raid_conf->mirrors [i].dev;
+- mirror_bh [i]->b_rsector = bh->b_rsector;
+- mirror_bh [i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
+- mirror_bh [i]->b_count = 1;
+- mirror_bh [i]->b_size = bh->b_size;
+- mirror_bh [i]->b_data = bh->b_data;
+- mirror_bh [i]->b_list = BUF_LOCKED;
+- mirror_bh [i]->b_end_io = raid1_end_request;
+- mirror_bh [i]->b_dev_id = r1_bh;
+-
+- r1_bh->mirror_bh[i] = mirror_bh[i];
+- sum_bhs++;
+- }
+-
+- r1_bh->remaining = sum_bhs;
+-
+- PRINTK(("raid1_make_request(), write branch, sum_bhs=%d.\n",sum_bhs));
+-
+- /*
+- * We have to be a bit careful about the semaphore above, thats why we
+- * start the requests separately. Since kmalloc() could fail, sleep and
+- * make_request() can sleep too, this is the safer solution. Imagine,
+- * end_request decreasing the semaphore before we could have set it up ...
+- * We could play tricks with the semaphore (presetting it and correcting
+- * at the end if sum_bhs is not 'n' but we have to do end_request by hand
+- * if all requests finish until we had a chance to set up the semaphore
+- * correctly ... lots of races).
+- */
+- for (i = 0; i < n; i++)
+- if (mirror_bh [i] != NULL)
+- map_and_make_request (rw, mirror_bh [i]);
++ * We have to be a bit careful about the semaphore above, that's
++ * why we start the requests separately. Since kmalloc() could
++ * fail, sleep and make_request() can sleep too, this is the
++ * safer solution. Imagine, end_request decreasing the semaphore
++ * before we could have set it up ... We could play tricks with
++ * the semaphore (presetting it and correcting at the end if
++ * sum_bhs is not 'n' but we have to do end_request by hand if
++ * all requests finish before we had a chance to set up the
++ * semaphore correctly ... lots of races).
++ */
++ for (i = 0; i < disks; i++)
++ if (mirror_bh[i])
++ map_and_make_request(rw, mirror_bh[i]);
+
+ return (0);
+ }
+
+-static int raid1_status (char *page, int minor, struct md_dev *mddev)
++static int raid1_status (char *page, mddev_t *mddev)
+ {
+- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
++ raid1_conf_t *conf = mddev_to_conf(mddev);
+ int sz = 0, i;
+
+- sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
+- for (i = 0; i < raid_conf->raid_disks; i++)
+- sz += sprintf (page+sz, "%s", raid_conf->mirrors [i].operational ? "U" : "_");
++ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
++ conf->working_disks);
++ for (i = 0; i < conf->raid_disks; i++)
++ sz += sprintf (page+sz, "%s",
++ conf->mirrors[i].operational ? "U" : "_");
+ sz += sprintf (page+sz, "]");
+ return sz;
+ }
+
+-static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index)
++static void unlink_disk (raid1_conf_t *conf, int target)
+ {
+- int disks = raid_conf->raid_disks;
+- int j;
++ int disks = MD_SB_DISKS;
++ int i;
+
+- for (j = 0; j < disks; j++)
+- if (raid_conf->mirrors [j].next == failed_index)
+- raid_conf->mirrors [j].next = raid_conf->mirrors [failed_index].next;
++ for (i = 0; i < disks; i++)
++ if (conf->mirrors[i].next == target)
++ conf->mirrors[i].next = conf->mirrors[target].next;
+ }
+
+ #define LAST_DISK KERN_ALERT \
+@@ -388,48 +387,53 @@
+ #define ALREADY_SYNCING KERN_INFO \
+ "raid1: syncing already in progress.\n"
+
+-static int raid1_error (struct md_dev *mddev, kdev_t dev)
++static void mark_disk_bad (mddev_t *mddev, int failed)
+ {
+- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
+- struct mirror_info *mirror;
+- md_superblock_t *sb = mddev->sb;
+- int disks = raid_conf->raid_disks;
+- int i;
++ raid1_conf_t *conf = mddev_to_conf(mddev);
++ struct mirror_info *mirror = conf->mirrors+failed;
++ mdp_super_t *sb = mddev->sb;
++
++ mirror->operational = 0;
++ unlink_disk(conf, failed);
++ mark_disk_faulty(sb->disks+mirror->number);
++ mark_disk_nonsync(sb->disks+mirror->number);
++ mark_disk_inactive(sb->disks+mirror->number);
++ sb->active_disks--;
++ sb->working_disks--;
++ sb->failed_disks++;
++ mddev->sb_dirty = 1;
++ md_wakeup_thread(conf->thread);
++ conf->working_disks--;
++ printk (DISK_FAILED, partition_name (mirror->dev),
++ conf->working_disks);
++}
+
+- PRINTK(("raid1_error called\n"));
++static int raid1_error (mddev_t *mddev, kdev_t dev)
++{
++ raid1_conf_t *conf = mddev_to_conf(mddev);
++ struct mirror_info * mirrors = conf->mirrors;
++ int disks = MD_SB_DISKS;
++ int i;
+
+- if (raid_conf->working_disks == 1) {
++ if (conf->working_disks == 1) {
+ /*
+ * Uh oh, we can do nothing if this is our last disk, but
+ * first check if this is a queued request for a device
+ * which has just failed.
+ */
+- for (i = 0, mirror = raid_conf->mirrors; i < disks;
+- i++, mirror++)
+- if (mirror->dev == dev && !mirror->operational)
++ for (i = 0; i < disks; i++) {
++ if (mirrors[i].dev==dev && !mirrors[i].operational)
+ return 0;
++ }
+ printk (LAST_DISK);
+ } else {
+- /* Mark disk as unusable */
+- for (i = 0, mirror = raid_conf->mirrors; i < disks;
+- i++, mirror++) {
+- if (mirror->dev == dev && mirror->operational){
+- mirror->operational = 0;
+- raid1_fix_links (raid_conf, i);
+- sb->disks[mirror->number].state |=
+- (1 << MD_FAULTY_DEVICE);
+- sb->disks[mirror->number].state &=
+- ~(1 << MD_SYNC_DEVICE);
+- sb->disks[mirror->number].state &=
+- ~(1 << MD_ACTIVE_DEVICE);
+- sb->active_disks--;
+- sb->working_disks--;
+- sb->failed_disks++;
+- mddev->sb_dirty = 1;
+- md_wakeup_thread(raid1_thread);
+- raid_conf->working_disks--;
+- printk (DISK_FAILED, kdevname (dev),
+- raid_conf->working_disks);
++ /*
++ * Mark disk as unusable
++ */
++ for (i = 0; i < disks; i++) {
++ if (mirrors[i].dev==dev && mirrors[i].operational) {
++ mark_disk_bad (mddev, i);
++ break;
+ }
+ }
+ }
+@@ -442,219 +446,396 @@
+ #undef START_SYNCING
+
+ /*
+- * This is the personality-specific hot-addition routine
++ * Insert the spare disk into the drive-ring
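++ * (it is spliced in after the first mirror that is
++ * operational and not write-only, so the read balancer
++ * can reach it)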
+ */
++static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror)
++{
++ int j, next;
++ int disks = MD_SB_DISKS;
++ struct mirror_info *p = conf->mirrors;
++
++ for (j = 0; j < disks; j++, p++)
++ if (p->operational && !p->write_only) {
++ next = p->next;
++ p->next = mirror->raid_disk;
++ mirror->next = next;
++ return;
++ }
+
+-#define NO_SUPERBLOCK KERN_ERR \
+-"raid1: cannot hot-add disk to the array with no RAID superblock\n"
++ printk("raid1: bug: no read-operational devices\n");
++}
+
+-#define WRONG_LEVEL KERN_ERR \
+-"raid1: hot-add: level of disk is not RAID-1\n"
++static void print_raid1_conf (raid1_conf_t *conf)
++{
++ int i;
++ struct mirror_info *tmp;
+
+-#define HOT_ADD_SUCCEEDED KERN_INFO \
+-"raid1: device %s hot-added\n"
++ printk("RAID1 conf printout:\n");
++ if (!conf) {
++ printk("(conf==NULL)\n");
++ return;
++ }
++ printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
++ conf->raid_disks, conf->nr_disks);
+
+-static int raid1_hot_add_disk (struct md_dev *mddev, kdev_t dev)
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ tmp = conf->mirrors + i;
++ printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
++ i, tmp->spare,tmp->operational,
++ tmp->number,tmp->raid_disk,tmp->used_slot,
++ partition_name(tmp->dev));
++ }
++}
++
++static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
+ {
++ int err = 0;
++ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
++ raid1_conf_t *conf = mddev->private;
++ struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
+ unsigned long flags;
+- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
+- struct mirror_info *mirror;
+- md_superblock_t *sb = mddev->sb;
+- struct real_dev * realdev;
+- int n;
++ mdp_super_t *sb = mddev->sb;
++ mdp_disk_t *failed_desc, *spare_desc, *added_desc;
++
++ save_flags(flags);
++ cli();
+
++ print_raid1_conf(conf);
+ /*
+- * The device has its superblock already read and it was found
+- * to be consistent for generic RAID usage. Now we check whether
+- * it's usable for RAID-1 hot addition.
++ * find the disk ...
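++ * (the first switch only locates the slot indices
++ * involved, the second switch at the end performs the
++ * actual state change)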
+ */
++ switch (state) {
+
+- n = mddev->nb_dev++;
+- realdev = &mddev->devices[n];
+- if (!realdev->sb) {
+- printk (NO_SUPERBLOCK);
+- return -EINVAL;
+- }
+- if (realdev->sb->level != 1) {
+- printk (WRONG_LEVEL);
+- return -EINVAL;
++ case DISKOP_SPARE_ACTIVE:
++
++ /*
++ * Find the failed disk within the RAID1 configuration ...
++ * (this can only be in the first conf->working_disks part)
++ */
++ for (i = 0; i < conf->raid_disks; i++) {
++ tmp = conf->mirrors + i;
++ if ((!tmp->operational && !tmp->spare) ||
++ !tmp->used_slot) {
++ failed_disk = i;
++ break;
++ }
++ }
++ /*
++ * When we activate a spare disk we _must_ have a disk in
++ * the lower (active) part of the array to replace.
++ */
++ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ /* fall through */
++
++ case DISKOP_SPARE_WRITE:
++ case DISKOP_SPARE_INACTIVE:
++
++ /*
++ * Find the spare disk ... (can only be in the 'high'
++ * area of the array)
++ */
++ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
++ tmp = conf->mirrors + i;
++ if (tmp->spare && tmp->number == (*d)->number) {
++ spare_disk = i;
++ break;
++ }
++ }
++ if (spare_disk == -1) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ break;
++
++ case DISKOP_HOT_REMOVE_DISK:
++
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ tmp = conf->mirrors + i;
++ if (tmp->used_slot && (tmp->number == (*d)->number)) {
++ if (tmp->operational) {
++ err = -EBUSY;
++ goto abort;
++ }
++ removed_disk = i;
++ break;
++ }
++ }
++ if (removed_disk == -1) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ break;
++
++ case DISKOP_HOT_ADD_DISK:
++
++ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
++ tmp = conf->mirrors + i;
++ if (!tmp->used_slot) {
++ added_disk = i;
++ break;
++ }
++ }
++ if (added_disk == -1) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ break;
+ }
+- /* FIXME: are there other things left we could sanity-check? */
+
++ switch (state) {
+ /*
+- * We have to disable interrupts, as our RAID-1 state is used
+- * from irq handlers as well.
++ * Switch the spare disk to write-only mode:
+ */
+- save_flags(flags);
+- cli();
++ case DISKOP_SPARE_WRITE:
++ sdisk = conf->mirrors + spare_disk;
++ sdisk->operational = 1;
++ sdisk->write_only = 1;
++ break;
++ /*
++ * Deactivate a spare disk:
++ */
++ case DISKOP_SPARE_INACTIVE:
++ sdisk = conf->mirrors + spare_disk;
++ sdisk->operational = 0;
++ sdisk->write_only = 0;
++ break;
++ /*
++ * Activate (mark read-write) the (now sync) spare disk,
++ * which means we switch it's 'raid position' (->raid_disk)
++ * with the failed disk. (only the first 'conf->nr_disks'
++ * slots are used for 'real' disks and we must preserve this
++ * property)
++ */
++ case DISKOP_SPARE_ACTIVE:
+
+- raid_conf->raid_disks++;
+- mirror = raid_conf->mirrors+n;
++ sdisk = conf->mirrors + spare_disk;
++ fdisk = conf->mirrors + failed_disk;
+
+- mirror->number=n;
+- mirror->raid_disk=n;
+- mirror->dev=dev;
+- mirror->next=0; /* FIXME */
+- mirror->sect_limit=128;
+-
+- mirror->operational=0;
+- mirror->spare=1;
+- mirror->write_only=0;
+-
+- sb->disks[n].state |= (1 << MD_FAULTY_DEVICE);
+- sb->disks[n].state &= ~(1 << MD_SYNC_DEVICE);
+- sb->disks[n].state &= ~(1 << MD_ACTIVE_DEVICE);
+- sb->nr_disks++;
+- sb->spare_disks++;
++ spare_desc = &sb->disks[sdisk->number];
++ failed_desc = &sb->disks[fdisk->number];
+
+- restore_flags(flags);
++ if (spare_desc != *d) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
+
+- md_update_sb(MINOR(dev));
++ if (spare_desc->raid_disk != sdisk->raid_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ if (sdisk->raid_disk != spare_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
+
+- printk (HOT_ADD_SUCCEEDED, kdevname(realdev->dev));
++ if (failed_desc->raid_disk != fdisk->raid_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
+
+- return 0;
+-}
++ if (fdisk->raid_disk != failed_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
+
+-#undef NO_SUPERBLOCK
+-#undef WRONG_LEVEL
+-#undef HOT_ADD_SUCCEEDED
++ /*
++ * do the switch finally
++ */
++ xchg_values(*spare_desc, *failed_desc);
++ xchg_values(*fdisk, *sdisk);
+
+-/*
+- * Insert the spare disk into the drive-ring
+- */
+-static void add_ring(struct raid1_data *raid_conf, struct mirror_info *mirror)
+-{
+- int j, next;
+- struct mirror_info *p = raid_conf->mirrors;
++ /*
++ * (careful, 'failed' and 'spare' are switched from now on)
++ *
++ * we want to preserve linear numbering and we want to
++ * give the proper raid_disk number to the now activated
++ * disk. (this means we switch back these values)
++ */
++
++ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
++ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
++ xchg_values(spare_desc->number, failed_desc->number);
++ xchg_values(sdisk->number, fdisk->number);
+
+- for (j = 0; j < raid_conf->raid_disks; j++, p++)
+- if (p->operational && !p->write_only) {
+- next = p->next;
+- p->next = mirror->raid_disk;
+- mirror->next = next;
+- return;
+- }
+- printk("raid1: bug: no read-operational devices\n");
+-}
++ *d = failed_desc;
+
+-static int raid1_mark_spare(struct md_dev *mddev, md_descriptor_t *spare,
+- int state)
+-{
+- int i = 0, failed_disk = -1;
+- struct raid1_data *raid_conf = mddev->private;
+- struct mirror_info *mirror = raid_conf->mirrors;
+- md_descriptor_t *descriptor;
+- unsigned long flags;
++ if (sdisk->dev == MKDEV(0,0))
++ sdisk->used_slot = 0;
++ /*
++ * this really activates the spare.
++ */
++ fdisk->spare = 0;
++ fdisk->write_only = 0;
++ link_disk(conf, fdisk);
+
+- for (i = 0; i < MD_SB_DISKS; i++, mirror++) {
+- if (mirror->spare && mirror->number == spare->number)
+- goto found;
+- }
+- return 1;
+-found:
+- for (i = 0, mirror = raid_conf->mirrors; i < raid_conf->raid_disks;
+- i++, mirror++)
+- if (!mirror->operational)
+- failed_disk = i;
++ /*
++ * if we activate a spare, we definitely replace a
++ * non-operational disk slot in the 'low' area of
++ * the disk array.
++ */
+
+- save_flags(flags);
+- cli();
+- switch (state) {
+- case SPARE_WRITE:
+- mirror->operational = 1;
+- mirror->write_only = 1;
+- raid_conf->raid_disks = MAX(raid_conf->raid_disks,
+- mirror->raid_disk + 1);
+- break;
+- case SPARE_INACTIVE:
+- mirror->operational = 0;
+- mirror->write_only = 0;
+- break;
+- case SPARE_ACTIVE:
+- mirror->spare = 0;
+- mirror->write_only = 0;
+- raid_conf->working_disks++;
+- add_ring(raid_conf, mirror);
+-
+- if (failed_disk != -1) {
+- descriptor = &mddev->sb->disks[raid_conf->mirrors[failed_disk].number];
+- i = spare->raid_disk;
+- spare->raid_disk = descriptor->raid_disk;
+- descriptor->raid_disk = i;
+- }
+- break;
+- default:
+- printk("raid1_mark_spare: bug: state == %d\n", state);
+- restore_flags(flags);
+- return 1;
++ conf->working_disks++;
++
++ break;
++
++ case DISKOP_HOT_REMOVE_DISK:
++ rdisk = conf->mirrors + removed_disk;
++
++ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ rdisk->dev = MKDEV(0,0);
++ rdisk->used_slot = 0;
++ conf->nr_disks--;
++ break;
++
++ case DISKOP_HOT_ADD_DISK:
++ adisk = conf->mirrors + added_disk;
++ added_desc = *d;
++
++ if (added_disk != added_desc->number) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ adisk->number = added_desc->number;
++ adisk->raid_disk = added_desc->raid_disk;
++ adisk->dev = MKDEV(added_desc->major,added_desc->minor);
++
++ adisk->operational = 0;
++ adisk->write_only = 0;
++ adisk->spare = 1;
++ adisk->used_slot = 1;
++ conf->nr_disks++;
++
++ break;
++
++ default:
++ MD_BUG();
++ err = 1;
++ goto abort;
+ }
++abort:
+ restore_flags(flags);
+- return 0;
++ print_raid1_conf(conf);
++ return err;
+ }
+
++
++#define IO_ERROR KERN_ALERT \
++"raid1: %s: unrecoverable I/O read error for block %lu\n"
++
++#define REDIRECT_SECTOR KERN_ERR \
++"raid1: %s: redirecting sector %lu to another mirror\n"
++
+ /*
+ * This is a kernel thread which:
+ *
+ * 1. Retries failed read operations on working mirrors.
+ * 2. Updates the raid superblock when problems are encountered.
+ */
+-void raid1d (void *data)
++static void raid1d (void *data)
+ {
+ struct buffer_head *bh;
+ kdev_t dev;
+ unsigned long flags;
+- struct raid1_bh * r1_bh;
+- struct md_dev *mddev;
++ struct raid1_bh *r1_bh;
++ mddev_t *mddev;
+
+- PRINTK(("raid1d() active\n"));
+- save_flags(flags);
+- cli();
+ while (raid1_retry_list) {
++ save_flags(flags);
++ cli();
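++ /*
++ * the retry list is also modified from interrupt
++ * context (raid1_reschedule_retry), so unlink the
++ * head entry atomically.
++ */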
+ bh = raid1_retry_list;
+ r1_bh = (struct raid1_bh *)(bh->b_dev_id);
+ raid1_retry_list = r1_bh->next_retry;
+ restore_flags(flags);
+
+- mddev = md_dev + MINOR(bh->b_dev);
++ mddev = kdev_to_mddev(bh->b_dev);
+ if (mddev->sb_dirty) {
+- printk("dirty sb detected, updating.\n");
++ printk(KERN_INFO "dirty sb detected, updating.\n");
+ mddev->sb_dirty = 0;
+- md_update_sb(MINOR(bh->b_dev));
++ md_update_sb(mddev);
+ }
+ dev = bh->b_rdev;
+- __raid1_map (md_dev + MINOR(bh->b_dev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
++ __raid1_map (mddev, &bh->b_rdev, &bh->b_rsector,
++ bh->b_size >> 9);
+ if (bh->b_rdev == dev) {
+- printk (KERN_ALERT
+- "raid1: %s: unrecoverable I/O read error for block %lu\n",
+- kdevname(bh->b_dev), bh->b_blocknr);
+- raid1_end_buffer_io(r1_bh, 0);
++ printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
++ raid1_end_bh_io(r1_bh, 0);
+ } else {
+- printk (KERN_ERR "raid1: %s: redirecting sector %lu to another mirror\n",
+- kdevname(bh->b_dev), bh->b_blocknr);
++ printk (REDIRECT_SECTOR,
++ partition_name(bh->b_dev), bh->b_blocknr);
+ map_and_make_request (r1_bh->cmd, bh);
+ }
+- cli();
+ }
+- restore_flags(flags);
++}
++#undef IO_ERROR
++#undef REDIRECT_SECTOR
++
++/*
++ * Private kernel thread to reconstruct mirrors after an unclean
++ * shutdown.
++ */
++static void raid1syncd (void *data)
++{
++ raid1_conf_t *conf = data;
++ mddev_t *mddev = conf->mddev;
++
++ if (!conf->resync_mirrors)
++ return;
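++ /*
++ * resync_mirrors == 2 means the resync was interrupted
++ * by raid1_stop_resync() -- do not restart it here,
++ * it will be retried on the next array start.
++ */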
++ if (conf->resync_mirrors == 2)
++ return;
++ down(&mddev->recovery_sem);
++ if (md_do_sync(mddev, NULL)) {
++ up(&mddev->recovery_sem);
++ return;
++ }
++ /*
++ * Only if everything went Ok.
++ */
++ conf->resync_mirrors = 0;
++ up(&mddev->recovery_sem);
+ }
+
++
+ /*
+ * This will catch the scenario in which one of the mirrors was
+ * mounted as a normal device rather than as a part of a raid set.
++ *
++ * check_consistency is very personality-dependent; e.g. RAID5 cannot
++ * do this check and uses another method.
+ */
+-static int __check_consistency (struct md_dev *mddev, int row)
++static int __check_consistency (mddev_t *mddev, int row)
+ {
+- struct raid1_data *raid_conf = mddev->private;
++ raid1_conf_t *conf = mddev_to_conf(mddev);
++ int disks = MD_SB_DISKS;
+ kdev_t dev;
+ struct buffer_head *bh = NULL;
+ int i, rc = 0;
+ char *buffer = NULL;
+
+- for (i = 0; i < raid_conf->raid_disks; i++) {
+- if (!raid_conf->mirrors[i].operational)
++ for (i = 0; i < disks; i++) {
++ printk("(checking disk %d)\n",i);
++ if (!conf->mirrors[i].operational)
+ continue;
+- dev = raid_conf->mirrors[i].dev;
++ printk("(really checking disk %d)\n",i);
++ dev = conf->mirrors[i].dev;
+ set_blocksize(dev, 4096);
+ if ((bh = bread(dev, row / 4, 4096)) == NULL)
+ break;
+@@ -683,167 +864,342 @@
+ return rc;
+ }
+
+-static int check_consistency (struct md_dev *mddev)
++static int check_consistency (mddev_t *mddev)
+ {
+- int size = mddev->sb->size;
+- int row;
++ if (__check_consistency(mddev, 0))
++/*
++ * we do not do this currently, as it's perfectly possible to
++ * have an inconsistent array when it's freshly created. Only
++ * newly written data has to be consistent.
++ */
++ return 0;
+
+- for (row = 0; row < size; row += size / 8)
+- if (__check_consistency(mddev, row))
+- return 1;
+ return 0;
+ }
+
+-static int raid1_run (int minor, struct md_dev *mddev)
++#define INVALID_LEVEL KERN_WARNING \
++"raid1: md%d: raid level not set to mirroring (%d)\n"
++
++#define NO_SB KERN_ERR \
++"raid1: disabled mirror %s (couldn't access raid superblock)\n"
++
++#define ERRORS KERN_ERR \
++"raid1: disabled mirror %s (errors detected)\n"
++
++#define NOT_IN_SYNC KERN_ERR \
++"raid1: disabled mirror %s (not in sync)\n"
++
++#define INCONSISTENT KERN_ERR \
++"raid1: disabled mirror %s (inconsistent descriptor)\n"
++
++#define ALREADY_RUNNING KERN_ERR \
++"raid1: disabled mirror %s (mirror %d already operational)\n"
++
++#define OPERATIONAL KERN_INFO \
++"raid1: device %s operational as mirror %d\n"
++
++#define MEM_ERROR KERN_ERR \
++"raid1: couldn't allocate memory for md%d\n"
++
++#define SPARE KERN_INFO \
++"raid1: spare disk %s\n"
++
++#define NONE_OPERATIONAL KERN_ERR \
++"raid1: no operational mirrors for md%d\n"
++
++#define RUNNING_CKRAID KERN_ERR \
++"raid1: detected mirror differences -- running resync\n"
++
++#define ARRAY_IS_ACTIVE KERN_INFO \
++"raid1: raid set md%d active with %d out of %d mirrors\n"
++
++#define THREAD_ERROR KERN_ERR \
++"raid1: couldn't allocate thread for md%d\n"
++
++#define START_RESYNC KERN_WARNING \
++"raid1: raid set md%d not clean; reconstructing mirrors\n"
++
++static int raid1_run (mddev_t *mddev)
+ {
+- struct raid1_data *raid_conf;
+- int i, j, raid_disk;
+- md_superblock_t *sb = mddev->sb;
+- md_descriptor_t *descriptor;
+- struct real_dev *realdev;
++ raid1_conf_t *conf;
++ int i, j, disk_idx;
++ struct mirror_info *disk;
++ mdp_super_t *sb = mddev->sb;
++ mdp_disk_t *descriptor;
++ mdk_rdev_t *rdev;
++ struct md_list_head *tmp;
++ int start_recovery = 0;
+
+ MOD_INC_USE_COUNT;
+
+ if (sb->level != 1) {
+- printk("raid1: %s: raid level not set to mirroring (%d)\n",
+- kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
+- MOD_DEC_USE_COUNT;
+- return -EIO;
+- }
+- /****
+- * copy the now verified devices into our private RAID1 bookkeeping
+- * area. [whatever we allocate in raid1_run(), should be freed in
+- * raid1_stop()]
++ printk(INVALID_LEVEL, mdidx(mddev), sb->level);
++ goto out;
++ }
++ /*
++ * copy the already verified devices into our private RAID1
++ * bookkeeping area. [whatever we allocate in raid1_run(),
++ * should be freed in raid1_stop()]
+ */
+
+- while (!( /* FIXME: now we are rather fault tolerant than nice */
+- mddev->private = kmalloc (sizeof (struct raid1_data), GFP_KERNEL)
+- ) )
+- {
+- printk ("raid1_run(): out of memory\n");
+- current->policy |= SCHED_YIELD;
+- schedule();
+- }
+- raid_conf = mddev->private;
+- memset(raid_conf, 0, sizeof(*raid_conf));
+-
+- PRINTK(("raid1_run(%d) called.\n", minor));
+-
+- for (i = 0; i < mddev->nb_dev; i++) {
+- realdev = &mddev->devices[i];
+- if (!realdev->sb) {
+- printk(KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
++ conf = raid1_kmalloc(sizeof(raid1_conf_t));
++ mddev->private = conf;
++ if (!conf) {
++ printk(MEM_ERROR, mdidx(mddev));
++ goto out;
++ }
++
++ ITERATE_RDEV(mddev,rdev,tmp) {
++ if (rdev->faulty) {
++ printk(ERRORS, partition_name(rdev->dev));
++ } else {
++ if (!rdev->sb) {
++ MD_BUG();
++ continue;
++ }
++ }
++ if (rdev->desc_nr == -1) {
++ MD_BUG();
+ continue;
+ }
+-
+- /*
+- * This is important -- we are using the descriptor on
+- * the disk only to get a pointer to the descriptor on
+- * the main superblock, which might be more recent.
+- */
+- descriptor = &sb->disks[realdev->sb->descriptor.number];
+- if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
+- printk(KERN_ERR "raid1: disabled mirror %s (errors detected)\n", kdevname(realdev->dev));
++ descriptor = &sb->disks[rdev->desc_nr];
++ disk_idx = descriptor->raid_disk;
++ disk = conf->mirrors + disk_idx;
++
++ if (disk_faulty(descriptor)) {
++ disk->number = descriptor->number;
++ disk->raid_disk = disk_idx;
++ disk->dev = rdev->dev;
++ disk->sect_limit = MAX_LINEAR_SECTORS;
++ disk->operational = 0;
++ disk->write_only = 0;
++ disk->spare = 0;
++ disk->used_slot = 1;
+ continue;
+ }
+- if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
+- if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
+- printk(KERN_ERR "raid1: disabled mirror %s (not in sync)\n", kdevname(realdev->dev));
++ if (disk_active(descriptor)) {
++ if (!disk_sync(descriptor)) {
++ printk(NOT_IN_SYNC,
++ partition_name(rdev->dev));
+ continue;
+ }
+- raid_disk = descriptor->raid_disk;
+- if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
+- printk(KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n", kdevname(realdev->dev));
++ if ((descriptor->number > MD_SB_DISKS) ||
++ (disk_idx > sb->raid_disks)) {
++
++ printk(INCONSISTENT,
++ partition_name(rdev->dev));
+ continue;
+ }
+- if (raid_conf->mirrors[raid_disk].operational) {
+- printk(KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n", kdevname(realdev->dev), raid_disk);
++ if (disk->operational) {
++ printk(ALREADY_RUNNING,
++ partition_name(rdev->dev),
++ disk_idx);
+ continue;
+ }
+- printk(KERN_INFO "raid1: device %s operational as mirror %d\n", kdevname(realdev->dev), raid_disk);
+- raid_conf->mirrors[raid_disk].number = descriptor->number;
+- raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
+- raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
+- raid_conf->mirrors[raid_disk].operational = 1;
+- raid_conf->mirrors[raid_disk].sect_limit = 128;
+- raid_conf->working_disks++;
++ printk(OPERATIONAL, partition_name(rdev->dev),
++ disk_idx);
++ disk->number = descriptor->number;
++ disk->raid_disk = disk_idx;
++ disk->dev = rdev->dev;
++ disk->sect_limit = MAX_LINEAR_SECTORS;
++ disk->operational = 1;
++ disk->write_only = 0;
++ disk->spare = 0;
++ disk->used_slot = 1;
++ conf->working_disks++;
+ } else {
+ /*
+ * Must be a spare disk ..
+ */
+- printk(KERN_INFO "raid1: spare disk %s\n", kdevname(realdev->dev));
+- raid_disk = descriptor->raid_disk;
+- raid_conf->mirrors[raid_disk].number = descriptor->number;
+- raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
+- raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
+- raid_conf->mirrors[raid_disk].sect_limit = 128;
+-
+- raid_conf->mirrors[raid_disk].operational = 0;
+- raid_conf->mirrors[raid_disk].write_only = 0;
+- raid_conf->mirrors[raid_disk].spare = 1;
+- }
+- }
+- if (!raid_conf->working_disks) {
+- printk(KERN_ERR "raid1: no operational mirrors for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
+- kfree(raid_conf);
+- mddev->private = NULL;
+- MOD_DEC_USE_COUNT;
+- return -EIO;
+- }
+-
+- raid_conf->raid_disks = sb->raid_disks;
+- raid_conf->mddev = mddev;
+-
+- for (j = 0; !raid_conf->mirrors[j].operational; j++);
+- raid_conf->last_used = j;
+- for (i = raid_conf->raid_disks - 1; i >= 0; i--) {
+- if (raid_conf->mirrors[i].operational) {
+- PRINTK(("raid_conf->mirrors[%d].next == %d\n", i, j));
+- raid_conf->mirrors[i].next = j;
++ printk(SPARE, partition_name(rdev->dev));
++ disk->number = descriptor->number;
++ disk->raid_disk = disk_idx;
++ disk->dev = rdev->dev;
++ disk->sect_limit = MAX_LINEAR_SECTORS;
++ disk->operational = 0;
++ disk->write_only = 0;
++ disk->spare = 1;
++ disk->used_slot = 1;
++ }
++ }
++ if (!conf->working_disks) {
++ printk(NONE_OPERATIONAL, mdidx(mddev));
++ goto out_free_conf;
++ }
++
++ conf->raid_disks = sb->raid_disks;
++ conf->nr_disks = sb->nr_disks;
++ conf->mddev = mddev;
++
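++ /*
++ * second pass: fill in mirror slots for failed disks that
++ * are in the superblock but no longer attached as rdevs,
++ * so the mirror table matches the superblock.
++ */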
++ for (i = 0; i < MD_SB_DISKS; i++) {
++
++ descriptor = sb->disks+i;
++ disk_idx = descriptor->raid_disk;
++ disk = conf->mirrors + disk_idx;
++
++ if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
++ !disk->used_slot) {
++
++ disk->number = descriptor->number;
++ disk->raid_disk = disk_idx;
++ disk->dev = MKDEV(0,0);
++
++ disk->operational = 0;
++ disk->write_only = 0;
++ disk->spare = 0;
++ disk->used_slot = 1;
++ }
++ }
++
++ /*
++ * find the first working one and use it as a starting point
++ * to read balancing.
++ */
++ for (j = 0; !conf->mirrors[j].operational; j++)
++ /* nothing */;
++ conf->last_used = j;
++
++ /*
++ * initialize the 'working disks' list.
++ */
++ for (i = conf->raid_disks - 1; i >= 0; i--) {
++ if (conf->mirrors[i].operational) {
++ conf->mirrors[i].next = j;
+ j = i;
+ }
+ }
+
+- if (check_consistency(mddev)) {
+- printk(KERN_ERR "raid1: detected mirror differences -- run ckraid\n");
+- sb->state |= 1 << MD_SB_ERRORS;
+- kfree(raid_conf);
+- mddev->private = NULL;
+- MOD_DEC_USE_COUNT;
+- return -EIO;
++ if (conf->working_disks != sb->raid_disks) {
++ printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
++ start_recovery = 1;
+ }
+
++ if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
++ /*
++ * we do sanity checks even if the device says
++ * it's clean ...
++ */
++ if (check_consistency(mddev)) {
++ printk(RUNNING_CKRAID);
++ sb->state &= ~(1 << MD_SB_CLEAN);
++ }
++ }
++
++ {
++ const char * name = "raid1d";
++
++ conf->thread = md_register_thread(raid1d, conf, name);
++ if (!conf->thread) {
++ printk(THREAD_ERROR, mdidx(mddev));
++ goto out_free_conf;
++ }
++ }
++
++ if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
++ const char * name = "raid1syncd";
++
++ conf->resync_thread = md_register_thread(raid1syncd, conf,name);
++ if (!conf->resync_thread) {
++ printk(THREAD_ERROR, mdidx(mddev));
++ goto out_free_conf;
++ }
++
++ printk(START_RESYNC, mdidx(mddev));
++ conf->resync_mirrors = 1;
++ md_wakeup_thread(conf->resync_thread);
++ }
++
+ /*
+ * Regenerate the "device is in sync with the raid set" bit for
+ * each device.
+ */
+- for (i = 0; i < sb->nr_disks ; i++) {
+- sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ mark_disk_nonsync(sb->disks+i);
+ for (j = 0; j < sb->raid_disks; j++) {
+- if (!raid_conf->mirrors[j].operational)
++ if (!conf->mirrors[j].operational)
+ continue;
+- if (sb->disks[i].number == raid_conf->mirrors[j].number)
+- sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
++ if (sb->disks[i].number == conf->mirrors[j].number)
++ mark_disk_sync(sb->disks+i);
+ }
+ }
+- sb->active_disks = raid_conf->working_disks;
++ sb->active_disks = conf->working_disks;
+
+- printk("raid1: raid set %s active with %d out of %d mirrors\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks);
+- /* Ok, everything is just fine now */
+- return (0);
++ if (start_recovery)
++ md_recover_arrays();
++
++
++ printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
++ /*
++ * Ok, everything is just fine now
++ */
++ return 0;
++
++out_free_conf:
++ kfree(conf);
++ mddev->private = NULL;
++out:
++ MOD_DEC_USE_COUNT;
++ return -EIO;
++}
++
++#undef INVALID_LEVEL
++#undef NO_SB
++#undef ERRORS
++#undef NOT_IN_SYNC
++#undef INCONSISTENT
++#undef ALREADY_RUNNING
++#undef OPERATIONAL
++#undef SPARE
++#undef NONE_OPERATIONAL
++#undef RUNNING_CKRAID
++#undef ARRAY_IS_ACTIVE
++
++static int raid1_stop_resync (mddev_t *mddev)
++{
++ raid1_conf_t *conf = mddev_to_conf(mddev);
++
++ if (conf->resync_thread) {
++ if (conf->resync_mirrors) {
++ conf->resync_mirrors = 2;
++ md_interrupt_thread(conf->resync_thread);
++ printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
++ return 1;
++ }
++ return 0;
++ }
++ return 0;
++}
++
++static int raid1_restart_resync (mddev_t *mddev)
++{
++ raid1_conf_t *conf = mddev_to_conf(mddev);
++
++ if (conf->resync_mirrors) {
++ if (!conf->resync_thread) {
++ MD_BUG();
++ return 0;
++ }
++ conf->resync_mirrors = 1;
++ md_wakeup_thread(conf->resync_thread);
++ return 1;
++ }
++ return 0;
+ }
+
+-static int raid1_stop (int minor, struct md_dev *mddev)
++static int raid1_stop (mddev_t *mddev)
+ {
+- struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
++ raid1_conf_t *conf = mddev_to_conf(mddev);
+
+- kfree (raid_conf);
++ md_unregister_thread(conf->thread);
++ if (conf->resync_thread)
++ md_unregister_thread(conf->resync_thread);
++ kfree(conf);
+ mddev->private = NULL;
+ MOD_DEC_USE_COUNT;
+ return 0;
+ }
+
+-static struct md_personality raid1_personality=
++static mdk_personality_t raid1_personality=
+ {
+ "raid1",
+ raid1_map,
+@@ -855,15 +1211,13 @@
+ NULL, /* no ioctls */
+ 0,
+ raid1_error,
+- raid1_hot_add_disk,
+- /* raid1_hot_remove_drive */ NULL,
+- raid1_mark_spare
++ raid1_diskop,
++ raid1_stop_resync,
++ raid1_restart_resync
+ };
+
+ int raid1_init (void)
+ {
+- if ((raid1_thread = md_register_thread(raid1d, NULL)) == NULL)
+- return -EBUSY;
+ return register_md_personality (RAID1, &raid1_personality);
+ }
+
+@@ -875,7 +1229,6 @@
+
+ void cleanup_module (void)
+ {
+- md_unregister_thread (raid1_thread);
+ unregister_md_personality (RAID1);
+ }
+ #endif
+diff -uNr linux-orig/drivers/block/raid5.c linux/drivers/block/raid5.c
+--- linux-orig/drivers/block/raid5.c Fri May 8 00:17:13 1998
++++ linux/drivers/block/raid5.c Fri Dec 8 22:54:52 2000
+@@ -1,4 +1,4 @@
+-/*****************************************************************************
++/*
+ * raid5.c : Multiple Devices driver for Linux
+ * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+@@ -14,16 +14,15 @@
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
++
+ #include <linux/module.h>
+ #include <linux/locks.h>
+ #include <linux/malloc.h>
+-#include <linux/md.h>
+-#include <linux/raid5.h>
++#include <linux/raid/raid5.h>
+ #include <asm/bitops.h>
+ #include <asm/atomic.h>
+-#include <asm/md.h>
+
+-static struct md_personality raid5_personality;
++static mdk_personality_t raid5_personality;
+
+ /*
+ * Stripe cache
+@@ -33,7 +32,7 @@
+ #define HASH_PAGES_ORDER 0
+ #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
+ #define HASH_MASK (NR_HASH - 1)
+-#define stripe_hash(raid_conf, sect, size) ((raid_conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
++#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
+
+ /*
+ * The following can be used to debug the driver
+@@ -46,6 +45,8 @@
+ #define PRINTK(x) do { ; } while (0)
+ #endif
+
++static void print_raid5_conf (raid5_conf_t *conf);
++
+ static inline int stripe_locked(struct stripe_head *sh)
+ {
+ return test_bit(STRIPE_LOCKED, &sh->state);
+@@ -61,32 +62,32 @@
+ */
+ static inline void lock_stripe(struct stripe_head *sh)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- if (!test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
++ raid5_conf_t *conf = sh->raid_conf;
++ if (!md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
+ PRINTK(("locking stripe %lu\n", sh->sector));
+- raid_conf->nr_locked_stripes++;
++ conf->nr_locked_stripes++;
+ }
+ }
+
+ static inline void unlock_stripe(struct stripe_head *sh)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- if (test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
++ raid5_conf_t *conf = sh->raid_conf;
++ if (md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
+ PRINTK(("unlocking stripe %lu\n", sh->sector));
+- raid_conf->nr_locked_stripes--;
++ conf->nr_locked_stripes--;
+ wake_up(&sh->wait);
+ }
+ }
+
+ static inline void finish_stripe(struct stripe_head *sh)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
++ raid5_conf_t *conf = sh->raid_conf;
+ unlock_stripe(sh);
+ sh->cmd = STRIPE_NONE;
+ sh->phase = PHASE_COMPLETE;
+- raid_conf->nr_pending_stripes--;
+- raid_conf->nr_cached_stripes++;
+- wake_up(&raid_conf->wait_for_stripe);
++ conf->nr_pending_stripes--;
++ conf->nr_cached_stripes++;
++ wake_up(&conf->wait_for_stripe);
+ }
+
+ void __wait_on_stripe(struct stripe_head *sh)
+@@ -114,7 +115,7 @@
+ __wait_on_stripe(sh);
+ }
+
+-static inline void remove_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
++static inline void remove_hash(raid5_conf_t *conf, struct stripe_head *sh)
+ {
+ PRINTK(("remove_hash(), stripe %lu\n", sh->sector));
+
+@@ -123,21 +124,22 @@
+ sh->hash_next->hash_pprev = sh->hash_pprev;
+ *sh->hash_pprev = sh->hash_next;
+ sh->hash_pprev = NULL;
+- raid_conf->nr_hashed_stripes--;
++ conf->nr_hashed_stripes--;
+ }
+ }
+
+-static inline void insert_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
++static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
+ {
+- struct stripe_head **shp = &stripe_hash(raid_conf, sh->sector, sh->size);
++ struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size);
+
+- PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", sh->sector, raid_conf->nr_hashed_stripes));
++ PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n",
++ sh->sector, conf->nr_hashed_stripes));
+
+ if ((sh->hash_next = *shp) != NULL)
+ (*shp)->hash_pprev = &sh->hash_next;
+ *shp = sh;
+ sh->hash_pprev = shp;
+- raid_conf->nr_hashed_stripes++;
++ conf->nr_hashed_stripes++;
+ }
+
+ static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size)
+@@ -145,13 +147,15 @@
+ struct buffer_head *bh;
+ unsigned long flags;
+
+- save_flags(flags);
+- cli();
+- if ((bh = sh->buffer_pool) == NULL)
+- return NULL;
++ md_spin_lock_irqsave(&sh->stripe_lock, flags);
++ bh = sh->buffer_pool;
++ if (!bh)
++ goto out_unlock;
+ sh->buffer_pool = bh->b_next;
+ bh->b_size = b_size;
+- restore_flags(flags);
++out_unlock:
++ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
++
+ return bh;
+ }
+
+@@ -160,12 +164,14 @@
+ struct buffer_head *bh;
+ unsigned long flags;
+
+- save_flags(flags);
+- cli();
+- if ((bh = sh->bh_pool) == NULL)
+- return NULL;
++ md_spin_lock_irqsave(&sh->stripe_lock, flags);
++ bh = sh->bh_pool;
++ if (!bh)
++ goto out_unlock;
+ sh->bh_pool = bh->b_next;
+- restore_flags(flags);
++out_unlock:
++ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
++
+ return bh;
+ }
+
+@@ -173,54 +179,52 @@
+ {
+ unsigned long flags;
+
+- save_flags(flags);
+- cli();
++ md_spin_lock_irqsave(&sh->stripe_lock, flags);
+ bh->b_next = sh->buffer_pool;
+ sh->buffer_pool = bh;
+- restore_flags(flags);
++ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
+ }
+
+ static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh)
+ {
+ unsigned long flags;
+
+- save_flags(flags);
+- cli();
++ md_spin_lock_irqsave(&sh->stripe_lock, flags);
+ bh->b_next = sh->bh_pool;
+ sh->bh_pool = bh;
+- restore_flags(flags);
++ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
+ }
+
+-static struct stripe_head *get_free_stripe(struct raid5_data *raid_conf)
++static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
+ {
+ struct stripe_head *sh;
+ unsigned long flags;
+
+ save_flags(flags);
+ cli();
+- if ((sh = raid_conf->free_sh_list) == NULL) {
++ if ((sh = conf->free_sh_list) == NULL) {
+ restore_flags(flags);
+ return NULL;
+ }
+- raid_conf->free_sh_list = sh->free_next;
+- raid_conf->nr_free_sh--;
+- if (!raid_conf->nr_free_sh && raid_conf->free_sh_list)
++ conf->free_sh_list = sh->free_next;
++ conf->nr_free_sh--;
++ if (!conf->nr_free_sh && conf->free_sh_list)
+ printk ("raid5: bug: free_sh_list != NULL, nr_free_sh == 0\n");
+ restore_flags(flags);
+- if (sh->hash_pprev || sh->nr_pending || sh->count)
++ if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || sh->count)
+ printk("get_free_stripe(): bug\n");
+ return sh;
+ }
+
+-static void put_free_stripe(struct raid5_data *raid_conf, struct stripe_head *sh)
++static void put_free_stripe(raid5_conf_t *conf, struct stripe_head *sh)
+ {
+ unsigned long flags;
+
+ save_flags(flags);
+ cli();
+- sh->free_next = raid_conf->free_sh_list;
+- raid_conf->free_sh_list = sh;
+- raid_conf->nr_free_sh++;
++ sh->free_next = conf->free_sh_list;
++ conf->free_sh_list = sh;
++ conf->nr_free_sh++;
+ restore_flags(flags);
+ }
+
+@@ -324,8 +328,8 @@
+
+ static void kfree_stripe(struct stripe_head *sh)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- int disks = raid_conf->raid_disks, j;
++ raid5_conf_t *conf = sh->raid_conf;
++ int disks = conf->raid_disks, j;
+
+ PRINTK(("kfree_stripe called, stripe %lu\n", sh->sector));
+ if (sh->phase != PHASE_COMPLETE || stripe_locked(sh) || sh->count) {
+@@ -338,19 +342,19 @@
+ if (sh->bh_new[j] || sh->bh_copy[j])
+ printk("raid5: bug: sector %lu, new %p, copy %p\n", sh->sector, sh->bh_new[j], sh->bh_copy[j]);
+ }
+- remove_hash(raid_conf, sh);
+- put_free_stripe(raid_conf, sh);
++ remove_hash(conf, sh);
++ put_free_stripe(conf, sh);
+ }
+
+-static int shrink_stripe_cache(struct raid5_data *raid_conf, int nr)
++static int shrink_stripe_cache(raid5_conf_t *conf, int nr)
+ {
+ struct stripe_head *sh;
+ int i, count = 0;
+
+- PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, raid_conf->nr_hashed_stripes, raid_conf->clock));
++ PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, conf->nr_hashed_stripes, conf->clock));
+ for (i = 0; i < NR_HASH; i++) {
+ repeat:
+- sh = raid_conf->stripe_hashtbl[(i + raid_conf->clock) & HASH_MASK];
++ sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK];
+ for (; sh; sh = sh->hash_next) {
+ if (sh->phase != PHASE_COMPLETE)
+ continue;
+@@ -360,30 +364,30 @@
+ continue;
+ kfree_stripe(sh);
+ if (++count == nr) {
+- PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
+- raid_conf->clock = (i + raid_conf->clock) & HASH_MASK;
++ PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
++ conf->clock = (i + conf->clock) & HASH_MASK;
+ return nr;
+ }
+ goto repeat;
+ }
+ }
+- PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
++ PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
+ return count;
+ }
+
+-static struct stripe_head *find_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
++static struct stripe_head *find_stripe(raid5_conf_t *conf, unsigned long sector, int size)
+ {
+ struct stripe_head *sh;
+
+- if (raid_conf->buffer_size != size) {
+- PRINTK(("switching size, %d --> %d\n", raid_conf->buffer_size, size));
+- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
+- raid_conf->buffer_size = size;
++ if (conf->buffer_size != size) {
++ PRINTK(("switching size, %d --> %d\n", conf->buffer_size, size));
++ shrink_stripe_cache(conf, conf->max_nr_stripes);
++ conf->buffer_size = size;
+ }
+
+ PRINTK(("find_stripe, sector %lu\n", sector));
+- for (sh = stripe_hash(raid_conf, sector, size); sh; sh = sh->hash_next)
+- if (sh->sector == sector && sh->raid_conf == raid_conf) {
++ for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next)
++ if (sh->sector == sector && sh->raid_conf == conf) {
+ if (sh->size == size) {
+ PRINTK(("found stripe %lu\n", sector));
+ return sh;
+@@ -397,7 +401,7 @@
+ return NULL;
+ }
+
+-static int grow_stripes(struct raid5_data *raid_conf, int num, int priority)
++static int grow_stripes(raid5_conf_t *conf, int num, int priority)
+ {
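++ /*
++ * preallocate 'num' stripe_heads plus their buffer and
++ * bh pools and put them on the free list.
++ */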
+ struct stripe_head *sh;
+
+@@ -405,62 +409,64 @@
+ if ((sh = kmalloc(sizeof(struct stripe_head), priority)) == NULL)
+ return 1;
+ memset(sh, 0, sizeof(*sh));
+- if (grow_buffers(sh, 2 * raid_conf->raid_disks, PAGE_SIZE, priority)) {
+- shrink_buffers(sh, 2 * raid_conf->raid_disks);
++ sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED;
++
++ if (grow_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) {
++ shrink_buffers(sh, 2 * conf->raid_disks);
+ kfree(sh);
+ return 1;
+ }
+- if (grow_bh(sh, raid_conf->raid_disks, priority)) {
+- shrink_buffers(sh, 2 * raid_conf->raid_disks);
+- shrink_bh(sh, raid_conf->raid_disks);
++ if (grow_bh(sh, conf->raid_disks, priority)) {
++ shrink_buffers(sh, 2 * conf->raid_disks);
++ shrink_bh(sh, conf->raid_disks);
+ kfree(sh);
+ return 1;
+ }
+- put_free_stripe(raid_conf, sh);
+- raid_conf->nr_stripes++;
++ put_free_stripe(conf, sh);
++ conf->nr_stripes++;
+ }
+ return 0;
+ }
+
+-static void shrink_stripes(struct raid5_data *raid_conf, int num)
++static void shrink_stripes(raid5_conf_t *conf, int num)
+ {
+ struct stripe_head *sh;
+
+ while (num--) {
+- sh = get_free_stripe(raid_conf);
++ sh = get_free_stripe(conf);
+ if (!sh)
+ break;
+- shrink_buffers(sh, raid_conf->raid_disks * 2);
+- shrink_bh(sh, raid_conf->raid_disks);
++ shrink_buffers(sh, conf->raid_disks * 2);
++ shrink_bh(sh, conf->raid_disks);
+ kfree(sh);
+- raid_conf->nr_stripes--;
++ conf->nr_stripes--;
+ }
+ }
+
+-static struct stripe_head *kmalloc_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
++static struct stripe_head *kmalloc_stripe(raid5_conf_t *conf, unsigned long sector, int size)
+ {
+ struct stripe_head *sh = NULL, *tmp;
+ struct buffer_head *buffer_pool, *bh_pool;
+
+ PRINTK(("kmalloc_stripe called\n"));
+
+- while ((sh = get_free_stripe(raid_conf)) == NULL) {
+- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes / 8);
+- if ((sh = get_free_stripe(raid_conf)) != NULL)
++ while ((sh = get_free_stripe(conf)) == NULL) {
++ shrink_stripe_cache(conf, conf->max_nr_stripes / 8);
++ if ((sh = get_free_stripe(conf)) != NULL)
+ break;
+- if (!raid_conf->nr_pending_stripes)
++ if (!conf->nr_pending_stripes)
+ printk("raid5: bug: nr_free_sh == 0, nr_pending_stripes == 0\n");
+- md_wakeup_thread(raid_conf->thread);
++ md_wakeup_thread(conf->thread);
+ PRINTK(("waiting for some stripes to complete\n"));
+- sleep_on(&raid_conf->wait_for_stripe);
++ sleep_on(&conf->wait_for_stripe);
+ }
+
+ /*
+ * The above might have slept, so perhaps another process
+ * already created the stripe for us..
+ */
+- if ((tmp = find_stripe(raid_conf, sector, size)) != NULL) {
+- put_free_stripe(raid_conf, sh);
++ if ((tmp = find_stripe(conf, sector, size)) != NULL) {
++ put_free_stripe(conf, sh);
+ wait_on_stripe(tmp);
+ return tmp;
+ }
+@@ -472,25 +478,25 @@
+ sh->bh_pool = bh_pool;
+ sh->phase = PHASE_COMPLETE;
+ sh->cmd = STRIPE_NONE;
+- sh->raid_conf = raid_conf;
++ sh->raid_conf = conf;
+ sh->sector = sector;
+ sh->size = size;
+- raid_conf->nr_cached_stripes++;
+- insert_hash(raid_conf, sh);
++ conf->nr_cached_stripes++;
++ insert_hash(conf, sh);
+ } else printk("raid5: bug: kmalloc_stripe() == NULL\n");
+ return sh;
+ }
+
+-static struct stripe_head *get_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
++static struct stripe_head *get_stripe(raid5_conf_t *conf, unsigned long sector, int size)
+ {
+ struct stripe_head *sh;
+
+ PRINTK(("get_stripe, sector %lu\n", sector));
+- sh = find_stripe(raid_conf, sector, size);
++ sh = find_stripe(conf, sector, size);
+ if (sh)
+ wait_on_stripe(sh);
+ else
+- sh = kmalloc_stripe(raid_conf, sector, size);
++ sh = kmalloc_stripe(conf, sector, size);
+ return sh;
+ }
+
+@@ -523,7 +529,7 @@
+ bh->b_end_io(bh, uptodate);
+ if (!uptodate)
+ printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for "
+- "block %lu\n", kdevname(bh->b_dev), bh->b_blocknr);
++ "block %lu\n", partition_name(bh->b_dev), bh->b_blocknr);
+ }
+
+ static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate)
+@@ -537,36 +543,35 @@
+ static void raid5_end_request (struct buffer_head * bh, int uptodate)
+ {
+ struct stripe_head *sh = bh->b_dev_id;
+- struct raid5_data *raid_conf = sh->raid_conf;
+- int disks = raid_conf->raid_disks, i;
++ raid5_conf_t *conf = sh->raid_conf;
++ int disks = conf->raid_disks, i;
+ unsigned long flags;
+
+ PRINTK(("end_request %lu, nr_pending %d\n", sh->sector, sh->nr_pending));
+- save_flags(flags);
+- cli();
++ md_spin_lock_irqsave(&sh->stripe_lock, flags);
+ raid5_mark_buffer_uptodate(bh, uptodate);
+- --sh->nr_pending;
+- if (!sh->nr_pending) {
+- md_wakeup_thread(raid_conf->thread);
+- atomic_inc(&raid_conf->nr_handle);
++ if (atomic_dec_and_test(&sh->nr_pending)) {
++ md_wakeup_thread(conf->thread);
++ atomic_inc(&conf->nr_handle);
+ }
+- if (!uptodate)
++ if (!uptodate) {
+ md_error(bh->b_dev, bh->b_rdev);
+- if (raid_conf->failed_disks) {
++ }
++ if (conf->failed_disks) {
+ for (i = 0; i < disks; i++) {
+- if (raid_conf->disks[i].operational)
++ if (conf->disks[i].operational)
+ continue;
+ if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i])
+ continue;
+- if (bh->b_rdev != raid_conf->disks[i].dev)
++ if (bh->b_rdev != conf->disks[i].dev)
+ continue;
+ set_bit(STRIPE_ERROR, &sh->state);
+ }
+ }
+- restore_flags(flags);
++ md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
+ }
+
+-static int raid5_map (struct md_dev *mddev, kdev_t *rdev,
++static int raid5_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
+ unsigned long *rsector, unsigned long size)
+ {
+ /* No complex mapping used: the core of the work is done in the
+@@ -577,11 +582,10 @@
+
+ static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- struct md_dev *mddev = raid_conf->mddev;
+- int minor = (int) (mddev - md_dev);
++ raid5_conf_t *conf = sh->raid_conf;
++ mddev_t *mddev = conf->mddev;
+ char *b_data;
+- kdev_t dev = MKDEV(MD_MAJOR, minor);
++ kdev_t dev = mddev_to_kdev(mddev);
+ int block = sh->sector / (sh->size >> 9);
+
+ b_data = ((volatile struct buffer_head *) bh)->b_data;
+@@ -589,7 +593,7 @@
+ init_buffer(bh, dev, block, raid5_end_request, sh);
+ ((volatile struct buffer_head *) bh)->b_data = b_data;
+
+- bh->b_rdev = raid_conf->disks[i].dev;
++ bh->b_rdev = conf->disks[i].dev;
+ bh->b_rsector = sh->sector;
+
+ bh->b_state = (1 << BH_Req);
+@@ -597,33 +601,62 @@
+ bh->b_list = BUF_LOCKED;
+ }
+
+-static int raid5_error (struct md_dev *mddev, kdev_t dev)
++static int raid5_error (mddev_t *mddev, kdev_t dev)
+ {
+- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
+- md_superblock_t *sb = mddev->sb;
++ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
++ mdp_super_t *sb = mddev->sb;
+ struct disk_info *disk;
+ int i;
+
+ PRINTK(("raid5_error called\n"));
+- raid_conf->resync_parity = 0;
+- for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
++ conf->resync_parity = 0;
++ for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
+ if (disk->dev == dev && disk->operational) {
+ disk->operational = 0;
+- sb->disks[disk->number].state |= (1 << MD_FAULTY_DEVICE);
+- sb->disks[disk->number].state &= ~(1 << MD_SYNC_DEVICE);
+- sb->disks[disk->number].state &= ~(1 << MD_ACTIVE_DEVICE);
++ mark_disk_faulty(sb->disks+disk->number);
++ mark_disk_nonsync(sb->disks+disk->number);
++ mark_disk_inactive(sb->disks+disk->number);
+ sb->active_disks--;
+ sb->working_disks--;
+ sb->failed_disks++;
+ mddev->sb_dirty = 1;
+- raid_conf->working_disks--;
+- raid_conf->failed_disks++;
+- md_wakeup_thread(raid_conf->thread);
++ conf->working_disks--;
++ conf->failed_disks++;
++ md_wakeup_thread(conf->thread);
+ printk (KERN_ALERT
+- "RAID5: Disk failure on %s, disabling device."
+- "Operation continuing on %d devices\n",
+- kdevname (dev), raid_conf->working_disks);
++ "raid5: Disk failure on %s, disabling device."
++ " Operation continuing on %d devices\n",
++ partition_name (dev), conf->working_disks);
++ return -EIO;
+ }
++ }
++ /*
++ * handle errors in spares (during reconstruction)
++ */
++ if (conf->spare) {
++ disk = conf->spare;
++ if (disk->dev == dev) {
++ printk (KERN_ALERT
++ "raid5: Disk failure on spare %s\n",
++ partition_name (dev));
++ if (!conf->spare->operational) {
++ MD_BUG();
++ return -EIO;
++ }
++ disk->operational = 0;
++ disk->write_only = 0;
++ conf->spare = NULL;
++ mark_disk_faulty(sb->disks+disk->number);
++ mark_disk_nonsync(sb->disks+disk->number);
++ mark_disk_inactive(sb->disks+disk->number);
++ sb->spare_disks--;
++ sb->working_disks--;
++ sb->failed_disks++;
++
++ return -EIO;
++ }
++ }
++ MD_BUG();
+ return 0;
+ }
+
+@@ -634,12 +667,12 @@
+ static inline unsigned long
+ raid5_compute_sector (int r_sector, unsigned int raid_disks, unsigned int data_disks,
+ unsigned int * dd_idx, unsigned int * pd_idx,
+- struct raid5_data *raid_conf)
++ raid5_conf_t *conf)
+ {
+ unsigned int stripe;
+ int chunk_number, chunk_offset;
+ unsigned long new_sector;
+- int sectors_per_chunk = raid_conf->chunk_size >> 9;
++ int sectors_per_chunk = conf->chunk_size >> 9;
+
+ /* First compute the information on this sector */
+
+@@ -662,9 +695,9 @@
+ /*
+ * Select the parity disk based on the user selected algorithm.
+ */
+- if (raid_conf->level == 4)
++ if (conf->level == 4)
+ *pd_idx = data_disks;
+- else switch (raid_conf->algorithm) {
++ else switch (conf->algorithm) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ *pd_idx = data_disks - stripe % raid_disks;
+ if (*dd_idx >= *pd_idx)
+@@ -684,7 +717,7 @@
+ *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+ break;
+ default:
+- printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
++ printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
+ }
+
+ /*
+@@ -705,16 +738,16 @@
+
+ static unsigned long compute_blocknr(struct stripe_head *sh, int i)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- int raid_disks = raid_conf->raid_disks, data_disks = raid_disks - 1;
++ raid5_conf_t *conf = sh->raid_conf;
++ int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
+ unsigned long new_sector = sh->sector, check;
+- int sectors_per_chunk = raid_conf->chunk_size >> 9;
++ int sectors_per_chunk = conf->chunk_size >> 9;
+ unsigned long stripe = new_sector / sectors_per_chunk;
+ int chunk_offset = new_sector % sectors_per_chunk;
+ int chunk_number, dummy1, dummy2, dd_idx = i;
+ unsigned long r_sector, blocknr;
+
+- switch (raid_conf->algorithm) {
++ switch (conf->algorithm) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ if (i > sh->pd_idx)
+@@ -727,14 +760,14 @@
+ i -= (sh->pd_idx + 1);
+ break;
+ default:
+- printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
++ printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
+ }
+
+ chunk_number = stripe * data_disks + i;
+ r_sector = chunk_number * sectors_per_chunk + chunk_offset;
+ blocknr = r_sector / (sh->size >> 9);
+
+- check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, raid_conf);
++ check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
+ if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
+ printk("compute_blocknr: map not correct\n");
+ return 0;
+@@ -742,36 +775,11 @@
+ return blocknr;
+ }
+
+-#ifdef HAVE_ARCH_XORBLOCK
+-static void xor_block(struct buffer_head *dest, struct buffer_head *source)
+-{
+- __xor_block((char *) dest->b_data, (char *) source->b_data, dest->b_size);
+-}
+-#else
+-static void xor_block(struct buffer_head *dest, struct buffer_head *source)
+-{
+- long lines = dest->b_size / (sizeof (long)) / 8, i;
+- long *destp = (long *) dest->b_data, *sourcep = (long *) source->b_data;
+-
+- for (i = lines; i > 0; i--) {
+- *(destp + 0) ^= *(sourcep + 0);
+- *(destp + 1) ^= *(sourcep + 1);
+- *(destp + 2) ^= *(sourcep + 2);
+- *(destp + 3) ^= *(sourcep + 3);
+- *(destp + 4) ^= *(sourcep + 4);
+- *(destp + 5) ^= *(sourcep + 5);
+- *(destp + 6) ^= *(sourcep + 6);
+- *(destp + 7) ^= *(sourcep + 7);
+- destp += 8;
+- sourcep += 8;
+- }
+-}
+-#endif
+-
+ static void compute_block(struct stripe_head *sh, int dd_idx)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- int i, disks = raid_conf->raid_disks;
++ raid5_conf_t *conf = sh->raid_conf;
++ int i, count, disks = conf->raid_disks;
++ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
+
+ PRINTK(("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx));
+
+@@ -780,69 +788,100 @@
+ raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx);
+
+ memset(sh->bh_old[dd_idx]->b_data, 0, sh->size);
++ bh_ptr[0] = sh->bh_old[dd_idx];
++ count = 1;
+ for (i = 0; i < disks; i++) {
+ if (i == dd_idx)
+ continue;
+ if (sh->bh_old[i]) {
+- xor_block(sh->bh_old[dd_idx], sh->bh_old[i]);
+- continue;
+- } else
++ bh_ptr[count++] = sh->bh_old[i];
++ } else {
+ printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
++ }
++ if (count == MAX_XOR_BLOCKS) {
++ xor_block(count, &bh_ptr[0]);
++ count = 1;
++ }
++ }
++ if(count != 1) {
++ xor_block(count, &bh_ptr[0]);
+ }
+ raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1);
+ }
+
+ static void compute_parity(struct stripe_head *sh, int method)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- int i, pd_idx = sh->pd_idx, disks = raid_conf->raid_disks;
++ raid5_conf_t *conf = sh->raid_conf;
++ int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, lowprio, count;
++ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
+
+ PRINTK(("compute_parity, stripe %lu, method %d\n", sh->sector, method));
++ lowprio = 1;
+ for (i = 0; i < disks; i++) {
+ if (i == pd_idx || !sh->bh_new[i])
+ continue;
+ if (!sh->bh_copy[i])
+ sh->bh_copy[i] = raid5_kmalloc_buffer(sh, sh->size);
+ raid5_build_block(sh, sh->bh_copy[i], i);
++ if (!buffer_lowprio(sh->bh_new[i]))
++ lowprio = 0;
++ else
++ mark_buffer_lowprio(sh->bh_copy[i]);
+ mark_buffer_clean(sh->bh_new[i]);
+ memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size);
+ }
+ if (sh->bh_copy[pd_idx] == NULL)
+ sh->bh_copy[pd_idx] = raid5_kmalloc_buffer(sh, sh->size);
+ raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx);
++ if (lowprio)
++ mark_buffer_lowprio(sh->bh_copy[pd_idx]);
+
+ if (method == RECONSTRUCT_WRITE) {
+ memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size);
++ bh_ptr[0] = sh->bh_copy[pd_idx];
++ count = 1;
+ for (i = 0; i < disks; i++) {
+ if (i == sh->pd_idx)
+ continue;
+ if (sh->bh_new[i]) {
+- xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
+- continue;
++ bh_ptr[count++] = sh->bh_copy[i];
++ } else if (sh->bh_old[i]) {
++ bh_ptr[count++] = sh->bh_old[i];
+ }
+- if (sh->bh_old[i]) {
+- xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
+- continue;
++ if (count == MAX_XOR_BLOCKS) {
++ xor_block(count, &bh_ptr[0]);
++ count = 1;
+ }
+ }
++ if (count != 1) {
++ xor_block(count, &bh_ptr[0]);
++ }
+ } else if (method == READ_MODIFY_WRITE) {
+ memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size);
++ bh_ptr[0] = sh->bh_copy[pd_idx];
++ count = 1;
+ for (i = 0; i < disks; i++) {
+ if (i == sh->pd_idx)
+ continue;
+ if (sh->bh_new[i] && sh->bh_old[i]) {
+- xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
+- xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
+- continue;
++ bh_ptr[count++] = sh->bh_copy[i];
++ bh_ptr[count++] = sh->bh_old[i];
++ }
++ if (count >= (MAX_XOR_BLOCKS - 1)) {
++ xor_block(count, &bh_ptr[0]);
++ count = 1;
+ }
+ }
++ if (count != 1) {
++ xor_block(count, &bh_ptr[0]);
++ }
+ }
+ raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1);
+ }
+
+ static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
++ raid5_conf_t *conf = sh->raid_conf;
+ struct buffer_head *bh_req;
+
+ if (sh->bh_new[dd_idx]) {
+@@ -860,19 +899,22 @@
+ if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) {
+ sh->phase = PHASE_BEGIN;
+ sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE;
+- raid_conf->nr_pending_stripes++;
+- atomic_inc(&raid_conf->nr_handle);
++ conf->nr_pending_stripes++;
++ atomic_inc(&conf->nr_handle);
+ }
+ sh->bh_new[dd_idx] = bh;
+ sh->bh_req[dd_idx] = bh_req;
+ sh->cmd_new[dd_idx] = rw;
+ sh->new[dd_idx] = 1;
++
++ if (buffer_lowprio(bh))
++ mark_buffer_lowprio(bh_req);
+ }
+
+ static void complete_stripe(struct stripe_head *sh)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- int disks = raid_conf->raid_disks;
++ raid5_conf_t *conf = sh->raid_conf;
++ int disks = conf->raid_disks;
+ int i, new = 0;
+
+ PRINTK(("complete_stripe %lu\n", sh->sector));
+@@ -909,6 +951,22 @@
+ }
+ }
+
++
++static int is_stripe_lowprio(struct stripe_head *sh, int disks)
++{
++ int i, lowprio = 1;
++
++ for (i = 0; i < disks; i++) {
++ if (sh->bh_new[i])
++ if (!buffer_lowprio(sh->bh_new[i]))
++ lowprio = 0;
++ if (sh->bh_old[i])
++ if (!buffer_lowprio(sh->bh_old[i]))
++ lowprio = 0;
++ }
++ return lowprio;
++}
++
+ /*
+ * handle_stripe() is our main logic routine. Note that:
+ *
+@@ -919,28 +977,27 @@
+ * 2. We should be careful to set sh->nr_pending whenever we sleep,
+ * to prevent re-entry of handle_stripe() for the same sh.
+ *
+- * 3. raid_conf->failed_disks and disk->operational can be changed
++ * 3. conf->failed_disks and disk->operational can be changed
+ * from an interrupt. This complicates things a bit, but it allows
+ * us to stop issuing requests for a failed drive as soon as possible.
+ */
+ static void handle_stripe(struct stripe_head *sh)
+ {
+- struct raid5_data *raid_conf = sh->raid_conf;
+- struct md_dev *mddev = raid_conf->mddev;
+- int minor = (int) (mddev - md_dev);
++ raid5_conf_t *conf = sh->raid_conf;
++ mddev_t *mddev = conf->mddev;
+ struct buffer_head *bh;
+- int disks = raid_conf->raid_disks;
+- int i, nr = 0, nr_read = 0, nr_write = 0;
++ int disks = conf->raid_disks;
++ int i, nr = 0, nr_read = 0, nr_write = 0, lowprio;
+ int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0, parity = 0;
+ int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
+ int reading = 0, nr_writing = 0;
+ int method1 = INT_MAX, method2 = INT_MAX;
+ int block;
+ unsigned long flags;
+- int operational[MD_SB_DISKS], failed_disks = raid_conf->failed_disks;
++ int operational[MD_SB_DISKS], failed_disks = conf->failed_disks;
+
+ PRINTK(("handle_stripe(), stripe %lu\n", sh->sector));
+- if (sh->nr_pending) {
++ if (md_atomic_read(&sh->nr_pending)) {
+ printk("handle_stripe(), stripe %lu, io still pending\n", sh->sector);
+ return;
+ }
+@@ -949,9 +1006,9 @@
+ return;
+ }
+
+- atomic_dec(&raid_conf->nr_handle);
++ atomic_dec(&conf->nr_handle);
+
+- if (test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
++ if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
+ printk("raid5: restarting stripe %lu\n", sh->sector);
+ sh->phase = PHASE_BEGIN;
+ }
+@@ -969,11 +1026,11 @@
+ save_flags(flags);
+ cli();
+ for (i = 0; i < disks; i++) {
+- operational[i] = raid_conf->disks[i].operational;
+- if (i == sh->pd_idx && raid_conf->resync_parity)
++ operational[i] = conf->disks[i].operational;
++ if (i == sh->pd_idx && conf->resync_parity)
+ operational[i] = 0;
+ }
+- failed_disks = raid_conf->failed_disks;
++ failed_disks = conf->failed_disks;
+ restore_flags(flags);
+
+ if (failed_disks > 1) {
+@@ -1017,7 +1074,7 @@
+ }
+
+ if (nr_write && nr_read)
+- printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
++ printk("raid5: bug, nr_write ==`%d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
+
+ if (nr_write) {
+ /*
+@@ -1030,7 +1087,7 @@
+ if (sh->bh_new[i])
+ continue;
+ block = (int) compute_blocknr(sh, i);
+- bh = find_buffer(MKDEV(MD_MAJOR, minor), block, sh->size);
++ bh = find_buffer(mddev_to_kdev(mddev), block, sh->size);
+ if (bh && bh->b_count == 0 && buffer_dirty(bh) && !buffer_locked(bh)) {
+ PRINTK(("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block));
+ add_stripe_bh(sh, bh, i, WRITE);
+@@ -1064,21 +1121,22 @@
+
+ if (!method1 || !method2) {
+ lock_stripe(sh);
+- sh->nr_pending++;
++ lowprio = is_stripe_lowprio(sh, disks);
++ atomic_inc(&sh->nr_pending);
+ sh->phase = PHASE_WRITE;
+ compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
+ for (i = 0; i < disks; i++) {
+- if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
++ if (!operational[i] && !conf->spare && !conf->resync_parity)
+ continue;
+ if (i == sh->pd_idx || sh->bh_new[i])
+ nr_writing++;
+ }
+
+- sh->nr_pending = nr_writing;
+- PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, sh->nr_pending));
++ md_atomic_set(&sh->nr_pending, nr_writing);
++ PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
+
+ for (i = 0; i < disks; i++) {
+- if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
++ if (!operational[i] && !conf->spare && !conf->resync_parity)
+ continue;
+ bh = sh->bh_copy[i];
+ if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL)))
+@@ -1089,18 +1147,30 @@
+ bh->b_state |= (1<<BH_Dirty);
+ PRINTK(("making request for buffer %d\n", i));
+ clear_bit(BH_Lock, &bh->b_state);
+- if (!operational[i] && !raid_conf->resync_parity) {
+- bh->b_rdev = raid_conf->spare->dev;
+- make_request(MAJOR(raid_conf->spare->dev), WRITE, bh);
+- } else
+- make_request(MAJOR(raid_conf->disks[i].dev), WRITE, bh);
++ if (!operational[i] && !conf->resync_parity) {
++ bh->b_rdev = conf->spare->dev;
++ make_request(MAJOR(conf->spare->dev), WRITE, bh);
++ } else {
++#if 0
++ make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
++#else
++ if (!lowprio || (i==sh->pd_idx))
++ make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
++ else {
++ mark_buffer_clean(bh);
++ raid5_end_request(bh,1);
++ sh->new[i] = 0;
++ }
++#endif
++ }
+ }
+ }
+ return;
+ }
+
+ lock_stripe(sh);
+- sh->nr_pending++;
++ lowprio = is_stripe_lowprio(sh, disks);
++ atomic_inc(&sh->nr_pending);
+ if (method1 < method2) {
+ sh->write_method = RECONSTRUCT_WRITE;
+ for (i = 0; i < disks; i++) {
+@@ -1110,6 +1180,8 @@
+ continue;
+ sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
+ raid5_build_block(sh, sh->bh_old[i], i);
++ if (lowprio)
++ mark_buffer_lowprio(sh->bh_old[i]);
+ reading++;
+ }
+ } else {
+@@ -1121,19 +1193,21 @@
+ continue;
+ sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
+ raid5_build_block(sh, sh->bh_old[i], i);
++ if (lowprio)
++ mark_buffer_lowprio(sh->bh_old[i]);
+ reading++;
+ }
+ }
+ sh->phase = PHASE_READ_OLD;
+- sh->nr_pending = reading;
+- PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, sh->nr_pending));
++ md_atomic_set(&sh->nr_pending, reading);
++ PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)));
+ for (i = 0; i < disks; i++) {
+ if (!sh->bh_old[i])
+ continue;
+ if (buffer_uptodate(sh->bh_old[i]))
+ continue;
+ clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
+- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
++ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
+ }
+ } else {
+ /*
+@@ -1141,7 +1215,8 @@
+ */
+ method1 = nr_read - nr_cache_overwrite;
+ lock_stripe(sh);
+- sh->nr_pending++;
++ lowprio = is_stripe_lowprio(sh,disks);
++ atomic_inc(&sh->nr_pending);
+
+ PRINTK(("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1));
+ if (!method1 || (method1 == 1 && nr_cache == disks - 1)) {
+@@ -1149,18 +1224,22 @@
+ for (i = 0; i < disks; i++) {
+ if (!sh->bh_new[i])
+ continue;
+- if (!sh->bh_old[i])
++ if (!sh->bh_old[i]) {
+ compute_block(sh, i);
++ if (lowprio)
++ mark_buffer_lowprio
++ (sh->bh_old[i]);
++ }
+ memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
+ }
+- sh->nr_pending--;
++ atomic_dec(&sh->nr_pending);
+ complete_stripe(sh);
+ return;
+ }
+ if (nr_failed_overwrite) {
+ sh->phase = PHASE_READ_OLD;
+- sh->nr_pending = (disks - 1) - nr_cache;
+- PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, sh->nr_pending));
++ md_atomic_set(&sh->nr_pending, (disks - 1) - nr_cache);
++ PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
+ for (i = 0; i < disks; i++) {
+ if (sh->bh_old[i])
+ continue;
+@@ -1168,13 +1247,16 @@
+ continue;
+ sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
+ raid5_build_block(sh, sh->bh_old[i], i);
++ if (lowprio)
++ mark_buffer_lowprio(sh->bh_old[i]);
+ clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
+- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
++ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
+ }
+ } else {
+ sh->phase = PHASE_READ;
+- sh->nr_pending = nr_read - nr_cache_overwrite;
+- PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, sh->nr_pending));
++ md_atomic_set(&sh->nr_pending,
++ nr_read - nr_cache_overwrite);
++ PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
+ for (i = 0; i < disks; i++) {
+ if (!sh->bh_new[i])
+ continue;
+@@ -1182,16 +1264,16 @@
+ memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
+ continue;
+ }
+- make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_req[i]);
++ make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_req[i]);
+ }
+ }
+ }
+ }
+
+-static int raid5_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
++static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
+ {
+- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
+- const unsigned int raid_disks = raid_conf->raid_disks;
++ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
++ const unsigned int raid_disks = conf->raid_disks;
+ const unsigned int data_disks = raid_disks - 1;
+ unsigned int dd_idx, pd_idx;
+ unsigned long new_sector;
+@@ -1202,15 +1284,15 @@
+ if (rw == WRITEA) rw = WRITE;
+
+ new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks,
+- &dd_idx, &pd_idx, raid_conf);
++ &dd_idx, &pd_idx, conf);
+
+ PRINTK(("raid5_make_request, sector %lu\n", new_sector));
+ repeat:
+- sh = get_stripe(raid_conf, new_sector, bh->b_size);
++ sh = get_stripe(conf, new_sector, bh->b_size);
+ if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) {
+ PRINTK(("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd));
+ lock_stripe(sh);
+- if (!sh->nr_pending)
++ if (!md_atomic_read(&sh->nr_pending))
+ handle_stripe(sh);
+ goto repeat;
+ }
+@@ -1221,24 +1303,24 @@
+ printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector);
+ printk("raid5: bh %p, bh_new %p\n", bh, sh->bh_new[dd_idx]);
+ lock_stripe(sh);
+- md_wakeup_thread(raid_conf->thread);
++ md_wakeup_thread(conf->thread);
+ wait_on_stripe(sh);
+ goto repeat;
+ }
+ add_stripe_bh(sh, bh, dd_idx, rw);
+
+- md_wakeup_thread(raid_conf->thread);
++ md_wakeup_thread(conf->thread);
+ return 0;
+ }
+
+ static void unplug_devices(struct stripe_head *sh)
+ {
+ #if 0
+- struct raid5_data *raid_conf = sh->raid_conf;
++ raid5_conf_t *conf = sh->raid_conf;
+ int i;
+
+- for (i = 0; i < raid_conf->raid_disks; i++)
+- unplug_device(blk_dev + MAJOR(raid_conf->disks[i].dev));
++ for (i = 0; i < conf->raid_disks; i++)
++ unplug_device(blk_dev + MAJOR(conf->disks[i].dev));
+ #endif
+ }
+
+@@ -1252,8 +1334,8 @@
+ static void raid5d (void *data)
+ {
+ struct stripe_head *sh;
+- struct raid5_data *raid_conf = data;
+- struct md_dev *mddev = raid_conf->mddev;
++ raid5_conf_t *conf = data;
++ mddev_t *mddev = conf->mddev;
+ int i, handled = 0, unplug = 0;
+ unsigned long flags;
+
+@@ -1261,47 +1343,47 @@
+
+ if (mddev->sb_dirty) {
+ mddev->sb_dirty = 0;
+- md_update_sb((int) (mddev - md_dev));
++ md_update_sb(mddev);
+ }
+ for (i = 0; i < NR_HASH; i++) {
+ repeat:
+- sh = raid_conf->stripe_hashtbl[i];
++ sh = conf->stripe_hashtbl[i];
+ for (; sh; sh = sh->hash_next) {
+- if (sh->raid_conf != raid_conf)
++ if (sh->raid_conf != conf)
+ continue;
+ if (sh->phase == PHASE_COMPLETE)
+ continue;
+- if (sh->nr_pending)
++ if (md_atomic_read(&sh->nr_pending))
+ continue;
+- if (sh->sector == raid_conf->next_sector) {
+- raid_conf->sector_count += (sh->size >> 9);
+- if (raid_conf->sector_count >= 128)
++ if (sh->sector == conf->next_sector) {
++ conf->sector_count += (sh->size >> 9);
++ if (conf->sector_count >= 128)
+ unplug = 1;
+ } else
+ unplug = 1;
+ if (unplug) {
+- PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, raid_conf->sector_count));
++ PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, conf->sector_count));
+ unplug_devices(sh);
+ unplug = 0;
+- raid_conf->sector_count = 0;
++ conf->sector_count = 0;
+ }
+- raid_conf->next_sector = sh->sector + (sh->size >> 9);
++ conf->next_sector = sh->sector + (sh->size >> 9);
+ handled++;
+ handle_stripe(sh);
+ goto repeat;
+ }
+ }
+- if (raid_conf) {
+- PRINTK(("%d stripes handled, nr_handle %d\n", handled, atomic_read(&raid_conf->nr_handle)));
++ if (conf) {
++ PRINTK(("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle)));
+ save_flags(flags);
+ cli();
+- if (!atomic_read(&raid_conf->nr_handle))
+- clear_bit(THREAD_WAKEUP, &raid_conf->thread->flags);
++ if (!md_atomic_read(&conf->nr_handle))
++ clear_bit(THREAD_WAKEUP, &conf->thread->flags);
++ restore_flags(flags);
+ }
+ PRINTK(("--- raid5d inactive\n"));
+ }
+
+-#if SUPPORT_RECONSTRUCTION
+ /*
+ * Private kernel thread for parity reconstruction after an unclean
+ * shutdown. Reconstruction on spare drives in case of a failed drive
+@@ -1309,44 +1391,64 @@
+ */
+ static void raid5syncd (void *data)
+ {
+- struct raid5_data *raid_conf = data;
+- struct md_dev *mddev = raid_conf->mddev;
++ raid5_conf_t *conf = data;
++ mddev_t *mddev = conf->mddev;
+
+- if (!raid_conf->resync_parity)
++ if (!conf->resync_parity)
++ return;
++ if (conf->resync_parity == 2)
++ return;
++ down(&mddev->recovery_sem);
++ if (md_do_sync(mddev,NULL)) {
++ up(&mddev->recovery_sem);
++ printk("raid5: resync aborted!\n");
+ return;
+- md_do_sync(mddev);
+- raid_conf->resync_parity = 0;
++ }
++ conf->resync_parity = 0;
++ up(&mddev->recovery_sem);
++ printk("raid5: resync finished.\n");
+ }
+-#endif /* SUPPORT_RECONSTRUCTION */
+
+-static int __check_consistency (struct md_dev *mddev, int row)
++static int __check_consistency (mddev_t *mddev, int row)
+ {
+- struct raid5_data *raid_conf = mddev->private;
++ raid5_conf_t *conf = mddev->private;
+ kdev_t dev;
+ struct buffer_head *bh[MD_SB_DISKS], tmp;
+- int i, rc = 0, nr = 0;
++ int i, rc = 0, nr = 0, count;
++ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
+
+- if (raid_conf->working_disks != raid_conf->raid_disks)
++ if (conf->working_disks != conf->raid_disks)
+ return 0;
+ tmp.b_size = 4096;
+ if ((tmp.b_data = (char *) get_free_page(GFP_KERNEL)) == NULL)
+ return 0;
++ md_clear_page((unsigned long)tmp.b_data);
+ memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
+- for (i = 0; i < raid_conf->raid_disks; i++) {
+- dev = raid_conf->disks[i].dev;
++ for (i = 0; i < conf->raid_disks; i++) {
++ dev = conf->disks[i].dev;
+ set_blocksize(dev, 4096);
+ if ((bh[i] = bread(dev, row / 4, 4096)) == NULL)
+ break;
+ nr++;
+ }
+- if (nr == raid_conf->raid_disks) {
+- for (i = 1; i < nr; i++)
+- xor_block(&tmp, bh[i]);
++ if (nr == conf->raid_disks) {
++ bh_ptr[0] = &tmp;
++ count = 1;
++ for (i = 1; i < nr; i++) {
++ bh_ptr[count++] = bh[i];
++ if (count == MAX_XOR_BLOCKS) {
++ xor_block(count, &bh_ptr[0]);
++ count = 1;
++ }
++ }
++ if (count != 1) {
++ xor_block(count, &bh_ptr[0]);
++ }
+ if (memcmp(tmp.b_data, bh[0]->b_data, 4096))
+ rc = 1;
+ }
+- for (i = 0; i < raid_conf->raid_disks; i++) {
+- dev = raid_conf->disks[i].dev;
++ for (i = 0; i < conf->raid_disks; i++) {
++ dev = conf->disks[i].dev;
+ if (bh[i]) {
+ bforget(bh[i]);
+ bh[i] = NULL;
+@@ -1358,285 +1460,607 @@
+ return rc;
+ }
+
+-static int check_consistency (struct md_dev *mddev)
++static int check_consistency (mddev_t *mddev)
+ {
+- int size = mddev->sb->size;
+- int row;
++ if (__check_consistency(mddev, 0))
++/*
++ * We are not checking this currently, as it's legitimate to have
++ * an inconsistent array at creation time.
++ */
++ return 0;
+
+- for (row = 0; row < size; row += size / 8)
+- if (__check_consistency(mddev, row))
+- return 1;
+ return 0;
+ }
+
+-static int raid5_run (int minor, struct md_dev *mddev)
++static int raid5_run (mddev_t *mddev)
+ {
+- struct raid5_data *raid_conf;
++ raid5_conf_t *conf;
+ int i, j, raid_disk, memory;
+- md_superblock_t *sb = mddev->sb;
+- md_descriptor_t *descriptor;
+- struct real_dev *realdev;
++ mdp_super_t *sb = mddev->sb;
++ mdp_disk_t *desc;
++ mdk_rdev_t *rdev;
++ struct disk_info *disk;
++ struct md_list_head *tmp;
++ int start_recovery = 0;
+
+ MOD_INC_USE_COUNT;
+
+ if (sb->level != 5 && sb->level != 4) {
+- printk("raid5: %s: raid level not set to 4/5 (%d)\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
++ printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
+ MOD_DEC_USE_COUNT;
+ return -EIO;
+ }
+
+- mddev->private = kmalloc (sizeof (struct raid5_data), GFP_KERNEL);
+- if ((raid_conf = mddev->private) == NULL)
++ mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
++ if ((conf = mddev->private) == NULL)
+ goto abort;
+- memset (raid_conf, 0, sizeof (*raid_conf));
+- raid_conf->mddev = mddev;
++ memset (conf, 0, sizeof (*conf));
++ conf->mddev = mddev;
+
+- if ((raid_conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
++ if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
+ goto abort;
+- memset(raid_conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
++ memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
+
+- init_waitqueue(&raid_conf->wait_for_stripe);
+- PRINTK(("raid5_run(%d) called.\n", minor));
+-
+- for (i = 0; i < mddev->nb_dev; i++) {
+- realdev = &mddev->devices[i];
+- if (!realdev->sb) {
+- printk(KERN_ERR "raid5: disabled device %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
+- continue;
+- }
++ init_waitqueue(&conf->wait_for_stripe);
++ PRINTK(("raid5_run(md%d) called.\n", mdidx(mddev)));
+
++ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * This is important -- we are using the descriptor on
+ * the disk only to get a pointer to the descriptor on
+ * the main superblock, which might be more recent.
+ */
+- descriptor = &sb->disks[realdev->sb->descriptor.number];
+- if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
+- printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", kdevname(realdev->dev));
++ desc = sb->disks + rdev->desc_nr;
++ raid_disk = desc->raid_disk;
++ disk = conf->disks + raid_disk;
++
++ if (disk_faulty(desc)) {
++ printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
++ if (!rdev->faulty) {
++ MD_BUG();
++ goto abort;
++ }
++ disk->number = desc->number;
++ disk->raid_disk = raid_disk;
++ disk->dev = rdev->dev;
++
++ disk->operational = 0;
++ disk->write_only = 0;
++ disk->spare = 0;
++ disk->used_slot = 1;
+ continue;
+ }
+- if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
+- if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
+- printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", kdevname(realdev->dev));
+- continue;
++ if (disk_active(desc)) {
++ if (!disk_sync(desc)) {
++ printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
++ MD_BUG();
++ goto abort;
+ }
+- raid_disk = descriptor->raid_disk;
+- if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
+- printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", kdevname(realdev->dev));
++ if (raid_disk > sb->raid_disks) {
++ printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
+ continue;
+ }
+- if (raid_conf->disks[raid_disk].operational) {
+- printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", kdevname(realdev->dev), raid_disk);
++ if (disk->operational) {
++ printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
+ continue;
+ }
+- printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", kdevname(realdev->dev), raid_disk);
++ printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
+
+- raid_conf->disks[raid_disk].number = descriptor->number;
+- raid_conf->disks[raid_disk].raid_disk = raid_disk;
+- raid_conf->disks[raid_disk].dev = mddev->devices[i].dev;
+- raid_conf->disks[raid_disk].operational = 1;
++ disk->number = desc->number;
++ disk->raid_disk = raid_disk;
++ disk->dev = rdev->dev;
++ disk->operational = 1;
++ disk->used_slot = 1;
+
+- raid_conf->working_disks++;
++ conf->working_disks++;
+ } else {
+ /*
+ * Must be a spare disk ..
+ */
+- printk(KERN_INFO "raid5: spare disk %s\n", kdevname(realdev->dev));
+- raid_disk = descriptor->raid_disk;
+- raid_conf->disks[raid_disk].number = descriptor->number;
+- raid_conf->disks[raid_disk].raid_disk = raid_disk;
+- raid_conf->disks[raid_disk].dev = mddev->devices [i].dev;
+-
+- raid_conf->disks[raid_disk].operational = 0;
+- raid_conf->disks[raid_disk].write_only = 0;
+- raid_conf->disks[raid_disk].spare = 1;
+- }
+- }
+- raid_conf->raid_disks = sb->raid_disks;
+- raid_conf->failed_disks = raid_conf->raid_disks - raid_conf->working_disks;
+- raid_conf->mddev = mddev;
+- raid_conf->chunk_size = sb->chunk_size;
+- raid_conf->level = sb->level;
+- raid_conf->algorithm = sb->parity_algorithm;
+- raid_conf->max_nr_stripes = NR_STRIPES;
++ printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
++ disk->number = desc->number;
++ disk->raid_disk = raid_disk;
++ disk->dev = rdev->dev;
+
+- if (raid_conf->working_disks != sb->raid_disks && sb->state != (1 << MD_SB_CLEAN)) {
+- printk(KERN_ALERT "raid5: raid set %s not clean and not all disks are operational -- run ckraid\n", kdevname(MKDEV(MD_MAJOR, minor)));
+- goto abort;
++ disk->operational = 0;
++ disk->write_only = 0;
++ disk->spare = 1;
++ disk->used_slot = 1;
++ }
+ }
+- if (!raid_conf->chunk_size || raid_conf->chunk_size % 4) {
+- printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", raid_conf->chunk_size, kdevname(MKDEV(MD_MAJOR, minor)));
++
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ desc = sb->disks + i;
++ raid_disk = desc->raid_disk;
++ disk = conf->disks + raid_disk;
++
++ if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
++ !conf->disks[raid_disk].used_slot) {
++
++ disk->number = desc->number;
++ disk->raid_disk = raid_disk;
++ disk->dev = MKDEV(0,0);
++
++ disk->operational = 0;
++ disk->write_only = 0;
++ disk->spare = 0;
++ disk->used_slot = 1;
++ }
++ }
++
++ conf->raid_disks = sb->raid_disks;
++ /*
++ * 0 for a fully functional array, 1 for a degraded array.
++ */
++ conf->failed_disks = conf->raid_disks - conf->working_disks;
++ conf->mddev = mddev;
++ conf->chunk_size = sb->chunk_size;
++ conf->level = sb->level;
++ conf->algorithm = sb->layout;
++ conf->max_nr_stripes = NR_STRIPES;
++
++#if 0
++ for (i = 0; i < conf->raid_disks; i++) {
++ if (!conf->disks[i].used_slot) {
++ MD_BUG();
++ goto abort;
++ }
++ }
++#endif
++ if (!conf->chunk_size || conf->chunk_size % 4) {
++ printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
+ goto abort;
+ }
+- if (raid_conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
+- printk(KERN_ERR "raid5: unsupported parity algorithm %d for %s\n", raid_conf->algorithm, kdevname(MKDEV(MD_MAJOR, minor)));
++ if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
++ printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
+ goto abort;
+ }
+- if (raid_conf->failed_disks > 1) {
+- printk(KERN_ERR "raid5: not enough operational devices for %s (%d/%d failed)\n", kdevname(MKDEV(MD_MAJOR, minor)), raid_conf->failed_disks, raid_conf->raid_disks);
++ if (conf->failed_disks > 1) {
++ printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
+ goto abort;
+ }
+
+- if ((sb->state & (1 << MD_SB_CLEAN)) && check_consistency(mddev)) {
+- printk(KERN_ERR "raid5: detected raid-5 xor inconsistenty -- run ckraid\n");
+- sb->state |= 1 << MD_SB_ERRORS;
+- goto abort;
++ if (conf->working_disks != sb->raid_disks) {
++ printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
++ start_recovery = 1;
+ }
+
+- if ((raid_conf->thread = md_register_thread(raid5d, raid_conf)) == NULL) {
+- printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
+- goto abort;
++ if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) &&
++ check_consistency(mddev)) {
++ printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n");
++ sb->state &= ~(1 << MD_SB_CLEAN);
+ }
+
+-#if SUPPORT_RECONSTRUCTION
+- if ((raid_conf->resync_thread = md_register_thread(raid5syncd, raid_conf)) == NULL) {
+- printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
+- goto abort;
++ {
++ const char * name = "raid5d";
++
++ conf->thread = md_register_thread(raid5d, conf, name);
++ if (!conf->thread) {
++ printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
++ goto abort;
++ }
+ }
+-#endif /* SUPPORT_RECONSTRUCTION */
+
+- memory = raid_conf->max_nr_stripes * (sizeof(struct stripe_head) +
+- raid_conf->raid_disks * (sizeof(struct buffer_head) +
++ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
++ conf->raid_disks * (sizeof(struct buffer_head) +
+ 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
+- if (grow_stripes(raid_conf, raid_conf->max_nr_stripes, GFP_KERNEL)) {
++ if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
+ printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
+- shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
++ shrink_stripes(conf, conf->max_nr_stripes);
+ goto abort;
+ } else
+- printk(KERN_INFO "raid5: allocated %dkB for %s\n", memory, kdevname(MKDEV(MD_MAJOR, minor)));
++ printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
+
+ /*
+ * Regenerate the "device is in sync with the raid set" bit for
+ * each device.
+ */
+- for (i = 0; i < sb->nr_disks ; i++) {
+- sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
++ for (i = 0; i < MD_SB_DISKS ; i++) {
++ mark_disk_nonsync(sb->disks + i);
+ for (j = 0; j < sb->raid_disks; j++) {
+- if (!raid_conf->disks[j].operational)
++ if (!conf->disks[j].operational)
+ continue;
+- if (sb->disks[i].number == raid_conf->disks[j].number)
+- sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
++ if (sb->disks[i].number == conf->disks[j].number)
++ mark_disk_sync(sb->disks + i);
+ }
+ }
+- sb->active_disks = raid_conf->working_disks;
++ sb->active_disks = conf->working_disks;
+
+ if (sb->active_disks == sb->raid_disks)
+- printk("raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
++ printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
+ else
+- printk(KERN_ALERT "raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
++ printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
++
++ if (!start_recovery && ((sb->state & (1 << MD_SB_CLEAN))==0)) {
++ const char * name = "raid5syncd";
++
++ conf->resync_thread = md_register_thread(raid5syncd, conf,name);
++ if (!conf->resync_thread) {
++ printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
++ goto abort;
++ }
+
+- if ((sb->state & (1 << MD_SB_CLEAN)) == 0) {
+- printk("raid5: raid set %s not clean; re-constructing parity\n", kdevname(MKDEV(MD_MAJOR, minor)));
+- raid_conf->resync_parity = 1;
+-#if SUPPORT_RECONSTRUCTION
+- md_wakeup_thread(raid_conf->resync_thread);
+-#endif /* SUPPORT_RECONSTRUCTION */
++ printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
++ conf->resync_parity = 1;
++ md_wakeup_thread(conf->resync_thread);
+ }
+
++ print_raid5_conf(conf);
++ if (start_recovery)
++ md_recover_arrays();
++ print_raid5_conf(conf);
++
+ /* Ok, everything is just fine now */
+ return (0);
+ abort:
+- if (raid_conf) {
+- if (raid_conf->stripe_hashtbl)
+- free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
+- kfree(raid_conf);
++ if (conf) {
++ print_raid5_conf(conf);
++ if (conf->stripe_hashtbl)
++ free_pages((unsigned long) conf->stripe_hashtbl,
++ HASH_PAGES_ORDER);
++ kfree(conf);
+ }
+ mddev->private = NULL;
+- printk(KERN_ALERT "raid5: failed to run raid set %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
++ printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
+ MOD_DEC_USE_COUNT;
+ return -EIO;
+ }
+
+-static int raid5_stop (int minor, struct md_dev *mddev)
++static int raid5_stop_resync (mddev_t *mddev)
++{
++ raid5_conf_t *conf = mddev_to_conf(mddev);
++ mdk_thread_t *thread = conf->resync_thread;
++
++ if (thread) {
++ if (conf->resync_parity) {
++ conf->resync_parity = 2;
++ md_interrupt_thread(thread);
++ printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
++ return 1;
++ }
++ return 0;
++ }
++ return 0;
++}
++
++static int raid5_restart_resync (mddev_t *mddev)
+ {
+- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
++ raid5_conf_t *conf = mddev_to_conf(mddev);
+
+- shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
+- shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
+- md_unregister_thread(raid_conf->thread);
+-#if SUPPORT_RECONSTRUCTION
+- md_unregister_thread(raid_conf->resync_thread);
+-#endif /* SUPPORT_RECONSTRUCTION */
+- free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
+- kfree(raid_conf);
++ if (conf->resync_parity) {
++ if (!conf->resync_thread) {
++ MD_BUG();
++ return 0;
++ }
++ printk("raid5: waking up raid5resync.\n");
++ conf->resync_parity = 1;
++ md_wakeup_thread(conf->resync_thread);
++ return 1;
++ } else
++ printk("raid5: no restart-resync needed.\n");
++ return 0;
++}
++
++
++static int raid5_stop (mddev_t *mddev)
++{
++ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
++
++ shrink_stripe_cache(conf, conf->max_nr_stripes);
++ shrink_stripes(conf, conf->max_nr_stripes);
++ md_unregister_thread(conf->thread);
++ if (conf->resync_thread)
++ md_unregister_thread(conf->resync_thread);
++ free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
++ kfree(conf);
+ mddev->private = NULL;
+ MOD_DEC_USE_COUNT;
+ return 0;
+ }
+
+-static int raid5_status (char *page, int minor, struct md_dev *mddev)
++static int raid5_status (char *page, mddev_t *mddev)
+ {
+- struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
+- md_superblock_t *sb = mddev->sb;
++ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
++ mdp_super_t *sb = mddev->sb;
+ int sz = 0, i;
+
+- sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->parity_algorithm);
+- sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
+- for (i = 0; i < raid_conf->raid_disks; i++)
+- sz += sprintf (page+sz, "%s", raid_conf->disks[i].operational ? "U" : "_");
++ sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
++ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
++ for (i = 0; i < conf->raid_disks; i++)
++ sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
+ sz += sprintf (page+sz, "]");
+ return sz;
+ }
+
+-static int raid5_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, int state)
++static void print_raid5_conf (raid5_conf_t *conf)
++{
++ int i;
++ struct disk_info *tmp;
++
++ printk("RAID5 conf printout:\n");
++ if (!conf) {
++ printk("(conf==NULL)\n");
++ return;
++ }
++ printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
++ conf->working_disks, conf->failed_disks);
++
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ tmp = conf->disks + i;
++ printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
++ i, tmp->spare,tmp->operational,
++ tmp->number,tmp->raid_disk,tmp->used_slot,
++ partition_name(tmp->dev));
++ }
++}
++
++static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
+ {
+- int i = 0, failed_disk = -1;
+- struct raid5_data *raid_conf = mddev->private;
+- struct disk_info *disk = raid_conf->disks;
++ int err = 0;
++ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
++ raid5_conf_t *conf = mddev->private;
++ struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
+ unsigned long flags;
+- md_superblock_t *sb = mddev->sb;
+- md_descriptor_t *descriptor;
++ mdp_super_t *sb = mddev->sb;
++ mdp_disk_t *failed_desc, *spare_desc, *added_desc;
+
+- for (i = 0; i < MD_SB_DISKS; i++, disk++) {
+- if (disk->spare && disk->number == spare->number)
+- goto found;
+- }
+- return 1;
+-found:
+- for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
+- if (!disk->operational)
+- failed_disk = i;
+- if (failed_disk == -1)
+- return 1;
+ save_flags(flags);
+ cli();
++
++ print_raid5_conf(conf);
++ /*
++ * find the disk ...
++ */
+ switch (state) {
+- case SPARE_WRITE:
+- disk->operational = 1;
+- disk->write_only = 1;
+- raid_conf->spare = disk;
+- break;
+- case SPARE_INACTIVE:
+- disk->operational = 0;
+- disk->write_only = 0;
+- raid_conf->spare = NULL;
+- break;
+- case SPARE_ACTIVE:
+- disk->spare = 0;
+- disk->write_only = 0;
+
+- descriptor = &sb->disks[raid_conf->disks[failed_disk].number];
+- i = spare->raid_disk;
+- disk->raid_disk = spare->raid_disk = descriptor->raid_disk;
+- if (disk->raid_disk != failed_disk)
+- printk("raid5: disk->raid_disk != failed_disk");
+- descriptor->raid_disk = i;
+-
+- raid_conf->spare = NULL;
+- raid_conf->working_disks++;
+- raid_conf->failed_disks--;
+- raid_conf->disks[failed_disk] = *disk;
+- break;
+- default:
+- printk("raid5_mark_spare: bug: state == %d\n", state);
+- restore_flags(flags);
+- return 1;
++ case DISKOP_SPARE_ACTIVE:
++
++ /*
++ * Find the failed disk within the RAID5 configuration ...
++ * (this can only be in the first conf->raid_disks part)
++ */
++ for (i = 0; i < conf->raid_disks; i++) {
++ tmp = conf->disks + i;
++ if ((!tmp->operational && !tmp->spare) ||
++ !tmp->used_slot) {
++ failed_disk = i;
++ break;
++ }
++ }
++ /*
++ * When we activate a spare disk we _must_ have a disk in
++ * the lower (active) part of the array to replace.
++ */
++ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ /* fall through */
++
++ case DISKOP_SPARE_WRITE:
++ case DISKOP_SPARE_INACTIVE:
++
++ /*
++ * Find the spare disk ... (can only be in the 'high'
++ * area of the array)
++ */
++ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
++ tmp = conf->disks + i;
++ if (tmp->spare && tmp->number == (*d)->number) {
++ spare_disk = i;
++ break;
++ }
++ }
++ if (spare_disk == -1) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ break;
++
++ case DISKOP_HOT_REMOVE_DISK:
++
++ for (i = 0; i < MD_SB_DISKS; i++) {
++ tmp = conf->disks + i;
++ if (tmp->used_slot && (tmp->number == (*d)->number)) {
++ if (tmp->operational) {
++ err = -EBUSY;
++ goto abort;
++ }
++ removed_disk = i;
++ break;
++ }
++ }
++ if (removed_disk == -1) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ break;
++
++ case DISKOP_HOT_ADD_DISK:
++
++ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
++ tmp = conf->disks + i;
++ if (!tmp->used_slot) {
++ added_disk = i;
++ break;
++ }
++ }
++ if (added_disk == -1) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ break;
++ }
++
++ switch (state) {
++ /*
++ * Switch the spare disk to write-only mode:
++ */
++ case DISKOP_SPARE_WRITE:
++ if (conf->spare) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ sdisk = conf->disks + spare_disk;
++ sdisk->operational = 1;
++ sdisk->write_only = 1;
++ conf->spare = sdisk;
++ break;
++ /*
++ * Deactivate a spare disk:
++ */
++ case DISKOP_SPARE_INACTIVE:
++ sdisk = conf->disks + spare_disk;
++ sdisk->operational = 0;
++ sdisk->write_only = 0;
++ /*
++ * Was the spare being resynced?
++ */
++ if (conf->spare == sdisk)
++ conf->spare = NULL;
++ break;
++ /*
++ * Activate (mark read-write) the (now sync) spare disk,
++ * which means we switch its 'raid position' (->raid_disk)
++ * with the failed disk. (only the first 'conf->raid_disks'
++ * slots are used for 'real' disks and we must preserve this
++ * property)
++ */
++ case DISKOP_SPARE_ACTIVE:
++ if (!conf->spare) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ sdisk = conf->disks + spare_disk;
++ fdisk = conf->disks + failed_disk;
++
++ spare_desc = &sb->disks[sdisk->number];
++ failed_desc = &sb->disks[fdisk->number];
++
++ if (spare_desc != *d) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ if (spare_desc->raid_disk != sdisk->raid_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ if (sdisk->raid_disk != spare_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ if (failed_desc->raid_disk != fdisk->raid_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ if (fdisk->raid_disk != failed_disk) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ /*
++ * do the switch finally
++ */
++ xchg_values(*spare_desc, *failed_desc);
++ xchg_values(*fdisk, *sdisk);
++
++ /*
++ * (careful, 'failed' and 'spare' are switched from now on)
++ *
++ * we want to preserve linear numbering and we want to
++ * give the proper raid_disk number to the now activated
++ * disk. (this means we switch back these values)
++ */
++
++ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
++ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
++ xchg_values(spare_desc->number, failed_desc->number);
++ xchg_values(sdisk->number, fdisk->number);
++
++ *d = failed_desc;
++
++ if (sdisk->dev == MKDEV(0,0))
++ sdisk->used_slot = 0;
++
++ /*
++ * this really activates the spare.
++ */
++ fdisk->spare = 0;
++ fdisk->write_only = 0;
++
++ /*
++ * if we activate a spare, we definitely replace a
++ * non-operational disk slot in the 'low' area of
++ * the disk array.
++ */
++ conf->failed_disks--;
++ conf->working_disks++;
++ conf->spare = NULL;
++
++ break;
++
++ case DISKOP_HOT_REMOVE_DISK:
++ rdisk = conf->disks + removed_disk;
++
++ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++ rdisk->dev = MKDEV(0,0);
++ rdisk->used_slot = 0;
++
++ break;
++
++ case DISKOP_HOT_ADD_DISK:
++ adisk = conf->disks + added_disk;
++ added_desc = *d;
++
++ if (added_disk != added_desc->number) {
++ MD_BUG();
++ err = 1;
++ goto abort;
++ }
++
++ adisk->number = added_desc->number;
++ adisk->raid_disk = added_desc->raid_disk;
++ adisk->dev = MKDEV(added_desc->major,added_desc->minor);
++
++ adisk->operational = 0;
++ adisk->write_only = 0;
++ adisk->spare = 1;
++ adisk->used_slot = 1;
++
++
++ break;
++
++ default:
++ MD_BUG();
++ err = 1;
++ goto abort;
+ }
++abort:
+ restore_flags(flags);
+- return 0;
++ print_raid5_conf(conf);
++ return err;
+ }
+
+-static struct md_personality raid5_personality=
++static mdk_personality_t raid5_personality=
+ {
+ "raid5",
+ raid5_map,
+@@ -1648,14 +2072,19 @@
+ NULL, /* no ioctls */
+ 0,
+ raid5_error,
+- /* raid5_hot_add_disk, */ NULL,
+- /* raid1_hot_remove_drive */ NULL,
+- raid5_mark_spare
++ raid5_diskop,
++ raid5_stop_resync,
++ raid5_restart_resync
+ };
+
+ int raid5_init (void)
+ {
+- return register_md_personality (RAID5, &raid5_personality);
++ int err;
++
++ err = register_md_personality (RAID5, &raid5_personality);
++ if (err)
++ return err;
++ return 0;
+ }
+
+ #ifdef MODULE
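+A note on the batched xor_block() calling convention that the raid5.c
+changes above switch to: instead of xoring one source buffer into the
+destination per call, callers now collect up to MAX_XOR_BLOCKS buffer
+heads (destination first) and flush them in a single call. A minimal
+sketch of the accumulate-and-flush loop, assuming dest, src[] and
+nr_srcs stand in for the stripe buffers, with MAX_XOR_BLOCKS and
+xor_block() as declared in the new xor.c below:
+
+	struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
+	int i, count = 1;
+
+	bh_ptr[0] = dest;	/* slot 0 is always the destination */
+	for (i = 0; i < nr_srcs; i++) {
+		bh_ptr[count++] = src[i];
+		if (count == MAX_XOR_BLOCKS) {
+			/* xor the collected sources into slot 0 */
+			xor_block(count, &bh_ptr[0]);
+			count = 1;	/* keep dest, restart the batch */
+		}
+	}
+	if (count != 1)		/* flush any partial batch */
+		xor_block(count, &bh_ptr[0]);
+
+The READ_MODIFY_WRITE branch of compute_parity() flushes one slot early
+(count >= MAX_XOR_BLOCKS - 1) because it can append two buffers per
+iteration.
+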
+diff -uNr linux-orig/drivers/block/translucent.c linux/drivers/block/translucent.c
+--- linux-orig/drivers/block/translucent.c Wed Dec 31 17:00:00 1969
++++ linux/drivers/block/translucent.c Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,136 @@
++/*
++ translucent.c : Translucent RAID driver for Linux
++ Copyright (C) 1998 Ingo Molnar
++
++ Translucent mode management functions.
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2, or (at your option)
++ any later version.
++
++ You should have received a copy of the GNU General Public License
++ (for example /usr/src/linux/COPYING); if not, write to the Free
++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++#include <linux/module.h>
++
++#include <linux/raid/md.h>
++#include <linux/malloc.h>
++
++#include <linux/raid/translucent.h>
++
++#define MAJOR_NR MD_MAJOR
++#define MD_DRIVER
++#define MD_PERSONALITY
++
++static int translucent_run (mddev_t *mddev)
++{
++ translucent_conf_t *conf;
++ mdk_rdev_t *rdev;
++ int i;
++
++ MOD_INC_USE_COUNT;
++
++ conf = kmalloc (sizeof (*conf), GFP_KERNEL);
++ if (!conf)
++ goto out;
++ mddev->private = conf;
++
++ if (mddev->nb_dev != 2) {
++ printk("translucent: this mode needs 2 disks, aborting!\n");
++ goto out;
++ }
++
++ if (md_check_ordering(mddev)) {
++ printk("translucent: disks are not ordered, aborting!\n");
++ goto out;
++ }
++
++ ITERATE_RDEV_ORDERED(mddev,rdev,i) {
++ dev_info_t *disk = conf->disks + i;
++
++ disk->dev = rdev->dev;
++ disk->size = rdev->size;
++ }
++
++ return 0;
++
++out:
++ if (conf)
++ kfree(conf);
++
++ MOD_DEC_USE_COUNT;
++ return 1;
++}
++
++static int translucent_stop (mddev_t *mddev)
++{
++ translucent_conf_t *conf = mddev_to_conf(mddev);
++
++ kfree(conf);
++
++ MOD_DEC_USE_COUNT;
++
++ return 0;
++}
++
++
++static int translucent_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
++ unsigned long *rsector, unsigned long size)
++{
++ translucent_conf_t *conf = mddev_to_conf(mddev);
++
++ *rdev = conf->disks[0].dev;
++
++ return 0;
++}
++
++static int translucent_status (char *page, mddev_t *mddev)
++{
++ int sz = 0;
++
++ sz += sprintf(page+sz, " %d%% full", 10);
++ return sz;
++}
++
++
++static mdk_personality_t translucent_personality=
++{
++ "translucent",
++ translucent_map,
++ NULL,
++ NULL,
++ translucent_run,
++ translucent_stop,
++ translucent_status,
++ NULL,
++ 0,
++ NULL,
++ NULL,
++ NULL,
++ NULL
++};
++
++#ifndef MODULE
++
++md__initfunc(void translucent_init (void))
++{
++ register_md_personality (TRANSLUCENT, &translucent_personality);
++}
++
++#else
++
++int init_module (void)
++{
++ return (register_md_personality (TRANSLUCENT, &translucent_personality));
++}
++
++void cleanup_module (void)
++{
++ unregister_md_personality (TRANSLUCENT);
++}
++
++#endif
++
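+Since the mdk_personality_t initializer above is positional (this code
+base predates C99 designated initializers), the NULL slots are easy to
+misread. Below is an annotated copy of the translucent table; the field
+names are inferred from the way raid5_personality fills the same slots
+earlier in this patch, so treat them as an assumption rather than a
+quote from the md headers:
+
+	static mdk_personality_t translucent_personality =
+	{
+		"translucent",		/* name */
+		translucent_map,	/* map */
+		NULL,			/* make_request */
+		NULL,			/* end_request */
+		translucent_run,	/* run */
+		translucent_stop,	/* stop */
+		translucent_status,	/* status */
+		NULL,			/* ioctl */
+		0,			/* max_invalid_dev */
+		NULL,			/* error_handler */
+		NULL,			/* diskop */
+		NULL,			/* stop_resync */
+		NULL			/* restart_resync */
+	};
+
+Only map, run, stop and status are implemented, which matches the
+placeholder status of this personality: translucent_map() currently
+routes every request to disks[0] and never touches the second disk.
+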
+diff -uNr linux-orig/drivers/block/xor.c linux/drivers/block/xor.c
+--- linux-orig/drivers/block/xor.c Wed Dec 31 17:00:00 1969
++++ linux/drivers/block/xor.c Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,1894 @@
++/*
++ * xor.c : Multiple Devices driver for Linux
++ *
++ * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek
++ *
++ *
++ * optimized RAID-5 checksumming functions.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2, or (at your option)
++ * any later version.
++ *
++ * You should have received a copy of the GNU General Public License
++ * (for example /usr/src/linux/COPYING); if not, write to the Free
++ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++ */
++#include <linux/module.h>
++#include <linux/raid/md.h>
++#ifdef __sparc_v9__
++#include <asm/head.h>
++#include <asm/asi.h>
++#include <asm/visasm.h>
++#endif
++
++/*
++ * we use the 'XOR function template' to register multiple xor
++ * functions at runtime. The kernel measures their speed at bootup
++ * and decides which one to use. (compile-time registration is
++ * not enough, as certain CPU features like MMX can only be
++ * detected at runtime)
++ *
++ * this architecture makes it pretty easy to add new routines
++ * that are faster on certain CPUs, without killing other CPU's
++ * 'native' routine. Although the current routines are belived
++ * to be the physically fastest ones on all CPUs tested, but
++ * feel free to prove me wrong and add yet another routine =B-)
++ * --mingo
++ */
++
++#define MAX_XOR_BLOCKS 5
++
++#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr)
++
++typedef void (*xor_block_t) XOR_ARGS;
++xor_block_t xor_block = NULL;
++
++#ifndef __sparc_v9__
++
++struct xor_block_template;
++
++struct xor_block_template {
++ char * name;
++ xor_block_t xor_block;
++ int speed;
++ struct xor_block_template * next;
++};
++
++struct xor_block_template * xor_functions = NULL;
++
++#define XORBLOCK_TEMPLATE(x) \
++static void xor_block_##x XOR_ARGS; \
++static struct xor_block_template t_xor_block_##x = \
++ { #x, xor_block_##x, 0, NULL }; \
++static void xor_block_##x XOR_ARGS
++
++#ifdef __i386__
++
++#ifdef CONFIG_X86_XMM
++/*
++ * Cache avoiding checksumming functions utilizing KNI instructions
++ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
++ */
++
++XORBLOCK_TEMPLATE(pIII_kni)
++{
++ char xmm_save[16*4];
++ int cr0;
++ int lines = (bh_ptr[0]->b_size>>8);
++
++ __asm__ __volatile__ (
++ "movl %%cr0,%0 ;\n\t"
++ "clts ;\n\t"
++ "movups %%xmm0,(%1) ;\n\t"
++ "movups %%xmm1,0x10(%1) ;\n\t"
++ "movups %%xmm2,0x20(%1) ;\n\t"
++ "movups %%xmm3,0x30(%1) ;\n\t"
++ : "=r" (cr0)
++ : "r" (xmm_save)
++ : "memory" );
++
++#define OFFS(x) "8*("#x"*2)"
++#define PF0(x) \
++ " prefetcht0 "OFFS(x)"(%1) ;\n"
++#define LD(x,y) \
++ " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
++#define ST(x,y) \
++ " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
++#define PF1(x) \
++ " prefetchnta "OFFS(x)"(%2) ;\n"
++#define PF2(x) \
++ " prefetchnta "OFFS(x)"(%3) ;\n"
++#define PF3(x) \
++ " prefetchnta "OFFS(x)"(%4) ;\n"
++#define PF4(x) \
++ " prefetchnta "OFFS(x)"(%5) ;\n"
++#define PF5(x) \
++ " prefetchnta "OFFS(x)"(%6) ;\n"
++#define XO1(x,y) \
++ " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
++#define XO2(x,y) \
++ " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
++#define XO3(x,y) \
++ " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
++#define XO4(x,y) \
++ " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
++#define XO5(x,y) \
++ " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
++
++ switch(count) {
++ case 2:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ LD(i,0) \
++ LD(i+1,1) \
++ PF1(i) \
++ PF1(i+2) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ PF0(i+4) \
++ PF0(i+6) \
++ XO1(i,0) \
++ XO1(i+1,1) \
++ XO1(i+2,2) \
++ XO1(i+3,3) \
++ ST(i,0) \
++ ST(i+1,1) \
++ ST(i+2,2) \
++ ST(i+3,3) \
++
++
++ PF0(0)
++ PF0(2)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $256, %1 ;\n"
++ " addl $256, %2 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data)
++ : "memory" );
++ break;
++ case 3:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ PF1(i) \
++ PF1(i+2) \
++ LD(i,0) \
++ LD(i+1,1) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ PF2(i) \
++ PF2(i+2) \
++ PF0(i+4) \
++ PF0(i+6) \
++ XO1(i,0) \
++ XO1(i+1,1) \
++ XO1(i+2,2) \
++ XO1(i+3,3) \
++ XO2(i,0) \
++ XO2(i+1,1) \
++ XO2(i+2,2) \
++ XO2(i+3,3) \
++ ST(i,0) \
++ ST(i+1,1) \
++ ST(i+2,2) \
++ ST(i+3,3) \
++
++
++ PF0(0)
++ PF0(2)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $256, %1 ;\n"
++ " addl $256, %2 ;\n"
++ " addl $256, %3 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data)
++ : "memory" );
++ break;
++ case 4:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ PF1(i) \
++ PF1(i+2) \
++ LD(i,0) \
++ LD(i+1,1) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ PF2(i) \
++ PF2(i+2) \
++ XO1(i,0) \
++ XO1(i+1,1) \
++ XO1(i+2,2) \
++ XO1(i+3,3) \
++ PF3(i) \
++ PF3(i+2) \
++ PF0(i+4) \
++ PF0(i+6) \
++ XO2(i,0) \
++ XO2(i+1,1) \
++ XO2(i+2,2) \
++ XO2(i+3,3) \
++ XO3(i,0) \
++ XO3(i+1,1) \
++ XO3(i+2,2) \
++ XO3(i+3,3) \
++ ST(i,0) \
++ ST(i+1,1) \
++ ST(i+2,2) \
++ ST(i+3,3) \
++
++
++ PF0(0)
++ PF0(2)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $256, %1 ;\n"
++ " addl $256, %2 ;\n"
++ " addl $256, %3 ;\n"
++ " addl $256, %4 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data),
++ "r" (bh_ptr[3]->b_data)
++ : "memory" );
++ break;
++ case 5:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ PF1(i) \
++ PF1(i+2) \
++ LD(i,0) \
++ LD(i+1,1) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ PF2(i) \
++ PF2(i+2) \
++ XO1(i,0) \
++ XO1(i+1,1) \
++ XO1(i+2,2) \
++ XO1(i+3,3) \
++ PF3(i) \
++ PF3(i+2) \
++ XO2(i,0) \
++ XO2(i+1,1) \
++ XO2(i+2,2) \
++ XO2(i+3,3) \
++ PF4(i) \
++ PF4(i+2) \
++ PF0(i+4) \
++ PF0(i+6) \
++ XO3(i,0) \
++ XO3(i+1,1) \
++ XO3(i+2,2) \
++ XO3(i+3,3) \
++ XO4(i,0) \
++ XO4(i+1,1) \
++ XO4(i+2,2) \
++ XO4(i+3,3) \
++ ST(i,0) \
++ ST(i+1,1) \
++ ST(i+2,2) \
++ ST(i+3,3) \
++
++
++ PF0(0)
++ PF0(2)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $256, %1 ;\n"
++ " addl $256, %2 ;\n"
++ " addl $256, %3 ;\n"
++ " addl $256, %4 ;\n"
++ " addl $256, %5 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data),
++ "r" (bh_ptr[3]->b_data),
++ "r" (bh_ptr[4]->b_data)
++ : "memory");
++ break;
++ }
++
++ __asm__ __volatile__ (
++ "sfence ;\n\t"
++ "movups (%1),%%xmm0 ;\n\t"
++ "movups 0x10(%1),%%xmm1 ;\n\t"
++ "movups 0x20(%1),%%xmm2 ;\n\t"
++ "movups 0x30(%1),%%xmm3 ;\n\t"
++ "movl %0,%%cr0 ;\n\t"
++ :
++ : "r" (cr0), "r" (xmm_save)
++ : "memory" );
++}
++
++#undef OFFS
++#undef LD
++#undef ST
++#undef PF0
++#undef PF1
++#undef PF2
++#undef PF3
++#undef PF4
++#undef PF5
++#undef XO1
++#undef XO2
++#undef XO3
++#undef XO4
++#undef XO5
++#undef BLOCK
++
++#endif /* CONFIG_X86_XMM */
++
++/*
++ * high-speed RAID5 checksumming functions utilizing MMX instructions
++ * Copyright (C) 1998 Ingo Molnar
++ */
++XORBLOCK_TEMPLATE(pII_mmx)
++{
++ char fpu_save[108];
++ int lines = (bh_ptr[0]->b_size>>7);
++
++ if (!(current->flags & PF_USEDFPU))
++ __asm__ __volatile__ ( " clts;\n");
++
++ __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
++
++#define LD(x,y) \
++ " movq 8*("#x")(%1), %%mm"#y" ;\n"
++#define ST(x,y) \
++ " movq %%mm"#y", 8*("#x")(%1) ;\n"
++#define XO1(x,y) \
++ " pxor 8*("#x")(%2), %%mm"#y" ;\n"
++#define XO2(x,y) \
++ " pxor 8*("#x")(%3), %%mm"#y" ;\n"
++#define XO3(x,y) \
++ " pxor 8*("#x")(%4), %%mm"#y" ;\n"
++#define XO4(x,y) \
++ " pxor 8*("#x")(%5), %%mm"#y" ;\n"
++
++ switch(count) {
++ case 2:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ LD(i,0) \
++ LD(i+1,1) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ XO1(i,0) \
++ ST(i,0) \
++ XO1(i+1,1) \
++ ST(i+1,1) \
++ XO1(i+2,2) \
++ ST(i+2,2) \
++ XO1(i+3,3) \
++ ST(i+3,3)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $128, %1 ;\n"
++ " addl $128, %2 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data)
++ : "memory");
++ break;
++ case 3:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ LD(i,0) \
++ LD(i+1,1) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ XO1(i,0) \
++ XO1(i+1,1) \
++ XO1(i+2,2) \
++ XO1(i+3,3) \
++ XO2(i,0) \
++ ST(i,0) \
++ XO2(i+1,1) \
++ ST(i+1,1) \
++ XO2(i+2,2) \
++ ST(i+2,2) \
++ XO2(i+3,3) \
++ ST(i+3,3)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $128, %1 ;\n"
++ " addl $128, %2 ;\n"
++ " addl $128, %3 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data)
++ : "memory");
++ break;
++ case 4:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ LD(i,0) \
++ LD(i+1,1) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ XO1(i,0) \
++ XO1(i+1,1) \
++ XO1(i+2,2) \
++ XO1(i+3,3) \
++ XO2(i,0) \
++ XO2(i+1,1) \
++ XO2(i+2,2) \
++ XO2(i+3,3) \
++ XO3(i,0) \
++ ST(i,0) \
++ XO3(i+1,1) \
++ ST(i+1,1) \
++ XO3(i+2,2) \
++ ST(i+2,2) \
++ XO3(i+3,3) \
++ ST(i+3,3)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $128, %1 ;\n"
++ " addl $128, %2 ;\n"
++ " addl $128, %3 ;\n"
++ " addl $128, %4 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data),
++ "r" (bh_ptr[3]->b_data)
++ : "memory");
++ break;
++ case 5:
++ __asm__ __volatile__ (
++#undef BLOCK
++#define BLOCK(i) \
++ LD(i,0) \
++ LD(i+1,1) \
++ LD(i+2,2) \
++ LD(i+3,3) \
++ XO1(i,0) \
++ XO1(i+1,1) \
++ XO1(i+2,2) \
++ XO1(i+3,3) \
++ XO2(i,0) \
++ XO2(i+1,1) \
++ XO2(i+2,2) \
++ XO2(i+3,3) \
++ XO3(i,0) \
++ XO3(i+1,1) \
++ XO3(i+2,2) \
++ XO3(i+3,3) \
++ XO4(i,0) \
++ ST(i,0) \
++ XO4(i+1,1) \
++ ST(i+1,1) \
++ XO4(i+2,2) \
++ ST(i+2,2) \
++ XO4(i+3,3) \
++ ST(i+3,3)
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++
++ BLOCK(0)
++ BLOCK(4)
++ BLOCK(8)
++ BLOCK(12)
++
++ " addl $128, %1 ;\n"
++ " addl $128, %2 ;\n"
++ " addl $128, %3 ;\n"
++ " addl $128, %4 ;\n"
++ " addl $128, %5 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data),
++ "r" (bh_ptr[3]->b_data),
++ "r" (bh_ptr[4]->b_data)
++ : "memory");
++ break;
++ }
++
++ __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
++
++ if (!(current->flags & PF_USEDFPU))
++ stts();
++}
++
++#undef LD
++#undef XO1
++#undef XO2
++#undef XO3
++#undef XO4
++#undef ST
++#undef BLOCK
++
++XORBLOCK_TEMPLATE(p5_mmx)
++{
++ char fpu_save[108];
++ int lines = (bh_ptr[0]->b_size>>6);
++
++ if (!(current->flags & PF_USEDFPU))
++ __asm__ __volatile__ ( " clts;\n");
++
++ __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
++
++ switch(count) {
++ case 2:
++ __asm__ __volatile__ (
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++ " movq (%1), %%mm0 ;\n"
++ " movq 8(%1), %%mm1 ;\n"
++ " pxor (%2), %%mm0 ;\n"
++ " movq 16(%1), %%mm2 ;\n"
++ " movq %%mm0, (%1) ;\n"
++ " pxor 8(%2), %%mm1 ;\n"
++ " movq 24(%1), %%mm3 ;\n"
++ " movq %%mm1, 8(%1) ;\n"
++ " pxor 16(%2), %%mm2 ;\n"
++ " movq 32(%1), %%mm4 ;\n"
++ " movq %%mm2, 16(%1) ;\n"
++ " pxor 24(%2), %%mm3 ;\n"
++ " movq 40(%1), %%mm5 ;\n"
++ " movq %%mm3, 24(%1) ;\n"
++ " pxor 32(%2), %%mm4 ;\n"
++ " movq 48(%1), %%mm6 ;\n"
++ " movq %%mm4, 32(%1) ;\n"
++ " pxor 40(%2), %%mm5 ;\n"
++ " movq 56(%1), %%mm7 ;\n"
++ " movq %%mm5, 40(%1) ;\n"
++ " pxor 48(%2), %%mm6 ;\n"
++ " pxor 56(%2), %%mm7 ;\n"
++ " movq %%mm6, 48(%1) ;\n"
++ " movq %%mm7, 56(%1) ;\n"
++
++ " addl $64, %1 ;\n"
++ " addl $64, %2 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data)
++ : "memory" );
++ break;
++ case 3:
++ __asm__ __volatile__ (
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++ " movq (%1), %%mm0 ;\n"
++ " movq 8(%1), %%mm1 ;\n"
++ " pxor (%2), %%mm0 ;\n"
++ " movq 16(%1), %%mm2 ;\n"
++ " pxor 8(%2), %%mm1 ;\n"
++ " pxor (%3), %%mm0 ;\n"
++ " pxor 16(%2), %%mm2 ;\n"
++ " movq %%mm0, (%1) ;\n"
++ " pxor 8(%3), %%mm1 ;\n"
++ " pxor 16(%3), %%mm2 ;\n"
++ " movq 24(%1), %%mm3 ;\n"
++ " movq %%mm1, 8(%1) ;\n"
++ " movq 32(%1), %%mm4 ;\n"
++ " movq 40(%1), %%mm5 ;\n"
++ " pxor 24(%2), %%mm3 ;\n"
++ " movq %%mm2, 16(%1) ;\n"
++ " pxor 32(%2), %%mm4 ;\n"
++ " pxor 24(%3), %%mm3 ;\n"
++ " pxor 40(%2), %%mm5 ;\n"
++ " movq %%mm3, 24(%1) ;\n"
++ " pxor 32(%3), %%mm4 ;\n"
++ " pxor 40(%3), %%mm5 ;\n"
++ " movq 48(%1), %%mm6 ;\n"
++ " movq %%mm4, 32(%1) ;\n"
++ " movq 56(%1), %%mm7 ;\n"
++ " pxor 48(%2), %%mm6 ;\n"
++ " movq %%mm5, 40(%1) ;\n"
++ " pxor 56(%2), %%mm7 ;\n"
++ " pxor 48(%3), %%mm6 ;\n"
++ " pxor 56(%3), %%mm7 ;\n"
++ " movq %%mm6, 48(%1) ;\n"
++ " movq %%mm7, 56(%1) ;\n"
++
++ " addl $64, %1 ;\n"
++ " addl $64, %2 ;\n"
++ " addl $64, %3 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data)
++ : "memory" );
++ break;
++ case 4:
++ __asm__ __volatile__ (
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++ " movq (%1), %%mm0 ;\n"
++ " movq 8(%1), %%mm1 ;\n"
++ " pxor (%2), %%mm0 ;\n"
++ " movq 16(%1), %%mm2 ;\n"
++ " pxor 8(%2), %%mm1 ;\n"
++ " pxor (%3), %%mm0 ;\n"
++ " pxor 16(%2), %%mm2 ;\n"
++ " pxor 8(%3), %%mm1 ;\n"
++ " pxor (%4), %%mm0 ;\n"
++ " movq 24(%1), %%mm3 ;\n"
++ " pxor 16(%3), %%mm2 ;\n"
++ " pxor 8(%4), %%mm1 ;\n"
++ " movq %%mm0, (%1) ;\n"
++ " movq 32(%1), %%mm4 ;\n"
++ " pxor 24(%2), %%mm3 ;\n"
++ " pxor 16(%4), %%mm2 ;\n"
++ " movq %%mm1, 8(%1) ;\n"
++ " movq 40(%1), %%mm5 ;\n"
++ " pxor 32(%2), %%mm4 ;\n"
++ " pxor 24(%3), %%mm3 ;\n"
++ " movq %%mm2, 16(%1) ;\n"
++ " pxor 40(%2), %%mm5 ;\n"
++ " pxor 32(%3), %%mm4 ;\n"
++ " pxor 24(%4), %%mm3 ;\n"
++ " movq %%mm3, 24(%1) ;\n"
++ " movq 56(%1), %%mm7 ;\n"
++ " movq 48(%1), %%mm6 ;\n"
++ " pxor 40(%3), %%mm5 ;\n"
++ " pxor 32(%4), %%mm4 ;\n"
++ " pxor 48(%2), %%mm6 ;\n"
++ " movq %%mm4, 32(%1) ;\n"
++ " pxor 56(%2), %%mm7 ;\n"
++ " pxor 40(%4), %%mm5 ;\n"
++ " pxor 48(%3), %%mm6 ;\n"
++ " pxor 56(%3), %%mm7 ;\n"
++ " movq %%mm5, 40(%1) ;\n"
++ " pxor 48(%4), %%mm6 ;\n"
++ " pxor 56(%4), %%mm7 ;\n"
++ " movq %%mm6, 48(%1) ;\n"
++ " movq %%mm7, 56(%1) ;\n"
++
++ " addl $64, %1 ;\n"
++ " addl $64, %2 ;\n"
++ " addl $64, %3 ;\n"
++ " addl $64, %4 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data),
++ "r" (bh_ptr[3]->b_data)
++ : "memory" );
++ break;
++ case 5:
++ __asm__ __volatile__ (
++
++ " .align 32,0x90 ;\n"
++ " 1: ;\n"
++ " movq (%1), %%mm0 ;\n"
++ " movq 8(%1), %%mm1 ;\n"
++ " pxor (%2), %%mm0 ;\n"
++ " pxor 8(%2), %%mm1 ;\n"
++ " movq 16(%1), %%mm2 ;\n"
++ " pxor (%3), %%mm0 ;\n"
++ " pxor 8(%3), %%mm1 ;\n"
++ " pxor 16(%2), %%mm2 ;\n"
++ " pxor (%4), %%mm0 ;\n"
++ " pxor 8(%4), %%mm1 ;\n"
++ " pxor 16(%3), %%mm2 ;\n"
++ " movq 24(%1), %%mm3 ;\n"
++ " pxor (%5), %%mm0 ;\n"
++ " pxor 8(%5), %%mm1 ;\n"
++ " movq %%mm0, (%1) ;\n"
++ " pxor 16(%4), %%mm2 ;\n"
++ " pxor 24(%2), %%mm3 ;\n"
++ " movq %%mm1, 8(%1) ;\n"
++ " pxor 16(%5), %%mm2 ;\n"
++ " pxor 24(%3), %%mm3 ;\n"
++ " movq 32(%1), %%mm4 ;\n"
++ " movq %%mm2, 16(%1) ;\n"
++ " pxor 24(%4), %%mm3 ;\n"
++ " pxor 32(%2), %%mm4 ;\n"
++ " movq 40(%1), %%mm5 ;\n"
++ " pxor 24(%5), %%mm3 ;\n"
++ " pxor 32(%3), %%mm4 ;\n"
++ " pxor 40(%2), %%mm5 ;\n"
++ " movq %%mm3, 24(%1) ;\n"
++ " pxor 32(%4), %%mm4 ;\n"
++ " pxor 40(%3), %%mm5 ;\n"
++ " movq 48(%1), %%mm6 ;\n"
++ " movq 56(%1), %%mm7 ;\n"
++ " pxor 32(%5), %%mm4 ;\n"
++ " pxor 40(%4), %%mm5 ;\n"
++ " pxor 48(%2), %%mm6 ;\n"
++ " pxor 56(%2), %%mm7 ;\n"
++ " movq %%mm4, 32(%1) ;\n"
++ " pxor 48(%3), %%mm6 ;\n"
++ " pxor 56(%3), %%mm7 ;\n"
++ " pxor 40(%5), %%mm5 ;\n"
++ " pxor 48(%4), %%mm6 ;\n"
++ " pxor 56(%4), %%mm7 ;\n"
++ " movq %%mm5, 40(%1) ;\n"
++ " pxor 48(%5), %%mm6 ;\n"
++ " pxor 56(%5), %%mm7 ;\n"
++ " movq %%mm6, 48(%1) ;\n"
++ " movq %%mm7, 56(%1) ;\n"
++
++ " addl $64, %1 ;\n"
++ " addl $64, %2 ;\n"
++ " addl $64, %3 ;\n"
++ " addl $64, %4 ;\n"
++ " addl $64, %5 ;\n"
++ " decl %0 ;\n"
++ " jnz 1b ;\n"
++
++ :
++ : "r" (lines),
++ "r" (bh_ptr[0]->b_data),
++ "r" (bh_ptr[1]->b_data),
++ "r" (bh_ptr[2]->b_data),
++ "r" (bh_ptr[3]->b_data),
++ "r" (bh_ptr[4]->b_data)
++ : "memory" );
++ break;
++ }
++
++ __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
++
++ if (!(current->flags & PF_USEDFPU))
++ stts();
++}
++#endif /* __i386__ */
++#endif /* !__sparc_v9__ */
++
++#ifdef __sparc_v9__
++/*
++ * High speed xor_block operation for RAID4/5 utilizing the
++ * UltraSparc Visual Instruction Set.
++ *
++ * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
++ *
++ * Requirements:
++ * !(((long)dest | (long)sourceN) & (64 - 1)) &&
++ * !(len & 127) && len >= 256
++ *
++ * It is done in pure assembly, as otherwise gcc makes it
++ * a non-leaf function, which is not what we want.
++ * Also, we don't measure the speeds as on other architectures,
++ * as the measuring routine does not take into account cold caches
++ * and the fact that xor_block_VIS bypasses the caches.
++ * xor_block_32regs might be 5% faster for count 2 if caches are hot
++ * and things are just right (for count 3 VIS is about as fast as
++ * 32regs with hot caches, and for counts 4 and 5 VIS is always faster
++ * by a good margin), but I think it is better not to pollute the
++ * caches. Actually, if I only cared about speed with hot caches, I
++ * could write a hybrid VIS/integer routine which would always do two
++ * 64B blocks in VIS and two in the IEUs, but I really care more about
++ * caches.
++ */
++extern void *VISenter(void);
++extern void xor_block_VIS XOR_ARGS;
++
++void __xor_block_VIS(void)
++{
++__asm__ ("
++ .globl xor_block_VIS
++xor_block_VIS:
++ ldx [%%o1 + 0], %%o4
++ ldx [%%o1 + 8], %%o3
++ ldx [%%o4 + %1], %%g5
++ ldx [%%o4 + %0], %%o4
++ ldx [%%o3 + %0], %%o3
++ rd %%fprs, %%o5
++ andcc %%o5, %2, %%g0
++ be,pt %%icc, 297f
++ sethi %%hi(%5), %%g1
++ jmpl %%g1 + %%lo(%5), %%g7
++ add %%g7, 8, %%g7
++297: wr %%g0, %4, %%fprs
++ membar #LoadStore|#StoreLoad|#StoreStore
++ sub %%g5, 64, %%g5
++ ldda [%%o4] %3, %%f0
++ ldda [%%o3] %3, %%f16
++ cmp %%o0, 4
++ bgeu,pt %%xcc, 10f
++ cmp %%o0, 3
++ be,pn %%xcc, 13f
++ mov -64, %%g1
++ sub %%g5, 64, %%g5
++ rd %%asi, %%g1
++ wr %%g0, %3, %%asi
++
++2: ldda [%%o4 + 64] %%asi, %%f32
++ fxor %%f0, %%f16, %%f16
++ fxor %%f2, %%f18, %%f18
++ fxor %%f4, %%f20, %%f20
++ fxor %%f6, %%f22, %%f22
++ fxor %%f8, %%f24, %%f24
++ fxor %%f10, %%f26, %%f26
++ fxor %%f12, %%f28, %%f28
++ fxor %%f14, %%f30, %%f30
++ stda %%f16, [%%o4] %3
++ ldda [%%o3 + 64] %%asi, %%f48
++ ldda [%%o4 + 128] %%asi, %%f0
++ fxor %%f32, %%f48, %%f48
++ fxor %%f34, %%f50, %%f50
++ add %%o4, 128, %%o4
++ fxor %%f36, %%f52, %%f52
++ add %%o3, 128, %%o3
++ fxor %%f38, %%f54, %%f54
++ subcc %%g5, 128, %%g5
++ fxor %%f40, %%f56, %%f56
++ fxor %%f42, %%f58, %%f58
++ fxor %%f44, %%f60, %%f60
++ fxor %%f46, %%f62, %%f62
++ stda %%f48, [%%o4 - 64] %%asi
++ bne,pt %%xcc, 2b
++ ldda [%%o3] %3, %%f16
++
++ ldda [%%o4 + 64] %%asi, %%f32
++ fxor %%f0, %%f16, %%f16
++ fxor %%f2, %%f18, %%f18
++ fxor %%f4, %%f20, %%f20
++ fxor %%f6, %%f22, %%f22
++ fxor %%f8, %%f24, %%f24
++ fxor %%f10, %%f26, %%f26
++ fxor %%f12, %%f28, %%f28
++ fxor %%f14, %%f30, %%f30
++ stda %%f16, [%%o4] %3
++ ldda [%%o3 + 64] %%asi, %%f48
++ membar #Sync
++ fxor %%f32, %%f48, %%f48
++ fxor %%f34, %%f50, %%f50
++ fxor %%f36, %%f52, %%f52
++ fxor %%f38, %%f54, %%f54
++ fxor %%f40, %%f56, %%f56
++ fxor %%f42, %%f58, %%f58
++ fxor %%f44, %%f60, %%f60
++ fxor %%f46, %%f62, %%f62
++ stda %%f48, [%%o4 + 64] %%asi
++ membar #Sync|#StoreStore|#StoreLoad
++ wr %%g0, 0, %%fprs
++ retl
++ wr %%g1, %%g0, %%asi
++
++13: ldx [%%o1 + 16], %%o2
++ ldx [%%o2 + %0], %%o2
++
++3: ldda [%%o2] %3, %%f32
++ fxor %%f0, %%f16, %%f48
++ fxor %%f2, %%f18, %%f50
++ add %%o4, 64, %%o4
++ fxor %%f4, %%f20, %%f52
++ fxor %%f6, %%f22, %%f54
++ add %%o3, 64, %%o3
++ fxor %%f8, %%f24, %%f56
++ fxor %%f10, %%f26, %%f58
++ fxor %%f12, %%f28, %%f60
++ fxor %%f14, %%f30, %%f62
++ ldda [%%o4] %3, %%f0
++ fxor %%f48, %%f32, %%f48
++ fxor %%f50, %%f34, %%f50
++ fxor %%f52, %%f36, %%f52
++ fxor %%f54, %%f38, %%f54
++ add %%o2, 64, %%o2
++ fxor %%f56, %%f40, %%f56
++ fxor %%f58, %%f42, %%f58
++ subcc %%g5, 64, %%g5
++ fxor %%f60, %%f44, %%f60
++ fxor %%f62, %%f46, %%f62
++ stda %%f48, [%%o4 + %%g1] %3
++ bne,pt %%xcc, 3b
++ ldda [%%o3] %3, %%f16
++
++ ldda [%%o2] %3, %%f32
++ fxor %%f0, %%f16, %%f48
++ fxor %%f2, %%f18, %%f50
++ fxor %%f4, %%f20, %%f52
++ fxor %%f6, %%f22, %%f54
++ fxor %%f8, %%f24, %%f56
++ fxor %%f10, %%f26, %%f58
++ fxor %%f12, %%f28, %%f60
++ fxor %%f14, %%f30, %%f62
++ membar #Sync
++ fxor %%f48, %%f32, %%f48
++ fxor %%f50, %%f34, %%f50
++ fxor %%f52, %%f36, %%f52
++ fxor %%f54, %%f38, %%f54
++ fxor %%f56, %%f40, %%f56
++ fxor %%f58, %%f42, %%f58
++ fxor %%f60, %%f44, %%f60
++ fxor %%f62, %%f46, %%f62
++ stda %%f48, [%%o4] %3
++ membar #Sync|#StoreStore|#StoreLoad
++ retl
++ wr %%g0, 0, %%fprs
++
++10: cmp %%o0, 5
++ be,pt %%xcc, 15f
++ mov -64, %%g1
++
++14: ldx [%%o1 + 16], %%o2
++ ldx [%%o1 + 24], %%o0
++ ldx [%%o2 + %0], %%o2
++ ldx [%%o0 + %0], %%o0
++
++4: ldda [%%o2] %3, %%f32
++ fxor %%f0, %%f16, %%f16
++ fxor %%f2, %%f18, %%f18
++ add %%o4, 64, %%o4
++ fxor %%f4, %%f20, %%f20
++ fxor %%f6, %%f22, %%f22
++ add %%o3, 64, %%o3
++ fxor %%f8, %%f24, %%f24
++ fxor %%f10, %%f26, %%f26
++ fxor %%f12, %%f28, %%f28
++ fxor %%f14, %%f30, %%f30
++ ldda [%%o0] %3, %%f48
++ fxor %%f16, %%f32, %%f32
++ fxor %%f18, %%f34, %%f34
++ fxor %%f20, %%f36, %%f36
++ fxor %%f22, %%f38, %%f38
++ add %%o2, 64, %%o2
++ fxor %%f24, %%f40, %%f40
++ fxor %%f26, %%f42, %%f42
++ fxor %%f28, %%f44, %%f44
++ fxor %%f30, %%f46, %%f46
++ ldda [%%o4] %3, %%f0
++ fxor %%f32, %%f48, %%f48
++ fxor %%f34, %%f50, %%f50
++ fxor %%f36, %%f52, %%f52
++ add %%o0, 64, %%o0
++ fxor %%f38, %%f54, %%f54
++ fxor %%f40, %%f56, %%f56
++ fxor %%f42, %%f58, %%f58
++ subcc %%g5, 64, %%g5
++ fxor %%f44, %%f60, %%f60
++ fxor %%f46, %%f62, %%f62
++ stda %%f48, [%%o4 + %%g1] %3
++ bne,pt %%xcc, 4b
++ ldda [%%o3] %3, %%f16
++
++ ldda [%%o2] %3, %%f32
++ fxor %%f0, %%f16, %%f16
++ fxor %%f2, %%f18, %%f18
++ fxor %%f4, %%f20, %%f20
++ fxor %%f6, %%f22, %%f22
++ fxor %%f8, %%f24, %%f24
++ fxor %%f10, %%f26, %%f26
++ fxor %%f12, %%f28, %%f28
++ fxor %%f14, %%f30, %%f30
++ ldda [%%o0] %3, %%f48
++ fxor %%f16, %%f32, %%f32
++ fxor %%f18, %%f34, %%f34
++ fxor %%f20, %%f36, %%f36
++ fxor %%f22, %%f38, %%f38
++ fxor %%f24, %%f40, %%f40
++ fxor %%f26, %%f42, %%f42
++ fxor %%f28, %%f44, %%f44
++ fxor %%f30, %%f46, %%f46
++ membar #Sync
++ fxor %%f32, %%f48, %%f48
++ fxor %%f34, %%f50, %%f50
++ fxor %%f36, %%f52, %%f52
++ fxor %%f38, %%f54, %%f54
++ fxor %%f40, %%f56, %%f56
++ fxor %%f42, %%f58, %%f58
++ fxor %%f44, %%f60, %%f60
++ fxor %%f46, %%f62, %%f62
++ stda %%f48, [%%o4] %3
++ membar #Sync|#StoreStore|#StoreLoad
++ retl
++ wr %%g0, 0, %%fprs
++
++15: ldx [%%o1 + 16], %%o2
++ ldx [%%o1 + 24], %%o0
++ ldx [%%o1 + 32], %%o1
++ ldx [%%o2 + %0], %%o2
++ ldx [%%o0 + %0], %%o0
++ ldx [%%o1 + %0], %%o1
++
++5: ldda [%%o2] %3, %%f32
++ fxor %%f0, %%f16, %%f48
++ fxor %%f2, %%f18, %%f50
++ add %%o4, 64, %%o4
++ fxor %%f4, %%f20, %%f52
++ fxor %%f6, %%f22, %%f54
++ add %%o3, 64, %%o3
++ fxor %%f8, %%f24, %%f56
++ fxor %%f10, %%f26, %%f58
++ fxor %%f12, %%f28, %%f60
++ fxor %%f14, %%f30, %%f62
++ ldda [%%o0] %3, %%f16
++ fxor %%f48, %%f32, %%f48
++ fxor %%f50, %%f34, %%f50
++ fxor %%f52, %%f36, %%f52
++ fxor %%f54, %%f38, %%f54
++ add %%o2, 64, %%o2
++ fxor %%f56, %%f40, %%f56
++ fxor %%f58, %%f42, %%f58
++ fxor %%f60, %%f44, %%f60
++ fxor %%f62, %%f46, %%f62
++ ldda [%%o1] %3, %%f32
++ fxor %%f48, %%f16, %%f48
++ fxor %%f50, %%f18, %%f50
++ add %%o0, 64, %%o0
++ fxor %%f52, %%f20, %%f52
++ fxor %%f54, %%f22, %%f54
++ add %%o1, 64, %%o1
++ fxor %%f56, %%f24, %%f56
++ fxor %%f58, %%f26, %%f58
++ fxor %%f60, %%f28, %%f60
++ fxor %%f62, %%f30, %%f62
++ ldda [%%o4] %3, %%f0
++ fxor %%f48, %%f32, %%f48
++ fxor %%f50, %%f34, %%f50
++ fxor %%f52, %%f36, %%f52
++ fxor %%f54, %%f38, %%f54
++ fxor %%f56, %%f40, %%f56
++ fxor %%f58, %%f42, %%f58
++ subcc %%g5, 64, %%g5
++ fxor %%f60, %%f44, %%f60
++ fxor %%f62, %%f46, %%f62
++ stda %%f48, [%%o4 + %%g1] %3
++ bne,pt %%xcc, 5b
++ ldda [%%o3] %3, %%f16
++
++ ldda [%%o2] %3, %%f32
++ fxor %%f0, %%f16, %%f48
++ fxor %%f2, %%f18, %%f50
++ fxor %%f4, %%f20, %%f52
++ fxor %%f6, %%f22, %%f54
++ fxor %%f8, %%f24, %%f56
++ fxor %%f10, %%f26, %%f58
++ fxor %%f12, %%f28, %%f60
++ fxor %%f14, %%f30, %%f62
++ ldda [%%o0] %3, %%f16
++ fxor %%f48, %%f32, %%f48
++ fxor %%f50, %%f34, %%f50
++ fxor %%f52, %%f36, %%f52
++ fxor %%f54, %%f38, %%f54
++ fxor %%f56, %%f40, %%f56
++ fxor %%f58, %%f42, %%f58
++ fxor %%f60, %%f44, %%f60
++ fxor %%f62, %%f46, %%f62
++ ldda [%%o1] %3, %%f32
++ fxor %%f48, %%f16, %%f48
++ fxor %%f50, %%f18, %%f50
++ fxor %%f52, %%f20, %%f52
++ fxor %%f54, %%f22, %%f54
++ fxor %%f56, %%f24, %%f56
++ fxor %%f58, %%f26, %%f58
++ fxor %%f60, %%f28, %%f60
++ fxor %%f62, %%f30, %%f62
++ membar #Sync
++ fxor %%f48, %%f32, %%f48
++ fxor %%f50, %%f34, %%f50
++ fxor %%f52, %%f36, %%f52
++ fxor %%f54, %%f38, %%f54
++ fxor %%f56, %%f40, %%f56
++ fxor %%f58, %%f42, %%f58
++ fxor %%f60, %%f44, %%f60
++ fxor %%f62, %%f46, %%f62
++ stda %%f48, [%%o4] %3
++ membar #Sync|#StoreStore|#StoreLoad
++ retl
++ wr %%g0, 0, %%fprs
++ " : :
++ "i" (&((struct buffer_head *)0)->b_data),
++ "i" (&((struct buffer_head *)0)->b_size),
++ "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P),
++ "i" (FPRS_FEF), "i" (VISenter));
++}
++#endif /* __sparc_v9__ */
++
++#if defined(__sparc__) && !defined(__sparc_v9__)
++/*
++ * High speed xor_block operation for RAID4/5 utilizing the
++ * ldd/std SPARC instructions.
++ *
++ * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
++ *
++ */
++
++XORBLOCK_TEMPLATE(SPARC)
++{
++ int size = bh_ptr[0]->b_size;
++ int lines = size / (sizeof (long)) / 8, i;
++ long *destp = (long *) bh_ptr[0]->b_data;
++ long *source1 = (long *) bh_ptr[1]->b_data;
++ long *source2, *source3, *source4;
++
++ switch (count) {
++ case 2:
++ for (i = lines; i > 0; i--) {
++ __asm__ __volatile__("
++ ldd [%0 + 0x00], %%g2
++ ldd [%0 + 0x08], %%g4
++ ldd [%0 + 0x10], %%o0
++ ldd [%0 + 0x18], %%o2
++ ldd [%1 + 0x00], %%o4
++ ldd [%1 + 0x08], %%l0
++ ldd [%1 + 0x10], %%l2
++ ldd [%1 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ std %%g2, [%0 + 0x00]
++ std %%g4, [%0 + 0x08]
++ std %%o0, [%0 + 0x10]
++ std %%o2, [%0 + 0x18]
++ " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0",
++ "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5");
++ destp += 8;
++ source1 += 8;
++ }
++ break;
++ case 3:
++ source2 = (long *) bh_ptr[2]->b_data;
++ for (i = lines; i > 0; i--) {
++ __asm__ __volatile__("
++ ldd [%0 + 0x00], %%g2
++ ldd [%0 + 0x08], %%g4
++ ldd [%0 + 0x10], %%o0
++ ldd [%0 + 0x18], %%o2
++ ldd [%1 + 0x00], %%o4
++ ldd [%1 + 0x08], %%l0
++ ldd [%1 + 0x10], %%l2
++ ldd [%1 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ ldd [%2 + 0x00], %%o4
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ ldd [%2 + 0x08], %%l0
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ ldd [%2 + 0x10], %%l2
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ ldd [%2 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ std %%g2, [%0 + 0x00]
++ std %%g4, [%0 + 0x08]
++ std %%o0, [%0 + 0x10]
++ std %%o2, [%0 + 0x18]
++ " : : "r" (destp), "r" (source1), "r" (source2)
++ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
++ "l0", "l1", "l2", "l3", "l4", "l5");
++ destp += 8;
++ source1 += 8;
++ source2 += 8;
++ }
++ break;
++ case 4:
++ source2 = (long *) bh_ptr[2]->b_data;
++ source3 = (long *) bh_ptr[3]->b_data;
++ for (i = lines; i > 0; i--) {
++ __asm__ __volatile__("
++ ldd [%0 + 0x00], %%g2
++ ldd [%0 + 0x08], %%g4
++ ldd [%0 + 0x10], %%o0
++ ldd [%0 + 0x18], %%o2
++ ldd [%1 + 0x00], %%o4
++ ldd [%1 + 0x08], %%l0
++ ldd [%1 + 0x10], %%l2
++ ldd [%1 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ ldd [%2 + 0x00], %%o4
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ ldd [%2 + 0x08], %%l0
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ ldd [%2 + 0x10], %%l2
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ ldd [%2 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ ldd [%3 + 0x00], %%o4
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ ldd [%3 + 0x08], %%l0
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ ldd [%3 + 0x10], %%l2
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ ldd [%3 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ std %%g2, [%0 + 0x00]
++ std %%g4, [%0 + 0x08]
++ std %%o0, [%0 + 0x10]
++ std %%o2, [%0 + 0x18]
++ " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3)
++ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
++ "l0", "l1", "l2", "l3", "l4", "l5");
++ destp += 8;
++ source1 += 8;
++ source2 += 8;
++ source3 += 8;
++ }
++ break;
++ case 5:
++ source2 = (long *) bh_ptr[2]->b_data;
++ source3 = (long *) bh_ptr[3]->b_data;
++ source4 = (long *) bh_ptr[4]->b_data;
++ for (i = lines; i > 0; i--) {
++ __asm__ __volatile__("
++ ldd [%0 + 0x00], %%g2
++ ldd [%0 + 0x08], %%g4
++ ldd [%0 + 0x10], %%o0
++ ldd [%0 + 0x18], %%o2
++ ldd [%1 + 0x00], %%o4
++ ldd [%1 + 0x08], %%l0
++ ldd [%1 + 0x10], %%l2
++ ldd [%1 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ ldd [%2 + 0x00], %%o4
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ ldd [%2 + 0x08], %%l0
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ ldd [%2 + 0x10], %%l2
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ ldd [%2 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ ldd [%3 + 0x00], %%o4
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ ldd [%3 + 0x08], %%l0
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ ldd [%3 + 0x10], %%l2
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ ldd [%3 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ ldd [%4 + 0x00], %%o4
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ ldd [%4 + 0x08], %%l0
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ ldd [%4 + 0x10], %%l2
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ ldd [%4 + 0x18], %%l4
++ xor %%g2, %%o4, %%g2
++ xor %%g3, %%o5, %%g3
++ xor %%g4, %%l0, %%g4
++ xor %%g5, %%l1, %%g5
++ xor %%o0, %%l2, %%o0
++ xor %%o1, %%l3, %%o1
++ xor %%o2, %%l4, %%o2
++ xor %%o3, %%l5, %%o3
++ std %%g2, [%0 + 0x00]
++ std %%g4, [%0 + 0x08]
++ std %%o0, [%0 + 0x10]
++ std %%o2, [%0 + 0x18]
++ " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4)
++ : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
++ "l0", "l1", "l2", "l3", "l4", "l5");
++ destp += 8;
++ source1 += 8;
++ source2 += 8;
++ source3 += 8;
++ source4 += 8;
++ }
++ break;
++ }
++}
++#endif /* __sparc_v[78]__ */
++
++#ifndef __sparc_v9__
++
++/*
++ * this one works reasonably on any x86 CPU
++ * (send me an assembly version for inclusion if you can make it faster)
++ *
++ * this one is just as fast as a version written in pure assembly
++ * on x86. the reason for this separate version is that the
++ * fast open-coded xor routine "32regs" produces suboptimal code
++ * on x86, due to the lack of registers.
++ */
++XORBLOCK_TEMPLATE(8regs)
++{
++ int len = bh_ptr[0]->b_size;
++ long *destp = (long *) bh_ptr[0]->b_data;
++ long *source1, *source2, *source3, *source4;
++ long lines = len / (sizeof (long)) / 8, i;
++
++ switch(count) {
++ case 2:
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ *(destp + 0) ^= *(source1 + 0);
++ *(destp + 1) ^= *(source1 + 1);
++ *(destp + 2) ^= *(source1 + 2);
++ *(destp + 3) ^= *(source1 + 3);
++ *(destp + 4) ^= *(source1 + 4);
++ *(destp + 5) ^= *(source1 + 5);
++ *(destp + 6) ^= *(source1 + 6);
++ *(destp + 7) ^= *(source1 + 7);
++ source1 += 8;
++ destp += 8;
++ }
++ break;
++ case 3:
++ source2 = (long *) bh_ptr[2]->b_data;
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ *(destp + 0) ^= *(source1 + 0);
++ *(destp + 0) ^= *(source2 + 0);
++ *(destp + 1) ^= *(source1 + 1);
++ *(destp + 1) ^= *(source2 + 1);
++ *(destp + 2) ^= *(source1 + 2);
++ *(destp + 2) ^= *(source2 + 2);
++ *(destp + 3) ^= *(source1 + 3);
++ *(destp + 3) ^= *(source2 + 3);
++ *(destp + 4) ^= *(source1 + 4);
++ *(destp + 4) ^= *(source2 + 4);
++ *(destp + 5) ^= *(source1 + 5);
++ *(destp + 5) ^= *(source2 + 5);
++ *(destp + 6) ^= *(source1 + 6);
++ *(destp + 6) ^= *(source2 + 6);
++ *(destp + 7) ^= *(source1 + 7);
++ *(destp + 7) ^= *(source2 + 7);
++ source1 += 8;
++ source2 += 8;
++ destp += 8;
++ }
++ break;
++ case 4:
++ source3 = (long *) bh_ptr[3]->b_data;
++ source2 = (long *) bh_ptr[2]->b_data;
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ *(destp + 0) ^= *(source1 + 0);
++ *(destp + 0) ^= *(source2 + 0);
++ *(destp + 0) ^= *(source3 + 0);
++ *(destp + 1) ^= *(source1 + 1);
++ *(destp + 1) ^= *(source2 + 1);
++ *(destp + 1) ^= *(source3 + 1);
++ *(destp + 2) ^= *(source1 + 2);
++ *(destp + 2) ^= *(source2 + 2);
++ *(destp + 2) ^= *(source3 + 2);
++ *(destp + 3) ^= *(source1 + 3);
++ *(destp + 3) ^= *(source2 + 3);
++ *(destp + 3) ^= *(source3 + 3);
++ *(destp + 4) ^= *(source1 + 4);
++ *(destp + 4) ^= *(source2 + 4);
++ *(destp + 4) ^= *(source3 + 4);
++ *(destp + 5) ^= *(source1 + 5);
++ *(destp + 5) ^= *(source2 + 5);
++ *(destp + 5) ^= *(source3 + 5);
++ *(destp + 6) ^= *(source1 + 6);
++ *(destp + 6) ^= *(source2 + 6);
++ *(destp + 6) ^= *(source3 + 6);
++ *(destp + 7) ^= *(source1 + 7);
++ *(destp + 7) ^= *(source2 + 7);
++ *(destp + 7) ^= *(source3 + 7);
++ source1 += 8;
++ source2 += 8;
++ source3 += 8;
++ destp += 8;
++ }
++ break;
++ case 5:
++ source4 = (long *) bh_ptr[4]->b_data;
++ source3 = (long *) bh_ptr[3]->b_data;
++ source2 = (long *) bh_ptr[2]->b_data;
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ *(destp + 0) ^= *(source1 + 0);
++ *(destp + 0) ^= *(source2 + 0);
++ *(destp + 0) ^= *(source3 + 0);
++ *(destp + 0) ^= *(source4 + 0);
++ *(destp + 1) ^= *(source1 + 1);
++ *(destp + 1) ^= *(source2 + 1);
++ *(destp + 1) ^= *(source3 + 1);
++ *(destp + 1) ^= *(source4 + 1);
++ *(destp + 2) ^= *(source1 + 2);
++ *(destp + 2) ^= *(source2 + 2);
++ *(destp + 2) ^= *(source3 + 2);
++ *(destp + 2) ^= *(source4 + 2);
++ *(destp + 3) ^= *(source1 + 3);
++ *(destp + 3) ^= *(source2 + 3);
++ *(destp + 3) ^= *(source3 + 3);
++ *(destp + 3) ^= *(source4 + 3);
++ *(destp + 4) ^= *(source1 + 4);
++ *(destp + 4) ^= *(source2 + 4);
++ *(destp + 4) ^= *(source3 + 4);
++ *(destp + 4) ^= *(source4 + 4);
++ *(destp + 5) ^= *(source1 + 5);
++ *(destp + 5) ^= *(source2 + 5);
++ *(destp + 5) ^= *(source3 + 5);
++ *(destp + 5) ^= *(source4 + 5);
++ *(destp + 6) ^= *(source1 + 6);
++ *(destp + 6) ^= *(source2 + 6);
++ *(destp + 6) ^= *(source3 + 6);
++ *(destp + 6) ^= *(source4 + 6);
++ *(destp + 7) ^= *(source1 + 7);
++ *(destp + 7) ^= *(source2 + 7);
++ *(destp + 7) ^= *(source3 + 7);
++ *(destp + 7) ^= *(source4 + 7);
++ source1 += 8;
++ source2 += 8;
++ source3 += 8;
++ source4 += 8;
++ destp += 8;
++ }
++ break;
++ }
++}
++
++/*
++ * platform independent RAID5 checksum calculation; this should
++ * be very fast on any platform that has a decent number of
++ * registers (32 or more).
++ */
++XORBLOCK_TEMPLATE(32regs)
++{
++ int size = bh_ptr[0]->b_size;
++ int lines = size / (sizeof (long)) / 8, i;
++ long *destp = (long *) bh_ptr[0]->b_data;
++ long *source1, *source2, *source3, *source4;
++
++ /* LOTS of registers available...
++ We do explicit loop-unrolling here for code which
++ favours RISC machines. In fact this is almost direct
++ RISC assembly on Alpha and SPARC :-) */
++
++
++ switch(count) {
++ case 2:
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ register long d0, d1, d2, d3, d4, d5, d6, d7;
++ d0 = destp[0]; /* Pull the stuff into registers */
++ d1 = destp[1]; /* ... in bursts, if possible. */
++ d2 = destp[2];
++ d3 = destp[3];
++ d4 = destp[4];
++ d5 = destp[5];
++ d6 = destp[6];
++ d7 = destp[7];
++ d0 ^= source1[0];
++ d1 ^= source1[1];
++ d2 ^= source1[2];
++ d3 ^= source1[3];
++ d4 ^= source1[4];
++ d5 ^= source1[5];
++ d6 ^= source1[6];
++ d7 ^= source1[7];
++ destp[0] = d0; /* Store the result (in bursts) */
++ destp[1] = d1;
++ destp[2] = d2;
++ destp[3] = d3;
++ destp[4] = d4; /* Store the result (in bursts) */
++ destp[5] = d5;
++ destp[6] = d6;
++ destp[7] = d7;
++ source1 += 8;
++ destp += 8;
++ }
++ break;
++ case 3:
++ source2 = (long *) bh_ptr[2]->b_data;
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ register long d0, d1, d2, d3, d4, d5, d6, d7;
++ d0 = destp[0]; /* Pull the stuff into registers */
++ d1 = destp[1]; /* ... in bursts, if possible. */
++ d2 = destp[2];
++ d3 = destp[3];
++ d4 = destp[4];
++ d5 = destp[5];
++ d6 = destp[6];
++ d7 = destp[7];
++ d0 ^= source1[0];
++ d1 ^= source1[1];
++ d2 ^= source1[2];
++ d3 ^= source1[3];
++ d4 ^= source1[4];
++ d5 ^= source1[5];
++ d6 ^= source1[6];
++ d7 ^= source1[7];
++ d0 ^= source2[0];
++ d1 ^= source2[1];
++ d2 ^= source2[2];
++ d3 ^= source2[3];
++ d4 ^= source2[4];
++ d5 ^= source2[5];
++ d6 ^= source2[6];
++ d7 ^= source2[7];
++ destp[0] = d0; /* Store the result (in bursts) */
++ destp[1] = d1;
++ destp[2] = d2;
++ destp[3] = d3;
++ destp[4] = d4; /* Store the result (in bursts) */
++ destp[5] = d5;
++ destp[6] = d6;
++ destp[7] = d7;
++ source1 += 8;
++ source2 += 8;
++ destp += 8;
++ }
++ break;
++ case 4:
++ source3 = (long *) bh_ptr[3]->b_data;
++ source2 = (long *) bh_ptr[2]->b_data;
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ register long d0, d1, d2, d3, d4, d5, d6, d7;
++ d0 = destp[0]; /* Pull the stuff into registers */
++ d1 = destp[1]; /* ... in bursts, if possible. */
++ d2 = destp[2];
++ d3 = destp[3];
++ d4 = destp[4];
++ d5 = destp[5];
++ d6 = destp[6];
++ d7 = destp[7];
++ d0 ^= source1[0];
++ d1 ^= source1[1];
++ d2 ^= source1[2];
++ d3 ^= source1[3];
++ d4 ^= source1[4];
++ d5 ^= source1[5];
++ d6 ^= source1[6];
++ d7 ^= source1[7];
++ d0 ^= source2[0];
++ d1 ^= source2[1];
++ d2 ^= source2[2];
++ d3 ^= source2[3];
++ d4 ^= source2[4];
++ d5 ^= source2[5];
++ d6 ^= source2[6];
++ d7 ^= source2[7];
++ d0 ^= source3[0];
++ d1 ^= source3[1];
++ d2 ^= source3[2];
++ d3 ^= source3[3];
++ d4 ^= source3[4];
++ d5 ^= source3[5];
++ d6 ^= source3[6];
++ d7 ^= source3[7];
++ destp[0] = d0; /* Store the result (in bursts) */
++ destp[1] = d1;
++ destp[2] = d2;
++ destp[3] = d3;
++ destp[4] = d4; /* Store the result (in bursts) */
++ destp[5] = d5;
++ destp[6] = d6;
++ destp[7] = d7;
++ source1 += 8;
++ source2 += 8;
++ source3 += 8;
++ destp += 8;
++ }
++ break;
++ case 5:
++ source4 = (long *) bh_ptr[4]->b_data;
++ source3 = (long *) bh_ptr[3]->b_data;
++ source2 = (long *) bh_ptr[2]->b_data;
++ source1 = (long *) bh_ptr[1]->b_data;
++ for (i = lines; i > 0; i--) {
++ register long d0, d1, d2, d3, d4, d5, d6, d7;
++ d0 = destp[0]; /* Pull the stuff into registers */
++ d1 = destp[1]; /* ... in bursts, if possible. */
++ d2 = destp[2];
++ d3 = destp[3];
++ d4 = destp[4];
++ d5 = destp[5];
++ d6 = destp[6];
++ d7 = destp[7];
++ d0 ^= source1[0];
++ d1 ^= source1[1];
++ d2 ^= source1[2];
++ d3 ^= source1[3];
++ d4 ^= source1[4];
++ d5 ^= source1[5];
++ d6 ^= source1[6];
++ d7 ^= source1[7];
++ d0 ^= source2[0];
++ d1 ^= source2[1];
++ d2 ^= source2[2];
++ d3 ^= source2[3];
++ d4 ^= source2[4];
++ d5 ^= source2[5];
++ d6 ^= source2[6];
++ d7 ^= source2[7];
++ d0 ^= source3[0];
++ d1 ^= source3[1];
++ d2 ^= source3[2];
++ d3 ^= source3[3];
++ d4 ^= source3[4];
++ d5 ^= source3[5];
++ d6 ^= source3[6];
++ d7 ^= source3[7];
++ d0 ^= source4[0];
++ d1 ^= source4[1];
++ d2 ^= source4[2];
++ d3 ^= source4[3];
++ d4 ^= source4[4];
++ d5 ^= source4[5];
++ d6 ^= source4[6];
++ d7 ^= source4[7];
++ destp[0] = d0; /* Store the result (in bursts) */
++ destp[1] = d1;
++ destp[2] = d2;
++ destp[3] = d3;
++ destp[4] = d4; /* Store the result (in bursts) */
++ destp[5] = d5;
++ destp[6] = d6;
++ destp[7] = d7;
++ source1 += 8;
++ source2 += 8;
++ source3 += 8;
++ source4 += 8;
++ destp += 8;
++ }
++ break;
++ }
++}
++
++/*
++ * (the -6*32 shift factor colors the cache)
++ */
++#define SIZE (PAGE_SIZE-6*32)
++
++static void xor_speed ( struct xor_block_template * func,
++ struct buffer_head *b1, struct buffer_head *b2)
++{
++ int speed;
++ unsigned long now;
++ int i, count, max;
++ struct buffer_head *bh_ptr[6];
++
++ func->next = xor_functions;
++ xor_functions = func;
++ bh_ptr[0] = b1;
++ bh_ptr[1] = b2;
++
++ /*
++ * count the number of XORs done during a whole jiffy and
++ * calculate the checksumming speed from that.
++ * (we use an order-2, i.e. four-page, allocation to get a
++ * guaranteed L1-cache coloring)
++ */
++ max = 0;
++ for (i = 0; i < 5; i++) {
++ now = jiffies;
++ count = 0;
++ while (jiffies == now) {
++ mb();
++ func->xor_block(2,bh_ptr);
++ mb();
++ count++;
++ mb();
++ }
++ if (count > max)
++ max = count;
++ }
++
++ speed = max * (HZ*SIZE/1024);
++ func->speed = speed;
++
++ printk( " %-10s: %5d.%03d MB/sec\n", func->name,
++ speed / 1000, speed % 1000);
++}
++
++static inline void pick_fastest_function(void)
++{
++ struct xor_block_template *f, *fastest;
++
++ fastest = xor_functions;
++ for (f = fastest; f; f = f->next) {
++ if (f->speed > fastest->speed)
++ fastest = f;
++ }
++#ifdef CONFIG_X86_XMM
++ if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
++ fastest = &t_xor_block_pIII_kni;
++ }
++#endif
++ xor_block = fastest->xor_block;
++ printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
++ fastest->speed / 1000, fastest->speed % 1000);
++}
++
++
++void calibrate_xor_block(void)
++{
++ struct buffer_head b1, b2;
++
++ memset(&b1,0,sizeof(b1));
++ b2 = b1;
++
++ b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2);
++ if (!b1.b_data) {
++ pick_fastest_function();
++ return;
++ }
++ b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE;
++
++ b1.b_size = SIZE;
++
++ printk(KERN_INFO "raid5: measuring checksumming speed\n");
++
++ sti(); /* should be safe */
++
++#if defined(__sparc__) && !defined(__sparc_v9__)
++ printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n");
++ xor_speed(&t_xor_block_SPARC,&b1,&b2);
++#endif
++
++#ifdef CONFIG_X86_XMM
++ if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
++ printk(KERN_INFO
++ "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n");
++ /* we force the use of the KNI xor block because it
++ can write around the L2 cache. we may also be able
++ to load into the L1 cache only, depending on how
++ the cpu deals with a load to a line that is
++ being prefetched.
++ */
++ xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
++ }
++#endif /* CONFIG_X86_XMM */
++
++#ifdef __i386__
++
++ if (md_cpu_has_mmx()) {
++ printk(KERN_INFO
++ "raid5: MMX detected, trying high-speed MMX checksum routines\n");
++ xor_speed(&t_xor_block_pII_mmx,&b1,&b2);
++ xor_speed(&t_xor_block_p5_mmx,&b1,&b2);
++ }
++
++#endif /* __i386__ */
++
++
++ xor_speed(&t_xor_block_8regs,&b1,&b2);
++ xor_speed(&t_xor_block_32regs,&b1,&b2);
++
++ free_pages((unsigned long)b1.b_data,2);
++ pick_fastest_function();
++}
++
++#else /* __sparc_v9__ */
++
++void calibrate_xor_block(void)
++{
++ printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n");
++ xor_block = xor_block_VIS;
++}
++
++#endif /* __sparc_v9__ */
++
++MD_EXPORT_SYMBOL(xor_block);
++
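The calibration logic above reduces to a simple pattern: register each
candidate routine, run it in fixed timer-tick windows while counting
iterations, keep the best count as its speed, and point the xor_block
function pointer at the winner. Below is a minimal user-space sketch of
the same pattern; all names are hypothetical and clock() stands in for
jiffies:

    #include <stdio.h>
    #include <time.h>

    #define WORDS 4096

    static void xor_basic(unsigned long *d, const unsigned long *s, int n)
    {
            int i;

            for (i = 0; i < n; i++)
                    d[i] ^= s[i];
    }

    static void xor_unrolled(unsigned long *d, const unsigned long *s, int n)
    {
            int i;

            for (i = 0; i < n; i += 4) {    /* assumes n % 4 == 0 */
                    d[i]     ^= s[i];
                    d[i + 1] ^= s[i + 1];
                    d[i + 2] ^= s[i + 2];
                    d[i + 3] ^= s[i + 3];
            }
    }

    struct candidate {
            const char *name;
            void (*fn)(unsigned long *, const unsigned long *, int);
            long count;
    };

    int main(void)
    {
            static unsigned long dst[WORDS], src[WORDS];
            struct candidate c[] = {
                    { "basic",    xor_basic,    0 },
                    { "unrolled", xor_unrolled, 0 },
            };
            int i, best = 0, n = sizeof(c) / sizeof(c[0]);

            for (i = 0; i < n; i++) {
                    clock_t end = clock() + CLOCKS_PER_SEC / 10;
                    long count = 0;

                    while (clock() < end) {         /* fixed window */
                            c[i].fn(dst, src, WORDS);
                            count++;
                    }
                    c[i].count = count;
                    if (count > c[best].count)
                            best = i;
            }
            printf("using fastest function: %s\n", c[best].name);
            return 0;
    }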
+diff -uNr linux-orig/include/linux/blkdev.h linux/include/linux/blkdev.h
+--- linux-orig/include/linux/blkdev.h Fri Dec 8 22:53:55 2000
++++ linux/include/linux/blkdev.h Fri Dec 8 22:54:52 2000
+@@ -91,8 +91,9 @@
+ extern void make_request(int major,int rw, struct buffer_head * bh);
+
+ /* md needs this function to remap requests */
+-extern int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size);
+-extern int md_make_request (int minor, int rw, struct buffer_head * bh);
++extern int md_map (kdev_t dev, kdev_t *rdev,
++ unsigned long *rsector, unsigned long size);
++extern int md_make_request (struct buffer_head * bh, int rw);
+ extern int md_error (kdev_t mddev, kdev_t rdev);
+
+ extern int * blk_size[MAX_BLKDEV];
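The prototype change above moves md_map() from a minor-number interface
to a kdev_t one; the contract is unchanged: the caller passes a device
and sector on the md device, and md_map() rewrites both in place to
name the underlying real disk. A hedged sketch of a caller follows; the
wrapper name and the sectors-based size unit are assumptions.

    /* Hypothetical caller illustrating the md_map() contract. */
    static int remap_if_md(kdev_t dev, unsigned long sector,
                           unsigned long size,
                           kdev_t *rdev, unsigned long *rsector)
    {
            *rdev = dev;            /* default: identity mapping */
            *rsector = sector;

            if (MAJOR(dev) != MD_MAJOR)
                    return 0;       /* not an md device, nothing to do */

            return md_map(dev, rdev, rsector, size);
    }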
+diff -uNr linux-orig/include/linux/fs.h linux/include/linux/fs.h
+--- linux-orig/include/linux/fs.h Fri Dec 8 22:53:55 2000
++++ linux/include/linux/fs.h Fri Dec 8 22:54:52 2000
+@@ -185,6 +185,7 @@
+ #define BH_Lock 2 /* 1 if the buffer is locked */
+ #define BH_Req 3 /* 0 if the buffer has been invalidated */
+ #define BH_Protected 6 /* 1 if the buffer is protected */
++#define BH_LowPrio 7 /* 1 if the buffer is lowprio */
+
+ /*
+ * Try to keep the most commonly used fields in single cache lines (16
+@@ -778,6 +779,7 @@
+ extern void refile_buffer(struct buffer_head * buf);
+ extern void set_writetime(struct buffer_head * buf, int flag);
+ extern int try_to_free_buffers(struct page *, int wait);
++extern void cache_drop_behind(struct buffer_head *bh);
+
+ extern int nr_buffers;
+ extern long buffermem;
+@@ -798,6 +800,25 @@
+ }
+ }
+
++extern inline void mark_buffer_highprio(struct buffer_head * bh)
++{
++ clear_bit(BH_LowPrio, &bh->b_state);
++}
++
++extern inline void mark_buffer_lowprio(struct buffer_head * bh)
++{
++ /*
++ * dirty buffers cannot be marked lowprio.
++ */
++ if (!buffer_dirty(bh))
++ set_bit(BH_LowPrio, &bh->b_state);
++}
++
++static inline int buffer_lowprio(struct buffer_head * bh)
++{
++ return test_bit(BH_LowPrio, &bh->b_state);
++}
++
+ extern inline void mark_buffer_dirty(struct buffer_head * bh, int flag)
+ {
+ if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
+@@ -805,6 +826,23 @@
+ if (bh->b_list != BUF_DIRTY)
+ refile_buffer(bh);
+ }
++ /*
++ * if a buffer gets marked dirty then it has to lose
++ * its lowprio state.
++ */
++ mark_buffer_highprio(bh);
++}
++
++extern inline void mark_buffer_dirty_lowprio(struct buffer_head * bh)
++{
++ if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
++ if (bh->b_list != BUF_DIRTY)
++ refile_buffer(bh);
++ /*
++ * Mark it lowprio only if it was not dirty before!
++ */
++ set_bit(BH_LowPrio, &bh->b_state);
++ }
+ }
+
+ extern int check_disk_change(kdev_t dev);
+@@ -882,6 +920,7 @@
+ extern struct buffer_head * find_buffer(kdev_t dev, int block, int size);
+ extern void ll_rw_block(int, int, struct buffer_head * bh[]);
+ extern int is_read_only(kdev_t);
++extern int is_device_idle(kdev_t);
+ extern void __brelse(struct buffer_head *);
+ extern inline void brelse(struct buffer_head *buf)
+ {
+@@ -897,8 +936,12 @@
+ extern void set_blocksize(kdev_t dev, int size);
+ extern unsigned int get_hardblocksize(kdev_t dev);
+ extern struct buffer_head * bread(kdev_t dev, int block, int size);
++extern struct buffer_head * buffer_ready (kdev_t dev, int block, int size);
++extern void bread_ahead (kdev_t dev, int block, int size);
+ extern struct buffer_head * breada(kdev_t dev,int block, int size,
+ unsigned int pos, unsigned int filesize);
++extern struct buffer_head * breada_blocks(kdev_t dev,int block,
++ int size, int blocks);
+
+ extern int brw_page(int, struct page *, kdev_t, int [], int, int);
+
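The BH_LowPrio machinery above encodes one invariant: a buffer may be
low priority only while clean, or when it was dirtied by an explicitly
background write; any normal dirtying promotes it back to high
priority. A hedged usage sketch (the helper name and the
background-write framing are illustrative, not from the patch):

    /* Buffers dirtied here are tagged BH_LowPrio so the I/O layer can
     * favour foreground traffic. If the buffer was already dirty from a
     * normal write, the test_and_set inside mark_buffer_dirty_lowprio()
     * leaves it at high priority. */
    static void write_block_in_background(struct buffer_head *bh)
    {
            mark_buffer_dirty_lowprio(bh);
            ll_rw_block(WRITE, 1, &bh);     /* queue as low-priority I/O */
    }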
+diff -uNr linux-orig/include/linux/md.h linux/include/linux/md.h
+--- linux-orig/include/linux/md.h Fri May 8 00:17:13 1998
++++ linux/include/linux/md.h Fri Dec 8 22:54:52 2000
+@@ -1,300 +0,0 @@
+-/*
+- md.h : Multiple Devices driver for Linux
+- Copyright (C) 1994-96 Marc ZYNGIER
+- <zyngier@ufr-info-p7.ibp.fr> or
+- <maz@gloups.fdn.fr>
+-
+- This program is free software; you can redistribute it and/or modify
+- it under the terms of the GNU General Public License as published by
+- the Free Software Foundation; either version 2, or (at your option)
+- any later version.
+-
+- You should have received a copy of the GNU General Public License
+- (for example /usr/src/linux/COPYING); if not, write to the Free
+- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+-*/
+-
+-#ifndef _MD_H
+-#define _MD_H
+-
+-#include <linux/major.h>
+-#include <linux/ioctl.h>
+-#include <linux/types.h>
+-
+-/*
+- * Different major versions are not compatible.
+- * Different minor versions are only downward compatible.
+- * Different patchlevel versions are downward and upward compatible.
+- */
+-#define MD_MAJOR_VERSION 0
+-#define MD_MINOR_VERSION 36
+-#define MD_PATCHLEVEL_VERSION 6
+-
+-#define MD_DEFAULT_DISK_READAHEAD (256 * 1024)
+-
+-/* ioctls */
+-#define REGISTER_DEV _IO (MD_MAJOR, 1)
+-#define START_MD _IO (MD_MAJOR, 2)
+-#define STOP_MD _IO (MD_MAJOR, 3)
+-#define REGISTER_DEV_NEW _IO (MD_MAJOR, 4)
+-
+-/*
+- personalities :
+- Byte 0 : Chunk size factor
+- Byte 1 : Fault tolerance count for each physical device
+- ( 0 means no fault tolerance,
+- 0xFF means always tolerate faults), not used by now.
+- Byte 2 : Personality
+- Byte 3 : Reserved.
+- */
+-
+-#define FAULT_SHIFT 8
+-#define PERSONALITY_SHIFT 16
+-
+-#define FACTOR_MASK 0x000000FFUL
+-#define FAULT_MASK 0x0000FF00UL
+-#define PERSONALITY_MASK 0x00FF0000UL
+-
+-#define MD_RESERVED 0 /* Not used by now */
+-#define LINEAR (1UL << PERSONALITY_SHIFT)
+-#define STRIPED (2UL << PERSONALITY_SHIFT)
+-#define RAID0 STRIPED
+-#define RAID1 (3UL << PERSONALITY_SHIFT)
+-#define RAID5 (4UL << PERSONALITY_SHIFT)
+-#define MAX_PERSONALITY 5
+-
+-/*
+- * MD superblock.
+- *
+- * The MD superblock maintains some statistics on each MD configuration.
+- * Each real device in the MD set contains it near the end of the device.
+- * Some of the ideas are copied from the ext2fs implementation.
+- *
+- * We currently use 4096 bytes as follows:
+- *
+- * word offset function
+- *
+- * 0 - 31 Constant generic MD device information.
+- * 32 - 63 Generic state information.
+- * 64 - 127 Personality specific information.
+- * 128 - 511 12 32-words descriptors of the disks in the raid set.
+- * 512 - 911 Reserved.
+- * 912 - 1023 Disk specific descriptor.
+- */
+-
+-/*
+- * If x is the real device size in bytes, we return an apparent size of:
+- *
+- * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
+- *
+- * and place the 4kB superblock at offset y.
+- */
+-#define MD_RESERVED_BYTES (64 * 1024)
+-#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
+-#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
+-
+-#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
+-#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
+-
+-#define MD_SB_BYTES 4096
+-#define MD_SB_WORDS (MD_SB_BYTES / 4)
+-#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
+-#define MD_SB_SECTORS (MD_SB_BYTES / 512)
+-
+-/*
+- * The following are counted in 32-bit words
+- */
+-#define MD_SB_GENERIC_OFFSET 0
+-#define MD_SB_PERSONALITY_OFFSET 64
+-#define MD_SB_DISKS_OFFSET 128
+-#define MD_SB_DESCRIPTOR_OFFSET 992
+-
+-#define MD_SB_GENERIC_CONSTANT_WORDS 32
+-#define MD_SB_GENERIC_STATE_WORDS 32
+-#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
+-#define MD_SB_PERSONALITY_WORDS 64
+-#define MD_SB_DISKS_WORDS 384
+-#define MD_SB_DESCRIPTOR_WORDS 32
+-#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
+-#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
+-#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
+-
+-/*
+- * Device "operational" state bits
+- */
+-#define MD_FAULTY_DEVICE 0 /* Device is faulty / operational */
+-#define MD_ACTIVE_DEVICE 1 /* Device is a part or the raid set / spare disk */
+-#define MD_SYNC_DEVICE 2 /* Device is in sync with the raid set */
+-
+-typedef struct md_device_descriptor_s {
+- __u32 number; /* 0 Device number in the entire set */
+- __u32 major; /* 1 Device major number */
+- __u32 minor; /* 2 Device minor number */
+- __u32 raid_disk; /* 3 The role of the device in the raid set */
+- __u32 state; /* 4 Operational state */
+- __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
+-} md_descriptor_t;
+-
+-#define MD_SB_MAGIC 0xa92b4efc
+-
+-/*
+- * Superblock state bits
+- */
+-#define MD_SB_CLEAN 0
+-#define MD_SB_ERRORS 1
+-
+-typedef struct md_superblock_s {
+-
+- /*
+- * Constant generic information
+- */
+- __u32 md_magic; /* 0 MD identifier */
+- __u32 major_version; /* 1 major version to which the set conforms */
+- __u32 minor_version; /* 2 minor version to which the set conforms */
+- __u32 patch_version; /* 3 patchlevel version to which the set conforms */
+- __u32 gvalid_words; /* 4 Number of non-reserved words in this section */
+- __u32 set_magic; /* 5 Raid set identifier */
+- __u32 ctime; /* 6 Creation time */
+- __u32 level; /* 7 Raid personality (mirroring, raid5, ...) */
+- __u32 size; /* 8 Apparent size of each individual disk, in kB */
+- __u32 nr_disks; /* 9 Number of total disks in the raid set */
+- __u32 raid_disks; /* 10 Number of disks in a fully functional raid set */
+- __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 11];
+-
+- /*
+- * Generic state information
+- */
+- __u32 utime; /* 0 Superblock update time */
+- __u32 state; /* 1 State bits (clean, ...) */
+- __u32 active_disks; /* 2 Number of currently active disks (some non-faulty disks might not be in sync) */
+- __u32 working_disks; /* 3 Number of working disks */
+- __u32 failed_disks; /* 4 Number of failed disks */
+- __u32 spare_disks; /* 5 Number of spare disks */
+- __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 6];
+-
+- /*
+- * Personality information
+- */
+- __u32 parity_algorithm;
+- __u32 chunk_size;
+- __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 2];
+-
+- /*
+- * Disks information
+- */
+- md_descriptor_t disks[MD_SB_DISKS];
+-
+- /*
+- * Reserved
+- */
+- __u32 reserved[MD_SB_RESERVED_WORDS];
+-
+- /*
+- * Active descriptor
+- */
+- md_descriptor_t descriptor;
+-} md_superblock_t;
+-
+-#ifdef __KERNEL__
+-
+-#include <linux/mm.h>
+-#include <linux/fs.h>
+-#include <linux/blkdev.h>
+-#include <asm/semaphore.h>
+-
+-/*
+- * Kernel-based reconstruction is mostly working, but still requires
+- * some additional work.
+- */
+-#define SUPPORT_RECONSTRUCTION 0
+-
+-#define MAX_REAL 8 /* Max number of physical dev per md dev */
+-#define MAX_MD_DEV 4 /* Max number of md dev */
+-
+-#define FACTOR(a) ((a)->repartition & FACTOR_MASK)
+-#define MAX_FAULT(a) (((a)->repartition & FAULT_MASK)>>8)
+-#define PERSONALITY(a) ((a)->repartition & PERSONALITY_MASK)
+-
+-#define FACTOR_SHIFT(a) (PAGE_SHIFT + (a) - 10)
+-
+-struct real_dev
+-{
+- kdev_t dev; /* Device number */
+- int size; /* Device size (in blocks) */
+- int offset; /* Real device offset (in blocks) in md dev
+- (only used in linear mode) */
+- struct inode *inode; /* Lock inode */
+- md_superblock_t *sb;
+- u32 sb_offset;
+-};
+-
+-struct md_dev;
+-
+-#define SPARE_INACTIVE 0
+-#define SPARE_WRITE 1
+-#define SPARE_ACTIVE 2
+-
+-struct md_personality
+-{
+- char *name;
+- int (*map)(struct md_dev *mddev, kdev_t *rdev,
+- unsigned long *rsector, unsigned long size);
+- int (*make_request)(struct md_dev *mddev, int rw, struct buffer_head * bh);
+- void (*end_request)(struct buffer_head * bh, int uptodate);
+- int (*run)(int minor, struct md_dev *mddev);
+- int (*stop)(int minor, struct md_dev *mddev);
+- int (*status)(char *page, int minor, struct md_dev *mddev);
+- int (*ioctl)(struct inode *inode, struct file *file,
+- unsigned int cmd, unsigned long arg);
+- int max_invalid_dev;
+- int (*error_handler)(struct md_dev *mddev, kdev_t dev);
+-
+-/*
+- * Some personalities (RAID-1, RAID-5) can get disks hot-added and
+- * hot-removed. Hot removal is different from failure. (failure marks
+- * a disk inactive, but the disk is still part of the array)
+- */
+- int (*hot_add_disk) (struct md_dev *mddev, kdev_t dev);
+- int (*hot_remove_disk) (struct md_dev *mddev, kdev_t dev);
+- int (*mark_spare) (struct md_dev *mddev, md_descriptor_t *descriptor, int state);
+-};
+-
+-struct md_dev
+-{
+- struct real_dev devices[MAX_REAL];
+- struct md_personality *pers;
+- md_superblock_t *sb;
+- int sb_dirty;
+- int repartition;
+- int busy;
+- int nb_dev;
+- void *private;
+-};
+-
+-struct md_thread {
+- void (*run) (void *data);
+- void *data;
+- struct wait_queue *wqueue;
+- unsigned long flags;
+- struct semaphore *sem;
+- struct task_struct *tsk;
+-};
+-
+-#define THREAD_WAKEUP 0
+-
+-extern struct md_dev md_dev[MAX_MD_DEV];
+-extern int md_size[MAX_MD_DEV];
+-extern int md_maxreadahead[MAX_MD_DEV];
+-
+-extern char *partition_name (kdev_t dev);
+-
+-extern int register_md_personality (int p_num, struct md_personality *p);
+-extern int unregister_md_personality (int p_num);
+-extern struct md_thread *md_register_thread (void (*run) (void *data), void *data);
+-extern void md_unregister_thread (struct md_thread *thread);
+-extern void md_wakeup_thread(struct md_thread *thread);
+-extern int md_update_sb (int minor);
+-extern int md_do_sync(struct md_dev *mddev);
+-
+-#endif __KERNEL__
+-#endif _MD_H
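The superblock placement documented in the removed header deserves a
worked example, since the replacement on-disk format keeps the same
64K-aligned scheme. With 128 reserved sectors, a hypothetical
10000-sector component device works out as follows:

    /* Placement macros from the header removed above; the
     * 10000-sector device size is hypothetical. */
    #define MD_RESERVED_BYTES       (64 * 1024)
    #define MD_RESERVED_SECTORS     (MD_RESERVED_BYTES / 512)   /* 128 */
    #define MD_NEW_SIZE_SECTORS(x) \
            (((x) & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)

    /*
     * MD_NEW_SIZE_SECTORS(10000) == (10000 & ~127) - 128
     *                            == 9984 - 128
     *                            == 9856
     *
     * so the 4kB superblock lands at sector 9856 and the array sees
     * an apparent component size of 9856 sectors.
     */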
+diff -uNr linux-orig/include/linux/raid/hsm.h linux/include/linux/raid/hsm.h
+--- linux-orig/include/linux/raid/hsm.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/hsm.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,65 @@
++#ifndef _HSM_H
++#define _HSM_H
++
++#include <linux/raid/md.h>
++
++#if __alpha__
++#error fix cpu_addr on Alpha first
++#endif
++
++#include <linux/raid/hsm_p.h>
++
++#define index_pv(lv,index) ((lv)->vg->pv_array+(index)->data.phys_nr)
++#define index_dev(lv,index) index_pv((lv),(index))->dev
++#define index_block(lv,index) (index)->data.phys_block
++#define index_child(index) ((lv_lptr_t *)((index)->cpu_addr))
++
++#define ptr_to_cpuaddr(ptr) ((__u32) (ptr))
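++/*
++ * note: 'cpu_addr' in an lv_lptr_t caches the in-core address of the
++ * child index block in a __u32 (hence the Alpha #error above: the
++ * scheme assumes kernel pointers fit in 32 bits), and index_child()
++ * casts it back to an lv_lptr_t pointer.
++ */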
++
++
++typedef struct pv_bg_desc_s {
++ unsigned int free_blocks;
++ pv_block_group_t *bg;
++} pv_bg_desc_t;
++
++typedef struct pv_s pv_t;
++typedef struct vg_s vg_t;
++typedef struct lv_s lv_t;
++
++struct pv_s
++{
++ int phys_nr;
++ kdev_t dev;
++ pv_sb_t *pv_sb;
++ pv_bg_desc_t *bg_array;
++};
++
++struct lv_s
++{
++ int log_id;
++ vg_t *vg;
++
++ unsigned int max_indices;
++ unsigned int free_indices;
++ lv_lptr_t root_index;
++
++ kdev_t dev;
++};
++
++struct vg_s
++{
++ int nr_pv;
++ pv_t pv_array [MD_SB_DISKS];
++
++ int nr_lv;
++ lv_t lv_array [HSM_MAX_LVS_PER_VG];
++
++ vg_sb_t *vg_sb;
++ mddev_t *mddev;
++};
++
++#define kdev_to_lv(dev) ((lv_t *) mddev_map[MINOR(dev)].data)
++#define mddev_to_vg(mddev) ((vg_t *) mddev->private)
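++/*
++ * each LV appears as its own md minor: kdev_to_lv() goes from the
++ * device number of an incoming request to the LV via the per-minor
++ * mddev_map 'data' slot, while mddev_to_vg() recovers the volume
++ * group from the array's opaque 'private' pointer.
++ */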
++
++#endif
++
+diff -uNr linux-orig/include/linux/raid/hsm_p.h linux/include/linux/raid/hsm_p.h
+--- linux-orig/include/linux/raid/hsm_p.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/hsm_p.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,237 @@
++#ifndef _HSM_P_H
++#define _HSM_P_H
++
++#define HSM_BLOCKSIZE 4096
++#define HSM_BLOCKSIZE_WORDS (HSM_BLOCKSIZE/4)
++#define PACKED __attribute__ ((packed))
++
++/*
++ * Identifies a block in physical space
++ */
++typedef struct phys_idx_s {
++ __u16 phys_nr;
++ __u32 phys_block;
++
++} PACKED phys_idx_t;
++
++/*
++ * Identifies a block in logical space
++ */
++typedef struct log_idx_s {
++ __u16 log_id;
++ __u32 log_index;
++
++} PACKED log_idx_t;
++
++/*
++ * Describes one PV
++ */
++#define HSM_PV_SB_MAGIC 0xf091ae9fU
++
++#define HSM_PV_SB_GENERIC_WORDS 32
++#define HSM_PV_SB_RESERVED_WORDS \
++ (HSM_BLOCKSIZE_WORDS - HSM_PV_SB_GENERIC_WORDS)
++
++/*
++ * On-disk PV identification data, on block 0 in any PV.
++ */
++typedef struct pv_sb_s
++{
++ __u32 pv_magic; /* 0 */
++
++ __u32 pv_uuid0; /* 1 */
++ __u32 pv_uuid1; /* 2 */
++ __u32 pv_uuid2; /* 3 */
++ __u32 pv_uuid3; /* 4 */
++
++ __u32 pv_major; /* 5 */
++ __u32 pv_minor; /* 6 */
++ __u32 pv_patch; /* 7 */
++
++ __u32 pv_ctime; /* 8 Creation time */
++
++ __u32 pv_total_size; /* 9 size of this PV, in blocks */
++ __u32 pv_first_free; /* 10 first free block */
++ __u32 pv_first_used; /* 11 first used block */
++ __u32 pv_blocks_left; /* 12 unallocated blocks */
++ __u32 pv_bg_size; /* 13 size of a block group, in blocks */
++ __u32 pv_block_size; /* 14 size of blocks, in bytes */
++ __u32 pv_pptr_size; /* 15 size of block descriptor, in bytes */
++ __u32 pv_block_groups; /* 16 number of block groups */
++
++ __u32 __reserved1[HSM_PV_SB_GENERIC_WORDS - 17];
++
++ /*
++ * Reserved
++ */
++ __u32 __reserved2[HSM_PV_SB_RESERVED_WORDS];
++
++} PACKED pv_sb_t;
++
++/*
++ * this is pretty much arbitrary, but has to be less than ~64
++ */
++#define HSM_MAX_LVS_PER_VG 32
++
++#define HSM_VG_SB_GENERIC_WORDS 32
++
++#define LV_DESCRIPTOR_WORDS 8
++#define HSM_VG_SB_RESERVED_WORDS (HSM_BLOCKSIZE_WORDS - \
++ LV_DESCRIPTOR_WORDS*HSM_MAX_LVS_PER_VG - HSM_VG_SB_GENERIC_WORDS)
++
++#if (HSM_VG_SB_RESERVED_WORDS < 0)
++#error you messed this one up dude ...
++#endif
++
++typedef struct lv_descriptor_s
++{
++ __u32 lv_id; /* 0 */
++ phys_idx_t lv_root_idx; /* 1 */
++ __u16 __reserved; /* 2 */
++ __u32 lv_max_indices; /* 3 */
++ __u32 lv_free_indices; /* 4 */
++ __u32 md_id; /* 5 */
++
++ __u32 reserved[LV_DESCRIPTOR_WORDS - 6];
++
++} PACKED lv_descriptor_t;
++
++#define HSM_VG_SB_MAGIC 0x98320d7aU
++/*
++ * On-disk VG identification data, in block 1 on all PVs
++ */
++typedef struct vg_sb_s
++{
++ __u32 vg_magic; /* 0 */
++ __u32 nr_lvs; /* 1 */
++
++ __u32 __reserved1[HSM_VG_SB_GENERIC_WORDS - 2];
++
++ lv_descriptor_t lv_array [HSM_MAX_LVS_PER_VG];
++ /*
++ * Reserved
++ */
++ __u32 __reserved2[HSM_VG_SB_RESERVED_WORDS];
++
++} PACKED vg_sb_t;
++
++/*
++ * Describes one LV
++ */
++
++#define HSM_LV_SB_MAGIC 0xe182bd8aU
++
++/* do we need lv_sb_t? */
++
++typedef struct lv_sb_s
++{
++ /*
++ * On-disk LV identifier
++ */
++ __u32 lv_magic; /* 0 LV identifier */
++ __u32 lv_uuid0; /* 1 */
++ __u32 lv_uuid1; /* 2 */
++ __u32 lv_uuid2; /* 3 */
++ __u32 lv_uuid3; /* 4 */
++
++ __u32 lv_major; /* 5 PV identifier */
++ __u32 lv_minor; /* 6 PV identifier */
++ __u32 lv_patch; /* 7 PV identifier */
++
++ __u32 ctime; /* 8 Creation time */
++ __u32 size; /* 9 size of this LV, in blocks */
++ phys_idx_t start; /* 10 position of root index block */
++ log_idx_t first_free; /* 11-12 first free index */
++
++ /*
++ * Reserved
++ */
++ __u32 reserved[HSM_BLOCKSIZE_WORDS-13];
++
++} PACKED lv_sb_t;
++
++/*
++ * Pointer from the physical space, points to
++ * the LV owning this block. It also contains various
++ * statistics about the physical block.
++ */
++typedef struct pv_pptr_s
++{
++ union {
++ /* case 1 */
++ struct {
++ log_idx_t owner;
++ log_idx_t predicted;
++ __u32 last_referenced;
++ } used;
++ /* case 2 */
++ struct {
++ __u16 log_id;
++ __u16 __unused1;
++ __u32 next_free;
++ __u32 __unused2;
++ __u32 __unused3;
++ } free;
++ } u;
++} PACKED pv_pptr_t;
++
++static __inline__ int pv_pptr_free (const pv_pptr_t * pptr)
++{
++ return !pptr->u.free.log_id;
++}
++
++
++#define DATA_BLOCKS_PER_BG ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1))
++
++#define TOTAL_BLOCKS_PER_BG (DATA_BLOCKS_PER_BG+1)
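++/*
++ * the constants above balance the bitmap against the pointer table:
++ * every data block costs one pv_pptr_t (8*sizeof() bits) plus one
++ * bitmap bit, and one descriptor block holds HSM_BLOCKSIZE*8 bits in
++ * total. with 4096 byte blocks and 16 byte pointers this works out
++ * to 32768/129 == 254 data blocks per group; a 32 byte bitmap plus
++ * 254*16 == 4064 bytes of pointers then fill the descriptor block
++ * exactly.
++ */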
++/*
++ * A table of pointers filling up a single block, managing
++ * the next DATA_BLOCKS_PER_BG physical blocks. Such block
++ * groups form the physical space of blocks.
++ */
++typedef struct pv_block_group_s
++{
++ __u8 used_bitmap[(DATA_BLOCKS_PER_BG+7)/8];
++
++ pv_pptr_t blocks[DATA_BLOCKS_PER_BG];
++
++} PACKED pv_block_group_t;
++
++/*
++ * Pointer from the logical space, points to
++ * the (PV,block) containing this logical block
++ */
++typedef struct lv_lptr_s
++{
++ phys_idx_t data;
++ __u16 __reserved;
++ __u32 cpu_addr;
++ __u32 __reserved2;
++
++} PACKED lv_lptr_t;
++
++static __inline__ int index_free (const lv_lptr_t * index)
++{
++ return !index->data.phys_block;
++}
++
++static __inline__ int index_present (const lv_lptr_t * index)
++{
++ return index->cpu_addr;
++}
++
++
++#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t))
++/*
++ * A table of pointers filling up a single block, managing
++ * HSM_LPTRS_PER_BLOCK logical blocks. Such block groups form
++ * the logical space of blocks.
++ */
++typedef struct lv_index_block_s
++{
++ lv_lptr_t blocks[HSM_LPTRS_PER_BLOCK];
++
++} PACKED lv_index_block_t;
++
++#endif
++
+diff -uNr linux-orig/include/linux/raid/linear.h linux/include/linux/raid/linear.h
+--- linux-orig/include/linux/raid/linear.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/linear.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,32 @@
++#ifndef _LINEAR_H
++#define _LINEAR_H
++
++#include <linux/raid/md.h>
++
++struct dev_info {
++ kdev_t dev;
++ int size;
++ unsigned int offset;
++};
++
++typedef struct dev_info dev_info_t;
++
++struct linear_hash
++{
++ dev_info_t *dev0, *dev1;
++};
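++/*
++ * (sketch of the idea: the hash table is indexed by a coarse offset,
++ * in units of the smallest member's size, so that each entry covers
++ * at most two devices; map() can then pick 'dev0' or 'dev1' directly
++ * instead of walking all disks. 'dev1' stays NULL when 'dev0' alone
++ * covers the span.)
++ */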
++
++struct linear_private_data
++{
++ struct linear_hash *hash_table;
++ dev_info_t disks[MD_SB_DISKS];
++ dev_info_t *smallest;
++ int nr_zones;
++};
++
++
++typedef struct linear_private_data linear_conf_t;
++
++#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
++
++#endif
+diff -uNr linux-orig/include/linux/raid/md.h linux/include/linux/raid/md.h
+--- linux-orig/include/linux/raid/md.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/md.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,96 @@
++/*
++ md.h : Multiple Devices driver for Linux
++ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
++ Copyright (C) 1994-96 Marc ZYNGIER
++ <zyngier@ufr-info-p7.ibp.fr> or
++ <maz@gloups.fdn.fr>
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2, or (at your option)
++ any later version.
++
++ You should have received a copy of the GNU General Public License
++ (for example /usr/src/linux/COPYING); if not, write to the Free
++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++#ifndef _MD_H
++#define _MD_H
++
++#include <linux/mm.h>
++#include <linux/fs.h>
++#include <linux/blkdev.h>
++#include <asm/semaphore.h>
++#include <linux/major.h>
++#include <linux/ioctl.h>
++#include <linux/types.h>
++#include <asm/bitops.h>
++#include <linux/module.h>
++#include <linux/hdreg.h>
++#include <linux/sysctl.h>
++#include <linux/proc_fs.h>
++#include <linux/smp_lock.h>
++#include <linux/delay.h>
++#include <net/checksum.h>
++#include <linux/random.h>
++#include <linux/locks.h>
++#include <asm/io.h>
++
++#include <linux/raid/md_compatible.h>
++/*
++ * 'md_p.h' holds the 'physical' layout of RAID devices
++ * 'md_u.h' holds the user <=> kernel API
++ *
++ * 'md_k.h' holds kernel internal definitions
++ */
++
++#include <linux/raid/md_p.h>
++#include <linux/raid/md_u.h>
++#include <linux/raid/md_k.h>
++
++/*
++ * Different major versions are not compatible.
++ * Different minor versions are only downward compatible.
++ * Different patchlevel versions are downward and upward compatible.
++ */
++#define MD_MAJOR_VERSION 0
++#define MD_MINOR_VERSION 90
++#define MD_PATCHLEVEL_VERSION 0
++
++extern int md_size[MAX_MD_DEVS];
++extern struct hd_struct md_hd_struct[MAX_MD_DEVS];
++
++extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
++extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev);
++extern char * partition_name (kdev_t dev);
++extern int register_md_personality (int p_num, mdk_personality_t *p);
++extern int unregister_md_personality (int p_num);
++extern mdk_thread_t * md_register_thread (void (*run) (void *data),
++ void *data, const char *name);
++extern void md_unregister_thread (mdk_thread_t *thread);
++extern void md_wakeup_thread(mdk_thread_t *thread);
++extern void md_interrupt_thread (mdk_thread_t *thread);
++extern int md_update_sb (mddev_t *mddev);
++extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
++extern void md_recover_arrays (void);
++extern int md_check_ordering (mddev_t *mddev);
++extern void autodetect_raid(void);
++extern struct gendisk * find_gendisk (kdev_t dev);
++extern int md_notify_reboot(struct notifier_block *this,
++ unsigned long code, void *x);
++#if CONFIG_BLK_DEV_MD
++extern void raid_setup(char *str,int *ints) md__init;
++#endif
++#ifdef CONFIG_MD_BOOT
++extern void md_setup(char *str,int *ints) md__init;
++#endif
++
++extern void md_print_devices (void);
++
++#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
++
++#endif /* _MD_H */
++
+diff -uNr linux-orig/include/linux/raid/md_compatible.h linux/include/linux/raid/md_compatible.h
+--- linux-orig/include/linux/raid/md_compatible.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/md_compatible.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,387 @@
++
++/*
++ md.h : Multiple Devices driver compatibility layer for Linux 2.0/2.2
++ Copyright (C) 1998 Ingo Molnar
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2, or (at your option)
++ any later version.
++
++ You should have received a copy of the GNU General Public License
++ (for example /usr/src/linux/COPYING); if not, write to the Free
++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++#include <linux/version.h>
++
++#ifndef _MD_COMPATIBLE_H
++#define _MD_COMPATIBLE_H
++
++#define LinuxVersionCode(v, p, s) (((v)<<16)+((p)<<8)+(s))
++
++#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
++
++/* 000 */
++#define md__get_free_pages(x,y) __get_free_pages(x,y,GFP_KERNEL)
++
++#ifdef __i386__
++/* 001 */
++extern __inline__ int md_cpu_has_mmx(void)
++{
++ return x86_capability & 0x00800000;
++}
++#endif
++
++/* 002 */
++#define md_clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
++
++/* 003 */
++/*
++ * someone please suggest a sane compatibility layer for modules
++ */
++#define MD_EXPORT_SYMBOL(x)
++
++/* 004 */
++static inline unsigned long
++md_copy_from_user(void *to, const void *from, unsigned long n)
++{
++ int err;
++
++ err = verify_area(VERIFY_READ,from,n);
++ if (!err)
++ memcpy_fromfs(to, from, n);
++ return err;
++}
++
++/* 005 */
++extern inline unsigned long
++md_copy_to_user(void *to, const void *from, unsigned long n)
++{
++ int err;
++
++ err = verify_area(VERIFY_WRITE,to,n);
++ if (!err)
++ memcpy_tofs(to, from, n);
++ return err;
++}
++
++/* 006 */
++#define md_put_user(x,ptr) \
++({ \
++ int __err; \
++ \
++ __err = verify_area(VERIFY_WRITE,ptr,sizeof(*ptr)); \
++ if (!__err) \
++ put_user(x,ptr); \
++ __err; \
++})
++
++/* 007 */
++extern inline int md_capable_admin(void)
++{
++ return suser();
++}
++
++/* 008 */
++#define MD_FILE_TO_INODE(file) ((file)->f_inode)
++
++/* 009 */
++extern inline void md_flush_signals (void)
++{
++ current->signal = 0;
++}
++
++/* 010 */
++#define __S(nr) (1<<((nr)-1))
++extern inline void md_init_signals (void)
++{
++ current->exit_signal = SIGCHLD;
++ current->blocked = ~(__S(SIGKILL));
++}
++#undef __S
++
++/* 011 */
++extern inline unsigned long md_signal_pending (struct task_struct * tsk)
++{
++ return (tsk->signal & ~tsk->blocked);
++}
++
++/* 012 */
++#define md_set_global_readahead(x) read_ahead[MD_MAJOR] = MD_READAHEAD
++
++/* 013 */
++#define md_mdelay(n) (\
++ {unsigned long msec=(n); while (msec--) udelay(1000);})
++
++/* 014 */
++#define MD_SYS_DOWN 0
++#define MD_SYS_HALT 0
++#define MD_SYS_POWER_OFF 0
++
++/* 015 */
++#define md_register_reboot_notifier(x)
++
++/* 016 */
++extern __inline__ unsigned long
++md_test_and_set_bit(int nr, void * addr)
++{
++ unsigned long flags;
++ unsigned long oldbit;
++
++ save_flags(flags);
++ cli();
++ oldbit = test_bit(nr,addr);
++ set_bit(nr,addr);
++ restore_flags(flags);
++ return oldbit;
++}
++
++/* 017 */
++extern __inline__ unsigned long
++md_test_and_clear_bit(int nr, void * addr)
++{
++ unsigned long flags;
++ unsigned long oldbit;
++
++ save_flags(flags);
++ cli();
++ oldbit = test_bit(nr,addr);
++ clear_bit(nr,addr);
++ restore_flags(flags);
++ return oldbit;
++}
++
++/* 018 */
++#define md_atomic_read(x) (*(volatile int *)(x))
++#define md_atomic_set(x,y) (*(volatile int *)(x) = (y))
++
++/* 019 */
++extern __inline__ void md_lock_kernel (void)
++{
++#if __SMP__
++ lock_kernel();
++ syscall_count++;
++#endif
++}
++
++extern __inline__ void md_unlock_kernel (void)
++{
++#if __SMP__
++ syscall_count--;
++ unlock_kernel();
++#endif
++}
++/* 020 */
++
++#define md__init
++#define md__initdata
++#define md__initfunc(__arginit) __arginit
++
++/* 021 */
++
++/* 022 */
++
++struct md_list_head {
++ struct md_list_head *next, *prev;
++};
++
++#define MD_LIST_HEAD(name) \
++ struct md_list_head name = { &name, &name }
++
++#define MD_INIT_LIST_HEAD(ptr) do { \
++ (ptr)->next = (ptr); (ptr)->prev = (ptr); \
++} while (0)
++
++static __inline__ void md__list_add(struct md_list_head * new,
++ struct md_list_head * prev,
++ struct md_list_head * next)
++{
++ next->prev = new;
++ new->next = next;
++ new->prev = prev;
++ prev->next = new;
++}
++
++static __inline__ void md_list_add(struct md_list_head *new,
++ struct md_list_head *head)
++{
++ md__list_add(new, head, head->next);
++}
++
++static __inline__ void md__list_del(struct md_list_head * prev,
++ struct md_list_head * next)
++{
++ next->prev = prev;
++ prev->next = next;
++}
++
++static __inline__ void md_list_del(struct md_list_head *entry)
++{
++ md__list_del(entry->prev, entry->next);
++}
++
++static __inline__ int md_list_empty(struct md_list_head *head)
++{
++ return head->next == head;
++}
++
++#define md_list_entry(ptr, type, member) \
++ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
++
++/* 023 */
++
++static __inline__ signed long md_schedule_timeout(signed long timeout)
++{
++ current->timeout = jiffies + timeout;
++ schedule();
++ return 0;
++}
++
++/* 024 */
++#define md_need_resched(tsk) (need_resched)
++
++/* 025 */
++typedef struct { int gcc_is_buggy; } md_spinlock_t;
++#define MD_SPIN_LOCK_UNLOCKED (md_spinlock_t) { 0 }
++
++#define md_spin_lock_irq cli
++#define md_spin_unlock_irq sti
++#define md_spin_unlock_irqrestore(x,flags) restore_flags(flags)
++#define md_spin_lock_irqsave(x,flags) do { save_flags(flags); cli(); } while (0)
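++/*
++ * i.e. on 2.0 the "spinlock" degenerates into global interrupt
++ * disabling; a sketch of what the macros above expand to:
++ *
++ *	md_spin_lock_irqsave(&lock, flags);
++ *		=> do { save_flags(flags); cli(); } while (0)
++ *
++ * the lock argument itself is ignored (UP semantics only).
++ */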
++
++/* END */
++
++#else
++
++#include <linux/reboot.h>
++#include <linux/vmalloc.h>
++
++/* 000 */
++#define md__get_free_pages(x,y) __get_free_pages(x,y)
++
++#ifdef __i386__
++/* 001 */
++extern __inline__ int md_cpu_has_mmx(void)
++{
++ return boot_cpu_data.x86_capability & X86_FEATURE_MMX;
++}
++#endif
++
++/* 002 */
++#define md_clear_page(page) clear_page(page)
++
++/* 003 */
++#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x)
++
++/* 004 */
++#define md_copy_to_user(x,y,z) copy_to_user(x,y,z)
++
++/* 005 */
++#define md_copy_from_user(x,y,z) copy_from_user(x,y,z)
++
++/* 006 */
++#define md_put_user put_user
++
++/* 007 */
++extern inline int md_capable_admin(void)
++{
++ return capable(CAP_SYS_ADMIN);
++}
++
++/* 008 */
++#define MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode)
++
++/* 009 */
++extern inline void md_flush_signals (void)
++{
++ spin_lock(&current->sigmask_lock);
++ flush_signals(current);
++ spin_unlock(&current->sigmask_lock);
++}
++
++/* 010 */
++extern inline void md_init_signals (void)
++{
++ current->exit_signal = SIGCHLD;
++ siginitsetinv(&current->blocked, sigmask(SIGKILL));
++}
++
++/* 011 */
++#define md_signal_pending signal_pending
++
++/* 012 */
++extern inline void md_set_global_readahead(int * table)
++{
++ max_readahead[MD_MAJOR] = table;
++}
++
++/* 013 */
++#define md_mdelay(x) mdelay(x)
++
++/* 014 */
++#define MD_SYS_DOWN SYS_DOWN
++#define MD_SYS_HALT SYS_HALT
++#define MD_SYS_POWER_OFF SYS_POWER_OFF
++
++/* 015 */
++#define md_register_reboot_notifier register_reboot_notifier
++
++/* 016 */
++#define md_test_and_set_bit test_and_set_bit
++
++/* 017 */
++#define md_test_and_clear_bit test_and_clear_bit
++
++/* 018 */
++#define md_atomic_read atomic_read
++#define md_atomic_set atomic_set
++
++/* 019 */
++#define md_lock_kernel lock_kernel
++#define md_unlock_kernel unlock_kernel
++
++/* 020 */
++
++#include <linux/init.h>
++
++#define md__init __init
++#define md__initdata __initdata
++#define md__initfunc(__arginit) __initfunc(__arginit)
++
++/* 021 */
++
++
++/* 022 */
++
++#define md_list_head list_head
++#define MD_LIST_HEAD(name) LIST_HEAD(name)
++#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr)
++#define md_list_add list_add
++#define md_list_del list_del
++#define md_list_empty list_empty
++
++#define md_list_entry(ptr, type, member) list_entry(ptr, type, member)
++
++/* 023 */
++
++#define md_schedule_timeout schedule_timeout
++
++/* 024 */
++#define md_need_resched(tsk) ((tsk)->need_resched)
++
++/* 025 */
++#define md_spinlock_t spinlock_t
++#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED
++
++#define md_spin_lock_irq spin_lock_irq
++#define md_spin_unlock_irq spin_unlock_irq
++#define md_spin_unlock_irqrestore spin_unlock_irqrestore
++#define md_spin_lock_irqsave spin_lock_irqsave
++
++/* END */
++
++#endif
++
++#endif /* _MD_COMPATIBLE_H */
++
+diff -uNr linux-orig/include/linux/raid/md_k.h linux/include/linux/raid/md_k.h
+--- linux-orig/include/linux/raid/md_k.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/md_k.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,338 @@
++/*
++ md_k.h : kernel internal structure of the Linux MD driver
++ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2, or (at your option)
++ any later version.
++
++ You should have received a copy of the GNU General Public License
++ (for example /usr/src/linux/COPYING); if not, write to the Free
++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++#ifndef _MD_K_H
++#define _MD_K_H
++
++#define MD_RESERVED 0UL
++#define LINEAR 1UL
++#define STRIPED 2UL
++#define RAID0 STRIPED
++#define RAID1 3UL
++#define RAID5 4UL
++#define TRANSLUCENT 5UL
++#define HSM 6UL
++#define MAX_PERSONALITY 7UL
++
++extern inline int pers_to_level (int pers)
++{
++ switch (pers) {
++ case HSM: return -3;
++ case TRANSLUCENT: return -2;
++ case LINEAR: return -1;
++ case RAID0: return 0;
++ case RAID1: return 1;
++ case RAID5: return 5;
++ }
++ panic("pers_to_level()");
++}
++
++extern inline int level_to_pers (int level)
++{
++ switch (level) {
++ case -3: return HSM;
++ case -2: return TRANSLUCENT;
++ case -1: return LINEAR;
++ case 0: return RAID0;
++ case 1: return RAID1;
++ case 4:
++ case 5: return RAID5;
++ }
++ return MD_RESERVED;
++}
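++/*
++ * e.g. the on-disk sb->level field stores these values, so a linear
++ * array has sb->level == -1 and level_to_pers(-1) == LINEAR, while
++ * pers_to_level(LINEAR) converts back when the superblock is written.
++ */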
++
++typedef struct mddev_s mddev_t;
++typedef struct mdk_rdev_s mdk_rdev_t;
++
++#if (MINORBITS != 8)
++#error MD does not handle bigger kdev yet
++#endif
++
++#define MAX_REAL 12 /* Max number of disks per md dev */
++#define MAX_MD_DEVS (1<<MINORBITS) /* Max number of md dev */
++
++/*
++ * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
++ * the personality. (e.g. HSM uses this to identify individual LVs)
++ */
++typedef struct dev_mapping_s {
++ mddev_t *mddev;
++ void *data;
++} dev_mapping_t;
++
++extern dev_mapping_t mddev_map [MAX_MD_DEVS];
++
++extern inline mddev_t * kdev_to_mddev (kdev_t dev)
++{
++ return mddev_map[MINOR(dev)].mddev;
++}
++
++/*
++ * options passed in raidrun:
++ */
++
++#define MAX_CHUNK_SIZE (4096*1024)
++
++/*
++ * default readahead
++ */
++#define MD_READAHEAD (256 * 512)
++
++extern inline int disk_faulty(mdp_disk_t * d)
++{
++ return d->state & (1 << MD_DISK_FAULTY);
++}
++
++extern inline int disk_active(mdp_disk_t * d)
++{
++ return d->state & (1 << MD_DISK_ACTIVE);
++}
++
++extern inline int disk_sync(mdp_disk_t * d)
++{
++ return d->state & (1 << MD_DISK_SYNC);
++}
++
++extern inline int disk_spare(mdp_disk_t * d)
++{
++ return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
++}
++
++extern inline int disk_removed(mdp_disk_t * d)
++{
++ return d->state & (1 << MD_DISK_REMOVED);
++}
++
++extern inline void mark_disk_faulty(mdp_disk_t * d)
++{
++ d->state |= (1 << MD_DISK_FAULTY);
++}
++
++extern inline void mark_disk_active(mdp_disk_t * d)
++{
++ d->state |= (1 << MD_DISK_ACTIVE);
++}
++
++extern inline void mark_disk_sync(mdp_disk_t * d)
++{
++ d->state |= (1 << MD_DISK_SYNC);
++}
++
++extern inline void mark_disk_spare(mdp_disk_t * d)
++{
++ d->state = 0;
++}
++
++extern inline void mark_disk_removed(mdp_disk_t * d)
++{
++ d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
++}
++
++extern inline void mark_disk_inactive(mdp_disk_t * d)
++{
++ d->state &= ~(1 << MD_DISK_ACTIVE);
++}
++
++extern inline void mark_disk_nonsync(mdp_disk_t * d)
++{
++ d->state &= ~(1 << MD_DISK_SYNC);
++}
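++/*
++ * e.g. a fresh spare has state == 0 (so disk_spare() above is true),
++ * a fully synced member has the ACTIVE and SYNC bits set, and a
++ * removed disk carries FAULTY|REMOVED, as written by the helpers
++ * above.
++ */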
++
++/*
++ * MD's 'extended' device
++ */
++struct mdk_rdev_s
++{
++ struct md_list_head same_set; /* RAID devices within the same set */
++ struct md_list_head all; /* all RAID devices */
++ struct md_list_head pending; /* undetected RAID devices */
++
++ kdev_t dev; /* Device number */
++ kdev_t old_dev; /* "" when it was last imported */
++ int size; /* Device size (in blocks) */
++ mddev_t *mddev; /* RAID array if running */
++ unsigned long last_events; /* IO event timestamp */
++
++ struct inode *inode; /* Lock inode */
++ struct file filp; /* Lock file */
++
++ mdp_super_t *sb;
++ int sb_offset;
++
++ int faulty; /* if faulty do not issue IO requests */
++ int desc_nr; /* descriptor index in the superblock */
++};
++
++
++/*
++ * disk operations in a working array:
++ */
++#define DISKOP_SPARE_INACTIVE 0
++#define DISKOP_SPARE_WRITE 1
++#define DISKOP_SPARE_ACTIVE 2
++#define DISKOP_HOT_REMOVE_DISK 3
++#define DISKOP_HOT_ADD_DISK 4
++
++typedef struct mdk_personality_s mdk_personality_t;
++
++struct mddev_s
++{
++ void *private;
++ mdk_personality_t *pers;
++ int __minor;
++ mdp_super_t *sb;
++ int nb_dev;
++ struct md_list_head disks;
++ int sb_dirty;
++ mdu_param_t param;
++ int ro;
++ unsigned int curr_resync;
++ unsigned long resync_start;
++ char *name;
++ int recovery_running;
++ struct semaphore reconfig_sem;
++ struct semaphore recovery_sem;
++ struct semaphore resync_sem;
++ struct md_list_head all_mddevs;
++};
++
++struct mdk_personality_s
++{
++ char *name;
++ int (*map)(mddev_t *mddev, kdev_t dev, kdev_t *rdev,
++ unsigned long *rsector, unsigned long size);
++ int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh);
++ void (*end_request)(struct buffer_head * bh, int uptodate);
++ int (*run)(mddev_t *mddev);
++ int (*stop)(mddev_t *mddev);
++ int (*status)(char *page, mddev_t *mddev);
++ int (*ioctl)(struct inode *inode, struct file *file,
++ unsigned int cmd, unsigned long arg);
++ int max_invalid_dev;
++ int (*error_handler)(mddev_t *mddev, kdev_t dev);
++
++/*
++ * Some personalities (RAID-1, RAID-5) can have disks hot-added and
++ * hot-removed. Hot removal is different from failure: failure marks
++ * a disk inactive, but the disk is still part of the array. The
++ * interface to such operations is the 'pers->diskop()' function,
++ * which can be NULL.
++ *
++ * the diskop function can change the pointer pointing to the incoming
++ * descriptor, but must do so very carefully. (currently only
++ * SPARE_ACTIVE expects such a change)
++ */
++ int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
++
++ int (*stop_resync)(mddev_t *mddev);
++ int (*restart_resync)(mddev_t *mddev);
++};
++
++
++/*
++ * Currently we index md_array directly, based on the minor
++ * number. This will have to change to dynamic allocation
++ * once we start supporting partitioning of md devices.
++ */
++extern inline int mdidx (mddev_t * mddev)
++{
++ return mddev->__minor;
++}
++
++extern inline kdev_t mddev_to_kdev(mddev_t * mddev)
++{
++ return MKDEV(MD_MAJOR, mdidx(mddev));
++}
++
++extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev);
++extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
++
++/*
++ * iterates through some rdev ringlist. It's safe to remove the
++ * current 'rdev'. Don't touch 'tmp' though.
++ */
++#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \
++ \
++ for (tmp = head.next; \
++ rdev = md_list_entry(tmp, mdk_rdev_t, field), \
++ tmp = tmp->next, tmp->prev != &head \
++ ; )
++/*
++ * iterates through the 'same array disks' ringlist
++ */
++#define ITERATE_RDEV(mddev,rdev,tmp) \
++ ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
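++/*
++ * usage sketch, e.g. printing every member of an array:
++ *
++ *	struct md_list_head *tmp;
++ *	mdk_rdev_t *rdev;
++ *
++ *	ITERATE_RDEV(mddev,rdev,tmp)
++ *		printk("<%s>", partition_name(rdev->dev));
++ */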
++
++/*
++ * Same as above, but assumes that rdev->desc_nr is numbered from 0 to
++ * mddev->nb_dev-1, and iterates through the rdevs in ascending order.
++ */
++#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \
++ for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
++
++
++/*
++ * Iterates through all 'RAID managed disks'
++ */
++#define ITERATE_RDEV_ALL(rdev,tmp) \
++ ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
++
++/*
++ * Iterates through 'pending RAID disks'
++ */
++#define ITERATE_RDEV_PENDING(rdev,tmp) \
++ ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
++
++/*
++ * iterates through all used mddevs in the system.
++ */
++#define ITERATE_MDDEV(mddev,tmp) \
++ \
++ for (tmp = all_mddevs.next; \
++ mddev = md_list_entry(tmp, mddev_t, all_mddevs), \
++ tmp = tmp->next, tmp->prev != &all_mddevs \
++ ; )
++
++extern inline int lock_mddev (mddev_t * mddev)
++{
++ return down_interruptible(&mddev->reconfig_sem);
++}
++
++extern inline void unlock_mddev (mddev_t * mddev)
++{
++ up(&mddev->reconfig_sem);
++}
++
++#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
++ x = y; y = __tmp; } while (0)
++
++typedef struct mdk_thread_s {
++ void (*run) (void *data);
++ void *data;
++ struct wait_queue *wqueue;
++ unsigned long flags;
++ struct semaphore *sem;
++ struct task_struct *tsk;
++ const char *name;
++} mdk_thread_t;
++
++#define THREAD_WAKEUP 0
++
++typedef struct dev_name_s {
++ struct md_list_head list;
++ kdev_t dev;
++ char name [MAX_DISKNAME_LEN];
++} dev_name_t;
++
++#endif /* _MD_K_H */
++
+diff -uNr linux-orig/include/linux/raid/md_p.h linux/include/linux/raid/md_p.h
+--- linux-orig/include/linux/raid/md_p.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/md_p.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,161 @@
++/*
++ md_p.h : physical layout of Linux RAID devices
++ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2, or (at your option)
++ any later version.
++
++ You should have received a copy of the GNU General Public License
++ (for example /usr/src/linux/COPYING); if not, write to the Free
++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++#ifndef _MD_P_H
++#define _MD_P_H
++
++/*
++ * RAID superblock.
++ *
++ * The RAID superblock maintains some statistics on each RAID configuration.
++ * Each real device in the RAID set contains it near the end of the device.
++ * Some of the ideas are copied from the ext2fs implementation.
++ *
++ * We currently use 4096 bytes as follows:
++ *
++ * word offset function
++ *
++ * 0 - 31 Constant generic RAID device information.
++ * 32 - 63 Generic state information.
++ * 64 - 127 Personality specific information.
++ * 128 - 511 12 32-word descriptors of the disks in the raid set.
++ * 512 - 991 Reserved.
++ * 992 - 1023 Disk specific descriptor.
++ */
++
++/*
++ * If x is the real device size in bytes, we return an apparent size of:
++ *
++ * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
++ *
++ * and place the 4kB superblock at offset y.
++ */
++#define MD_RESERVED_BYTES (64 * 1024)
++#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
++#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
++
++#define MD_NEW_SIZE_SECTORS(x) (((x) & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
++#define MD_NEW_SIZE_BLOCKS(x) (((x) & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
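++/*
++ * worked example (sizes picked for illustration only): for a device
++ * of 4120001 sectors, rounding down to a multiple of
++ * MD_RESERVED_SECTORS (128) gives 4119936, minus one reserved area:
++ *
++ *	MD_NEW_SIZE_SECTORS(4120001) == 4119808
++ *
++ * so the array uses sectors 0..4119807 for data and writes the 4kB
++ * superblock at sector 4119808 (MD_SB_SECTORS == 8 sectors long).
++ */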
++
++#define MD_SB_BYTES 4096
++#define MD_SB_WORDS (MD_SB_BYTES / 4)
++#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
++#define MD_SB_SECTORS (MD_SB_BYTES / 512)
++
++/*
++ * The following are counted in 32-bit words
++ */
++#define MD_SB_GENERIC_OFFSET 0
++#define MD_SB_PERSONALITY_OFFSET 64
++#define MD_SB_DISKS_OFFSET 128
++#define MD_SB_DESCRIPTOR_OFFSET 992
++
++#define MD_SB_GENERIC_CONSTANT_WORDS 32
++#define MD_SB_GENERIC_STATE_WORDS 32
++#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
++#define MD_SB_PERSONALITY_WORDS 64
++#define MD_SB_DISKS_WORDS 384
++#define MD_SB_DESCRIPTOR_WORDS 32
++#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
++#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
++#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
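++/*
++ * cross-check of the layout above: 64 generic + 64 personality +
++ * 384 disk descriptor + 480 reserved + 32 active descriptor words
++ * == 1024 words == 4096 bytes, and MD_SB_DISKS == 384/32 == 12
++ * descriptors.
++ */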
++
++/*
++ * Device "operational" state bits
++ */
++#define MD_DISK_FAULTY 0 /* disk is faulty / operational */
++#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */
++#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
++#define MD_DISK_REMOVED 3 /* disk has been removed from the raid set */
++
++typedef struct mdp_device_descriptor_s {
++ __u32 number; /* 0 Device number in the entire set */
++ __u32 major; /* 1 Device major number */
++ __u32 minor; /* 2 Device minor number */
++ __u32 raid_disk; /* 3 The role of the device in the raid set */
++ __u32 state; /* 4 Operational state */
++ __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
++} mdp_disk_t;
++
++#define MD_SB_MAGIC 0xa92b4efc
++
++/*
++ * Superblock state bits
++ */
++#define MD_SB_CLEAN 0
++#define MD_SB_ERRORS 1
++
++typedef struct mdp_superblock_s {
++ /*
++ * Constant generic information
++ */
++ __u32 md_magic; /* 0 MD identifier */
++ __u32 major_version; /* 1 major version to which the set conforms */
++ __u32 minor_version; /* 2 minor version ... */
++ __u32 patch_version; /* 3 patchlevel version ... */
++ __u32 gvalid_words; /* 4 Number of used words in this section */
++ __u32 set_uuid0; /* 5 Raid set identifier */
++ __u32 ctime; /* 6 Creation time */
++ __u32 level; /* 7 Raid personality */
++ __u32 size; /* 8 Apparent size of each individual disk */
++ __u32 nr_disks; /* 9 total disks in the raid set */
++ __u32 raid_disks; /* 10 disks in a fully functional raid set */
++ __u32 md_minor; /* 11 preferred MD minor device number */
++ __u32 not_persistent; /* 12 does it have a persistent superblock */
++ __u32 set_uuid1; /* 13 Raid set identifier #2 */
++ __u32 set_uuid2; /* 14 Raid set identifier #3 */
++ __u32 set_uuid3; /* 15 Raid set identifier #4 */
++ __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
++
++ /*
++ * Generic state information
++ */
++ __u32 utime; /* 0 Superblock update time */
++ __u32 state; /* 1 State bits (clean, ...) */
++ __u32 active_disks; /* 2 Number of currently active disks */
++ __u32 working_disks; /* 3 Number of working disks */
++ __u32 failed_disks; /* 4 Number of failed disks */
++ __u32 spare_disks; /* 5 Number of spare disks */
++ __u32 sb_csum; /* 6 checksum of the whole superblock */
++ __u64 events; /* 7 number of superblock updates (64-bit!) */
++ __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9];
++
++ /*
++ * Personality information
++ */
++ __u32 layout; /* 0 the array's physical layout */
++ __u32 chunk_size; /* 1 chunk size in bytes */
++ __u32 root_pv; /* 2 LV root PV */
++ __u32 root_block; /* 3 LV root block */
++ __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
++
++ /*
++ * Disks information
++ */
++ mdp_disk_t disks[MD_SB_DISKS];
++
++ /*
++ * Reserved
++ */
++ __u32 reserved[MD_SB_RESERVED_WORDS];
++
++ /*
++ * Active descriptor
++ */
++ mdp_disk_t this_disk;
++
++} mdp_super_t;
++
++#endif /* _MD_P_H */
++
+diff -uNr linux-orig/include/linux/raid/md_u.h linux/include/linux/raid/md_u.h
+--- linux-orig/include/linux/raid/md_u.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/md_u.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,115 @@
++/*
++ md_u.h : user <=> kernel API between Linux raidtools and RAID drivers
++ Copyright (C) 1998 Ingo Molnar
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2, or (at your option)
++ any later version.
++
++ You should have received a copy of the GNU General Public License
++ (for example /usr/src/linux/COPYING); if not, write to the Free
++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
++*/
++
++#ifndef _MD_U_H
++#define _MD_U_H
++
++/* ioctls */
++
++/* status */
++#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t)
++#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t)
++#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t)
++#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13)
++
++/* configuration */
++#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20)
++#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t)
++#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22)
++#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t)
++#define SET_DISK_INFO _IO (MD_MAJOR, 0x24)
++#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25)
++#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26)
++#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27)
++#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28)
++#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29)
++
++/* usage */
++#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t)
++#define START_ARRAY _IO (MD_MAJOR, 0x31)
++#define STOP_ARRAY _IO (MD_MAJOR, 0x32)
++#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33)
++#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34)
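++/*
++ * usage sketch from user space (illustrative only; 'fd' and 'ver'
++ * are just local names of the caller):
++ *
++ *	int fd = open("/dev/md0", O_RDONLY);
++ *	mdu_version_t ver;
++ *
++ *	if (ioctl(fd, RAID_VERSION, &ver) == 0)
++ *		printf("md driver %d.%d.%d\n",
++ *			ver.major, ver.minor, ver.patchlevel);
++ */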
++
++typedef struct mdu_version_s {
++ int major;
++ int minor;
++ int patchlevel;
++} mdu_version_t;
++
++typedef struct mdu_array_info_s {
++ /*
++ * Generic constant information
++ */
++ int major_version;
++ int minor_version;
++ int patch_version;
++ int ctime;
++ int level;
++ int size;
++ int nr_disks;
++ int raid_disks;
++ int md_minor;
++ int not_persistent;
++
++ /*
++ * Generic state information
++ */
++ int utime; /* 0 Superblock update time */
++ int state; /* 1 State bits (clean, ...) */
++ int active_disks; /* 2 Number of currently active disks */
++ int working_disks; /* 3 Number of working disks */
++ int failed_disks; /* 4 Number of failed disks */
++ int spare_disks; /* 5 Number of spare disks */
++
++ /*
++ * Personality information
++ */
++ int layout; /* 0 the array's physical layout */
++ int chunk_size; /* 1 chunk size in bytes */
++
++} mdu_array_info_t;
++
++typedef struct mdu_disk_info_s {
++ /*
++ * configuration/status of one particular disk
++ */
++ int number;
++ int major;
++ int minor;
++ int raid_disk;
++ int state;
++
++} mdu_disk_info_t;
++
++typedef struct mdu_start_info_s {
++ /*
++ * configuration/status of one particular disk
++ */
++ int major;
++ int minor;
++ int raid_disk;
++ int state;
++
++} mdu_start_info_t;
++
++typedef struct mdu_param_s
++{
++ int personality; /* 1,2,3,4 */
++ int chunk_size; /* in bytes */
++ int max_fault; /* unused for now */
++} mdu_param_t;
++
++#endif /* _MD_U_H */
++
+diff -uNr linux-orig/include/linux/raid/raid0.h linux/include/linux/raid/raid0.h
+--- linux-orig/include/linux/raid/raid0.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/raid0.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,33 @@
++#ifndef _RAID0_H
++#define _RAID0_H
++
++#include <linux/raid/md.h>
++
++struct strip_zone
++{
++ int zone_offset; /* Zone offset in md_dev */
++ int dev_offset; /* Zone offset in real dev */
++ int size; /* Zone size */
++ int nb_dev; /* # of devices attached to the zone */
++ mdk_rdev_t *dev[MAX_REAL]; /* Devices attached to the zone */
++};
++
++struct raid0_hash
++{
++ struct strip_zone *zone0, *zone1;
++};
++
++struct raid0_private_data
++{
++ struct raid0_hash *hash_table; /* Dynamically allocated */
++ struct strip_zone *strip_zone; /* This one too */
++ int nr_strip_zones;
++ struct strip_zone *smallest;
++ int nr_zones;
++};
++
++typedef struct raid0_private_data raid0_conf_t;
++
++#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
++
++#endif
+diff -uNr linux-orig/include/linux/raid/raid1.h linux/include/linux/raid/raid1.h
+--- linux-orig/include/linux/raid/raid1.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/raid1.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,64 @@
++#ifndef _RAID1_H
++#define _RAID1_H
++
++#include <linux/raid/md.h>
++
++struct mirror_info {
++ int number;
++ int raid_disk;
++ kdev_t dev;
++ int next;
++ int sect_limit;
++
++ /*
++ * State bits:
++ */
++ int operational;
++ int write_only;
++ int spare;
++
++ int used_slot;
++};
++
++struct raid1_private_data {
++ mddev_t *mddev;
++ struct mirror_info mirrors[MD_SB_DISKS];
++ int nr_disks;
++ int raid_disks;
++ int working_disks;
++ int last_used;
++ unsigned long next_sect;
++ int sect_count;
++ mdk_thread_t *thread, *resync_thread;
++ int resync_mirrors;
++ struct mirror_info *spare;
++};
++
++typedef struct raid1_private_data raid1_conf_t;
++
++/*
++ * this is the only point in the RAID code where we violate
++ * C type safety. mddev->private is an 'opaque' pointer.
++ */
++#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private)
++
++/*
++ * this is our 'private' 'collective' RAID1 buffer head.
++ * it contains information about what kind of IO operations were started
++ * for this RAID1 operation, and about their status:
++ */
++
++struct raid1_bh {
++ atomic_t remaining; /* 'have we finished' count,
++ * used from IRQ handlers
++ */
++ int cmd;
++ unsigned long state;
++ mddev_t *mddev;
++ struct buffer_head *master_bh;
++ struct buffer_head *mirror_bh [MD_SB_DISKS];
++ struct buffer_head bh_req;
++ struct buffer_head *next_retry;
++};
++
++#endif
+diff -uNr linux-orig/include/linux/raid/raid5.h linux/include/linux/raid/raid5.h
+--- linux-orig/include/linux/raid/raid5.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/raid5.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,113 @@
++#ifndef _RAID5_H
++#define _RAID5_H
++
++#include <linux/raid/md.h>
++#include <linux/raid/xor.h>
++
++struct disk_info {
++ kdev_t dev;
++ int operational;
++ int number;
++ int raid_disk;
++ int write_only;
++ int spare;
++ int used_slot;
++};
++
++struct stripe_head {
++ md_spinlock_t stripe_lock;
++ struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
++ struct stripe_head *free_next; /* pool of free sh's */
++ struct buffer_head *buffer_pool; /* pool of free buffers */
++ struct buffer_head *bh_pool; /* pool of free bh's */
++ struct raid5_private_data *raid_conf;
++ struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
++ struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
++ struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
++ struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
++ int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
++ int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
++ unsigned long sector; /* sector of this row */
++ int size; /* buffers size */
++ int pd_idx; /* parity disk index */
++ atomic_t nr_pending; /* nr of pending cmds */
++ unsigned long state; /* state flags */
++ int cmd; /* stripe cmd */
++ int count; /* nr of waiters */
++ int write_method; /* reconstruct-write / read-modify-write */
++ int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
++ struct wait_queue *wait; /* processes waiting for this stripe */
++};
++
++/*
++ * Phase
++ */
++#define PHASE_BEGIN 0
++#define PHASE_READ_OLD 1
++#define PHASE_WRITE 2
++#define PHASE_READ 3
++#define PHASE_COMPLETE 4
++
++/*
++ * Write method
++ */
++#define METHOD_NONE 0
++#define RECONSTRUCT_WRITE 1
++#define READ_MODIFY_WRITE 2
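++/*
++ * e.g. on a 4+1 disk array a single-block update may either read the
++ * old data and parity blocks and recompute the delta
++ * (READ_MODIFY_WRITE, two reads), or read the three untouched data
++ * blocks and rebuild parity from scratch (RECONSTRUCT_WRITE, three
++ * reads); the stripe handling code chooses between them per stripe.
++ */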
++
++/*
++ * Stripe state
++ */
++#define STRIPE_LOCKED 0
++#define STRIPE_ERROR 1
++
++/*
++ * Stripe commands
++ */
++#define STRIPE_NONE 0
++#define STRIPE_WRITE 1
++#define STRIPE_READ 2
++
++struct raid5_private_data {
++ struct stripe_head **stripe_hashtbl;
++ mddev_t *mddev;
++ mdk_thread_t *thread, *resync_thread;
++ struct disk_info disks[MD_SB_DISKS];
++ struct disk_info *spare;
++ int buffer_size;
++ int chunk_size, level, algorithm;
++ int raid_disks, working_disks, failed_disks;
++ int sector_count;
++ unsigned long next_sector;
++ atomic_t nr_handle;
++ struct stripe_head *next_free_stripe;
++ int nr_stripes;
++ int resync_parity;
++ int max_nr_stripes;
++ int clock;
++ int nr_hashed_stripes;
++ int nr_locked_stripes;
++ int nr_pending_stripes;
++ int nr_cached_stripes;
++
++ /*
++ * Free stripes pool
++ */
++ int nr_free_sh;
++ struct stripe_head *free_sh_list;
++ struct wait_queue *wait_for_stripe;
++};
++
++typedef struct raid5_private_data raid5_conf_t;
++
++#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
++
++/*
++ * Our supported algorithms
++ */
++#define ALGORITHM_LEFT_ASYMMETRIC 0
++#define ALGORITHM_RIGHT_ASYMMETRIC 1
++#define ALGORITHM_LEFT_SYMMETRIC 2
++#define ALGORITHM_RIGHT_SYMMETRIC 3
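++/*
++ * "left"/"right" select whether the parity disk index decreases or
++ * increases from one stripe to the next; the "symmetric" variants
++ * additionally start each stripe's data right after the parity disk
++ * (wrapping around), which spreads sequential reads over all disks.
++ */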
++
++#endif
+diff -uNr linux-orig/include/linux/raid/translucent.h linux/include/linux/raid/translucent.h
+--- linux-orig/include/linux/raid/translucent.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/translucent.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,23 @@
++#ifndef _TRANSLUCENT_H
++#define _TRANSLUCENT_H
++
++#include <linux/raid/md.h>
++
++typedef struct dev_info dev_info_t;
++
++struct dev_info {
++ kdev_t dev;
++ int size;
++};
++
++struct translucent_private_data
++{
++ dev_info_t disks[MD_SB_DISKS];
++};
++
++
++typedef struct translucent_private_data translucent_conf_t;
++
++#define mddev_to_conf(mddev) ((translucent_conf_t *) mddev->private)
++
++#endif
+diff -uNr linux-orig/include/linux/raid/xor.h linux/include/linux/raid/xor.h
+--- linux-orig/include/linux/raid/xor.h Wed Dec 31 17:00:00 1969
++++ linux/include/linux/raid/xor.h Fri Dec 8 22:54:52 2000
+@@ -0,0 +1,12 @@
++#ifndef _XOR_H
++#define _XOR_H
++
++#include <linux/raid/md.h>
++
++#define MAX_XOR_BLOCKS 5
++
++extern void calibrate_xor_block(void);
++extern void (*xor_block)(unsigned int count,
++ struct buffer_head **bh_ptr);
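++/*
++ * convention (sketch): bh_ptr[0] is the destination, bh_ptr[1] up to
++ * bh_ptr[count-1] are xor-ed into it; calibrate_xor_block() runs once
++ * at startup to pick the fastest routine. e.g., with dest_bh/src_bh
++ * standing for whatever buffers the caller owns:
++ *
++ *	struct buffer_head *bh_ptr[2] = { dest_bh, src_bh };
++ *	xor_block(2, bh_ptr);
++ */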
++
++#endif
+diff -uNr linux-orig/include/linux/raid0.h linux/include/linux/raid0.h
+--- linux-orig/include/linux/raid0.h Tue Oct 29 06:20:24 1996
++++ linux/include/linux/raid0.h Fri Dec 8 22:54:52 2000
+@@ -1,27 +0,0 @@
+-#ifndef _RAID0_H
+-#define _RAID0_H
+-
+-struct strip_zone
+-{
+- int zone_offset; /* Zone offset in md_dev */
+- int dev_offset; /* Zone offset in real dev */
+- int size; /* Zone size */
+- int nb_dev; /* Number of devices attached to the zone */
+- struct real_dev *dev[MAX_REAL]; /* Devices attached to the zone */
+-};
+-
+-struct raid0_hash
+-{
+- struct strip_zone *zone0, *zone1;
+-};
+-
+-struct raid0_data
+-{
+- struct raid0_hash *hash_table; /* Dynamically allocated */
+- struct strip_zone *strip_zone; /* This one too */
+- int nr_strip_zones;
+- struct strip_zone *smallest;
+- int nr_zones;
+-};
+-
+-#endif
+diff -uNr linux-orig/include/linux/raid1.h linux/include/linux/raid1.h
+--- linux-orig/include/linux/raid1.h Fri May 8 00:17:13 1998
++++ linux/include/linux/raid1.h Fri Dec 8 22:54:52 2000
+@@ -1,49 +0,0 @@
+-#ifndef _RAID1_H
+-#define _RAID1_H
+-
+-#include <linux/md.h>
+-
+-struct mirror_info {
+- int number;
+- int raid_disk;
+- kdev_t dev;
+- int next;
+- int sect_limit;
+-
+- /*
+- * State bits:
+- */
+- int operational;
+- int write_only;
+- int spare;
+-};
+-
+-struct raid1_data {
+- struct md_dev *mddev;
+- struct mirror_info mirrors[MD_SB_DISKS]; /* RAID1 devices, 2 to MD_SB_DISKS */
+- int raid_disks;
+- int working_disks; /* Number of working disks */
+- int last_used;
+- unsigned long next_sect;
+- int sect_count;
+- int resync_running;
+-};
+-
+-/*
+- * this is our 'private' 'collective' RAID1 buffer head.
+- * it contains information about what kind of IO operations were started
+- * for this RAID5 operation, and about their status:
+- */
+-
+-struct raid1_bh {
+- unsigned int remaining;
+- int cmd;
+- unsigned long state;
+- struct md_dev *mddev;
+- struct buffer_head *master_bh;
+- struct buffer_head *mirror_bh [MD_SB_DISKS];
+- struct buffer_head bh_req;
+- struct buffer_head *next_retry;
+-};
+-
+-#endif
+diff -uNr linux-orig/include/linux/raid5.h linux/include/linux/raid5.h
+--- linux-orig/include/linux/raid5.h Fri May 8 00:17:13 1998
++++ linux/include/linux/raid5.h Fri Dec 8 22:54:52 2000
+@@ -1,110 +0,0 @@
+-#ifndef _RAID5_H
+-#define _RAID5_H
+-
+-#ifdef __KERNEL__
+-#include <linux/md.h>
+-#include <asm/atomic.h>
+-
+-struct disk_info {
+- kdev_t dev;
+- int operational;
+- int number;
+- int raid_disk;
+- int write_only;
+- int spare;
+-};
+-
+-struct stripe_head {
+- struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
+- struct stripe_head *free_next; /* pool of free sh's */
+- struct buffer_head *buffer_pool; /* pool of free buffers */
+- struct buffer_head *bh_pool; /* pool of free bh's */
+- struct raid5_data *raid_conf;
+- struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
+- struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
+- struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
+- struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
+- int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
+- int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
+- unsigned long sector; /* sector of this row */
+- int size; /* buffers size */
+- int pd_idx; /* parity disk index */
+- int nr_pending; /* nr of pending cmds */
+- unsigned long state; /* state flags */
+- int cmd; /* stripe cmd */
+- int count; /* nr of waiters */
+- int write_method; /* reconstruct-write / read-modify-write */
+- int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
+- struct wait_queue *wait; /* processes waiting for this stripe */
+-};
+-
+-/*
+- * Phase
+- */
+-#define PHASE_BEGIN 0
+-#define PHASE_READ_OLD 1
+-#define PHASE_WRITE 2
+-#define PHASE_READ 3
+-#define PHASE_COMPLETE 4
+-
+-/*
+- * Write method
+- */
+-#define METHOD_NONE 0
+-#define RECONSTRUCT_WRITE 1
+-#define READ_MODIFY_WRITE 2
+-
+-/*
+- * Stripe state
+- */
+-#define STRIPE_LOCKED 0
+-#define STRIPE_ERROR 1
+-
+-/*
+- * Stripe commands
+- */
+-#define STRIPE_NONE 0
+-#define STRIPE_WRITE 1
+-#define STRIPE_READ 2
+-
+-struct raid5_data {
+- struct stripe_head **stripe_hashtbl;
+- struct md_dev *mddev;
+- struct md_thread *thread, *resync_thread;
+- struct disk_info disks[MD_SB_DISKS];
+- struct disk_info *spare;
+- int buffer_size;
+- int chunk_size, level, algorithm;
+- int raid_disks, working_disks, failed_disks;
+- int sector_count;
+- unsigned long next_sector;
+- atomic_t nr_handle;
+- struct stripe_head *next_free_stripe;
+- int nr_stripes;
+- int resync_parity;
+- int max_nr_stripes;
+- int clock;
+- int nr_hashed_stripes;
+- int nr_locked_stripes;
+- int nr_pending_stripes;
+- int nr_cached_stripes;
+-
+- /*
+- * Free stripes pool
+- */
+- int nr_free_sh;
+- struct stripe_head *free_sh_list;
+- struct wait_queue *wait_for_stripe;
+-};
+-
+-#endif
+-
+-/*
+- * Our supported algorithms
+- */
+-#define ALGORITHM_LEFT_ASYMMETRIC 0
+-#define ALGORITHM_RIGHT_ASYMMETRIC 1
+-#define ALGORITHM_LEFT_SYMMETRIC 2
+-#define ALGORITHM_RIGHT_SYMMETRIC 3
+-
+-#endif
+diff -uNr linux-orig/include/linux/sysctl.h linux/include/linux/sysctl.h
+--- linux-orig/include/linux/sysctl.h Fri Dec 8 22:53:55 2000
++++ linux/include/linux/sysctl.h Fri Dec 8 22:57:02 2000
+@@ -435,6 +435,7 @@
+ enum {
+ DEV_CDROM=1,
+ DEV_HWMON=2,
++ DEV_MD=4,
+ DEV_MAC_HID=5
+ };
+
+@@ -446,6 +447,11 @@
+ DEV_CDROM_DEBUG=4,
+ DEV_CDROM_LOCK=5,
+ DEV_CDROM_CHECK_MEDIA=6
++};
++
++/* /proc/sys/dev/md */
++enum {
++ DEV_MD_SPEED_LIMIT=1
+ };
+
+ /* /proc/sys/dev/mac_hid */
+diff -uNr linux-orig/init/main.c linux/init/main.c
+--- linux-orig/init/main.c Fri Dec 8 22:53:56 2000
++++ linux/init/main.c Fri Dec 8 22:54:52 2000
+@@ -19,6 +19,7 @@
+ #include <linux/utsname.h>
+ #include <linux/ioport.h>
+ #include <linux/init.h>
++#include <linux/raid/md.h>
+ #include <linux/smp_lock.h>
+ #include <linux/blk.h>
+ #include <linux/hdreg.h>
+@@ -550,7 +551,7 @@
+ #ifdef CONFIG_BLK_DEV_FD
+ { "fd", 0x0200 },
+ #endif
+-#ifdef CONFIG_MD_BOOT
++#if CONFIG_MD_BOOT || CONFIG_AUTODETECT_RAID
+ { "md", 0x0900 },
+ #endif
+ #ifdef CONFIG_BLK_DEV_XD
+@@ -1058,6 +1059,9 @@
+ #ifdef CONFIG_MD_BOOT
+ { "md=", md_setup},
+ #endif
++#if CONFIG_BLK_DEV_MD
++ { "raid=", raid_setup},
++#endif
+ #ifdef CONFIG_ADBMOUSE
+ { "adb_buttons=", adb_mouse_setup },
+ #endif
+@@ -1637,6 +1641,9 @@
+ while (pid != wait(&i));
+ if (MAJOR(real_root_dev) != RAMDISK_MAJOR
+ || MINOR(real_root_dev) != 0) {
++#ifdef CONFIG_BLK_DEV_MD
++ autodetect_raid();
++#endif
+ error = change_root(real_root_dev,"/initrd");
+ if (error)
+ printk(KERN_ERR "Change root to /initrd: "