diff -ruN linux.orig/Documentation/Configure.help linux-2.2.16/Documentation/Configure.help
--- linux.orig/Documentation/Configure.help	Wed Jun  7 23:26:42 2000
+++ linux-2.2.16/Documentation/Configure.help	Fri Jun  9 11:37:48 2000
@@ -961,6 +961,13 @@
   If unsure, say N.
+Autodetect RAID partitions
+CONFIG_AUTODETECT_RAID
+  This feature lets the kernel detect RAID partitions at boot time.
+  An autodetect RAID partition is a normal partition whose partition
+  type is set to 0xfd. Use this if you want to boot from RAID devices
+  or want them started automatically.
+
 Linear (append) mode
 CONFIG_MD_LINEAR
   If you say Y here, then your multiple devices driver will be able to
@@ -1039,6 +1046,21 @@
   Documentation/modules.txt.
   If unsure, say Y.
+
+Translucent Block Device Support (EXPERIMENTAL)
+CONFIG_MD_TRANSLUCENT
+  DO NOT USE THIS STUFF YET!
+
+  Currently there is only a placeholder here, as the implementation
+  is not yet usable.
+
+Hierarchical Storage Management support (EXPERIMENTAL)
+CONFIG_MD_HSM
+  DO NOT USE THIS STUFF YET!
+
+  I have released this so that people can comment on the architecture,
+  but the user-space tools are still unusable, so there is not much
+  you can do with it yet.
 Boot support (linear, striped)
 CONFIG_MD_BOOT
diff -ruN linux.orig/arch/i386/defconfig linux-2.2.16/arch/i386/defconfig
--- linux.orig/arch/i386/defconfig	Thu May  4 02:16:30 2000
+++ linux-2.2.16/arch/i386/defconfig	Fri Jun  9 11:37:45 2000
@@ -93,7 +93,15 @@
 #
 # CONFIG_BLK_DEV_LOOP is not set
 # CONFIG_BLK_DEV_NBD is not set
-# CONFIG_BLK_DEV_MD is not set
+CONFIG_BLK_DEV_MD=y
+CONFIG_AUTODETECT_RAID=y
+CONFIG_MD_TRANSLUCENT=y
+CONFIG_MD_LINEAR=y
+CONFIG_MD_STRIPED=y
+CONFIG_MD_MIRRORING=y
+CONFIG_MD_RAID5=y
+CONFIG_MD_BOOT=y
+CONFIG_BLK_DEV_HSM=y
 # CONFIG_BLK_DEV_RAM is not set
 # CONFIG_BLK_DEV_XD is not set
 # CONFIG_BLK_DEV_DAC960 is not set
diff -ruN linux.orig/arch/sparc/config.in linux-2.2.16/arch/sparc/config.in
--- linux.orig/arch/sparc/config.in	Wed Jun  7 23:26:42 2000
+++ linux-2.2.16/arch/sparc/config.in	Fri Jun  9 11:37:45 2000
@@ -1,4 +1,4 @@
-# $Id$
+# $Id$
 # For a description of the syntax of this configuration file,
 # see Documentation/kbuild/config-language.txt.
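For orientation: the CONFIG_AUTODETECT_RAID option described above keys off the partition type byte 0xfd. The only piece of that feature visible in this patch is the autodetect_raid() call added to drivers/block/genhd.c further down; the scan itself lives in the reworked md.c and is not quoted here. Below is a rough sketch of the idea only, assuming hypothetical helper names (note_raid_partition, md_autostart_array) that stand in for the real code:

/*
 * Rough sketch, not code from this patch: during the boot-time
 * partition scan, partitions whose type byte is 0xfd are remembered,
 * and once all disks are partitioned the MD driver reads their
 * persistent RAID superblocks and assembles the arrays.
 * note_raid_partition() and md_autostart_array() are hypothetical
 * names used purely for illustration.
 */
#include <linux/kdev_t.h>
#include <linux/genhd.h>

#define LINUX_RAID_PARTITION	0xfd	/* "Linux raid autodetect" type byte */
#define MAX_DETECTED		128

static kdev_t detected_devs[MAX_DETECTED];
static int nr_detected;

/* called from the partition-table scan for every partition found */
static void note_raid_partition(struct partition *p, kdev_t dev)
{
	if (p->sys_ind == LINUX_RAID_PARTITION && nr_detected < MAX_DETECTED)
		detected_devs[nr_detected++] = dev;
}

/* called once from genhd.c after all disks have been partitioned */
void autodetect_raid(void)
{
	int i;

	for (i = 0; i < nr_detected; i++)
		md_autostart_array(detected_devs[i]);	/* hypothetical helper */
}

In practice this means that marking a partition as type 0xfd (for example with fdisk) is enough for the kernel to start the array at boot, which is what the help text means by booting RAID devices.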
# @@ -85,10 +85,16 @@ bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID tristate ' Linear (append) mode' CONFIG_MD_LINEAR tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM +fi +if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then + bool ' Boot support (linear, striped)' CONFIG_MD_BOOT fi tristate 'RAM disk support' CONFIG_BLK_DEV_RAM diff -ruN linux.orig/arch/sparc/defconfig linux-2.2.16/arch/sparc/defconfig --- linux.orig/arch/sparc/defconfig Thu May 4 02:16:32 2000 +++ linux-2.2.16/arch/sparc/defconfig Fri Jun 9 11:37:45 2000 @@ -88,10 +88,13 @@ # CONFIG_BLK_DEV_FD=y CONFIG_BLK_DEV_MD=y +# CONFIG_AUTODETECT_RAID is not set CONFIG_MD_LINEAR=m CONFIG_MD_STRIPED=m CONFIG_MD_MIRRORING=m CONFIG_MD_RAID5=m +# CONFIG_MD_TRANSLUCENT is not set +# CONFIG_MD_HSM is not set CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_INITRD=y CONFIG_BLK_DEV_LOOP=m diff -ruN linux.orig/arch/sparc64/config.in linux-2.2.16/arch/sparc64/config.in --- linux.orig/arch/sparc64/config.in Wed Jun 7 23:26:42 2000 +++ linux-2.2.16/arch/sparc64/config.in Fri Jun 9 11:37:48 2000 @@ -97,10 +97,16 @@ bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID tristate ' Linear (append) mode' CONFIG_MD_LINEAR tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM +fi +if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then + bool ' Boot support (linear, striped)' CONFIG_MD_BOOT fi tristate 'RAM disk support' CONFIG_BLK_DEV_RAM diff -ruN linux.orig/arch/sparc64/defconfig linux-2.2.16/arch/sparc64/defconfig --- linux.orig/arch/sparc64/defconfig Wed Jun 7 23:26:42 2000 +++ linux-2.2.16/arch/sparc64/defconfig Fri Jun 9 11:37:48 2000 @@ -107,10 +107,13 @@ # CONFIG_BLK_DEV_FD=y CONFIG_BLK_DEV_MD=y +# CONFIG_AUTODETECT_RAID is not set CONFIG_MD_LINEAR=m CONFIG_MD_STRIPED=m CONFIG_MD_MIRRORING=m CONFIG_MD_RAID5=m +# CONFIG_MD_TRANSLUCENT is not set +# CONFIG_MD_HSM is not set CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_INITRD=y CONFIG_BLK_DEV_LOOP=m diff -ruN linux.orig/drivers/block/Config.in linux-2.2.16/drivers/block/Config.in --- linux.orig/drivers/block/Config.in Wed Jun 7 23:26:42 2000 +++ linux-2.2.16/drivers/block/Config.in Fri Jun 9 11:37:45 2000 @@ -102,10 +102,13 @@ fi bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID tristate ' Linear (append) mode' CONFIG_MD_LINEAR tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM fi if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then bool ' Boot support (linear, striped)' CONFIG_MD_BOOT diff -ruN linux.orig/drivers/block/Makefile linux-2.2.16/drivers/block/Makefile --- 
linux.orig/drivers/block/Makefile Thu May 4 02:16:32 2000 +++ linux-2.2.16/drivers/block/Makefile Fri Jun 9 11:37:45 2000 @@ -282,10 +282,28 @@ endif ifeq ($(CONFIG_MD_RAID5),y) +LX_OBJS += xor.o L_OBJS += raid5.o else ifeq ($(CONFIG_MD_RAID5),m) + LX_OBJS += xor.o M_OBJS += raid5.o + endif +endif + +ifeq ($(CONFIG_MD_TRANSLUCENT),y) +L_OBJS += translucent.o +else + ifeq ($(CONFIG_MD_TRANSLUCENT),m) + M_OBJS += translucent.o + endif +endif + +ifeq ($(CONFIG_MD_HSM),y) +L_OBJS += hsm.o +else + ifeq ($(CONFIG_MD_HSM),m) + M_OBJS += hsm.o endif endif diff -ruN linux.orig/drivers/block/genhd.c linux-2.2.16/drivers/block/genhd.c --- linux.orig/drivers/block/genhd.c Wed Jun 7 23:26:42 2000 +++ linux-2.2.16/drivers/block/genhd.c Fri Jun 9 11:37:45 2000 @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -1649,6 +1650,9 @@ else #endif rd_load(); +#endif +#ifdef CONFIG_BLK_DEV_MD + autodetect_raid(); #endif #ifdef CONFIG_MD_BOOT md_setup_drive(); diff -ruN linux.orig/drivers/block/hsm.c linux-2.2.16/drivers/block/hsm.c --- linux.orig/drivers/block/hsm.c Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/drivers/block/hsm.c Fri Jun 9 11:37:45 2000 @@ -0,0 +1,840 @@ +/* + hsm.c : HSM RAID driver for Linux + Copyright (C) 1998 Ingo Molnar + + HSM mode management functions. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include + +#include +#include + +#include +#include + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + + +#define DEBUG_HSM 1 + +#if DEBUG_HSM +#define dprintk(x,y...) printk(x,##y) +#else +#define dprintk(x,y...) do { } while (0) +#endif + +void print_bh(struct buffer_head *bh) +{ + dprintk("bh %p: %lx %lx %x %x %lx %p %lx %p %x %p %x %lx\n", bh, + bh->b_blocknr, bh->b_size, bh->b_dev, bh->b_rdev, + bh->b_rsector, bh->b_this_page, bh->b_state, + bh->b_next_free, bh->b_count, bh->b_data, + bh->b_list, bh->b_flushtime + ); +} + +static int check_bg (pv_t *pv, pv_block_group_t * bg) +{ + int i, free = 0; + + dprintk("checking bg ...\n"); + + for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) { + if (pv_pptr_free(bg->blocks + i)) { + free++; + if (test_bit(i, bg->used_bitmap)) { + printk("hm, bit %d set?\n", i); + } + } else { + if (!test_bit(i, bg->used_bitmap)) { + printk("hm, bit %d not set?\n", i); + } + } + } + dprintk("%d free blocks in bg ...\n", free); + return free; +} + +static void get_bg (pv_t *pv, pv_bg_desc_t *desc, int nr) +{ + unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2; + struct buffer_head *bh; + + dprintk("... 
getting BG at %u ...\n", bg_pos); + + bh = bread (pv->dev, bg_pos, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return; + } + desc->bg = (pv_block_group_t *) bh->b_data; + desc->free_blocks = check_bg(pv, desc->bg); +} + +static int find_free_block (lv_t *lv, pv_t *pv, pv_bg_desc_t *desc, int nr, + unsigned int lblock, lv_lptr_t * index) +{ + int i; + + for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) { + pv_pptr_t * bptr = desc->bg->blocks + i; + if (pv_pptr_free(bptr)) { + unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2; + + if (test_bit(i, desc->bg->used_bitmap)) { + MD_BUG(); + continue; + } + bptr->u.used.owner.log_id = lv->log_id; + bptr->u.used.owner.log_index = lblock; + index->data.phys_nr = pv->phys_nr; + index->data.phys_block = bg_pos + i + 1; + set_bit(i, desc->bg->used_bitmap); + desc->free_blocks--; + dprintk(".....free blocks left in bg %p: %d\n", + desc->bg, desc->free_blocks); + return 0; + } + } + return -ENOSPC; +} + +static int __get_free_block (lv_t *lv, pv_t *pv, + unsigned int lblock, lv_lptr_t * index) +{ + int i; + + dprintk("trying to get free block for lblock %d ...\n", lblock); + + for (i = 0; i < pv->pv_sb->pv_block_groups; i++) { + pv_bg_desc_t *desc = pv->bg_array + i; + + dprintk("looking at desc #%d (%p)...\n", i, desc->bg); + if (!desc->bg) + get_bg(pv, desc, i); + + if (desc->bg && desc->free_blocks) + return find_free_block(lv, pv, desc, i, + lblock, index); + } + dprintk("hsm: pv %s full!\n", partition_name(pv->dev)); + return -ENOSPC; +} + +static int get_free_block (lv_t *lv, unsigned int lblock, lv_lptr_t * index) +{ + int err; + + if (!lv->free_indices) + return -ENOSPC; + + /* fix me */ + err = __get_free_block(lv, lv->vg->pv_array + 0, lblock, index); + + if (err || !index->data.phys_block) { + MD_BUG(); + return -ENOSPC; + } + + lv->free_indices--; + + return 0; +} + +/* + * fix me: wordsize assumptions ... 
+ */ +#define INDEX_BITS 8 +#define INDEX_DEPTH (32/INDEX_BITS) +#define INDEX_MASK ((1< [.", index->data.phys_nr, + index->data.phys_block, index->cpu_addr); + + tmp = index_child(index); + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + if (index_block(lv, tmp)) + dprintk("(%d->%d)", i, index_block(lv, tmp)); + tmp++; + } + dprintk(".]\n"); +} + +static int read_index_group (lv_t *lv, lv_lptr_t *index) +{ + lv_lptr_t *index_group, *tmp; + struct buffer_head *bh; + int i; + + dprintk("reading index group <%s:%d>\n", + partition_name(index_dev(lv, index)), index_block(lv, index)); + + bh = bread(index_dev(lv, index), index_block(lv, index), HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -EIO; + } + if (!buffer_uptodate(bh)) + MD_BUG(); + + index_group = (lv_lptr_t *) bh->b_data; + tmp = index_group; + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + if (index_block(lv, tmp)) { + dprintk("index group has BLOCK %d, non-present.\n", i); + tmp->cpu_addr = 0; + } + tmp++; + } + index->cpu_addr = ptr_to_cpuaddr(index_group); + + dprintk("have read index group %p at block %d.\n", + index_group, index_block(lv, index)); + print_index_list(lv, index); + + return 0; +} + +static int alloc_index_group (lv_t *lv, unsigned int lblock, lv_lptr_t * index) +{ + struct buffer_head *bh; + lv_lptr_t * index_group; + + if (get_free_block(lv, lblock, index)) + return -ENOSPC; + + dprintk("creating block for index group <%s:%d>\n", + partition_name(index_dev(lv, index)), index_block(lv, index)); + + bh = getblk(index_dev(lv, index), + index_block(lv, index), HSM_BLOCKSIZE); + + index_group = (lv_lptr_t *) bh->b_data; + md_clear_page(index_group); + mark_buffer_uptodate(bh, 1); + + index->cpu_addr = ptr_to_cpuaddr(index_group); + + dprintk("allocated index group %p at block %d.\n", + index_group, index_block(lv, index)); + return 0; +} + +static lv_lptr_t * alloc_fixed_index (lv_t *lv, unsigned int lblock) +{ + lv_lptr_t * index = index_child(&lv->root_index); + int idx, l; + + for (l = INDEX_DEPTH-1; l >= 0; l--) { + idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK; + index += idx; + if (!l) + break; + if (!index_present(index)) { + dprintk("no group, level %u, pos %u\n", l, idx); + if (alloc_index_group(lv, lblock, index)) + return NULL; + } + index = index_child(index); + } + if (!index_block(lv,index)) { + dprintk("no data, pos %u\n", idx); + if (get_free_block(lv, lblock, index)) + return NULL; + return index; + } + MD_BUG(); + return index; +} + +static lv_lptr_t * find_index (lv_t *lv, unsigned int lblock) +{ + lv_lptr_t * index = index_child(&lv->root_index); + int idx, l; + + for (l = INDEX_DEPTH-1; l >= 0; l--) { + idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK; + index += idx; + if (!l) + break; + if (index_free(index)) + return NULL; + if (!index_present(index)) + read_index_group(lv, index); + if (!index_present(index)) { + MD_BUG(); + return NULL; + } + index = index_child(index); + } + if (!index_block(lv,index)) + return NULL; + return index; +} + +static int read_root_index(lv_t *lv) +{ + int err; + lv_lptr_t *index = &lv->root_index; + + if (!index_block(lv, index)) { + printk("LV has no root index yet, creating.\n"); + + err = alloc_index_group (lv, 0, index); + if (err) { + printk("could not create index group, err:%d\n", err); + return err; + } + lv->vg->vg_sb->lv_array[lv->log_id].lv_root_idx = + lv->root_index.data; + } else { + printk("LV already has a root index.\n"); + printk("... 
at <%s:%d>.\n", + partition_name(index_dev(lv, index)), + index_block(lv, index)); + + read_index_group(lv, index); + } + return 0; +} + +static int init_pv(pv_t *pv) +{ + struct buffer_head *bh; + pv_sb_t *pv_sb; + + bh = bread (pv->dev, 0, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -1; + } + + pv_sb = (pv_sb_t *) bh->b_data; + pv->pv_sb = pv_sb; + + if (pv_sb->pv_magic != HSM_PV_SB_MAGIC) { + printk("%s is not a PV, has magic %x instead of %x!\n", + partition_name(pv->dev), pv_sb->pv_magic, + HSM_PV_SB_MAGIC); + return -1; + } + printk("%s detected as a valid PV (#%d).\n", partition_name(pv->dev), + pv->phys_nr); + printk("... created under HSM version %d.%d.%d, at %x.\n", + pv_sb->pv_major, pv_sb->pv_minor, pv_sb->pv_patch, pv_sb->pv_ctime); + printk("... total # of blocks: %d (%d left unallocated).\n", + pv_sb->pv_total_size, pv_sb->pv_blocks_left); + + printk("... block size: %d bytes.\n", pv_sb->pv_block_size); + printk("... block descriptor size: %d bytes.\n", pv_sb->pv_pptr_size); + printk("... block group size: %d blocks.\n", pv_sb->pv_bg_size); + printk("... # of block groups: %d.\n", pv_sb->pv_block_groups); + + if (pv_sb->pv_block_groups*sizeof(pv_bg_desc_t) > PAGE_SIZE) { + MD_BUG(); + return 1; + } + pv->bg_array = (pv_bg_desc_t *)__get_free_page(GFP_KERNEL); + if (!pv->bg_array) { + MD_BUG(); + return 1; + } + memset(pv->bg_array, 0, PAGE_SIZE); + + return 0; +} + +static int free_pv(pv_t *pv) +{ + struct buffer_head *bh; + + dprintk("freeing PV %d ...\n", pv->phys_nr); + + if (pv->bg_array) { + int i; + + dprintk(".... freeing BGs ...\n"); + for (i = 0; i < pv->pv_sb->pv_block_groups; i++) { + unsigned int bg_pos = i * pv->pv_sb->pv_bg_size + 2; + pv_bg_desc_t *desc = pv->bg_array + i; + + if (desc->bg) { + dprintk(".... freeing BG %d ...\n", i); + bh = getblk (pv->dev, bg_pos, HSM_BLOCKSIZE); + mark_buffer_dirty(bh, 1); + brelse(bh); + brelse(bh); + } + } + free_page((unsigned long)pv->bg_array); + } else + MD_BUG(); + + bh = getblk (pv->dev, 0, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -1; + } + mark_buffer_dirty(bh, 1); + brelse(bh); + brelse(bh); + + return 0; +} + +struct semaphore hsm_sem = MUTEX; + +#define HSM_SECTORS (HSM_BLOCKSIZE/512) + +static int hsm_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long bsectors) +{ + lv_t *lv = kdev_to_lv(dev); + lv_lptr_t *index; + unsigned int lblock = *rsector / HSM_SECTORS; + unsigned int offset = *rsector % HSM_SECTORS; + int err = -EIO; + + if (!lv) { + printk("HSM: md%d not a Logical Volume!\n", mdidx(mddev)); + goto out; + } + if (offset + bsectors > HSM_SECTORS) { + MD_BUG(); + goto out; + } + down(&hsm_sem); + index = find_index(lv, lblock); + if (!index) { + printk("no block %u yet ... 
allocating\n", lblock); + index = alloc_fixed_index(lv, lblock); + } + + err = 0; + + printk(" %u <%s : %ld(%ld)> -> ", lblock, + partition_name(*rdev), *rsector, bsectors); + + *rdev = index_dev(lv, index); + *rsector = index_block(lv, index) * HSM_SECTORS + offset; + + printk(" <%s : %ld> %u\n", + partition_name(*rdev), *rsector, index_block(lv, index)); + + up(&hsm_sem); +out: + return err; +} + +static void free_index (lv_t *lv, lv_lptr_t * index) +{ + struct buffer_head *bh; + + printk("tryin to get cached block for index group <%s:%d>\n", + partition_name(index_dev(lv, index)), index_block(lv, index)); + + bh = getblk(index_dev(lv, index), index_block(lv, index),HSM_BLOCKSIZE); + + printk("....FREEING "); + print_index_list(lv, index); + + if (bh) { + if (!buffer_uptodate(bh)) + MD_BUG(); + if ((lv_lptr_t *)bh->b_data != index_child(index)) { + printk("huh? b_data is %p, index content is %p.\n", + bh->b_data, index_child(index)); + } else + printk("good, b_data == index content == %p.\n", + index_child(index)); + printk("b_count == %d, writing.\n", bh->b_count); + mark_buffer_dirty(bh, 1); + brelse(bh); + brelse(bh); + printk("done.\n"); + } else { + printk("FAILED!\n"); + } + print_index_list(lv, index); + index_child(index) = NULL; +} + +static void free_index_group (lv_t *lv, int level, lv_lptr_t * index_0) +{ + char dots [3*8]; + lv_lptr_t * index; + int i, nr_dots; + + nr_dots = (INDEX_DEPTH-level)*3; + memcpy(dots,"...............",nr_dots); + dots[nr_dots] = 0; + + dprintk("%s level %d index group block:\n", dots, level); + + + index = index_0; + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + if (index->data.phys_block) { + dprintk("%s block <%u,%u,%x>\n", dots, + index->data.phys_nr, + index->data.phys_block, + index->cpu_addr); + if (level && index_present(index)) { + dprintk("%s==> deeper one level\n", dots); + free_index_group(lv, level-1, + index_child(index)); + dprintk("%s freeing index group block %p ...", + dots, index_child(index)); + free_index(lv, index); + } + } + index++; + } + dprintk("%s DONE: level %d index group block.\n", dots, level); +} + +static void free_lv_indextree (lv_t *lv) +{ + dprintk("freeing LV %d ...\n", lv->log_id); + dprintk("..root index: %p\n", index_child(&lv->root_index)); + dprintk("..INDEX TREE:\n"); + free_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index)); + dprintk("..freeing root index %p ...", index_child(&lv->root_index)); + dprintk("root block <%u,%u,%x>\n", lv->root_index.data.phys_nr, + lv->root_index.data.phys_block, lv->root_index.cpu_addr); + free_index(lv, &lv->root_index); + dprintk("..INDEX TREE done.\n"); + fsync_dev(lv->vg->pv_array[0].dev); /* fix me */ + lv->vg->vg_sb->lv_array[lv->log_id].lv_free_indices = lv->free_indices; +} + +static void print_index_group (lv_t *lv, int level, lv_lptr_t * index_0) +{ + char dots [3*5]; + lv_lptr_t * index; + int i, nr_dots; + + nr_dots = (INDEX_DEPTH-level)*3; + memcpy(dots,"...............",nr_dots); + dots[nr_dots] = 0; + + dprintk("%s level %d index group block:\n", dots, level); + + + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + index = index_0 + i; + if (index->data.phys_block) { + dprintk("%s block <%u,%u,%x>\n", dots, + index->data.phys_nr, + index->data.phys_block, + index->cpu_addr); + if (level && index_present(index)) { + dprintk("%s==> deeper one level\n", dots); + print_index_group(lv, level-1, + index_child(index)); + } + } + } + dprintk("%s DONE: level %d index group block.\n", dots, level); +} + +static void print_lv (lv_t *lv) +{ + dprintk("printing LV 
%d ...\n", lv->log_id); + dprintk("..root index: %p\n", index_child(&lv->root_index)); + dprintk("..INDEX TREE:\n"); + print_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index)); + dprintk("..INDEX TREE done.\n"); +} + +static int map_lv (lv_t *lv) +{ + kdev_t dev = lv->dev; + unsigned int nr = MINOR(dev); + mddev_t *mddev = lv->vg->mddev; + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return -1; + } + if (kdev_to_mddev(dev)) { + MD_BUG(); + return -1; + } + md_hd_struct[nr].start_sect = 0; + md_hd_struct[nr].nr_sects = md_size[mdidx(mddev)] << 1; + md_size[nr] = md_size[mdidx(mddev)]; + add_mddev_mapping(mddev, dev, lv); + + return 0; +} + +static int unmap_lv (lv_t *lv) +{ + kdev_t dev = lv->dev; + unsigned int nr = MINOR(dev); + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return -1; + } + md_hd_struct[nr].start_sect = 0; + md_hd_struct[nr].nr_sects = 0; + md_size[nr] = 0; + del_mddev_mapping(lv->vg->mddev, dev); + + return 0; +} + +static int init_vg (vg_t *vg) +{ + int i; + lv_t *lv; + kdev_t dev; + vg_sb_t *vg_sb; + struct buffer_head *bh; + lv_descriptor_t *lv_desc; + + /* + * fix me: read all PVs and compare the SB + */ + dev = vg->pv_array[0].dev; + bh = bread (dev, 1, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -1; + } + + vg_sb = (vg_sb_t *) bh->b_data; + vg->vg_sb = vg_sb; + + if (vg_sb->vg_magic != HSM_VG_SB_MAGIC) { + printk("%s is not a valid VG, has magic %x instead of %x!\n", + partition_name(dev), vg_sb->vg_magic, + HSM_VG_SB_MAGIC); + return -1; + } + + vg->nr_lv = 0; + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) { + unsigned int id; + lv_desc = vg->vg_sb->lv_array + i; + + id = lv_desc->lv_id; + if (!id) { + printk("... LV desc %d empty\n", i); + continue; + } + if (id >= HSM_MAX_LVS_PER_VG) { + MD_BUG(); + continue; + } + + lv = vg->lv_array + id; + if (lv->vg) { + MD_BUG(); + continue; + } + lv->log_id = id; + lv->vg = vg; + lv->max_indices = lv_desc->lv_max_indices; + lv->free_indices = lv_desc->lv_free_indices; + lv->root_index.data = lv_desc->lv_root_idx; + lv->dev = MKDEV(MD_MAJOR, lv_desc->md_id); + + vg->nr_lv++; + + map_lv(lv); + if (read_root_index(lv)) { + vg->nr_lv--; + unmap_lv(lv); + memset(lv, 0, sizeof(*lv)); + } + } + if (vg->nr_lv != vg_sb->nr_lvs) + MD_BUG(); + + return 0; +} + +static int hsm_run (mddev_t *mddev) +{ + int i; + vg_t *vg; + mdk_rdev_t *rdev; + + MOD_INC_USE_COUNT; + + vg = kmalloc (sizeof (*vg), GFP_KERNEL); + if (!vg) + goto out; + memset(vg, 0, sizeof(*vg)); + mddev->private = vg; + vg->mddev = mddev; + + if (md_check_ordering(mddev)) { + printk("hsm: disks are not ordered, aborting!\n"); + goto out; + } + + set_blocksize (mddev_to_kdev(mddev), HSM_BLOCKSIZE); + + vg->nr_pv = mddev->nb_dev; + ITERATE_RDEV_ORDERED(mddev,rdev,i) { + pv_t *pv = vg->pv_array + i; + + pv->dev = rdev->dev; + fsync_dev (pv->dev); + set_blocksize (pv->dev, HSM_BLOCKSIZE); + pv->phys_nr = i; + if (init_pv(pv)) + goto out; + } + + init_vg(vg); + + return 0; + +out: + if (vg) { + kfree(vg); + mddev->private = NULL; + } + MOD_DEC_USE_COUNT; + + return 1; +} + +static int hsm_stop (mddev_t *mddev) +{ + lv_t *lv; + vg_t *vg; + int i; + + vg = mddev_to_vg(mddev); + + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) { + lv = vg->lv_array + i; + if (!lv->log_id) + continue; + print_lv(lv); + free_lv_indextree(lv); + unmap_lv(lv); + } + for (i = 0; i < vg->nr_pv; i++) + free_pv(vg->pv_array + i); + + kfree(vg); + + MOD_DEC_USE_COUNT; + + return 0; +} + + +static int hsm_status (char *page, mddev_t *mddev) +{ + int sz = 0, i; + lv_t *lv; + vg_t *vg; + + vg = 
mddev_to_vg(mddev); + + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) { + lv = vg->lv_array + i; + if (!lv->log_id) + continue; + sz += sprintf(page+sz, " ", lv->log_id, + lv->max_indices - lv->free_indices, lv->max_indices); + } + return sz; +} + + +static mdk_personality_t hsm_personality= +{ + "hsm", + hsm_map, + NULL, + NULL, + hsm_run, + hsm_stop, + hsm_status, + NULL, + 0, + NULL, + NULL, + NULL, + NULL +}; + +#ifndef MODULE + +md__initfunc(void hsm_init (void)) +{ + register_md_personality (HSM, &hsm_personality); +} + +#else + +int init_module (void) +{ + return (register_md_personality (HSM, &hsm_personality)); +} + +void cleanup_module (void) +{ + unregister_md_personality (HSM); +} + +#endif + +/* + * This Linus-trick catches bugs via the linker. + */ + +extern void __BUG__in__hsm_dot_c_1(void); +extern void __BUG__in__hsm_dot_c_2(void); +extern void __BUG__in__hsm_dot_c_3(void); +extern void __BUG__in__hsm_dot_c_4(void); +extern void __BUG__in__hsm_dot_c_5(void); +extern void __BUG__in__hsm_dot_c_6(void); +extern void __BUG__in__hsm_dot_c_7(void); + +void bugcatcher (void) +{ + if (sizeof(pv_block_group_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_1(); + if (sizeof(lv_index_block_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_2(); + + if (sizeof(pv_sb_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_4(); + if (sizeof(lv_sb_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_3(); + if (sizeof(vg_sb_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_6(); + + if (sizeof(lv_lptr_t) != 16) + __BUG__in__hsm_dot_c_5(); + if (sizeof(pv_pptr_t) != 16) + __BUG__in__hsm_dot_c_6(); +} + diff -ruN linux.orig/drivers/block/linear.c linux-2.2.16/drivers/block/linear.c --- linux.orig/drivers/block/linear.c Sat Nov 8 20:39:12 1997 +++ linux-2.2.16/drivers/block/linear.c Fri Jun 9 11:37:45 2000 @@ -1,4 +1,3 @@ - /* linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc ZYNGIER @@ -19,186 +18,207 @@ #include -#include +#include #include -#include -#include "linear.h" +#include #define MAJOR_NR MD_MAJOR #define MD_DRIVER #define MD_PERSONALITY -static int linear_run (int minor, struct md_dev *mddev) +static int linear_run (mddev_t *mddev) { - int cur=0, i, size, dev0_size, nb_zone; - struct linear_data *data; - - MOD_INC_USE_COUNT; - - mddev->private=kmalloc (sizeof (struct linear_data), GFP_KERNEL); - data=(struct linear_data *) mddev->private; - - /* - Find out the smallest device. This was previously done - at registry time, but since it violates modularity, - I moved it here... Any comment ? ;-) - */ - - data->smallest=mddev->devices; - for (i=1; inb_dev; i++) - if (data->smallest->size > mddev->devices[i].size) - data->smallest=mddev->devices+i; - - nb_zone=data->nr_zones= - md_size[minor]/data->smallest->size + - (md_size[minor]%data->smallest->size ? 1 : 0); - - data->hash_table=kmalloc (sizeof (struct linear_hash)*nb_zone, GFP_KERNEL); - - size=mddev->devices[cur].size; + linear_conf_t *conf; + struct linear_hash *table; + mdk_rdev_t *rdev; + int size, i, j, nb_zone; + unsigned int curr_offset; + + MOD_INC_USE_COUNT; + + conf = kmalloc (sizeof (*conf), GFP_KERNEL); + if (!conf) + goto out; + mddev->private = conf; + + if (md_check_ordering(mddev)) { + printk("linear: disks are not ordered, aborting!\n"); + goto out; + } + /* + * Find the smallest device. 
+ */ + + conf->smallest = NULL; + curr_offset = 0; + ITERATE_RDEV_ORDERED(mddev,rdev,j) { + dev_info_t *disk = conf->disks + j; + + disk->dev = rdev->dev; + disk->size = rdev->size; + disk->offset = curr_offset; + + curr_offset += disk->size; + + if (!conf->smallest || (disk->size < conf->smallest->size)) + conf->smallest = disk; + } + + nb_zone = conf->nr_zones = + md_size[mdidx(mddev)] / conf->smallest->size + + ((md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0); + + conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone, + GFP_KERNEL); + if (!conf->hash_table) + goto out; + + /* + * Here we generate the linear hash table + */ + table = conf->hash_table; + i = 0; + size = 0; + for (j = 0; j < mddev->nb_dev; j++) { + dev_info_t *disk = conf->disks + j; + + if (size < 0) { + table->dev1 = disk; + table++; + } + size += disk->size; + + while (size) { + table->dev0 = disk; + size -= conf->smallest->size; + if (size < 0) + break; + table->dev1 = NULL; + table++; + } + } + table->dev1 = NULL; + + return 0; + +out: + if (conf) + kfree(conf); + MOD_DEC_USE_COUNT; + return 1; +} + +static int linear_stop (mddev_t *mddev) +{ + linear_conf_t *conf = mddev_to_conf(mddev); + + kfree(conf->hash_table); + kfree(conf); - i=0; - while (curnb_dev) - { - data->hash_table[i].dev0=mddev->devices+cur; + MOD_DEC_USE_COUNT; - if (size>=data->smallest->size) /* If we completely fill the slot */ - { - data->hash_table[i++].dev1=NULL; - size-=data->smallest->size; - - if (!size) - { - if (++cur==mddev->nb_dev) continue; - size=mddev->devices[cur].size; - } - - continue; - } - - if (++cur==mddev->nb_dev) /* Last dev, set dev1 as NULL */ - { - data->hash_table[i].dev1=NULL; - continue; - } - - dev0_size=size; /* Here, we use a 2nd dev to fill the slot */ - size=mddev->devices[cur].size; - data->hash_table[i++].dev1=mddev->devices+cur; - size-=(data->smallest->size - dev0_size); - } - - return 0; -} - -static int linear_stop (int minor, struct md_dev *mddev) -{ - struct linear_data *data=(struct linear_data *) mddev->private; - - kfree (data->hash_table); - kfree (data); - - MOD_DEC_USE_COUNT; - - return 0; + return 0; } -static int linear_map (struct md_dev *mddev, kdev_t *rdev, +static int linear_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, unsigned long *rsector, unsigned long size) { - struct linear_data *data=(struct linear_data *) mddev->private; - struct linear_hash *hash; - struct real_dev *tmp_dev; - long block; - - block=*rsector >> 1; - hash=data->hash_table+(block/data->smallest->size); - - if (block >= (hash->dev0->size + hash->dev0->offset)) - { - if (!hash->dev1) - { - printk ("linear_map : hash->dev1==NULL for block %ld\n", block); - return (-1); - } - - tmp_dev=hash->dev1; - } - else - tmp_dev=hash->dev0; + linear_conf_t *conf = mddev_to_conf(mddev); + struct linear_hash *hash; + dev_info_t *tmp_dev; + long block; + + block = *rsector >> 1; + hash = conf->hash_table + (block / conf->smallest->size); + + if (block >= (hash->dev0->size + hash->dev0->offset)) + { + if (!hash->dev1) + { + printk ("linear_map : hash->dev1==NULL for block %ld\n", + block); + return -1; + } + tmp_dev = hash->dev1; + } else + tmp_dev = hash->dev0; - if (block >= (tmp_dev->size + tmp_dev->offset) || block < tmp_dev->offset) - printk ("Block %ld out of bounds on dev %s size %d offset %d\n", - block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset); + if (block >= (tmp_dev->size + tmp_dev->offset) + || block < tmp_dev->offset) + printk ("Block %ld out of bounds on dev %s size %d offset %d\n", + block, 
kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset); - *rdev=tmp_dev->dev; - *rsector=(block-(tmp_dev->offset)) << 1; + *rdev = tmp_dev->dev; + *rsector = (block - tmp_dev->offset) << 1; - return (0); + return 0; } -static int linear_status (char *page, int minor, struct md_dev *mddev) +static int linear_status (char *page, mddev_t *mddev) { - int sz=0; + int sz=0; #undef MD_DEBUG #ifdef MD_DEBUG - int j; - struct linear_data *data=(struct linear_data *) mddev->private; + int j; + linear_conf_t *conf = mddev_to_conf(mddev); - sz+=sprintf (page+sz, " "); - for (j=0; jnr_zones; j++) - { - sz+=sprintf (page+sz, "[%s", - partition_name (data->hash_table[j].dev0->dev)); - - if (data->hash_table[j].dev1) - sz+=sprintf (page+sz, "/%s] ", - partition_name(data->hash_table[j].dev1->dev)); - else - sz+=sprintf (page+sz, "] "); - } - - sz+=sprintf (page+sz, "\n"); + sz += sprintf(page+sz, " "); + for (j = 0; j < conf->nr_zones; j++) + { + sz += sprintf(page+sz, "[%s", + partition_name(conf->hash_table[j].dev0->dev)); + + if (conf->hash_table[j].dev1) + sz += sprintf(page+sz, "/%s] ", + partition_name(conf->hash_table[j].dev1->dev)); + else + sz += sprintf(page+sz, "] "); + } + sz += sprintf(page+sz, "\n"); #endif - sz+=sprintf (page+sz, " %dk rounding", 1<param.chunk_size/1024); + return sz; } -static struct md_personality linear_personality= +static mdk_personality_t linear_personality= { - "linear", - linear_map, - NULL, - NULL, - linear_run, - linear_stop, - linear_status, - NULL, /* no ioctls */ - 0 + "linear", + linear_map, + NULL, + NULL, + linear_run, + linear_stop, + linear_status, + NULL, + 0, + NULL, + NULL, + NULL, + NULL }; - #ifndef MODULE -__initfunc(void linear_init (void)) +md__initfunc(void linear_init (void)) { - register_md_personality (LINEAR, &linear_personality); + register_md_personality (LINEAR, &linear_personality); } #else int init_module (void) { - return (register_md_personality (LINEAR, &linear_personality)); + return (register_md_personality (LINEAR, &linear_personality)); } void cleanup_module (void) { - unregister_md_personality (LINEAR); + unregister_md_personality (LINEAR); } #endif + diff -ruN linux.orig/drivers/block/linear.h linux-2.2.16/drivers/block/linear.h --- linux.orig/drivers/block/linear.h Fri Nov 22 15:07:23 1996 +++ linux-2.2.16/drivers/block/linear.h Thu Jan 1 01:00:00 1970 @@ -1,16 +0,0 @@ -#ifndef _LINEAR_H -#define _LINEAR_H - -struct linear_hash -{ - struct real_dev *dev0, *dev1; -}; - -struct linear_data -{ - struct linear_hash *hash_table; /* Dynamically allocated */ - struct real_dev *smallest; - int nr_zones; -}; - -#endif diff -ruN linux.orig/drivers/block/ll_rw_blk.c linux-2.2.16/drivers/block/ll_rw_blk.c --- linux.orig/drivers/block/ll_rw_blk.c Wed Jun 7 23:26:42 2000 +++ linux-2.2.16/drivers/block/ll_rw_blk.c Fri Jun 9 11:37:45 2000 @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -53,6 +54,11 @@ spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED; /* + * per-major idle-IO detection + */ +unsigned long io_events[MAX_BLKDEV] = {0, }; + +/* * used to wait on when there are no free requests */ struct wait_queue * wait_for_request; @@ -641,6 +647,8 @@ return; /* Maybe the above fixes it, and maybe it doesn't boot. 
Life is interesting */ lock_buffer(bh); + if (!buffer_lowprio(bh)) + io_events[major]++; if (blk_size[major]) { unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1; @@ -892,7 +900,7 @@ bh[i]->b_rsector=bh[i]->b_blocknr*(bh[i]->b_size >> 9); #ifdef CONFIG_BLK_DEV_MD if (major==MD_MAJOR && - md_map (MINOR(bh[i]->b_dev), &bh[i]->b_rdev, + md_map (bh[i]->b_dev, &bh[i]->b_rdev, &bh[i]->b_rsector, bh[i]->b_size >> 9)) { printk (KERN_ERR "Bad md_map in ll_rw_block\n"); @@ -912,7 +920,7 @@ set_bit(BH_Req, &bh[i]->b_state); #ifdef CONFIG_BLK_DEV_MD if (MAJOR(bh[i]->b_dev) == MD_MAJOR) { - md_make_request(MINOR (bh[i]->b_dev), rw, bh[i]); + md_make_request(bh[i], rw); continue; } #endif diff -ruN linux.orig/drivers/block/md.c linux-2.2.16/drivers/block/md.c --- linux.orig/drivers/block/md.c Wed Jun 7 23:26:42 2000 +++ linux-2.2.16/drivers/block/md.c Fri Jun 9 11:43:43 2000 @@ -1,21 +1,17 @@ - /* md.c : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - or - + Copyright (C) 1998, 1999 Ingo Molnar - A lot of inspiration came from hd.c ... + completely rewritten, based on the MD driver code from Marc Zyngier - kerneld support by Boris Tobotras - boot support for linear and striped mode by Harald Hoyer + Changes: - RAID-1/RAID-5 extensions by: - Ingo Molnar, Miguel de Icaza, Gadi Oxman + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - boot support for linear and striped mode by Harald Hoyer + - kerneld support by Boris Tobotras + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher - Changes for kmod by: - Cyrus Durgin - This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) @@ -26,809 +22,3007 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/* - * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so - * the extra system load does not show up that much. Increase it if your - * system can take more. 
- */ -#define SPEED_LIMIT 1024 +#include +#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #ifdef CONFIG_KMOD #include #endif -#include -#include #define __KERNEL_SYSCALLS__ #include +#include + +extern asmlinkage int sys_sched_yield(void); +extern asmlinkage int sys_setsid(void); + +extern unsigned long io_events[MAX_BLKDEV]; + #define MAJOR_NR MD_MAJOR #define MD_DRIVER #include -#include -#include -#include #ifdef CONFIG_MD_BOOT -extern kdev_t name_to_kdev_t(char *line) __init; +extern kdev_t name_to_kdev_t(char *line) md__init; #endif -static struct hd_struct md_hd_struct[MAX_MD_DEV]; -static int md_blocksizes[MAX_MD_DEV]; -int md_maxreadahead[MAX_MD_DEV]; -#if SUPPORT_RECONSTRUCTION -static struct md_thread *md_sync_thread = NULL; -#endif /* SUPPORT_RECONSTRUCTION */ +static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, }; + +/* + * these have to be allocated separately because external + * subsystems want to have a pre-defined structure + */ +struct hd_struct md_hd_struct[MAX_MD_DEVS]; +static int md_blocksizes[MAX_MD_DEVS]; +static int md_maxreadahead[MAX_MD_DEVS]; +static mdk_thread_t *md_recovery_thread = NULL; -int md_size[MAX_MD_DEV]={0, }; +int md_size[MAX_MD_DEVS] = {0, }; static void md_geninit (struct gendisk *); static struct gendisk md_gendisk= { - MD_MAJOR, - "md", - 0, - 1, - MAX_MD_DEV, - md_geninit, - md_hd_struct, - md_size, - MAX_MD_DEV, - NULL, - NULL + MD_MAJOR, + "md", + 0, + 1, + MAX_MD_DEVS, + md_geninit, + md_hd_struct, + md_size, + MAX_MD_DEVS, + NULL, + NULL }; -static struct md_personality *pers[MAX_PERSONALITY]={NULL, }; -struct md_dev md_dev[MAX_MD_DEV]; - -int md_thread(void * arg); +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 100 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. Note that + * the RAID driver will use the maximum available bandwith if the IO + * subsystem is idle. + * + * you can change it via /proc/sys/dev/speed-limit + */ -static struct gendisk *find_gendisk (kdev_t dev) -{ - struct gendisk *tmp=gendisk_head; +static int sysctl_speed_limit = 100; - while (tmp != NULL) - { - if (tmp->major==MAJOR(dev)) - return (tmp); - - tmp=tmp->next; - } +static struct ctl_table_header *md_table_header; - return (NULL); -} +static ctl_table md_table[] = { + {DEV_MD_SPEED_LIMIT, "speed-limit", + &sysctl_speed_limit, sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; -char *partition_name (kdev_t dev) -{ - static char name[40]; /* This should be long - enough for a device name ! */ - struct gendisk *hd = find_gendisk (dev); +static ctl_table md_dir_table[] = { + {DEV_MD, "md", NULL, 0, 0555, md_table}, + {0} +}; - if (!hd) - { - sprintf (name, "[dev %s]", kdevname(dev)); - return (name); - } +static ctl_table md_root_table[] = { + {CTL_DEV, "dev", NULL, 0, 0555, md_dir_table}, + {0} +}; - return disk_name (hd, MINOR(dev), name); /* routine in genhd.c */ +static void md_register_sysctl(void) +{ + md_table_header = register_sysctl_table(md_root_table, 1); } -static int legacy_raid_sb (int minor, int pnum) +void md_unregister_sysctl(void) { - int i, factor; + unregister_sysctl_table(md_table_header); +} + +/* + * The mapping between kdev and mddev is not necessary a simple + * one! Eg. HSM uses several sub-devices to implement Logical + * Volumes. All these sub-devices map to the same mddev. 
+ */ +dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, }; - factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor))); +void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data) +{ + unsigned int minor = MINOR(dev); - /***** - * do size and offset calculations. - */ - for (i=0; i> PERSONALITY_SHIFT) - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev; - return 0; + if (mddev_map[minor].mddev != NULL) { + MD_BUG(); + return; + } + mddev_map[minor].mddev = mddev; + mddev_map[minor].data = data; } -static void free_sb (struct md_dev *mddev) +void del_mddev_mapping (mddev_t * mddev, kdev_t dev) { - int i; - struct real_dev *realdev; + unsigned int minor = MINOR(dev); - if (mddev->sb) { - free_page((unsigned long) mddev->sb); - mddev->sb = NULL; + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return; } - for (i = 0; i nb_dev; i++) { - realdev = mddev->devices + i; - if (realdev->sb) { - free_page((unsigned long) realdev->sb); - realdev->sb = NULL; - } + if (mddev_map[minor].mddev != mddev) { + MD_BUG(); + return; } + mddev_map[minor].mddev = NULL; + mddev_map[minor].data = NULL; } /* - * Check one RAID superblock for generic plausibility + * Enables to iterate over all existing md arrays */ +static MD_LIST_HEAD(all_mddevs); -#define BAD_MAGIC KERN_ERR \ -"md: %s: invalid raid superblock magic (%x) on block %u\n" +static mddev_t * alloc_mddev (kdev_t dev) +{ + mddev_t * mddev; -#define OUT_OF_MEM KERN_ALERT \ -"md: out of memory.\n" + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return 0; + } + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL); + if (!mddev) + return NULL; + + memset(mddev, 0, sizeof(*mddev)); -#define NO_DEVICE KERN_ERR \ -"md: disabled device %s\n" + mddev->__minor = MINOR(dev); + mddev->reconfig_sem = MUTEX; + mddev->recovery_sem = MUTEX; + mddev->resync_sem = MUTEX; + MD_INIT_LIST_HEAD(&mddev->disks); + /* + * The 'base' mddev is the one with data NULL. + * personalities can create additional mddevs + * if necessary. 
+ */ + add_mddev_mapping(mddev, dev, 0); + md_list_add(&mddev->all_mddevs, &all_mddevs); -#define SUCCESS 0 -#define FAILURE -1 + return mddev; +} -static int analyze_one_sb (struct real_dev * rdev) +static void free_mddev (mddev_t *mddev) { - int ret = FAILURE; - struct buffer_head *bh; - kdev_t dev = rdev->dev; - md_superblock_t *sb; + if (!mddev) { + MD_BUG(); + return; + } /* - * Read the superblock, it's at the end of the disk + * Make sure nobody else is using this mddev + * (careful, we rely on the global kernel lock here) */ - rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]); - set_blocksize (dev, MD_SB_BYTES); - bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); - - if (bh) { - sb = (md_superblock_t *) bh->b_data; - if (sb->md_magic != MD_SB_MAGIC) { - printk (BAD_MAGIC, kdevname(dev), - sb->md_magic, rdev->sb_offset); - goto abort; - } - rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL); - if (!rdev->sb) { - printk (OUT_OF_MEM); - goto abort; - } - memcpy (rdev->sb, bh->b_data, MD_SB_BYTES); + while (md_atomic_read(&mddev->resync_sem.count) != 1) + schedule(); + while (md_atomic_read(&mddev->recovery_sem.count) != 1) + schedule(); - rdev->size = sb->size; - } else - printk (NO_DEVICE,kdevname(rdev->dev)); - ret = SUCCESS; -abort: - if (bh) - brelse (bh); - return ret; + del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev))); + md_list_del(&mddev->all_mddevs); + MD_INIT_LIST_HEAD(&mddev->all_mddevs); + kfree(mddev); } -#undef SUCCESS -#undef FAILURE - -#undef BAD_MAGIC -#undef OUT_OF_MEM -#undef NO_DEVICE -/* - * Check a full RAID array for plausibility - */ +struct gendisk * find_gendisk (kdev_t dev) +{ + struct gendisk *tmp = gendisk_head; -#define INCONSISTENT KERN_ERR \ -"md: superblock inconsistency -- run ckraid\n" + while (tmp != NULL) { + if (tmp->major == MAJOR(dev)) + return (tmp); + tmp = tmp->next; + } + return (NULL); +} -#define OUT_OF_DATE KERN_ERR \ -"md: superblock update time inconsistenty -- using the most recent one\n" +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; -#define OLD_VERSION KERN_ALERT \ -"md: %s: unsupported raid array version %d.%d.%d\n" + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} -#define NOT_CLEAN KERN_ERR \ -"md: %s: raid array is not clean -- run ckraid\n" +mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; -#define NOT_CLEAN_IGNORE KERN_ERR \ -"md: %s: raid array is not clean -- reconstructing parity\n" + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->dev == dev) + return rdev; + } + return NULL; +} -#define UNKNOWN_LEVEL KERN_ERR \ -"md: %s: unsupported raid level %d\n" +static MD_LIST_HEAD(device_names); -static int analyze_sbs (int minor, int pnum) +char * partition_name (kdev_t dev) { - struct md_dev *mddev = md_dev + minor; - int i, N = mddev->nb_dev, out_of_date = 0; - struct real_dev * disks = mddev->devices; - md_superblock_t *sb, *freshest = NULL; + struct gendisk *hd; + static char nomem [] = ""; + dev_name_t *dname; + struct md_list_head *tmp = device_names.next; - /* - * RAID-0 and linear don't use a RAID superblock - */ - if (pnum == RAID0 >> PERSONALITY_SHIFT || - pnum == LINEAR >> PERSONALITY_SHIFT) - return legacy_raid_sb (minor, pnum); + while (tmp != &device_names) { + dname = md_list_entry(tmp, dev_name_t, list); + if (dname->dev == dev) + return dname->name; + tmp = tmp->next; + } + + dname = (dev_name_t *) 
kmalloc(sizeof(*dname), GFP_KERNEL); + if (!dname) + return nomem; /* - * Verify the RAID superblock on each real device + * ok, add this new device name to the list */ - for (i = 0; i < N; i++) - if (analyze_one_sb(disks+i)) - goto abort; + hd = find_gendisk (dev); + + if (!hd) + sprintf (dname->name, "[dev %s]", kdevname(dev)); + else + disk_name (hd, MINOR(dev), dname->name); + + dname->dev = dev; + md_list_add(&dname->list, &device_names); + + return dname->name; +} + +static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev, + int persistent) +{ + unsigned int size = 0; + + if (blk_size[MAJOR(dev)]) + size = blk_size[MAJOR(dev)][MINOR(dev)]; + if (persistent) + size = MD_NEW_SIZE_BLOCKS(size); + return size; +} + +static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent) +{ + unsigned int size; + + size = calc_dev_sboffset(dev, mddev, persistent); + if (!mddev->sb) { + MD_BUG(); + return size; + } + if (mddev->sb->chunk_size) + size &= ~(mddev->sb->chunk_size/1024 - 1); + return size; +} + +/* + * We check wether all devices are numbered from 0 to nb_dev-1. The + * order is guaranteed even after device name changes. + * + * Some personalities (raid0, linear) use this. Personalities that + * provide data have to be able to deal with loss of individual + * disks, so they do their checking themselves. + */ +int md_check_ordering (mddev_t *mddev) +{ + int i, c; + mdk_rdev_t *rdev; + struct md_list_head *tmp; /* - * The superblock constant part has to be the same - * for all disks in the array. + * First, all devices must be fully functional */ - sb = NULL; - for (i = 0; i < N; i++) { - if (!disks[i].sb) - continue; - if (!sb) { - sb = disks[i].sb; - continue; - } - if (memcmp(sb, - disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) { - printk (INCONSISTENT); + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk("md: md%d's device %s faulty, aborting.\n", + mdidx(mddev), partition_name(rdev->dev)); goto abort; } } - /* - * OK, we have all disks and the array is ready to run. Let's - * find the freshest superblock, that one will be the superblock - * that represents the whole array. - */ - if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL) + c = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + c++; + } + if (c != mddev->nb_dev) { + MD_BUG(); goto abort; - freshest = NULL; - for (i = 0; i < N; i++) { - if (!disks[i].sb) - continue; - if (!freshest) { - freshest = disks[i].sb; - continue; - } - /* - * Find the newest superblock version - */ - if (disks[i].sb->utime != freshest->utime) { - out_of_date = 1; - if (disks[i].sb->utime > freshest->utime) - freshest = disks[i].sb; - } } - if (out_of_date) - printk(OUT_OF_DATE); - memcpy (sb, freshest, sizeof(*freshest)); - - /* - * Check if we can support this RAID array - */ - if (sb->major_version != MD_MAJOR_VERSION || - sb->minor_version > MD_MINOR_VERSION) { - - printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)), - sb->major_version, sb->minor_version, - sb->patch_version); + if (mddev->nb_dev != mddev->sb->raid_disks) { + printk("md: md%d, array needs %d disks, has %d, aborting.\n", + mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev); goto abort; } - /* - * We need to add this as a superblock option. 
+ * Now the numbering check */ -#if SUPPORT_RECONSTRUCTION - if (sb->state != (1 << MD_SB_CLEAN)) { - if (sb->level == 1) { - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor))); + for (i = 0; i < mddev->nb_dev; i++) { + c = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == i) + c++; + } + if (c == 0) { + printk("md: md%d, missing disk #%d, aborting.\n", + mdidx(mddev), i); goto abort; - } else - printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor))); - } -#else - if (sb->state != (1 << MD_SB_CLEAN)) { - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; - } -#endif /* SUPPORT_RECONSTRUCTION */ - - switch (sb->level) { - case 1: - md_size[minor] = sb->size; - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD; - break; - case 4: - case 5: - md_size[minor] = sb->size * (sb->raid_disks - 1); - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1); - break; - default: - printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)), - sb->level); + } + if (c > 1) { + printk("md: md%d, too many disks #%d, aborting.\n", + mdidx(mddev), i); goto abort; + } } return 0; abort: - free_sb(mddev); return 1; } -#undef INCONSISTENT -#undef OUT_OF_DATE -#undef OLD_VERSION -#undef NOT_CLEAN -#undef OLD_LEVEL - -int md_update_sb(int minor) +static unsigned int zoned_raid_size (mddev_t *mddev) { - struct md_dev *mddev = md_dev + minor; - struct buffer_head *bh; - md_superblock_t *sb = mddev->sb; - struct real_dev *realdev; - kdev_t dev; - int i; - u32 sb_offset; + unsigned int mask; + mdk_rdev_t * rdev; + struct md_list_head *tmp; - sb->utime = CURRENT_TIME; - for (i = 0; i < mddev->nb_dev; i++) { - realdev = mddev->devices + i; - if (!realdev->sb) - continue; - dev = realdev->dev; - sb_offset = realdev->sb_offset; - set_blocksize(dev, MD_SB_BYTES); - printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset); - bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); - if (bh) { - sb = (md_superblock_t *) bh->b_data; - memcpy(sb, mddev->sb, MD_SB_BYTES); - memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4); - mark_buffer_uptodate(bh, 1); - mark_buffer_dirty(bh, 1); - ll_rw_block(WRITE, 1, &bh); - wait_on_buffer(bh); - bforget(bh); - fsync_dev(dev); - invalidate_buffers(dev); - } else - printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev)); + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + /* + * do size and offset calculations. 
+ */ + mask = ~(mddev->sb->chunk_size/1024 - 1); +printk("mask %08x\n", mask); + + ITERATE_RDEV(mddev,rdev,tmp) { +printk(" rdev->size: %d\n", rdev->size); + rdev->size &= mask; +printk(" masked rdev->size: %d\n", rdev->size); + md_size[mdidx(mddev)] += rdev->size; +printk(" new md_size: %d\n", md_size[mdidx(mddev)]); } return 0; } -static int do_md_run (int minor, int repart) +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb) { - int pnum, i, min, factor, err; + if (disk_active(disk)) { + sb->working_disks--; + } else { + if (disk_spare(disk)) { + sb->spare_disks--; + sb->working_disks--; + } else { + sb->failed_disks--; + } + } + sb->nr_disks--; + disk->major = 0; + disk->minor = 0; + mark_disk_removed(disk); +} - if (!md_dev[minor].nb_dev) - return -EINVAL; - - if (md_dev[minor].pers) - return -EBUSY; +#define BAD_MAGIC KERN_ERR \ +"md: invalid raid superblock magic on %s\n" - md_dev[minor].repartition=repart; - - if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT)) - >= MAX_PERSONALITY) - return -EINVAL; - - /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */ - if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){ - for (i = 0; i < md_dev [minor].nb_dev; i++) - if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR) - return -EINVAL; - } - if (!pers[pnum]) - { -#ifdef CONFIG_KMOD - char module_name[80]; - sprintf (module_name, "md-personality-%d", pnum); - request_module (module_name); - if (!pers[pnum]) -#endif - return -EINVAL; - } - - factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor))); - - md_blocksizes[minor] <<= FACTOR_SHIFT(FACTOR((md_dev+minor))); +#define BAD_MINOR KERN_ERR \ +"md: %s: invalid raid minor (%x)\n" - for (i=0; irun (minor, md_dev+minor))) - { - md_dev[minor].pers=NULL; - free_sb(md_dev + minor); - return (err); - } - - if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT) - { - md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN); - md_update_sb(minor); - } - - /* FIXME : We assume here we have blocks - that are twice as large as sectors. - THIS MAY NOT BE TRUE !!! */ - md_hd_struct[minor].start_sect=0; - md_hd_struct[minor].nr_sects=md_size[minor]<<1; - - read_ahead[MD_MAJOR] = 128; - return (0); -} +#define NO_SB KERN_ERR \ +"md: disabled device %s, could not read superblock.\n" + +#define BAD_CSUM KERN_WARNING \ +"md: invalid superblock checksum on %s\n" -static int do_md_stop (int minor, struct inode *inode) +static int alloc_array_sb (mddev_t * mddev) { - int i; - - if (inode->i_count>1 || md_dev[minor].busy>1) { - /* - * ioctl : one open channel - */ - printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n", - minor, inode->i_count, md_dev[minor].busy); - return -EBUSY; - } - - if (md_dev[minor].pers) { - /* - * It is safe to call stop here, it only frees private - * data. Also, it tells us if a device is unstoppable - * (eg. resyncing is in progress) - */ - if (md_dev[minor].pers->stop (minor, md_dev+minor)) - return -EBUSY; - /* - * The device won't exist anymore -> flush it now - */ - fsync_dev (inode->i_rdev); - invalidate_buffers (inode->i_rdev); - if (md_dev[minor].sb) { - md_dev[minor].sb->state |= 1 << MD_SB_CLEAN; - md_update_sb(minor); - } + if (mddev->sb) { + MD_BUG(); + return 0; } - - /* Remove locks. 
*/ - if (md_dev[minor].sb) - free_sb(md_dev + minor); - for (i=0; isb = (mdp_super_t *) __get_free_page (GFP_KERNEL); + if (!mddev->sb) + return -ENOMEM; + md_clear_page((unsigned long)mddev->sb); + return 0; } -static int do_md_add (int minor, kdev_t dev) +static int alloc_disk_sb (mdk_rdev_t * rdev) { - int i; - int hot_add=0; - struct real_dev *realdev; + if (rdev->sb) + MD_BUG(); - if (md_dev[minor].nb_dev==MAX_REAL) + rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL); + if (!rdev->sb) { + printk (OUT_OF_MEM); return -EINVAL; + } + md_clear_page((unsigned long)rdev->sb); - if (!fs_may_mount (dev)) - return -EBUSY; + return 0; +} - if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) { - printk("md_add(): zero device size, huh, bailing out.\n"); - return -EINVAL; +static void free_disk_sb (mdk_rdev_t * rdev) +{ + if (rdev->sb) { + free_page((unsigned long) rdev->sb); + rdev->sb = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } else { + if (!rdev->faulty) + MD_BUG(); } +} - if (md_dev[minor].pers) { - /* - * The array is already running, hot-add the drive, or - * bail out: - */ - if (!md_dev[minor].pers->hot_add_disk) - return -EBUSY; - else - hot_add=1; +static void mark_rdev_faulty (mdk_rdev_t * rdev) +{ + unsigned long flags; + + if (!rdev) { + MD_BUG(); + return; } + save_flags(flags); + cli(); + free_disk_sb(rdev); + rdev->faulty = 1; + restore_flags(flags); +} + +static int read_disk_sb (mdk_rdev_t * rdev) +{ + int ret = -EINVAL; + struct buffer_head *bh = NULL; + kdev_t dev = rdev->dev; + mdp_super_t *sb; + u32 sb_offset; + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + /* - * Careful. We cannot increase nb_dev for a running array. + * Calculate the position of the superblock, + * it's at the end of the disk */ - i=md_dev[minor].nb_dev; - realdev = &md_dev[minor].devices[i]; - realdev->dev=dev; - - /* Lock the device by inserting a dummy inode. This doesn't - smell very good, but I need to be consistent with the - mount stuff, specially with fs_may_mount. If someone have - a better idea, please help ! */ - - realdev->inode=get_empty_inode (); - realdev->inode->i_dev=dev; /* don't care about other fields */ - insert_inode_hash (realdev->inode); - - /* Sizes are now rounded at run time */ - -/* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/ - - realdev->size=blk_size[MAJOR(dev)][MINOR(dev)]; + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1); + rdev->sb_offset = sb_offset; + printk("(read) %s's sb offset: %d", partition_name(dev), + sb_offset); + fsync_dev(dev); + set_blocksize (dev, MD_SB_BYTES); + bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); - if (hot_add) { - /* - * Check the superblock for consistency. - * The personality itself has to check whether it's getting - * added with the proper flags. The personality has to be - * checked too. 
;) + if (bh) { + sb = (mdp_super_t *) bh->b_data; + memcpy (rdev->sb, sb, MD_SB_BYTES); + } else { + printk (NO_SB,partition_name(rdev->dev)); + goto abort; + } + printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events)); + ret = 0; +abort: + if (bh) + brelse (bh); + return ret; +} + +static unsigned int calc_sb_csum (mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Check one RAID superblock for generic plausibility + */ + +static int check_disk_sb (mdk_rdev_t * rdev) +{ + mdp_super_t *sb; + int ret = -EINVAL; + + sb = rdev->sb; + if (!sb) { + MD_BUG(); + goto abort; + } + + if (sb->md_magic != MD_SB_MAGIC) { + printk (BAD_MAGIC, partition_name(rdev->dev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk (BAD_MINOR, partition_name(rdev->dev), + sb->md_minor); + goto abort; + } + + if (calc_sb_csum(sb) != sb->sb_csum) + printk(BAD_CSUM, partition_name(rdev->dev)); + ret = 0; +abort: + return ret; +} + +static kdev_t dev_unit(kdev_t dev) +{ + unsigned int mask; + struct gendisk *hd = find_gendisk(dev); + + if (!hd) + return 0; + mask = ~((1 << hd->minor_shift) - 1); + + return MKDEV(MAJOR(dev), MINOR(dev) & mask); +} + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (dev_unit(rdev->dev) == dev_unit(dev)) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev->dev)) + return 1; + + return 0; +} + +static MD_LIST_HEAD(all_raid_disks); +static MD_LIST_HEAD(pending_raid_disks); + +static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return; + } + same_pdev = match_dev_unit(mddev, rdev->dev); + if (same_pdev) + printk( KERN_WARNING +"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n" +" protection against single-disk failure might be compromised.\n", + mdidx(mddev), partition_name(rdev->dev), + partition_name(same_pdev->dev)); + + md_list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + mddev->nb_dev++; + printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev); +} + +static void unbind_rdev_from_array (mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + md_list_del(&rdev->same_set); + MD_INIT_LIST_HEAD(&rdev->same_set); + rdev->mddev->nb_dev--; + printk("unbind<%s,%d>\n", partition_name(rdev->dev), + rdev->mddev->nb_dev); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. [simply getting an + * inode is not enough, the SCSI module usage code needs + * an explicit open() on the device] + */ +static int lock_rdev (mdk_rdev_t *rdev) +{ + int err = 0; + + /* + * First insert a dummy inode. 
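calc_sb_csum() clears the stored sb_csum field before summing the block so the checksum does not cover itself, then puts the on-disk value back. The real code uses the kernel's csum_partial(); the simplified stand-in below only demonstrates the zero-then-sum-then-restore pattern and is not compatible with it:

#include <stddef.h>

struct demo_sb {
	unsigned int data[1023];
	unsigned int sb_csum;
};

static unsigned int demo_sb_csum(struct demo_sb *sb)
{
	const unsigned char *p = (const unsigned char *) sb;
	unsigned int saved = sb->sb_csum, csum = 0;
	size_t i;

	sb->sb_csum = 0;		/* summed as zero, so the stored value does not affect the sum */
	for (i = 0; i < sizeof(*sb); i++)
		csum += p[i];
	sb->sb_csum = saved;		/* restore what was read from disk */
	return csum;
}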
+ */ + if (rdev->inode) + MD_BUG(); + rdev->inode = get_empty_inode(); + /* + * we dont care about any other fields + */ + rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev; + insert_inode_hash(rdev->inode); + + memset(&rdev->filp, 0, sizeof(rdev->filp)); + rdev->filp.f_mode = 3; /* read write */ + err = blkdev_open(rdev->inode, &rdev->filp); + if (err) { + printk("blkdev_open() failed: %d\n", err); + clear_inode(rdev->inode); + rdev->inode = NULL; + } + return err; +} + +static void unlock_rdev (mdk_rdev_t *rdev) +{ + blkdev_release(rdev->inode); + if (!rdev->inode) + MD_BUG(); + clear_inode(rdev->inode); + rdev->inode = NULL; +} + +static void export_rdev (mdk_rdev_t * rdev) +{ + printk("export_rdev(%s)\n",partition_name(rdev->dev)); + if (rdev->mddev) + MD_BUG(); + unlock_rdev(rdev); + free_disk_sb(rdev); + md_list_del(&rdev->all); + MD_INIT_LIST_HEAD(&rdev->all); + if (rdev->pending.next != &rdev->pending) { + printk("(%s was pending)\n",partition_name(rdev->dev)); + md_list_del(&rdev->pending); + MD_INIT_LIST_HEAD(&rdev->pending); + } + rdev->dev = 0; + rdev->faulty = 0; + kfree(rdev); +} + +static void kick_rdev_from_array (mdk_rdev_t * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array (mddev_t *mddev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + mdp_super_t *sb = mddev->sb; + + if (mddev->sb) { + mddev->sb = NULL; + free_page((unsigned long) sb); + } + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (mddev->nb_dev) + MD_BUG(); +} + +#undef BAD_CSUM +#undef BAD_MAGIC +#undef OUT_OF_MEM +#undef NO_SB + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK\n", desc->number, + partition_name(MKDEV(desc->major,desc->minor)), + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level, + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor, + sb->layout, sb->chunk_size); + printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, (unsigned long)get_unaligned(&sb->events)); + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + desc = sb->disks + i; + printk(" D %2d: ", i); + print_desc(desc); + } + printk(" THIS: "); + print_desc(&sb->this_disk); + +} + +static void print_rdev(mdk_rdev_t *rdev) +{ + printk(" rdev %s: O:%s, SZ:%08d F:%d DN:%d ", + partition_name(rdev->dev), partition_name(rdev->old_dev), + rdev->size, rdev->faulty, rdev->desc_nr); + if (rdev->sb) { + printk("rdev superblock:\n"); + print_sb(rdev->sb); + } else + printk("no rdev superblock!\n"); +} + +void md_print_devices (void) +{ + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + printk("\n"); + printk(" **********************************\n"); + printk(" * *\n"); + printk(" **********************************\n"); + ITERATE_MDDEV(mddev,tmp) { + printk("md%d: ", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + printk("<%s>", partition_name(rdev->dev)); + + if (mddev->sb) { + printk(" array superblock:\n"); + print_sb(mddev->sb); + } else + printk(" no array superblock.\n"); + + ITERATE_RDEV(mddev,rdev,tmp2) + 
print_rdev(rdev); + } + printk(" **********************************\n"); + printk("\n"); +} + +static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); + + return ret; +} + +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2) +{ + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) && + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) && + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) && + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3)) + + return 1; + + return 0; +} + +static mdk_rdev_t * find_rdev_all (kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + tmp = all_raid_disks.next; + while (tmp != &all_raid_disks) { + rdev = md_list_entry(tmp, mdk_rdev_t, all); + if (rdev->dev == dev) + return rdev; + tmp = tmp->next; + } + return NULL; +} + +#define GETBLK_FAILED KERN_ERR \ +"md: getblk failed for device %s\n" + +static int write_disk_sb(mdk_rdev_t * rdev) +{ + struct buffer_head *bh; + kdev_t dev; + u32 sb_offset, size; + mdp_super_t *sb; + + if (!rdev->sb) { + MD_BUG(); + return -1; + } + if (rdev->faulty) { + MD_BUG(); + return -1; + } + if (rdev->sb->md_magic != MD_SB_MAGIC) { + MD_BUG(); + return -1; + } + + dev = rdev->dev; + sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1); + if (rdev->sb_offset != sb_offset) { + printk("%s's sb offset has changed from %d to %d, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset); + goto skip; + } + /* + * If the disk went offline meanwhile and it's just a spare, then + * it's size has changed to zero silently, and the MD code does + * not yet know that it's faulty. 
+ */ + size = calc_dev_size(dev, rdev->mddev, 1); + if (size != rdev->size) { + printk("%s's size has changed from %d to %d since import, skipping\n", partition_name(dev), rdev->size, size); + goto skip; + } + + printk("(write) %s's sb offset: %d\n", partition_name(dev), sb_offset); + fsync_dev(dev); + set_blocksize(dev, MD_SB_BYTES); + bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); + if (!bh) { + printk(GETBLK_FAILED, partition_name(dev)); + return 1; + } + memset(bh->b_data,0,bh->b_size); + sb = (mdp_super_t *) bh->b_data; + memcpy(sb, rdev->sb, MD_SB_BYTES); + + mark_buffer_uptodate(bh, 1); + mark_buffer_dirty(bh, 1); + ll_rw_block(WRITE, 1, &bh); + wait_on_buffer(bh); + brelse(bh); + fsync_dev(dev); +skip: + return 0; +} +#undef GETBLK_FAILED KERN_ERR + +static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ + int i, ok = 0; + mdp_disk_t *desc; + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = mddev->sb->disks + i; +#if 0 + if (disk_faulty(desc)) { + if (MKDEV(desc->major,desc->minor) == rdev->dev) + ok = 1; + continue; + } +#endif + if (MKDEV(desc->major,desc->minor) == rdev->dev) { + rdev->sb->this_disk = *desc; + rdev->desc_nr = desc->number; + ok = 1; + break; + } + } + + if (!ok) { + MD_BUG(); + } +} + +static int sync_sbs(mddev_t * mddev) +{ + mdk_rdev_t *rdev; + mdp_super_t *sb; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + sb = rdev->sb; + *sb = *mddev->sb; + set_this_disk(mddev, rdev); + sb->sb_csum = calc_sb_csum(sb); + } + return 0; +} + +int md_update_sb(mddev_t * mddev) +{ + int first, err, count = 100; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + __u64 ev; + +repeat: + mddev->sb->utime = CURRENT_TIME; + ev = get_unaligned(&mddev->sb->events); + ++ev; + put_unaligned(ev,&mddev->sb->events); + if (ev == (__u64)0) { + /* + * oops, this 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug: + */ + MD_BUG(); + --ev; + put_unaligned(ev,&mddev->sb->events); + } + sync_sbs(mddev); + + /* + * do not write anything to disk if using + * nonpersistent superblocks + */ + if (mddev->sb->not_persistent) + return 0; + + printk(KERN_INFO "md: updating md%d RAID superblock on device\n", + mdidx(mddev)); + + first = 1; + err = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (!first) { + first = 0; + printk(", "); + } + if (rdev->faulty) + printk("(skipping faulty "); + printk("%s ", partition_name(rdev->dev)); + if (!rdev->faulty) { + printk("[events: %08lx]", + (unsigned long)get_unaligned(&rdev->sb->events)); + err += write_disk_sb(rdev); + } else + printk(")\n"); + } + printk(".\n"); + if (err) { + printk("errors occured during superblock update, repeating\n"); + if (--count) + goto repeat; + printk("excessive errors occured during superblock update, exiting\n"); + } + return 0; +} + +/* + * Import a device. If 'on_disk', then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + * a faulty rdev _never_ has rdev->sb set. 
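md_update_sb() above stamps every superblock write with a 64-bit event count; analyze_sbs() later uses it to decide which member has the freshest view of the array. A small sketch of the update rule, including the wrap-around guard that in practice can never trigger:

typedef unsigned long long demo_ev_t;

/* Advance the superblock event counter; never let it wrap back to zero. */
static demo_ev_t bump_events(demo_ev_t events)
{
	events++;
	if (events == 0)	/* would need ~1.8e19 updates; treated as a bug */
		events--;
	return events;
}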
+ */ +static int md_import_device (kdev_t newdev, int on_disk) +{ + int err; + mdk_rdev_t *rdev; + unsigned int size; + + if (find_rdev_all(newdev)) + return -EEXIST; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk("could not alloc mem for %s!\n", partition_name(newdev)); + return -ENOMEM; + } + memset(rdev, 0, sizeof(*rdev)); + + if (!fs_may_mount(newdev)) { + printk("md: can not import %s, has active inodes!\n", + partition_name(newdev)); + err = -EBUSY; + goto abort_free; + } + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + rdev->dev = newdev; + if (lock_rdev(rdev)) { + printk("md: could not lock %s, zero-size? Marking faulty.\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + + size = 0; + if (blk_size[MAJOR(newdev)]) + size = blk_size[MAJOR(newdev)][MINOR(newdev)]; + if (!size) { + printk("md: %s has zero size, marking faulty!\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + + if (on_disk) { + if ((err = read_disk_sb(rdev))) { + printk("md: could not read %s's sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + if ((err = check_disk_sb(rdev))) { + printk("md: %s has invalid sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + + rdev->old_dev = MKDEV(rdev->sb->this_disk.major, + rdev->sb->this_disk.minor); + rdev->desc_nr = rdev->sb->this_disk.number; + } + md_list_add(&rdev->all, &all_raid_disks); + MD_INIT_LIST_HEAD(&rdev->pending); + + if (rdev->faulty && rdev->sb) + free_disk_sb(rdev); + return 0; + +abort_free: + if (rdev->sb) { + if (rdev->inode) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return err; +} + +/* + * Check a full RAID array for plausibility + */ + +#define INCONSISTENT KERN_ERR \ +"md: fatal superblock inconsistency in %s -- removing from array\n" + +#define OUT_OF_DATE KERN_ERR \ +"md: superblock update time inconsistency -- using the most recent one\n" + +#define OLD_VERSION KERN_ALERT \ +"md: md%d: unsupported raid array version %d.%d.%d\n" + +#define NOT_CLEAN_IGNORE KERN_ERR \ +"md: md%d: raid array is not clean -- starting background reconstruction\n" + +#define UNKNOWN_LEVEL KERN_ERR \ +"md: md%d: unsupported raid level %d\n" + +static int analyze_sbs (mddev_t * mddev) +{ + int out_of_date = 0, i; + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev, *rdev2, *freshest; + mdp_super_t *sb; + + /* + * Verify the RAID superblock on each real device + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + MD_BUG(); + goto abort; + } + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + if (check_disk_sb(rdev)) + goto abort; + } + + /* + * The superblock constant part has to be the same + * for all disks in the array. + */ + sb = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!sb) { + sb = rdev->sb; + continue; + } + if (!sb_equal(sb, rdev->sb)) { + printk (INCONSISTENT, partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * OK, we have all disks and the array is ready to run. Let's + * find the freshest superblock, that one will be the superblock + * that represents the whole array. + */ + if (!mddev->sb) + if (alloc_array_sb(mddev)) + goto abort; + sb = mddev->sb; + freshest = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2; + /* + * if the checksum is invalid, use the superblock + * only as a last resort. 
(decrease it's age by + * one event) + */ + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { + __u64 ev = get_unaligned(&rdev->sb->events); + if (ev != (__u64)0) { + --ev; + put_unaligned(ev,&rdev->sb->events); + } + } + + printk("%s's event counter: %08lx\n", partition_name(rdev->dev), + (unsigned long)get_unaligned(&rdev->sb->events)); + if (!freshest) { + freshest = rdev; + continue; + } + /* + * Find the newest superblock version + */ + ev1 = get_unaligned(&rdev->sb->events); + ev2 = get_unaligned(&freshest->sb->events); + if (ev1 != ev2) { + out_of_date = 1; + if (ev1 > ev2) + freshest = rdev; + } + } + if (out_of_date) { + printk(OUT_OF_DATE); + printk("freshest: %s\n", partition_name(freshest->dev)); + } + memcpy (sb, freshest->sb, sizeof(*sb)); + + /* + * at this point we have picked the 'best' superblock + * from all available superblocks. + * now we validate this superblock and kick out possibly + * failed disks. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Kick all non-fresh devices faulty + */ + __u64 ev1, ev2; + ev1 = get_unaligned(&rdev->sb->events); + ev2 = get_unaligned(&sb->events); + ++ev1; + if (ev1 < ev2) { + printk("md: kicking non-fresh %s from array!\n", + partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * Fix up changed device names ... but only if this disk has a + * recent update time. Use faulty checksum ones too. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2, ev3; + if (rdev->faulty) { /* REMOVEME */ + MD_BUG(); + goto abort; + } + ev1 = get_unaligned(&rdev->sb->events); + ev2 = get_unaligned(&sb->events); + ev3 = ev2; + --ev3; + if ((rdev->dev != rdev->old_dev) && + ((ev1 == ev2) || (ev1 == ev3))) { + mdp_disk_t *desc; + + printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev)); + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + desc = &sb->disks[rdev->desc_nr]; + if (rdev->old_dev != MKDEV(desc->major, desc->minor)) { + MD_BUG(); + goto abort; + } + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + desc = &rdev->sb->this_disk; + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + } + } + + /* + * Remove unavailable and faulty devices ... + * + * note that if an array becomes completely unrunnable due to + * missing devices, we do not write the superblock back, so the + * administrator has a chance to fix things up. The removal thus + * only happens if it's nonfatal to the contents of the array. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + int found; + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + /* + * We kick faulty devices/descriptors immediately. + */ + if (disk_faulty(desc)) { + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr != desc->number) + continue; + printk("md%d: kicking faulty %s!\n", + mdidx(mddev),partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + found = 1; + break; + } + if (!found) { + if (dev == MKDEV(0,0)) + continue; + printk("md%d: removing former faulty %s!\n", + mdidx(mddev), partition_name(dev)); + } + remove_descriptor(desc, sb); + continue; + } + + if (dev == MKDEV(0,0)) + continue; + /* + * Is this device present in the rdev ring? 
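A member survives this pass only if its own event count is equal to, or at most one behind, the event count of the chosen superblock; the one-behind case allows for a crash between the two superblock write passes. The test, isolated (names are illustrative):

/* Nonzero if a member with event count 'ev' is fresh enough relative to the
 * array superblock event count 'array_ev'. */
static int member_is_fresh(unsigned long long ev, unsigned long long array_ev)
{
	return ev + 1 >= array_ev;	/* equal, or exactly one update behind */
}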
+ */ + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == desc->number) { + found = 1; + break; + } + } + if (found) + continue; + + printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev)); + remove_descriptor(desc, sb); + } + + /* + * Double check wether all devices mentioned in the + * superblock are in the rdev ring. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + + if (disk_faulty(desc)) { + MD_BUG(); + goto abort; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + } + + /* + * Do a final reality check. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + /* + * is the desc_nr unique? + */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->desc_nr == rdev->desc_nr)) { + MD_BUG(); + goto abort; + } + } + /* + * is the device unique? + */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->dev == rdev->dev)) { + MD_BUG(); + goto abort; + } + } + } + + /* + * Check if we can support this RAID array + */ + if (sb->major_version != MD_MAJOR_VERSION || + sb->minor_version > MD_MINOR_VERSION) { + + printk (OLD_VERSION, mdidx(mddev), sb->major_version, + sb->minor_version, sb->patch_version); + goto abort; + } + + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) || + (sb->level == 4) || (sb->level == 5))) + printk (NOT_CLEAN_IGNORE, mdidx(mddev)); + + return 0; +abort: + return 1; +} + +#undef INCONSISTENT +#undef OUT_OF_DATE +#undef OLD_VERSION +#undef OLD_LEVEL + +static int device_size_calculation (mddev_t * mddev) +{ + int data_disks = 0, persistent; + unsigned int readahead; + mdp_super_t *sb = mddev->sb; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. 
+ * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + persistent = !mddev->sb->not_persistent; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size) { + MD_BUG(); + continue; + } + rdev->size = calc_dev_size(rdev->dev, mddev, persistent); + if (rdev->size < sb->chunk_size / 1024) { + printk (KERN_WARNING + "Dev %s smaller than chunk_size: %dk < %dk\n", + partition_name(rdev->dev), + rdev->size, sb->chunk_size / 1024); + return -EINVAL; + } + } + + switch (sb->level) { + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case -1: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = sb->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = sb->raid_disks-1; + break; + default: + printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level); + goto abort; + } + if (!md_size[mdidx(mddev)]) + md_size[mdidx(mddev)] = sb->size * data_disks; + + readahead = MD_READAHEAD; + if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) + readahead = mddev->sb->chunk_size * 4 * data_disks; + if (readahead < data_disks * MAX_SECTORS*512*2) + readahead = data_disks * MAX_SECTORS*512*2; + else { + if (sb->level == -3) + readahead = 0; + } + md_maxreadahead[mdidx(mddev)] = readahead; + + printk(KERN_INFO "md%d: max total readahead window set to %dk\n", + mdidx(mddev), readahead/1024); + + printk(KERN_INFO + "md%d: %d data-disks, max readahead per data-disk: %dk\n", + mdidx(mddev), data_disks, readahead/data_disks/1024); + return 0; +abort: + return 1; +} + + +#define TOO_BIG_CHUNKSIZE KERN_ERR \ +"too big chunk_size: %d > %d\n" + +#define TOO_SMALL_CHUNKSIZE KERN_ERR \ +"too small chunk_size: %d < %ld\n" + +#define BAD_CHUNKSIZE KERN_ERR \ +"no chunksize specified, see 'man raidtab'\n" + +static int do_md_run (mddev_t * mddev) +{ + int pnum, err; + int chunk_size; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + + if (!mddev->nb_dev) { + MD_BUG(); + return -EINVAL; + } + + if (mddev->pers) + return -EBUSY; + + /* + * Resize disks to align partitions size on a given + * chunk size. + */ + md_size[mdidx(mddev)] = 0; + + /* + * Analyze all RAID superblock(s) + */ + if (analyze_sbs(mddev)) { + MD_BUG(); + return -EINVAL; + } + + chunk_size = mddev->sb->chunk_size; + pnum = level_to_pers(mddev->sb->level); + + mddev->param.chunk_size = chunk_size; + mddev->param.personality = pnum; + + if (chunk_size > MAX_CHUNK_SIZE) { + printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE); + return -EINVAL; + } + /* + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE + */ + if ( (1 << ffz(~chunk_size)) != chunk_size) { + MD_BUG(); + return -EINVAL; + } + if (chunk_size < PAGE_SIZE) { + printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE); + return -EINVAL; + } + + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) { + /* + * 'default chunksize' in the old md code used to + * be PAGE_SIZE, baaad. + * we abort here to be on the safe side. We dont + * want to continue the bad practice. 
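Except for RAID-1 and linear mode, do_md_run() requires an explicit chunk size that is a power of two, at least PAGE_SIZE and at most MAX_CHUNK_SIZE; the patch tests the power-of-two property via ffz(). An equivalent stand-alone check, with the limits passed in because their kernel values are not reproduced here:

/* Nonzero if 'chunk' (bytes) is an acceptable chunk size. */
static int chunk_size_ok(unsigned int chunk, unsigned int page_size,
			 unsigned int max_chunk)
{
	if (chunk == 0 || (chunk & (chunk - 1)))	/* zero or not a power of two */
		return 0;
	return chunk >= page_size && chunk <= max_chunk;
}
/* e.g. chunk_size_ok(64 * 1024, 4096, 4 * 1024 * 1024) returns 1 */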
+ */ + printk(BAD_CHUNKSIZE); + return -EINVAL; + } + + if (!pers[pnum]) + { +#ifdef CONFIG_KMOD + char module_name[80]; + sprintf (module_name, "md-personality-%d", pnum); + request_module (module_name); + if (!pers[pnum]) +#endif + return -EINVAL; + } + + if (device_size_calculation(mddev)) + return -EINVAL; + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + fsync_dev(rdev->dev); + invalidate_buffers(rdev->dev); + } + + mddev->pers = pers[pnum]; + + err = mddev->pers->run(mddev); + if (err) { + printk("pers->run() failed ...\n"); + mddev->pers = NULL; + return -EINVAL; + } + + mddev->sb->state &= ~(1 << MD_SB_CLEAN); + md_update_sb(mddev); + + /* + * md_size has units of 1K blocks, which are + * twice as large as sectors. + */ + md_hd_struct[mdidx(mddev)].start_sect = 0; + md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1; + + read_ahead[MD_MAJOR] = 1024; + return (0); +} + +#undef TOO_BIG_CHUNKSIZE +#undef BAD_CHUNKSIZE + +#define OUT(x) do { err = (x); goto out; } while (0) + +static int restart_array (mddev_t *mddev) +{ + int err = 0; + + /* + * Complain if it has no devices + */ + if (!mddev->nb_dev) + OUT(-ENXIO); + + if (mddev->pers) { + if (!mddev->ro) + OUT(-EBUSY); + + mddev->ro = 0; + set_device_ro(mddev_to_kdev(mddev), 0); + + printk (KERN_INFO + "md%d switched to read-write mode.\n", mdidx(mddev)); + /* + * Kick recovery or resync if necessary + */ + md_recover_arrays(); + if (mddev->pers->restart_resync) + mddev->pers->restart_resync(mddev); + } else + err = -EINVAL; + +out: + return err; +} + +#define STILL_MOUNTED KERN_WARNING \ +"md: md%d still mounted.\n" + +static int do_md_stop (mddev_t * mddev, int ro) +{ + int err = 0, resync_interrupted = 0; + kdev_t dev = mddev_to_kdev(mddev); + + if (!ro && !fs_may_mount (dev)) { + printk (STILL_MOUNTED, mdidx(mddev)); + OUT(-EBUSY); + } + + /* + * complain if it's already stopped + */ + if (!mddev->nb_dev) + OUT(-ENXIO); + + if (mddev->pers) { + /* + * It is safe to call stop here, it only frees private + * data. Also, it tells us if a device is unstoppable + * (eg. resyncing is in progress) + */ + if (mddev->pers->stop_resync) + if (mddev->pers->stop_resync(mddev)) + resync_interrupted = 1; + + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + + /* + * This synchronizes with signal delivery to the + * resync or reconstruction thread. It also nicely + * hangs the process if some reconstruction has not + * finished. + */ + down(&mddev->recovery_sem); + up(&mddev->recovery_sem); + + /* + * sync and invalidate buffers because we cannot kill the + * main thread with valid IO transfers still around. + * the kernel lock protects us from new requests being + * added after invalidate_buffers(). + */ + fsync_dev (mddev_to_kdev(mddev)); + fsync_dev (dev); + invalidate_buffers (dev); + + if (ro) { + if (mddev->ro) + OUT(-ENXIO); + mddev->ro = 1; + } else { + if (mddev->ro) + set_device_ro(dev, 0); + if (mddev->pers->stop(mddev)) { + if (mddev->ro) + set_device_ro(dev, 1); + OUT(-EBUSY); + } + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->sb) { + /* + * mark it clean only if there was no resync + * interrupted. 
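The clean bit follows a simple protocol: do_md_run() clears it (and writes the superblocks) as soon as the array accepts writes, and do_md_stop() sets it again only after an orderly stop with no interrupted resync, so a crash leaves it off and forces background reconstruction on the next start. The state change in isolation, with the bit number assumed to match MD_SB_CLEAN:

#define DEMO_SB_CLEAN 0

static void demo_array_started(unsigned int *state)
{
	*state &= ~(1u << DEMO_SB_CLEAN);	/* dirty while writable */
}

static void demo_array_stopped(unsigned int *state, int resync_interrupted)
{
	if (!resync_interrupted)
		*state |= 1u << DEMO_SB_CLEAN;	/* safe to skip resync next time */
}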
+ */ + if (!mddev->recovery_running && !resync_interrupted) { + printk("marking sb clean...\n"); + mddev->sb->state |= 1 << MD_SB_CLEAN; + } + md_update_sb(mddev); + } + if (ro) + set_device_ro(dev, 1); + } + + /* + * Free resources if final stop + */ + if (!ro) { + export_array(mddev); + md_size[mdidx(mddev)] = 0; + md_hd_struct[mdidx(mddev)].nr_sects = 0; + free_mddev(mddev); + + printk (KERN_INFO "md%d stopped.\n", mdidx(mddev)); + } else + printk (KERN_INFO + "md%d switched to read-only mode.\n", mdidx(mddev)); +out: + return err; +} + +#undef OUT + +/* + * We have to safely support old arrays too. + */ +int detect_old_array (mdp_super_t *sb) +{ + if (sb->major_version > 0) + return 0; + if (sb->minor_version >= 90) + return 0; + + return -EINVAL; +} + + +static void autorun_array (mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int err; + + if (mddev->disks.prev == &mddev->disks) { + MD_BUG(); + return; + } + + printk("running: "); + + ITERATE_RDEV(mddev,rdev,tmp) { + printk("<%s>", partition_name(rdev->dev)); + } + printk("\nnow!\n"); + + err = do_md_run (mddev); + if (err) { + printk("do_md_run() returned %d\n", err); + /* + * prevent the writeback of an unrunnable array + */ + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in the ->pending list) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. + */ +static void autorun_devices (void) +{ + struct md_list_head candidates; + struct md_list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + kdev_t md_kdev; + + + printk("autorun ...\n"); + while (pending_raid_disks.next != &pending_raid_disks) { + rdev0 = md_list_entry(pending_raid_disks.next, + mdk_rdev_t, pending); + + printk("considering %s ...\n", partition_name(rdev0->dev)); + MD_INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) { + if (uuid_equal(rdev0, rdev)) { + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev)); + continue; + } + printk(" adding %s ...\n", partition_name(rdev->dev)); + md_list_del(&rdev->pending); + md_list_add(&rdev->pending, &candidates); + } + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. */ - if (analyze_one_sb (realdev)) + md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor); + mddev = kdev_to_mddev(md_kdev); + if (mddev) { + printk("md%d already running, cannot run %s\n", + mdidx(mddev), partition_name(rdev0->dev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) + export_rdev(rdev); + continue; + } + mddev = alloc_mddev(md_kdev); + printk("created md%d\n", mdidx(mddev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) { + bind_rdev_to_array(rdev, mddev); + md_list_del(&rdev->pending); + MD_INIT_LIST_HEAD(&rdev->pending); + } + autorun_array(mddev); + } + printk("... autorun DONE.\n"); +} + +/* + * import RAID devices based on one partition + * if possible, the array gets run as well. 
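autorun_devices() groups the pending disks by their 128-bit set UUID: the first pending disk defines the set, every other pending disk with the same UUID (and an equal superblock) is pulled into the candidate list, and the list then becomes the array named by sb->md_minor. The grouping test is just a field-by-field compare, as in uuid_equal():

struct demo_uuid { unsigned int u0, u1, u2, u3; };

/* Nonzero if two members claim to belong to the same array. */
static int demo_same_set(const struct demo_uuid *a, const struct demo_uuid *b)
{
	return a->u0 == b->u0 && a->u1 == b->u1 &&
	       a->u2 == b->u2 && a->u3 == b->u3;
}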
+ */ + +#define BAD_VERSION KERN_ERR \ +"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_DEVICE KERN_ERR \ +"md: disabled device %s\n" + +#define AUTOADD_FAILED KERN_ERR \ +"md: auto-adding devices to md%d FAILED (error %d).\n" + +#define AUTOADD_FAILED_USED KERN_ERR \ +"md: cannot auto-add device %s to md%d, already used.\n" + +#define AUTORUN_FAILED KERN_ERR \ +"md: auto-running md%d FAILED (error %d).\n" + +#define MDDEV_BUSY KERN_ERR \ +"md: cannot auto-add to md%d, already running.\n" + +#define AUTOADDING KERN_INFO \ +"md: auto-adding devices to md%d, based on %s's superblock.\n" + +#define AUTORUNNING KERN_INFO \ +"md: auto-running md%d.\n" + +static int autostart_array (kdev_t startdev) +{ + int err = -EINVAL, i; + mdp_super_t *sb = NULL; + mdk_rdev_t *start_rdev = NULL, *rdev; + + if (md_import_device(startdev, 1)) { + printk("could not import %s!\n", partition_name(startdev)); + goto abort; + } + + start_rdev = find_rdev_all(startdev); + if (!start_rdev) { + MD_BUG(); + goto abort; + } + if (start_rdev->faulty) { + printk("can not autostart based on faulty %s!\n", + partition_name(startdev)); + goto abort; + } + md_list_add(&start_rdev->pending, &pending_raid_disks); + + sb = start_rdev->sb; + + err = detect_old_array(sb); + if (err) { + printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n"); + goto abort; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + if (dev == startdev) + continue; + if (md_import_device(dev, 1)) { + printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev)); + continue; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + + /* + * possibly return codes + */ + autorun_devices(); + return 0; + +abort: + if (start_rdev) + export_rdev(start_rdev); + return err; +} + +#undef BAD_VERSION +#undef OUT_OF_MEM +#undef NO_DEVICE +#undef AUTOADD_FAILED_USED +#undef AUTOADD_FAILED +#undef AUTORUN_FAILED +#undef AUTOADDING +#undef AUTORUNNING + +struct { + int set; + int noautodetect; + +} raid_setup_args md__initdata = { 0, 0 }; + +/* + * Searches all registered partitions for autorun RAID arrays + * at boot time. + */ +md__initfunc(void autodetect_raid(void)) +{ +#ifdef CONFIG_AUTODETECT_RAID + struct gendisk *disk; + mdk_rdev_t *rdev; + int i; + + if (raid_setup_args.noautodetect) { + printk(KERN_INFO "skipping autodetection of RAID arrays\n"); + return; + } + printk(KERN_INFO "autodetecting RAID arrays\n"); + + for (disk = gendisk_head ; disk ; disk = disk->next) { + for (i = 0; i < disk->max_p*disk->max_nr; i++) { + kdev_t dev = MKDEV(disk->major,i); + + if (disk->part[i].type == LINUX_OLD_RAID_PARTITION) { + printk(KERN_ALERT +"md: %s's partition type has to be changed from type 0x86 to type 0xfd\n" +" to maintain interoperability with other OSs! Autodetection support for\n" +" type 0x86 will be deleted after some migration timeout. 
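Autodetection only imports partitions whose type is 0xfd (LINUX_RAID_PARTITION); the obsolete 0x86 type is converted in place once, with the warning printed here. The filter reduced to user-space form (the two type values are the real on-disk constants, everything else is made up for the example):

#define DEMO_OLD_RAID_TYPE 0x86	/* pre-0.90 autodetect type, being phased out */
#define DEMO_RAID_TYPE     0xfd	/* Linux raid autodetect partition type */

/* Decide whether a partition should be handed to md_import_device(). */
static int demo_want_partition(unsigned char *type)
{
	if (*type == DEMO_OLD_RAID_TYPE)
		*type = DEMO_RAID_TYPE;	/* migrate once, as the patch does */
	return *type == DEMO_RAID_TYPE;
}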
Sorry.\n", + partition_name(dev)); + disk->part[i].type = LINUX_RAID_PARTITION; + } + if (disk->part[i].type != LINUX_RAID_PARTITION) + continue; + + if (md_import_device(dev,1)) { + printk(KERN_ALERT "could not import %s!\n", + partition_name(dev)); + continue; + } + /* + * Sanity checks: + */ + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + continue; + } + if (rdev->faulty) { + MD_BUG(); + continue; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + } + + autorun_devices(); +#endif +} + +static int get_version (void * arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (md_copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +#define SET_FROM_SB(x) info.x = mddev->sb->x +static int get_array_info (mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + + if (!mddev->sb) + return -EINVAL; + + SET_FROM_SB(major_version); + SET_FROM_SB(minor_version); + SET_FROM_SB(patch_version); + SET_FROM_SB(ctime); + SET_FROM_SB(level); + SET_FROM_SB(size); + SET_FROM_SB(nr_disks); + SET_FROM_SB(raid_disks); + SET_FROM_SB(md_minor); + SET_FROM_SB(not_persistent); + + SET_FROM_SB(utime); + SET_FROM_SB(state); + SET_FROM_SB(active_disks); + SET_FROM_SB(working_disks); + SET_FROM_SB(failed_disks); + SET_FROM_SB(spare_disks); + + SET_FROM_SB(layout); + SET_FROM_SB(chunk_size); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x +static int get_disk_info (mddev_t * mddev, void * arg) +{ + mdu_disk_info_t info; + unsigned int nr; + + if (!mddev->sb) + return -EINVAL; + + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + nr = info.number; + if (nr >= mddev->sb->nr_disks) + return -EINVAL; + + SET_FROM_SB(major); + SET_FROM_SB(minor); + SET_FROM_SB(raid_disk); + SET_FROM_SB(state); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_SB(x) mddev->sb->disks[nr].x = info.x + +static int add_new_disk (mddev_t * mddev, void * arg) +{ + int err, size, persistent; + mdu_disk_info_t info; + mdk_rdev_t *rdev; + unsigned int nr; + kdev_t dev; + + if (!mddev->sb) + return -EINVAL; + + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + nr = info.number; + if (nr >= mddev->sb->nr_disks) + return -EINVAL; + + dev = MKDEV(info.major,info.minor); + + if (find_rdev_all(dev)) { + printk("device %s already used in a RAID array!\n", + partition_name(dev)); + return -EBUSY; + } + + SET_SB(number); + SET_SB(major); + SET_SB(minor); + SET_SB(raid_disk); + SET_SB(state); + + if ((info.state & (1<old_dev = dev; + rdev->desc_nr = info.number; + + bind_rdev_to_array(rdev, mddev); + + persistent = !mddev->sb->not_persistent; + if (!persistent) + printk("nonpersistent superblock ...\n"); + if (!mddev->sb->chunk_size) + printk("no chunksize?\n"); + + size = calc_dev_size(dev, mddev, persistent); + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + if (!mddev->sb->size || (mddev->sb->size > size)) + mddev->sb->size = size; + } + + /* + * sync all other superblocks with the main superblock + */ + sync_sbs(mddev); + + return 0; +} +#undef SET_SB + +static int hot_remove_disk (mddev_t * mddev, kdev_t dev) +{ + int err; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk("trying to remove %s from md%d ... 
\n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk("md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + disk = &mddev->sb->disks[rdev->desc_nr]; + if (disk_active(disk)) + goto busy; + if (disk_removed(disk)) { + MD_BUG(); + return -EINVAL; + } + + err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK); + if (err == -EBUSY) + goto busy; + if (err) { + MD_BUG(); + return -EINVAL; + } + + remove_descriptor(disk, mddev->sb); + kick_rdev_from_array(rdev); + mddev->sb_dirty = 1; + md_update_sb(mddev); + + return 0; +busy: + printk("cannot remove active disk %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk (mddev_t * mddev, kdev_t dev) +{ + int i, err, persistent; + unsigned int size; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk("trying to hot-add %s to md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk("md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + persistent = !mddev->sb->not_persistent; + size = calc_dev_size(dev, mddev, persistent); + + if (size < mddev->sb->size) { + printk("md%d: disk size %d blocks < array size %d\n", + mdidx(mddev), size, mddev->sb->size); + return -ENOSPC; + } + + rdev = find_rdev(mddev, dev); + if (rdev) + return -EBUSY; + + err = md_import_device (dev, 0); + if (err) { + printk("md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (rdev->faulty) { + printk("md: can not hot-add faulty %s disk to md%d!\n", + partition_name(dev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; + } + bind_rdev_to_array(rdev, mddev); + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... 
+ */ + cli(); + rdev->old_dev = dev; + rdev->size = size; + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + disk = mddev->sb->disks + mddev->sb->raid_disks; + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) { + disk = mddev->sb->disks + i; + + if (!disk->major && !disk->minor) + break; + if (disk_removed(disk)) + break; + } + if (i == MD_SB_DISKS) { + sti(); + printk("md%d: can not hot-add to full array!\n", mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } + + if (disk_removed(disk)) { /* - * hot_add has to bump up nb_dev itself + * reuse slot */ - if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) { - /* - * FIXME: here we should free up the inode and stuff - */ - printk ("FIXME\n"); - return -EINVAL; + if (disk->number != i) { + sti(); + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; } - } else - md_dev[minor].nb_dev++; + } else { + disk->number = i; + } - printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor); - return (0); + disk->raid_disk = disk->number; + disk->major = MAJOR(dev); + disk->minor = MINOR(dev); + + if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) { + sti(); + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + + mark_disk_spare(disk); + mddev->sb->nr_disks++; + mddev->sb->spare_disks++; + mddev->sb->working_disks++; + + mddev->sb_dirty = 1; + + sti(); + md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. + */ + md_recover_arrays(); + + return 0; + +abort_unbind_export: + unbind_rdev_from_array(rdev); + +abort_export: + export_rdev(rdev); + return err; +} + +#define SET_SB(x) mddev->sb->x = info.x +static int set_array_info (mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + + if (mddev->sb) { + printk("array md%d already has a superblock!\n", + mdidx(mddev)); + return -EBUSY; + } + + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + if (alloc_array_sb(mddev)) + return -ENOMEM; + + mddev->sb->major_version = MD_MAJOR_VERSION; + mddev->sb->minor_version = MD_MINOR_VERSION; + mddev->sb->patch_version = MD_PATCHLEVEL_VERSION; + mddev->sb->ctime = CURRENT_TIME; + + SET_SB(level); + SET_SB(size); + SET_SB(nr_disks); + SET_SB(raid_disks); + SET_SB(md_minor); + SET_SB(not_persistent); + + SET_SB(state); + SET_SB(active_disks); + SET_SB(working_disks); + SET_SB(failed_disks); + SET_SB(spare_disks); + + SET_SB(layout); + SET_SB(chunk_size); + + mddev->sb->md_magic = MD_SB_MAGIC; + + /* + * Generate a 128 bit UUID + */ + get_random_bytes(&mddev->sb->set_uuid0, 4); + get_random_bytes(&mddev->sb->set_uuid1, 4); + get_random_bytes(&mddev->sb->set_uuid2, 4); + get_random_bytes(&mddev->sb->set_uuid3, 4); + + return 0; +} +#undef SET_SB + +static int set_disk_info (mddev_t * mddev, void * arg) +{ + printk("not yet"); + return -EINVAL; +} + +static int clear_array (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int write_raid_info (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int protect_array (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int unprotect_array (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int set_disk_faulty (mddev_t *mddev, kdev_t dev) +{ + int ret; + + fsync_dev(mddev_to_kdev(mddev)); + ret = md_error(mddev_to_kdev(mddev), dev); + return ret; } static int md_ioctl (struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { - int minor, err; - struct hd_geometry *loc = 
(struct hd_geometry *) arg; + unsigned int minor; + int err = 0; + struct hd_geometry *loc = (struct hd_geometry *) arg; + mddev_t *mddev = NULL; + kdev_t dev; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + if (!md_capable_admin()) + return -EACCES; - if (((minor=MINOR(inode->i_rdev)) & 0x80) && - (minor & 0x7f) < MAX_PERSONALITY && - pers[minor & 0x7f] && - pers[minor & 0x7f]->ioctl) - return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg)); - - if (minor >= MAX_MD_DEV) - return -EINVAL; + dev = inode->i_rdev; + minor = MINOR(dev); + if (minor >= MAX_MD_DEVS) + return -EINVAL; - switch (cmd) - { - case REGISTER_DEV: - return do_md_add (minor, to_kdev_t ((dev_t) arg)); + /* + * Commands dealing with the RAID driver but not any + * particular array: + */ + switch (cmd) + { + case RAID_VERSION: + err = get_version((void *)arg); + goto done; + + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done_unlock; + + case BLKGETSIZE: /* Return device size */ + if (!arg) { + err = -EINVAL; + goto abort; + } + err = md_put_user(md_hd_struct[minor].nr_sects, + (long *) arg); + goto done; - case START_MD: - return do_md_run (minor, (int) arg); + case BLKFLSBUF: + fsync_dev(dev); + invalidate_buffers(dev); + goto done; - case STOP_MD: - return do_md_stop (minor, inode); - - case BLKGETSIZE: /* Return device size */ - if (!arg) return -EINVAL; - err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg); - if (err) - return err; - break; - - case BLKFLSBUF: - fsync_dev (inode->i_rdev); - invalidate_buffers (inode->i_rdev); - break; - - case BLKRASET: - if (arg > 0xff) - return -EINVAL; - read_ahead[MAJOR(inode->i_rdev)] = arg; - return 0; - - case BLKRAGET: - if (!arg) return -EINVAL; - err = put_user (read_ahead[MAJOR(inode->i_rdev)], (long *) arg); - if (err) - return err; - break; - - /* We have a problem here : there is no easy way to give a CHS - virtual geometry. We currently pretend that we have a 2 heads - 4 sectors (with a BIG number of cylinders...). This drives dosfs - just mad... ;-) */ - - case HDIO_GETGEO: - if (!loc) return -EINVAL; - err = put_user (2, (char *) &loc->heads); - if (err) - return err; - err = put_user (4, (char *) &loc->sectors); - if (err) - return err; - err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders); - if (err) - return err; - err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect, - (long *) &loc->start); - if (err) - return err; - break; - - RO_IOCTLS(inode->i_rdev,arg); + case BLKRASET: + if (arg > 0xff) { + err = -EINVAL; + goto abort; + } + read_ahead[MAJOR(dev)] = arg; + goto done; - default: - return -EINVAL; - } + case BLKRAGET: + if (!arg) { + err = -EINVAL; + goto abort; + } + err = md_put_user (read_ahead[ + MAJOR(dev)], (long *) arg); + goto done; + default: + } + + /* + * Commands creating/starting a new array: + */ + + mddev = kdev_to_mddev(dev); + + switch (cmd) + { + case SET_ARRAY_INFO: + case START_ARRAY: + if (mddev) { + printk("array md%d already exists!\n", + mdidx(mddev)); + err = -EEXIST; + goto abort; + } + default: + } + + switch (cmd) + { + case SET_ARRAY_INFO: + mddev = alloc_mddev(dev); + if (!mddev) { + err = -ENOMEM; + goto abort; + } + /* + * alloc_mddev() should possibly self-lock. + */ + err = lock_mddev(mddev); + if (err) { + printk("ioctl, reason %d, cmd %d\n", err, cmd); + goto abort; + } + err = set_array_info(mddev, (void *)arg); + if (err) { + printk("couldnt set array info. 
%d\n", err); + goto abort; + } + goto done_unlock; + + case START_ARRAY: + /* + * possibly make it lock the array ... + */ + err = autostart_array((kdev_t)arg); + if (err) { + printk("autostart %s failed!\n", + partition_name((kdev_t)arg)); + goto abort; + } + goto done; + + default: + } + + /* + * Commands querying/configuring an existing array: + */ + + if (!mddev) { + err = -ENODEV; + goto abort; + } + err = lock_mddev(mddev); + if (err) { + printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd); + goto abort; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + err = do_md_stop (mddev, 0); + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; + } + err = md_put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = md_put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[minor].start_sect, + (long *) &loc->start); + goto done_unlock; + } - return (0); + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case CLEAR_ARRAY: + err = clear_array(mddev); + goto done_unlock; + + case ADD_NEW_DISK: + err = add_new_disk(mddev, (void *)arg); + goto done_unlock; + + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case SET_DISK_INFO: + err = set_disk_info(mddev, (void *)arg); + goto done_unlock; + + case WRITE_RAID_INFO: + err = write_raid_info(mddev); + goto done_unlock; + + case UNPROTECT_ARRAY: + err = unprotect_array(mddev); + goto done_unlock; + + case PROTECT_ARRAY: + err = protect_array(mddev); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, (kdev_t)arg); + goto done_unlock; + + case RUN_ARRAY: + { + mdu_param_t param; + + err = md_copy_from_user(¶m, (mdu_param_t *)arg, + sizeof(param)); + if (err) + goto abort_unlock; + + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ... 
+ */ + if (err) { + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } + goto done_unlock; + } + + default: + printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current->comm, current->pid); + err = -EINVAL; + goto abort_unlock; + } + +done_unlock: +abort_unlock: + if (mddev) + unlock_mddev(mddev); + else + printk("huh11?\n"); + + return err; +done: + if (err) + printk("huh12?\n"); +abort: + return err; } + +#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0) + static int md_open (struct inode *inode, struct file *file) { - int minor=MINOR(inode->i_rdev); + /* + * Always succeed + */ + return (0); +} + +static void md_release (struct inode *inode, struct file *file) +{ + sync_dev(inode->i_rdev); +} + + +static int md_read (struct inode *inode, struct file *file, + char *buf, int count) +{ + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev); - md_dev[minor].busy++; - return (0); /* Always succeed */ + if (!mddev || !mddev->pers) + return -ENXIO; + + return block_read (inode, file, buf, count); } +static int md_write (struct inode *inode, struct file *file, + const char *buf, int count) +{ + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev); + + if (!mddev || !mddev->pers) + return -ENXIO; -static int md_release (struct inode *inode, struct file *file) + return block_write (inode, file, buf, count); +} + +static struct file_operations md_fops= { - int minor=MINOR(inode->i_rdev); + NULL, + md_read, + md_write, + NULL, + NULL, + md_ioctl, + NULL, + md_open, + md_release, + block_fsync +}; + +#else - sync_dev (inode->i_rdev); - md_dev[minor].busy--; - return 0; +static int md_open (struct inode *inode, struct file *file) +{ + /* + * Always succeed + */ + return (0); } +static int md_release (struct inode *inode, struct file *file) +{ + sync_dev(inode->i_rdev); + return 0; +} static ssize_t md_read (struct file *file, char *buf, size_t count, loff_t *ppos) { - int minor=MINOR(file->f_dentry->d_inode->i_rdev); + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev); - if (!md_dev[minor].pers) /* Check if device is being run */ - return -ENXIO; + if (!mddev || !mddev->pers) + return -ENXIO; - return block_read(file, buf, count, ppos); + return block_read(file, buf, count, ppos); } static ssize_t md_write (struct file *file, const char *buf, size_t count, loff_t *ppos) { - int minor=MINOR(file->f_dentry->d_inode->i_rdev); + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev); - if (!md_dev[minor].pers) /* Check if device is being run */ - return -ENXIO; + if (!mddev || !mddev->pers) + return -ENXIO; - return block_write(file, buf, count, ppos); + return block_write(file, buf, count, ppos); } static struct file_operations md_fops= { - NULL, - md_read, - md_write, - NULL, - NULL, - md_ioctl, - NULL, - md_open, - NULL, - md_release, - block_fsync + NULL, + md_read, + md_write, + NULL, + NULL, + md_ioctl, + NULL, + md_open, + NULL, + md_release, + block_fsync }; -int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size) +#endif + +int md_map (kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long size) { - if ((unsigned int) minor >= MAX_MD_DEV) - { - printk ("Bad md device %d\n", minor); - return (-1); - } - - if (!md_dev[minor].pers) - { - printk ("Oops ! 
md%d not running, giving up !\n", minor); - return (-1); - } + int err; + mddev_t *mddev = kdev_to_mddev(dev); - return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size)); + if (!mddev || !mddev->pers) { + err = -ENXIO; + goto out; + } + + err = mddev->pers->map(mddev, dev, rdev, rsector, size); +out: + return err; } -int md_make_request (int minor, int rw, struct buffer_head * bh) +int md_make_request (struct buffer_head * bh, int rw) { - if (md_dev [minor].pers->make_request) { - if (buffer_locked(bh)) - return 0; + int err; + mddev_t *mddev = kdev_to_mddev(bh->b_dev); + + if (!mddev || !mddev->pers) { + err = -ENXIO; + goto out; + } + + if (mddev->pers->make_request) { + if (buffer_locked(bh)) { + err = 0; + goto out; + } set_bit(BH_Lock, &bh->b_state); if (rw == WRITE || rw == WRITEA) { if (!buffer_dirty(bh)) { - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); - return 0; + bh->b_end_io(bh, buffer_uptodate(bh)); + err = 0; + goto out; } } if (rw == READ || rw == READA) { if (buffer_uptodate(bh)) { - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); - return 0; + bh->b_end_io(bh, buffer_uptodate(bh)); + err = 0; + goto out; } } - return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh)); + err = mddev->pers->make_request(mddev, rw, bh); } else { make_request (MAJOR(bh->b_rdev), rw, bh); - return 0; + err = 0; } +out: + return err; } static void do_md_request (void) { - printk ("Got md request, not good..."); - return; + printk(KERN_ALERT "Got md request, not good..."); + return; +} + +int md_thread(void * arg) +{ + mdk_thread_t *thread = arg; + + md_lock_kernel(); + exit_mm(current); + exit_files(current); + exit_fs(current); + + /* + * Detach thread + */ + sys_setsid(); + sprintf(current->comm, thread->name); + md_init_signals(); + md_flush_signals(); + thread->tsk = current; + + /* + * md_thread is a 'system-thread', it's priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. 
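md_thread() is the generic worker behind the recovery and resync threads: it detaches from user space, sleeps until md_wakeup_thread() sets the THREAD_WAKEUP bit, runs the registered handler, and flushes any signal used to interrupt it, as the loop below shows. A stripped-down user-space model of that wakeup-flag loop, using pthreads purely for illustration:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  demo_wake = PTHREAD_COND_INITIALIZER;
static int demo_wakeup, demo_stop;

static void *demo_worker(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&demo_lock);
	for (;;) {
		while (!demo_wakeup && !demo_stop)
			pthread_cond_wait(&demo_wake, &demo_lock);	/* like interruptible_sleep_on() */
		if (demo_stop)
			break;
		demo_wakeup = 0;			/* like clearing THREAD_WAKEUP */
		pthread_mutex_unlock(&demo_lock);
		printf("doing recovery work\n");	/* thread->run(thread->data) */
		pthread_mutex_lock(&demo_lock);
	}
	pthread_mutex_unlock(&demo_lock);
	return NULL;
}

/* analogous to md_wakeup_thread() */
static void demo_wakeup_worker(void)
{
	pthread_mutex_lock(&demo_lock);
	demo_wakeup = 1;
	pthread_cond_signal(&demo_wake);
	pthread_mutex_unlock(&demo_lock);
}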
+ */ + current->policy = SCHED_OTHER; + current->priority = 40; + + up(thread->sem); + + for (;;) { + cli(); + if (!test_bit(THREAD_WAKEUP, &thread->flags)) { + if (!thread->run) + break; + interruptible_sleep_on(&thread->wqueue); + } + sti(); + clear_bit(THREAD_WAKEUP, &thread->flags); + if (thread->run) { + thread->run(thread->data); + run_task_queue(&tq_disk); + } + if (md_signal_pending(current)) { + printk("%8s(%d) flushing signals.\n", current->comm, + current->pid); + md_flush_signals(); + } + } + sti(); + up(thread->sem); + return 0; } -void md_wakeup_thread(struct md_thread *thread) +void md_wakeup_thread(mdk_thread_t *thread) { set_bit(THREAD_WAKEUP, &thread->flags); wake_up(&thread->wqueue); } -struct md_thread *md_register_thread (void (*run) (void *), void *data) +mdk_thread_t *md_register_thread (void (*run) (void *), + void *data, const char *name) { - struct md_thread *thread = (struct md_thread *) - kmalloc(sizeof(struct md_thread), GFP_KERNEL); + mdk_thread_t *thread; int ret; struct semaphore sem = MUTEX_LOCKED; - if (!thread) return NULL; + thread = (mdk_thread_t *) kmalloc + (sizeof(mdk_thread_t), GFP_KERNEL); + if (!thread) + return NULL; - memset(thread, 0, sizeof(struct md_thread)); + memset(thread, 0, sizeof(mdk_thread_t)); init_waitqueue(&thread->wqueue); thread->sem = &sem; thread->run = run; thread->data = data; + thread->name = name; ret = kernel_thread(md_thread, thread, 0); if (ret < 0) { kfree(thread); @@ -838,270 +3032,407 @@ return thread; } -void md_unregister_thread (struct md_thread *thread) +void md_interrupt_thread (mdk_thread_t *thread) +{ + if (!thread->tsk) { + MD_BUG(); + return; + } + printk("interrupting MD-thread pid %d\n", thread->tsk->pid); + send_sig(SIGKILL, thread->tsk, 1); +} + +void md_unregister_thread (mdk_thread_t *thread) { struct semaphore sem = MUTEX_LOCKED; thread->sem = &sem; thread->run = NULL; - if (thread->tsk) - printk("Killing md_thread %d %p %s\n", - thread->tsk->pid, thread->tsk, thread->tsk->comm); - else - printk("Aiee. 
md_thread has 0 tsk\n"); - send_sig(SIGKILL, thread->tsk, 1); - printk("downing on %p\n", &sem); + thread->name = NULL; + if (!thread->tsk) { + MD_BUG(); + return; + } + md_interrupt_thread(thread); down(&sem); } -#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM)) - -int md_thread(void * arg) +void md_recover_arrays (void) { - struct md_thread *thread = arg; - - lock_kernel(); - exit_mm(current); - exit_files(current); - exit_fs(current); - - current->session = 1; - current->pgrp = 1; - sprintf(current->comm, "md_thread"); - siginitsetinv(¤t->blocked, SHUTDOWN_SIGS); - thread->tsk = current; - up(thread->sem); - - for (;;) { - cli(); - if (!test_bit(THREAD_WAKEUP, &thread->flags)) { - do { - spin_lock(¤t->sigmask_lock); - flush_signals(current); - spin_unlock(¤t->sigmask_lock); - interruptible_sleep_on(&thread->wqueue); - cli(); - if (test_bit(THREAD_WAKEUP, &thread->flags)) - break; - if (!thread->run) { - sti(); - up(thread->sem); - return 0; - } - } while (signal_pending(current)); - } - sti(); - clear_bit(THREAD_WAKEUP, &thread->flags); - if (thread->run) { - thread->run(thread->data); - run_task_queue(&tq_disk); - } + if (!md_recovery_thread) { + MD_BUG(); + return; } + md_wakeup_thread(md_recovery_thread); } -EXPORT_SYMBOL(md_size); -EXPORT_SYMBOL(md_maxreadahead); -EXPORT_SYMBOL(register_md_personality); -EXPORT_SYMBOL(unregister_md_personality); -EXPORT_SYMBOL(partition_name); -EXPORT_SYMBOL(md_dev); -EXPORT_SYMBOL(md_error); -EXPORT_SYMBOL(md_register_thread); -EXPORT_SYMBOL(md_unregister_thread); -EXPORT_SYMBOL(md_update_sb); -EXPORT_SYMBOL(md_map); -EXPORT_SYMBOL(md_wakeup_thread); -EXPORT_SYMBOL(md_do_sync); -#ifdef CONFIG_PROC_FS -static struct proc_dir_entry proc_md = { - PROC_MD, 6, "mdstat", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_array_inode_operations, -}; +int md_error (kdev_t dev, kdev_t rdev) +{ + mddev_t *mddev = kdev_to_mddev(dev); + mdk_rdev_t * rrdev; + int rc; + + if (!mddev) { + MD_BUG(); + return 0; + } + rrdev = find_rdev(mddev, rdev); + mark_rdev_faulty(rrdev); + /* + * if recovery was running, stop it now. + */ + if (mddev->pers->stop_resync) + mddev->pers->stop_resync(mddev); + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + if (mddev->pers->error_handler) { + rc = mddev->pers->error_handler(mddev, rdev); + md_recover_arrays(); + return rc; + } +#if 0 + /* + * Drop all buffers in the failed array. + * _not_. This is called from IRQ handlers ... + */ + invalidate_buffers(rdev); #endif + return 0; +} -static void md_geninit (struct gendisk *gdisk) +static int status_unused (char * page) { - int i; - - for(i=0;isame_set.next && !rdev->same_set.prev) { + /* + * The device is not yet used by any array. 
+ */ + i++; + sz += sprintf(page + sz, "%s ", + partition_name(rdev->dev)); + } + } + if (!i) + sz += sprintf(page + sz, ""); + + sz += sprintf(page + sz, "\n"); + return sz; } -int md_error (kdev_t mddev, kdev_t rdev) + +static int status_resync (char * page, mddev_t * mddev) { - unsigned int minor = MINOR (mddev); - int rc; + int sz = 0; + unsigned int blocksize, max_blocks, resync, res, dt, tt, et; - if (MAJOR(mddev) != MD_MAJOR || minor > MAX_MD_DEV) - panic ("md_error gets unknown device\n"); - if (!md_dev [minor].pers) - panic ("md_error gets an error for an unknown device\n"); - if (md_dev [minor].pers->error_handler) { - rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev); -#if SUPPORT_RECONSTRUCTION - md_wakeup_thread(md_sync_thread); -#endif /* SUPPORT_RECONSTRUCTION */ - return rc; - } - return 0; + resync = mddev->curr_resync; + blocksize = blksize_size[MD_MAJOR][mdidx(mddev)]; + max_blocks = blk_size[MD_MAJOR][mdidx(mddev)] / (blocksize >> 10); + + /* + * Should not happen. + */ + if (!max_blocks) { + MD_BUG(); + return 0; + } + res = resync*100/max_blocks; + if (!mddev->recovery_running) + /* + * true resync + */ + sz += sprintf(page + sz, " resync=%u%%", res); + else + /* + * recovery ... + */ + sz += sprintf(page + sz, " recovery=%u%%", res); + + /* + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time until now + * tt: total time + * et: estimated finish time + */ + dt = ((jiffies - mddev->resync_start) / HZ); + tt = (dt * (max_blocks / (resync/100+1)))/100; + if (tt > dt) + et = tt - dt; + else + /* + * ignore rounding effects near finish time + */ + et = 0; + + sz += sprintf(page + sz, " finish=%u.%umin", et / 60, (et % 60)/6); + + return sz; } int get_md_status (char *page) { - int sz=0, i, j, size; - - sz+=sprintf( page+sz, "Personalities : "); - for (i=0; iname); - - page[sz-1]='\n'; - - sz+=sprintf (page+sz, "read_ahead "); - if (read_ahead[MD_MAJOR]==INT_MAX) - sz+=sprintf (page+sz, "not set\n"); - else - sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]); + int sz = 0, j, size; + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + sz += sprintf(page + sz, "Personalities : "); + for (j = 0; j < MAX_PERSONALITY; j++) + if (pers[j]) + sz += sprintf(page+sz, "[%s] ", pers[j]->name); + + sz += sprintf(page+sz, "\n"); + + + sz += sprintf(page+sz, "read_ahead "); + if (read_ahead[MD_MAJOR] == INT_MAX) + sz += sprintf(page+sz, "not set\n"); + else + sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]); - for (i=0; iname); + ITERATE_MDDEV(mddev,tmp) { + sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev), + mddev->pers ? 
"" : "in"); + if (mddev->pers) { + if (mddev->ro) + sz += sprintf(page + sz, " (read-only)"); + sz += sprintf(page + sz, " %s", mddev->pers->name); + } - size=0; - for (j=0; jdev), rdev->desc_nr); + if (rdev->faulty) { + sz += sprintf(page + sz, "(F)"); + continue; + } + size += rdev->size; + } - if (md_dev[i].nb_dev) { - if (md_dev[i].pers) - sz+=sprintf (page+sz, " %d blocks", md_size[i]); - else - sz+=sprintf (page+sz, " %d blocks", size); - } + if (mddev->nb_dev) { + if (mddev->pers) + sz += sprintf(page + sz, " %d blocks", + md_size[mdidx(mddev)]); + else + sz += sprintf(page + sz, " %d blocks", size); + } - if (!md_dev[i].pers) - { - sz+=sprintf (page+sz, "\n"); - continue; - } + if (!mddev->pers) { + sz += sprintf(page+sz, "\n"); + continue; + } - if (md_dev[i].pers->max_invalid_dev) - sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i)); + sz += mddev->pers->status (page+sz, mddev); - sz+=md_dev[i].pers->status (page+sz, i, md_dev+i); - sz+=sprintf (page+sz, "\n"); - } + if (mddev->curr_resync) + sz += status_resync (page+sz, mddev); + else { + if (md_atomic_read(&mddev->resync_sem.count) != 1) + sz += sprintf(page + sz, " resync=DELAYED"); + } + sz += sprintf(page + sz, "\n"); + } + sz += status_unused (page + sz); - return (sz); + return (sz); } -int register_md_personality (int p_num, struct md_personality *p) +int register_md_personality (int pnum, mdk_personality_t *p) { - int i=(p_num >> PERSONALITY_SHIFT); - - if (i >= MAX_PERSONALITY) - return -EINVAL; + if (pnum >= MAX_PERSONALITY) + return -EINVAL; - if (pers[i]) - return -EBUSY; + if (pers[pnum]) + return -EBUSY; - pers[i]=p; - printk ("%s personality registered\n", p->name); - return 0; + pers[pnum] = p; + printk(KERN_INFO "%s personality registered\n", p->name); + return 0; } -int unregister_md_personality (int p_num) +int unregister_md_personality (int pnum) { - int i=(p_num >> PERSONALITY_SHIFT); - - if (i >= MAX_PERSONALITY) - return -EINVAL; + if (pnum >= MAX_PERSONALITY) + return -EINVAL; - printk ("%s personality unregistered\n", pers[i]->name); - pers[i]=NULL; - return 0; + printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name); + pers[pnum] = NULL; + return 0; } -static md_descriptor_t *get_spare(struct md_dev *mddev) +static mdp_disk_t *get_spare(mddev_t *mddev) { - int i; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; - struct real_dev *realdev; - - for (i = 0; i < mddev->nb_dev; i++) { - realdev = &mddev->devices[i]; - if (!realdev->sb) + mdp_super_t *sb = mddev->sb; + mdp_disk_t *disk; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (!rdev->sb) { + MD_BUG(); continue; - descriptor = &sb->disks[realdev->sb->descriptor.number]; - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) + } + disk = &sb->disks[rdev->desc_nr]; + if (disk_faulty(disk)) { + MD_BUG(); continue; - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) + } + if (disk_active(disk)) continue; - return descriptor; + return disk; } return NULL; } +static int is_mddev_idle (mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + curr_events = io_events[MAJOR(rdev->dev)]; + + if (curr_events != rdev->last_events) { +// printk("!I(%d)", curr_events-rdev->last_events); + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + /* * parallel resyncing thread. 
- * - * FIXME: - make it abort with a dirty array on mdstop, now it just blocks - * - fix read error handing */ -int md_do_sync(struct md_dev *mddev) +/* + * Determine correct block size for this device. + */ +unsigned int device_bsize (kdev_t dev) +{ + unsigned int i, correct_size; + + correct_size = BLOCK_SIZE; + if (blksize_size[MAJOR(dev)]) { + i = blksize_size[MAJOR(dev)][MINOR(dev)]; + if (i) + correct_size = i; + } + + return correct_size; +} + +static struct wait_queue *resync_wait = (struct wait_queue *)NULL; + +#define RA_ORDER (1) +#define RA_PAGE_SIZE (PAGE_SIZE*(1<resync_sem); + if (err) + goto out_nolock; + +recheck: + serialize = 0; + ITERATE_MDDEV(mddev2,tmp) { + if (mddev2 == mddev) + continue; + if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) { + printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2)); + serialize = 1; + break; + } + } + if (serialize) { + interruptible_sleep_on(&resync_wait); + if (md_signal_pending(current)) { + md_flush_signals(); + err = -EINTR; + goto out; + } + goto recheck; + } + + mddev->curr_resync = 1; - blocksize = blksize_size[major][minor]; + blocksize = device_bsize(read_disk); max_blocks = blk_size[major][minor] / (blocksize >> 10); - printk("... resync log\n"); - printk(" .... mddev->nb_dev: %d\n", mddev->nb_dev); - printk(" .... raid array: %s\n", kdevname(read_disk)); - printk(" .... max_blocks: %d blocksize: %d\n", max_blocks, blocksize); - printk("md: syncing RAID array %s\n", kdevname(read_disk)); + printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); + printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec.\n", + sysctl_speed_limit); + printk(KERN_INFO "md: using maximum available idle IO bandwith for reconstruction.\n"); + + /* + * Resync has low priority. + */ + current->priority = 1; + + is_mddev_idle(mddev); /* this also initializes IO event counters */ + starttime = jiffies; + mddev->resync_start = starttime; - mddev->busy++; + /* + * Tune reconstruction: + */ + window = md_maxreadahead[mdidx(mddev)]/1024; + nr_blocks = window / (blocksize >> 10); + if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS)) + nr_blocks = MAX_NR_BLOCKS; + printk(KERN_INFO "md: using %dk window.\n",window); - starttime=jiffies; - for (j = 0; j < max_blocks; j++) { + for (j = 0; j < max_blocks; j += nr_blocks) { + if (j) + mddev->curr_resync = j; /* * B careful. When some1 mounts a non-'blocksize' filesystem * then we get the blocksize changed right under us. Go deal * with it transparently, recalculate 'blocksize', 'j' and * 'max_blocks': */ - curr_bsize = blksize_size[major][minor]; + curr_bsize = device_bsize(read_disk); if (curr_bsize != blocksize) { - diff_blocksize: + printk(KERN_INFO "md%d: blocksize changed\n", + mdidx(mddev)); +retry_read: if (curr_bsize > blocksize) /* * this is safe, rounds downwards. @@ -1111,114 +3442,384 @@ j *= blocksize/curr_bsize; blocksize = curr_bsize; + nr_blocks = window / (blocksize >> 10); + if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS)) + nr_blocks = MAX_NR_BLOCKS; max_blocks = blk_size[major][minor] / (blocksize >> 10); - } - if ((bh = breada (read_disk, j, blocksize, j * blocksize, - max_blocks * blocksize)) != NULL) { - mark_buffer_dirty(bh, 1); - brelse(bh); - } else { + printk("nr_blocks changed to %d (blocksize %d, j %d, max_blocks %d)\n", + nr_blocks, blocksize, j, max_blocks); /* - * FIXME: Ugly, but set_blocksize() isnt safe ... 
+ * We will retry the current block-group */ - curr_bsize = blksize_size[major][minor]; - if (curr_bsize != blocksize) - goto diff_blocksize; + } - /* - * It's a real read problem. FIXME, handle this - * a better way. - */ - printk ( KERN_ALERT - "read error, stopping reconstruction.\n"); - mddev->busy--; - return 1; + /* + * Cleanup routines expect this + */ + for (k = 0; k < nr_blocks; k++) + bh[k] = NULL; + + chunk = nr_blocks; + if (chunk > max_blocks-j) + chunk = max_blocks-j; + + /* + * request buffer heads ... + */ + for (i = 0; i < chunk; i++) { + bh[i] = getblk (read_disk, j+i, blocksize); + if (!bh[i]) + goto read_error; + if (!buffer_dirty(bh[i])) + mark_buffer_lowprio(bh[i]); } /* - * Let's sleep some if we are faster than our speed limit: + * read buffer heads ... */ - while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT) - { - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(1); + ll_rw_block (READ, chunk, bh); + run_task_queue(&tq_disk); + + /* + * verify that all of them are OK ... + */ + for (i = 0; i < chunk; i++) { + ii = chunk-i-1; + wait_on_buffer(bh[ii]); + if (!buffer_uptodate(bh[ii])) + goto read_error; + } + +retry_write: + for (i = 0; i < chunk; i++) + mark_buffer_dirty_lowprio(bh[i]); + + ll_rw_block(WRITE, chunk, bh); + run_task_queue(&tq_disk); + + for (i = 0; i < chunk; i++) { + ii = chunk-i-1; + wait_on_buffer(bh[ii]); + + if (spare && disk_faulty(spare)) { + for (k = 0; k < chunk; k++) + brelse(bh[k]); + printk(" \n "); + err = -EIO; + goto out; + } + + if (!buffer_uptodate(bh[ii])) { + curr_bsize = device_bsize(read_disk); + if (curr_bsize != blocksize) { + printk(KERN_INFO + "md%d: blocksize changed during write\n", + mdidx(mddev)); + for (k = 0; k < chunk; k++) + if (bh[k]) { + if (buffer_lowprio(bh[k])) + mark_buffer_clean(bh[k]); + brelse(bh[k]); + } + goto retry_read; + } + printk(" BAD WRITE %8d>\n", j); + /* + * Ouch, write error, retry or bail out. + */ + if (max_write_errors) { + max_write_errors--; + printk ( KERN_WARNING "md%d: write error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize); + goto retry_write; + } + printk ( KERN_ALERT + "too many write errors, stopping reconstruction.\n"); + for (k = 0; k < chunk; k++) + if (bh[k]) { + if (buffer_lowprio(bh[k])) + mark_buffer_clean(bh[k]); + brelse(bh[k]); + } + err = -EIO; + goto out; + } } /* - * FIXME: put this status bar thing into /proc + * This is the normal 'everything went OK' case + * do a 'free-behind' logic, we sure dont need + * this buffer if it was the only user. */ - if (!(j%(max_blocks/100))) { - if (!(percent%10)) - printk (" %03d%% done.\n",percent); + for (i = 0; i < chunk; i++) + if (buffer_dirty(bh[i])) + brelse(bh[i]); else - printk ("."); - percent++; + bforget(bh[i]); + + + if (md_signal_pending(current)) { + /* + * got a signal, exit. + */ + mddev->curr_resync = 0; + printk("md_do_sync() got signal ... exiting\n"); + md_flush_signals(); + err = -EINTR; + goto out; } + + /* + * this loop exits only if either when we are slower than + * the 'hard' speed limit, or the system was IO-idle for + * a jiffy. + * the system might be non-idle CPU-wise, but we only care + * about not overloading the IO subsystem. 
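
The rate check a few lines further down amounts to: kilobytes rebuilt so far, divided by elapsed seconds, compared against sysctl_speed_limit; only when reconstruction is already faster than that guaranteed minimum does it step aside for other IO. A small sketch of the arithmetic with invented numbers:

    /* Sketch of the throttling arithmetic used below (numbers invented). */
    #include <stdio.h>

    int main(void)
    {
        unsigned int blocksize = 1024;   /* bytes per block                */
        unsigned int j = 90000;          /* blocks rebuilt so far          */
        unsigned int elapsed = 600;      /* (jiffies - starttime) / HZ     */
        unsigned int limit = 100;        /* sysctl_speed_limit, in KB/sec  */
        unsigned int rate = (blocksize / 1024) * j / (elapsed + 1) + 1;

        if (rate > limit)
            printf("%u KB/sec > %u: above the guaranteed minimum, "
                   "only continue while the array is otherwise idle\n",
                   rate, limit);
        else
            printf("%u KB/sec <= %u: below the minimum, keep normal priority\n",
                   rate, limit);
        return 0;   /* prints the first branch: 150 KB/sec > 100 ... */
    }
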
(things like an + * e2fsck being done on the RAID array should execute fast) + */ +repeat: + if (md_need_resched(current)) + schedule(); + + if ((blocksize/1024)*j/((jiffies-starttime)/HZ + 1) + 1 + > sysctl_speed_limit) { + current->priority = 1; + + if (!is_mddev_idle(mddev)) { + current->state = TASK_INTERRUPTIBLE; + md_schedule_timeout(HZ/2); + if (!md_signal_pending(current)) + goto repeat; + } + } else + current->priority = 40; } fsync_dev(read_disk); - printk("md: %s: sync done.\n", kdevname(read_disk)); - mddev->busy--; - return 0; + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); + err = 0; + /* + * this also signals 'finished resyncing' to md_stop + */ +out: + up(&mddev->resync_sem); +out_nolock: + free_pages((unsigned long)bh, RA_ORDER); + mddev->curr_resync = 0; + wake_up(&resync_wait); + return err; + +read_error: + /* + * set_blocksize() might change the blocksize. This + * should not happen often, but it happens when eg. + * someone mounts a filesystem that has non-1k + * blocksize. set_blocksize() doesnt touch our + * buffer, but to avoid aliasing problems we change + * our internal blocksize too and retry the read. + */ + curr_bsize = device_bsize(read_disk); + if (curr_bsize != blocksize) { + printk(KERN_INFO "md%d: blocksize changed during read\n", + mdidx(mddev)); + for (k = 0; k < chunk; k++) + if (bh[k]) { + if (buffer_lowprio(bh[k])) + mark_buffer_clean(bh[k]); + brelse(bh[k]); + } + goto retry_read; + } + + /* + * It's a real read problem. We retry and bail out + * only if it's excessive. + */ + if (max_read_errors) { + max_read_errors--; + printk ( KERN_WARNING "md%d: read error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize); + for (k = 0; k < chunk; k++) + if (bh[k]) { + if (buffer_lowprio(bh[k])) + mark_buffer_clean(bh[k]); + brelse(bh[k]); + } + goto retry_read; + } + printk ( KERN_ALERT "too many read errors, stopping reconstruction.\n"); + for (k = 0; k < chunk; k++) + if (bh[k]) { + if (buffer_lowprio(bh[k])) + mark_buffer_clean(bh[k]); + brelse(bh[k]); + } + err = -EIO; + goto out; } +#undef MAX_NR_BLOCKS + /* - * This is a kernel thread which: syncs a spare disk with the active array + * This is a kernel thread which syncs a spare disk with the active array * * the amount of foolproofing might seem to be a tad excessive, but an * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs * of my root partition with the first 0.5 gigs of my /home partition ... so * i'm a bit nervous ;) */ -void mdsyncd (void *data) +void md_do_recovery (void *data) { - int i; - struct md_dev *mddev; - md_superblock_t *sb; - md_descriptor_t *spare; + int err; + mddev_t *mddev; + mdp_super_t *sb; + mdp_disk_t *spare; unsigned long flags; + struct md_list_head *tmp; - for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) { - if ((sb = mddev->sb) == NULL) + printk(KERN_INFO "md: recovery thread got woken up ...\n"); +restart: + ITERATE_MDDEV(mddev,tmp) { + sb = mddev->sb; + if (!sb) + continue; + if (mddev->recovery_running) continue; if (sb->active_disks == sb->raid_disks) continue; - if (!sb->spare_disks) + if (!sb->spare_disks) { + printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev)); continue; + } + /* + * now here we get the spare and resync it. 
+ */ if ((spare = get_spare(mddev)) == NULL) continue; - if (!mddev->pers->mark_spare) + printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!mddev->pers->diskop) continue; - if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE)) + if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE)) continue; - if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) { - mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE); + down(&mddev->recovery_sem); + mddev->recovery_running = 1; + err = md_do_sync(mddev, spare); + if (err == -EIO) { + printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!disk_faulty(spare)) { + mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE); + mark_disk_faulty(spare); + mark_disk_nonsync(spare); + mark_disk_inactive(spare); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + } + } else + if (disk_faulty(spare)) + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + if (err == -EINTR) { + /* + * Recovery got interrupted ... + * signal back that we have finished using the array. + */ + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + up(&mddev->recovery_sem); + mddev->recovery_running = 0; continue; + } else { + mddev->recovery_running = 0; + up(&mddev->recovery_sem); } save_flags(flags); cli(); - mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE); - spare->state |= (1 << MD_SYNC_DEVICE); - spare->state |= (1 << MD_ACTIVE_DEVICE); - sb->spare_disks--; - sb->active_disks++; - mddev->sb_dirty = 1; - md_update_sb(mddev - md_dev); + if (!disk_faulty(spare)) { + /* + * the SPARE_ACTIVE diskop possibly changes the + * pointer too + */ + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); + mark_disk_sync(spare); + mark_disk_active(spare); + sb->active_disks++; + sb->spare_disks--; + } restore_flags(flags); + mddev->sb_dirty = 1; + md_update_sb(mddev); + goto restart; } + printk(KERN_INFO "md: recovery thread finished ...\n"); } +int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct md_list_head *tmp; + mddev_t *mddev; + + if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT) + || (code == MD_SYS_POWER_OFF)) { + + printk(KERN_INFO "stopping all md devices.\n"); + + ITERATE_MDDEV(mddev,tmp) + do_md_stop (mddev, 1); + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... 
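
Condensing the recovery path above into one place (simplified, the various error branches are collapsed), a spare moves through md_do_recovery() roughly as follows:

    /*
     * Rough outline of md_do_recovery() above, error handling collapsed:
     *
     *   spare = get_spare(mddev);                          pick an idle, healthy disk
     *   pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE);   spare becomes write-only
     *   err  = md_do_sync(mddev, spare);                    copy the array onto it
     *   if (err)                                            failed or interrupted:
     *       pers->diskop(mddev, &spare, DISKOP_SPARE_INACTIVE);
     *   else {                                              success, with irqs off:
     *       pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
     *       mark_disk_sync(spare);  mark_disk_active(spare);
     *       sb->active_disks++;     sb->spare_disks--;
     *       md_update_sb(mddev);                            persist the new state
     *   }
     */
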
+ */ + md_mdelay(1000*1); + } + return NOTIFY_DONE; +} + +struct notifier_block md_notifier = { + md_notify_reboot, + NULL, + 0 +}; + +md__initfunc(void raid_setup(char *str, int *ints)) +{ + char tmpline[100]; + int len, pos, nr, i; + + len = strlen(str) + 1; + nr = 0; + pos = 0; + + for (i = 0; i < len; i++) { + char c = str[i]; + + if (c == ',' || !c) { + tmpline[pos] = 0; + if (!strcmp(tmpline,"noautodetect")) + raid_setup_args.noautodetect = 1; + nr++; + pos = 0; + continue; + } + tmpline[pos] = c; + pos++; + } + raid_setup_args.set = 1; + return; +} + #ifdef CONFIG_MD_BOOT struct { int set; int ints[100]; char str[100]; -} md_setup_args __initdata = { +} md_setup_args md__initdata = { 0,{0},{0} }; /* called from init/main.c */ -__initfunc(void md_setup(char *str,int *ints)) +md__initfunc(void md_setup(char *str,int *ints)) { int i; for(i=0;i<=ints[0];i++) { @@ -1230,21 +3831,24 @@ return; } -__initfunc(void do_md_setup(char *str,int *ints)) +md__initfunc(void do_md_setup(char *str,int *ints)) { - int minor, pers, factor, fault; +#if 0 + int minor, pers, chunk_size, fault; kdev_t dev; int i=1; + printk("i plan to phase this out --mingo\n"); + if(ints[0] < 4) { - printk ("md: Too few Arguments (%d).\n", ints[0]); + printk (KERN_WARNING "md: Too few Arguments (%d).\n", ints[0]); return; } minor=ints[i++]; - if (minor >= MAX_MD_DEV) { - printk ("md: Minor device number too high.\n"); + if ((unsigned int)minor >= MAX_MD_DEVS) { + printk (KERN_WARNING "md: Minor device number too high.\n"); return; } @@ -1254,18 +3858,20 @@ case -1: #ifdef CONFIG_MD_LINEAR pers = LINEAR; - printk ("md: Setting up md%d as linear device.\n",minor); + printk (KERN_INFO "md: Setting up md%d as linear device.\n", + minor); #else - printk ("md: Linear mode not configured." + printk (KERN_WARNING "md: Linear mode not configured." "Recompile the kernel with linear mode enabled!\n"); #endif break; case 0: pers = STRIPED; #ifdef CONFIG_MD_STRIPED - printk ("md: Setting up md%d as a striped device.\n",minor); + printk (KERN_INFO "md: Setting up md%d as a striped device.\n", + minor); #else - printk ("md: Striped mode not configured." + printk (KERN_WARNING "md: Striped mode not configured." 
"Recompile the kernel with striped mode enabled!\n"); #endif break; @@ -1280,79 +3886,145 @@ break; */ default: - printk ("md: Unknown or not supported raid level %d.\n", ints[--i]); + printk (KERN_WARNING "md: Unknown or not supported raid level %d.\n", ints[--i]); return; } - if(pers) { + if (pers) { - factor=ints[i++]; /* Chunksize */ - fault =ints[i++]; /* Faultlevel */ + chunk_size = ints[i++]; /* Chunksize */ + fault = ints[i++]; /* Faultlevel */ - pers=pers | factor | (fault << FAULT_SHIFT); + pers = pers | chunk_size | (fault << FAULT_SHIFT); - while( str && (dev = name_to_kdev_t(str))) { - do_md_add (minor, dev); - if((str = strchr (str, ',')) != NULL) - str++; - } + while( str && (dev = name_to_kdev_t(str))) { + do_md_add (minor, dev); + if((str = strchr (str, ',')) != NULL) + str++; + } - do_md_run (minor, pers); - printk ("md: Loading md%d.\n",minor); + do_md_run (minor, pers); + printk (KERN_INFO "md: Loading md%d.\n",minor); } - +#endif } #endif +void hsm_init (void); +void translucent_init (void); void linear_init (void); void raid0_init (void); void raid1_init (void); void raid5_init (void); -__initfunc(int md_init (void)) +md__initfunc(int md_init (void)) { - printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n", - MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION, - MAX_MD_DEV, MAX_REAL); - - if (register_blkdev (MD_MAJOR, "md", &md_fops)) - { - printk ("Unable to get major %d for md\n", MD_MAJOR); - return (-1); - } - - blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST; - blk_dev[MD_MAJOR].current_request=NULL; - read_ahead[MD_MAJOR]=INT_MAX; - memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev)); - md_gendisk.next=gendisk_head; - - gendisk_head=&md_gendisk; - -#if SUPPORT_RECONSTRUCTION - if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL) - printk("md: bug: md_sync_thread == NULL\n"); -#endif /* SUPPORT_RECONSTRUCTION */ + static char * name = "mdrecoveryd"; + + printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL); + + if (register_blkdev (MD_MAJOR, "md", &md_fops)) + { + printk (KERN_ALERT "Unable to get major %d for md\n", MD_MAJOR); + return (-1); + } + + blk_dev[MD_MAJOR].request_fn = DEVICE_REQUEST; + blk_dev[MD_MAJOR].current_request = NULL; + read_ahead[MD_MAJOR] = INT_MAX; + md_gendisk.next = gendisk_head; + + gendisk_head = &md_gendisk; + + md_recovery_thread = md_register_thread(md_do_recovery, NULL, name); + if (!md_recovery_thread) + printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n"); + md_register_reboot_notifier(&md_notifier); + md_register_sysctl(); + +#ifdef CONFIG_MD_HSM + hsm_init (); +#endif +#ifdef CONFIG_MD_TRANSLUCENT + translucent_init (); +#endif #ifdef CONFIG_MD_LINEAR - linear_init (); + linear_init (); #endif #ifdef CONFIG_MD_STRIPED - raid0_init (); + raid0_init (); #endif #ifdef CONFIG_MD_MIRRORING - raid1_init (); + raid1_init (); #endif #ifdef CONFIG_MD_RAID5 - raid5_init (); + raid5_init (); +#endif +#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE) + /* + * pick a XOR routine, runtime. 
+ */ + calibrate_xor_block(); #endif - return (0); + + return (0); } #ifdef CONFIG_MD_BOOT -__initfunc(void md_setup_drive(void)) +md__initfunc(void md_setup_drive(void)) { if(md_setup_args.set) do_md_setup(md_setup_args.str, md_setup_args.ints); } #endif + +MD_EXPORT_SYMBOL(md_size); +MD_EXPORT_SYMBOL(register_md_personality); +MD_EXPORT_SYMBOL(unregister_md_personality); +MD_EXPORT_SYMBOL(partition_name); +MD_EXPORT_SYMBOL(md_error); +MD_EXPORT_SYMBOL(md_recover_arrays); +MD_EXPORT_SYMBOL(md_register_thread); +MD_EXPORT_SYMBOL(md_unregister_thread); +MD_EXPORT_SYMBOL(md_update_sb); +MD_EXPORT_SYMBOL(md_map); +MD_EXPORT_SYMBOL(md_wakeup_thread); +MD_EXPORT_SYMBOL(md_do_sync); +MD_EXPORT_SYMBOL(md_print_devices); +MD_EXPORT_SYMBOL(find_rdev_nr); +MD_EXPORT_SYMBOL(md_check_ordering); +MD_EXPORT_SYMBOL(md_interrupt_thread); +MD_EXPORT_SYMBOL(mddev_map); + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_md = { + PROC_MD, 6, "mdstat", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_array_inode_operations, +}; +#endif + +static void md_geninit (struct gendisk *gdisk) +{ + int i; + + for(i = 0; i < MAX_MD_DEVS; i++) { + md_blocksizes[i] = 1024; + md_maxreadahead[i] = MD_READAHEAD; + md_gendisk.part[i].start_sect = -1; /* avoid partition check */ + md_gendisk.part[i].nr_sects = 0; + } + + printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + + blksize_size[MD_MAJOR] = md_blocksizes; + md_set_global_readahead(md_maxreadahead); + +#ifdef CONFIG_PROC_FS + proc_register(&proc_root, &proc_md); +#endif +} + diff -ruN linux.orig/drivers/block/raid0.c linux-2.2.16/drivers/block/raid0.c --- linux.orig/drivers/block/raid0.c Tue Jan 4 19:12:14 2000 +++ linux-2.2.16/drivers/block/raid0.c Fri Jun 9 11:37:45 2000 @@ -1,4 +1,3 @@ - /* raid0.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc ZYNGIER @@ -18,146 +17,201 @@ */ #include -#include -#include -#include +#include #define MAJOR_NR MD_MAJOR #define MD_DRIVER #define MD_PERSONALITY -static int create_strip_zones (int minor, struct md_dev *mddev) +static int create_strip_zones (mddev_t *mddev) { - int i, j, c=0; - int current_offset=0; - struct real_dev *smallest_by_zone; - struct raid0_data *data=(struct raid0_data *) mddev->private; - - data->nr_strip_zones=1; - - for (i=1; inb_dev; i++) - { - for (j=0; jdevices[i].size==mddev->devices[j].size) - { - c=1; - break; - } - - if (!c) - data->nr_strip_zones++; - - c=0; - } - - if ((data->strip_zone=vmalloc(sizeof(struct strip_zone)*data->nr_strip_zones)) == NULL) - return 1; - - data->smallest=NULL; - - for (i=0; inr_strip_zones; i++) - { - data->strip_zone[i].dev_offset=current_offset; - smallest_by_zone=NULL; - c=0; - - for (j=0; jnb_dev; j++) - if (mddev->devices[j].size>current_offset) - { - data->strip_zone[i].dev[c++]=mddev->devices+j; - if (!smallest_by_zone || - smallest_by_zone->size > mddev->devices[j].size) - smallest_by_zone=mddev->devices+j; - } - - data->strip_zone[i].nb_dev=c; - data->strip_zone[i].size=(smallest_by_zone->size-current_offset)*c; - - if (!data->smallest || - data->smallest->size > data->strip_zone[i].size) - data->smallest=data->strip_zone+i; - - data->strip_zone[i].zone_offset=i ? 
(data->strip_zone[i-1].zone_offset+ - data->strip_zone[i-1].size) : 0; - current_offset=smallest_by_zone->size; - } - return 0; + int i, c, j, j1, j2; + int current_offset, curr_zone_offset; + raid0_conf_t *conf = mddev_to_conf(mddev); + mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; + + /* + * The number of 'same size groups' + */ + conf->nr_strip_zones = 0; + + ITERATE_RDEV_ORDERED(mddev,rdev1,j1) { + printk("raid0: looking at %s\n", partition_name(rdev1->dev)); + c = 0; + ITERATE_RDEV_ORDERED(mddev,rdev2,j2) { + printk("raid0: comparing %s(%d) with %s(%d)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size); + if (rdev2 == rdev1) { + printk("raid0: END\n"); + break; + } + if (rdev2->size == rdev1->size) + { + /* + * Not unique, dont count it as a new + * group + */ + printk("raid0: EQUAL\n"); + c = 1; + break; + } + printk("raid0: NOT EQUAL\n"); + } + if (!c) { + printk("raid0: ==> UNIQUE\n"); + conf->nr_strip_zones++; + printk("raid0: %d zones\n", conf->nr_strip_zones); + } + } + printk("raid0: FINAL %d zones\n", conf->nr_strip_zones); + + conf->strip_zone = vmalloc(sizeof(struct strip_zone)* + conf->nr_strip_zones); + if (!conf->strip_zone) + return 1; + + + conf->smallest = NULL; + current_offset = 0; + curr_zone_offset = 0; + + for (i = 0; i < conf->nr_strip_zones; i++) + { + struct strip_zone *zone = conf->strip_zone + i; + + printk("zone %d\n", i); + zone->dev_offset = current_offset; + smallest = NULL; + c = 0; + + ITERATE_RDEV_ORDERED(mddev,rdev,j) { + + printk(" checking %s ...", partition_name(rdev->dev)); + if (rdev->size > current_offset) + { + printk(" contained as device %d\n", c); + zone->dev[c] = rdev; + c++; + if (!smallest || (rdev->size size)) { + smallest = rdev; + printk(" (%d) is smallest!.\n", rdev->size); + } + } else + printk(" nope.\n"); + } + + zone->nb_dev = c; + zone->size = (smallest->size - current_offset) * c; + printk(" zone->nb_dev: %d, size: %d\n",zone->nb_dev,zone->size); + + if (!conf->smallest || (zone->size < conf->smallest->size)) + conf->smallest = zone; + + zone->zone_offset = curr_zone_offset; + curr_zone_offset += zone->size; + + current_offset = smallest->size; + printk("current zone offset: %d\n", current_offset); + } + printk("done.\n"); + return 0; } -static int raid0_run (int minor, struct md_dev *mddev) +static int raid0_run (mddev_t *mddev) { - int cur=0, i=0, size, zone0_size, nb_zone; - struct raid0_data *data; - - MOD_INC_USE_COUNT; + int cur=0, i=0, size, zone0_size, nb_zone; + raid0_conf_t *conf; - if ((mddev->private=vmalloc (sizeof (struct raid0_data))) == NULL) return 1; - data=(struct raid0_data *) mddev->private; - - if (create_strip_zones (minor, mddev)) - { - vfree(data); - return 1; - } - - nb_zone=data->nr_zones= - md_size[minor]/data->smallest->size + - (md_size[minor]%data->smallest->size ? 
1 : 0); - - printk ("raid0 : Allocating %ld bytes for hash.\n",(long)sizeof(struct raid0_hash)*nb_zone); - if ((data->hash_table=vmalloc (sizeof (struct raid0_hash)*nb_zone)) == NULL) - { - vfree(data->strip_zone); - vfree(data); - return 1; - } - size=data->strip_zone[cur].size; - - i=0; - while (curnr_strip_zones) - { - data->hash_table[i].zone0=data->strip_zone+cur; - - if (size>=data->smallest->size)/* If we completely fill the slot */ - { - data->hash_table[i++].zone1=NULL; - size-=data->smallest->size; - - if (!size) - { - if (++cur==data->nr_strip_zones) continue; - size=data->strip_zone[cur].size; - } - - continue; - } - - if (++cur==data->nr_strip_zones) /* Last dev, set unit1 as NULL */ - { - data->hash_table[i].zone1=NULL; - continue; - } - - zone0_size=size; /* Here, we use a 2nd dev to fill the slot */ - size=data->strip_zone[cur].size; - data->hash_table[i++].zone1=data->strip_zone+cur; - size-=(data->smallest->size - zone0_size); - } + MOD_INC_USE_COUNT; - return (0); + conf = vmalloc(sizeof (raid0_conf_t)); + if (!conf) + goto out; + mddev->private = (void *)conf; + + if (md_check_ordering(mddev)) { + printk("raid0: disks are not ordered, aborting!\n"); + goto out_free_conf; + } + + if (create_strip_zones (mddev)) + goto out_free_conf; + + printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]); + printk("raid0 : conf->smallest->size is %d blocks.\n", conf->smallest->size); + nb_zone = md_size[mdidx(mddev)]/conf->smallest->size + + (md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0); + printk("raid0 : nb_zone is %d.\n", nb_zone); + conf->nr_zones = nb_zone; + + printk("raid0 : Allocating %d bytes for hash.\n", + sizeof(struct raid0_hash)*nb_zone); + + conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone); + if (!conf->hash_table) + goto out_free_zone_conf; + size = conf->strip_zone[cur].size; + + i = 0; + while (cur < conf->nr_strip_zones) { + conf->hash_table[i].zone0 = conf->strip_zone + cur; + + /* + * If we completely fill the slot + */ + if (size >= conf->smallest->size) { + conf->hash_table[i++].zone1 = NULL; + size -= conf->smallest->size; + + if (!size) { + if (++cur == conf->nr_strip_zones) + continue; + size = conf->strip_zone[cur].size; + } + continue; + } + if (++cur == conf->nr_strip_zones) { + /* + * Last dev, set unit1 as NULL + */ + conf->hash_table[i].zone1=NULL; + continue; + } + + /* + * Here we use a 2nd dev to fill the slot + */ + zone0_size = size; + size = conf->strip_zone[cur].size; + conf->hash_table[i++].zone1 = conf->strip_zone + cur; + size -= (conf->smallest->size - zone0_size); + } + return 0; + +out_free_zone_conf: + vfree(conf->strip_zone); + conf->strip_zone = NULL; + +out_free_conf: + vfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return 1; } - -static int raid0_stop (int minor, struct md_dev *mddev) +static int raid0_stop (mddev_t *mddev) { - struct raid0_data *data=(struct raid0_data *) mddev->private; + raid0_conf_t *conf = mddev_to_conf(mddev); - vfree (data->hash_table); - vfree (data->strip_zone); - vfree (data); + vfree (conf->hash_table); + conf->hash_table = NULL; + vfree (conf->strip_zone); + conf->strip_zone = NULL; + vfree (conf); + mddev->private = NULL; - MOD_DEC_USE_COUNT; - return 0; + MOD_DEC_USE_COUNT; + return 0; } /* @@ -167,135 +221,140 @@ * Of course, those facts may not be valid anymore (and surely won't...) 
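
A worked example may help with create_strip_zones() and the hash table built just above (component sizes invented, all figures in blocks): three components of 100, 200 and 300 blocks yield three zones.

    /*
     * Hypothetical layout for components of 100, 200 and 300 blocks:
     *
     *   zone  members                   dev_offset  zone_offset  size
     *    0    all three components           0           0       300   (3 * 100)
     *    1    the 200- and 300-block       100         300       200   (2 * 100)
     *    2    the 300-block only           200         500       100   (1 * 100)
     *
     * conf->smallest is zone 2 (size 100), so with md_size = 600 blocks
     * raid0_run() allocates 600 / 100 = 6 hash slots; each slot covers 100
     * blocks and records the one or (at a boundary) two zones it can map to.
     */
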
* Hey guys, there's some work out there ;-) */ -static int raid0_map (struct md_dev *mddev, kdev_t *rdev, +static int raid0_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, unsigned long *rsector, unsigned long size) { - struct raid0_data *data=(struct raid0_data *) mddev->private; - static struct raid0_hash *hash; - struct strip_zone *zone; - struct real_dev *tmp_dev; - int blk_in_chunk, factor, chunk, chunk_size; - long block, rblock; - - factor=FACTOR(mddev); - chunk_size=(1UL << FACTOR_SHIFT(factor)); - block=*rsector >> 1; - hash=data->hash_table+(block/data->smallest->size); - - if (hash - data->hash_table > data->nr_zones) - { - printk(KERN_DEBUG "raid0_map: invalid block %ul\n", block); - return -1; - } - - /* Sanity check */ - if ((chunk_size*2)<(*rsector % (chunk_size*2))+size) - { - printk ("raid0_convert : can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size); - return (-1); - } - - if (block >= (hash->zone0->size + - hash->zone0->zone_offset)) - { - if (!hash->zone1) - { - printk ("raid0_convert : hash->zone1==NULL for block %ld\n", block); - return (-1); - } - - zone=hash->zone1; - } - else - zone=hash->zone0; + raid0_conf_t *conf = mddev_to_conf(mddev); + struct raid0_hash *hash; + struct strip_zone *zone; + mdk_rdev_t *tmp_dev; + int blk_in_chunk, chunksize_bits, chunk, chunk_size; + long block, rblock; + + chunk_size = mddev->param.chunk_size >> 10; + chunksize_bits = ffz(~chunk_size); + block = *rsector >> 1; + hash = conf->hash_table + block / conf->smallest->size; + + if (hash - conf->hash_table > conf->nr_zones) { + printk(KERN_DEBUG "raid0_map: invalid block %ul\n", block); + return -1; + } + + /* Sanity check */ + if ((chunk_size * 2) < (*rsector % (chunk_size * 2)) + size) + goto bad_map; + + if (!hash) + goto bad_hash; + + if (!hash->zone0) + goto bad_zone0; + + if (block >= (hash->zone0->size + hash->zone0->zone_offset)) { + if (!hash->zone1) + goto bad_zone1; + zone = hash->zone1; + } else + zone = hash->zone0; - blk_in_chunk=block & (chunk_size -1); - chunk=(block - zone->zone_offset) / (zone->nb_dev<dev[(block >> FACTOR_SHIFT(factor)) % zone->nb_dev]; - rblock=(chunk << FACTOR_SHIFT(factor)) + blk_in_chunk + zone->dev_offset; + blk_in_chunk = block & (chunk_size -1); + chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits); + tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev]; + rblock = (chunk << chunksize_bits) + blk_in_chunk + zone->dev_offset; - *rdev=tmp_dev->dev; - *rsector=rblock<<1; + *rdev = tmp_dev->dev; + *rsector = rblock << 1; - return (0); + return 0; + +bad_map: + printk ("raid0_map bug: can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size); + return -1; +bad_hash: + printk("raid0_map bug: hash==NULL for block %ld\n", block); + return -1; +bad_zone0: + printk ("raid0_map bug: hash->zone0==NULL for block %ld\n", block); + return -1; +bad_zone1: + printk ("raid0_map bug: hash->zone1==NULL for block %ld\n", block); + return -1; } -static int raid0_status (char *page, int minor, struct md_dev *mddev) +static int raid0_status (char *page, mddev_t *mddev) { - int sz=0; + int sz = 0; #undef MD_DEBUG #ifdef MD_DEBUG - int j, k; - struct raid0_data *data=(struct raid0_data *) mddev->private; + int j, k; + raid0_conf_t *conf = mddev_to_conf(mddev); - sz+=sprintf (page+sz, " "); - for (j=0; jnr_zones; j++) - { - sz+=sprintf (page+sz, "[z%d", - data->hash_table[j].zone0-data->strip_zone); - if (data->hash_table[j].zone1) - sz+=sprintf (page+sz, "/z%d] ", - 
data->hash_table[j].zone1-data->strip_zone); - else - sz+=sprintf (page+sz, "] "); - } + sz += sprintf(page + sz, " "); + for (j = 0; j < conf->nr_zones; j++) { + sz += sprintf(page + sz, "[z%d", + conf->hash_table[j].zone0 - conf->strip_zone); + if (conf->hash_table[j].zone1) + sz += sprintf(page+sz, "/z%d] ", + conf->hash_table[j].zone1 - conf->strip_zone); + else + sz += sprintf(page+sz, "] "); + } - sz+=sprintf (page+sz, "\n"); + sz += sprintf(page + sz, "\n"); - for (j=0; jnr_strip_zones; j++) - { - sz+=sprintf (page+sz, " z%d=[", j); - for (k=0; kstrip_zone[j].nb_dev; k++) - sz+=sprintf (page+sz, "%s/", - partition_name(data->strip_zone[j].dev[k]->dev)); - sz--; - sz+=sprintf (page+sz, "] zo=%d do=%d s=%d\n", - data->strip_zone[j].zone_offset, - data->strip_zone[j].dev_offset, - data->strip_zone[j].size); - } + for (j = 0; j < conf->nr_strip_zones; j++) { + sz += sprintf(page + sz, " z%d=[", j); + for (k = 0; k < conf->strip_zone[j].nb_dev; k++) + sz += sprintf (page+sz, "%s/", partition_name( + conf->strip_zone[j].dev[k]->dev)); + sz--; + sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n", + conf->strip_zone[j].zone_offset, + conf->strip_zone[j].dev_offset, + conf->strip_zone[j].size); + } #endif - sz+=sprintf (page+sz, " %dk chunks", 1<param.chunk_size/1024); + return sz; } - -static struct md_personality raid0_personality= +static mdk_personality_t raid0_personality= { - "raid0", - raid0_map, - NULL, /* no special make_request */ - NULL, /* no special end_request */ - raid0_run, - raid0_stop, - raid0_status, - NULL, /* no ioctls */ - 0, - NULL, /* no error_handler */ - NULL, /* hot_add_disk */ - NULL, /* hot_remove_disk */ - NULL /* mark_spare */ + "raid0", + raid0_map, + NULL, /* no special make_request */ + NULL, /* no special end_request */ + raid0_run, + raid0_stop, + raid0_status, + NULL, /* no ioctls */ + 0, + NULL, /* no error_handler */ + NULL, /* no diskop */ + NULL, /* no stop resync */ + NULL /* no restart resync */ }; - #ifndef MODULE void raid0_init (void) { - register_md_personality (RAID0, &raid0_personality); + register_md_personality (RAID0, &raid0_personality); } #else int init_module (void) { - return (register_md_personality (RAID0, &raid0_personality)); + return (register_md_personality (RAID0, &raid0_personality)); } void cleanup_module (void) { - unregister_md_personality (RAID0); + unregister_md_personality (RAID0); } #endif + diff -ruN linux.orig/drivers/block/raid1.c linux-2.2.16/drivers/block/raid1.c --- linux.orig/drivers/block/raid1.c Thu May 4 02:16:33 2000 +++ linux-2.2.16/drivers/block/raid1.c Fri Jun 9 11:37:45 2000 @@ -1,6 +1,6 @@ -/************************************************************************ +/* * raid1.c : Multiple Devices driver for Linux - * Copyright (C) 1996 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman * * RAID-1 management functions. * @@ -15,50 +15,52 @@ */ #include -#include #include -#include -#include -#include +#include #include #define MAJOR_NR MD_MAJOR #define MD_DRIVER #define MD_PERSONALITY -/* - * The following can be used to debug the driver - */ -/*#define RAID1_DEBUG*/ -#ifdef RAID1_DEBUG -#define PRINTK(x) do { printk x; } while (0); -#else -#define PRINTK(x) do { ; } while (0); -#endif +#define MAX_LINEAR_SECTORS 128 #define MAX(a,b) ((a) > (b) ? (a) : (b)) #define MIN(a,b) ((a) < (b) ? 
(a) : (b)) -static struct md_personality raid1_personality; -static struct md_thread *raid1_thread = NULL; +static mdk_personality_t raid1_personality; struct buffer_head *raid1_retry_list = NULL; -static int __raid1_map (struct md_dev *mddev, kdev_t *rdev, +static void * raid1_kmalloc (int size) +{ + void * ptr; + /* + * now we are rather fault tolerant than nice, but + * there are a couple of places in the RAID code where we + * simply can not afford to fail an allocation because + * there is no failure return path (eg. make_request()) + */ + while (!(ptr = kmalloc (sizeof (raid1_conf_t), GFP_KERNEL))) + printk ("raid1: out of memory, retrying...\n"); + + memset(ptr, 0, size); + return ptr; +} + +static int __raid1_map (mddev_t *mddev, kdev_t *rdev, unsigned long *rsector, unsigned long size) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - int i, n = raid_conf->raid_disks; + raid1_conf_t *conf = mddev_to_conf(mddev); + int i, disks = MD_SB_DISKS; /* * Later we do read balancing on the read side * now we use the first available disk. */ - PRINTK(("raid1_map().\n")); - - for (i=0; imirrors[i].operational) { - *rdev = raid_conf->mirrors[i].dev; + for (i = 0; i < disks; i++) { + if (conf->mirrors[i].operational) { + *rdev = conf->mirrors[i].dev; return (0); } } @@ -67,29 +69,29 @@ return (-1); } -static int raid1_map (struct md_dev *mddev, kdev_t *rdev, +static int raid1_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, unsigned long *rsector, unsigned long size) { return 0; } -void raid1_reschedule_retry (struct buffer_head *bh) +static void raid1_reschedule_retry (struct buffer_head *bh) { struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id); - - PRINTK(("raid1_reschedule_retry().\n")); + mddev_t *mddev = r1_bh->mddev; + raid1_conf_t *conf = mddev_to_conf(mddev); r1_bh->next_retry = raid1_retry_list; raid1_retry_list = bh; - md_wakeup_thread(raid1_thread); + md_wakeup_thread(conf->thread); } /* - * raid1_end_buffer_io() is called when we have finished servicing a mirrored + * raid1_end_bh_io() is called when we have finished servicing a mirrored * operation and are ready to return a success/failure code to the buffer * cache layer. */ -static inline void raid1_end_buffer_io(struct raid1_bh *r1_bh, int uptodate) +static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate) { struct buffer_head *bh = r1_bh->master_bh; @@ -97,8 +99,6 @@ kfree(r1_bh); } -int raid1_one_error=0; - void raid1_end_request (struct buffer_head *bh, int uptodate) { struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id); @@ -106,12 +106,7 @@ save_flags(flags); cli(); - PRINTK(("raid1_end_request().\n")); - if (raid1_one_error) { - raid1_one_error=0; - uptodate=0; - } /* * this branch is our 'one mirror IO has finished' event handler: */ @@ -136,15 +131,11 @@ */ if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) { - - PRINTK(("raid1_end_request(), read branch.\n")); - /* * we have only one buffer_head on the read side */ if (uptodate) { - PRINTK(("raid1_end_request(), read branch, uptodate.\n")); - raid1_end_buffer_io(r1_bh, uptodate); + raid1_end_bh_io(r1_bh, uptodate); restore_flags(flags); return; } @@ -152,71 +143,56 @@ * oops, read error: */ printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", - kdevname(bh->b_dev), bh->b_blocknr); - raid1_reschedule_retry (bh); + partition_name(bh->b_dev), bh->b_blocknr); + raid1_reschedule_retry(bh); restore_flags(flags); return; } /* - * WRITE or WRITEA. 
- */ - PRINTK(("raid1_end_request(), write branch.\n")); - - /* + * WRITE: + * * Let's see if all mirrored write operations have finished - * already [we have irqs off, so we can decrease]: + * already. */ - if (!--r1_bh->remaining) { - struct md_dev *mddev = r1_bh->mddev; - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - int i, n = raid_conf->raid_disks; + if (atomic_dec_and_test(&r1_bh->remaining)) { + int i, disks = MD_SB_DISKS; - PRINTK(("raid1_end_request(), remaining == 0.\n")); + for ( i = 0; i < disks; i++) + if (r1_bh->mirror_bh[i]) + kfree(r1_bh->mirror_bh[i]); - for ( i=0; imirror_bh[i]) kfree(r1_bh->mirror_bh[i]); - - raid1_end_buffer_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state)); + raid1_end_bh_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state)); } - else PRINTK(("raid1_end_request(), remaining == %u.\n", r1_bh->remaining)); restore_flags(flags); } -/* This routine checks if the undelying device is an md device and in that - * case it maps the blocks before putting the request on the queue +/* + * This routine checks if the undelying device is an md device + * and in that case it maps the blocks before putting the + * request on the queue */ -static inline void -map_and_make_request (int rw, struct buffer_head *bh) +static void map_and_make_request (int rw, struct buffer_head *bh) { if (MAJOR (bh->b_rdev) == MD_MAJOR) - md_map (MINOR (bh->b_rdev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9); + md_map (bh->b_rdev, &bh->b_rdev, + &bh->b_rsector, bh->b_size >> 9); clear_bit(BH_Lock, &bh->b_state); make_request (MAJOR (bh->b_rdev), rw, bh); } -static int -raid1_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh) +static int raid1_make_request (mddev_t *mddev, int rw, + struct buffer_head * bh) { - - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req; struct raid1_bh * r1_bh; - int n = raid_conf->raid_disks, i, sum_bhs = 0, switch_disks = 0, sectors; + int disks = MD_SB_DISKS; + int i, sum_bhs = 0, switch_disks = 0, sectors, lowprio = 0; struct mirror_info *mirror; - PRINTK(("raid1_make_request().\n")); - - while (!( /* FIXME: now we are rather fault tolerant than nice */ - r1_bh = kmalloc (sizeof (struct raid1_bh), GFP_KERNEL) - ) ) - { - printk ("raid1_make_request(#1): out of memory\n"); - current->policy |= SCHED_YIELD; - schedule(); - } - memset (r1_bh, 0, sizeof (struct raid1_bh)); + r1_bh = raid1_kmalloc (sizeof (struct raid1_bh)); /* * make_request() can abort the operation when READA or WRITEA are being @@ -227,43 +203,65 @@ if (rw == READA) rw = READ; if (rw == WRITEA) rw = WRITE; - if (rw == WRITE || rw == WRITEA) - mark_buffer_clean(bh); /* Too early ? */ + if (rw == WRITE) { + /* + * Too early ? + */ + mark_buffer_clean(bh); + /* + * not too early. we _first_ clean the bh, then we start + * the IO, then when the IO has finished, we unlock the + * bh and mark it uptodate. This way we do not miss the + * case when the bh got dirty again during the IO. + */ + } + + /* + * special flag for 'lowprio' reconstruction requests ... + */ + if (buffer_lowprio(bh)) + lowprio = 1; /* - * i think the read and write branch should be separated completely, since we want - * to do read balancing on the read side for example. Comments? :) --mingo + * i think the read and write branch should be separated completely, + * since we want to do read balancing on the read side for example. + * Comments? 
:) --mingo */ r1_bh->master_bh=bh; r1_bh->mddev=mddev; r1_bh->cmd = rw; - if (rw==READ || rw==READA) { - int last_used = raid_conf->last_used; - PRINTK(("raid1_make_request(), read branch.\n")); - mirror = raid_conf->mirrors + last_used; + if (rw==READ) { + int last_used = conf->last_used; + + /* + * read balancing logic: + */ + mirror = conf->mirrors + last_used; bh->b_rdev = mirror->dev; sectors = bh->b_size >> 9; - if (bh->b_blocknr * sectors == raid_conf->next_sect) { - raid_conf->sect_count += sectors; - if (raid_conf->sect_count >= mirror->sect_limit) + + if (bh->b_blocknr * sectors == conf->next_sect) { + conf->sect_count += sectors; + if (conf->sect_count >= mirror->sect_limit) switch_disks = 1; } else switch_disks = 1; - raid_conf->next_sect = (bh->b_blocknr + 1) * sectors; - if (switch_disks) { - PRINTK(("read-balancing: switching %d -> %d (%d sectors)\n", last_used, mirror->next, raid_conf->sect_count)); - raid_conf->sect_count = 0; - last_used = raid_conf->last_used = mirror->next; + conf->next_sect = (bh->b_blocknr + 1) * sectors; + /* + * Do not switch disks if full resync is in progress ... + */ + if (switch_disks && !conf->resync_mirrors) { + conf->sect_count = 0; + last_used = conf->last_used = mirror->next; /* - * Do not switch to write-only disks ... resyncing - * is in progress + * Do not switch to write-only disks ... + * reconstruction is in progress */ - while (raid_conf->mirrors[last_used].write_only) - raid_conf->last_used = raid_conf->mirrors[last_used].next; + while (conf->mirrors[last_used].write_only) + conf->last_used = conf->mirrors[last_used].next; } - PRINTK (("raid1 read queue: %d %d\n", MAJOR (bh->b_rdev), MINOR (bh->b_rdev))); bh_req = &r1_bh->bh_req; memcpy(bh_req, bh, sizeof(*bh)); bh_req->b_end_io = raid1_end_request; @@ -273,13 +271,12 @@ } /* - * WRITE or WRITEA. + * WRITE: */ - PRINTK(("raid1_make_request(n=%d), write branch.\n",n)); - for (i = 0; i < n; i++) { + for (i = 0; i < disks; i++) { - if (!raid_conf->mirrors [i].operational) { + if (!conf->mirrors[i].operational) { /* * the r1_bh->mirror_bh[i] pointer remains NULL */ @@ -287,89 +284,91 @@ continue; } + /* + * special case for reconstruction ... + */ + if (lowprio && (i == conf->last_used)) { + mirror_bh[i] = NULL; + continue; + } + + /* + * We should use a private pool (size depending on NR_REQUEST), + * to avoid writes filling up the memory with bhs + * + * Such pools are much faster than kmalloc anyways (so we waste + * almost nothing by not using the master bh when writing and + * win alot of cleanness) but for now we are cool enough. --mingo + * + * It's safe to sleep here, buffer heads cannot be used in a shared + * manner in the write branch. 
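
To make the read-balancing bookkeeping above concrete, here is a standalone model of it: stay on the current mirror while reads are sequential, and move to the next mirror in the ring once sect_limit sectors have been issued. The sect_limit value and the workload are invented, and the write_only and resync_mirrors checks are left out:

    /* Toy model of the raid1 read balancing above; values are invented and
     * the write_only / resync special cases are omitted. */
    #include <stdio.h>

    int main(void)
    {
        int last_used = 0, next[2] = { 1, 0 };     /* two mirrors in a ring     */
        unsigned long next_sect = 0, sect_count = 0, sect_limit = 128;
        unsigned long blocknr, sectors = 2;        /* 1K blocks, 512B sectors   */

        for (blocknr = 0; blocknr < 200; blocknr++) {   /* sequential reads     */
            int switch_disks = 0;

            if (blocknr * sectors == next_sect) {
                sect_count += sectors;
                if (sect_count >= sect_limit)
                    switch_disks = 1;              /* stayed here long enough   */
            } else
                switch_disks = 1;                  /* non-sequential: rebalance */
            next_sect = (blocknr + 1) * sectors;

            if (switch_disks) {
                sect_count = 0;
                last_used = next[last_used];
                printf("after block %lu switch to mirror %d\n",
                       blocknr, last_used);
            }
        }
        return 0;   /* switches mirrors every 64 blocks (128 sectors) */
    }
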
Look how we lock the buffer at the + * beginning of this function to grok the difference ;) + */ + mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head)); + /* + * prepare mirrored bh (fields ordered for max mem throughput): + */ + mirror_bh[i]->b_blocknr = bh->b_blocknr; + mirror_bh[i]->b_dev = bh->b_dev; + mirror_bh[i]->b_rdev = conf->mirrors[i].dev; + mirror_bh[i]->b_rsector = bh->b_rsector; + mirror_bh[i]->b_state = (1<b_state |= (1<b_count = 1; + mirror_bh[i]->b_size = bh->b_size; + mirror_bh[i]->b_data = bh->b_data; + mirror_bh[i]->b_list = BUF_LOCKED; + mirror_bh[i]->b_end_io = raid1_end_request; + mirror_bh[i]->b_dev_id = r1_bh; + + r1_bh->mirror_bh[i] = mirror_bh[i]; + sum_bhs++; + } + + md_atomic_set(&r1_bh->remaining, sum_bhs); + /* - * We should use a private pool (size depending on NR_REQUEST), - * to avoid writes filling up the memory with bhs - * - * Such pools are much faster than kmalloc anyways (so we waste almost - * nothing by not using the master bh when writing and win alot of cleanness) - * - * but for now we are cool enough. --mingo - * - * It's safe to sleep here, buffer heads cannot be used in a shared - * manner in the write branch. Look how we lock the buffer at the beginning - * of this function to grok the difference ;) - */ - while (!( /* FIXME: now we are rather fault tolerant than nice */ - mirror_bh[i] = kmalloc (sizeof (struct buffer_head), GFP_KERNEL) - ) ) - { - printk ("raid1_make_request(#2): out of memory\n"); - current->policy |= SCHED_YIELD; - schedule(); - } - memset (mirror_bh[i], 0, sizeof (struct buffer_head)); - - /* - * prepare mirrored bh (fields ordered for max mem throughput): - */ - mirror_bh [i]->b_blocknr = bh->b_blocknr; - mirror_bh [i]->b_dev = bh->b_dev; - mirror_bh [i]->b_rdev = raid_conf->mirrors [i].dev; - mirror_bh [i]->b_rsector = bh->b_rsector; - mirror_bh [i]->b_state = (1<b_count = 1; - mirror_bh [i]->b_size = bh->b_size; - mirror_bh [i]->b_data = bh->b_data; - mirror_bh [i]->b_list = BUF_LOCKED; - mirror_bh [i]->b_end_io = raid1_end_request; - mirror_bh [i]->b_dev_id = r1_bh; - - r1_bh->mirror_bh[i] = mirror_bh[i]; - sum_bhs++; - } - - r1_bh->remaining = sum_bhs; - - PRINTK(("raid1_make_request(), write branch, sum_bhs=%d.\n",sum_bhs)); - - /* - * We have to be a bit careful about the semaphore above, thats why we - * start the requests separately. Since kmalloc() could fail, sleep and - * make_request() can sleep too, this is the safer solution. Imagine, - * end_request decreasing the semaphore before we could have set it up ... - * We could play tricks with the semaphore (presetting it and correcting - * at the end if sum_bhs is not 'n' but we have to do end_request by hand - * if all requests finish until we had a chance to set up the semaphore - * correctly ... lots of races). - */ - for (i = 0; i < n; i++) - if (mirror_bh [i] != NULL) - map_and_make_request (rw, mirror_bh [i]); + * We have to be a bit careful about the semaphore above, thats + * why we start the requests separately. Since kmalloc() could + * fail, sleep and make_request() can sleep too, this is the + * safer solution. Imagine, end_request decreasing the semaphore + * before we could have set it up ... We could play tricks with + * the semaphore (presetting it and correcting at the end if + * sum_bhs is not 'n' but we have to do end_request by hand if + * all requests finish until we had a chance to set up the + * semaphore correctly ... lots of races). 
+ */ + for (i = 0; i < disks; i++) + if (mirror_bh[i]) + map_and_make_request(rw, mirror_bh[i]); return (0); } -static int raid1_status (char *page, int minor, struct md_dev *mddev) +static int raid1_status (char *page, mddev_t *mddev) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); int sz = 0, i; - sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks); - for (i = 0; i < raid_conf->raid_disks; i++) - sz += sprintf (page+sz, "%s", raid_conf->mirrors [i].operational ? "U" : "_"); + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, + conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + sz += sprintf (page+sz, "%s", + conf->mirrors[i].operational ? "U" : "_"); sz += sprintf (page+sz, "]"); return sz; } -static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index) +static void unlink_disk (raid1_conf_t *conf, int target) { - int disks = raid_conf->raid_disks; - int j; + int disks = MD_SB_DISKS; + int i; - for (j = 0; j < disks; j++) - if (raid_conf->mirrors [j].next == failed_index) - raid_conf->mirrors [j].next = raid_conf->mirrors [failed_index].next; + for (i = 0; i < disks; i++) + if (conf->mirrors[i].next == target) + conf->mirrors[i].next = conf->mirrors[target].next; } #define LAST_DISK KERN_ALERT \ @@ -388,48 +387,53 @@ #define ALREADY_SYNCING KERN_INFO \ "raid1: syncing already in progress.\n" -static int raid1_error (struct md_dev *mddev, kdev_t dev) +static void mark_disk_bad (mddev_t *mddev, int failed) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - struct mirror_info *mirror; - md_superblock_t *sb = mddev->sb; - int disks = raid_conf->raid_disks; - int i; + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror = conf->mirrors+failed; + mdp_super_t *sb = mddev->sb; + + mirror->operational = 0; + unlink_disk(conf, failed); + mark_disk_faulty(sb->disks+mirror->number); + mark_disk_nonsync(sb->disks+mirror->number); + mark_disk_inactive(sb->disks+mirror->number); + sb->active_disks--; + sb->working_disks--; + sb->failed_disks++; + mddev->sb_dirty = 1; + md_wakeup_thread(conf->thread); + conf->working_disks--; + printk (DISK_FAILED, partition_name (mirror->dev), + conf->working_disks); +} - PRINTK(("raid1_error called\n")); +static int raid1_error (mddev_t *mddev, kdev_t dev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info * mirrors = conf->mirrors; + int disks = MD_SB_DISKS; + int i; - if (raid_conf->working_disks == 1) { + if (conf->working_disks == 1) { /* * Uh oh, we can do nothing if this is our last disk, but * first check if this is a queued request for a device * which has just failed. 
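
The mirrors are threaded onto a small ring through their ->next fields, which the read balancing walks; unlink_disk() above splices a failed mirror out of that ring, and link_disk() further down re-inserts a freshly synced spare. For example:

    /*
     * Ring of three mirrors, m1 fails:
     *
     *   before unlink_disk(conf, 1):   m0 -> m1 -> m2 -> m0
     *   after:                         m0 -> m2 -> m0      (m1 unreachable)
     *
     * link_disk() later inserts a synced spare right after the first
     * read-operational mirror it finds.
     */
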
*/ - for (i = 0, mirror = raid_conf->mirrors; i < disks; - i++, mirror++) - if (mirror->dev == dev && !mirror->operational) + for (i = 0; i < disks; i++) { + if (mirrors[i].dev==dev && !mirrors[i].operational) return 0; + } printk (LAST_DISK); } else { - /* Mark disk as unusable */ - for (i = 0, mirror = raid_conf->mirrors; i < disks; - i++, mirror++) { - if (mirror->dev == dev && mirror->operational){ - mirror->operational = 0; - raid1_fix_links (raid_conf, i); - sb->disks[mirror->number].state |= - (1 << MD_FAULTY_DEVICE); - sb->disks[mirror->number].state &= - ~(1 << MD_SYNC_DEVICE); - sb->disks[mirror->number].state &= - ~(1 << MD_ACTIVE_DEVICE); - sb->active_disks--; - sb->working_disks--; - sb->failed_disks++; - mddev->sb_dirty = 1; - md_wakeup_thread(raid1_thread); - raid_conf->working_disks--; - printk (DISK_FAILED, kdevname (dev), - raid_conf->working_disks); + /* + * Mark disk as unusable + */ + for (i = 0; i < disks; i++) { + if (mirrors[i].dev==dev && mirrors[i].operational) { + mark_disk_bad (mddev, i); + break; } } } @@ -442,219 +446,396 @@ #undef START_SYNCING /* - * This is the personality-specific hot-addition routine + * Insert the spare disk into the drive-ring */ +static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror) +{ + int j, next; + int disks = MD_SB_DISKS; + struct mirror_info *p = conf->mirrors; + + for (j = 0; j < disks; j++, p++) + if (p->operational && !p->write_only) { + next = p->next; + p->next = mirror->raid_disk; + mirror->next = next; + return; + } -#define NO_SUPERBLOCK KERN_ERR \ -"raid1: cannot hot-add disk to the array with no RAID superblock\n" + printk("raid1: bug: no read-operational devices\n"); +} -#define WRONG_LEVEL KERN_ERR \ -"raid1: hot-add: level of disk is not RAID-1\n" +static void print_raid1_conf (raid1_conf_t *conf) +{ + int i; + struct mirror_info *tmp; -#define HOT_ADD_SUCCEEDED KERN_INFO \ -"raid1: device %s hot-added\n" + printk("RAID1 conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; + } + printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, + conf->raid_disks, conf->nr_disks); -static int raid1_hot_add_disk (struct md_dev *mddev, kdev_t dev) + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + partition_name(tmp->dev)); + } +} + +static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state) { + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; + raid1_conf_t *conf = mddev->private; + struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; unsigned long flags; - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - struct mirror_info *mirror; - md_superblock_t *sb = mddev->sb; - struct real_dev * realdev; - int n; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc, *added_desc; + + save_flags(flags); + cli(); + print_raid1_conf(conf); /* - * The device has its superblock already read and it was found - * to be consistent for generic RAID usage. Now we check whether - * it's usable for RAID-1 hot addition. + * find the disk ... */ + switch (state) { - n = mddev->nb_dev++; - realdev = &mddev->devices[n]; - if (!realdev->sb) { - printk (NO_SUPERBLOCK); - return -EINVAL; - } - if (realdev->sb->level != 1) { - printk (WRONG_LEVEL); - return -EINVAL; + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID1 configuration ... 
+ * (this can only be in the first conf->working_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->mirrors + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... (can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; } - /* FIXME: are there other things left we could sanity-check? */ + switch (state) { /* - * We have to disable interrupts, as our RAID-1 state is used - * from irq handlers as well. + * Switch the spare disk to write-only mode: */ - save_flags(flags); - cli(); + case DISKOP_SPARE_WRITE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. 
(only the first 'conf->nr_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: - raid_conf->raid_disks++; - mirror = raid_conf->mirrors+n; + sdisk = conf->mirrors + spare_disk; + fdisk = conf->mirrors + failed_disk; - mirror->number=n; - mirror->raid_disk=n; - mirror->dev=dev; - mirror->next=0; /* FIXME */ - mirror->sect_limit=128; - - mirror->operational=0; - mirror->spare=1; - mirror->write_only=0; - - sb->disks[n].state |= (1 << MD_FAULTY_DEVICE); - sb->disks[n].state &= ~(1 << MD_SYNC_DEVICE); - sb->disks[n].state &= ~(1 << MD_ACTIVE_DEVICE); - sb->nr_disks++; - sb->spare_disks++; + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; - restore_flags(flags); + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } - md_update_sb(MINOR(dev)); + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } - printk (HOT_ADD_SUCCEEDED, kdevname(realdev->dev)); + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } - return 0; -} + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } -#undef NO_SUPERBLOCK -#undef WRONG_LEVEL -#undef HOT_ADD_SUCCEEDED + /* + * do the switch finally + */ + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); -/* - * Insert the spare disk into the drive-ring - */ -static void add_ring(struct raid1_data *raid_conf, struct mirror_info *mirror) -{ - int j, next; - struct mirror_info *p = raid_conf->mirrors; + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); - for (j = 0; j < raid_conf->raid_disks; j++, p++) - if (p->operational && !p->write_only) { - next = p->next; - p->next = mirror->raid_disk; - mirror->next = next; - return; - } - printk("raid1: bug: no read-operational devices\n"); -} + *d = failed_desc; -static int raid1_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, - int state) -{ - int i = 0, failed_disk = -1; - struct raid1_data *raid_conf = mddev->private; - struct mirror_info *mirror = raid_conf->mirrors; - md_descriptor_t *descriptor; - unsigned long flags; + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + link_disk(conf, fdisk); - for (i = 0; i < MD_SB_DISKS; i++, mirror++) { - if (mirror->spare && mirror->number == spare->number) - goto found; - } - return 1; -found: - for (i = 0, mirror = raid_conf->mirrors; i < raid_conf->raid_disks; - i++, mirror++) - if (!mirror->operational) - failed_disk = i; + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. 
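DISKOP_SPARE_ACTIVE above swaps the complete spare and failed records with xchg_values() and then swaps the raid_disk and number fields back, so the activated disk takes over the failed disk's slot while linear numbering is preserved. A compact user-space model of that double swap (the slot struct below is a simplification, not the driver's mirror_info):

    #include <stdio.h>

    struct slot {
        int number;        /* index in the superblock's disk table */
        int raid_disk;     /* logical position within the array    */
        int operational;
        int spare;
    };

    static void swap_slots(struct slot *failed, struct slot *spare)
    {
        struct slot tmp;
        int n, r;

        /* swap the whole records ... */
        tmp = *failed; *failed = *spare; *spare = tmp;
        /* ... then swap back the identity fields, so the activated
         * disk inherits the failed disk's slot numbering. */
        n = failed->number;    failed->number = spare->number;       spare->number = n;
        r = failed->raid_disk; failed->raid_disk = spare->raid_disk;  spare->raid_disk = r;
    }

    int main(void)
    {
        struct slot failed = { 0, 0, 0, 0 };   /* dead slot in the active area */
        struct slot spare  = { 2, 2, 1, 1 };   /* freshly synced spare         */

        swap_slots(&failed, &spare);
        failed.spare = 0;                      /* now a full array member */
        printf("active slot: number=%d raid_disk=%d operational=%d\n",
               failed.number, failed.raid_disk, failed.operational);
        return 0;
    }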
+ */ - save_flags(flags); - cli(); - switch (state) { - case SPARE_WRITE: - mirror->operational = 1; - mirror->write_only = 1; - raid_conf->raid_disks = MAX(raid_conf->raid_disks, - mirror->raid_disk + 1); - break; - case SPARE_INACTIVE: - mirror->operational = 0; - mirror->write_only = 0; - break; - case SPARE_ACTIVE: - mirror->spare = 0; - mirror->write_only = 0; - raid_conf->working_disks++; - add_ring(raid_conf, mirror); - - if (failed_disk != -1) { - descriptor = &mddev->sb->disks[raid_conf->mirrors[failed_disk].number]; - i = spare->raid_disk; - spare->raid_disk = descriptor->raid_disk; - descriptor->raid_disk = i; - } - break; - default: - printk("raid1_mark_spare: bug: state == %d\n", state); - restore_flags(flags); - return 1; + conf->working_disks++; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->mirrors + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + conf->nr_disks--; + break; + + case DISKOP_HOT_ADD_DISK: + adisk = conf->mirrors + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + MD_BUG(); + err = 1; + goto abort; + } + + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->write_only = 0; + adisk->spare = 1; + adisk->used_slot = 1; + conf->nr_disks++; + + break; + + default: + MD_BUG(); + err = 1; + goto abort; } +abort: restore_flags(flags); - return 0; + print_raid1_conf(conf); + return err; } + +#define IO_ERROR KERN_ALERT \ +"raid1: %s: unrecoverable I/O read error for block %lu\n" + +#define REDIRECT_SECTOR KERN_ERR \ +"raid1: %s: redirecting sector %lu to another mirror\n" + /* * This is a kernel thread which: * * 1. Retries failed read operations on working mirrors. * 2. Updates the raid superblock when problems encounter. */ -void raid1d (void *data) +static void raid1d (void *data) { struct buffer_head *bh; kdev_t dev; unsigned long flags; - struct raid1_bh * r1_bh; - struct md_dev *mddev; + struct raid1_bh *r1_bh; + mddev_t *mddev; - PRINTK(("raid1d() active\n")); - save_flags(flags); - cli(); while (raid1_retry_list) { + save_flags(flags); + cli(); bh = raid1_retry_list; r1_bh = (struct raid1_bh *)(bh->b_dev_id); raid1_retry_list = r1_bh->next_retry; restore_flags(flags); - mddev = md_dev + MINOR(bh->b_dev); + mddev = kdev_to_mddev(bh->b_dev); if (mddev->sb_dirty) { - printk("dirty sb detected, updating.\n"); + printk(KERN_INFO "dirty sb detected, updating.\n"); mddev->sb_dirty = 0; - md_update_sb(MINOR(bh->b_dev)); + md_update_sb(mddev); } dev = bh->b_rdev; - __raid1_map (md_dev + MINOR(bh->b_dev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9); + __raid1_map (mddev, &bh->b_rdev, &bh->b_rsector, + bh->b_size >> 9); if (bh->b_rdev == dev) { - printk (KERN_ALERT - "raid1: %s: unrecoverable I/O read error for block %lu\n", - kdevname(bh->b_dev), bh->b_blocknr); - raid1_end_buffer_io(r1_bh, 0); + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); + raid1_end_bh_io(r1_bh, 0); } else { - printk (KERN_ERR "raid1: %s: redirecting sector %lu to another mirror\n", - kdevname(bh->b_dev), bh->b_blocknr); + printk (REDIRECT_SECTOR, + partition_name(bh->b_dev), bh->b_blocknr); map_and_make_request (r1_bh->cmd, bh); } - cli(); } - restore_flags(flags); +} +#undef IO_ERROR +#undef REDIRECT_SECTOR + +/* + * Private kernel thread to reconstruct mirrors after an unclean + * shutdown. 
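raid1d above retries a failed read by asking __raid1_map() for a new target; only if the device comes back unchanged is the block reported as an unrecoverable read error, otherwise the request is redirected to another mirror. A rough stand-alone model of that decision (the remap_read() helper is hypothetical; the driver works on buffer_heads and kdev_t values):

    #include <stdio.h>

    #define NMIRRORS 2

    /* 1 = mirror still usable, 0 = failed */
    static int operational[NMIRRORS] = { 0, 1 };

    /* pick a different working mirror for a failed read; -1 if none */
    static int remap_read(int failed_mirror)
    {
        int i;

        for (i = 0; i < NMIRRORS; i++)
            if (i != failed_mirror && operational[i])
                return i;
        return -1;
    }

    int main(void)
    {
        int target = remap_read(0);

        if (target < 0)
            printf("unrecoverable I/O read error\n");
        else
            printf("redirecting read to mirror %d\n", target);
        return 0;
    }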
+ */ +static void raid1syncd (void *data) +{ + raid1_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + + if (!conf->resync_mirrors) + return; + if (conf->resync_mirrors == 2) + return; + down(&mddev->recovery_sem); + if (md_do_sync(mddev, NULL)) { + up(&mddev->recovery_sem); + return; + } + /* + * Only if everything went Ok. + */ + conf->resync_mirrors = 0; + up(&mddev->recovery_sem); } + /* * This will catch the scenario in which one of the mirrors was * mounted as a normal device rather than as a part of a raid set. + * + * check_consistency is very personality-dependent, eg. RAID5 cannot + * do this check, it uses another method. */ -static int __check_consistency (struct md_dev *mddev, int row) +static int __check_consistency (mddev_t *mddev, int row) { - struct raid1_data *raid_conf = mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); + int disks = MD_SB_DISKS; kdev_t dev; struct buffer_head *bh = NULL; int i, rc = 0; char *buffer = NULL; - for (i = 0; i < raid_conf->raid_disks; i++) { - if (!raid_conf->mirrors[i].operational) + for (i = 0; i < disks; i++) { + printk("(checking disk %d)\n",i); + if (!conf->mirrors[i].operational) continue; - dev = raid_conf->mirrors[i].dev; + printk("(really checking disk %d)\n",i); + dev = conf->mirrors[i].dev; set_blocksize(dev, 4096); if ((bh = bread(dev, row / 4, 4096)) == NULL) break; @@ -683,167 +864,342 @@ return rc; } -static int check_consistency (struct md_dev *mddev) +static int check_consistency (mddev_t *mddev) { - int size = mddev->sb->size; - int row; + if (__check_consistency(mddev, 0)) +/* + * we do not do this currently, as it's perfectly possible to + * have an inconsistent array when it's freshly created. Only + * newly written data has to be consistent. + */ + return 0; - for (row = 0; row < size; row += size / 8) - if (__check_consistency(mddev, row)) - return 1; return 0; } -static int raid1_run (int minor, struct md_dev *mddev) +#define INVALID_LEVEL KERN_WARNING \ +"raid1: md%d: raid level not set to mirroring (%d)\n" + +#define NO_SB KERN_ERR \ +"raid1: disabled mirror %s (couldn't access raid superblock)\n" + +#define ERRORS KERN_ERR \ +"raid1: disabled mirror %s (errors detected)\n" + +#define NOT_IN_SYNC KERN_ERR \ +"raid1: disabled mirror %s (not in sync)\n" + +#define INCONSISTENT KERN_ERR \ +"raid1: disabled mirror %s (inconsistent descriptor)\n" + +#define ALREADY_RUNNING KERN_ERR \ +"raid1: disabled mirror %s (mirror %d already operational)\n" + +#define OPERATIONAL KERN_INFO \ +"raid1: device %s operational as mirror %d\n" + +#define MEM_ERROR KERN_ERR \ +"raid1: couldn't allocate memory for md%d\n" + +#define SPARE KERN_INFO \ +"raid1: spare disk %s\n" + +#define NONE_OPERATIONAL KERN_ERR \ +"raid1: no operational mirrors for md%d\n" + +#define RUNNING_CKRAID KERN_ERR \ +"raid1: detected mirror differences -- running resync\n" + +#define ARRAY_IS_ACTIVE KERN_INFO \ +"raid1: raid set md%d active with %d out of %d mirrors\n" + +#define THREAD_ERROR KERN_ERR \ +"raid1: couldn't allocate thread for md%d\n" + +#define START_RESYNC KERN_WARNING \ +"raid1: raid set md%d not clean; reconstructing mirrors\n" + +static int raid1_run (mddev_t *mddev) { - struct raid1_data *raid_conf; - int i, j, raid_disk; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; - struct real_dev *realdev; + raid1_conf_t *conf; + int i, j, disk_idx; + struct mirror_info *disk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *descriptor; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int start_recovery = 0; 
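__check_consistency() above breads the same 4 KiB row from every operational mirror and compares the contents; the new check_consistency() wrapper deliberately ignores a mismatch, since a freshly created array is allowed to differ until data is written. A user-space approximation of the same comparison over ordinary files or block devices (a hypothetical stand-alone tool, not part of the patch):

    #include <stdio.h>
    #include <string.h>

    #define BLK 4096

    /* read one 4 KiB block at 'offset' from 'path'; 0 on success */
    static int read_block(const char *path, long offset, unsigned char *buf)
    {
        FILE *f = fopen(path, "rb");

        if (!f)
            return -1;
        if (fseek(f, offset, SEEK_SET) || fread(buf, 1, BLK, f) != BLK) {
            fclose(f);
            return -1;
        }
        fclose(f);
        return 0;
    }

    int main(int argc, char **argv)
    {
        unsigned char ref[BLK], cur[BLK];
        int i;

        if (argc < 3) {
            fprintf(stderr, "usage: %s <mirror0> <mirror1> [...]\n", argv[0]);
            return 1;
        }
        if (read_block(argv[1], 0, ref))
            return 1;
        for (i = 2; i < argc; i++) {
            if (read_block(argv[i], 0, cur))
                return 1;
            if (memcmp(ref, cur, BLK)) {
                printf("mirror %s differs -- resync needed\n", argv[i]);
                return 2;
            }
        }
        printf("mirrors consistent at offset 0\n");
        return 0;
    }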
MOD_INC_USE_COUNT; if (sb->level != 1) { - printk("raid1: %s: raid level not set to mirroring (%d)\n", - kdevname(MKDEV(MD_MAJOR, minor)), sb->level); - MOD_DEC_USE_COUNT; - return -EIO; - } - /**** - * copy the now verified devices into our private RAID1 bookkeeping - * area. [whatever we allocate in raid1_run(), should be freed in - * raid1_stop()] + printk(INVALID_LEVEL, mdidx(mddev), sb->level); + goto out; + } + /* + * copy the already verified devices into our private RAID1 + * bookkeeping area. [whatever we allocate in raid1_run(), + * should be freed in raid1_stop()] */ - while (!( /* FIXME: now we are rather fault tolerant than nice */ - mddev->private = kmalloc (sizeof (struct raid1_data), GFP_KERNEL) - ) ) - { - printk ("raid1_run(): out of memory\n"); - current->policy |= SCHED_YIELD; - schedule(); - } - raid_conf = mddev->private; - memset(raid_conf, 0, sizeof(*raid_conf)); - - PRINTK(("raid1_run(%d) called.\n", minor)); - - for (i = 0; i < mddev->nb_dev; i++) { - realdev = &mddev->devices[i]; - if (!realdev->sb) { - printk(KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n", kdevname(realdev->dev)); + conf = raid1_kmalloc(sizeof(raid1_conf_t)); + mddev->private = conf; + if (!conf) { + printk(MEM_ERROR, mdidx(mddev)); + goto out; + } + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk(ERRORS, partition_name(rdev->dev)); + } else { + if (!rdev->sb) { + MD_BUG(); + continue; + } + } + if (rdev->desc_nr == -1) { + MD_BUG(); continue; } - - /* - * This is important -- we are using the descriptor on - * the disk only to get a pointer to the descriptor on - * the main superblock, which might be more recent. - */ - descriptor = &sb->disks[realdev->sb->descriptor.number]; - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) { - printk(KERN_ERR "raid1: disabled mirror %s (errors detected)\n", kdevname(realdev->dev)); + descriptor = &sb->disks[rdev->desc_nr]; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor)) { + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; continue; } - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) { - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) { - printk(KERN_ERR "raid1: disabled mirror %s (not in sync)\n", kdevname(realdev->dev)); + if (disk_active(descriptor)) { + if (!disk_sync(descriptor)) { + printk(NOT_IN_SYNC, + partition_name(rdev->dev)); continue; } - raid_disk = descriptor->raid_disk; - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) { - printk(KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n", kdevname(realdev->dev)); + if ((descriptor->number > MD_SB_DISKS) || + (disk_idx > sb->raid_disks)) { + + printk(INCONSISTENT, + partition_name(rdev->dev)); continue; } - if (raid_conf->mirrors[raid_disk].operational) { - printk(KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n", kdevname(realdev->dev), raid_disk); + if (disk->operational) { + printk(ALREADY_RUNNING, + partition_name(rdev->dev), + disk_idx); continue; } - printk(KERN_INFO "raid1: device %s operational as mirror %d\n", kdevname(realdev->dev), raid_disk); - raid_conf->mirrors[raid_disk].number = descriptor->number; - raid_conf->mirrors[raid_disk].raid_disk = raid_disk; - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev; - raid_conf->mirrors[raid_disk].operational = 1; - 
raid_conf->mirrors[raid_disk].sect_limit = 128; - raid_conf->working_disks++; + printk(OPERATIONAL, partition_name(rdev->dev), + disk_idx); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 1; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + conf->working_disks++; } else { /* * Must be a spare disk .. */ - printk(KERN_INFO "raid1: spare disk %s\n", kdevname(realdev->dev)); - raid_disk = descriptor->raid_disk; - raid_conf->mirrors[raid_disk].number = descriptor->number; - raid_conf->mirrors[raid_disk].raid_disk = raid_disk; - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev; - raid_conf->mirrors[raid_disk].sect_limit = 128; - - raid_conf->mirrors[raid_disk].operational = 0; - raid_conf->mirrors[raid_disk].write_only = 0; - raid_conf->mirrors[raid_disk].spare = 1; - } - } - if (!raid_conf->working_disks) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", kdevname(MKDEV(MD_MAJOR, minor))); - kfree(raid_conf); - mddev->private = NULL; - MOD_DEC_USE_COUNT; - return -EIO; - } - - raid_conf->raid_disks = sb->raid_disks; - raid_conf->mddev = mddev; - - for (j = 0; !raid_conf->mirrors[j].operational; j++); - raid_conf->last_used = j; - for (i = raid_conf->raid_disks - 1; i >= 0; i--) { - if (raid_conf->mirrors[i].operational) { - PRINTK(("raid_conf->mirrors[%d].next == %d\n", i, j)); - raid_conf->mirrors[i].next = j; + printk(SPARE, partition_name(rdev->dev)); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + } + } + if (!conf->working_disks) { + printk(NONE_OPERATIONAL, mdidx(mddev)); + goto out_free_conf; + } + + conf->raid_disks = sb->raid_disks; + conf->nr_disks = sb->nr_disks; + conf->mddev = mddev; + + for (i = 0; i < MD_SB_DISKS; i++) { + + descriptor = sb->disks+i; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) && + !disk->used_slot) { + + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + } + } + + /* + * find the first working one and use it as a starting point + * to read balancing. + */ + for (j = 0; !conf->mirrors[j].operational; j++) + /* nothing */; + conf->last_used = j; + + /* + * initialize the 'working disks' list. + */ + for (i = conf->raid_disks - 1; i >= 0; i--) { + if (conf->mirrors[i].operational) { + conf->mirrors[i].next = j; j = i; } } - if (check_consistency(mddev)) { - printk(KERN_ERR "raid1: detected mirror differences -- run ckraid\n"); - sb->state |= 1 << MD_SB_ERRORS; - kfree(raid_conf); - mddev->private = NULL; - MOD_DEC_USE_COUNT; - return -EIO; + if (conf->working_disks != sb->raid_disks) { + printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); + start_recovery = 1; } + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) { + /* + * we do sanity checks even if the device says + * it's clean ... 
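raid1_run() above records the first operational mirror in conf->last_used and then links every operational mirror into a circular 'next' ring that the read-balancing code walks. A small sketch of building and walking that ring (array size and the build_ring() name are illustrative; at least one mirror must be operational, as the driver guarantees before this point):

    #include <stdio.h>

    #define NMIRRORS 4

    struct mirror { int operational; int next; };

    /* link every operational mirror into a circular ring */
    static int build_ring(struct mirror *m, int n)
    {
        int i, j, first;

        for (first = 0; !m[first].operational; first++)
            ;                          /* first working mirror: last_used */
        for (j = first, i = n - 1; i >= 0; i--)
            if (m[i].operational) {
                m[i].next = j;
                j = i;
            }
        return first;
    }

    int main(void)
    {
        struct mirror m[NMIRRORS] = { {1,0}, {0,0}, {1,0}, {1,0} };
        int last_used = build_ring(m, NMIRRORS);
        int i, cur = last_used;

        /* walk the ring: reads rotate over mirrors 0 -> 2 -> 3 -> 0 ... */
        for (i = 0; i < 6; i++) {
            printf("read %d -> mirror %d\n", i, cur);
            cur = m[cur].next;
        }
        return 0;
    }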
+ */ + if (check_consistency(mddev)) { + printk(RUNNING_CKRAID); + sb->state &= ~(1 << MD_SB_CLEAN); + } + } + + { + const char * name = "raid1d"; + + conf->thread = md_register_thread(raid1d, conf, name); + if (!conf->thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + } + + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) { + const char * name = "raid1syncd"; + + conf->resync_thread = md_register_thread(raid1syncd, conf,name); + if (!conf->resync_thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + printk(START_RESYNC, mdidx(mddev)); + conf->resync_mirrors = 1; + md_wakeup_thread(conf->resync_thread); + } + /* * Regenerate the "device is in sync with the raid set" bit for * each device. */ - for (i = 0; i < sb->nr_disks ; i++) { - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE); + for (i = 0; i < MD_SB_DISKS; i++) { + mark_disk_nonsync(sb->disks+i); for (j = 0; j < sb->raid_disks; j++) { - if (!raid_conf->mirrors[j].operational) + if (!conf->mirrors[j].operational) continue; - if (sb->disks[i].number == raid_conf->mirrors[j].number) - sb->disks[i].state |= 1 << MD_SYNC_DEVICE; + if (sb->disks[i].number == conf->mirrors[j].number) + mark_disk_sync(sb->disks+i); } } - sb->active_disks = raid_conf->working_disks; + sb->active_disks = conf->working_disks; - printk("raid1: raid set %s active with %d out of %d mirrors\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks); - /* Ok, everything is just fine now */ - return (0); + if (start_recovery) + md_recover_arrays(); + + + printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks); + /* + * Ok, everything is just fine now + */ + return 0; + +out_free_conf: + kfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return -EIO; +} + +#undef INVALID_LEVEL +#undef NO_SB +#undef ERRORS +#undef NOT_IN_SYNC +#undef INCONSISTENT +#undef ALREADY_RUNNING +#undef OPERATIONAL +#undef SPARE +#undef NONE_OPERATIONAL +#undef RUNNING_CKRAID +#undef ARRAY_IS_ACTIVE + +static int raid1_stop_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_thread) { + if (conf->resync_mirrors) { + conf->resync_mirrors = 2; + md_interrupt_thread(conf->resync_thread); + printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid1_restart_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_mirrors) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + conf->resync_mirrors = 1; + md_wakeup_thread(conf->resync_thread); + return 1; + } + return 0; } -static int raid1_stop (int minor, struct md_dev *mddev) +static int raid1_stop (mddev_t *mddev) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); - kfree (raid_conf); + md_unregister_thread(conf->thread); + if (conf->resync_thread) + md_unregister_thread(conf->resync_thread); + kfree(conf); mddev->private = NULL; MOD_DEC_USE_COUNT; return 0; } -static struct md_personality raid1_personality= +static mdk_personality_t raid1_personality= { "raid1", raid1_map, @@ -855,15 +1211,13 @@ NULL, /* no ioctls */ 0, raid1_error, - raid1_hot_add_disk, - /* raid1_hot_remove_drive */ NULL, - raid1_mark_spare + raid1_diskop, + raid1_stop_resync, + raid1_restart_resync }; int raid1_init (void) { - if ((raid1_thread = md_register_thread(raid1d, NULL)) == NULL) - return -EBUSY; return register_md_personality 
(RAID1, &raid1_personality); } @@ -875,7 +1229,6 @@ void cleanup_module (void) { - md_unregister_thread (raid1_thread); unregister_md_personality (RAID1); } #endif diff -ruN linux.orig/drivers/block/raid5.c linux-2.2.16/drivers/block/raid5.c --- linux.orig/drivers/block/raid5.c Fri May 8 09:17:13 1998 +++ linux-2.2.16/drivers/block/raid5.c Fri Jun 9 11:37:45 2000 @@ -1,4 +1,4 @@ -/***************************************************************************** +/* * raid5.c : Multiple Devices driver for Linux * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman * @@ -14,16 +14,15 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + #include #include #include -#include -#include +#include #include #include -#include -static struct md_personality raid5_personality; +static mdk_personality_t raid5_personality; /* * Stripe cache @@ -33,7 +32,7 @@ #define HASH_PAGES_ORDER 0 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) #define HASH_MASK (NR_HASH - 1) -#define stripe_hash(raid_conf, sect, size) ((raid_conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK]) +#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK]) /* * The following can be used to debug the driver @@ -46,6 +45,8 @@ #define PRINTK(x) do { ; } while (0) #endif +static void print_raid5_conf (raid5_conf_t *conf); + static inline int stripe_locked(struct stripe_head *sh) { return test_bit(STRIPE_LOCKED, &sh->state); @@ -61,32 +62,32 @@ */ static inline void lock_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - if (!test_and_set_bit(STRIPE_LOCKED, &sh->state)) { + raid5_conf_t *conf = sh->raid_conf; + if (!md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) { PRINTK(("locking stripe %lu\n", sh->sector)); - raid_conf->nr_locked_stripes++; + conf->nr_locked_stripes++; } } static inline void unlock_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - if (test_and_clear_bit(STRIPE_LOCKED, &sh->state)) { + raid5_conf_t *conf = sh->raid_conf; + if (md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) { PRINTK(("unlocking stripe %lu\n", sh->sector)); - raid_conf->nr_locked_stripes--; + conf->nr_locked_stripes--; wake_up(&sh->wait); } } static inline void finish_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; unlock_stripe(sh); sh->cmd = STRIPE_NONE; sh->phase = PHASE_COMPLETE; - raid_conf->nr_pending_stripes--; - raid_conf->nr_cached_stripes++; - wake_up(&raid_conf->wait_for_stripe); + conf->nr_pending_stripes--; + conf->nr_cached_stripes++; + wake_up(&conf->wait_for_stripe); } void __wait_on_stripe(struct stripe_head *sh) @@ -114,7 +115,7 @@ __wait_on_stripe(sh); } -static inline void remove_hash(struct raid5_data *raid_conf, struct stripe_head *sh) +static inline void remove_hash(raid5_conf_t *conf, struct stripe_head *sh) { PRINTK(("remove_hash(), stripe %lu\n", sh->sector)); @@ -123,21 +124,22 @@ sh->hash_next->hash_pprev = sh->hash_pprev; *sh->hash_pprev = sh->hash_next; sh->hash_pprev = NULL; - raid_conf->nr_hashed_stripes--; + conf->nr_hashed_stripes--; } } -static inline void insert_hash(struct raid5_data *raid_conf, struct stripe_head *sh) +static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) { - struct stripe_head **shp = &stripe_hash(raid_conf, sh->sector, sh->size); + struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size); - PRINTK(("insert_hash(), stripe %lu, 
nr_hashed_stripes %d\n", sh->sector, raid_conf->nr_hashed_stripes)); + PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", + sh->sector, conf->nr_hashed_stripes)); if ((sh->hash_next = *shp) != NULL) (*shp)->hash_pprev = &sh->hash_next; *shp = sh; sh->hash_pprev = shp; - raid_conf->nr_hashed_stripes++; + conf->nr_hashed_stripes++; } static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size) @@ -145,13 +147,15 @@ struct buffer_head *bh; unsigned long flags; - save_flags(flags); - cli(); - if ((bh = sh->buffer_pool) == NULL) - return NULL; + md_spin_lock_irqsave(&sh->stripe_lock, flags); + bh = sh->buffer_pool; + if (!bh) + goto out_unlock; sh->buffer_pool = bh->b_next; bh->b_size = b_size; - restore_flags(flags); +out_unlock: + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + return bh; } @@ -160,12 +164,14 @@ struct buffer_head *bh; unsigned long flags; - save_flags(flags); - cli(); - if ((bh = sh->bh_pool) == NULL) - return NULL; + md_spin_lock_irqsave(&sh->stripe_lock, flags); + bh = sh->bh_pool; + if (!bh) + goto out_unlock; sh->bh_pool = bh->b_next; - restore_flags(flags); +out_unlock: + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + return bh; } @@ -173,54 +179,52 @@ { unsigned long flags; - save_flags(flags); - cli(); + md_spin_lock_irqsave(&sh->stripe_lock, flags); bh->b_next = sh->buffer_pool; sh->buffer_pool = bh; - restore_flags(flags); + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); } static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh) { unsigned long flags; - save_flags(flags); - cli(); + md_spin_lock_irqsave(&sh->stripe_lock, flags); bh->b_next = sh->bh_pool; sh->bh_pool = bh; - restore_flags(flags); + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); } -static struct stripe_head *get_free_stripe(struct raid5_data *raid_conf) +static struct stripe_head *get_free_stripe(raid5_conf_t *conf) { struct stripe_head *sh; unsigned long flags; save_flags(flags); cli(); - if ((sh = raid_conf->free_sh_list) == NULL) { + if ((sh = conf->free_sh_list) == NULL) { restore_flags(flags); return NULL; } - raid_conf->free_sh_list = sh->free_next; - raid_conf->nr_free_sh--; - if (!raid_conf->nr_free_sh && raid_conf->free_sh_list) + conf->free_sh_list = sh->free_next; + conf->nr_free_sh--; + if (!conf->nr_free_sh && conf->free_sh_list) printk ("raid5: bug: free_sh_list != NULL, nr_free_sh == 0\n"); restore_flags(flags); - if (sh->hash_pprev || sh->nr_pending || sh->count) + if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || sh->count) printk("get_free_stripe(): bug\n"); return sh; } -static void put_free_stripe(struct raid5_data *raid_conf, struct stripe_head *sh) +static void put_free_stripe(raid5_conf_t *conf, struct stripe_head *sh) { unsigned long flags; save_flags(flags); cli(); - sh->free_next = raid_conf->free_sh_list; - raid_conf->free_sh_list = sh; - raid_conf->nr_free_sh++; + sh->free_next = conf->free_sh_list; + conf->free_sh_list = sh; + conf->nr_free_sh++; restore_flags(flags); } @@ -324,8 +328,8 @@ static void kfree_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - int disks = raid_conf->raid_disks, j; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, j; PRINTK(("kfree_stripe called, stripe %lu\n", sh->sector)); if (sh->phase != PHASE_COMPLETE || stripe_locked(sh) || sh->count) { @@ -338,19 +342,19 @@ if (sh->bh_new[j] || sh->bh_copy[j]) printk("raid5: bug: sector %lu, new %p, copy %p\n", sh->sector, sh->bh_new[j], sh->bh_copy[j]); } - remove_hash(raid_conf, 
sh); - put_free_stripe(raid_conf, sh); + remove_hash(conf, sh); + put_free_stripe(conf, sh); } -static int shrink_stripe_cache(struct raid5_data *raid_conf, int nr) +static int shrink_stripe_cache(raid5_conf_t *conf, int nr) { struct stripe_head *sh; int i, count = 0; - PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, raid_conf->nr_hashed_stripes, raid_conf->clock)); + PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, conf->nr_hashed_stripes, conf->clock)); for (i = 0; i < NR_HASH; i++) { repeat: - sh = raid_conf->stripe_hashtbl[(i + raid_conf->clock) & HASH_MASK]; + sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK]; for (; sh; sh = sh->hash_next) { if (sh->phase != PHASE_COMPLETE) continue; @@ -360,30 +364,30 @@ continue; kfree_stripe(sh); if (++count == nr) { - PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes)); - raid_conf->clock = (i + raid_conf->clock) & HASH_MASK; + PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes)); + conf->clock = (i + conf->clock) & HASH_MASK; return nr; } goto repeat; } } - PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes)); + PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes)); return count; } -static struct stripe_head *find_stripe(struct raid5_data *raid_conf, unsigned long sector, int size) +static struct stripe_head *find_stripe(raid5_conf_t *conf, unsigned long sector, int size) { struct stripe_head *sh; - if (raid_conf->buffer_size != size) { - PRINTK(("switching size, %d --> %d\n", raid_conf->buffer_size, size)); - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes); - raid_conf->buffer_size = size; + if (conf->buffer_size != size) { + PRINTK(("switching size, %d --> %d\n", conf->buffer_size, size)); + shrink_stripe_cache(conf, conf->max_nr_stripes); + conf->buffer_size = size; } PRINTK(("find_stripe, sector %lu\n", sector)); - for (sh = stripe_hash(raid_conf, sector, size); sh; sh = sh->hash_next) - if (sh->sector == sector && sh->raid_conf == raid_conf) { + for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next) + if (sh->sector == sector && sh->raid_conf == conf) { if (sh->size == size) { PRINTK(("found stripe %lu\n", sector)); return sh; @@ -397,7 +401,7 @@ return NULL; } -static int grow_stripes(struct raid5_data *raid_conf, int num, int priority) +static int grow_stripes(raid5_conf_t *conf, int num, int priority) { struct stripe_head *sh; @@ -405,62 +409,64 @@ if ((sh = kmalloc(sizeof(struct stripe_head), priority)) == NULL) return 1; memset(sh, 0, sizeof(*sh)); - if (grow_buffers(sh, 2 * raid_conf->raid_disks, PAGE_SIZE, priority)) { - shrink_buffers(sh, 2 * raid_conf->raid_disks); + sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED; + + if (grow_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) { + shrink_buffers(sh, 2 * conf->raid_disks); kfree(sh); return 1; } - if (grow_bh(sh, raid_conf->raid_disks, priority)) { - shrink_buffers(sh, 2 * raid_conf->raid_disks); - shrink_bh(sh, raid_conf->raid_disks); + if (grow_bh(sh, conf->raid_disks, priority)) { + shrink_buffers(sh, 2 * conf->raid_disks); + shrink_bh(sh, conf->raid_disks); kfree(sh); return 1; } - put_free_stripe(raid_conf, sh); - raid_conf->nr_stripes++; + put_free_stripe(conf, sh); + conf->nr_stripes++; } return 0; } -static void shrink_stripes(struct raid5_data *raid_conf, int num) +static void shrink_stripes(raid5_conf_t *conf, int num) { struct stripe_head *sh; while (num--) { - sh = get_free_stripe(raid_conf); + sh = 
get_free_stripe(conf); if (!sh) break; - shrink_buffers(sh, raid_conf->raid_disks * 2); - shrink_bh(sh, raid_conf->raid_disks); + shrink_buffers(sh, conf->raid_disks * 2); + shrink_bh(sh, conf->raid_disks); kfree(sh); - raid_conf->nr_stripes--; + conf->nr_stripes--; } } -static struct stripe_head *kmalloc_stripe(struct raid5_data *raid_conf, unsigned long sector, int size) +static struct stripe_head *kmalloc_stripe(raid5_conf_t *conf, unsigned long sector, int size) { struct stripe_head *sh = NULL, *tmp; struct buffer_head *buffer_pool, *bh_pool; PRINTK(("kmalloc_stripe called\n")); - while ((sh = get_free_stripe(raid_conf)) == NULL) { - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes / 8); - if ((sh = get_free_stripe(raid_conf)) != NULL) + while ((sh = get_free_stripe(conf)) == NULL) { + shrink_stripe_cache(conf, conf->max_nr_stripes / 8); + if ((sh = get_free_stripe(conf)) != NULL) break; - if (!raid_conf->nr_pending_stripes) + if (!conf->nr_pending_stripes) printk("raid5: bug: nr_free_sh == 0, nr_pending_stripes == 0\n"); - md_wakeup_thread(raid_conf->thread); + md_wakeup_thread(conf->thread); PRINTK(("waiting for some stripes to complete\n")); - sleep_on(&raid_conf->wait_for_stripe); + sleep_on(&conf->wait_for_stripe); } /* * The above might have slept, so perhaps another process * already created the stripe for us.. */ - if ((tmp = find_stripe(raid_conf, sector, size)) != NULL) { - put_free_stripe(raid_conf, sh); + if ((tmp = find_stripe(conf, sector, size)) != NULL) { + put_free_stripe(conf, sh); wait_on_stripe(tmp); return tmp; } @@ -472,25 +478,25 @@ sh->bh_pool = bh_pool; sh->phase = PHASE_COMPLETE; sh->cmd = STRIPE_NONE; - sh->raid_conf = raid_conf; + sh->raid_conf = conf; sh->sector = sector; sh->size = size; - raid_conf->nr_cached_stripes++; - insert_hash(raid_conf, sh); + conf->nr_cached_stripes++; + insert_hash(conf, sh); } else printk("raid5: bug: kmalloc_stripe() == NULL\n"); return sh; } -static struct stripe_head *get_stripe(struct raid5_data *raid_conf, unsigned long sector, int size) +static struct stripe_head *get_stripe(raid5_conf_t *conf, unsigned long sector, int size) { struct stripe_head *sh; PRINTK(("get_stripe, sector %lu\n", sector)); - sh = find_stripe(raid_conf, sector, size); + sh = find_stripe(conf, sector, size); if (sh) wait_on_stripe(sh); else - sh = kmalloc_stripe(raid_conf, sector, size); + sh = kmalloc_stripe(conf, sector, size); return sh; } @@ -523,7 +529,7 @@ bh->b_end_io(bh, uptodate); if (!uptodate) printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for " - "block %lu\n", kdevname(bh->b_dev), bh->b_blocknr); + "block %lu\n", partition_name(bh->b_dev), bh->b_blocknr); } static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate) @@ -537,36 +543,35 @@ static void raid5_end_request (struct buffer_head * bh, int uptodate) { struct stripe_head *sh = bh->b_dev_id; - struct raid5_data *raid_conf = sh->raid_conf; - int disks = raid_conf->raid_disks, i; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; unsigned long flags; PRINTK(("end_request %lu, nr_pending %d\n", sh->sector, sh->nr_pending)); - save_flags(flags); - cli(); + md_spin_lock_irqsave(&sh->stripe_lock, flags); raid5_mark_buffer_uptodate(bh, uptodate); - --sh->nr_pending; - if (!sh->nr_pending) { - md_wakeup_thread(raid_conf->thread); - atomic_inc(&raid_conf->nr_handle); + if (atomic_dec_and_test(&sh->nr_pending)) { + md_wakeup_thread(conf->thread); + atomic_inc(&conf->nr_handle); } - if (!uptodate) + if (!uptodate) { 
md_error(bh->b_dev, bh->b_rdev); - if (raid_conf->failed_disks) { + } + if (conf->failed_disks) { for (i = 0; i < disks; i++) { - if (raid_conf->disks[i].operational) + if (conf->disks[i].operational) continue; if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i]) continue; - if (bh->b_rdev != raid_conf->disks[i].dev) + if (bh->b_rdev != conf->disks[i].dev) continue; set_bit(STRIPE_ERROR, &sh->state); } } - restore_flags(flags); + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); } -static int raid5_map (struct md_dev *mddev, kdev_t *rdev, +static int raid5_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, unsigned long *rsector, unsigned long size) { /* No complex mapping used: the core of the work is done in the @@ -577,11 +582,10 @@ static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i) { - struct raid5_data *raid_conf = sh->raid_conf; - struct md_dev *mddev = raid_conf->mddev; - int minor = (int) (mddev - md_dev); + raid5_conf_t *conf = sh->raid_conf; + mddev_t *mddev = conf->mddev; char *b_data; - kdev_t dev = MKDEV(MD_MAJOR, minor); + kdev_t dev = mddev_to_kdev(mddev); int block = sh->sector / (sh->size >> 9); b_data = ((volatile struct buffer_head *) bh)->b_data; @@ -589,7 +593,7 @@ init_buffer(bh, dev, block, raid5_end_request, sh); ((volatile struct buffer_head *) bh)->b_data = b_data; - bh->b_rdev = raid_conf->disks[i].dev; + bh->b_rdev = conf->disks[i].dev; bh->b_rsector = sh->sector; bh->b_state = (1 << BH_Req); @@ -597,33 +601,62 @@ bh->b_list = BUF_LOCKED; } -static int raid5_error (struct md_dev *mddev, kdev_t dev) +static int raid5_error (mddev_t *mddev, kdev_t dev) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; - md_superblock_t *sb = mddev->sb; + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + mdp_super_t *sb = mddev->sb; struct disk_info *disk; int i; PRINTK(("raid5_error called\n")); - raid_conf->resync_parity = 0; - for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++) + conf->resync_parity = 0; + for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) { if (disk->dev == dev && disk->operational) { disk->operational = 0; - sb->disks[disk->number].state |= (1 << MD_FAULTY_DEVICE); - sb->disks[disk->number].state &= ~(1 << MD_SYNC_DEVICE); - sb->disks[disk->number].state &= ~(1 << MD_ACTIVE_DEVICE); + mark_disk_faulty(sb->disks+disk->number); + mark_disk_nonsync(sb->disks+disk->number); + mark_disk_inactive(sb->disks+disk->number); sb->active_disks--; sb->working_disks--; sb->failed_disks++; mddev->sb_dirty = 1; - raid_conf->working_disks--; - raid_conf->failed_disks++; - md_wakeup_thread(raid_conf->thread); + conf->working_disks--; + conf->failed_disks++; + md_wakeup_thread(conf->thread); printk (KERN_ALERT - "RAID5: Disk failure on %s, disabling device." - "Operation continuing on %d devices\n", - kdevname (dev), raid_conf->working_disks); + "raid5: Disk failure on %s, disabling device." 
+ " Operation continuing on %d devices\n", + partition_name (dev), conf->working_disks); + return -EIO; } + } + /* + * handle errors in spares (during reconstruction) + */ + if (conf->spare) { + disk = conf->spare; + if (disk->dev == dev) { + printk (KERN_ALERT + "raid5: Disk failure on spare %s\n", + partition_name (dev)); + if (!conf->spare->operational) { + MD_BUG(); + return -EIO; + } + disk->operational = 0; + disk->write_only = 0; + conf->spare = NULL; + mark_disk_faulty(sb->disks+disk->number); + mark_disk_nonsync(sb->disks+disk->number); + mark_disk_inactive(sb->disks+disk->number); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + + return -EIO; + } + } + MD_BUG(); return 0; } @@ -634,12 +667,12 @@ static inline unsigned long raid5_compute_sector (int r_sector, unsigned int raid_disks, unsigned int data_disks, unsigned int * dd_idx, unsigned int * pd_idx, - struct raid5_data *raid_conf) + raid5_conf_t *conf) { unsigned int stripe; int chunk_number, chunk_offset; unsigned long new_sector; - int sectors_per_chunk = raid_conf->chunk_size >> 9; + int sectors_per_chunk = conf->chunk_size >> 9; /* First compute the information on this sector */ @@ -662,9 +695,9 @@ /* * Select the parity disk based on the user selected algorithm. */ - if (raid_conf->level == 4) + if (conf->level == 4) *pd_idx = data_disks; - else switch (raid_conf->algorithm) { + else switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: *pd_idx = data_disks - stripe % raid_disks; if (*dd_idx >= *pd_idx) @@ -684,7 +717,7 @@ *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; break; default: - printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm); + printk ("raid5: unsupported algorithm %d\n", conf->algorithm); } /* @@ -705,16 +738,16 @@ static unsigned long compute_blocknr(struct stripe_head *sh, int i) { - struct raid5_data *raid_conf = sh->raid_conf; - int raid_disks = raid_conf->raid_disks, data_disks = raid_disks - 1; + raid5_conf_t *conf = sh->raid_conf; + int raid_disks = conf->raid_disks, data_disks = raid_disks - 1; unsigned long new_sector = sh->sector, check; - int sectors_per_chunk = raid_conf->chunk_size >> 9; + int sectors_per_chunk = conf->chunk_size >> 9; unsigned long stripe = new_sector / sectors_per_chunk; int chunk_offset = new_sector % sectors_per_chunk; int chunk_number, dummy1, dummy2, dd_idx = i; unsigned long r_sector, blocknr; - switch (raid_conf->algorithm) { + switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: if (i > sh->pd_idx) @@ -727,14 +760,14 @@ i -= (sh->pd_idx + 1); break; default: - printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm); + printk ("raid5: unsupported algorithm %d\n", conf->algorithm); } chunk_number = stripe * data_disks + i; r_sector = chunk_number * sectors_per_chunk + chunk_offset; blocknr = r_sector / (sh->size >> 9); - check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, raid_conf); + check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { printk("compute_blocknr: map not correct\n"); return 0; @@ -742,36 +775,11 @@ return blocknr; } -#ifdef HAVE_ARCH_XORBLOCK -static void xor_block(struct buffer_head *dest, struct buffer_head *source) -{ - __xor_block((char *) dest->b_data, (char *) source->b_data, dest->b_size); -} -#else -static void xor_block(struct buffer_head *dest, struct buffer_head *source) -{ - long lines = dest->b_size / (sizeof (long)) / 
8, i; - long *destp = (long *) dest->b_data, *sourcep = (long *) source->b_data; - - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(sourcep + 0); - *(destp + 1) ^= *(sourcep + 1); - *(destp + 2) ^= *(sourcep + 2); - *(destp + 3) ^= *(sourcep + 3); - *(destp + 4) ^= *(sourcep + 4); - *(destp + 5) ^= *(sourcep + 5); - *(destp + 6) ^= *(sourcep + 6); - *(destp + 7) ^= *(sourcep + 7); - destp += 8; - sourcep += 8; - } -} -#endif - static void compute_block(struct stripe_head *sh, int dd_idx) { - struct raid5_data *raid_conf = sh->raid_conf; - int i, disks = raid_conf->raid_disks; + raid5_conf_t *conf = sh->raid_conf; + int i, count, disks = conf->raid_disks; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; PRINTK(("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx)); @@ -780,69 +788,100 @@ raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx); memset(sh->bh_old[dd_idx]->b_data, 0, sh->size); + bh_ptr[0] = sh->bh_old[dd_idx]; + count = 1; for (i = 0; i < disks; i++) { if (i == dd_idx) continue; if (sh->bh_old[i]) { - xor_block(sh->bh_old[dd_idx], sh->bh_old[i]); - continue; - } else + bh_ptr[count++] = sh->bh_old[i]; + } else { printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i); + } + if (count == MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; + } + } + if(count != 1) { + xor_block(count, &bh_ptr[0]); } raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1); } static void compute_parity(struct stripe_head *sh, int method) { - struct raid5_data *raid_conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, disks = raid_conf->raid_disks; + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, lowprio, count; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; PRINTK(("compute_parity, stripe %lu, method %d\n", sh->sector, method)); + lowprio = 1; for (i = 0; i < disks; i++) { if (i == pd_idx || !sh->bh_new[i]) continue; if (!sh->bh_copy[i]) sh->bh_copy[i] = raid5_kmalloc_buffer(sh, sh->size); raid5_build_block(sh, sh->bh_copy[i], i); + if (!buffer_lowprio(sh->bh_new[i])) + lowprio = 0; + else + mark_buffer_lowprio(sh->bh_copy[i]); mark_buffer_clean(sh->bh_new[i]); memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size); } if (sh->bh_copy[pd_idx] == NULL) sh->bh_copy[pd_idx] = raid5_kmalloc_buffer(sh, sh->size); raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx); + if (lowprio) + mark_buffer_lowprio(sh->bh_copy[pd_idx]); if (method == RECONSTRUCT_WRITE) { memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size); + bh_ptr[0] = sh->bh_copy[pd_idx]; + count = 1; for (i = 0; i < disks; i++) { if (i == sh->pd_idx) continue; if (sh->bh_new[i]) { - xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]); - continue; + bh_ptr[count++] = sh->bh_copy[i]; + } else if (sh->bh_old[i]) { + bh_ptr[count++] = sh->bh_old[i]; } - if (sh->bh_old[i]) { - xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]); - continue; + if (count == MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; } } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } } else if (method == READ_MODIFY_WRITE) { memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size); + bh_ptr[0] = sh->bh_copy[pd_idx]; + count = 1; for (i = 0; i < disks; i++) { if (i == sh->pd_idx) continue; if (sh->bh_new[i] && sh->bh_old[i]) { - xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]); - xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]); - continue; + bh_ptr[count++] = sh->bh_copy[i]; + bh_ptr[count++] = sh->bh_old[i]; + } + if (count >= (MAX_XOR_BLOCKS - 1)) { + xor_block(count, 
&bh_ptr[0]); + count = 1; } } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } } raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1); } static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw) { - struct raid5_data *raid_conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; struct buffer_head *bh_req; if (sh->bh_new[dd_idx]) { @@ -860,19 +899,22 @@ if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) { sh->phase = PHASE_BEGIN; sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE; - raid_conf->nr_pending_stripes++; - atomic_inc(&raid_conf->nr_handle); + conf->nr_pending_stripes++; + atomic_inc(&conf->nr_handle); } sh->bh_new[dd_idx] = bh; sh->bh_req[dd_idx] = bh_req; sh->cmd_new[dd_idx] = rw; sh->new[dd_idx] = 1; + + if (buffer_lowprio(bh)) + mark_buffer_lowprio(bh_req); } static void complete_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - int disks = raid_conf->raid_disks; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks; int i, new = 0; PRINTK(("complete_stripe %lu\n", sh->sector)); @@ -909,6 +951,22 @@ } } + +static int is_stripe_lowprio(struct stripe_head *sh, int disks) +{ + int i, lowprio = 1; + + for (i = 0; i < disks; i++) { + if (sh->bh_new[i]) + if (!buffer_lowprio(sh->bh_new[i])) + lowprio = 0; + if (sh->bh_old[i]) + if (!buffer_lowprio(sh->bh_old[i])) + lowprio = 0; + } + return lowprio; +} + /* * handle_stripe() is our main logic routine. Note that: * @@ -919,28 +977,27 @@ * 2. We should be careful to set sh->nr_pending whenever we sleep, * to prevent re-entry of handle_stripe() for the same sh. * - * 3. raid_conf->failed_disks and disk->operational can be changed + * 3. conf->failed_disks and disk->operational can be changed * from an interrupt. This complicates things a bit, but it allows * us to stop issuing requests for a failed drive as soon as possible. 
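compute_block() and compute_parity() above no longer XOR one source buffer at a time: they collect up to MAX_XOR_BLOCKS pointers with the destination in slot 0 and flush the batch through xor_block(count, bh_ptr) whenever the array fills, then once more at the end. The same accumulation pattern in plain C over byte arrays (BSIZE and the data values are arbitrary):

    #include <stdio.h>
    #include <string.h>

    #define MAX_XOR_BLOCKS 4
    #define BSIZE 16

    /* XOR sources bh[1..count-1] into destination bh[0] */
    static void xor_block(int count, unsigned char **bh)
    {
        int i, j;

        for (i = 1; i < count; i++)
            for (j = 0; j < BSIZE; j++)
                bh[0][j] ^= bh[i][j];
    }

    int main(void)
    {
        unsigned char parity[BSIZE], data[6][BSIZE];
        unsigned char *ptr[MAX_XOR_BLOCKS];
        int i, count;

        memset(parity, 0, BSIZE);
        for (i = 0; i < 6; i++)
            memset(data[i], i + 1, BSIZE);

        /* accumulate sources; flush whenever the pointer array is full,
         * keeping the running result in slot 0 between batches */
        ptr[0] = parity;
        count = 1;
        for (i = 0; i < 6; i++) {
            ptr[count++] = data[i];
            if (count == MAX_XOR_BLOCKS) {
                xor_block(count, ptr);
                count = 1;
            }
        }
        if (count > 1)
            xor_block(count, ptr);

        printf("parity[0] = 0x%02x\n", parity[0]);   /* 1^2^3^4^5^6 = 0x07 */
        return 0;
    }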
*/ static void handle_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - struct md_dev *mddev = raid_conf->mddev; - int minor = (int) (mddev - md_dev); + raid5_conf_t *conf = sh->raid_conf; + mddev_t *mddev = conf->mddev; struct buffer_head *bh; - int disks = raid_conf->raid_disks; - int i, nr = 0, nr_read = 0, nr_write = 0; + int disks = conf->raid_disks; + int i, nr = 0, nr_read = 0, nr_write = 0, lowprio; int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0, parity = 0; int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0; int reading = 0, nr_writing = 0; int method1 = INT_MAX, method2 = INT_MAX; int block; unsigned long flags; - int operational[MD_SB_DISKS], failed_disks = raid_conf->failed_disks; + int operational[MD_SB_DISKS], failed_disks = conf->failed_disks; PRINTK(("handle_stripe(), stripe %lu\n", sh->sector)); - if (sh->nr_pending) { + if (md_atomic_read(&sh->nr_pending)) { printk("handle_stripe(), stripe %lu, io still pending\n", sh->sector); return; } @@ -949,9 +1006,9 @@ return; } - atomic_dec(&raid_conf->nr_handle); + atomic_dec(&conf->nr_handle); - if (test_and_clear_bit(STRIPE_ERROR, &sh->state)) { + if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) { printk("raid5: restarting stripe %lu\n", sh->sector); sh->phase = PHASE_BEGIN; } @@ -969,11 +1026,11 @@ save_flags(flags); cli(); for (i = 0; i < disks; i++) { - operational[i] = raid_conf->disks[i].operational; - if (i == sh->pd_idx && raid_conf->resync_parity) + operational[i] = conf->disks[i].operational; + if (i == sh->pd_idx && conf->resync_parity) operational[i] = 0; } - failed_disks = raid_conf->failed_disks; + failed_disks = conf->failed_disks; restore_flags(flags); if (failed_disks > 1) { @@ -1017,7 +1074,7 @@ } if (nr_write && nr_read) - printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd); + printk("raid5: bug, nr_write ==`%d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd); if (nr_write) { /* @@ -1030,7 +1087,7 @@ if (sh->bh_new[i]) continue; block = (int) compute_blocknr(sh, i); - bh = find_buffer(MKDEV(MD_MAJOR, minor), block, sh->size); + bh = find_buffer(mddev_to_kdev(mddev), block, sh->size); if (bh && bh->b_count == 0 && buffer_dirty(bh) && !buffer_locked(bh)) { PRINTK(("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block)); add_stripe_bh(sh, bh, i, WRITE); @@ -1064,21 +1121,22 @@ if (!method1 || !method2) { lock_stripe(sh); - sh->nr_pending++; + lowprio = is_stripe_lowprio(sh, disks); + atomic_inc(&sh->nr_pending); sh->phase = PHASE_WRITE; compute_parity(sh, method1 <= method2 ? 
RECONSTRUCT_WRITE : READ_MODIFY_WRITE); for (i = 0; i < disks; i++) { - if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity) + if (!operational[i] && !conf->spare && !conf->resync_parity) continue; if (i == sh->pd_idx || sh->bh_new[i]) nr_writing++; } - sh->nr_pending = nr_writing; - PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, sh->nr_pending)); + md_atomic_set(&sh->nr_pending, nr_writing); + PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, md_atomic_read(&sh->nr_pending))); for (i = 0; i < disks; i++) { - if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity) + if (!operational[i] && !conf->spare && !conf->resync_parity) continue; bh = sh->bh_copy[i]; if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL))) @@ -1089,18 +1147,30 @@ bh->b_state |= (1<b_state); - if (!operational[i] && !raid_conf->resync_parity) { - bh->b_rdev = raid_conf->spare->dev; - make_request(MAJOR(raid_conf->spare->dev), WRITE, bh); - } else - make_request(MAJOR(raid_conf->disks[i].dev), WRITE, bh); + if (!operational[i] && !conf->resync_parity) { + bh->b_rdev = conf->spare->dev; + make_request(MAJOR(conf->spare->dev), WRITE, bh); + } else { +#if 0 + make_request(MAJOR(conf->disks[i].dev), WRITE, bh); +#else + if (!lowprio || (i==sh->pd_idx)) + make_request(MAJOR(conf->disks[i].dev), WRITE, bh); + else { + mark_buffer_clean(bh); + raid5_end_request(bh,1); + sh->new[i] = 0; + } +#endif + } } } return; } lock_stripe(sh); - sh->nr_pending++; + lowprio = is_stripe_lowprio(sh, disks); + atomic_inc(&sh->nr_pending); if (method1 < method2) { sh->write_method = RECONSTRUCT_WRITE; for (i = 0; i < disks; i++) { @@ -1110,6 +1180,8 @@ continue; sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size); raid5_build_block(sh, sh->bh_old[i], i); + if (lowprio) + mark_buffer_lowprio(sh->bh_old[i]); reading++; } } else { @@ -1121,19 +1193,21 @@ continue; sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size); raid5_build_block(sh, sh->bh_old[i], i); + if (lowprio) + mark_buffer_lowprio(sh->bh_old[i]); reading++; } } sh->phase = PHASE_READ_OLD; - sh->nr_pending = reading; - PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, sh->nr_pending)); + md_atomic_set(&sh->nr_pending, reading); + PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending))); for (i = 0; i < disks; i++) { if (!sh->bh_old[i]) continue; if (buffer_uptodate(sh->bh_old[i])) continue; clear_bit(BH_Lock, &sh->bh_old[i]->b_state); - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]); + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]); } } else { /* @@ -1141,7 +1215,8 @@ */ method1 = nr_read - nr_cache_overwrite; lock_stripe(sh); - sh->nr_pending++; + lowprio = is_stripe_lowprio(sh,disks); + atomic_inc(&sh->nr_pending); PRINTK(("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1)); if (!method1 || (method1 == 1 && nr_cache == disks - 1)) { @@ -1149,18 +1224,22 @@ for (i = 0; i < disks; i++) { if (!sh->bh_new[i]) continue; - if (!sh->bh_old[i]) + if (!sh->bh_old[i]) { compute_block(sh, i); + if (lowprio) + mark_buffer_lowprio + (sh->bh_old[i]); + } memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); } - sh->nr_pending--; + atomic_dec(&sh->nr_pending); complete_stripe(sh); return; } if (nr_failed_overwrite) { sh->phase = PHASE_READ_OLD; - sh->nr_pending = (disks - 1) - nr_cache; - PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, 
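The write path in handle_stripe() chooses between the two parity-update strategies by comparing method1 and method2 and passing the cheaper one to compute_parity(), with ties going to RECONSTRUCT_WRITE. A rough model of that cost comparison, assuming method1 tracks the reconstruct-write cost as the surrounding code suggests; the counting below is a simplification of the bookkeeping done earlier in handle_stripe(), not a copy of it:

    #include <stdio.h>

    int main(void)
    {
        int disks = 5, pd_idx = 4;
        int have_new[5] = { 1, 1, 0, 0, 0 };   /* blocks this write supplies */
        int have_old[5] = { 0, 0, 0, 0, 0 };   /* old copies already cached  */
        int i, rcw = 0, rmw = 0;

        for (i = 0; i < disks; i++) {
            if (i == pd_idx)
                continue;
            if (!have_new[i] && !have_old[i])
                rcw++;          /* reconstruct-write must read it  */
            if (have_new[i] && !have_old[i])
                rmw++;          /* read-modify-write must read it  */
        }
        if (!have_old[pd_idx])
            rmw++;              /* ... plus the old parity block   */

        printf("rcw needs %d reads, rmw needs %d -> use %s\n",
               rcw, rmw, rcw <= rmw ? "reconstruct-write" : "read-modify-write");
        return 0;
    }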
sh->nr_pending)); + md_atomic_set(&sh->nr_pending, (disks - 1) - nr_cache); + PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending))); for (i = 0; i < disks; i++) { if (sh->bh_old[i]) continue; @@ -1168,13 +1247,16 @@ continue; sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size); raid5_build_block(sh, sh->bh_old[i], i); + if (lowprio) + mark_buffer_lowprio(sh->bh_old[i]); clear_bit(BH_Lock, &sh->bh_old[i]->b_state); - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]); + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]); } } else { sh->phase = PHASE_READ; - sh->nr_pending = nr_read - nr_cache_overwrite; - PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, sh->nr_pending)); + md_atomic_set(&sh->nr_pending, + nr_read - nr_cache_overwrite); + PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending))); for (i = 0; i < disks; i++) { if (!sh->bh_new[i]) continue; @@ -1182,16 +1264,16 @@ memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); continue; } - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_req[i]); + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_req[i]); } } } } -static int raid5_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh) +static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; - const unsigned int raid_disks = raid_conf->raid_disks; + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + const unsigned int raid_disks = conf->raid_disks; const unsigned int data_disks = raid_disks - 1; unsigned int dd_idx, pd_idx; unsigned long new_sector; @@ -1202,15 +1284,15 @@ if (rw == WRITEA) rw = WRITE; new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks, - &dd_idx, &pd_idx, raid_conf); + &dd_idx, &pd_idx, conf); PRINTK(("raid5_make_request, sector %lu\n", new_sector)); repeat: - sh = get_stripe(raid_conf, new_sector, bh->b_size); + sh = get_stripe(conf, new_sector, bh->b_size); if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) { PRINTK(("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd)); lock_stripe(sh); - if (!sh->nr_pending) + if (!md_atomic_read(&sh->nr_pending)) handle_stripe(sh); goto repeat; } @@ -1221,24 +1303,24 @@ printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector); printk("raid5: bh %p, bh_new %p\n", bh, sh->bh_new[dd_idx]); lock_stripe(sh); - md_wakeup_thread(raid_conf->thread); + md_wakeup_thread(conf->thread); wait_on_stripe(sh); goto repeat; } add_stripe_bh(sh, bh, dd_idx, rw); - md_wakeup_thread(raid_conf->thread); + md_wakeup_thread(conf->thread); return 0; } static void unplug_devices(struct stripe_head *sh) { #if 0 - struct raid5_data *raid_conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; int i; - for (i = 0; i < raid_conf->raid_disks; i++) - unplug_device(blk_dev + MAJOR(raid_conf->disks[i].dev)); + for (i = 0; i < conf->raid_disks; i++) + unplug_device(blk_dev + MAJOR(conf->disks[i].dev)); #endif } @@ -1252,8 +1334,8 @@ static void raid5d (void *data) { struct stripe_head *sh; - struct raid5_data *raid_conf = data; - struct md_dev *mddev = raid_conf->mddev; + raid5_conf_t *conf = data; + mddev_t *mddev = conf->mddev; int i, handled = 0, unplug = 0; unsigned long flags; @@ -1261,47 +1343,47 @@ if (mddev->sb_dirty) { mddev->sb_dirty = 0; - md_update_sb((int) (mddev - md_dev)); + 
md_update_sb(mddev); } for (i = 0; i < NR_HASH; i++) { repeat: - sh = raid_conf->stripe_hashtbl[i]; + sh = conf->stripe_hashtbl[i]; for (; sh; sh = sh->hash_next) { - if (sh->raid_conf != raid_conf) + if (sh->raid_conf != conf) continue; if (sh->phase == PHASE_COMPLETE) continue; - if (sh->nr_pending) + if (md_atomic_read(&sh->nr_pending)) continue; - if (sh->sector == raid_conf->next_sector) { - raid_conf->sector_count += (sh->size >> 9); - if (raid_conf->sector_count >= 128) + if (sh->sector == conf->next_sector) { + conf->sector_count += (sh->size >> 9); + if (conf->sector_count >= 128) unplug = 1; } else unplug = 1; if (unplug) { - PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, raid_conf->sector_count)); + PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, conf->sector_count)); unplug_devices(sh); unplug = 0; - raid_conf->sector_count = 0; + conf->sector_count = 0; } - raid_conf->next_sector = sh->sector + (sh->size >> 9); + conf->next_sector = sh->sector + (sh->size >> 9); handled++; handle_stripe(sh); goto repeat; } } - if (raid_conf) { - PRINTK(("%d stripes handled, nr_handle %d\n", handled, atomic_read(&raid_conf->nr_handle))); + if (conf) { + PRINTK(("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle))); save_flags(flags); cli(); - if (!atomic_read(&raid_conf->nr_handle)) - clear_bit(THREAD_WAKEUP, &raid_conf->thread->flags); + if (!md_atomic_read(&conf->nr_handle)) + clear_bit(THREAD_WAKEUP, &conf->thread->flags); + restore_flags(flags); } PRINTK(("--- raid5d inactive\n")); } -#if SUPPORT_RECONSTRUCTION /* * Private kernel thread for parity reconstruction after an unclean * shutdown. Reconstruction on spare drives in case of a failed drive @@ -1309,44 +1391,64 @@ */ static void raid5syncd (void *data) { - struct raid5_data *raid_conf = data; - struct md_dev *mddev = raid_conf->mddev; + raid5_conf_t *conf = data; + mddev_t *mddev = conf->mddev; - if (!raid_conf->resync_parity) + if (!conf->resync_parity) + return; + if (conf->resync_parity == 2) + return; + down(&mddev->recovery_sem); + if (md_do_sync(mddev,NULL)) { + up(&mddev->recovery_sem); + printk("raid5: resync aborted!\n"); return; - md_do_sync(mddev); - raid_conf->resync_parity = 0; + } + conf->resync_parity = 0; + up(&mddev->recovery_sem); + printk("raid5: resync finished.\n"); } -#endif /* SUPPORT_RECONSTRUCTION */ -static int __check_consistency (struct md_dev *mddev, int row) +static int __check_consistency (mddev_t *mddev, int row) { - struct raid5_data *raid_conf = mddev->private; + raid5_conf_t *conf = mddev->private; kdev_t dev; struct buffer_head *bh[MD_SB_DISKS], tmp; - int i, rc = 0, nr = 0; + int i, rc = 0, nr = 0, count; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; - if (raid_conf->working_disks != raid_conf->raid_disks) + if (conf->working_disks != conf->raid_disks) return 0; tmp.b_size = 4096; if ((tmp.b_data = (char *) get_free_page(GFP_KERNEL)) == NULL) return 0; + md_clear_page((unsigned long)tmp.b_data); memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *)); - for (i = 0; i < raid_conf->raid_disks; i++) { - dev = raid_conf->disks[i].dev; + for (i = 0; i < conf->raid_disks; i++) { + dev = conf->disks[i].dev; set_blocksize(dev, 4096); if ((bh[i] = bread(dev, row / 4, 4096)) == NULL) break; nr++; } - if (nr == raid_conf->raid_disks) { - for (i = 1; i < nr; i++) - xor_block(&tmp, bh[i]); + if (nr == conf->raid_disks) { + bh_ptr[0] = &tmp; + count = 1; + for (i = 1; i < nr; i++) { + bh_ptr[count++] = bh[i]; + if (count == 
MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; + } + } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } if (memcmp(tmp.b_data, bh[0]->b_data, 4096)) rc = 1; } - for (i = 0; i < raid_conf->raid_disks; i++) { - dev = raid_conf->disks[i].dev; + for (i = 0; i < conf->raid_disks; i++) { + dev = conf->disks[i].dev; if (bh[i]) { bforget(bh[i]); bh[i] = NULL; @@ -1358,285 +1460,607 @@ return rc; } -static int check_consistency (struct md_dev *mddev) +static int check_consistency (mddev_t *mddev) { - int size = mddev->sb->size; - int row; + if (__check_consistency(mddev, 0)) +/* + * We are not checking this currently, as it's legitimate to have + * an inconsistent array, at creation time. + */ + return 0; - for (row = 0; row < size; row += size / 8) - if (__check_consistency(mddev, row)) - return 1; return 0; } -static int raid5_run (int minor, struct md_dev *mddev) +static int raid5_run (mddev_t *mddev) { - struct raid5_data *raid_conf; + raid5_conf_t *conf; int i, j, raid_disk, memory; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; - struct real_dev *realdev; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *desc; + mdk_rdev_t *rdev; + struct disk_info *disk; + struct md_list_head *tmp; + int start_recovery = 0; MOD_INC_USE_COUNT; if (sb->level != 5 && sb->level != 4) { - printk("raid5: %s: raid level not set to 4/5 (%d)\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level); + printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level); MOD_DEC_USE_COUNT; return -EIO; } - mddev->private = kmalloc (sizeof (struct raid5_data), GFP_KERNEL); - if ((raid_conf = mddev->private) == NULL) + mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL); + if ((conf = mddev->private) == NULL) goto abort; - memset (raid_conf, 0, sizeof (*raid_conf)); - raid_conf->mddev = mddev; + memset (conf, 0, sizeof (*conf)); + conf->mddev = mddev; - if ((raid_conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) + if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) goto abort; - memset(raid_conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); + memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); - init_waitqueue(&raid_conf->wait_for_stripe); - PRINTK(("raid5_run(%d) called.\n", minor)); - - for (i = 0; i < mddev->nb_dev; i++) { - realdev = &mddev->devices[i]; - if (!realdev->sb) { - printk(KERN_ERR "raid5: disabled device %s (couldn't access raid superblock)\n", kdevname(realdev->dev)); - continue; - } + init_waitqueue(&conf->wait_for_stripe); + PRINTK(("raid5_run(md%d) called.\n", mdidx(mddev))); + ITERATE_RDEV(mddev,rdev,tmp) { /* * This is important -- we are using the descriptor on * the disk only to get a pointer to the descriptor on * the main superblock, which might be more recent. 
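The __check_consistency() routine above leans on the basic RAID-4/5 invariant: the parity block of a stripe is the XOR of all its data blocks, so XOR-ing one block from every member disk -- parity included -- must come out as all zeroes. A minimal stand-alone sketch of that invariant (illustrative only, not the driver's buffer_head based code):

	/*
	 * Returns 1 if the stripe row is consistent: the XOR of all blocks
	 * in the row (data blocks plus the parity block) is zero.
	 */
	static int stripe_row_consistent(unsigned char **blocks, int nr_disks, int len)
	{
		int i, j;

		for (j = 0; j < len; j++) {
			unsigned char x = 0;

			for (i = 0; i < nr_disks; i++)
				x ^= blocks[i][j];
			if (x)
				return 0;	/* parity does not match the data */
		}
		return 1;
	}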
*/ - descriptor = &sb->disks[realdev->sb->descriptor.number]; - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) { - printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", kdevname(realdev->dev)); + desc = sb->disks + rdev->desc_nr; + raid_disk = desc->raid_disk; + disk = conf->disks + raid_disk; + + if (disk_faulty(desc)) { + printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev)); + if (!rdev->faulty) { + MD_BUG(); + goto abort; + } + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; continue; } - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) { - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) { - printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", kdevname(realdev->dev)); - continue; + if (disk_active(desc)) { + if (!disk_sync(desc)) { + printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev)); + MD_BUG(); + goto abort; } - raid_disk = descriptor->raid_disk; - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) { - printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", kdevname(realdev->dev)); + if (raid_disk > sb->raid_disks) { + printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev)); continue; } - if (raid_conf->disks[raid_disk].operational) { - printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", kdevname(realdev->dev), raid_disk); + if (disk->operational) { + printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk); continue; } - printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", kdevname(realdev->dev), raid_disk); + printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk); - raid_conf->disks[raid_disk].number = descriptor->number; - raid_conf->disks[raid_disk].raid_disk = raid_disk; - raid_conf->disks[raid_disk].dev = mddev->devices[i].dev; - raid_conf->disks[raid_disk].operational = 1; + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + disk->operational = 1; + disk->used_slot = 1; - raid_conf->working_disks++; + conf->working_disks++; } else { /* * Must be a spare disk .. 
*/ - printk(KERN_INFO "raid5: spare disk %s\n", kdevname(realdev->dev)); - raid_disk = descriptor->raid_disk; - raid_conf->disks[raid_disk].number = descriptor->number; - raid_conf->disks[raid_disk].raid_disk = raid_disk; - raid_conf->disks[raid_disk].dev = mddev->devices [i].dev; - - raid_conf->disks[raid_disk].operational = 0; - raid_conf->disks[raid_disk].write_only = 0; - raid_conf->disks[raid_disk].spare = 1; - } - } - raid_conf->raid_disks = sb->raid_disks; - raid_conf->failed_disks = raid_conf->raid_disks - raid_conf->working_disks; - raid_conf->mddev = mddev; - raid_conf->chunk_size = sb->chunk_size; - raid_conf->level = sb->level; - raid_conf->algorithm = sb->parity_algorithm; - raid_conf->max_nr_stripes = NR_STRIPES; + printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev)); + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; - if (raid_conf->working_disks != sb->raid_disks && sb->state != (1 << MD_SB_CLEAN)) { - printk(KERN_ALERT "raid5: raid set %s not clean and not all disks are operational -- run ckraid\n", kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + } } - if (!raid_conf->chunk_size || raid_conf->chunk_size % 4) { - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", raid_conf->chunk_size, kdevname(MKDEV(MD_MAJOR, minor))); + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = sb->disks + i; + raid_disk = desc->raid_disk; + disk = conf->disks + raid_disk; + + if (disk_faulty(desc) && (raid_disk < sb->raid_disks) && + !conf->disks[raid_disk].used_slot) { + + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + } + } + + conf->raid_disks = sb->raid_disks; + /* + * 0 for a fully functional array, 1 for a degraded array. 
+ */ + conf->failed_disks = conf->raid_disks - conf->working_disks; + conf->mddev = mddev; + conf->chunk_size = sb->chunk_size; + conf->level = sb->level; + conf->algorithm = sb->layout; + conf->max_nr_stripes = NR_STRIPES; + +#if 0 + for (i = 0; i < conf->raid_disks; i++) { + if (!conf->disks[i].used_slot) { + MD_BUG(); + goto abort; + } + } +#endif + if (!conf->chunk_size || conf->chunk_size % 4) { + printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev)); goto abort; } - if (raid_conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { - printk(KERN_ERR "raid5: unsupported parity algorithm %d for %s\n", raid_conf->algorithm, kdevname(MKDEV(MD_MAJOR, minor))); + if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { + printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev)); goto abort; } - if (raid_conf->failed_disks > 1) { - printk(KERN_ERR "raid5: not enough operational devices for %s (%d/%d failed)\n", kdevname(MKDEV(MD_MAJOR, minor)), raid_conf->failed_disks, raid_conf->raid_disks); + if (conf->failed_disks > 1) { + printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks); goto abort; } - if ((sb->state & (1 << MD_SB_CLEAN)) && check_consistency(mddev)) { - printk(KERN_ERR "raid5: detected raid-5 xor inconsistenty -- run ckraid\n"); - sb->state |= 1 << MD_SB_ERRORS; - goto abort; + if (conf->working_disks != sb->raid_disks) { + printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); + start_recovery = 1; } - if ((raid_conf->thread = md_register_thread(raid5d, raid_conf)) == NULL) { - printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) && + check_consistency(mddev)) { + printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n"); + sb->state &= ~(1 << MD_SB_CLEAN); } -#if SUPPORT_RECONSTRUCTION - if ((raid_conf->resync_thread = md_register_thread(raid5syncd, raid_conf)) == NULL) { - printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; + { + const char * name = "raid5d"; + + conf->thread = md_register_thread(raid5d, conf, name); + if (!conf->thread) { + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); + goto abort; + } } -#endif /* SUPPORT_RECONSTRUCTION */ - memory = raid_conf->max_nr_stripes * (sizeof(struct stripe_head) + - raid_conf->raid_disks * (sizeof(struct buffer_head) + + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + conf->raid_disks * (sizeof(struct buffer_head) + 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024; - if (grow_stripes(raid_conf, raid_conf->max_nr_stripes, GFP_KERNEL)) { + if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) { printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory); - shrink_stripes(raid_conf, raid_conf->max_nr_stripes); + shrink_stripes(conf, conf->max_nr_stripes); goto abort; } else - printk(KERN_INFO "raid5: allocated %dkB for %s\n", memory, kdevname(MKDEV(MD_MAJOR, minor))); + printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev)); /* * Regenerate the "device is in sync with the raid set" bit for * each device. 
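For a sense of scale of the `memory' figure computed above: per cached stripe the estimate charges sizeof(struct stripe_head) plus, for every member disk, three struct buffer_head and two PAGE_SIZE data pages, so with 4 kB pages the per-disk cost is dominated by 2 * 4 kB = 8 kB. On a 5-disk set that is roughly 40 kB per stripe, and the value printed by the KERN_INFO message is essentially max_nr_stripes times that, in kilobytes.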
*/ - for (i = 0; i < sb->nr_disks ; i++) { - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE); + for (i = 0; i < MD_SB_DISKS ; i++) { + mark_disk_nonsync(sb->disks + i); for (j = 0; j < sb->raid_disks; j++) { - if (!raid_conf->disks[j].operational) + if (!conf->disks[j].operational) continue; - if (sb->disks[i].number == raid_conf->disks[j].number) - sb->disks[i].state |= 1 << MD_SYNC_DEVICE; + if (sb->disks[i].number == conf->disks[j].number) + mark_disk_sync(sb->disks + i); } } - sb->active_disks = raid_conf->working_disks; + sb->active_disks = conf->working_disks; if (sb->active_disks == sb->raid_disks) - printk("raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm); + printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm); else - printk(KERN_ALERT "raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm); + printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm); + + if (!start_recovery && ((sb->state & (1 << MD_SB_CLEAN))==0)) { + const char * name = "raid5syncd"; + + conf->resync_thread = md_register_thread(raid5syncd, conf,name); + if (!conf->resync_thread) { + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); + goto abort; + } - if ((sb->state & (1 << MD_SB_CLEAN)) == 0) { - printk("raid5: raid set %s not clean; re-constructing parity\n", kdevname(MKDEV(MD_MAJOR, minor))); - raid_conf->resync_parity = 1; -#if SUPPORT_RECONSTRUCTION - md_wakeup_thread(raid_conf->resync_thread); -#endif /* SUPPORT_RECONSTRUCTION */ + printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev)); + conf->resync_parity = 1; + md_wakeup_thread(conf->resync_thread); } + print_raid5_conf(conf); + if (start_recovery) + md_recover_arrays(); + print_raid5_conf(conf); + /* Ok, everything is just fine now */ return (0); abort: - if (raid_conf) { - if (raid_conf->stripe_hashtbl) - free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER); - kfree(raid_conf); + if (conf) { + print_raid5_conf(conf); + if (conf->stripe_hashtbl) + free_pages((unsigned long) conf->stripe_hashtbl, + HASH_PAGES_ORDER); + kfree(conf); } mddev->private = NULL; - printk(KERN_ALERT "raid5: failed to run raid set %s\n", kdevname(MKDEV(MD_MAJOR, minor))); + printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev)); MOD_DEC_USE_COUNT; return -EIO; } -static int raid5_stop (int minor, struct md_dev *mddev) +static int raid5_stop_resync (mddev_t *mddev) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + mdk_thread_t *thread = conf->resync_thread; + + if (thread) { + if (conf->resync_parity) { + conf->resync_parity = 2; + md_interrupt_thread(thread); + printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid5_restart_resync (mddev_t *mddev) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; + raid5_conf_t *conf = mddev_to_conf(mddev); - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes); - shrink_stripes(raid_conf, raid_conf->max_nr_stripes); - 
md_unregister_thread(raid_conf->thread); -#if SUPPORT_RECONSTRUCTION - md_unregister_thread(raid_conf->resync_thread); -#endif /* SUPPORT_RECONSTRUCTION */ - free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER); - kfree(raid_conf); + if (conf->resync_parity) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + printk("raid5: waking up raid5resync.\n"); + conf->resync_parity = 1; + md_wakeup_thread(conf->resync_thread); + return 1; + } else + printk("raid5: no restart-resync needed.\n"); + return 0; +} + + +static int raid5_stop (mddev_t *mddev) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + + shrink_stripe_cache(conf, conf->max_nr_stripes); + shrink_stripes(conf, conf->max_nr_stripes); + md_unregister_thread(conf->thread); + if (conf->resync_thread) + md_unregister_thread(conf->resync_thread); + free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); + kfree(conf); mddev->private = NULL; MOD_DEC_USE_COUNT; return 0; } -static int raid5_status (char *page, int minor, struct md_dev *mddev) +static int raid5_status (char *page, mddev_t *mddev) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; - md_superblock_t *sb = mddev->sb; + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + mdp_super_t *sb = mddev->sb; int sz = 0, i; - sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->parity_algorithm); - sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks); - for (i = 0; i < raid_conf->raid_disks; i++) - sz += sprintf (page+sz, "%s", raid_conf->disks[i].operational ? "U" : "_"); + sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout); + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_"); sz += sprintf (page+sz, "]"); return sz; } -static int raid5_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, int state) +static void print_raid5_conf (raid5_conf_t *conf) +{ + int i; + struct disk_info *tmp; + + printk("RAID5 conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; + } + printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, + conf->working_disks, conf->failed_disks); + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + partition_name(tmp->dev)); + } +} + +static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state) { - int i = 0, failed_disk = -1; - struct raid5_data *raid_conf = mddev->private; - struct disk_info *disk = raid_conf->disks; + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; + raid5_conf_t *conf = mddev->private; + struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; unsigned long flags; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc, *added_desc; - for (i = 0; i < MD_SB_DISKS; i++, disk++) { - if (disk->spare && disk->number == spare->number) - goto found; - } - return 1; -found: - for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++) - if (!disk->operational) - failed_disk = i; - if (failed_disk == -1) - return 1; save_flags(flags); cli(); + + print_raid5_conf(conf); + /* + * find the disk ... 
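The searches in the switch statement below all assume one fixed layout of conf->disks[], which is worth spelling out (a summary drawn from the checks in the code, not wording from the patch):

	/*
	 * conf->disks[0 .. raid_disks-1]            'low' area: real array
	 *                                           members, operational or failed
	 * conf->disks[raid_disks .. MD_SB_DISKS-1]  'high' area: spares and
	 *                                           freshly hot-added disks
	 *
	 * DISKOP_SPARE_ACTIVE looks up the failed slot in the low area and
	 * the spare in the high area, then swaps the two entries so that the
	 * activated disk ends up occupying the low slot.
	 */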
+ */ switch (state) { - case SPARE_WRITE: - disk->operational = 1; - disk->write_only = 1; - raid_conf->spare = disk; - break; - case SPARE_INACTIVE: - disk->operational = 0; - disk->write_only = 0; - raid_conf->spare = NULL; - break; - case SPARE_ACTIVE: - disk->spare = 0; - disk->write_only = 0; - descriptor = &sb->disks[raid_conf->disks[failed_disk].number]; - i = spare->raid_disk; - disk->raid_disk = spare->raid_disk = descriptor->raid_disk; - if (disk->raid_disk != failed_disk) - printk("raid5: disk->raid_disk != failed_disk"); - descriptor->raid_disk = i; - - raid_conf->spare = NULL; - raid_conf->working_disks++; - raid_conf->failed_disks--; - raid_conf->disks[failed_disk] = *disk; - break; - default: - printk("raid5_mark_spare: bug: state == %d\n", state); - restore_flags(flags); - return 1; + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID5 configuration ... + * (this can only be in the first conf->raid_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->disks + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... (can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + } + + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + if (conf->spare) { + MD_BUG(); + err = 1; + goto abort; + } + sdisk = conf->disks + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + conf->spare = sdisk; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + sdisk = conf->disks + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + /* + * Was the spare being resynced? + */ + if (conf->spare == sdisk) + conf->spare = NULL; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. 
(only the first 'conf->raid_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: + if (!conf->spare) { + MD_BUG(); + err = 1; + goto abort; + } + sdisk = conf->disks + spare_disk; + fdisk = conf->disks + failed_disk; + + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; + + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } + + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + /* + * do the switch finally + */ + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); + + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); + + *d = failed_desc; + + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. + */ + conf->failed_disks--; + conf->working_disks++; + conf->spare = NULL; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->disks + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + + break; + + case DISKOP_HOT_ADD_DISK: + adisk = conf->disks + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + MD_BUG(); + err = 1; + goto abort; + } + + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->write_only = 0; + adisk->spare = 1; + adisk->used_slot = 1; + + + break; + + default: + MD_BUG(); + err = 1; + goto abort; } +abort: restore_flags(flags); - return 0; + print_raid5_conf(conf); + return err; } -static struct md_personality raid5_personality= +static mdk_personality_t raid5_personality= { "raid5", raid5_map, @@ -1648,14 +2072,19 @@ NULL, /* no ioctls */ 0, raid5_error, - /* raid5_hot_add_disk, */ NULL, - /* raid1_hot_remove_drive */ NULL, - raid5_mark_spare + raid5_diskop, + raid5_stop_resync, + raid5_restart_resync }; int raid5_init (void) { - return register_md_personality (RAID5, &raid5_personality); + int err; + + err = register_md_personality (RAID5, &raid5_personality); + if (err) + return err; + return 0; } #ifdef MODULE diff -ruN linux.orig/drivers/block/translucent.c linux-2.2.16/drivers/block/translucent.c --- linux.orig/drivers/block/translucent.c Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/drivers/block/translucent.c Fri Jun 9 11:37:45 2000 @@ -0,0 +1,136 @@ +/* + translucent.c : Translucent RAID driver for Linux + Copyright (C) 1998 Ingo Molnar + + Translucent mode management functions. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include + +#include +#include + +#include + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + +static int translucent_run (mddev_t *mddev) +{ + translucent_conf_t *conf; + mdk_rdev_t *rdev; + int i; + + MOD_INC_USE_COUNT; + + conf = kmalloc (sizeof (*conf), GFP_KERNEL); + if (!conf) + goto out; + mddev->private = conf; + + if (mddev->nb_dev != 2) { + printk("translucent: this mode needs 2 disks, aborting!\n"); + goto out; + } + + if (md_check_ordering(mddev)) { + printk("translucent: disks are not ordered, aborting!\n"); + goto out; + } + + ITERATE_RDEV_ORDERED(mddev,rdev,i) { + dev_info_t *disk = conf->disks + i; + + disk->dev = rdev->dev; + disk->size = rdev->size; + } + + return 0; + +out: + if (conf) + kfree(conf); + + MOD_DEC_USE_COUNT; + return 1; +} + +static int translucent_stop (mddev_t *mddev) +{ + translucent_conf_t *conf = mddev_to_conf(mddev); + + kfree(conf); + + MOD_DEC_USE_COUNT; + + return 0; +} + + +static int translucent_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long size) +{ + translucent_conf_t *conf = mddev_to_conf(mddev); + + *rdev = conf->disks[0].dev; + + return 0; +} + +static int translucent_status (char *page, mddev_t *mddev) +{ + int sz = 0; + + sz += sprintf(page+sz, " %d%% full", 10); + return sz; +} + + +static mdk_personality_t translucent_personality= +{ + "translucent", + translucent_map, + NULL, + NULL, + translucent_run, + translucent_stop, + translucent_status, + NULL, + 0, + NULL, + NULL, + NULL, + NULL +}; + +#ifndef MODULE + +md__initfunc(void translucent_init (void)) +{ + register_md_personality (TRANSLUCENT, &translucent_personality); +} + +#else + +int init_module (void) +{ + return (register_md_personality (TRANSLUCENT, &translucent_personality)); +} + +void cleanup_module (void) +{ + unregister_md_personality (TRANSLUCENT); +} + +#endif + diff -ruN linux.orig/drivers/block/xor.c linux-2.2.16/drivers/block/xor.c --- linux.orig/drivers/block/xor.c Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/drivers/block/xor.c Fri Jun 9 11:37:45 2000 @@ -0,0 +1,1894 @@ +/* + * xor.c : Multiple Devices driver for Linux + * + * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek + * + * + * optimized RAID-5 checksumming functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include +#include +#ifdef __sparc_v9__ +#include +#include +#include +#endif + +/* + * we use the 'XOR function template' to register multiple xor + * functions runtime. The kernel measures their speed upon bootup + * and decides which one to use. 
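To make that registration-and-benchmark idea concrete, here is a rough sketch (not code from this patch) of how the template list declared just below could be scanned once each routine's speed has been measured; it only assumes the xor_block/speed/next members of struct xor_block_template and the global xor_block pointer, both defined a few lines further on:

	/*
	 * Illustrative only: walk the registered templates and install the
	 * fastest routine as the global xor_block pointer.  Assumes every
	 * template's 'speed' field was already filled in by a benchmark run.
	 */
	static void pick_fastest_xor_block(struct xor_block_template *list)
	{
		struct xor_block_template *t, *best = list;

		for (t = list; t; t = t->next)
			if (t->speed > best->speed)
				best = t;
		if (best)
			xor_block = best->xor_block;
	}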
(compile-time registration is + * not enough as certain CPU features like MMX can only be detected + * runtime) + * + * this architecture makes it pretty easy to add new routines + * that are faster on certain CPUs, without killing other CPU's + * 'native' routine. Although the current routines are belived + * to be the physically fastest ones on all CPUs tested, but + * feel free to prove me wrong and add yet another routine =B-) + * --mingo + */ + +#define MAX_XOR_BLOCKS 5 + +#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr) + +typedef void (*xor_block_t) XOR_ARGS; +xor_block_t xor_block = NULL; + +#ifndef __sparc_v9__ + +struct xor_block_template; + +struct xor_block_template { + char * name; + xor_block_t xor_block; + int speed; + struct xor_block_template * next; +}; + +struct xor_block_template * xor_functions = NULL; + +#define XORBLOCK_TEMPLATE(x) \ +static void xor_block_##x XOR_ARGS; \ +static struct xor_block_template t_xor_block_##x = \ + { #x, xor_block_##x, 0, NULL }; \ +static void xor_block_##x XOR_ARGS + +#ifdef __i386__ + +#ifdef CONFIG_X86_XMM +/* + * Cache avoiding checksumming functions utilizing KNI instructions + * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) + */ + +XORBLOCK_TEMPLATE(pIII_kni) +{ + char xmm_save[16*4]; + int cr0; + int lines = (bh_ptr[0]->b_size>>8); + + __asm__ __volatile__ ( + "movl %%cr0,%0 ;\n\t" + "clts ;\n\t" + "movups %%xmm0,(%1) ;\n\t" + "movups %%xmm1,0x10(%1) ;\n\t" + "movups %%xmm2,0x20(%1) ;\n\t" + "movups %%xmm3,0x30(%1) ;\n\t" + : "=r" (cr0) + : "r" (xmm_save) + : "memory" ); + +#define OFFS(x) "8*("#x"*2)" +#define PF0(x) \ + " prefetcht0 "OFFS(x)"(%1) ;\n" +#define LD(x,y) \ + " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" +#define ST(x,y) \ + " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" +#define PF1(x) \ + " prefetchnta "OFFS(x)"(%2) ;\n" +#define PF2(x) \ + " prefetchnta "OFFS(x)"(%3) ;\n" +#define PF3(x) \ + " prefetchnta "OFFS(x)"(%4) ;\n" +#define PF4(x) \ + " prefetchnta "OFFS(x)"(%5) ;\n" +#define PF5(x) \ + " prefetchnta "OFFS(x)"(%6) ;\n" +#define XO1(x,y) \ + " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" +#define XO2(x,y) \ + " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n" +#define XO3(x,y) \ + " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" +#define XO4(x,y) \ + " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" +#define XO5(x,y) \ + " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" + + switch(count) { + case 2: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + PF1(i) \ + PF1(i+2) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF0(i+4) \ + PF0(i+6) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory" ); + break; + case 3: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + 
"r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory" ); + break; + case 4: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + PF3(i) \ + PF3(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " addl $256, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory" ); + break; + case 5: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + PF3(i) \ + PF3(i+2) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + PF4(i) \ + PF4(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + XO4(i,0) \ + XO4(i+1,1) \ + XO4(i+2,2) \ + XO4(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " addl $256, %4 ;\n" + " addl $256, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory"); + break; + } + + __asm__ __volatile__ ( + "sfence ;\n\t" + "movups (%1),%%xmm0 ;\n\t" + "movups 0x10(%1),%%xmm1 ;\n\t" + "movups 0x20(%1),%%xmm2 ;\n\t" + "movups 0x30(%1),%%xmm3 ;\n\t" + "movl %0,%%cr0 ;\n\t" + : + : "r" (cr0), "r" (xmm_save) + : "memory" ); +} + +#undef OFFS +#undef LD +#undef ST +#undef PF0 +#undef PF1 +#undef PF2 +#undef PF3 +#undef PF4 +#undef PF5 +#undef XO1 +#undef XO2 +#undef XO3 +#undef XO4 +#undef XO5 +#undef BLOCK + +#endif /* CONFIG_X86_XMM */ + +/* + * high-speed RAID5 checksumming functions utilizing MMX instructions + * Copyright (C) 1998 Ingo Molnar + */ +XORBLOCK_TEMPLATE(pII_mmx) +{ + char fpu_save[108]; + int lines = (bh_ptr[0]->b_size>>7); + + if (!(current->flags & PF_USEDFPU)) + __asm__ __volatile__ ( " clts;\n"); + + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); + +#define LD(x,y) \ + " movq 8*("#x")(%1), %%mm"#y" ;\n" +#define ST(x,y) \ + " movq %%mm"#y", 8*("#x")(%1) ;\n" +#define XO1(x,y) \ + " pxor 8*("#x")(%2), %%mm"#y" ;\n" +#define XO2(x,y) \ + " pxor 8*("#x")(%3), %%mm"#y" ;\n" +#define XO3(x,y) \ + " pxor 8*("#x")(%4), %%mm"#y" ;\n" +#define XO4(x,y) \ + " pxor 8*("#x")(%5), %%mm"#y" ;\n" + + switch(count) { + case 2: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + ST(i,0) \ + XO1(i+1,1) \ + ST(i+1,1) \ + XO1(i+2,2) \ + ST(i+2,2) \ + XO1(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" 
(bh_ptr[1]->b_data) + : "memory"); + break; + case 3: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + ST(i,0) \ + XO2(i+1,1) \ + ST(i+1,1) \ + XO2(i+2,2) \ + ST(i+2,2) \ + XO2(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory"); + break; + case 4: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + ST(i,0) \ + XO3(i+1,1) \ + ST(i+1,1) \ + XO3(i+2,2) \ + ST(i+2,2) \ + XO3(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " addl $128, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory"); + break; + case 5: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + XO4(i,0) \ + ST(i,0) \ + XO4(i+1,1) \ + ST(i+1,1) \ + XO4(i+2,2) \ + ST(i+2,2) \ + XO4(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " addl $128, %4 ;\n" + " addl $128, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory"); + break; + } + + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); + + if (!(current->flags & PF_USEDFPU)) + stts(); +} + +#undef LD +#undef XO1 +#undef XO2 +#undef XO3 +#undef XO4 +#undef ST +#undef BLOCK + +XORBLOCK_TEMPLATE(p5_mmx) +{ + char fpu_save[108]; + int lines = (bh_ptr[0]->b_size>>6); + + if (!(current->flags & PF_USEDFPU)) + __asm__ __volatile__ ( " clts;\n"); + + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); + + switch(count) { + case 2: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 8(%2), %%mm1 ;\n" + " movq 24(%1), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " pxor 16(%2), %%mm2 ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq 40(%1), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%2), %%mm4 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq 56(%1), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%2), %%mm6 ;\n" + " pxor 56(%2), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory" 
); + break; + case 3: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " movq 24(%1), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 32(%2), %%mm4 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " movq 56(%1), %%mm7 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 56(%2), %%mm7 ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory" ); + break; + case 4: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor (%4), %%mm0 ;\n" + " movq 24(%1), %%mm3 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " pxor 8(%4), %%mm1 ;\n" + " movq %%mm0, (%1) ;\n" + " movq 32(%1), %%mm4 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " pxor 16(%4), %%mm2 ;\n" + " movq %%mm1, 8(%1) ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 32(%2), %%mm4 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 40(%2), %%mm5 ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 24(%4), %%mm3 ;\n" + " movq %%mm3, 24(%1) ;\n" + " movq 56(%1), %%mm7 ;\n" + " movq 48(%1), %%mm6 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " pxor 32(%4), %%mm4 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 56(%2), %%mm7 ;\n" + " pxor 40(%4), %%mm5 ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%4), %%mm6 ;\n" + " pxor 56(%4), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " addl $64, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory" ); + break; + case 5: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " pxor (%4), %%mm0 ;\n" + " pxor 8(%4), %%mm1 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " movq 24(%1), %%mm3 ;\n" + " pxor (%5), %%mm0 ;\n" + " pxor 8(%5), %%mm1 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 16(%4), %%mm2 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " pxor 16(%5), %%mm2 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 24(%4), %%mm3 ;\n" + " pxor 32(%2), %%mm4 ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 24(%5), %%mm3 ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%4), %%mm4 ;\n" + " 
pxor 40(%3), %%mm5 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq 56(%1), %%mm7 ;\n" + " pxor 32(%5), %%mm4 ;\n" + " pxor 40(%4), %%mm5 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " pxor 56(%2), %%mm7 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " pxor 40(%5), %%mm5 ;\n" + " pxor 48(%4), %%mm6 ;\n" + " pxor 56(%4), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%5), %%mm6 ;\n" + " pxor 56(%5), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " addl $64, %4 ;\n" + " addl $64, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory" ); + break; + } + + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); + + if (!(current->flags & PF_USEDFPU)) + stts(); +} +#endif /* __i386__ */ +#endif /* !__sparc_v9__ */ + +#ifdef __sparc_v9__ +/* + * High speed xor_block operation for RAID4/5 utilizing the + * UltraSparc Visual Instruction Set. + * + * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) + * + * Requirements: + * !(((long)dest | (long)sourceN) & (64 - 1)) && + * !(len & 127) && len >= 256 + * + * It is done in pure assembly, as otherwise gcc makes it + * a non-leaf function, which is not what we want. + * Also, we don't measure the speeds as on other architectures, + * as the measuring routine does not take into account cold caches + * and the fact that xor_block_VIS bypasses the caches. + * xor_block_32regs might be 5% faster for count 2 if caches are hot + * and things just right (for count 3 VIS is about as fast as 32regs for + * hot caches and for count 4 and 5 VIS is faster by good margin always), + * but I think it is better not to pollute the caches. + * Actually, if I'd just fight for speed for hot caches, I could + * write a hybrid VIS/integer routine, which would do always two + * 64B blocks in VIS and two in IEUs, but I really care more about + * caches. 
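Restated as code, the entry requirements quoted at the top of this comment amount to the following caller-side test (purely illustrative -- the routine itself simply assumes they hold, and the same alignment constraint applies to the destination and to every source buffer):

	/*
	 * xor_block_VIS() preconditions: destination and sources 64-byte
	 * aligned, length a multiple of 128 bytes and at least 256 bytes.
	 */
	static inline int vis_xor_usable(unsigned long dest, unsigned long src, int len)
	{
		return !((dest | src) & (64 - 1)) && !(len & 127) && len >= 256;
	}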
+ */ +extern void *VISenter(void); +extern void xor_block_VIS XOR_ARGS; + +void __xor_block_VIS(void) +{ +__asm__ (" + .globl xor_block_VIS +xor_block_VIS: + ldx [%%o1 + 0], %%o4 + ldx [%%o1 + 8], %%o3 + ldx [%%o4 + %1], %%g5 + ldx [%%o4 + %0], %%o4 + ldx [%%o3 + %0], %%o3 + rd %%fprs, %%o5 + andcc %%o5, %2, %%g0 + be,pt %%icc, 297f + sethi %%hi(%5), %%g1 + jmpl %%g1 + %%lo(%5), %%g7 + add %%g7, 8, %%g7 +297: wr %%g0, %4, %%fprs + membar #LoadStore|#StoreLoad|#StoreStore + sub %%g5, 64, %%g5 + ldda [%%o4] %3, %%f0 + ldda [%%o3] %3, %%f16 + cmp %%o0, 4 + bgeu,pt %%xcc, 10f + cmp %%o0, 3 + be,pn %%xcc, 13f + mov -64, %%g1 + sub %%g5, 64, %%g5 + rd %%asi, %%g1 + wr %%g0, %3, %%asi + +2: ldda [%%o4 + 64] %%asi, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + stda %%f16, [%%o4] %3 + ldda [%%o3 + 64] %%asi, %%f48 + ldda [%%o4 + 128] %%asi, %%f0 + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + add %%o4, 128, %%o4 + fxor %%f36, %%f52, %%f52 + add %%o3, 128, %%o3 + fxor %%f38, %%f54, %%f54 + subcc %%g5, 128, %%g5 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 - 64] %%asi + bne,pt %%xcc, 2b + ldda [%%o3] %3, %%f16 + + ldda [%%o4 + 64] %%asi, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + stda %%f16, [%%o4] %3 + ldda [%%o3 + 64] %%asi, %%f48 + membar #Sync + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 + 64] %%asi + membar #Sync|#StoreStore|#StoreLoad + wr %%g0, 0, %%fprs + retl + wr %%g1, %%g0, %%asi + +13: ldx [%%o1 + 16], %%o2 + ldx [%%o2 + %0], %%o2 + +3: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + add %%o4, 64, %%o4 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o4] %3, %%f0 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + add %%o2, 64, %%o2 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 3b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + membar #Sync + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + +10: cmp %%o0, 5 + be,pt %%xcc, 15f + mov -64, %%g1 + +14: ldx [%%o1 + 16], %%o2 + ldx [%%o1 + 24], %%o0 + ldx [%%o2 + %0], %%o2 + ldx [%%o0 + %0], %%o0 + +4: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + add %%o4, 64, %%o4 + fxor %%f4, 
%%f20, %%f20 + fxor %%f6, %%f22, %%f22 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + ldda [%%o0] %3, %%f48 + fxor %%f16, %%f32, %%f32 + fxor %%f18, %%f34, %%f34 + fxor %%f20, %%f36, %%f36 + fxor %%f22, %%f38, %%f38 + add %%o2, 64, %%o2 + fxor %%f24, %%f40, %%f40 + fxor %%f26, %%f42, %%f42 + fxor %%f28, %%f44, %%f44 + fxor %%f30, %%f46, %%f46 + ldda [%%o4] %3, %%f0 + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + add %%o0, 64, %%o0 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 4b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + ldda [%%o0] %3, %%f48 + fxor %%f16, %%f32, %%f32 + fxor %%f18, %%f34, %%f34 + fxor %%f20, %%f36, %%f36 + fxor %%f22, %%f38, %%f38 + fxor %%f24, %%f40, %%f40 + fxor %%f26, %%f42, %%f42 + fxor %%f28, %%f44, %%f44 + fxor %%f30, %%f46, %%f46 + membar #Sync + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + +15: ldx [%%o1 + 16], %%o2 + ldx [%%o1 + 24], %%o0 + ldx [%%o1 + 32], %%o1 + ldx [%%o2 + %0], %%o2 + ldx [%%o0 + %0], %%o0 + ldx [%%o1 + %0], %%o1 + +5: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + add %%o4, 64, %%o4 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o0] %3, %%f16 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + add %%o2, 64, %%o2 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + ldda [%%o1] %3, %%f32 + fxor %%f48, %%f16, %%f48 + fxor %%f50, %%f18, %%f50 + add %%o0, 64, %%o0 + fxor %%f52, %%f20, %%f52 + fxor %%f54, %%f22, %%f54 + add %%o1, 64, %%o1 + fxor %%f56, %%f24, %%f56 + fxor %%f58, %%f26, %%f58 + fxor %%f60, %%f28, %%f60 + fxor %%f62, %%f30, %%f62 + ldda [%%o4] %3, %%f0 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 5b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o0] %3, %%f16 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + ldda [%%o1] %3, %%f32 + fxor %%f48, %%f16, %%f48 + fxor %%f50, %%f18, %%f50 + fxor %%f52, %%f20, %%f52 + fxor %%f54, %%f22, %%f54 + fxor %%f56, %%f24, %%f56 + fxor %%f58, %%f26, %%f58 
+ fxor %%f60, %%f28, %%f60 + fxor %%f62, %%f30, %%f62 + membar #Sync + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + " : : + "i" (&((struct buffer_head *)0)->b_data), + "i" (&((struct buffer_head *)0)->b_size), + "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P), + "i" (FPRS_FEF), "i" (VISenter)); +} +#endif /* __sparc_v9__ */ + +#if defined(__sparc__) && !defined(__sparc_v9__) +/* + * High speed xor_block operation for RAID4/5 utilizing the + * ldd/std SPARC instructions. + * + * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz) + * + */ + +XORBLOCK_TEMPLATE(SPARC) +{ + int size = bh_ptr[0]->b_size; + int lines = size / (sizeof (long)) / 8, i; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1 = (long *) bh_ptr[1]->b_data; + long *source2, *source3, *source4; + + switch (count) { + case 2: + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0", + "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + } + break; + case 4: + source2 = (long *) bh_ptr[2]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%3 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 
+ xor %%g5, %%l1, %%g5 + ldd [%3 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%3 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%3 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + source3 += 8; + } + break; + case 5: + source2 = (long *) bh_ptr[2]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source4 = (long *) bh_ptr[4]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%3 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%3 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%3 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%3 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%4 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%4 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%4 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%4 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + source3 += 8; + source4 += 8; + } + break; + } +} +#endif /* __sparc_v[78]__ */ + +#ifndef __sparc_v9__ + +/* + * this one works reasonably on any x86 CPU + * (send me an assembly version for inclusion if you can make it faster) + * + * this one is just as fast as written in pure assembly on x86. + * the reason for this separate version is that the + * fast open-coded xor routine "32reg" produces suboptimal code + * on x86, due to lack of registers. 
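All of the XORBLOCK_TEMPLATE routines in this file compute the same thing: bh_ptr[0] ends up holding the XOR of all 'count' buffers. As a point of reference, and assuming the template macro (defined elsewhere in this patch, not in this hunk) expands to a function of the form void xor_block_NAME(unsigned int count, struct buffer_head **bh_ptr) -- which is how every routine body here uses its arguments -- a minimal portable sketch of that contract is:

/*
 * Reference semantics only (kernel context, struct buffer_head from
 * <linux/fs.h>); the optimized variants in this file unroll this loop
 * and keep the working set in integer, FP or MMX registers.  The
 * (count, bh_ptr) signature is an assumption inferred from the bodies.
 */
static void xor_block_reference(unsigned int count, struct buffer_head **bh_ptr)
{
	unsigned long *dest = (unsigned long *) bh_ptr[0]->b_data;
	unsigned long words = bh_ptr[0]->b_size / sizeof(unsigned long);
	unsigned long w;
	unsigned int i;

	for (i = 1; i < count; i++) {
		unsigned long *src = (unsigned long *) bh_ptr[i]->b_data;

		for (w = 0; w < words; w++)
			dest[w] ^= src[w];	/* parity accumulates in bh_ptr[0] */
	}
}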
+ */ +XORBLOCK_TEMPLATE(8regs) +{ + int len = bh_ptr[0]->b_size; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1, *source2, *source3, *source4; + long lines = len / (sizeof (long)) / 8, i; + + switch(count) { + case 2: + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 7) ^= *(source1 + 7); + source1 += 8; + destp += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + source1 += 8; + source2 += 8; + destp += 8; + } + break; + case 4: + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 0) ^= *(source3 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 1) ^= *(source3 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 2) ^= *(source3 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 3) ^= *(source3 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 4) ^= *(source3 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 5) ^= *(source3 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 6) ^= *(source3 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + *(destp + 7) ^= *(source3 + 7); + source1 += 8; + source2 += 8; + source3 += 8; + destp += 8; + } + break; + case 5: + source4 = (long *) bh_ptr[4]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 0) ^= *(source3 + 0); + *(destp + 0) ^= *(source4 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 1) ^= *(source3 + 1); + *(destp + 1) ^= *(source4 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 2) ^= *(source3 + 2); + *(destp + 2) ^= *(source4 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 3) ^= *(source3 + 3); + *(destp + 3) ^= *(source4 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 4) ^= *(source3 + 4); + *(destp + 4) ^= *(source4 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 5) ^= *(source3 + 5); + *(destp + 5) ^= *(source4 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 6) ^= *(source3 + 6); + *(destp + 6) ^= *(source4 + 6); + 
*(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + *(destp + 7) ^= *(source3 + 7); + *(destp + 7) ^= *(source4 + 7); + source1 += 8; + source2 += 8; + source3 += 8; + source4 += 8; + destp += 8; + } + break; + } +} + +/* + * platform independent RAID5 checksum calculation, this should + * be very fast on any platform that has a decent amount of + * registers. (32 or more) + */ +XORBLOCK_TEMPLATE(32regs) +{ + int size = bh_ptr[0]->b_size; + int lines = size / (sizeof (long)) / 8, i; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1, *source2, *source3, *source4; + + /* LOTS of registers available... + We do explicite loop-unrolling here for code which + favours RISC machines. In fact this is almoast direct + RISC assembly on Alpha and SPARC :-) */ + + + switch(count) { + case 2: + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. */ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + destp[0] = d0; /* Store the result (in burts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in burts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + destp += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. */ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + d0 ^= source2[0]; + d1 ^= source2[1]; + d2 ^= source2[2]; + d3 ^= source2[3]; + d4 ^= source2[4]; + d5 ^= source2[5]; + d6 ^= source2[6]; + d7 ^= source2[7]; + destp[0] = d0; /* Store the result (in burts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in burts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + source2 += 8; + destp += 8; + } + break; + case 4: + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. 
*/ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + d0 ^= source2[0]; + d1 ^= source2[1]; + d2 ^= source2[2]; + d3 ^= source2[3]; + d4 ^= source2[4]; + d5 ^= source2[5]; + d6 ^= source2[6]; + d7 ^= source2[7]; + d0 ^= source3[0]; + d1 ^= source3[1]; + d2 ^= source3[2]; + d3 ^= source3[3]; + d4 ^= source3[4]; + d5 ^= source3[5]; + d6 ^= source3[6]; + d7 ^= source3[7]; + destp[0] = d0; /* Store the result (in burts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in burts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + source2 += 8; + source3 += 8; + destp += 8; + } + break; + case 5: + source4 = (long *) bh_ptr[4]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. */ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + d0 ^= source2[0]; + d1 ^= source2[1]; + d2 ^= source2[2]; + d3 ^= source2[3]; + d4 ^= source2[4]; + d5 ^= source2[5]; + d6 ^= source2[6]; + d7 ^= source2[7]; + d0 ^= source3[0]; + d1 ^= source3[1]; + d2 ^= source3[2]; + d3 ^= source3[3]; + d4 ^= source3[4]; + d5 ^= source3[5]; + d6 ^= source3[6]; + d7 ^= source3[7]; + d0 ^= source4[0]; + d1 ^= source4[1]; + d2 ^= source4[2]; + d3 ^= source4[3]; + d4 ^= source4[4]; + d5 ^= source4[5]; + d6 ^= source4[6]; + d7 ^= source4[7]; + destp[0] = d0; /* Store the result (in burts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in burts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + source2 += 8; + source3 += 8; + source4 += 8; + destp += 8; + } + break; + } +} + +/* + * (the -6*32 shift factor colors the cache) + */ +#define SIZE (PAGE_SIZE-6*32) + +static void xor_speed ( struct xor_block_template * func, + struct buffer_head *b1, struct buffer_head *b2) +{ + int speed; + unsigned long now; + int i, count, max; + struct buffer_head *bh_ptr[6]; + + func->next = xor_functions; + xor_functions = func; + bh_ptr[0] = b1; + bh_ptr[1] = b2; + + /* + * count the number of XORs done during a whole jiffy. + * calculate the speed of checksumming from this. 
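The measurement below counts how many complete xor_block() calls fit into one jiffy and folds HZ and the buffer size into a KB/sec figure. A worked sketch of that arithmetic, with HZ and PAGE_SIZE assumed to be the common i386 values of 100 and 4096 (substitute the real values for other configurations) and a made-up per-jiffy count:

/* Standalone illustration of the speed formula used by xor_speed(). */
#include <stdio.h>

int main(void)
{
	int hz = 100;			/* assumed timer frequency */
	int page_size = 4096;		/* assumed page size */
	int size = page_size - 6 * 32;	/* SIZE as defined above: 3904 bytes */
	int max = 262;			/* hypothetical best calls-per-jiffy */
	int speed = max * (hz * size / 1024);	/* KB/sec, same integer math */

	/* same format the kernel printk uses: prints "99.822 MB/sec" here */
	printf(" %-10s: %5d.%03d MB/sec\n", "example", speed / 1000, speed % 1000);
	return 0;
}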
+ * (we use a 2-page allocation to have guaranteed + * color L1-cache layout) + */ + max = 0; + for (i = 0; i < 5; i++) { + now = jiffies; + count = 0; + while (jiffies == now) { + mb(); + func->xor_block(2,bh_ptr); + mb(); + count++; + mb(); + } + if (count > max) + max = count; + } + + speed = max * (HZ*SIZE/1024); + func->speed = speed; + + printk( " %-10s: %5d.%03d MB/sec\n", func->name, + speed / 1000, speed % 1000); +} + +static inline void pick_fastest_function(void) +{ + struct xor_block_template *f, *fastest; + + fastest = xor_functions; + for (f = fastest; f; f = f->next) { + if (f->speed > fastest->speed) + fastest = f; + } +#ifdef CONFIG_X86_XMM + if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) { + fastest = &t_xor_block_pIII_kni; + } +#endif + xor_block = fastest->xor_block; + printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name, + fastest->speed / 1000, fastest->speed % 1000); +} + + +void calibrate_xor_block(void) +{ + struct buffer_head b1, b2; + + memset(&b1,0,sizeof(b1)); + b2 = b1; + + b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2); + if (!b1.b_data) { + pick_fastest_function(); + return; + } + b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE; + + b1.b_size = SIZE; + + printk(KERN_INFO "raid5: measuring checksumming speed\n"); + + sti(); /* should be safe */ + +#if defined(__sparc__) && !defined(__sparc_v9__) + printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n"); + xor_speed(&t_xor_block_SPARC,&b1,&b2); +#endif + +#ifdef CONFIG_X86_XMM + if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) { + printk(KERN_INFO + "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n"); + /* we force the use of the KNI xor block because it + can write around l2. we may also be able + to load into the l1 only depending on how + the cpu deals with a load to a line that is + being prefetched. 
+ */ + xor_speed(&t_xor_block_pIII_kni,&b1,&b2); + } +#endif /* CONFIG_X86_XMM */ + +#ifdef __i386__ + + if (md_cpu_has_mmx()) { + printk(KERN_INFO + "raid5: MMX detected, trying high-speed MMX checksum routines\n"); + xor_speed(&t_xor_block_pII_mmx,&b1,&b2); + xor_speed(&t_xor_block_p5_mmx,&b1,&b2); + } + +#endif /* __i386__ */ + + + xor_speed(&t_xor_block_8regs,&b1,&b2); + xor_speed(&t_xor_block_32regs,&b1,&b2); + + free_pages((unsigned long)b1.b_data,2); + pick_fastest_function(); +} + +#else /* __sparc_v9__ */ + +void calibrate_xor_block(void) +{ + printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n"); + xor_block = xor_block_VIS; +} + +#endif /* __sparc_v9__ */ + +MD_EXPORT_SYMBOL(xor_block); + diff -ruN linux.orig/include/asm-alpha/md.h linux-2.2.16/include/asm-alpha/md.h --- linux.orig/include/asm-alpha/md.h Fri May 8 09:17:13 1998 +++ linux-2.2.16/include/asm-alpha/md.h Thu Jan 1 01:00:00 1970 @@ -1,13 +0,0 @@ -/* $Id$ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ diff -ruN linux.orig/include/asm-i386/md.h linux-2.2.16/include/asm-i386/md.h --- linux.orig/include/asm-i386/md.h Fri May 8 09:17:13 1998 +++ linux-2.2.16/include/asm-i386/md.h Thu Jan 1 01:00:00 1970 @@ -1,13 +0,0 @@ -/* $Id$ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ diff -ruN linux.orig/include/asm-m68k/md.h linux-2.2.16/include/asm-m68k/md.h --- linux.orig/include/asm-m68k/md.h Fri May 8 09:15:22 1998 +++ linux-2.2.16/include/asm-m68k/md.h Thu Jan 1 01:00:00 1970 @@ -1,13 +0,0 @@ -/* $Id$ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ diff -ruN linux.orig/include/asm-ppc/md.h linux-2.2.16/include/asm-ppc/md.h --- linux.orig/include/asm-ppc/md.h Wed Oct 27 02:53:42 1999 +++ linux-2.2.16/include/asm-ppc/md.h Thu Jan 1 01:00:00 1970 @@ -1,13 +0,0 @@ -/* $Id$ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ diff -ruN linux.orig/include/asm-sparc/md.h linux-2.2.16/include/asm-sparc/md.h --- linux.orig/include/asm-sparc/md.h Tue Jan 13 00:15:54 1998 +++ linux-2.2.16/include/asm-sparc/md.h Thu Jan 1 01:00:00 1970 @@ -1,13 +0,0 @@ -/* $Id$ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ diff -ruN linux.orig/include/asm-sparc64/md.h linux-2.2.16/include/asm-sparc64/md.h --- linux.orig/include/asm-sparc64/md.h Tue Jan 13 00:15:58 1998 +++ linux-2.2.16/include/asm-sparc64/md.h Thu Jan 1 01:00:00 1970 @@ -1,91 +0,0 @@ -/* $Id$ - * md.h: High speed xor_block operation for RAID4/5 - * utilizing the UltraSparc Visual Instruction Set. 
- * - * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -#include -#include - -#define HAVE_ARCH_XORBLOCK - -#define MD_XORBLOCK_ALIGNMENT 64 - -/* void __xor_block (char *dest, char *src, long len) - * { - * while (len--) *dest++ ^= *src++; - * } - * - * Requirements: - * !(((long)dest | (long)src) & (MD_XORBLOCK_ALIGNMENT - 1)) && - * !(len & 127) && len >= 256 - */ - -static inline void __xor_block (char *dest, char *src, long len) -{ - __asm__ __volatile__ (" - wr %%g0, %3, %%fprs - wr %%g0, %4, %%asi - membar #LoadStore|#StoreLoad|#StoreStore - sub %2, 128, %2 - ldda [%0] %4, %%f0 - ldda [%1] %4, %%f16 -1: ldda [%0 + 64] %%asi, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - stda %%f16, [%0] %4 - ldda [%1 + 64] %%asi, %%f48 - ldda [%0 + 128] %%asi, %%f0 - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - add %0, 128, %0 - fxor %%f36, %%f52, %%f52 - add %1, 128, %1 - fxor %%f38, %%f54, %%f54 - subcc %2, 128, %2 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%0 - 64] %%asi - bne,pt %%xcc, 1b - ldda [%1] %4, %%f16 - ldda [%0 + 64] %%asi, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - stda %%f16, [%0] %4 - ldda [%1 + 64] %%asi, %%f48 - membar #Sync - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - fxor %%f36, %%f52, %%f52 - fxor %%f38, %%f54, %%f54 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%0 + 64] %%asi - membar #Sync|#StoreStore|#StoreLoad - wr %%g0, 0, %%fprs - " : : - "r" (dest), "r" (src), "r" (len), "i" (FPRS_FEF), "i" (ASI_BLK_P) : - "cc", "memory"); -} - -#endif /* __ASM_MD_H */ Binary files linux.orig/include/linux/.sysctl.h.rej.swp and linux-2.2.16/include/linux/.sysctl.h.rej.swp differ diff -ruN linux.orig/include/linux/blkdev.h linux-2.2.16/include/linux/blkdev.h --- linux.orig/include/linux/blkdev.h Wed Jun 7 23:26:44 2000 +++ linux-2.2.16/include/linux/blkdev.h Fri Jun 9 11:37:44 2000 @@ -93,8 +93,9 @@ extern void make_request(int major,int rw, struct buffer_head * bh); /* md needs this function to remap requests */ -extern int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size); -extern int md_make_request (int minor, int rw, struct buffer_head * bh); +extern int md_map (kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long size); +extern int md_make_request (struct buffer_head * bh, int rw); extern int md_error (kdev_t mddev, kdev_t rdev); extern int * blk_size[MAX_BLKDEV]; diff -ruN linux.orig/include/linux/fs.h linux-2.2.16/include/linux/fs.h --- linux.orig/include/linux/fs.h Wed Jun 7 23:26:44 2000 +++ linux-2.2.16/include/linux/fs.h Fri Jun 9 11:37:44 2000 @@ -185,6 +185,7 @@ #define BH_Lock 2 /* 1 if the buffer is locked */ #define BH_Req 3 /* 0 if the buffer has been invalidated */ #define BH_Protected 6 /* 1 if the buffer is protected */ +#define BH_LowPrio 7 /* 1 if the buffer is lowprio */ /* * Try to keep the most commonly used fields in single cache lines (16 @@ -755,6 +756,7 @@ extern void refile_buffer(struct buffer_head * buf); extern void set_writetime(struct 
buffer_head * buf, int flag); extern int try_to_free_buffers(struct page *); +extern void cache_drop_behind(struct buffer_head *bh); extern int nr_buffers; extern long buffermem; @@ -775,6 +777,25 @@ } } +extern inline void mark_buffer_highprio(struct buffer_head * bh) +{ + clear_bit(BH_LowPrio, &bh->b_state); +} + +extern inline void mark_buffer_lowprio(struct buffer_head * bh) +{ + /* + * dirty buffers cannot be marked lowprio. + */ + if (!buffer_dirty(bh)) + set_bit(BH_LowPrio, &bh->b_state); +} + +static inline int buffer_lowprio(struct buffer_head * bh) +{ + return test_bit(BH_LowPrio, &bh->b_state); +} + extern inline void mark_buffer_dirty(struct buffer_head * bh, int flag) { if (!test_and_set_bit(BH_Dirty, &bh->b_state)) { @@ -782,6 +803,23 @@ if (bh->b_list != BUF_DIRTY) refile_buffer(bh); } + /* + * if a buffer gets marked dirty then it has to lose + * it's lowprio state. + */ + mark_buffer_highprio(bh); +} + +extern inline void mark_buffer_dirty_lowprio(struct buffer_head * bh) +{ + if (!test_and_set_bit(BH_Dirty, &bh->b_state)) { + if (bh->b_list != BUF_DIRTY) + refile_buffer(bh); + /* + * Mark it lowprio only if it was not dirty before! + */ + set_bit(BH_LowPrio, &bh->b_state); + } } extern int check_disk_change(kdev_t dev); @@ -855,6 +893,7 @@ extern struct buffer_head * find_buffer(kdev_t dev, int block, int size); extern void ll_rw_block(int, int, struct buffer_head * bh[]); extern int is_read_only(kdev_t); +extern int is_device_idle(kdev_t); extern void __brelse(struct buffer_head *); extern inline void brelse(struct buffer_head *buf) { @@ -870,8 +909,12 @@ extern void set_blocksize(kdev_t dev, int size); extern unsigned int get_hardblocksize(kdev_t dev); extern struct buffer_head * bread(kdev_t dev, int block, int size); +extern struct buffer_head * buffer_ready (kdev_t dev, int block, int size); +extern void bread_ahead (kdev_t dev, int block, int size); extern struct buffer_head * breada(kdev_t dev,int block, int size, unsigned int pos, unsigned int filesize); +extern struct buffer_head * breada_blocks(kdev_t dev,int block, + int size, int blocks); extern int brw_page(int, struct page *, kdev_t, int [], int, int); diff -ruN linux.orig/include/linux/md.h linux-2.2.16/include/linux/md.h --- linux.orig/include/linux/md.h Fri May 8 09:17:13 1998 +++ linux-2.2.16/include/linux/md.h Thu Jan 1 01:00:00 1970 @@ -1,300 +0,0 @@ -/* - md.h : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - or - - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef _MD_H -#define _MD_H - -#include -#include -#include - -/* - * Different major versions are not compatible. - * Different minor versions are only downward compatible. - * Different patchlevel versions are downward and upward compatible. 
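The BH_LowPrio bit added to fs.h above follows two rules that the inline helpers in this hunk encode: a buffer can only be marked low-priority while it is clean, and dirtying it normally promotes it back to high priority (unless it is dirtied through mark_buffer_dirty_lowprio). A small illustrative caller follows; that the flag is meant to tag background I/O such as RAID resync traffic is an inference, not something the hunk states:

/*
 * Illustrative consumer of the fs.h helpers above (kernel context).
 * tag_background_io() is a hypothetical function, not part of the patch.
 */
static void tag_background_io(struct buffer_head *bh)
{
	mark_buffer_lowprio(bh);	/* silently refused if bh is already dirty */

	if (buffer_lowprio(bh)) {
		/* lower layers may treat this request as deferrable */
	}

	mark_buffer_dirty(bh, 0);	/* ordinary dirtying clears BH_LowPrio again */
}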
- */ -#define MD_MAJOR_VERSION 0 -#define MD_MINOR_VERSION 36 -#define MD_PATCHLEVEL_VERSION 6 - -#define MD_DEFAULT_DISK_READAHEAD (256 * 1024) - -/* ioctls */ -#define REGISTER_DEV _IO (MD_MAJOR, 1) -#define START_MD _IO (MD_MAJOR, 2) -#define STOP_MD _IO (MD_MAJOR, 3) -#define REGISTER_DEV_NEW _IO (MD_MAJOR, 4) - -/* - personalities : - Byte 0 : Chunk size factor - Byte 1 : Fault tolerance count for each physical device - ( 0 means no fault tolerance, - 0xFF means always tolerate faults), not used by now. - Byte 2 : Personality - Byte 3 : Reserved. - */ - -#define FAULT_SHIFT 8 -#define PERSONALITY_SHIFT 16 - -#define FACTOR_MASK 0x000000FFUL -#define FAULT_MASK 0x0000FF00UL -#define PERSONALITY_MASK 0x00FF0000UL - -#define MD_RESERVED 0 /* Not used by now */ -#define LINEAR (1UL << PERSONALITY_SHIFT) -#define STRIPED (2UL << PERSONALITY_SHIFT) -#define RAID0 STRIPED -#define RAID1 (3UL << PERSONALITY_SHIFT) -#define RAID5 (4UL << PERSONALITY_SHIFT) -#define MAX_PERSONALITY 5 - -/* - * MD superblock. - * - * The MD superblock maintains some statistics on each MD configuration. - * Each real device in the MD set contains it near the end of the device. - * Some of the ideas are copied from the ext2fs implementation. - * - * We currently use 4096 bytes as follows: - * - * word offset function - * - * 0 - 31 Constant generic MD device information. - * 32 - 63 Generic state information. - * 64 - 127 Personality specific information. - * 128 - 511 12 32-words descriptors of the disks in the raid set. - * 512 - 911 Reserved. - * 912 - 1023 Disk specific descriptor. - */ - -/* - * If x is the real device size in bytes, we return an apparent size of: - * - * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES - * - * and place the 4kB superblock at offset y. 
- */ -#define MD_RESERVED_BYTES (64 * 1024) -#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) -#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) - -#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) -#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) - -#define MD_SB_BYTES 4096 -#define MD_SB_WORDS (MD_SB_BYTES / 4) -#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) -#define MD_SB_SECTORS (MD_SB_BYTES / 512) - -/* - * The following are counted in 32-bit words - */ -#define MD_SB_GENERIC_OFFSET 0 -#define MD_SB_PERSONALITY_OFFSET 64 -#define MD_SB_DISKS_OFFSET 128 -#define MD_SB_DESCRIPTOR_OFFSET 992 - -#define MD_SB_GENERIC_CONSTANT_WORDS 32 -#define MD_SB_GENERIC_STATE_WORDS 32 -#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) -#define MD_SB_PERSONALITY_WORDS 64 -#define MD_SB_DISKS_WORDS 384 -#define MD_SB_DESCRIPTOR_WORDS 32 -#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) -#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) -#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS) - -/* - * Device "operational" state bits - */ -#define MD_FAULTY_DEVICE 0 /* Device is faulty / operational */ -#define MD_ACTIVE_DEVICE 1 /* Device is a part or the raid set / spare disk */ -#define MD_SYNC_DEVICE 2 /* Device is in sync with the raid set */ - -typedef struct md_device_descriptor_s { - __u32 number; /* 0 Device number in the entire set */ - __u32 major; /* 1 Device major number */ - __u32 minor; /* 2 Device minor number */ - __u32 raid_disk; /* 3 The role of the device in the raid set */ - __u32 state; /* 4 Operational state */ - __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; -} md_descriptor_t; - -#define MD_SB_MAGIC 0xa92b4efc - -/* - * Superblock state bits - */ -#define MD_SB_CLEAN 0 -#define MD_SB_ERRORS 1 - -typedef struct md_superblock_s { - - /* - * Constant generic information - */ - __u32 md_magic; /* 0 MD identifier */ - __u32 major_version; /* 1 major version to which the set conforms */ - __u32 minor_version; /* 2 minor version to which the set conforms */ - __u32 patch_version; /* 3 patchlevel version to which the set conforms */ - __u32 gvalid_words; /* 4 Number of non-reserved words in this section */ - __u32 set_magic; /* 5 Raid set identifier */ - __u32 ctime; /* 6 Creation time */ - __u32 level; /* 7 Raid personality (mirroring, raid5, ...) */ - __u32 size; /* 8 Apparent size of each individual disk, in kB */ - __u32 nr_disks; /* 9 Number of total disks in the raid set */ - __u32 raid_disks; /* 10 Number of disks in a fully functional raid set */ - __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 11]; - - /* - * Generic state information - */ - __u32 utime; /* 0 Superblock update time */ - __u32 state; /* 1 State bits (clean, ...) 
*/ - __u32 active_disks; /* 2 Number of currently active disks (some non-faulty disks might not be in sync) */ - __u32 working_disks; /* 3 Number of working disks */ - __u32 failed_disks; /* 4 Number of failed disks */ - __u32 spare_disks; /* 5 Number of spare disks */ - __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 6]; - - /* - * Personality information - */ - __u32 parity_algorithm; - __u32 chunk_size; - __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 2]; - - /* - * Disks information - */ - md_descriptor_t disks[MD_SB_DISKS]; - - /* - * Reserved - */ - __u32 reserved[MD_SB_RESERVED_WORDS]; - - /* - * Active descriptor - */ - md_descriptor_t descriptor; -} md_superblock_t; - -#ifdef __KERNEL__ - -#include -#include -#include -#include - -/* - * Kernel-based reconstruction is mostly working, but still requires - * some additional work. - */ -#define SUPPORT_RECONSTRUCTION 0 - -#define MAX_REAL 8 /* Max number of physical dev per md dev */ -#define MAX_MD_DEV 4 /* Max number of md dev */ - -#define FACTOR(a) ((a)->repartition & FACTOR_MASK) -#define MAX_FAULT(a) (((a)->repartition & FAULT_MASK)>>8) -#define PERSONALITY(a) ((a)->repartition & PERSONALITY_MASK) - -#define FACTOR_SHIFT(a) (PAGE_SHIFT + (a) - 10) - -struct real_dev -{ - kdev_t dev; /* Device number */ - int size; /* Device size (in blocks) */ - int offset; /* Real device offset (in blocks) in md dev - (only used in linear mode) */ - struct inode *inode; /* Lock inode */ - md_superblock_t *sb; - u32 sb_offset; -}; - -struct md_dev; - -#define SPARE_INACTIVE 0 -#define SPARE_WRITE 1 -#define SPARE_ACTIVE 2 - -struct md_personality -{ - char *name; - int (*map)(struct md_dev *mddev, kdev_t *rdev, - unsigned long *rsector, unsigned long size); - int (*make_request)(struct md_dev *mddev, int rw, struct buffer_head * bh); - void (*end_request)(struct buffer_head * bh, int uptodate); - int (*run)(int minor, struct md_dev *mddev); - int (*stop)(int minor, struct md_dev *mddev); - int (*status)(char *page, int minor, struct md_dev *mddev); - int (*ioctl)(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg); - int max_invalid_dev; - int (*error_handler)(struct md_dev *mddev, kdev_t dev); - -/* - * Some personalities (RAID-1, RAID-5) can get disks hot-added and - * hot-removed. Hot removal is different from failure. 
(failure marks - * a disk inactive, but the disk is still part of the array) - */ - int (*hot_add_disk) (struct md_dev *mddev, kdev_t dev); - int (*hot_remove_disk) (struct md_dev *mddev, kdev_t dev); - int (*mark_spare) (struct md_dev *mddev, md_descriptor_t *descriptor, int state); -}; - -struct md_dev -{ - struct real_dev devices[MAX_REAL]; - struct md_personality *pers; - md_superblock_t *sb; - int sb_dirty; - int repartition; - int busy; - int nb_dev; - void *private; -}; - -struct md_thread { - void (*run) (void *data); - void *data; - struct wait_queue *wqueue; - unsigned long flags; - struct semaphore *sem; - struct task_struct *tsk; -}; - -#define THREAD_WAKEUP 0 - -extern struct md_dev md_dev[MAX_MD_DEV]; -extern int md_size[MAX_MD_DEV]; -extern int md_maxreadahead[MAX_MD_DEV]; - -extern char *partition_name (kdev_t dev); - -extern int register_md_personality (int p_num, struct md_personality *p); -extern int unregister_md_personality (int p_num); -extern struct md_thread *md_register_thread (void (*run) (void *data), void *data); -extern void md_unregister_thread (struct md_thread *thread); -extern void md_wakeup_thread(struct md_thread *thread); -extern int md_update_sb (int minor); -extern int md_do_sync(struct md_dev *mddev); - -#endif __KERNEL__ -#endif _MD_H diff -ruN linux.orig/include/linux/raid/hsm.h linux-2.2.16/include/linux/raid/hsm.h --- linux.orig/include/linux/raid/hsm.h Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/include/linux/raid/hsm.h Fri Jun 9 11:37:44 2000 @@ -0,0 +1,65 @@ +#ifndef _HSM_H +#define _HSM_H + +#include + +#if __alpha__ +#error fix cpu_addr on Alpha first +#endif + +#include + +#define index_pv(lv,index) ((lv)->vg->pv_array+(index)->data.phys_nr) +#define index_dev(lv,index) index_pv((lv),(index))->dev +#define index_block(lv,index) (index)->data.phys_block +#define index_child(index) ((lv_lptr_t *)((index)->cpu_addr)) + +#define ptr_to_cpuaddr(ptr) ((__u32) (ptr)) + + +typedef struct pv_bg_desc_s { + unsigned int free_blocks; + pv_block_group_t *bg; +} pv_bg_desc_t; + +typedef struct pv_s pv_t; +typedef struct vg_s vg_t; +typedef struct lv_s lv_t; + +struct pv_s +{ + int phys_nr; + kdev_t dev; + pv_sb_t *pv_sb; + pv_bg_desc_t *bg_array; +}; + +struct lv_s +{ + int log_id; + vg_t *vg; + + unsigned int max_indices; + unsigned int free_indices; + lv_lptr_t root_index; + + kdev_t dev; +}; + +struct vg_s +{ + int nr_pv; + pv_t pv_array [MD_SB_DISKS]; + + int nr_lv; + lv_t lv_array [HSM_MAX_LVS_PER_VG]; + + vg_sb_t *vg_sb; + mddev_t *mddev; +}; + +#define kdev_to_lv(dev) ((lv_t *) mddev_map[MINOR(dev)].data) +#define mddev_to_vg(mddev) ((vg_t *) mddev->private) + +#endif + diff -ruN linux.orig/include/linux/raid/hsm_p.h linux-2.2.16/include/linux/raid/hsm_p.h --- linux.orig/include/linux/raid/hsm_p.h Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/include/linux/raid/hsm_p.h Fri Jun 9 11:37:44 2000 @@ -0,0 +1,237 @@ +#ifndef _HSM_P_H +#define _HSM_P_H + +#define HSM_BLOCKSIZE 4096 +#define HSM_BLOCKSIZE_WORDS (HSM_BLOCKSIZE/4) +#define PACKED __attribute__ ((packed)) + +/* + * Identifies a block in physical space + */ +typedef struct phys_idx_s { + __u16 phys_nr; + __u32 phys_block; + +} PACKED phys_idx_t; + +/* + * Identifies a block in logical space + */ +typedef struct log_idx_s { + __u16 log_id; + __u32 log_index; + +} PACKED log_idx_t; + +/* + * Describes one PV + */ +#define HSM_PV_SB_MAGIC 0xf091ae9fU + +#define HSM_PV_SB_GENERIC_WORDS 32 +#define HSM_PV_SB_RESERVED_WORDS \ + (HSM_BLOCKSIZE_WORDS - HSM_PV_SB_GENERIC_WORDS) + +/* + * On-disk PV 
identification data, on block 0 in any PV. + */ +typedef struct pv_sb_s +{ + __u32 pv_magic; /* 0 */ + + __u32 pv_uuid0; /* 1 */ + __u32 pv_uuid1; /* 2 */ + __u32 pv_uuid2; /* 3 */ + __u32 pv_uuid3; /* 4 */ + + __u32 pv_major; /* 5 */ + __u32 pv_minor; /* 6 */ + __u32 pv_patch; /* 7 */ + + __u32 pv_ctime; /* 8 Creation time */ + + __u32 pv_total_size; /* 9 size of this PV, in blocks */ + __u32 pv_first_free; /* 10 first free block */ + __u32 pv_first_used; /* 11 first used block */ + __u32 pv_blocks_left; /* 12 unallocated blocks */ + __u32 pv_bg_size; /* 13 size of a block group, in blocks */ + __u32 pv_block_size; /* 14 size of blocks, in bytes */ + __u32 pv_pptr_size; /* 15 size of block descriptor, in bytes */ + __u32 pv_block_groups; /* 16 number of block groups */ + + __u32 __reserved1[HSM_PV_SB_GENERIC_WORDS - 17]; + + /* + * Reserved + */ + __u32 __reserved2[HSM_PV_SB_RESERVED_WORDS]; + +} PACKED pv_sb_t; + +/* + * this is pretty much arbitrary, but has to be less than ~64 + */ +#define HSM_MAX_LVS_PER_VG 32 + +#define HSM_VG_SB_GENERIC_WORDS 32 + +#define LV_DESCRIPTOR_WORDS 8 +#define HSM_VG_SB_RESERVED_WORDS (HSM_BLOCKSIZE_WORDS - \ + LV_DESCRIPTOR_WORDS*HSM_MAX_LVS_PER_VG - HSM_VG_SB_GENERIC_WORDS) + +#if (HSM_PV_SB_RESERVED_WORDS < 0) +#error you messed this one up dude ... +#endif + +typedef struct lv_descriptor_s +{ + __u32 lv_id; /* 0 */ + phys_idx_t lv_root_idx; /* 1 */ + __u16 __reserved; /* 2 */ + __u32 lv_max_indices; /* 3 */ + __u32 lv_free_indices; /* 4 */ + __u32 md_id; /* 5 */ + + __u32 reserved[LV_DESCRIPTOR_WORDS - 6]; + +} PACKED lv_descriptor_t; + +#define HSM_VG_SB_MAGIC 0x98320d7aU +/* + * On-disk VG identification data, in block 1 on all PVs + */ +typedef struct vg_sb_s +{ + __u32 vg_magic; /* 0 */ + __u32 nr_lvs; /* 1 */ + + __u32 __reserved1[HSM_VG_SB_GENERIC_WORDS - 2]; + + lv_descriptor_t lv_array [HSM_MAX_LVS_PER_VG]; + /* + * Reserved + */ + __u32 __reserved2[HSM_VG_SB_RESERVED_WORDS]; + +} PACKED vg_sb_t; + +/* + * Describes one LV + */ + +#define HSM_LV_SB_MAGIC 0xe182bd8aU + +/* do we need lv_sb_t? */ + +typedef struct lv_sb_s +{ + /* + * On-disk LV identifier + */ + __u32 lv_magic; /* 0 LV identifier */ + __u32 lv_uuid0; /* 1 */ + __u32 lv_uuid1; /* 2 */ + __u32 lv_uuid2; /* 3 */ + __u32 lv_uuid3; /* 4 */ + + __u32 lv_major; /* 5 PV identifier */ + __u32 lv_minor; /* 6 PV identifier */ + __u32 lv_patch; /* 7 PV identifier */ + + __u32 ctime; /* 8 Creation time */ + __u32 size; /* 9 size of this LV, in blocks */ + phys_idx_t start; /* 10 position of root index block */ + log_idx_t first_free; /* 11-12 first free index */ + + /* + * Reserved + */ + __u32 reserved[HSM_BLOCKSIZE_WORDS-13]; + +} PACKED lv_sb_t; + +/* + * Pointer pointing from the physical space, points to + * the LV owning this block. It also contains various + * statistics about the physical block. + */ +typedef struct pv_pptr_s +{ + union { + /* case 1 */ + struct { + log_idx_t owner; + log_idx_t predicted; + __u32 last_referenced; + } used; + /* case 2 */ + struct { + __u16 log_id; + __u16 __unused1; + __u32 next_free; + __u32 __unused2; + __u32 __unused3; + } free; + } u; +} PACKED pv_pptr_t; + +static __inline__ int pv_pptr_free (const pv_pptr_t * pptr) +{ + return !pptr->u.free.log_id; +} + + +#define DATA_BLOCKS_PER_BG ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1)) + +#define TOTAL_BLOCKS_PER_BG (DATA_BLOCKS_PER_BG+1) +/* + * A table of pointers filling up a single block, managing + * the next DATA_BLOCKS_PER_BG physical blocks. 
Such block + * groups form the physical space of blocks. + */ +typedef struct pv_block_group_s +{ + __u8 used_bitmap[(DATA_BLOCKS_PER_BG+7)/8]; + + pv_pptr_t blocks[DATA_BLOCKS_PER_BG]; + +} PACKED pv_block_group_t; + +/* + * Pointer from the logical space, points to + * the (PV,block) containing this logical block + */ +typedef struct lv_lptr_s +{ + phys_idx_t data; + __u16 __reserved; + __u32 cpu_addr; + __u32 __reserved2; + +} PACKED lv_lptr_t; + +static __inline__ int index_free (const lv_lptr_t * index) +{ + return !index->data.phys_block; +} + +static __inline__ int index_present (const lv_lptr_t * index) +{ + return index->cpu_addr; +} + + +#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t)) +/* + * A table of pointers filling up a single block, managing + * HSM_LPTRS_PER_BLOCK logical blocks. Such block groups form + * the logical space of blocks. + */ +typedef struct lv_index_block_s +{ + lv_lptr_t blocks[HSM_LPTRS_PER_BLOCK]; + +} PACKED lv_index_block_t; + +#endif + diff -ruN linux.orig/include/linux/raid/linear.h linux-2.2.16/include/linux/raid/linear.h --- linux.orig/include/linux/raid/linear.h Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/include/linux/raid/linear.h Fri Jun 9 11:37:44 2000 @@ -0,0 +1,32 @@ +#ifndef _LINEAR_H +#define _LINEAR_H + +#include + +struct dev_info { + kdev_t dev; + int size; + unsigned int offset; +}; + +typedef struct dev_info dev_info_t; + +struct linear_hash +{ + dev_info_t *dev0, *dev1; +}; + +struct linear_private_data +{ + struct linear_hash *hash_table; + dev_info_t disks[MD_SB_DISKS]; + dev_info_t *smallest; + int nr_zones; +}; + + +typedef struct linear_private_data linear_conf_t; + +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) + +#endif diff -ruN linux.orig/include/linux/raid/md.h linux-2.2.16/include/linux/raid/md.h --- linux.orig/include/linux/raid/md.h Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/include/linux/raid/md.h Fri Jun 9 11:37:44 2000 @@ -0,0 +1,96 @@ +/* + md.h : Multiple Devices driver for Linux + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + Copyright (C) 1994-96 Marc ZYNGIER + or + + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_H +#define _MD_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +/* + * 'md_p.h' holds the 'physical' layout of RAID devices + * 'md_u.h' holds the user <=> kernel API + * + * 'md_k.h' holds kernel internal definitions + */ + +#include +#include +#include + +/* + * Different major versions are not compatible. + * Different minor versions are only downward compatible. + * Different patchlevel versions are downward and upward compatible. 
+ */ +#define MD_MAJOR_VERSION 0 +#define MD_MINOR_VERSION 90 +#define MD_PATCHLEVEL_VERSION 0 + +extern int md_size[MAX_MD_DEVS]; +extern struct hd_struct md_hd_struct[MAX_MD_DEVS]; + +extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data); +extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev); +extern char * partition_name (kdev_t dev); +extern int register_md_personality (int p_num, mdk_personality_t *p); +extern int unregister_md_personality (int p_num); +extern mdk_thread_t * md_register_thread (void (*run) (void *data), + void *data, const char *name); +extern void md_unregister_thread (mdk_thread_t *thread); +extern void md_wakeup_thread(mdk_thread_t *thread); +extern void md_interrupt_thread (mdk_thread_t *thread); +extern int md_update_sb (mddev_t *mddev); +extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare); +extern void md_recover_arrays (void); +extern int md_check_ordering (mddev_t *mddev); +extern void autodetect_raid(void); +extern struct gendisk * find_gendisk (kdev_t dev); +extern int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x); +#if CONFIG_BLK_DEV_MD +extern void raid_setup(char *str,int *ints) md__init; +#endif +#ifdef CONFIG_MD_BOOT +extern void md_setup(char *str,int *ints) md__init; +#endif + +extern void md_print_devices (void); + +#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } + +#endif _MD_H + diff -ruN linux.orig/include/linux/raid/md_compatible.h linux-2.2.16/include/linux/raid/md_compatible.h --- linux.orig/include/linux/raid/md_compatible.h Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/include/linux/raid/md_compatible.h Fri Jun 9 11:37:44 2000 @@ -0,0 +1,387 @@ + +/* + md.h : Multiple Devices driver compatibility layer for Linux 2.0/2.2 + Copyright (C) 1998 Ingo Molnar + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
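The compatibility header below wraps every kernel facility the core driver touches behind an md_-prefixed name, with hand-rolled fallbacks for 2.0-era kernels and straight aliases for 2.1+. Driver code is then written once against the wrappers; an illustrative consumer (not code from this patch) looks like:

/*
 * Hypothetical caller using only md_compatible.h names defined below.
 * On 2.0 kernels the lock degrades to save_flags()/cli(); on 2.2 it is
 * a real spinlock -- the caller does not care which.
 */
static MD_LIST_HEAD(pending_list);
static md_spinlock_t pending_lock = MD_SPIN_LOCK_UNLOCKED;

static void queue_item(struct md_list_head *item)
{
	unsigned long flags;

	md_spin_lock_irqsave(&pending_lock, flags);
	md_list_add(item, &pending_list);
	md_spin_unlock_irqrestore(&pending_lock, flags);
}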
+*/ + +#include + +#ifndef _MD_COMPATIBLE_H +#define _MD_COMPATIBLE_H + +#define LinuxVersionCode(v, p, s) (((v)<<16)+((p)<<8)+(s)) + +#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0) + +/* 000 */ +#define md__get_free_pages(x,y) __get_free_pages(x,y,GFP_KERNEL) + +#ifdef __i386__ +/* 001 */ +extern __inline__ int md_cpu_has_mmx(void) +{ + return x86_capability & 0x00800000; +} +#endif + +/* 002 */ +#define md_clear_page(page) memset((void *)(page), 0, PAGE_SIZE) + +/* 003 */ +/* + * someone please suggest a sane compatibility layer for modules + */ +#define MD_EXPORT_SYMBOL(x) + +/* 004 */ +static inline unsigned long +md_copy_from_user(void *to, const void *from, unsigned long n) +{ + int err; + + err = verify_area(VERIFY_READ,from,n); + if (!err) + memcpy_fromfs(to, from, n); + return err; +} + +/* 005 */ +extern inline unsigned long +md_copy_to_user(void *to, const void *from, unsigned long n) +{ + int err; + + err = verify_area(VERIFY_WRITE,to,n); + if (!err) + memcpy_tofs(to, from, n); + return err; +} + +/* 006 */ +#define md_put_user(x,ptr) \ +({ \ + int __err; \ + \ + __err = verify_area(VERIFY_WRITE,ptr,sizeof(*ptr)); \ + if (!__err) \ + put_user(x,ptr); \ + __err; \ +}) + +/* 007 */ +extern inline int md_capable_admin(void) +{ + return suser(); +} + +/* 008 */ +#define MD_FILE_TO_INODE(file) ((file)->f_inode) + +/* 009 */ +extern inline void md_flush_signals (void) +{ + current->signal = 0; +} + +/* 010 */ +#define __S(nr) (1<<((nr)-1)) +extern inline void md_init_signals (void) +{ + current->exit_signal = SIGCHLD; + current->blocked = ~(__S(SIGKILL)); +} +#undef __S + +/* 011 */ +extern inline unsigned long md_signal_pending (struct task_struct * tsk) +{ + return (tsk->signal & ~tsk->blocked); +} + +/* 012 */ +#define md_set_global_readahead(x) read_ahead[MD_MAJOR] = MD_READAHEAD + +/* 013 */ +#define md_mdelay(n) (\ + {unsigned long msec=(n); while (msec--) udelay(1000);}) + +/* 014 */ +#define MD_SYS_DOWN 0 +#define MD_SYS_HALT 0 +#define MD_SYS_POWER_OFF 0 + +/* 015 */ +#define md_register_reboot_notifier(x) + +/* 016 */ +extern __inline__ unsigned long +md_test_and_set_bit(int nr, void * addr) +{ + unsigned long flags; + unsigned long oldbit; + + save_flags(flags); + cli(); + oldbit = test_bit(nr,addr); + set_bit(nr,addr); + restore_flags(flags); + return oldbit; +} + +/* 017 */ +extern __inline__ unsigned long +md_test_and_clear_bit(int nr, void * addr) +{ + unsigned long flags; + unsigned long oldbit; + + save_flags(flags); + cli(); + oldbit = test_bit(nr,addr); + clear_bit(nr,addr); + restore_flags(flags); + return oldbit; +} + +/* 018 */ +#define md_atomic_read(x) (*(volatile int *)(x)) +#define md_atomic_set(x,y) (*(volatile int *)(x) = (y)) + +/* 019 */ +extern __inline__ void md_lock_kernel (void) +{ +#if __SMP__ + lock_kernel(); + syscall_count++; +#endif +} + +extern __inline__ void md_unlock_kernel (void) +{ +#if __SMP__ + syscall_count--; + unlock_kernel(); +#endif +} +/* 020 */ + +#define md__init +#define md__initdata +#define md__initfunc(__arginit) __arginit + +/* 021 */ + +/* 022 */ + +struct md_list_head { + struct md_list_head *next, *prev; +}; + +#define MD_LIST_HEAD(name) \ + struct md_list_head name = { &name, &name } + +#define MD_INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +static __inline__ void md__list_add(struct md_list_head * new, + struct md_list_head * prev, + struct md_list_head * next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static __inline__ void 
md_list_add(struct md_list_head *new, + struct md_list_head *head) +{ + md__list_add(new, head, head->next); +} + +static __inline__ void md__list_del(struct md_list_head * prev, + struct md_list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +static __inline__ void md_list_del(struct md_list_head *entry) +{ + md__list_del(entry->prev, entry->next); +} + +static __inline__ int md_list_empty(struct md_list_head *head) +{ + return head->next == head; +} + +#define md_list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +/* 023 */ + +static __inline__ signed long md_schedule_timeout(signed long timeout) +{ + current->timeout = jiffies + timeout; + schedule(); + return 0; +} + +/* 024 */ +#define md_need_resched(tsk) (need_resched) + +/* 025 */ +typedef struct { int gcc_is_buggy; } md_spinlock_t; +#define MD_SPIN_LOCK_UNLOCKED (md_spinlock_t) { 0 } + +#define md_spin_lock_irq cli +#define md_spin_unlock_irq sti +#define md_spin_unlock_irqrestore(x,flags) restore_flags(flags) +#define md_spin_lock_irqsave(x,flags) do { save_flags(flags); cli(); } while (0) + +/* END */ + +#else + +#include +#include + +/* 000 */ +#define md__get_free_pages(x,y) __get_free_pages(x,y) + +#ifdef __i386__ +/* 001 */ +extern __inline__ int md_cpu_has_mmx(void) +{ + return boot_cpu_data.x86_capability & X86_FEATURE_MMX; +} +#endif + +/* 002 */ +#define md_clear_page(page) clear_page(page) + +/* 003 */ +#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x) + +/* 004 */ +#define md_copy_to_user(x,y,z) copy_to_user(x,y,z) + +/* 005 */ +#define md_copy_from_user(x,y,z) copy_from_user(x,y,z) + +/* 006 */ +#define md_put_user put_user + +/* 007 */ +extern inline int md_capable_admin(void) +{ + return capable(CAP_SYS_ADMIN); +} + +/* 008 */ +#define MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode) + +/* 009 */ +extern inline void md_flush_signals (void) +{ + spin_lock(¤t->sigmask_lock); + flush_signals(current); + spin_unlock(¤t->sigmask_lock); +} + +/* 010 */ +extern inline void md_init_signals (void) +{ + current->exit_signal = SIGCHLD; + siginitsetinv(¤t->blocked, sigmask(SIGKILL)); +} + +/* 011 */ +#define md_signal_pending signal_pending + +/* 012 */ +extern inline void md_set_global_readahead(int * table) +{ + max_readahead[MD_MAJOR] = table; +} + +/* 013 */ +#define md_mdelay(x) mdelay(x) + +/* 014 */ +#define MD_SYS_DOWN SYS_DOWN +#define MD_SYS_HALT SYS_HALT +#define MD_SYS_POWER_OFF SYS_POWER_OFF + +/* 015 */ +#define md_register_reboot_notifier register_reboot_notifier + +/* 016 */ +#define md_test_and_set_bit test_and_set_bit + +/* 017 */ +#define md_test_and_clear_bit test_and_clear_bit + +/* 018 */ +#define md_atomic_read atomic_read +#define md_atomic_set atomic_set + +/* 019 */ +#define md_lock_kernel lock_kernel +#define md_unlock_kernel unlock_kernel + +/* 020 */ + +#include + +#define md__init __init +#define md__initdata __initdata +#define md__initfunc(__arginit) __initfunc(__arginit) + +/* 021 */ + + +/* 022 */ + +#define md_list_head list_head +#define MD_LIST_HEAD(name) LIST_HEAD(name) +#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr) +#define md_list_add list_add +#define md_list_del list_del +#define md_list_empty list_empty + +#define md_list_entry(ptr, type, member) list_entry(ptr, type, member) + +/* 023 */ + +#define md_schedule_timeout schedule_timeout + +/* 024 */ +#define md_need_resched(tsk) ((tsk)->need_resched) + +/* 025 */ +#define md_spinlock_t spinlock_t +#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED + +#define 
md_spin_lock_irq spin_lock_irq +#define md_spin_unlock_irq spin_unlock_irq +#define md_spin_unlock_irqrestore spin_unlock_irqrestore +#define md_spin_lock_irqsave spin_lock_irqsave + +/* END */ + +#endif + +#endif _MD_COMPATIBLE_H + diff -ruN linux.orig/include/linux/raid/md_k.h linux-2.2.16/include/linux/raid/md_k.h --- linux.orig/include/linux/raid/md_k.h Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/include/linux/raid/md_k.h Fri Jun 9 11:37:44 2000 @@ -0,0 +1,338 @@ +/* + md_k.h : kernel internal structure of the Linux MD driver + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_K_H +#define _MD_K_H + +#define MD_RESERVED 0UL +#define LINEAR 1UL +#define STRIPED 2UL +#define RAID0 STRIPED +#define RAID1 3UL +#define RAID5 4UL +#define TRANSLUCENT 5UL +#define HSM 6UL +#define MAX_PERSONALITY 7UL + +extern inline int pers_to_level (int pers) +{ + switch (pers) { + case HSM: return -3; + case TRANSLUCENT: return -2; + case LINEAR: return -1; + case RAID0: return 0; + case RAID1: return 1; + case RAID5: return 5; + } + panic("pers_to_level()"); +} + +extern inline int level_to_pers (int level) +{ + switch (level) { + case -3: return HSM; + case -2: return TRANSLUCENT; + case -1: return LINEAR; + case 0: return RAID0; + case 1: return RAID1; + case 4: + case 5: return RAID5; + } + return MD_RESERVED; +} + +typedef struct mddev_s mddev_t; +typedef struct mdk_rdev_s mdk_rdev_t; + +#if (MINORBITS != 8) +#error MD doesnt handle bigger kdev yet +#endif + +#define MAX_REAL 12 /* Max number of disks per md dev */ +#define MAX_MD_DEVS (1<state & (1 << MD_DISK_FAULTY); +} + +extern inline int disk_active(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_ACTIVE); +} + +extern inline int disk_sync(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_SYNC); +} + +extern inline int disk_spare(mdp_disk_t * d) +{ + return !disk_sync(d) && !disk_active(d) && !disk_faulty(d); +} + +extern inline int disk_removed(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_REMOVED); +} + +extern inline void mark_disk_faulty(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_FAULTY); +} + +extern inline void mark_disk_active(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_ACTIVE); +} + +extern inline void mark_disk_sync(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_SYNC); +} + +extern inline void mark_disk_spare(mdp_disk_t * d) +{ + d->state = 0; +} + +extern inline void mark_disk_removed(mdp_disk_t * d) +{ + d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED); +} + +extern inline void mark_disk_inactive(mdp_disk_t * d) +{ + d->state &= ~(1 << MD_DISK_ACTIVE); +} + +extern inline void mark_disk_nonsync(mdp_disk_t * d) +{ + d->state &= ~(1 << MD_DISK_SYNC); +} + +/* + * MD's 'extended' device + */ +struct mdk_rdev_s +{ + struct md_list_head same_set; /* RAID devices within the same set */ + struct md_list_head all; /* all RAID devices */ + struct md_list_head pending; /* undetected RAID devices */ + + kdev_t dev; /* Device number */ + kdev_t old_dev; /* "" when it was last imported */ + int size; /* Device size (in blocks) */ + mddev_t *mddev; /* RAID array if running */ 
+ unsigned long last_events; /* IO event timestamp */ + + struct inode *inode; /* Lock inode */ + struct file filp; /* Lock file */ + + mdp_super_t *sb; + int sb_offset; + + int faulty; /* if faulty do not issue IO requests */ + int desc_nr; /* descriptor index in the superblock */ +}; + + +/* + * disk operations in a working array: + */ +#define DISKOP_SPARE_INACTIVE 0 +#define DISKOP_SPARE_WRITE 1 +#define DISKOP_SPARE_ACTIVE 2 +#define DISKOP_HOT_REMOVE_DISK 3 +#define DISKOP_HOT_ADD_DISK 4 + +typedef struct mdk_personality_s mdk_personality_t; + +struct mddev_s +{ + void *private; + mdk_personality_t *pers; + int __minor; + mdp_super_t *sb; + int nb_dev; + struct md_list_head disks; + int sb_dirty; + mdu_param_t param; + int ro; + unsigned int curr_resync; + unsigned long resync_start; + char *name; + int recovery_running; + struct semaphore reconfig_sem; + struct semaphore recovery_sem; + struct semaphore resync_sem; + struct md_list_head all_mddevs; +}; + +struct mdk_personality_s +{ + char *name; + int (*map)(mddev_t *mddev, kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long size); + int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh); + void (*end_request)(struct buffer_head * bh, int uptodate); + int (*run)(mddev_t *mddev); + int (*stop)(mddev_t *mddev); + int (*status)(char *page, mddev_t *mddev); + int (*ioctl)(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); + int max_invalid_dev; + int (*error_handler)(mddev_t *mddev, kdev_t dev); + +/* + * Some personalities (RAID-1, RAID-5) can have disks hot-added and + * hot-removed. Hot removal is different from failure. (failure marks + * a disk inactive, but the disk is still part of the array) The interface + * to such operations is the 'pers->diskop()' function, can be NULL. + * + * the diskop function can change the pointer pointing to the incoming + * descriptor, but must do so very carefully. (currently only + * SPARE_ACTIVE expects such a change) + */ + int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state); + + int (*stop_resync)(mddev_t *mddev); + int (*restart_resync)(mddev_t *mddev); +}; + + +/* + * Currently we index md_array directly, based on the minor + * number. This will have to change to dynamic allocation + * once we start supporting partitioning of md devices. + */ +extern inline int mdidx (mddev_t * mddev) +{ + return mddev->__minor; +} + +extern inline kdev_t mddev_to_kdev(mddev_t * mddev) +{ + return MKDEV(MD_MAJOR, mdidx(mddev)); +} + +extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev); +extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr); + +/* + * iterates through some rdev ringlist. It's safe to remove the + * current 'rdev'. Dont touch 'tmp' though. + */ +#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \ + \ + for (tmp = head.next; \ + rdev = md_list_entry(tmp, mdk_rdev_t, field), \ + tmp = tmp->next, tmp->prev != &head \ + ; ) +/* + * iterates through the 'same array disks' ringlist + */ +#define ITERATE_RDEV(mddev,rdev,tmp) \ + ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp) + +/* + * Same as above, but assumes that the device has rdev->desc_nr numbered + * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order. 
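(The remaining iterator macros continue below.) ITERATE_RDEV_GENERIC and its wrappers are ordinary for loops: 'tmp' walks the md_list_head ring while 'rdev' is recomputed from it each pass, which is why the current entry may be removed safely inside the loop body. A hypothetical usage sketch, assuming partition_name() is the device-name helper exported by the md core:

/* Illustrative only: log every member device of one array. */
static void example_dump_array(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct md_list_head *tmp;

	ITERATE_RDEV(mddev, rdev, tmp) {
		printk(KERN_INFO "md%d: dev %s, %d blocks%s\n",
		       mdidx(mddev), partition_name(rdev->dev),
		       rdev->size, rdev->faulty ? " (faulty)" : "");
	}
}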
+ */ +#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \ + for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++) + + +/* + * Iterates through all 'RAID managed disks' + */ +#define ITERATE_RDEV_ALL(rdev,tmp) \ + ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp) + +/* + * Iterates through 'pending RAID disks' + */ +#define ITERATE_RDEV_PENDING(rdev,tmp) \ + ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp) + +/* + * iterates through all used mddevs in the system. + */ +#define ITERATE_MDDEV(mddev,tmp) \ + \ + for (tmp = all_mddevs.next; \ + mddev = md_list_entry(tmp, mddev_t, all_mddevs), \ + tmp = tmp->next, tmp->prev != &all_mddevs \ + ; ) + +extern inline int lock_mddev (mddev_t * mddev) +{ + return down_interruptible(&mddev->reconfig_sem); +} + +extern inline void unlock_mddev (mddev_t * mddev) +{ + up(&mddev->reconfig_sem); +} + +#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \ + x = y; y = __tmp; } while (0) + +typedef struct mdk_thread_s { + void (*run) (void *data); + void *data; + struct wait_queue *wqueue; + unsigned long flags; + struct semaphore *sem; + struct task_struct *tsk; + const char *name; +} mdk_thread_t; + +#define THREAD_WAKEUP 0 + +typedef struct dev_name_s { + struct md_list_head list; + kdev_t dev; + char name [MAX_DISKNAME_LEN]; +} dev_name_t; + +#endif _MD_K_H + diff -ruN linux.orig/include/linux/raid/md_p.h linux-2.2.16/include/linux/raid/md_p.h --- linux.orig/include/linux/raid/md_p.h Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/include/linux/raid/md_p.h Fri Jun 9 11:37:44 2000 @@ -0,0 +1,161 @@ +/* + md_p.h : physical layout of Linux RAID devices + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_P_H +#define _MD_P_H + +/* + * RAID superblock. + * + * The RAID superblock maintains some statistics on each RAID configuration. + * Each real device in the RAID set contains it near the end of the device. + * Some of the ideas are copied from the ext2fs implementation. + * + * We currently use 4096 bytes as follows: + * + * word offset function + * + * 0 - 31 Constant generic RAID device information. + * 32 - 63 Generic state information. + * 64 - 127 Personality specific information. + * 128 - 511 12 32-words descriptors of the disks in the raid set. + * 512 - 911 Reserved. + * 912 - 1023 Disk specific descriptor. + */ + +/* + * If x is the real device size in bytes, we return an apparent size of: + * + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES + * + * and place the 4kB superblock at offset y. 
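(The header resumes below.) A quick worked example of this rounding, with illustrative figures only: MD_RESERVED_BYTES is 64 kB, i.e. 128 sectors of 512 bytes, so for a component device of 600000 sectors

/*
 *   MD_NEW_SIZE_SECTORS(600000) = (600000 & ~(128 - 1)) - 128
 *                               = 599936 - 128
 *                               = 599808
 *
 * The array uses sectors 0..599807 of that device, and the 4 kB
 * superblock is written starting at sector 599808, inside the
 * reserved 64 kB tail.
 */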
+ */ +#define MD_RESERVED_BYTES (64 * 1024) +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) +#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) + +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) +#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) + +#define MD_SB_BYTES 4096 +#define MD_SB_WORDS (MD_SB_BYTES / 4) +#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) +#define MD_SB_SECTORS (MD_SB_BYTES / 512) + +/* + * The following are counted in 32-bit words + */ +#define MD_SB_GENERIC_OFFSET 0 +#define MD_SB_PERSONALITY_OFFSET 64 +#define MD_SB_DISKS_OFFSET 128 +#define MD_SB_DESCRIPTOR_OFFSET 992 + +#define MD_SB_GENERIC_CONSTANT_WORDS 32 +#define MD_SB_GENERIC_STATE_WORDS 32 +#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) +#define MD_SB_PERSONALITY_WORDS 64 +#define MD_SB_DISKS_WORDS 384 +#define MD_SB_DESCRIPTOR_WORDS 32 +#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) +#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS) + +/* + * Device "operational" state bits + */ +#define MD_DISK_FAULTY 0 /* disk is faulty / operational */ +#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */ +#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ +#define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ + +typedef struct mdp_device_descriptor_s { + __u32 number; /* 0 Device number in the entire set */ + __u32 major; /* 1 Device major number */ + __u32 minor; /* 2 Device minor number */ + __u32 raid_disk; /* 3 The role of the device in the raid set */ + __u32 state; /* 4 Operational state */ + __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; +} mdp_disk_t; + +#define MD_SB_MAGIC 0xa92b4efc + +/* + * Superblock state bits + */ +#define MD_SB_CLEAN 0 +#define MD_SB_ERRORS 1 + +typedef struct mdp_superblock_s { + /* + * Constant generic information + */ + __u32 md_magic; /* 0 MD identifier */ + __u32 major_version; /* 1 major version to which the set conforms */ + __u32 minor_version; /* 2 minor version ... */ + __u32 patch_version; /* 3 patchlevel version ... */ + __u32 gvalid_words; /* 4 Number of used words in this section */ + __u32 set_uuid0; /* 5 Raid set identifier */ + __u32 ctime; /* 6 Creation time */ + __u32 level; /* 7 Raid personality */ + __u32 size; /* 8 Apparent size of each individual disk */ + __u32 nr_disks; /* 9 total disks in the raid set */ + __u32 raid_disks; /* 10 disks in a fully functional raid set */ + __u32 md_minor; /* 11 preferred MD minor device number */ + __u32 not_persistent; /* 12 does it have a persistent superblock */ + __u32 set_uuid1; /* 13 Raid set identifier #2 */ + __u32 set_uuid2; /* 14 Raid set identifier #3 */ + __u32 set_uuid3; /* 14 Raid set identifier #4 */ + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16]; + + /* + * Generic state information + */ + __u32 utime; /* 0 Superblock update time */ + __u32 state; /* 1 State bits (clean, ...) */ + __u32 active_disks; /* 2 Number of currently active disks */ + __u32 working_disks; /* 3 Number of working disks */ + __u32 failed_disks; /* 4 Number of failed disks */ + __u32 spare_disks; /* 5 Number of spare disks */ + __u32 sb_csum; /* 6 checksum of the whole superblock */ + __u64 events; /* 7 number of superblock updates (64-bit!) 
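(The superblock structure continues below.) A note on the events counter above: it is meant to be bumped on every superblock update, so when member disks disagree the copy with the larger count can be taken as the most recent one, which is how a stale mirror half is told apart from the surviving half. A one-line illustrative sketch of that comparison, not the driver's code:

/* Illustrative only: pick the more recent of two superblock copies. */
static inline mdp_super_t *example_freshest(mdp_super_t *a, mdp_super_t *b)
{
	return (a->events >= b->events) ? a : b;
}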
*/ + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9]; + + /* + * Personality information + */ + __u32 layout; /* 0 the array's physical layout */ + __u32 chunk_size; /* 1 chunk size in bytes */ + __u32 root_pv; /* 2 LV root PV */ + __u32 root_block; /* 3 LV root block */ + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4]; + + /* + * Disks information + */ + mdp_disk_t disks[MD_SB_DISKS]; + + /* + * Reserved + */ + __u32 reserved[MD_SB_RESERVED_WORDS]; + + /* + * Active descriptor + */ + mdp_disk_t this_disk; + +} mdp_super_t; + +#endif _MD_P_H + diff -ruN linux.orig/include/linux/raid/md_u.h linux-2.2.16/include/linux/raid/md_u.h --- linux.orig/include/linux/raid/md_u.h Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/include/linux/raid/md_u.h Fri Jun 9 11:37:44 2000 @@ -0,0 +1,115 @@ +/* + md_u.h : user <=> kernel API between Linux raidtools and RAID drivers + Copyright (C) 1998 Ingo Molnar + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_U_H +#define _MD_U_H + +/* ioctls */ + +/* status */ +#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t) +#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t) +#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) +#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) + +/* configuration */ +#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20) +#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t) +#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22) +#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t) +#define SET_DISK_INFO _IO (MD_MAJOR, 0x24) +#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25) +#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26) +#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27) +#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28) +#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) + +/* usage */ +#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t) +#define START_ARRAY _IO (MD_MAJOR, 0x31) +#define STOP_ARRAY _IO (MD_MAJOR, 0x32) +#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) +#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) + +typedef struct mdu_version_s { + int major; + int minor; + int patchlevel; +} mdu_version_t; + +typedef struct mdu_array_info_s { + /* + * Generic constant information + */ + int major_version; + int minor_version; + int patch_version; + int ctime; + int level; + int size; + int nr_disks; + int raid_disks; + int md_minor; + int not_persistent; + + /* + * Generic state information + */ + int utime; /* 0 Superblock update time */ + int state; /* 1 State bits (clean, ...) 
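(The mdu_array_info_t fields continue below.) The ioctls defined at the top of md_u.h are the whole user to kernel interface that the raidtools work through. A minimal user-space sketch of the read-only status calls; it assumes the patched headers are on the include path and that /dev/md0 exists:

/* Illustrative only, not part of the patch. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>	/* _IO/_IOR/_IOW used by md_u.h */
#include <linux/major.h>	/* MD_MAJOR, used in the ioctl numbers */
#include <linux/raid/md_u.h>

int main(void)
{
	mdu_version_t ver;
	mdu_array_info_t info;
	int fd = open("/dev/md0", O_RDONLY);

	if (fd < 0)
		return 1;
	if (ioctl(fd, RAID_VERSION, &ver) == 0)
		printf("md driver %d.%d.%d\n",
		       ver.major, ver.minor, ver.patchlevel);
	if (ioctl(fd, GET_ARRAY_INFO, &info) == 0)
		printf("raid level %d, %d of %d disks active\n",
		       info.level, info.active_disks, info.raid_disks);
	close(fd);
	return 0;
}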
*/ + int active_disks; /* 2 Number of currently active disks */ + int working_disks; /* 3 Number of working disks */ + int failed_disks; /* 4 Number of failed disks */ + int spare_disks; /* 5 Number of spare disks */ + + /* + * Personality information + */ + int layout; /* 0 the array's physical layout */ + int chunk_size; /* 1 chunk size in bytes */ + +} mdu_array_info_t; + +typedef struct mdu_disk_info_s { + /* + * configuration/status of one particular disk + */ + int number; + int major; + int minor; + int raid_disk; + int state; + +} mdu_disk_info_t; + +typedef struct mdu_start_info_s { + /* + * configuration/status of one particular disk + */ + int major; + int minor; + int raid_disk; + int state; + +} mdu_start_info_t; + +typedef struct mdu_param_s +{ + int personality; /* 1,2,3,4 */ + int chunk_size; /* in bytes */ + int max_fault; /* unused for now */ +} mdu_param_t; + +#endif _MD_U_H + diff -ruN linux.orig/include/linux/raid/raid0.h linux-2.2.16/include/linux/raid/raid0.h --- linux.orig/include/linux/raid/raid0.h Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/include/linux/raid/raid0.h Fri Jun 9 11:37:44 2000 @@ -0,0 +1,33 @@ +#ifndef _RAID0_H +#define _RAID0_H + +#include + +struct strip_zone +{ + int zone_offset; /* Zone offset in md_dev */ + int dev_offset; /* Zone offset in real dev */ + int size; /* Zone size */ + int nb_dev; /* # of devices attached to the zone */ + mdk_rdev_t *dev[MAX_REAL]; /* Devices attached to the zone */ +}; + +struct raid0_hash +{ + struct strip_zone *zone0, *zone1; +}; + +struct raid0_private_data +{ + struct raid0_hash *hash_table; /* Dynamically allocated */ + struct strip_zone *strip_zone; /* This one too */ + int nr_strip_zones; + struct strip_zone *smallest; + int nr_zones; +}; + +typedef struct raid0_private_data raid0_conf_t; + +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) + +#endif diff -ruN linux.orig/include/linux/raid/raid1.h linux-2.2.16/include/linux/raid/raid1.h --- linux.orig/include/linux/raid/raid1.h Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/include/linux/raid/raid1.h Fri Jun 9 11:37:44 2000 @@ -0,0 +1,64 @@ +#ifndef _RAID1_H +#define _RAID1_H + +#include + +struct mirror_info { + int number; + int raid_disk; + kdev_t dev; + int next; + int sect_limit; + + /* + * State bits: + */ + int operational; + int write_only; + int spare; + + int used_slot; +}; + +struct raid1_private_data { + mddev_t *mddev; + struct mirror_info mirrors[MD_SB_DISKS]; + int nr_disks; + int raid_disks; + int working_disks; + int last_used; + unsigned long next_sect; + int sect_count; + mdk_thread_t *thread, *resync_thread; + int resync_mirrors; + struct mirror_info *spare; +}; + +typedef struct raid1_private_data raid1_conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private) + +/* + * this is our 'private' 'collective' RAID1 buffer head. 
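(The raid1.h comment continues below.) Looking back at the raid0.h structures above: each strip_zone describes a contiguous range of the md device that is striped across nb_dev member disks, and the raid0_hash buckets are sized to the smallest zone, so a lookup never has to consider more than the two candidates zone0 and zone1. An illustrative sketch of the mapping inside one zone, assuming plain chunk-based striping (the real code additionally goes through the hash table first):

/* Illustrative only: map a block that falls inside 'zone' to a member
 * device and a block offset on that device. */
static void example_map_in_zone(struct strip_zone *zone,
				unsigned long block,		/* md-relative */
				unsigned long chunk_blocks,	/* chunk size  */
				mdk_rdev_t **rdev,
				unsigned long *rblock)
{
	unsigned long off   = block - zone->zone_offset;
	unsigned long chunk = off / chunk_blocks;

	*rdev   = zone->dev[chunk % zone->nb_dev];
	*rblock = zone->dev_offset
		  + (chunk / zone->nb_dev) * chunk_blocks
		  + (off % chunk_blocks);
}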
+ * it contains information about what kind of IO operations were started + * for this RAID1 operation, and about their status: + */ + +struct raid1_bh { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + int cmd; + unsigned long state; + mddev_t *mddev; + struct buffer_head *master_bh; + struct buffer_head *mirror_bh [MD_SB_DISKS]; + struct buffer_head bh_req; + struct buffer_head *next_retry; +}; + +#endif diff -ruN linux.orig/include/linux/raid/raid5.h linux-2.2.16/include/linux/raid/raid5.h --- linux.orig/include/linux/raid/raid5.h Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/include/linux/raid/raid5.h Fri Jun 9 11:37:44 2000 @@ -0,0 +1,113 @@ +#ifndef _RAID5_H +#define _RAID5_H + +#include +#include + +struct disk_info { + kdev_t dev; + int operational; + int number; + int raid_disk; + int write_only; + int spare; + int used_slot; +}; + +struct stripe_head { + md_spinlock_t stripe_lock; + struct stripe_head *hash_next, **hash_pprev; /* hash pointers */ + struct stripe_head *free_next; /* pool of free sh's */ + struct buffer_head *buffer_pool; /* pool of free buffers */ + struct buffer_head *bh_pool; /* pool of free bh's */ + struct raid5_private_data *raid_conf; + struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */ + struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */ + struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */ + struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */ + int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */ + int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */ + unsigned long sector; /* sector of this row */ + int size; /* buffers size */ + int pd_idx; /* parity disk index */ + atomic_t nr_pending; /* nr of pending cmds */ + unsigned long state; /* state flags */ + int cmd; /* stripe cmd */ + int count; /* nr of waiters */ + int write_method; /* reconstruct-write / read-modify-write */ + int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */ + struct wait_queue *wait; /* processes waiting for this stripe */ +}; + +/* + * Phase + */ +#define PHASE_BEGIN 0 +#define PHASE_READ_OLD 1 +#define PHASE_WRITE 2 +#define PHASE_READ 3 +#define PHASE_COMPLETE 4 + +/* + * Write method + */ +#define METHOD_NONE 0 +#define RECONSTRUCT_WRITE 1 +#define READ_MODIFY_WRITE 2 + +/* + * Stripe state + */ +#define STRIPE_LOCKED 0 +#define STRIPE_ERROR 1 + +/* + * Stripe commands + */ +#define STRIPE_NONE 0 +#define STRIPE_WRITE 1 +#define STRIPE_READ 2 + +struct raid5_private_data { + struct stripe_head **stripe_hashtbl; + mddev_t *mddev; + mdk_thread_t *thread, *resync_thread; + struct disk_info disks[MD_SB_DISKS]; + struct disk_info *spare; + int buffer_size; + int chunk_size, level, algorithm; + int raid_disks, working_disks, failed_disks; + int sector_count; + unsigned long next_sector; + atomic_t nr_handle; + struct stripe_head *next_free_stripe; + int nr_stripes; + int resync_parity; + int max_nr_stripes; + int clock; + int nr_hashed_stripes; + int nr_locked_stripes; + int nr_pending_stripes; + int nr_cached_stripes; + + /* + * Free stripes pool + */ + int nr_free_sh; + struct stripe_head *free_sh_list; + struct wait_queue *wait_for_stripe; +}; + +typedef struct raid5_private_data raid5_conf_t; + +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) + +/* + * Our supported algorithms + */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 +#define ALGORITHM_RIGHT_ASYMMETRIC 1 
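(The two symmetric layout constants follow below.) As an illustration of what these algorithm numbers control, here is roughly how the parity disk rotates from stripe to stripe under the two asymmetric layouts; the symmetric variants place parity the same way but number the data disks so that they wrap around the parity disk. Sketch only, not the driver's code:

/* Illustrative only: parity disk index for a given stripe. */
static int example_parity_disk(int algorithm, unsigned long stripe,
			       int raid_disks)
{
	switch (algorithm) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		return raid_disks - 1 - (int)(stripe % raid_disks);
	case ALGORITHM_RIGHT_ASYMMETRIC:
		return (int)(stripe % raid_disks);
	}
	return -1;	/* symmetric cases left out of this sketch */
}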
+#define ALGORITHM_LEFT_SYMMETRIC 2 +#define ALGORITHM_RIGHT_SYMMETRIC 3 + +#endif diff -ruN linux.orig/include/linux/raid/translucent.h linux-2.2.16/include/linux/raid/translucent.h --- linux.orig/include/linux/raid/translucent.h Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/include/linux/raid/translucent.h Fri Jun 9 11:37:44 2000 @@ -0,0 +1,23 @@ +#ifndef _TRANSLUCENT_H +#define _TRANSLUCENT_H + +#include + +typedef struct dev_info dev_info_t; + +struct dev_info { + kdev_t dev; + int size; +}; + +struct translucent_private_data +{ + dev_info_t disks[MD_SB_DISKS]; +}; + + +typedef struct translucent_private_data translucent_conf_t; + +#define mddev_to_conf(mddev) ((translucent_conf_t *) mddev->private) + +#endif diff -ruN linux.orig/include/linux/raid/xor.h linux-2.2.16/include/linux/raid/xor.h --- linux.orig/include/linux/raid/xor.h Thu Jan 1 01:00:00 1970 +++ linux-2.2.16/include/linux/raid/xor.h Fri Jun 9 11:37:44 2000 @@ -0,0 +1,12 @@ +#ifndef _XOR_H +#define _XOR_H + +#include + +#define MAX_XOR_BLOCKS 5 + +extern void calibrate_xor_block(void); +extern void (*xor_block)(unsigned int count, + struct buffer_head **bh_ptr); + +#endif diff -ruN linux.orig/include/linux/raid0.h linux-2.2.16/include/linux/raid0.h --- linux.orig/include/linux/raid0.h Tue Oct 29 14:20:24 1996 +++ linux-2.2.16/include/linux/raid0.h Thu Jan 1 01:00:00 1970 @@ -1,27 +0,0 @@ -#ifndef _RAID0_H -#define _RAID0_H - -struct strip_zone -{ - int zone_offset; /* Zone offset in md_dev */ - int dev_offset; /* Zone offset in real dev */ - int size; /* Zone size */ - int nb_dev; /* Number of devices attached to the zone */ - struct real_dev *dev[MAX_REAL]; /* Devices attached to the zone */ -}; - -struct raid0_hash -{ - struct strip_zone *zone0, *zone1; -}; - -struct raid0_data -{ - struct raid0_hash *hash_table; /* Dynamically allocated */ - struct strip_zone *strip_zone; /* This one too */ - int nr_strip_zones; - struct strip_zone *smallest; - int nr_zones; -}; - -#endif diff -ruN linux.orig/include/linux/raid1.h linux-2.2.16/include/linux/raid1.h --- linux.orig/include/linux/raid1.h Fri May 8 09:17:13 1998 +++ linux-2.2.16/include/linux/raid1.h Thu Jan 1 01:00:00 1970 @@ -1,49 +0,0 @@ -#ifndef _RAID1_H -#define _RAID1_H - -#include - -struct mirror_info { - int number; - int raid_disk; - kdev_t dev; - int next; - int sect_limit; - - /* - * State bits: - */ - int operational; - int write_only; - int spare; -}; - -struct raid1_data { - struct md_dev *mddev; - struct mirror_info mirrors[MD_SB_DISKS]; /* RAID1 devices, 2 to MD_SB_DISKS */ - int raid_disks; - int working_disks; /* Number of working disks */ - int last_used; - unsigned long next_sect; - int sect_count; - int resync_running; -}; - -/* - * this is our 'private' 'collective' RAID1 buffer head. 
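(The removed copy of the old raid1.h continues below.) Back to the new xor.h above: calibrate_xor_block() is expected to benchmark the available checksumming routines at boot and leave xor_block pointing at the fastest one, and xor_block(count, bh_ptr) is assumed here to XOR the data of buffers 1..count-1 into buffer 0, with count never exceeding MAX_XOR_BLOCKS. A plain-C reference for that assumed convention:

/* Illustrative only: what a generic xor_block() implementation does. */
static void example_xor_block(unsigned int count, struct buffer_head **bh_ptr)
{
	unsigned long *dst = (unsigned long *) bh_ptr[0]->b_data;
	unsigned long words = bh_ptr[0]->b_size / sizeof(unsigned long);
	unsigned int i;
	unsigned long w;

	for (i = 1; i < count; i++) {
		unsigned long *src = (unsigned long *) bh_ptr[i]->b_data;

		for (w = 0; w < words; w++)
			dst[w] ^= src[w];
	}
}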
- * it contains information about what kind of IO operations were started - * for this RAID5 operation, and about their status: - */ - -struct raid1_bh { - unsigned int remaining; - int cmd; - unsigned long state; - struct md_dev *mddev; - struct buffer_head *master_bh; - struct buffer_head *mirror_bh [MD_SB_DISKS]; - struct buffer_head bh_req; - struct buffer_head *next_retry; -}; - -#endif diff -ruN linux.orig/include/linux/raid5.h linux-2.2.16/include/linux/raid5.h --- linux.orig/include/linux/raid5.h Fri May 8 09:17:13 1998 +++ linux-2.2.16/include/linux/raid5.h Thu Jan 1 01:00:00 1970 @@ -1,110 +0,0 @@ -#ifndef _RAID5_H -#define _RAID5_H - -#ifdef __KERNEL__ -#include -#include - -struct disk_info { - kdev_t dev; - int operational; - int number; - int raid_disk; - int write_only; - int spare; -}; - -struct stripe_head { - struct stripe_head *hash_next, **hash_pprev; /* hash pointers */ - struct stripe_head *free_next; /* pool of free sh's */ - struct buffer_head *buffer_pool; /* pool of free buffers */ - struct buffer_head *bh_pool; /* pool of free bh's */ - struct raid5_data *raid_conf; - struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */ - struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */ - struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */ - struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */ - int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */ - int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */ - unsigned long sector; /* sector of this row */ - int size; /* buffers size */ - int pd_idx; /* parity disk index */ - int nr_pending; /* nr of pending cmds */ - unsigned long state; /* state flags */ - int cmd; /* stripe cmd */ - int count; /* nr of waiters */ - int write_method; /* reconstruct-write / read-modify-write */ - int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */ - struct wait_queue *wait; /* processes waiting for this stripe */ -}; - -/* - * Phase - */ -#define PHASE_BEGIN 0 -#define PHASE_READ_OLD 1 -#define PHASE_WRITE 2 -#define PHASE_READ 3 -#define PHASE_COMPLETE 4 - -/* - * Write method - */ -#define METHOD_NONE 0 -#define RECONSTRUCT_WRITE 1 -#define READ_MODIFY_WRITE 2 - -/* - * Stripe state - */ -#define STRIPE_LOCKED 0 -#define STRIPE_ERROR 1 - -/* - * Stripe commands - */ -#define STRIPE_NONE 0 -#define STRIPE_WRITE 1 -#define STRIPE_READ 2 - -struct raid5_data { - struct stripe_head **stripe_hashtbl; - struct md_dev *mddev; - struct md_thread *thread, *resync_thread; - struct disk_info disks[MD_SB_DISKS]; - struct disk_info *spare; - int buffer_size; - int chunk_size, level, algorithm; - int raid_disks, working_disks, failed_disks; - int sector_count; - unsigned long next_sector; - atomic_t nr_handle; - struct stripe_head *next_free_stripe; - int nr_stripes; - int resync_parity; - int max_nr_stripes; - int clock; - int nr_hashed_stripes; - int nr_locked_stripes; - int nr_pending_stripes; - int nr_cached_stripes; - - /* - * Free stripes pool - */ - int nr_free_sh; - struct stripe_head *free_sh_list; - struct wait_queue *wait_for_stripe; -}; - -#endif - -/* - * Our supported algorithms - */ -#define ALGORITHM_LEFT_ASYMMETRIC 0 -#define ALGORITHM_RIGHT_ASYMMETRIC 1 -#define ALGORITHM_LEFT_SYMMETRIC 2 -#define ALGORITHM_RIGHT_SYMMETRIC 3 - -#endif diff -ruN linux.orig/include/linux/sysctl.h linux-2.2.16/include/linux/sysctl.h --- linux.orig/include/linux/sysctl.h Wed Jun 7 
23:26:44 2000 +++ linux-2.2.16/include/linux/sysctl.h Fri Jun 9 11:45:55 2000 @@ -430,7 +430,8 @@ /* CTL_DEV names: */ enum { DEV_CDROM=1, - DEV_HWMON=2 + DEV_HWMON=2, + DEV_MD=3 }; /* /proc/sys/dev/cdrom */ @@ -441,6 +442,11 @@ DEV_CDROM_DEBUG=4, DEV_CDROM_LOCK=5, DEV_CDROM_CHECK_MEDIA=6 +}; + +/* /proc/sys/dev/md */ +enum { + DEV_MD_SPEED_LIMIT=1 }; #ifdef __KERNEL__ diff -ruN linux.orig/init/main.c linux-2.2.16/init/main.c --- linux.orig/init/main.c Wed Jun 7 23:26:44 2000 +++ linux-2.2.16/init/main.c Fri Jun 9 11:37:44 2000 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -540,7 +541,7 @@ #ifdef CONFIG_BLK_DEV_FD { "fd", 0x0200 }, #endif -#ifdef CONFIG_MD_BOOT +#if CONFIG_MD_BOOT || CONFIG_AUTODETECT_RAID { "md", 0x0900 }, #endif #ifdef CONFIG_BLK_DEV_XD @@ -1042,6 +1043,9 @@ #ifdef CONFIG_MD_BOOT { "md=", md_setup}, #endif +#if CONFIG_BLK_DEV_MD + { "raid=", raid_setup}, +#endif #ifdef CONFIG_ADBMOUSE { "adb_buttons=", adb_mouse_setup }, #endif @@ -1580,6 +1584,9 @@ while (pid != wait(&i)); if (MAJOR(real_root_dev) != RAMDISK_MAJOR || MINOR(real_root_dev) != 0) { +#ifdef CONFIG_BLK_DEV_MD + autodetect_raid(); +#endif error = change_root(real_root_dev,"/initrd"); if (error) printk(KERN_ERR "Change root to /initrd: "
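Finally, the sysctl.h hunk above reserves a /proc/sys/dev/md directory (DEV_MD) with a single speed-limit entry (DEV_MD_SPEED_LIMIT), and the init/main.c changes add a raid= boot option and call autodetect_raid() before the root filesystem is switched away from the initrd, so that type 0xfd partitions can be assembled early in boot. A small user-space sketch reading that limit through the binary sysctl interface, using only the names added above; whether and how the md core actually registers a value there, and its units, are defined in the md.c part of the patch rather than in this hunk:

/* Illustrative only, not part of the patch. */
#include <stdio.h>
#include <sys/sysctl.h>	/* glibc wrapper; pulls in <linux/sysctl.h>,
			 * which this patch extends with DEV_MD */

int main(void)
{
	int name[3];
	int limit = 0;
	size_t len = sizeof(limit);

	name[0] = CTL_DEV;
	name[1] = DEV_MD;
	name[2] = DEV_MD_SPEED_LIMIT;

	if (sysctl(name, 3, &limit, &len, NULL, 0) == 0)
		printf("md resync speed limit: %d\n", limit);
	else
		perror("sysctl");
	return 0;
}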