1 --- linux/init/main.c.orig Tue Jan 16 13:30:09 2001
2 +++ linux/init/main.c Tue Jan 16 13:42:03 2001
4 #include <linux/utsname.h>
5 #include <linux/ioport.h>
6 #include <linux/init.h>
7 +#include <linux/raid/md.h>
8 #include <linux/smp_lock.h>
10 #include <linux/hdreg.h>
12 #ifdef CONFIG_BLK_DEV_FD
15 -#ifdef CONFIG_MD_BOOT
16 +#if CONFIG_MD_BOOT || CONFIG_AUTODETECT_RAID
19 #ifdef CONFIG_BLK_DEV_XD
24 +#if CONFIG_BLK_DEV_MD
25 + { "raid=", raid_setup},
27 #ifdef CONFIG_ADBMOUSE
28 { "adb_buttons=", adb_mouse_setup },
31 while (pid != wait(&i));
32 if (MAJOR(real_root_dev) != RAMDISK_MAJOR
33 || MINOR(real_root_dev) != 0) {
34 +#ifdef CONFIG_BLK_DEV_MD
37 error = change_root(real_root_dev,"/initrd");
39 printk(KERN_ERR "Change root to /initrd: "
40 --- linux/include/linux/raid/hsm.h.orig Tue Jan 16 13:42:03 2001
41 +++ linux/include/linux/raid/hsm.h Tue Jan 16 13:42:03 2001
46 +#include <linux/raid/md.h>
49 +#error fix cpu_addr on Alpha first
52 +#include <linux/raid/hsm_p.h>
54 +#define index_pv(lv,index) ((lv)->vg->pv_array+(index)->data.phys_nr)
55 +#define index_dev(lv,index) index_pv((lv),(index))->dev
56 +#define index_block(lv,index) (index)->data.phys_block
57 +#define index_child(index) ((lv_lptr_t *)((index)->cpu_addr))
59 +#define ptr_to_cpuaddr(ptr) ((__u32) (ptr))
62 +typedef struct pv_bg_desc_s {
63 + unsigned int free_blocks;
64 + pv_block_group_t *bg;
67 +typedef struct pv_s pv_t;
68 +typedef struct vg_s vg_t;
69 +typedef struct lv_s lv_t;
76 + pv_bg_desc_t *bg_array;
84 + unsigned int max_indices;
85 + unsigned int free_indices;
86 + lv_lptr_t root_index;
94 + pv_t pv_array [MD_SB_DISKS];
97 + lv_t lv_array [HSM_MAX_LVS_PER_VG];
103 +#define kdev_to_lv(dev) ((lv_t *) mddev_map[MINOR(dev)].data)
104 +#define mddev_to_vg(mddev) ((vg_t *) mddev->private)
108 --- linux/include/linux/raid/hsm_p.h.orig Tue Jan 16 13:42:03 2001
109 +++ linux/include/linux/raid/hsm_p.h Tue Jan 16 13:42:03 2001
114 +#define HSM_BLOCKSIZE 4096
115 +#define HSM_BLOCKSIZE_WORDS (HSM_BLOCKSIZE/4)
116 +#define PACKED __attribute__ ((packed))
119 + * Identifies a block in physical space
121 +typedef struct phys_idx_s {
125 +} PACKED phys_idx_t;
128 + * Identifies a block in logical space
130 +typedef struct log_idx_s {
139 +#define HSM_PV_SB_MAGIC 0xf091ae9fU
141 +#define HSM_PV_SB_GENERIC_WORDS 32
142 +#define HSM_PV_SB_RESERVED_WORDS \
143 + (HSM_BLOCKSIZE_WORDS - HSM_PV_SB_GENERIC_WORDS)
146 + * On-disk PV identification data, on block 0 in any PV.
148 +typedef struct pv_sb_s
150 + __u32 pv_magic; /* 0 */
152 + __u32 pv_uuid0; /* 1 */
153 + __u32 pv_uuid1; /* 2 */
154 + __u32 pv_uuid2; /* 3 */
155 + __u32 pv_uuid3; /* 4 */
157 + __u32 pv_major; /* 5 */
158 + __u32 pv_minor; /* 6 */
159 + __u32 pv_patch; /* 7 */
161 + __u32 pv_ctime; /* 8 Creation time */
163 + __u32 pv_total_size; /* 9 size of this PV, in blocks */
164 + __u32 pv_first_free; /* 10 first free block */
165 + __u32 pv_first_used; /* 11 first used block */
166 + __u32 pv_blocks_left; /* 12 unallocated blocks */
167 + __u32 pv_bg_size; /* 13 size of a block group, in blocks */
168 + __u32 pv_block_size; /* 14 size of blocks, in bytes */
169 + __u32 pv_pptr_size; /* 15 size of block descriptor, in bytes */
170 + __u32 pv_block_groups; /* 16 number of block groups */
172 + __u32 __reserved1[HSM_PV_SB_GENERIC_WORDS - 17];
177 + __u32 __reserved2[HSM_PV_SB_RESERVED_WORDS];
182 + * this is pretty much arbitrary, but has to be less than ~64
184 +#define HSM_MAX_LVS_PER_VG 32
186 +#define HSM_VG_SB_GENERIC_WORDS 32
188 +#define LV_DESCRIPTOR_WORDS 8
189 +#define HSM_VG_SB_RESERVED_WORDS (HSM_BLOCKSIZE_WORDS - \
190 + LV_DESCRIPTOR_WORDS*HSM_MAX_LVS_PER_VG - HSM_VG_SB_GENERIC_WORDS)
192 +#if (HSM_PV_SB_RESERVED_WORDS < 0)
193 +#error you messed this one up dude ...
196 +typedef struct lv_descriptor_s
198 + __u32 lv_id; /* 0 */
199 + phys_idx_t lv_root_idx; /* 1 */
200 + __u16 __reserved; /* 2 */
201 + __u32 lv_max_indices; /* 3 */
202 + __u32 lv_free_indices; /* 4 */
203 + __u32 md_id; /* 5 */
205 + __u32 reserved[LV_DESCRIPTOR_WORDS - 6];
207 +} PACKED lv_descriptor_t;
209 +#define HSM_VG_SB_MAGIC 0x98320d7aU
211 + * On-disk VG identification data, in block 1 on all PVs
213 +typedef struct vg_sb_s
215 + __u32 vg_magic; /* 0 */
216 + __u32 nr_lvs; /* 1 */
218 + __u32 __reserved1[HSM_VG_SB_GENERIC_WORDS - 2];
220 + lv_descriptor_t lv_array [HSM_MAX_LVS_PER_VG];
224 + __u32 __reserved2[HSM_VG_SB_RESERVED_WORDS];
232 +#define HSM_LV_SB_MAGIC 0xe182bd8aU
234 +/* do we need lv_sb_t? */
236 +typedef struct lv_sb_s
239 + * On-disk LV identifier
241 + __u32 lv_magic; /* 0 LV identifier */
242 + __u32 lv_uuid0; /* 1 */
243 + __u32 lv_uuid1; /* 2 */
244 + __u32 lv_uuid2; /* 3 */
245 + __u32 lv_uuid3; /* 4 */
247 + __u32 lv_major; /* 5 PV identifier */
248 + __u32 lv_minor; /* 6 PV identifier */
249 + __u32 lv_patch; /* 7 PV identifier */
251 + __u32 ctime; /* 8 Creation time */
252 + __u32 size; /* 9 size of this LV, in blocks */
253 + phys_idx_t start; /* 10 position of root index block */
254 + log_idx_t first_free; /* 11-12 first free index */
259 + __u32 reserved[HSM_BLOCKSIZE_WORDS-13];
264 + * Pointer pointing from the physical space, points to
265 + * the LV owning this block. It also contains various
266 + * statistics about the physical block.
268 +typedef struct pv_pptr_s
274 + log_idx_t predicted;
275 + __u32 last_referenced;
288 +static __inline__ int pv_pptr_free (const pv_pptr_t * pptr)
290 + return !pptr->u.free.log_id;
294 +#define DATA_BLOCKS_PER_BG ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1))
296 +#define TOTAL_BLOCKS_PER_BG (DATA_BLOCKS_PER_BG+1)
298 + * A table of pointers filling up a single block, managing
299 + * the next DATA_BLOCKS_PER_BG physical blocks. Such block
300 + * groups form the physical space of blocks.
302 +typedef struct pv_block_group_s
304 + __u8 used_bitmap[(DATA_BLOCKS_PER_BG+7)/8];
306 + pv_pptr_t blocks[DATA_BLOCKS_PER_BG];
308 +} PACKED pv_block_group_t;
311 + * Pointer from the logical space, points to
312 + * the (PV,block) containing this logical block
314 +typedef struct lv_lptr_s
323 +static __inline__ int index_free (const lv_lptr_t * index)
325 + return !index->data.phys_block;
328 +static __inline__ int index_present (const lv_lptr_t * index)
330 + return index->cpu_addr;
334 +#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t))
336 + * A table of pointers filling up a single block, managing
337 + * HSM_LPTRS_PER_BLOCK logical blocks. Such block groups form
338 + * the logical space of blocks.
340 +typedef struct lv_index_block_s
342 + lv_lptr_t blocks[HSM_LPTRS_PER_BLOCK];
344 +} PACKED lv_index_block_t;
348 --- linux/include/linux/raid/linear.h.orig Tue Jan 16 13:42:03 2001
349 +++ linux/include/linux/raid/linear.h Tue Jan 16 13:44:37 2001
354 +#include <linux/raid/md.h>
359 + unsigned int offset;
362 +typedef struct dev_info dev_info_t;
366 + dev_info_t *dev0, *dev1;
369 +struct linear_private_data
371 + struct linear_hash *hash_table;
372 + dev_info_t disks[MD_SB_DISKS];
373 + dev_info_t *smallest;
378 +typedef struct linear_private_data linear_conf_t;
380 +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
383 --- linux/include/linux/raid/md.h.orig Tue Jan 16 13:42:03 2001
384 +++ linux/include/linux/raid/md.h Tue Jan 16 13:47:20 2001
387 + md.h : Multiple Devices driver for Linux
388 + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
389 + Copyright (C) 1994-96 Marc ZYNGIER
390 + <zyngier@ufr-info-p7.ibp.fr> or
391 + <maz@gloups.fdn.fr>
393 + This program is free software; you can redistribute it and/or modify
394 + it under the terms of the GNU General Public License as published by
395 + the Free Software Foundation; either version 2, or (at your option)
398 + You should have received a copy of the GNU General Public License
399 + (for example /usr/src/linux/COPYING); if not, write to the Free
400 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
406 +#include <linux/mm.h>
407 +#include <linux/fs.h>
408 +#include <linux/blkdev.h>
409 +#include <asm/semaphore.h>
410 +#include <linux/major.h>
411 +#include <linux/ioctl.h>
412 +#include <linux/types.h>
413 +#include <asm/bitops.h>
414 +#include <linux/module.h>
415 +#include <linux/mm.h>
416 +#include <linux/hdreg.h>
417 +#include <linux/sysctl.h>
418 +#include <linux/fs.h>
419 +#include <linux/proc_fs.h>
420 +#include <linux/smp_lock.h>
421 +#include <linux/delay.h>
422 +#include <net/checksum.h>
423 +#include <linux/random.h>
424 +#include <linux/locks.h>
427 +#include <linux/raid/md_compatible.h>
429 + * 'md_p.h' holds the 'physical' layout of RAID devices
430 + * 'md_u.h' holds the user <=> kernel API
432 + * 'md_k.h' holds kernel internal definitions
435 +#include <linux/raid/md_p.h>
436 +#include <linux/raid/md_u.h>
437 +#include <linux/raid/md_k.h>
440 + * Different major versions are not compatible.
441 + * Different minor versions are only downward compatible.
442 + * Different patchlevel versions are downward and upward compatible.
444 +#define MD_MAJOR_VERSION 0
445 +#define MD_MINOR_VERSION 90
446 +#define MD_PATCHLEVEL_VERSION 0
448 +extern int md_size[MAX_MD_DEVS];
449 +extern struct hd_struct md_hd_struct[MAX_MD_DEVS];
451 +extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
452 +extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev);
453 +extern char * partition_name (kdev_t dev);
454 +extern int register_md_personality (int p_num, mdk_personality_t *p);
455 +extern int unregister_md_personality (int p_num);
456 +extern mdk_thread_t * md_register_thread (void (*run) (void *data),
457 + void *data, const char *name);
458 +extern void md_unregister_thread (mdk_thread_t *thread);
459 +extern void md_wakeup_thread(mdk_thread_t *thread);
460 +extern void md_interrupt_thread (mdk_thread_t *thread);
461 +extern int md_update_sb (mddev_t *mddev);
462 +extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
463 +extern void md_recover_arrays (void);
464 +extern int md_check_ordering (mddev_t *mddev);
465 +extern void autodetect_raid(void);
466 +extern struct gendisk * find_gendisk (kdev_t dev);
467 +extern int md_notify_reboot(struct notifier_block *this,
468 + unsigned long code, void *x);
469 +#if CONFIG_BLK_DEV_MD
470 +extern void raid_setup(char *str,int *ints) md__init;
472 +#ifdef CONFIG_MD_BOOT
473 +extern void md_setup(char *str,int *ints) md__init;
476 +extern void md_print_devices (void);
478 +#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
482 --- linux/include/linux/raid/md_compatible.h.orig Tue Jan 16 13:42:03 2001
483 +++ linux/include/linux/raid/md_compatible.h Tue Jan 16 13:47:19 2001
487 + md.h : Multiple Devices driver compatibility layer for Linux 2.0/2.2
488 + Copyright (C) 1998 Ingo Molnar
490 + This program is free software; you can redistribute it and/or modify
491 + it under the terms of the GNU General Public License as published by
492 + the Free Software Foundation; either version 2, or (at your option)
495 + You should have received a copy of the GNU General Public License
496 + (for example /usr/src/linux/COPYING); if not, write to the Free
497 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
500 +#include <linux/version.h>
502 +#ifndef _MD_COMPATIBLE_H
503 +#define _MD_COMPATIBLE_H
505 +#define LinuxVersionCode(v, p, s) (((v)<<16)+((p)<<8)+(s))
507 +#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
510 +#define md__get_free_pages(x,y) __get_free_pages(x,y,GFP_KERNEL)
514 +extern __inline__ int md_cpu_has_mmx(void)
516 + return x86_capability & 0x00800000;
521 +#define md_clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
525 + * someone please suggest a sane compatibility layer for modules
527 +#define MD_EXPORT_SYMBOL(x)
530 +static inline unsigned long
531 +md_copy_from_user(void *to, const void *from, unsigned long n)
535 + err = verify_area(VERIFY_READ,from,n);
537 + memcpy_fromfs(to, from, n);
542 +extern inline unsigned long
543 +md_copy_to_user(void *to, const void *from, unsigned long n)
547 + err = verify_area(VERIFY_WRITE,to,n);
549 + memcpy_tofs(to, from, n);
554 +#define md_put_user(x,ptr) \
558 + __err = verify_area(VERIFY_WRITE,ptr,sizeof(*ptr)); \
565 +extern inline int md_capable_admin(void)
571 +#define MD_FILE_TO_INODE(file) ((file)->f_inode)
574 +extern inline void md_flush_signals (void)
576 + current->signal = 0;
580 +#define __S(nr) (1<<((nr)-1))
581 +extern inline void md_init_signals (void)
583 + current->exit_signal = SIGCHLD;
584 + current->blocked = ~(__S(SIGKILL));
589 +extern inline unsigned long md_signal_pending (struct task_struct * tsk)
591 + return (tsk->signal & ~tsk->blocked);
595 +#define md_set_global_readahead(x) read_ahead[MD_MAJOR] = MD_READAHEAD
598 +#define md_mdelay(n) (\
599 + {unsigned long msec=(n); while (msec--) udelay(1000);})
602 +#define MD_SYS_DOWN 0
603 +#define MD_SYS_HALT 0
604 +#define MD_SYS_POWER_OFF 0
607 +#define md_register_reboot_notifier(x)
610 +extern __inline__ unsigned long
611 +md_test_and_set_bit(int nr, void * addr)
613 + unsigned long flags;
614 + unsigned long oldbit;
618 + oldbit = test_bit(nr,addr);
620 + restore_flags(flags);
625 +extern __inline__ unsigned long
626 +md_test_and_clear_bit(int nr, void * addr)
628 + unsigned long flags;
629 + unsigned long oldbit;
633 + oldbit = test_bit(nr,addr);
634 + clear_bit(nr,addr);
635 + restore_flags(flags);
640 +#define md_atomic_read(x) (*(volatile int *)(x))
641 +#define md_atomic_set(x,y) (*(volatile int *)(x) = (y))
644 +extern __inline__ void md_lock_kernel (void)
652 +extern __inline__ void md_unlock_kernel (void)
662 +#define md__initdata
663 +#define md__initfunc(__arginit) __arginit
669 +struct md_list_head {
670 + struct md_list_head *next, *prev;
673 +#define MD_LIST_HEAD(name) \
674 + struct md_list_head name = { &name, &name }
676 +#define MD_INIT_LIST_HEAD(ptr) do { \
677 + (ptr)->next = (ptr); (ptr)->prev = (ptr); \
680 +static __inline__ void md__list_add(struct md_list_head * new,
681 + struct md_list_head * prev,
682 + struct md_list_head * next)
690 +static __inline__ void md_list_add(struct md_list_head *new,
691 + struct md_list_head *head)
693 + md__list_add(new, head, head->next);
696 +static __inline__ void md__list_del(struct md_list_head * prev,
697 + struct md_list_head * next)
703 +static __inline__ void md_list_del(struct md_list_head *entry)
705 + md__list_del(entry->prev, entry->next);
708 +static __inline__ int md_list_empty(struct md_list_head *head)
710 + return head->next == head;
713 +#define md_list_entry(ptr, type, member) \
714 + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
718 +static __inline__ signed long md_schedule_timeout(signed long timeout)
720 + current->timeout = jiffies + timeout;
726 +#define md_need_resched(tsk) (need_resched)
729 +typedef struct { int gcc_is_buggy; } md_spinlock_t;
730 +#define MD_SPIN_LOCK_UNLOCKED (md_spinlock_t) { 0 }
732 +#define md_spin_lock_irq cli
733 +#define md_spin_unlock_irq sti
734 +#define md_spin_unlock_irqrestore(x,flags) restore_flags(flags)
735 +#define md_spin_lock_irqsave(x,flags) do { save_flags(flags); cli(); } while (0)
741 +#include <linux/reboot.h>
742 +#include <linux/vmalloc.h>
745 +#define md__get_free_pages(x,y) __get_free_pages(x,y)
749 +extern __inline__ int md_cpu_has_mmx(void)
751 + return boot_cpu_data.x86_capability & X86_FEATURE_MMX;
756 +#define md_clear_page(page) clear_page(page)
759 +#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x)
762 +#define md_copy_to_user(x,y,z) copy_to_user(x,y,z)
765 +#define md_copy_from_user(x,y,z) copy_from_user(x,y,z)
768 +#define md_put_user put_user
771 +extern inline int md_capable_admin(void)
773 + return capable(CAP_SYS_ADMIN);
777 +#define MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode)
780 +extern inline void md_flush_signals (void)
782 + spin_lock(¤t->sigmask_lock);
783 + flush_signals(current);
784 + spin_unlock(¤t->sigmask_lock);
788 +extern inline void md_init_signals (void)
790 + current->exit_signal = SIGCHLD;
791 + siginitsetinv(¤t->blocked, sigmask(SIGKILL));
795 +#define md_signal_pending signal_pending
798 +extern inline void md_set_global_readahead(int * table)
800 + max_readahead[MD_MAJOR] = table;
804 +#define md_mdelay(x) mdelay(x)
807 +#define MD_SYS_DOWN SYS_DOWN
808 +#define MD_SYS_HALT SYS_HALT
809 +#define MD_SYS_POWER_OFF SYS_POWER_OFF
812 +#define md_register_reboot_notifier register_reboot_notifier
815 +#define md_test_and_set_bit test_and_set_bit
818 +#define md_test_and_clear_bit test_and_clear_bit
821 +#define md_atomic_read atomic_read
822 +#define md_atomic_set atomic_set
825 +#define md_lock_kernel lock_kernel
826 +#define md_unlock_kernel unlock_kernel
830 +#include <linux/init.h>
832 +#define md__init __init
833 +#define md__initdata __initdata
834 +#define md__initfunc(__arginit) __initfunc(__arginit)
841 +#define md_list_head list_head
842 +#define MD_LIST_HEAD(name) LIST_HEAD(name)
843 +#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr)
844 +#define md_list_add list_add
845 +#define md_list_del list_del
846 +#define md_list_empty list_empty
848 +#define md_list_entry(ptr, type, member) list_entry(ptr, type, member)
852 +#define md_schedule_timeout schedule_timeout
855 +#define md_need_resched(tsk) ((tsk)->need_resched)
858 +#define md_spinlock_t spinlock_t
859 +#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED
861 +#define md_spin_lock_irq spin_lock_irq
862 +#define md_spin_unlock_irq spin_unlock_irq
863 +#define md_spin_unlock_irqrestore spin_unlock_irqrestore
864 +#define md_spin_lock_irqsave spin_lock_irqsave
870 +#endif /* _MD_COMPATIBLE_H */
872 --- linux/include/linux/raid/md_k.h.orig Tue Jan 16 13:42:03 2001
873 +++ linux/include/linux/raid/md_k.h Tue Jan 16 13:42:03 2001
876 + md_k.h : kernel internal structure of the Linux MD driver
877 + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
879 + This program is free software; you can redistribute it and/or modify
880 + it under the terms of the GNU General Public License as published by
881 + the Free Software Foundation; either version 2, or (at your option)
884 + You should have received a copy of the GNU General Public License
885 + (for example /usr/src/linux/COPYING); if not, write to the Free
886 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
892 +#define MD_RESERVED 0UL
895 +#define RAID0 STRIPED
898 +#define TRANSLUCENT 5UL
900 +#define MAX_PERSONALITY 7UL
902 +extern inline int pers_to_level (int pers)
905 + case HSM: return -3;
906 + case TRANSLUCENT: return -2;
907 + case LINEAR: return -1;
908 + case RAID0: return 0;
909 + case RAID1: return 1;
910 + case RAID5: return 5;
912 + panic("pers_to_level()");
915 +extern inline int level_to_pers (int level)
918 + case -3: return HSM;
919 + case -2: return TRANSLUCENT;
920 + case -1: return LINEAR;
921 + case 0: return RAID0;
922 + case 1: return RAID1;
924 + case 5: return RAID5;
926 + return MD_RESERVED;
929 +typedef struct mddev_s mddev_t;
930 +typedef struct mdk_rdev_s mdk_rdev_t;
932 +#if (MINORBITS != 8)
933 +#error MD doesnt handle bigger kdev yet
936 +#define MAX_REAL 12 /* Max number of disks per md dev */
937 +#define MAX_MD_DEVS (1<<MINORBITS) /* Max number of md dev */
940 + * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
941 + * the personality. (eg. HSM uses this to identify individual LVs)
943 +typedef struct dev_mapping_s {
948 +extern dev_mapping_t mddev_map [MAX_MD_DEVS];
950 +extern inline mddev_t * kdev_to_mddev (kdev_t dev)
952 + return mddev_map[MINOR(dev)].mddev;
956 + * options passed in raidrun:
959 +#define MAX_CHUNK_SIZE (4096*1024)
962 + * default readahead
964 +#define MD_READAHEAD (256 * 512)
966 +extern inline int disk_faulty(mdp_disk_t * d)
968 + return d->state & (1 << MD_DISK_FAULTY);
971 +extern inline int disk_active(mdp_disk_t * d)
973 + return d->state & (1 << MD_DISK_ACTIVE);
976 +extern inline int disk_sync(mdp_disk_t * d)
978 + return d->state & (1 << MD_DISK_SYNC);
981 +extern inline int disk_spare(mdp_disk_t * d)
983 + return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
986 +extern inline int disk_removed(mdp_disk_t * d)
988 + return d->state & (1 << MD_DISK_REMOVED);
991 +extern inline void mark_disk_faulty(mdp_disk_t * d)
993 + d->state |= (1 << MD_DISK_FAULTY);
996 +extern inline void mark_disk_active(mdp_disk_t * d)
998 + d->state |= (1 << MD_DISK_ACTIVE);
1001 +extern inline void mark_disk_sync(mdp_disk_t * d)
1003 + d->state |= (1 << MD_DISK_SYNC);
1006 +extern inline void mark_disk_spare(mdp_disk_t * d)
1011 +extern inline void mark_disk_removed(mdp_disk_t * d)
1013 + d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
1016 +extern inline void mark_disk_inactive(mdp_disk_t * d)
1018 + d->state &= ~(1 << MD_DISK_ACTIVE);
1021 +extern inline void mark_disk_nonsync(mdp_disk_t * d)
1023 + d->state &= ~(1 << MD_DISK_SYNC);
1027 + * MD's 'extended' device
1031 + struct md_list_head same_set; /* RAID devices within the same set */
1032 + struct md_list_head all; /* all RAID devices */
1033 + struct md_list_head pending; /* undetected RAID devices */
1035 + kdev_t dev; /* Device number */
1036 + kdev_t old_dev; /* "" when it was last imported */
1037 + int size; /* Device size (in blocks) */
1038 + mddev_t *mddev; /* RAID array if running */
1039 + unsigned long last_events; /* IO event timestamp */
1041 + struct inode *inode; /* Lock inode */
1042 + struct file filp; /* Lock file */
1047 + int faulty; /* if faulty do not issue IO requests */
1048 + int desc_nr; /* descriptor index in the superblock */
1053 + * disk operations in a working array:
1055 +#define DISKOP_SPARE_INACTIVE 0
1056 +#define DISKOP_SPARE_WRITE 1
1057 +#define DISKOP_SPARE_ACTIVE 2
1058 +#define DISKOP_HOT_REMOVE_DISK 3
1059 +#define DISKOP_HOT_ADD_DISK 4
1061 +typedef struct mdk_personality_s mdk_personality_t;
1066 + mdk_personality_t *pers;
1070 + struct md_list_head disks;
1072 + mdu_param_t param;
1074 + unsigned int curr_resync;
1075 + unsigned long resync_start;
1077 + int recovery_running;
1078 + struct semaphore reconfig_sem;
1079 + struct semaphore recovery_sem;
1080 + struct semaphore resync_sem;
1081 + struct md_list_head all_mddevs;
1084 +struct mdk_personality_s
1087 + int (*map)(mddev_t *mddev, kdev_t dev, kdev_t *rdev,
1088 + unsigned long *rsector, unsigned long size);
1089 + int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh);
1090 + void (*end_request)(struct buffer_head * bh, int uptodate);
1091 + int (*run)(mddev_t *mddev);
1092 + int (*stop)(mddev_t *mddev);
1093 + int (*status)(char *page, mddev_t *mddev);
1094 + int (*ioctl)(struct inode *inode, struct file *file,
1095 + unsigned int cmd, unsigned long arg);
1096 + int max_invalid_dev;
1097 + int (*error_handler)(mddev_t *mddev, kdev_t dev);
1100 + * Some personalities (RAID-1, RAID-5) can have disks hot-added and
1101 + * hot-removed. Hot removal is different from failure. (failure marks
1102 + * a disk inactive, but the disk is still part of the array) The interface
1103 + * to such operations is the 'pers->diskop()' function, can be NULL.
1105 + * the diskop function can change the pointer pointing to the incoming
1106 + * descriptor, but must do so very carefully. (currently only
1107 + * SPARE_ACTIVE expects such a change)
1109 + int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
1111 + int (*stop_resync)(mddev_t *mddev);
1112 + int (*restart_resync)(mddev_t *mddev);
1117 + * Currently we index md_array directly, based on the minor
1118 + * number. This will have to change to dynamic allocation
1119 + * once we start supporting partitioning of md devices.
1121 +extern inline int mdidx (mddev_t * mddev)
1123 + return mddev->__minor;
1126 +extern inline kdev_t mddev_to_kdev(mddev_t * mddev)
1128 + return MKDEV(MD_MAJOR, mdidx(mddev));
1131 +extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev);
1132 +extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
1135 + * iterates through some rdev ringlist. It's safe to remove the
1136 + * current 'rdev'. Dont touch 'tmp' though.
1138 +#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \
1140 + for (tmp = head.next; \
1141 + rdev = md_list_entry(tmp, mdk_rdev_t, field), \
1142 + tmp = tmp->next, tmp->prev != &head \
1145 + * iterates through the 'same array disks' ringlist
1147 +#define ITERATE_RDEV(mddev,rdev,tmp) \
1148 + ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
1151 + * Same as above, but assumes that the device has rdev->desc_nr numbered
1152 + * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order.
1154 +#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \
1155 + for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
1159 + * Iterates through all 'RAID managed disks'
1161 +#define ITERATE_RDEV_ALL(rdev,tmp) \
1162 + ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
1165 + * Iterates through 'pending RAID disks'
1167 +#define ITERATE_RDEV_PENDING(rdev,tmp) \
1168 + ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
1171 + * iterates through all used mddevs in the system.
1173 +#define ITERATE_MDDEV(mddev,tmp) \
1175 + for (tmp = all_mddevs.next; \
1176 + mddev = md_list_entry(tmp, mddev_t, all_mddevs), \
1177 + tmp = tmp->next, tmp->prev != &all_mddevs \
1180 +extern inline int lock_mddev (mddev_t * mddev)
1182 + return down_interruptible(&mddev->reconfig_sem);
1185 +extern inline void unlock_mddev (mddev_t * mddev)
1187 + up(&mddev->reconfig_sem);
1190 +#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
1191 + x = y; y = __tmp; } while (0)
1193 +typedef struct mdk_thread_s {
1194 + void (*run) (void *data);
1196 + struct wait_queue *wqueue;
1197 + unsigned long flags;
1198 + struct semaphore *sem;
1199 + struct task_struct *tsk;
1203 +#define THREAD_WAKEUP 0
1205 +typedef struct dev_name_s {
1206 + struct md_list_head list;
1208 + char name [MAX_DISKNAME_LEN];
1213 --- linux/include/linux/raid/md_p.h.orig Tue Jan 16 13:42:03 2001
1214 +++ linux/include/linux/raid/md_p.h Tue Jan 16 13:42:03 2001
1217 + md_p.h : physical layout of Linux RAID devices
1218 + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
1220 + This program is free software; you can redistribute it and/or modify
1221 + it under the terms of the GNU General Public License as published by
1222 + the Free Software Foundation; either version 2, or (at your option)
1223 + any later version.
1225 + You should have received a copy of the GNU General Public License
1226 + (for example /usr/src/linux/COPYING); if not, write to the Free
1227 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1234 + * RAID superblock.
1236 + * The RAID superblock maintains some statistics on each RAID configuration.
1237 + * Each real device in the RAID set contains it near the end of the device.
1238 + * Some of the ideas are copied from the ext2fs implementation.
1240 + * We currently use 4096 bytes as follows:
1242 + * word offset function
1244 + * 0 - 31 Constant generic RAID device information.
1245 + * 32 - 63 Generic state information.
1246 + * 64 - 127 Personality specific information.
1247 + * 128 - 511 12 32-words descriptors of the disks in the raid set.
1248 + * 512 - 911 Reserved.
1249 + * 912 - 1023 Disk specific descriptor.
1253 + * If x is the real device size in bytes, we return an apparent size of:
1255 + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
1257 + * and place the 4kB superblock at offset y.
1259 +#define MD_RESERVED_BYTES (64 * 1024)
1260 +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
1261 +#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
1263 +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
1264 +#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
1266 +#define MD_SB_BYTES 4096
1267 +#define MD_SB_WORDS (MD_SB_BYTES / 4)
1268 +#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
1269 +#define MD_SB_SECTORS (MD_SB_BYTES / 512)
1272 + * The following are counted in 32-bit words
1274 +#define MD_SB_GENERIC_OFFSET 0
1275 +#define MD_SB_PERSONALITY_OFFSET 64
1276 +#define MD_SB_DISKS_OFFSET 128
1277 +#define MD_SB_DESCRIPTOR_OFFSET 992
1279 +#define MD_SB_GENERIC_CONSTANT_WORDS 32
1280 +#define MD_SB_GENERIC_STATE_WORDS 32
1281 +#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
1282 +#define MD_SB_PERSONALITY_WORDS 64
1283 +#define MD_SB_DISKS_WORDS 384
1284 +#define MD_SB_DESCRIPTOR_WORDS 32
1285 +#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
1286 +#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
1287 +#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
1290 + * Device "operational" state bits
1292 +#define MD_DISK_FAULTY 0 /* disk is faulty / operational */
1293 +#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */
1294 +#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
1295 +#define MD_DISK_REMOVED 3 /* disk is being removed from the raid set */
1297 +typedef struct mdp_device_descriptor_s {
1298 + __u32 number; /* 0 Device number in the entire set */
1299 + __u32 major; /* 1 Device major number */
1300 + __u32 minor; /* 2 Device minor number */
1301 + __u32 raid_disk; /* 3 The role of the device in the raid set */
1302 + __u32 state; /* 4 Operational state */
1303 + __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
1306 +#define MD_SB_MAGIC 0xa92b4efc
1309 + * Superblock state bits
1311 +#define MD_SB_CLEAN 0
1312 +#define MD_SB_ERRORS 1
1314 +typedef struct mdp_superblock_s {
1316 + * Constant generic information
1318 + __u32 md_magic; /* 0 MD identifier */
1319 + __u32 major_version; /* 1 major version to which the set conforms */
1320 + __u32 minor_version; /* 2 minor version ... */
1321 + __u32 patch_version; /* 3 patchlevel version ... */
1322 + __u32 gvalid_words; /* 4 Number of used words in this section */
1323 + __u32 set_uuid0; /* 5 Raid set identifier */
1324 + __u32 ctime; /* 6 Creation time */
1325 + __u32 level; /* 7 Raid personality */
1326 + __u32 size; /* 8 Apparent size of each individual disk */
1327 + __u32 nr_disks; /* 9 total disks in the raid set */
1328 + __u32 raid_disks; /* 10 disks in a fully functional raid set */
1329 + __u32 md_minor; /* 11 preferred MD minor device number */
1330 + __u32 not_persistent; /* 12 does it have a persistent superblock */
1331 + __u32 set_uuid1; /* 13 Raid set identifier #2 */
1332 + __u32 set_uuid2; /* 14 Raid set identifier #3 */
1333 +	__u32 set_uuid3;	/* 15 Raid set identifier #4		    */
1334 + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
1337 + * Generic state information
1339 + __u32 utime; /* 0 Superblock update time */
1340 + __u32 state; /* 1 State bits (clean, ...) */
1341 + __u32 active_disks; /* 2 Number of currently active disks */
1342 + __u32 working_disks; /* 3 Number of working disks */
1343 + __u32 failed_disks; /* 4 Number of failed disks */
1344 + __u32 spare_disks; /* 5 Number of spare disks */
1345 + __u32 sb_csum; /* 6 checksum of the whole superblock */
1346 + __u64 events; /* 7 number of superblock updates (64-bit!) */
1347 + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9];
1350 + * Personality information
1352 + __u32 layout; /* 0 the array's physical layout */
1353 + __u32 chunk_size; /* 1 chunk size in bytes */
1354 + __u32 root_pv; /* 2 LV root PV */
1355 + __u32 root_block; /* 3 LV root block */
1356 + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
1359 + * Disks information
1361 + mdp_disk_t disks[MD_SB_DISKS];
1366 + __u32 reserved[MD_SB_RESERVED_WORDS];
1369 + * Active descriptor
1371 + mdp_disk_t this_disk;
1377 --- linux/include/linux/raid/md_u.h.orig Tue Jan 16 13:42:03 2001
1378 +++ linux/include/linux/raid/md_u.h Tue Jan 16 13:42:03 2001
1381 + md_u.h : user <=> kernel API between Linux raidtools and RAID drivers
1382 + Copyright (C) 1998 Ingo Molnar
1384 + This program is free software; you can redistribute it and/or modify
1385 + it under the terms of the GNU General Public License as published by
1386 + the Free Software Foundation; either version 2, or (at your option)
1387 + any later version.
1389 + You should have received a copy of the GNU General Public License
1390 + (for example /usr/src/linux/COPYING); if not, write to the Free
1391 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1400 +#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t)
1401 +#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t)
1402 +#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t)
1403 +#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13)
1405 +/* configuration */
1406 +#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20)
1407 +#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t)
1408 +#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22)
1409 +#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t)
1410 +#define SET_DISK_INFO _IO (MD_MAJOR, 0x24)
1411 +#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25)
1412 +#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26)
1413 +#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27)
1414 +#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28)
1415 +#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29)
1418 +#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t)
1419 +#define START_ARRAY _IO (MD_MAJOR, 0x31)
1420 +#define STOP_ARRAY _IO (MD_MAJOR, 0x32)
1421 +#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33)
1422 +#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34)
1424 +typedef struct mdu_version_s {
1430 +typedef struct mdu_array_info_s {
1432 + * Generic constant information
1434 + int major_version;
1435 + int minor_version;
1436 + int patch_version;
1443 + int not_persistent;
1446 + * Generic state information
1448 + int utime; /* 0 Superblock update time */
1449 + int state; /* 1 State bits (clean, ...) */
1450 + int active_disks; /* 2 Number of currently active disks */
1451 + int working_disks; /* 3 Number of working disks */
1452 + int failed_disks; /* 4 Number of failed disks */
1453 + int spare_disks; /* 5 Number of spare disks */
1456 + * Personality information
1458 + int layout; /* 0 the array's physical layout */
1459 + int chunk_size; /* 1 chunk size in bytes */
1461 +} mdu_array_info_t;
1463 +typedef struct mdu_disk_info_s {
1465 + * configuration/status of one particular disk
1475 +typedef struct mdu_start_info_s {
1477 + * configuration/status of one particular disk
1484 +} mdu_start_info_t;
1486 +typedef struct mdu_param_s
1488 + int personality; /* 1,2,3,4 */
1489 + int chunk_size; /* in bytes */
1490 + int max_fault; /* unused for now */
1495 --- linux/include/linux/raid/raid0.h.orig Tue Jan 16 13:42:03 2001
1496 +++ linux/include/linux/raid/raid0.h Tue Jan 16 13:44:37 2001
1501 +#include <linux/raid/md.h>
1505 + int zone_offset; /* Zone offset in md_dev */
1506 + int dev_offset; /* Zone offset in real dev */
1507 + int size; /* Zone size */
1508 + int nb_dev; /* # of devices attached to the zone */
1509 + mdk_rdev_t *dev[MAX_REAL]; /* Devices attached to the zone */
1514 + struct strip_zone *zone0, *zone1;
1517 +struct raid0_private_data
1519 + struct raid0_hash *hash_table; /* Dynamically allocated */
1520 + struct strip_zone *strip_zone; /* This one too */
1521 + int nr_strip_zones;
1522 + struct strip_zone *smallest;
1526 +typedef struct raid0_private_data raid0_conf_t;
1528 +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
1531 --- linux/include/linux/raid/raid1.h.orig Tue Jan 16 13:42:03 2001
1532 +++ linux/include/linux/raid/raid1.h Tue Jan 16 13:44:37 2001
1537 +#include <linux/raid/md.h>
1539 +struct mirror_info {
1556 +struct raid1_private_data {
1558 + struct mirror_info mirrors[MD_SB_DISKS];
1561 + int working_disks;
1563 + unsigned long next_sect;
1565 + mdk_thread_t *thread, *resync_thread;
1566 + int resync_mirrors;
1567 + struct mirror_info *spare;
1570 +typedef struct raid1_private_data raid1_conf_t;
1573 + * this is the only point in the RAID code where we violate
1574 + * C type safety. mddev->private is an 'opaque' pointer.
1576 +#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private)
1579 + * this is our 'private' 'collective' RAID1 buffer head.
1580 + * it contains information about what kind of IO operations were started
1581 + * for this RAID1 operation, and about their status:
1585 + atomic_t remaining; /* 'have we finished' count,
1586 + * used from IRQ handlers
1589 + unsigned long state;
1591 + struct buffer_head *master_bh;
1592 + struct buffer_head *mirror_bh [MD_SB_DISKS];
1593 + struct buffer_head bh_req;
1594 + struct buffer_head *next_retry;
1598 --- linux/include/linux/raid/raid5.h.orig Tue Jan 16 13:42:03 2001
1599 +++ linux/include/linux/raid/raid5.h Tue Jan 16 13:44:37 2001
1604 +#include <linux/raid/md.h>
1605 +#include <linux/raid/xor.h>
1617 +struct stripe_head {
1618 + md_spinlock_t stripe_lock;
1619 + struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
1620 + struct stripe_head *free_next; /* pool of free sh's */
1621 + struct buffer_head *buffer_pool; /* pool of free buffers */
1622 + struct buffer_head *bh_pool; /* pool of free bh's */
1623 + struct raid5_private_data *raid_conf;
1624 + struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
1625 + struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
1626 + struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
1627 + struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
1628 + int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
1629 + int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
1630 + unsigned long sector; /* sector of this row */
1631 + int size; /* buffers size */
1632 + int pd_idx; /* parity disk index */
1633 + atomic_t nr_pending; /* nr of pending cmds */
1634 + unsigned long state; /* state flags */
1635 + int cmd; /* stripe cmd */
1636 + int count; /* nr of waiters */
1637 + int write_method; /* reconstruct-write / read-modify-write */
1638 + int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
1639 + struct wait_queue *wait; /* processes waiting for this stripe */
1645 +#define PHASE_BEGIN 0
1646 +#define PHASE_READ_OLD 1
1647 +#define PHASE_WRITE 2
1648 +#define PHASE_READ 3
1649 +#define PHASE_COMPLETE 4
1654 +#define METHOD_NONE 0
1655 +#define RECONSTRUCT_WRITE 1
1656 +#define READ_MODIFY_WRITE 2
1661 +#define STRIPE_LOCKED 0
1662 +#define STRIPE_ERROR 1
1667 +#define STRIPE_NONE 0
1668 +#define STRIPE_WRITE 1
1669 +#define STRIPE_READ 2
1671 +struct raid5_private_data {
1672 + struct stripe_head **stripe_hashtbl;
1674 + mdk_thread_t *thread, *resync_thread;
1675 + struct disk_info disks[MD_SB_DISKS];
1676 + struct disk_info *spare;
1678 + int chunk_size, level, algorithm;
1679 + int raid_disks, working_disks, failed_disks;
1681 + unsigned long next_sector;
1682 + atomic_t nr_handle;
1683 + struct stripe_head *next_free_stripe;
1685 + int resync_parity;
1686 + int max_nr_stripes;
1688 + int nr_hashed_stripes;
1689 + int nr_locked_stripes;
1690 + int nr_pending_stripes;
1691 + int nr_cached_stripes;
1694 + * Free stripes pool
1697 + struct stripe_head *free_sh_list;
1698 + struct wait_queue *wait_for_stripe;
1701 +typedef struct raid5_private_data raid5_conf_t;
1703 +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
1706 + * Our supported algorithms
1708 +#define ALGORITHM_LEFT_ASYMMETRIC 0
1709 +#define ALGORITHM_RIGHT_ASYMMETRIC 1
1710 +#define ALGORITHM_LEFT_SYMMETRIC 2
1711 +#define ALGORITHM_RIGHT_SYMMETRIC 3
1714 --- linux/include/linux/raid/translucent.h.orig Tue Jan 16 13:42:03 2001
1715 +++ linux/include/linux/raid/translucent.h Tue Jan 16 13:42:03 2001
1717 +#ifndef _TRANSLUCENT_H
1718 +#define _TRANSLUCENT_H
1720 +#include <linux/raid/md.h>
1722 +typedef struct dev_info dev_info_t;
1729 +struct translucent_private_data
1731 + dev_info_t disks[MD_SB_DISKS];
1735 +typedef struct translucent_private_data translucent_conf_t;
1737 +#define mddev_to_conf(mddev) ((translucent_conf_t *) mddev->private)
1740 --- linux/include/linux/raid/xor.h.orig Tue Jan 16 13:42:03 2001
1741 +++ linux/include/linux/raid/xor.h Tue Jan 16 13:44:35 2001
1746 +#include <linux/raid/md.h>
1748 +#define MAX_XOR_BLOCKS 5
1750 +extern void calibrate_xor_block(void);
1751 +extern void (*xor_block)(unsigned int count,
1752 + struct buffer_head **bh_ptr);
1755 --- linux/include/linux/blkdev.h.orig Mon Dec 11 01:49:44 2000
1756 +++ linux/include/linux/blkdev.h Tue Jan 16 13:47:19 2001
1758 extern void make_request(int major,int rw, struct buffer_head * bh);
1760 /* md needs this function to remap requests */
1761 -extern int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size);
1762 -extern int md_make_request (int minor, int rw, struct buffer_head * bh);
1763 +extern int md_map (kdev_t dev, kdev_t *rdev,
1764 + unsigned long *rsector, unsigned long size);
1765 +extern int md_make_request (struct buffer_head * bh, int rw);
1766 extern int md_error (kdev_t mddev, kdev_t rdev);
1768 extern int * blk_size[MAX_BLKDEV];
1769 --- linux/include/linux/fs.h.orig Tue Jan 16 13:30:09 2001
1770 +++ linux/include/linux/fs.h Tue Jan 16 13:47:18 2001
1772 #define BH_Req 3 /* 0 if the buffer has been invalidated */
1773 #define BH_Protected 6 /* 1 if the buffer is protected */
1774 #define BH_Wait_IO 7 /* 1 if we should throttle on this buffer */
1775 +#define BH_LowPrio 8 /* 1 if the buffer is lowprio */
1778 * Try to keep the most commonly used fields in single cache lines (16
1780 extern void refile_buffer(struct buffer_head * buf);
1781 extern void set_writetime(struct buffer_head * buf, int flag);
1782 extern int try_to_free_buffers(struct page *, int);
1783 +extern void cache_drop_behind(struct buffer_head *bh);
1785 extern int nr_buffers;
1786 extern long buffermem;
1787 @@ -814,6 +816,25 @@
1791 +extern inline void mark_buffer_highprio(struct buffer_head * bh)
1793 + clear_bit(BH_LowPrio, &bh->b_state);
1796 +extern inline void mark_buffer_lowprio(struct buffer_head * bh)
1799 + * dirty buffers cannot be marked lowprio.
1801 + if (!buffer_dirty(bh))
1802 + set_bit(BH_LowPrio, &bh->b_state);
1805 +static inline int buffer_lowprio(struct buffer_head * bh)
1807 + return test_bit(BH_LowPrio, &bh->b_state);
1810 extern inline void mark_buffer_dirty(struct buffer_head * bh, int flag)
1812 if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1813 @@ -821,6 +842,23 @@
1814 if (bh->b_list != BUF_DIRTY)
1818 + * if a buffer gets marked dirty then it has to lose
1819 + * it's lowprio state.
1821 + mark_buffer_highprio(bh);
1824 +extern inline void mark_buffer_dirty_lowprio(struct buffer_head * bh)
1826 + if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1827 + if (bh->b_list != BUF_DIRTY)
1828 + refile_buffer(bh);
1830 + * Mark it lowprio only if it was not dirty before!
1832 + set_bit(BH_LowPrio, &bh->b_state);
1836 extern int check_disk_change(kdev_t dev);
1838 extern struct buffer_head * find_buffer(kdev_t dev, int block, int size);
1839 extern void ll_rw_block(int, int, struct buffer_head * bh[]);
1840 extern int is_read_only(kdev_t);
1841 +extern int is_device_idle(kdev_t);
1842 extern void __brelse(struct buffer_head *);
1843 extern inline void brelse(struct buffer_head *buf)
1845 @@ -913,8 +952,12 @@
1846 extern void set_blocksize(kdev_t dev, int size);
1847 extern unsigned int get_hardblocksize(kdev_t dev);
1848 extern struct buffer_head * bread(kdev_t dev, int block, int size);
1849 +extern struct buffer_head * buffer_ready (kdev_t dev, int block, int size);
1850 +extern void bread_ahead (kdev_t dev, int block, int size);
1851 extern struct buffer_head * breada(kdev_t dev,int block, int size,
1852 unsigned int pos, unsigned int filesize);
1853 +extern struct buffer_head * breada_blocks(kdev_t dev,int block,
1854 + int size, int blocks);
1856 extern int brw_page(int, struct page *, kdev_t, int [], int, int);
1858 --- linux/include/linux/md.h.orig Fri May 8 09:17:13 1998
1859 +++ linux/include/linux/md.h Tue Jan 16 13:42:03 2001
1862 - md.h : Multiple Devices driver for Linux
1863 - Copyright (C) 1994-96 Marc ZYNGIER
1864 - <zyngier@ufr-info-p7.ibp.fr> or
1865 - <maz@gloups.fdn.fr>
1867 - This program is free software; you can redistribute it and/or modify
1868 - it under the terms of the GNU General Public License as published by
1869 - the Free Software Foundation; either version 2, or (at your option)
1870 - any later version.
1872 - You should have received a copy of the GNU General Public License
1873 - (for example /usr/src/linux/COPYING); if not, write to the Free
1874 - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1880 -#include <linux/major.h>
1881 -#include <linux/ioctl.h>
1882 -#include <linux/types.h>
1885 - * Different major versions are not compatible.
1886 - * Different minor versions are only downward compatible.
1887 - * Different patchlevel versions are downward and upward compatible.
1889 -#define MD_MAJOR_VERSION 0
1890 -#define MD_MINOR_VERSION 36
1891 -#define MD_PATCHLEVEL_VERSION 6
1893 -#define MD_DEFAULT_DISK_READAHEAD (256 * 1024)
1896 -#define REGISTER_DEV _IO (MD_MAJOR, 1)
1897 -#define START_MD _IO (MD_MAJOR, 2)
1898 -#define STOP_MD _IO (MD_MAJOR, 3)
1899 -#define REGISTER_DEV_NEW _IO (MD_MAJOR, 4)
1903 - Byte 0 : Chunk size factor
1904 - Byte 1 : Fault tolerance count for each physical device
1905 - ( 0 means no fault tolerance,
1906 - 0xFF means always tolerate faults), not used by now.
1907 - Byte 2 : Personality
1908 - Byte 3 : Reserved.
1911 -#define FAULT_SHIFT 8
1912 -#define PERSONALITY_SHIFT 16
1914 -#define FACTOR_MASK 0x000000FFUL
1915 -#define FAULT_MASK 0x0000FF00UL
1916 -#define PERSONALITY_MASK 0x00FF0000UL
1918 -#define MD_RESERVED 0 /* Not used by now */
1919 -#define LINEAR (1UL << PERSONALITY_SHIFT)
1920 -#define STRIPED (2UL << PERSONALITY_SHIFT)
1921 -#define RAID0 STRIPED
1922 -#define RAID1 (3UL << PERSONALITY_SHIFT)
1923 -#define RAID5 (4UL << PERSONALITY_SHIFT)
1924 -#define MAX_PERSONALITY 5
1929 - * The MD superblock maintains some statistics on each MD configuration.
1930 - * Each real device in the MD set contains it near the end of the device.
1931 - * Some of the ideas are copied from the ext2fs implementation.
1933 - * We currently use 4096 bytes as follows:
1935 - * word offset function
1937 - * 0 - 31 Constant generic MD device information.
1938 - * 32 - 63 Generic state information.
1939 - * 64 - 127 Personality specific information.
1940 - * 128 - 511 12 32-words descriptors of the disks in the raid set.
1941 - * 512 - 911 Reserved.
1942 - * 912 - 1023 Disk specific descriptor.
1946 - * If x is the real device size in bytes, we return an apparent size of:
1948 - * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
1950 - * and place the 4kB superblock at offset y.
1952 -#define MD_RESERVED_BYTES (64 * 1024)
1953 -#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
1954 -#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
1956 -#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
1957 -#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
1959 -#define MD_SB_BYTES 4096
1960 -#define MD_SB_WORDS (MD_SB_BYTES / 4)
1961 -#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
1962 -#define MD_SB_SECTORS (MD_SB_BYTES / 512)
1965 - * The following are counted in 32-bit words
1967 -#define MD_SB_GENERIC_OFFSET 0
1968 -#define MD_SB_PERSONALITY_OFFSET 64
1969 -#define MD_SB_DISKS_OFFSET 128
1970 -#define MD_SB_DESCRIPTOR_OFFSET 992
1972 -#define MD_SB_GENERIC_CONSTANT_WORDS 32
1973 -#define MD_SB_GENERIC_STATE_WORDS 32
1974 -#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
1975 -#define MD_SB_PERSONALITY_WORDS 64
1976 -#define MD_SB_DISKS_WORDS 384
1977 -#define MD_SB_DESCRIPTOR_WORDS 32
1978 -#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
1979 -#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
1980 -#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
1983 - * Device "operational" state bits
1985 -#define MD_FAULTY_DEVICE 0 /* Device is faulty / operational */
1986 -#define MD_ACTIVE_DEVICE 1 /* Device is a part or the raid set / spare disk */
1987 -#define MD_SYNC_DEVICE 2 /* Device is in sync with the raid set */
1989 -typedef struct md_device_descriptor_s {
1990 - __u32 number; /* 0 Device number in the entire set */
1991 - __u32 major; /* 1 Device major number */
1992 - __u32 minor; /* 2 Device minor number */
1993 - __u32 raid_disk; /* 3 The role of the device in the raid set */
1994 - __u32 state; /* 4 Operational state */
1995 - __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
1998 -#define MD_SB_MAGIC 0xa92b4efc
2001 - * Superblock state bits
2003 -#define MD_SB_CLEAN 0
2004 -#define MD_SB_ERRORS 1
2006 -typedef struct md_superblock_s {
2009 - * Constant generic information
2011 - __u32 md_magic; /* 0 MD identifier */
2012 - __u32 major_version; /* 1 major version to which the set conforms */
2013 - __u32 minor_version; /* 2 minor version to which the set conforms */
2014 - __u32 patch_version; /* 3 patchlevel version to which the set conforms */
2015 - __u32 gvalid_words; /* 4 Number of non-reserved words in this section */
2016 - __u32 set_magic; /* 5 Raid set identifier */
2017 - __u32 ctime; /* 6 Creation time */
2018 - __u32 level; /* 7 Raid personality (mirroring, raid5, ...) */
2019 - __u32 size; /* 8 Apparent size of each individual disk, in kB */
2020 - __u32 nr_disks; /* 9 Number of total disks in the raid set */
2021 - __u32 raid_disks; /* 10 Number of disks in a fully functional raid set */
2022 - __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 11];
2025 - * Generic state information
2027 - __u32 utime; /* 0 Superblock update time */
2028 - __u32 state; /* 1 State bits (clean, ...) */
2029 - __u32 active_disks; /* 2 Number of currently active disks (some non-faulty disks might not be in sync) */
2030 - __u32 working_disks; /* 3 Number of working disks */
2031 - __u32 failed_disks; /* 4 Number of failed disks */
2032 - __u32 spare_disks; /* 5 Number of spare disks */
2033 - __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 6];
2036 - * Personality information
2038 - __u32 parity_algorithm;
2040 - __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 2];
2043 - * Disks information
2045 - md_descriptor_t disks[MD_SB_DISKS];
2050 - __u32 reserved[MD_SB_RESERVED_WORDS];
2053 - * Active descriptor
2055 - md_descriptor_t descriptor;
2060 -#include <linux/mm.h>
2061 -#include <linux/fs.h>
2062 -#include <linux/blkdev.h>
2063 -#include <asm/semaphore.h>
2066 - * Kernel-based reconstruction is mostly working, but still requires
2067 - * some additional work.
2069 -#define SUPPORT_RECONSTRUCTION 0
2071 -#define MAX_REAL 8 /* Max number of physical dev per md dev */
2072 -#define MAX_MD_DEV 4 /* Max number of md dev */
2074 -#define FACTOR(a) ((a)->repartition & FACTOR_MASK)
2075 -#define MAX_FAULT(a) (((a)->repartition & FAULT_MASK)>>8)
2076 -#define PERSONALITY(a) ((a)->repartition & PERSONALITY_MASK)
2078 -#define FACTOR_SHIFT(a) (PAGE_SHIFT + (a) - 10)
2082 - kdev_t dev; /* Device number */
2083 - int size; /* Device size (in blocks) */
2084 - int offset; /* Real device offset (in blocks) in md dev
2085 - (only used in linear mode) */
2086 - struct inode *inode; /* Lock inode */
2087 - md_superblock_t *sb;
2093 -#define SPARE_INACTIVE 0
2094 -#define SPARE_WRITE 1
2095 -#define SPARE_ACTIVE 2
2097 -struct md_personality
2100 - int (*map)(struct md_dev *mddev, kdev_t *rdev,
2101 - unsigned long *rsector, unsigned long size);
2102 - int (*make_request)(struct md_dev *mddev, int rw, struct buffer_head * bh);
2103 - void (*end_request)(struct buffer_head * bh, int uptodate);
2104 - int (*run)(int minor, struct md_dev *mddev);
2105 - int (*stop)(int minor, struct md_dev *mddev);
2106 - int (*status)(char *page, int minor, struct md_dev *mddev);
2107 - int (*ioctl)(struct inode *inode, struct file *file,
2108 - unsigned int cmd, unsigned long arg);
2109 - int max_invalid_dev;
2110 - int (*error_handler)(struct md_dev *mddev, kdev_t dev);
2113 - * Some personalities (RAID-1, RAID-5) can get disks hot-added and
2114 - * hot-removed. Hot removal is different from failure. (failure marks
2115 - * a disk inactive, but the disk is still part of the array)
2117 - int (*hot_add_disk) (struct md_dev *mddev, kdev_t dev);
2118 - int (*hot_remove_disk) (struct md_dev *mddev, kdev_t dev);
2119 - int (*mark_spare) (struct md_dev *mddev, md_descriptor_t *descriptor, int state);
2124 - struct real_dev devices[MAX_REAL];
2125 - struct md_personality *pers;
2126 - md_superblock_t *sb;
2135 - void (*run) (void *data);
2137 - struct wait_queue *wqueue;
2138 - unsigned long flags;
2139 - struct semaphore *sem;
2140 - struct task_struct *tsk;
2143 -#define THREAD_WAKEUP 0
2145 -extern struct md_dev md_dev[MAX_MD_DEV];
2146 -extern int md_size[MAX_MD_DEV];
2147 -extern int md_maxreadahead[MAX_MD_DEV];
2149 -extern char *partition_name (kdev_t dev);
2151 -extern int register_md_personality (int p_num, struct md_personality *p);
2152 -extern int unregister_md_personality (int p_num);
2153 -extern struct md_thread *md_register_thread (void (*run) (void *data), void *data);
2154 -extern void md_unregister_thread (struct md_thread *thread);
2155 -extern void md_wakeup_thread(struct md_thread *thread);
2156 -extern int md_update_sb (int minor);
2157 -extern int md_do_sync(struct md_dev *mddev);
2161 --- linux/include/linux/raid0.h.orig Tue Oct 29 14:20:24 1996
2162 +++ linux/include/linux/raid0.h Tue Jan 16 13:42:03 2001
2169 - int zone_offset; /* Zone offset in md_dev */
2170 - int dev_offset; /* Zone offset in real dev */
2171 - int size; /* Zone size */
2172 - int nb_dev; /* Number of devices attached to the zone */
2173 - struct real_dev *dev[MAX_REAL]; /* Devices attached to the zone */
2178 - struct strip_zone *zone0, *zone1;
2183 - struct raid0_hash *hash_table; /* Dynamically allocated */
2184 - struct strip_zone *strip_zone; /* This one too */
2185 - int nr_strip_zones;
2186 - struct strip_zone *smallest;
2191 --- linux/include/linux/raid1.h.orig Fri May 8 09:17:13 1998
2192 +++ linux/include/linux/raid1.h Tue Jan 16 13:42:03 2001
2197 -#include <linux/md.h>
2199 -struct mirror_info {
2214 -struct raid1_data {
2215 - struct md_dev *mddev;
2216 - struct mirror_info mirrors[MD_SB_DISKS]; /* RAID1 devices, 2 to MD_SB_DISKS */
2218 - int working_disks; /* Number of working disks */
2220 - unsigned long next_sect;
2222 - int resync_running;
2226 - * this is our 'private' 'collective' RAID1 buffer head.
2227 - * it contains information about what kind of IO operations were started
2228 - * for this RAID5 operation, and about their status:
2232 - unsigned int remaining;
2234 - unsigned long state;
2235 - struct md_dev *mddev;
2236 - struct buffer_head *master_bh;
2237 - struct buffer_head *mirror_bh [MD_SB_DISKS];
2238 - struct buffer_head bh_req;
2239 - struct buffer_head *next_retry;
2243 --- linux/include/linux/raid5.h.orig Fri May 8 09:17:13 1998
2244 +++ linux/include/linux/raid5.h Tue Jan 16 13:42:03 2001
2250 -#include <linux/md.h>
2251 -#include <asm/atomic.h>
2262 -struct stripe_head {
2263 - struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
2264 - struct stripe_head *free_next; /* pool of free sh's */
2265 - struct buffer_head *buffer_pool; /* pool of free buffers */
2266 - struct buffer_head *bh_pool; /* pool of free bh's */
2267 - struct raid5_data *raid_conf;
2268 - struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
2269 - struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
2270 - struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
2271 - struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
2272 - int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
2273 - int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
2274 - unsigned long sector; /* sector of this row */
2275 - int size; /* buffers size */
2276 - int pd_idx; /* parity disk index */
2277 - int nr_pending; /* nr of pending cmds */
2278 - unsigned long state; /* state flags */
2279 - int cmd; /* stripe cmd */
2280 - int count; /* nr of waiters */
2281 - int write_method; /* reconstruct-write / read-modify-write */
2282 - int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
2283 - struct wait_queue *wait; /* processes waiting for this stripe */
2289 -#define PHASE_BEGIN 0
2290 -#define PHASE_READ_OLD 1
2291 -#define PHASE_WRITE 2
2292 -#define PHASE_READ 3
2293 -#define PHASE_COMPLETE 4
2298 -#define METHOD_NONE 0
2299 -#define RECONSTRUCT_WRITE 1
2300 -#define READ_MODIFY_WRITE 2
2305 -#define STRIPE_LOCKED 0
2306 -#define STRIPE_ERROR 1
2311 -#define STRIPE_NONE 0
2312 -#define STRIPE_WRITE 1
2313 -#define STRIPE_READ 2
2315 -struct raid5_data {
2316 - struct stripe_head **stripe_hashtbl;
2317 - struct md_dev *mddev;
2318 - struct md_thread *thread, *resync_thread;
2319 - struct disk_info disks[MD_SB_DISKS];
2320 - struct disk_info *spare;
2322 - int chunk_size, level, algorithm;
2323 - int raid_disks, working_disks, failed_disks;
2325 - unsigned long next_sector;
2326 - atomic_t nr_handle;
2327 - struct stripe_head *next_free_stripe;
2329 - int resync_parity;
2330 - int max_nr_stripes;
2332 - int nr_hashed_stripes;
2333 - int nr_locked_stripes;
2334 - int nr_pending_stripes;
2335 - int nr_cached_stripes;
2338 - * Free stripes pool
2341 - struct stripe_head *free_sh_list;
2342 - struct wait_queue *wait_for_stripe;
2348 - * Our supported algorithms
2350 -#define ALGORITHM_LEFT_ASYMMETRIC 0
2351 -#define ALGORITHM_RIGHT_ASYMMETRIC 1
2352 -#define ALGORITHM_LEFT_SYMMETRIC 2
2353 -#define ALGORITHM_RIGHT_SYMMETRIC 3
2356 --- linux/include/linux/sysctl.h.orig Mon Dec 11 01:49:44 2000
2357 +++ linux/include/linux/sysctl.h Tue Jan 16 13:42:03 2001
2366 @@ -446,6 +447,11 @@
2369 DEV_CDROM_CHECK_MEDIA=6
2372 +/* /proc/sys/dev/md */
2374 + DEV_MD_SPEED_LIMIT=1
2377 /* /proc/sys/dev/mac_hid */
2378 --- linux/include/asm-i386/md.h.orig Fri May 8 09:17:13 1998
2379 +++ linux/include/asm-i386/md.h Tue Jan 16 13:42:03 2001
2382 - * md.h: High speed xor_block operation for RAID4/5
2389 -/* #define HAVE_ARCH_XORBLOCK */
2391 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2393 -#endif /* __ASM_MD_H */
2394 --- linux/include/asm-alpha/md.h.orig Fri May 8 09:17:13 1998
2395 +++ linux/include/asm-alpha/md.h Tue Jan 16 13:42:03 2001
2398 - * md.h: High speed xor_block operation for RAID4/5
2405 -/* #define HAVE_ARCH_XORBLOCK */
2407 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2409 -#endif /* __ASM_MD_H */
2410 --- linux/include/asm-m68k/md.h.orig Fri May 8 09:15:22 1998
2411 +++ linux/include/asm-m68k/md.h Tue Jan 16 13:42:04 2001
2414 - * md.h: High speed xor_block operation for RAID4/5
2421 -/* #define HAVE_ARCH_XORBLOCK */
2423 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2425 -#endif /* __ASM_MD_H */
2426 --- linux/include/asm-sparc/md.h.orig Tue Jan 13 00:15:54 1998
2427 +++ linux/include/asm-sparc/md.h Tue Jan 16 13:42:04 2001
2430 - * md.h: High speed xor_block operation for RAID4/5
2437 -/* #define HAVE_ARCH_XORBLOCK */
2439 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2441 -#endif /* __ASM_MD_H */
2442 --- linux/include/asm-ppc/md.h.orig Wed Oct 27 02:53:42 1999
2443 +++ linux/include/asm-ppc/md.h Tue Jan 16 13:42:04 2001
2446 - * md.h: High speed xor_block operation for RAID4/5
2453 -/* #define HAVE_ARCH_XORBLOCK */
2455 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2457 -#endif /* __ASM_MD_H */
2458 --- linux/include/asm-sparc64/md.h.orig Tue Jan 13 00:15:58 1998
2459 +++ linux/include/asm-sparc64/md.h Tue Jan 16 13:42:04 2001
2462 - * md.h: High speed xor_block operation for RAID4/5
2463 - * utilizing the UltraSparc Visual Instruction Set.
2465 - * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
2471 -#include <asm/head.h>
2472 -#include <asm/asi.h>
2474 -#define HAVE_ARCH_XORBLOCK
2476 -#define MD_XORBLOCK_ALIGNMENT 64
2478 -/* void __xor_block (char *dest, char *src, long len)
2480 - * while (len--) *dest++ ^= *src++;
2484 - * !(((long)dest | (long)src) & (MD_XORBLOCK_ALIGNMENT - 1)) &&
2485 - * !(len & 127) && len >= 256
2488 -static inline void __xor_block (char *dest, char *src, long len)
2490 - __asm__ __volatile__ ("
2491 - wr %%g0, %3, %%fprs
2492 - wr %%g0, %4, %%asi
2493 - membar #LoadStore|#StoreLoad|#StoreStore
2495 - ldda [%0] %4, %%f0
2496 - ldda [%1] %4, %%f16
2497 -1: ldda [%0 + 64] %%asi, %%f32
2498 - fxor %%f0, %%f16, %%f16
2499 - fxor %%f2, %%f18, %%f18
2500 - fxor %%f4, %%f20, %%f20
2501 - fxor %%f6, %%f22, %%f22
2502 - fxor %%f8, %%f24, %%f24
2503 - fxor %%f10, %%f26, %%f26
2504 - fxor %%f12, %%f28, %%f28
2505 - fxor %%f14, %%f30, %%f30
2506 - stda %%f16, [%0] %4
2507 - ldda [%1 + 64] %%asi, %%f48
2508 - ldda [%0 + 128] %%asi, %%f0
2509 - fxor %%f32, %%f48, %%f48
2510 - fxor %%f34, %%f50, %%f50
2512 - fxor %%f36, %%f52, %%f52
2514 - fxor %%f38, %%f54, %%f54
2516 - fxor %%f40, %%f56, %%f56
2517 - fxor %%f42, %%f58, %%f58
2518 - fxor %%f44, %%f60, %%f60
2519 - fxor %%f46, %%f62, %%f62
2520 - stda %%f48, [%0 - 64] %%asi
2522 - ldda [%1] %4, %%f16
2523 - ldda [%0 + 64] %%asi, %%f32
2524 - fxor %%f0, %%f16, %%f16
2525 - fxor %%f2, %%f18, %%f18
2526 - fxor %%f4, %%f20, %%f20
2527 - fxor %%f6, %%f22, %%f22
2528 - fxor %%f8, %%f24, %%f24
2529 - fxor %%f10, %%f26, %%f26
2530 - fxor %%f12, %%f28, %%f28
2531 - fxor %%f14, %%f30, %%f30
2532 - stda %%f16, [%0] %4
2533 - ldda [%1 + 64] %%asi, %%f48
2535 - fxor %%f32, %%f48, %%f48
2536 - fxor %%f34, %%f50, %%f50
2537 - fxor %%f36, %%f52, %%f52
2538 - fxor %%f38, %%f54, %%f54
2539 - fxor %%f40, %%f56, %%f56
2540 - fxor %%f42, %%f58, %%f58
2541 - fxor %%f44, %%f60, %%f60
2542 - fxor %%f46, %%f62, %%f62
2543 - stda %%f48, [%0 + 64] %%asi
2544 - membar #Sync|#StoreStore|#StoreLoad
2545 - wr %%g0, 0, %%fprs
2547 - "r" (dest), "r" (src), "r" (len), "i" (FPRS_FEF), "i" (ASI_BLK_P) :
2551 -#endif /* __ASM_MD_H */
2552 --- linux/drivers/block/Config.in.orig Mon Dec 11 01:49:41 2000
2553 +++ linux/drivers/block/Config.in Tue Jan 16 13:42:04 2001
2554 @@ -103,10 +103,13 @@
2556 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
2557 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
2558 + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
2559 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
2560 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
2561 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
2562 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
2563 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
2564 + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
2566 if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
2567 bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
2568 --- linux/drivers/block/Makefile.orig Mon Dec 11 01:49:41 2000
2569 +++ linux/drivers/block/Makefile Tue Jan 16 13:42:04 2001
2570 @@ -294,10 +294,28 @@
2573 ifeq ($(CONFIG_MD_RAID5),y)
2577 ifeq ($(CONFIG_MD_RAID5),m)
2583 +ifeq ($(CONFIG_MD_TRANSLUCENT),y)
2584 +L_OBJS += translucent.o
2586 + ifeq ($(CONFIG_MD_TRANSLUCENT),m)
2587 + M_OBJS += translucent.o
2591 +ifeq ($(CONFIG_MD_HSM),y)
2594 + ifeq ($(CONFIG_MD_HSM),m)
2599 --- linux/drivers/block/genhd.c.orig Tue Jan 16 13:30:06 2001
2600 +++ linux/drivers/block/genhd.c Tue Jan 16 13:42:04 2001
2602 #include <linux/string.h>
2603 #include <linux/blk.h>
2604 #include <linux/init.h>
2605 +#include <linux/raid/md.h>
2607 #ifdef CONFIG_ARCH_S390
2608 #include <asm/dasd.h>
2609 @@ -1785,6 +1786,9 @@
2614 +#ifdef CONFIG_BLK_DEV_MD
2615 + autodetect_raid();
2617 #ifdef CONFIG_MD_BOOT
2619 --- linux/drivers/block/hsm.c.orig Tue Jan 16 13:42:04 2001
2620 +++ linux/drivers/block/hsm.c Tue Jan 16 13:42:04 2001
2623 + hsm.c : HSM RAID driver for Linux
2624 + Copyright (C) 1998 Ingo Molnar
2626 + HSM mode management functions.
2628 + This program is free software; you can redistribute it and/or modify
2629 + it under the terms of the GNU General Public License as published by
2630 + the Free Software Foundation; either version 2, or (at your option)
2631 + any later version.
2633 + You should have received a copy of the GNU General Public License
2634 + (for example /usr/src/linux/COPYING); if not, write to the Free
2635 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
2638 +#include <linux/module.h>
2640 +#include <linux/raid/md.h>
2641 +#include <linux/malloc.h>
2643 +#include <linux/raid/hsm.h>
2644 +#include <linux/blk.h>
2646 +#define MAJOR_NR MD_MAJOR
2648 +#define MD_PERSONALITY
2651 +#define DEBUG_HSM 1
2654 +#define dprintk(x,y...) printk(x,##y)
2656 +#define dprintk(x,y...) do { } while (0)
2659 +void print_bh(struct buffer_head *bh)
2661 + dprintk("bh %p: %lx %lx %x %x %lx %p %lx %p %x %p %x %lx\n", bh,
2662 + bh->b_blocknr, bh->b_size, bh->b_dev, bh->b_rdev,
2663 + bh->b_rsector, bh->b_this_page, bh->b_state,
2664 + bh->b_next_free, bh->b_count, bh->b_data,
2665 + bh->b_list, bh->b_flushtime
2669 +static int check_bg (pv_t *pv, pv_block_group_t * bg)
2673 + dprintk("checking bg ...\n");
2675 + for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
2676 + if (pv_pptr_free(bg->blocks + i)) {
2678 + if (test_bit(i, bg->used_bitmap)) {
2679 + printk("hm, bit %d set?\n", i);
2682 + if (!test_bit(i, bg->used_bitmap)) {
2683 + printk("hm, bit %d not set?\n", i);
2687 + dprintk("%d free blocks in bg ...\n", free);
2691 +static void get_bg (pv_t *pv, pv_bg_desc_t *desc, int nr)
2693 + unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
2694 + struct buffer_head *bh;
2696 + dprintk("... getting BG at %u ...\n", bg_pos);
2698 + bh = bread (pv->dev, bg_pos, HSM_BLOCKSIZE);
2703 + desc->bg = (pv_block_group_t *) bh->b_data;
2704 + desc->free_blocks = check_bg(pv, desc->bg);
2707 +static int find_free_block (lv_t *lv, pv_t *pv, pv_bg_desc_t *desc, int nr,
2708 + unsigned int lblock, lv_lptr_t * index)
2712 + for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
2713 + pv_pptr_t * bptr = desc->bg->blocks + i;
2714 + if (pv_pptr_free(bptr)) {
2715 + unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
2717 + if (test_bit(i, desc->bg->used_bitmap)) {
2721 + bptr->u.used.owner.log_id = lv->log_id;
2722 + bptr->u.used.owner.log_index = lblock;
2723 + index->data.phys_nr = pv->phys_nr;
2724 + index->data.phys_block = bg_pos + i + 1;
2725 + set_bit(i, desc->bg->used_bitmap);
2726 + desc->free_blocks--;
2727 + dprintk(".....free blocks left in bg %p: %d\n",
2728 + desc->bg, desc->free_blocks);
2735 +static int __get_free_block (lv_t *lv, pv_t *pv,
2736 + unsigned int lblock, lv_lptr_t * index)
2740 + dprintk("trying to get free block for lblock %d ...\n", lblock);
2742 + for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
2743 + pv_bg_desc_t *desc = pv->bg_array + i;
2745 + dprintk("looking at desc #%d (%p)...\n", i, desc->bg);
2747 + get_bg(pv, desc, i);
2749 + if (desc->bg && desc->free_blocks)
2750 + return find_free_block(lv, pv, desc, i,
2753 + dprintk("hsm: pv %s full!\n", partition_name(pv->dev));
2757 +static int get_free_block (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
2761 + if (!lv->free_indices)
2765 + err = __get_free_block(lv, lv->vg->pv_array + 0, lblock, index);
2767 + if (err || !index->data.phys_block) {
2772 + lv->free_indices--;
2778 + * fix me: wordsize assumptions ...
2780 +#define INDEX_BITS 8
2781 +#define INDEX_DEPTH (32/INDEX_BITS)
2782 +#define INDEX_MASK ((1<<INDEX_BITS) - 1)
2784 +static void print_index_list (lv_t *lv, lv_lptr_t *index)
2789 + dprintk("... block <%u,%u,%x> [.", index->data.phys_nr,
2790 + index->data.phys_block, index->cpu_addr);
2792 + tmp = index_child(index);
2793 + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
2794 + if (index_block(lv, tmp))
2795 + dprintk("(%d->%d)", i, index_block(lv, tmp));
2801 +static int read_index_group (lv_t *lv, lv_lptr_t *index)
2803 + lv_lptr_t *index_group, *tmp;
2804 + struct buffer_head *bh;
2807 + dprintk("reading index group <%s:%d>\n",
2808 + partition_name(index_dev(lv, index)), index_block(lv, index));
2810 + bh = bread(index_dev(lv, index), index_block(lv, index), HSM_BLOCKSIZE);
2815 + if (!buffer_uptodate(bh))
2818 + index_group = (lv_lptr_t *) bh->b_data;
2819 + tmp = index_group;
2820 + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
2821 + if (index_block(lv, tmp)) {
2822 + dprintk("index group has BLOCK %d, non-present.\n", i);
2823 + tmp->cpu_addr = 0;
2827 + index->cpu_addr = ptr_to_cpuaddr(index_group);
2829 + dprintk("have read index group %p at block %d.\n",
2830 + index_group, index_block(lv, index));
2831 + print_index_list(lv, index);
2836 +static int alloc_index_group (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
2838 + struct buffer_head *bh;
2839 + lv_lptr_t * index_group;
2841 + if (get_free_block(lv, lblock, index))
2844 + dprintk("creating block for index group <%s:%d>\n",
2845 + partition_name(index_dev(lv, index)), index_block(lv, index));
2847 + bh = getblk(index_dev(lv, index),
2848 + index_block(lv, index), HSM_BLOCKSIZE);
2850 + index_group = (lv_lptr_t *) bh->b_data;
2851 + md_clear_page(index_group);
2852 + mark_buffer_uptodate(bh, 1);
2854 + index->cpu_addr = ptr_to_cpuaddr(index_group);
2856 + dprintk("allocated index group %p at block %d.\n",
2857 + index_group, index_block(lv, index));
2861 +static lv_lptr_t * alloc_fixed_index (lv_t *lv, unsigned int lblock)
2863 + lv_lptr_t * index = index_child(&lv->root_index);
2866 + for (l = INDEX_DEPTH-1; l >= 0; l--) {
2867 + idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
2871 + if (!index_present(index)) {
2872 + dprintk("no group, level %u, pos %u\n", l, idx);
2873 + if (alloc_index_group(lv, lblock, index))
2876 + index = index_child(index);
2878 + if (!index_block(lv,index)) {
2879 + dprintk("no data, pos %u\n", idx);
2880 + if (get_free_block(lv, lblock, index))
2888 +static lv_lptr_t * find_index (lv_t *lv, unsigned int lblock)
2890 + lv_lptr_t * index = index_child(&lv->root_index);
2893 + for (l = INDEX_DEPTH-1; l >= 0; l--) {
2894 + idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
2898 + if (index_free(index))
2900 + if (!index_present(index))
2901 + read_index_group(lv, index);
2902 + if (!index_present(index)) {
2906 + index = index_child(index);
2908 + if (!index_block(lv,index))
2913 +static int read_root_index(lv_t *lv)
2916 + lv_lptr_t *index = &lv->root_index;
2918 + if (!index_block(lv, index)) {
2919 + printk("LV has no root index yet, creating.\n");
2921 + err = alloc_index_group (lv, 0, index);
2923 + printk("could not create index group, err:%d\n", err);
2926 + lv->vg->vg_sb->lv_array[lv->log_id].lv_root_idx =
2927 + lv->root_index.data;
2929 + printk("LV already has a root index.\n");
2930 + printk("... at <%s:%d>.\n",
2931 + partition_name(index_dev(lv, index)),
2932 + index_block(lv, index));
2934 + read_index_group(lv, index);
2939 +static int init_pv(pv_t *pv)
2941 + struct buffer_head *bh;
2944 + bh = bread (pv->dev, 0, HSM_BLOCKSIZE);
2950 + pv_sb = (pv_sb_t *) bh->b_data;
2951 + pv->pv_sb = pv_sb;
2953 + if (pv_sb->pv_magic != HSM_PV_SB_MAGIC) {
2954 + printk("%s is not a PV, has magic %x instead of %x!\n",
2955 + partition_name(pv->dev), pv_sb->pv_magic,
2959 + printk("%s detected as a valid PV (#%d).\n", partition_name(pv->dev),
2961 + printk("... created under HSM version %d.%d.%d, at %x.\n",
2962 + pv_sb->pv_major, pv_sb->pv_minor, pv_sb->pv_patch, pv_sb->pv_ctime);
2963 + printk("... total # of blocks: %d (%d left unallocated).\n",
2964 + pv_sb->pv_total_size, pv_sb->pv_blocks_left);
2966 + printk("... block size: %d bytes.\n", pv_sb->pv_block_size);
2967 + printk("... block descriptor size: %d bytes.\n", pv_sb->pv_pptr_size);
2968 + printk("... block group size: %d blocks.\n", pv_sb->pv_bg_size);
2969 + printk("... # of block groups: %d.\n", pv_sb->pv_block_groups);
2971 + if (pv_sb->pv_block_groups*sizeof(pv_bg_desc_t) > PAGE_SIZE) {
2975 + pv->bg_array = (pv_bg_desc_t *)__get_free_page(GFP_KERNEL);
2976 + if (!pv->bg_array) {
2980 + memset(pv->bg_array, 0, PAGE_SIZE);
2985 +static int free_pv(pv_t *pv)
2987 + struct buffer_head *bh;
2989 + dprintk("freeing PV %d ...\n", pv->phys_nr);
2991 + if (pv->bg_array) {
2994 + dprintk(".... freeing BGs ...\n");
2995 + for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
2996 + unsigned int bg_pos = i * pv->pv_sb->pv_bg_size + 2;
2997 + pv_bg_desc_t *desc = pv->bg_array + i;
3000 + dprintk(".... freeing BG %d ...\n", i);
3001 + bh = getblk (pv->dev, bg_pos, HSM_BLOCKSIZE);
3002 + mark_buffer_dirty(bh, 1);
3007 + free_page((unsigned long)pv->bg_array);
3011 + bh = getblk (pv->dev, 0, HSM_BLOCKSIZE);
3016 + mark_buffer_dirty(bh, 1);
3023 +struct semaphore hsm_sem = MUTEX;
3025 +#define HSM_SECTORS (HSM_BLOCKSIZE/512)
3027 +static int hsm_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
3028 + unsigned long *rsector, unsigned long bsectors)
3030 + lv_t *lv = kdev_to_lv(dev);
3032 + unsigned int lblock = *rsector / HSM_SECTORS;
3033 + unsigned int offset = *rsector % HSM_SECTORS;
3037 + printk("HSM: md%d not a Logical Volume!\n", mdidx(mddev));
3040 + if (offset + bsectors > HSM_SECTORS) {
3045 + index = find_index(lv, lblock);
3047 + printk("no block %u yet ... allocating\n", lblock);
3048 + index = alloc_fixed_index(lv, lblock);
3053 + printk(" %u <%s : %ld(%ld)> -> ", lblock,
3054 + partition_name(*rdev), *rsector, bsectors);
3056 + *rdev = index_dev(lv, index);
3057 + *rsector = index_block(lv, index) * HSM_SECTORS + offset;
3059 + printk(" <%s : %ld> %u\n",
3060 + partition_name(*rdev), *rsector, index_block(lv, index));
3067 +static void free_index (lv_t *lv, lv_lptr_t * index)
3069 + struct buffer_head *bh;
3071 + printk("tryin to get cached block for index group <%s:%d>\n",
3072 + partition_name(index_dev(lv, index)), index_block(lv, index));
3074 + bh = getblk(index_dev(lv, index), index_block(lv, index),HSM_BLOCKSIZE);
3076 + printk("....FREEING ");
3077 + print_index_list(lv, index);
3080 + if (!buffer_uptodate(bh))
3082 + if ((lv_lptr_t *)bh->b_data != index_child(index)) {
3083 + printk("huh? b_data is %p, index content is %p.\n",
3084 + bh->b_data, index_child(index));
3086 + printk("good, b_data == index content == %p.\n",
3087 + index_child(index));
3088 + printk("b_count == %d, writing.\n", bh->b_count);
3089 + mark_buffer_dirty(bh, 1);
3092 + printk("done.\n");
3094 + printk("FAILED!\n");
3096 + print_index_list(lv, index);
3097 + index_child(index) = NULL;
3100 +static void free_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
3103 + lv_lptr_t * index;
3106 + nr_dots = (INDEX_DEPTH-level)*3;
3107 + memcpy(dots,"...............",nr_dots);
3108 + dots[nr_dots] = 0;
3110 + dprintk("%s level %d index group block:\n", dots, level);
3114 + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
3115 + if (index->data.phys_block) {
3116 + dprintk("%s block <%u,%u,%x>\n", dots,
3117 + index->data.phys_nr,
3118 + index->data.phys_block,
3120 + if (level && index_present(index)) {
3121 + dprintk("%s==> deeper one level\n", dots);
3122 + free_index_group(lv, level-1,
3123 + index_child(index));
3124 + dprintk("%s freeing index group block %p ...",
3125 + dots, index_child(index));
3126 + free_index(lv, index);
3131 + dprintk("%s DONE: level %d index group block.\n", dots, level);
3134 +static void free_lv_indextree (lv_t *lv)
3136 + dprintk("freeing LV %d ...\n", lv->log_id);
3137 + dprintk("..root index: %p\n", index_child(&lv->root_index));
3138 + dprintk("..INDEX TREE:\n");
3139 + free_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
3140 + dprintk("..freeing root index %p ...", index_child(&lv->root_index));
3141 + dprintk("root block <%u,%u,%x>\n", lv->root_index.data.phys_nr,
3142 + lv->root_index.data.phys_block, lv->root_index.cpu_addr);
3143 + free_index(lv, &lv->root_index);
3144 + dprintk("..INDEX TREE done.\n");
3145 + fsync_dev(lv->vg->pv_array[0].dev); /* fix me */
3146 + lv->vg->vg_sb->lv_array[lv->log_id].lv_free_indices = lv->free_indices;
3149 +static void print_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
3152 + lv_lptr_t * index;
3155 + nr_dots = (INDEX_DEPTH-level)*3;
3156 + memcpy(dots,"...............",nr_dots);
3157 + dots[nr_dots] = 0;
3159 + dprintk("%s level %d index group block:\n", dots, level);
3162 + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
3163 + index = index_0 + i;
3164 + if (index->data.phys_block) {
3165 + dprintk("%s block <%u,%u,%x>\n", dots,
3166 + index->data.phys_nr,
3167 + index->data.phys_block,
3169 + if (level && index_present(index)) {
3170 + dprintk("%s==> deeper one level\n", dots);
3171 + print_index_group(lv, level-1,
3172 + index_child(index));
3176 + dprintk("%s DONE: level %d index group block.\n", dots, level);
3179 +static void print_lv (lv_t *lv)
3181 + dprintk("printing LV %d ...\n", lv->log_id);
3182 + dprintk("..root index: %p\n", index_child(&lv->root_index));
3183 + dprintk("..INDEX TREE:\n");
3184 + print_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
3185 + dprintk("..INDEX TREE done.\n");
3188 +static int map_lv (lv_t *lv)
3190 + kdev_t dev = lv->dev;
3191 + unsigned int nr = MINOR(dev);
3192 + mddev_t *mddev = lv->vg->mddev;
3194 + if (MAJOR(dev) != MD_MAJOR) {
3198 + if (kdev_to_mddev(dev)) {
3202 + md_hd_struct[nr].start_sect = 0;
3203 + md_hd_struct[nr].nr_sects = md_size[mdidx(mddev)] << 1;
3204 + md_size[nr] = md_size[mdidx(mddev)];
3205 + add_mddev_mapping(mddev, dev, lv);
3210 +static int unmap_lv (lv_t *lv)
3212 + kdev_t dev = lv->dev;
3213 + unsigned int nr = MINOR(dev);
3215 + if (MAJOR(dev) != MD_MAJOR) {
3219 + md_hd_struct[nr].start_sect = 0;
3220 + md_hd_struct[nr].nr_sects = 0;
3222 + del_mddev_mapping(lv->vg->mddev, dev);
3227 +static int init_vg (vg_t *vg)
3233 + struct buffer_head *bh;
3234 + lv_descriptor_t *lv_desc;
3237 + * fix me: read all PVs and compare the SB
3239 + dev = vg->pv_array[0].dev;
3240 + bh = bread (dev, 1, HSM_BLOCKSIZE);
3246 + vg_sb = (vg_sb_t *) bh->b_data;
3247 + vg->vg_sb = vg_sb;
3249 + if (vg_sb->vg_magic != HSM_VG_SB_MAGIC) {
3250 + printk("%s is not a valid VG, has magic %x instead of %x!\n",
3251 + partition_name(dev), vg_sb->vg_magic,
3257 + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
3259 + lv_desc = vg->vg_sb->lv_array + i;
3261 + id = lv_desc->lv_id;
3263 + printk("... LV desc %d empty\n", i);
3266 + if (id >= HSM_MAX_LVS_PER_VG) {
3271 + lv = vg->lv_array + id;
3278 + lv->max_indices = lv_desc->lv_max_indices;
3279 + lv->free_indices = lv_desc->lv_free_indices;
3280 + lv->root_index.data = lv_desc->lv_root_idx;
3281 + lv->dev = MKDEV(MD_MAJOR, lv_desc->md_id);
3286 + if (read_root_index(lv)) {
3289 + memset(lv, 0, sizeof(*lv));
3292 + if (vg->nr_lv != vg_sb->nr_lvs)
3298 +static int hsm_run (mddev_t *mddev)
3304 + MOD_INC_USE_COUNT;
3306 + vg = kmalloc (sizeof (*vg), GFP_KERNEL);
3309 + memset(vg, 0, sizeof(*vg));
3310 + mddev->private = vg;
3311 + vg->mddev = mddev;
3313 + if (md_check_ordering(mddev)) {
3314 + printk("hsm: disks are not ordered, aborting!\n");
3318 + set_blocksize (mddev_to_kdev(mddev), HSM_BLOCKSIZE);
3320 + vg->nr_pv = mddev->nb_dev;
3321 + ITERATE_RDEV_ORDERED(mddev,rdev,i) {
3322 + pv_t *pv = vg->pv_array + i;
3324 + pv->dev = rdev->dev;
3325 + fsync_dev (pv->dev);
3326 + set_blocksize (pv->dev, HSM_BLOCKSIZE);
3339 + mddev->private = NULL;
3341 + MOD_DEC_USE_COUNT;
3346 +static int hsm_stop (mddev_t *mddev)
3352 + vg = mddev_to_vg(mddev);
3354 + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
3355 + lv = vg->lv_array + i;
3359 + free_lv_indextree(lv);
3362 + for (i = 0; i < vg->nr_pv; i++)
3363 + free_pv(vg->pv_array + i);
3367 + MOD_DEC_USE_COUNT;
3373 +static int hsm_status (char *page, mddev_t *mddev)
3379 + vg = mddev_to_vg(mddev);
3381 + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
3382 + lv = vg->lv_array + i;
3385 + sz += sprintf(page+sz, "<LV%d %d/%d blocks used> ", lv->log_id,
3386 + lv->max_indices - lv->free_indices, lv->max_indices);
3392 +static mdk_personality_t hsm_personality=
3411 +md__initfunc(void hsm_init (void))
3413 + register_md_personality (HSM, &hsm_personality);
3418 +int init_module (void)
3420 + return (register_md_personality (HSM, &hsm_personality));
3423 +void cleanup_module (void)
3425 + unregister_md_personality (HSM);
3431 + * This Linus-trick catches bugs via the linker.
3434 +extern void __BUG__in__hsm_dot_c_1(void);
3435 +extern void __BUG__in__hsm_dot_c_2(void);
3436 +extern void __BUG__in__hsm_dot_c_3(void);
3437 +extern void __BUG__in__hsm_dot_c_4(void);
3438 +extern void __BUG__in__hsm_dot_c_5(void);
3439 +extern void __BUG__in__hsm_dot_c_6(void);
3440 +extern void __BUG__in__hsm_dot_c_7(void);
3442 +void bugcatcher (void)
3444 + if (sizeof(pv_block_group_t) != HSM_BLOCKSIZE)
3445 + __BUG__in__hsm_dot_c_1();
3446 + if (sizeof(lv_index_block_t) != HSM_BLOCKSIZE)
3447 + __BUG__in__hsm_dot_c_2();
3449 + if (sizeof(pv_sb_t) != HSM_BLOCKSIZE)
3450 + __BUG__in__hsm_dot_c_4();
3451 + if (sizeof(lv_sb_t) != HSM_BLOCKSIZE)
3452 + __BUG__in__hsm_dot_c_3();
3453 + if (sizeof(vg_sb_t) != HSM_BLOCKSIZE)
3454 + __BUG__in__hsm_dot_c_6();
3456 + if (sizeof(lv_lptr_t) != 16)
3457 + __BUG__in__hsm_dot_c_5();
3458 + if (sizeof(pv_pptr_t) != 16)
3459 + __BUG__in__hsm_dot_c_6();
3462 --- linux/drivers/block/linear.c.orig Sat Nov 8 20:39:12 1997
3463 +++ linux/drivers/block/linear.c Tue Jan 16 13:42:04 2001
3467 linear.c : Multiple Devices driver for Linux
3468 Copyright (C) 1994-96 Marc ZYNGIER
3469 @@ -19,186 +18,207 @@
3471 #include <linux/module.h>
3473 -#include <linux/md.h>
3474 +#include <linux/raid/md.h>
3475 #include <linux/malloc.h>
3476 -#include <linux/init.h>
3478 -#include "linear.h"
3479 +#include <linux/raid/linear.h>
3481 #define MAJOR_NR MD_MAJOR
3483 #define MD_PERSONALITY
3485 -static int linear_run (int minor, struct md_dev *mddev)
3486 +static int linear_run (mddev_t *mddev)
3488 - int cur=0, i, size, dev0_size, nb_zone;
3489 - struct linear_data *data;
3491 - MOD_INC_USE_COUNT;
3493 - mddev->private=kmalloc (sizeof (struct linear_data), GFP_KERNEL);
3494 - data=(struct linear_data *) mddev->private;
3497 - Find out the smallest device. This was previously done
3498 - at registry time, but since it violates modularity,
3499 - I moved it here... Any comment ? ;-)
3502 - data->smallest=mddev->devices;
3503 - for (i=1; i<mddev->nb_dev; i++)
3504 - if (data->smallest->size > mddev->devices[i].size)
3505 - data->smallest=mddev->devices+i;
3507 - nb_zone=data->nr_zones=
3508 - md_size[minor]/data->smallest->size +
3509 - (md_size[minor]%data->smallest->size ? 1 : 0);
3511 - data->hash_table=kmalloc (sizeof (struct linear_hash)*nb_zone, GFP_KERNEL);
3513 - size=mddev->devices[cur].size;
3514 + linear_conf_t *conf;
3515 + struct linear_hash *table;
3517 + int size, i, j, nb_zone;
3518 + unsigned int curr_offset;
3520 + MOD_INC_USE_COUNT;
3522 + conf = kmalloc (sizeof (*conf), GFP_KERNEL);
3525 + mddev->private = conf;
3527 + if (md_check_ordering(mddev)) {
3528 + printk("linear: disks are not ordered, aborting!\n");
3532 + * Find the smallest device.
3535 + conf->smallest = NULL;
3537 + ITERATE_RDEV_ORDERED(mddev,rdev,j) {
3538 + dev_info_t *disk = conf->disks + j;
3540 + disk->dev = rdev->dev;
3541 + disk->size = rdev->size;
3542 + disk->offset = curr_offset;
3544 + curr_offset += disk->size;
3546 + if (!conf->smallest || (disk->size < conf->smallest->size))
3547 + conf->smallest = disk;
3550 + nb_zone = conf->nr_zones =
3551 + md_size[mdidx(mddev)] / conf->smallest->size +
3552 + ((md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
3554 + conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
3556 + if (!conf->hash_table)
3560 + * Here we generate the linear hash table
3562 + table = conf->hash_table;
3565 + for (j = 0; j < mddev->nb_dev; j++) {
3566 + dev_info_t *disk = conf->disks + j;
3569 + table->dev1 = disk;
3572 + size += disk->size;
3575 + table->dev0 = disk;
3576 + size -= conf->smallest->size;
3579 + table->dev1 = NULL;
3583 + table->dev1 = NULL;
3590 + MOD_DEC_USE_COUNT;
3594 +static int linear_stop (mddev_t *mddev)
3596 + linear_conf_t *conf = mddev_to_conf(mddev);
3598 + kfree(conf->hash_table);
3602 - while (cur<mddev->nb_dev)
3604 - data->hash_table[i].dev0=mddev->devices+cur;
3605 + MOD_DEC_USE_COUNT;
3607 - if (size>=data->smallest->size) /* If we completely fill the slot */
3609 - data->hash_table[i++].dev1=NULL;
3610 - size-=data->smallest->size;
3614 - if (++cur==mddev->nb_dev) continue;
3615 - size=mddev->devices[cur].size;
3621 - if (++cur==mddev->nb_dev) /* Last dev, set dev1 as NULL */
3623 - data->hash_table[i].dev1=NULL;
3627 - dev0_size=size; /* Here, we use a 2nd dev to fill the slot */
3628 - size=mddev->devices[cur].size;
3629 - data->hash_table[i++].dev1=mddev->devices+cur;
3630 - size-=(data->smallest->size - dev0_size);
3636 -static int linear_stop (int minor, struct md_dev *mddev)
3638 - struct linear_data *data=(struct linear_data *) mddev->private;
3640 - kfree (data->hash_table);
3643 - MOD_DEC_USE_COUNT;
3650 -static int linear_map (struct md_dev *mddev, kdev_t *rdev,
3651 +static int linear_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
3652 unsigned long *rsector, unsigned long size)
3654 - struct linear_data *data=(struct linear_data *) mddev->private;
3655 - struct linear_hash *hash;
3656 - struct real_dev *tmp_dev;
3659 - block=*rsector >> 1;
3660 - hash=data->hash_table+(block/data->smallest->size);
3662 - if (block >= (hash->dev0->size + hash->dev0->offset))
3666 - printk ("linear_map : hash->dev1==NULL for block %ld\n", block);
3670 - tmp_dev=hash->dev1;
3673 - tmp_dev=hash->dev0;
3674 + linear_conf_t *conf = mddev_to_conf(mddev);
3675 + struct linear_hash *hash;
3676 + dev_info_t *tmp_dev;
3679 + block = *rsector >> 1;
3680 + hash = conf->hash_table + (block / conf->smallest->size);
3682 + if (block >= (hash->dev0->size + hash->dev0->offset))
3686 + printk ("linear_map : hash->dev1==NULL for block %ld\n",
3690 + tmp_dev = hash->dev1;
3692 + tmp_dev = hash->dev0;
3694 - if (block >= (tmp_dev->size + tmp_dev->offset) || block < tmp_dev->offset)
3695 - printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
3696 - block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
3697 + if (block >= (tmp_dev->size + tmp_dev->offset)
3698 + || block < tmp_dev->offset)
3699 + printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
3700 + block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
3702 - *rdev=tmp_dev->dev;
3703 - *rsector=(block-(tmp_dev->offset)) << 1;
3704 + *rdev = tmp_dev->dev;
3705 + *rsector = (block - tmp_dev->offset) << 1;
3711 -static int linear_status (char *page, int minor, struct md_dev *mddev)
3712 +static int linear_status (char *page, mddev_t *mddev)
3720 - struct linear_data *data=(struct linear_data *) mddev->private;
3722 + linear_conf_t *conf = mddev_to_conf(mddev);
3724 - sz+=sprintf (page+sz, " ");
3725 - for (j=0; j<data->nr_zones; j++)
3727 - sz+=sprintf (page+sz, "[%s",
3728 - partition_name (data->hash_table[j].dev0->dev));
3730 - if (data->hash_table[j].dev1)
3731 - sz+=sprintf (page+sz, "/%s] ",
3732 - partition_name(data->hash_table[j].dev1->dev));
3734 - sz+=sprintf (page+sz, "] ");
3737 - sz+=sprintf (page+sz, "\n");
3738 + sz += sprintf(page+sz, " ");
3739 + for (j = 0; j < conf->nr_zones; j++)
3741 + sz += sprintf(page+sz, "[%s",
3742 + partition_name(conf->hash_table[j].dev0->dev));
3744 + if (conf->hash_table[j].dev1)
3745 + sz += sprintf(page+sz, "/%s] ",
3746 + partition_name(conf->hash_table[j].dev1->dev));
3748 + sz += sprintf(page+sz, "] ");
3750 + sz += sprintf(page+sz, "\n");
3752 - sz+=sprintf (page+sz, " %dk rounding", 1<<FACTOR_SHIFT(FACTOR(mddev)));
3754 + sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
3759 -static struct md_personality linear_personality=
3760 +static mdk_personality_t linear_personality=
3769 - NULL, /* no ioctls */
3789 -__initfunc(void linear_init (void))
3790 +md__initfunc(void linear_init (void))
3792 - register_md_personality (LINEAR, &linear_personality);
3793 + register_md_personality (LINEAR, &linear_personality);
3798 int init_module (void)
3800 - return (register_md_personality (LINEAR, &linear_personality));
3801 + return (register_md_personality (LINEAR, &linear_personality));
3804 void cleanup_module (void)
3806 - unregister_md_personality (LINEAR);
3807 + unregister_md_personality (LINEAR);
3812 --- linux/drivers/block/linear.h.orig Fri Nov 22 15:07:23 1996
3813 +++ linux/drivers/block/linear.h Tue Jan 16 13:42:04 2001
3820 - struct real_dev *dev0, *dev1;
3825 - struct linear_hash *hash_table; /* Dynamically allocated */
3826 - struct real_dev *smallest;
3831 --- linux/drivers/block/ll_rw_blk.c.orig Mon Dec 11 01:49:41 2000
3832 +++ linux/drivers/block/ll_rw_blk.c Tue Jan 16 13:42:04 2001
3835 #include <asm/uaccess.h>
3836 #include <linux/blk.h>
3837 +#include <linux/raid/md.h>
3839 #include <linux/module.h>
3842 spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
3845 + * per-major idle-IO detection
3847 +unsigned long io_events[MAX_BLKDEV] = {0, };
3850 * used to wait on when there are no free requests
3852 struct wait_queue * wait_for_request;
3855 /* Maybe the above fixes it, and maybe it doesn't boot. Life is interesting */
3857 + if (!buffer_lowprio(bh))
3858 + io_events[major]++;
3860 if (blk_size[major]) {
3861 unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
3863 bh[i]->b_rsector=bh[i]->b_blocknr*(bh[i]->b_size >> 9);
3864 #ifdef CONFIG_BLK_DEV_MD
3865 if (major==MD_MAJOR &&
3866 - md_map (MINOR(bh[i]->b_dev), &bh[i]->b_rdev,
3867 + md_map (bh[i]->b_dev, &bh[i]->b_rdev,
3868 &bh[i]->b_rsector, bh[i]->b_size >> 9)) {
3870 "Bad md_map in ll_rw_block\n");
3872 set_bit(BH_Req, &bh[i]->b_state);
3873 #ifdef CONFIG_BLK_DEV_MD
3874 if (MAJOR(bh[i]->b_dev) == MD_MAJOR) {
3875 - md_make_request(MINOR (bh[i]->b_dev), rw, bh[i]);
3876 + md_make_request(bh[i], rw);
3880 --- linux/drivers/block/md.c.orig Mon Sep 4 19:39:16 2000
3881 +++ linux/drivers/block/md.c Tue Jan 16 13:42:04 2001
3885 md.c : Multiple Devices driver for Linux
3886 - Copyright (C) 1994-96 Marc ZYNGIER
3887 - <zyngier@ufr-info-p7.ibp.fr> or
3888 - <maz@gloups.fdn.fr>
3889 + Copyright (C) 1998, 1999 Ingo Molnar
3891 - A lot of inspiration came from hd.c ...
3892 + completely rewritten, based on the MD driver code from Marc Zyngier
3894 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
3895 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
3898 - RAID-1/RAID-5 extensions by:
3899 - Ingo Molnar, Miguel de Icaza, Gadi Oxman
3900 + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
3901 + - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
3902 + - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
3903 + - kmod support by: Cyrus Durgin
3904 + - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
3906 - Changes for kmod by:
3909 This program is free software; you can redistribute it and/or modify
3910 it under the terms of the GNU General Public License as published by
3911 the Free Software Foundation; either version 2, or (at your option)
3912 @@ -26,807 +22,3007 @@
3913 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3917 - * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
3918 - * the extra system load does not show up that much. Increase it if your
3919 - * system can take more.
3921 -#define SPEED_LIMIT 1024
3922 +#include <linux/raid/md.h>
3923 +#include <linux/raid/xor.h>
3925 -#include <linux/config.h>
3926 -#include <linux/module.h>
3927 -#include <linux/version.h>
3928 -#include <linux/malloc.h>
3929 -#include <linux/mm.h>
3930 -#include <linux/md.h>
3931 -#include <linux/hdreg.h>
3932 -#include <linux/stat.h>
3933 -#include <linux/fs.h>
3934 -#include <linux/proc_fs.h>
3935 -#include <linux/blkdev.h>
3936 -#include <linux/genhd.h>
3937 -#include <linux/smp_lock.h>
3939 #include <linux/kmod.h>
3941 -#include <linux/errno.h>
3942 -#include <linux/init.h>
3944 #define __KERNEL_SYSCALLS__
3945 #include <linux/unistd.h>
3947 +#include <asm/unaligned.h>
3949 +extern asmlinkage int sys_sched_yield(void);
3950 +extern asmlinkage int sys_setsid(void);
3952 +extern unsigned long io_events[MAX_BLKDEV];
3954 #define MAJOR_NR MD_MAJOR
3957 #include <linux/blk.h>
3958 -#include <asm/uaccess.h>
3959 -#include <asm/bitops.h>
3960 -#include <asm/atomic.h>
3962 #ifdef CONFIG_MD_BOOT
3963 -extern kdev_t name_to_kdev_t(char *line) __init;
3964 +extern kdev_t name_to_kdev_t(char *line) md__init;
3967 -static struct hd_struct md_hd_struct[MAX_MD_DEV];
3968 -static int md_blocksizes[MAX_MD_DEV];
3969 -int md_maxreadahead[MAX_MD_DEV];
3970 -#if SUPPORT_RECONSTRUCTION
3971 -static struct md_thread *md_sync_thread = NULL;
3972 -#endif /* SUPPORT_RECONSTRUCTION */
3973 +static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, };
3976 + * these have to be allocated separately because external
3977 + * subsystems want to have a pre-defined structure
3979 +struct hd_struct md_hd_struct[MAX_MD_DEVS];
3980 +static int md_blocksizes[MAX_MD_DEVS];
3981 +static int md_maxreadahead[MAX_MD_DEVS];
3982 +static mdk_thread_t *md_recovery_thread = NULL;
3984 -int md_size[MAX_MD_DEV]={0, };
3985 +int md_size[MAX_MD_DEVS] = {0, };
3987 static void md_geninit (struct gendisk *);
3989 static struct gendisk md_gendisk=
4015 -static struct md_personality *pers[MAX_PERSONALITY]={NULL, };
4016 -struct md_dev md_dev[MAX_MD_DEV];
4018 -int md_thread(void * arg);
4020 + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
4021 + * is 100 KB/sec, so the extra system load does not show up that much.
4022 + * Increase it if you want to have more _guaranteed_ speed. Note that
4023 + * the RAID driver will use the maximum available bandwidth if the IO
4024 + * subsystem is idle.
4026 + * you can change it via /proc/sys/dev/speed-limit
4029 -static struct gendisk *find_gendisk (kdev_t dev)
4031 - struct gendisk *tmp=gendisk_head;
4032 +static int sysctl_speed_limit = 100;
4034 - while (tmp != NULL)
4036 - if (tmp->major==MAJOR(dev))
4041 +static struct ctl_table_header *md_table_header;
4045 +static ctl_table md_table[] = {
4046 + {DEV_MD_SPEED_LIMIT, "speed-limit",
4047 + &sysctl_speed_limit, sizeof(int), 0644, NULL, &proc_dointvec},
4051 -char *partition_name (kdev_t dev)
4053 - static char name[40]; /* This should be long
4054 - enough for a device name ! */
4055 - struct gendisk *hd = find_gendisk (dev);
4056 +static ctl_table md_dir_table[] = {
4057 + {DEV_MD, "md", NULL, 0, 0555, md_table},
4063 - sprintf (name, "[dev %s]", kdevname(dev));
4066 +static ctl_table md_root_table[] = {
4067 + {CTL_DEV, "dev", NULL, 0, 0555, md_dir_table},
4071 - return disk_name (hd, MINOR(dev), name); /* routine in genhd.c */
4072 +static void md_register_sysctl(void)
4074 + md_table_header = register_sysctl_table(md_root_table, 1);
4077 -static int legacy_raid_sb (int minor, int pnum)
4078 +void md_unregister_sysctl(void)
4081 + unregister_sysctl_table(md_table_header);
4085 + * The mapping between kdev and mddev is not necessarily a simple
4086 + * one! Eg. HSM uses several sub-devices to implement Logical
4087 + * Volumes. All these sub-devices map to the same mddev.
4089 +dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, };
4091 - factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
4092 +void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
4094 + unsigned int minor = MINOR(dev);
4097 - * do size and offset calculations.
4099 - for (i=0; i<md_dev[minor].nb_dev; i++) {
4100 - md_dev[minor].devices[i].size &= ~(factor - 1);
4101 - md_size[minor] += md_dev[minor].devices[i].size;
4102 - md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset +
4103 - md_dev[minor].devices[i-1].size) : 0;
4104 + if (MAJOR(dev) != MD_MAJOR) {
4108 - if (pnum == RAID0 >> PERSONALITY_SHIFT)
4109 - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev;
4111 + if (mddev_map[minor].mddev != NULL) {
4115 + mddev_map[minor].mddev = mddev;
4116 + mddev_map[minor].data = data;
4119 -static void free_sb (struct md_dev *mddev)
4120 +void del_mddev_mapping (mddev_t * mddev, kdev_t dev)
4123 - struct real_dev *realdev;
4124 + unsigned int minor = MINOR(dev);
4127 - free_page((unsigned long) mddev->sb);
4129 + if (MAJOR(dev) != MD_MAJOR) {
4133 - for (i = 0; i <mddev->nb_dev; i++) {
4134 - realdev = mddev->devices + i;
4135 - if (realdev->sb) {
4136 - free_page((unsigned long) realdev->sb);
4137 - realdev->sb = NULL;
4139 + if (mddev_map[minor].mddev != mddev) {
4143 + mddev_map[minor].mddev = NULL;
4144 + mddev_map[minor].data = NULL;
4148 - * Check one RAID superblock for generic plausibility
4149 + * Enables to iterate over all existing md arrays
4151 +static MD_LIST_HEAD(all_mddevs);
4153 -#define BAD_MAGIC KERN_ERR \
4154 -"md: %s: invalid raid superblock magic (%x) on block %u\n"
4155 +static mddev_t * alloc_mddev (kdev_t dev)
4159 -#define OUT_OF_MEM KERN_ALERT \
4160 -"md: out of memory.\n"
4161 + if (MAJOR(dev) != MD_MAJOR) {
4165 + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
4169 + memset(mddev, 0, sizeof(*mddev));
4171 -#define NO_DEVICE KERN_ERR \
4172 -"md: disabled device %s\n"
4173 + mddev->__minor = MINOR(dev);
4174 + mddev->reconfig_sem = MUTEX;
4175 + mddev->recovery_sem = MUTEX;
4176 + mddev->resync_sem = MUTEX;
4177 + MD_INIT_LIST_HEAD(&mddev->disks);
4179 + * The 'base' mddev is the one with data NULL.
4180 + * personalities can create additional mddevs
4183 + add_mddev_mapping(mddev, dev, 0);
4184 + md_list_add(&mddev->all_mddevs, &all_mddevs);
4191 -static int analyze_one_sb (struct real_dev * rdev)
4192 +static void free_mddev (mddev_t *mddev)
4194 - int ret = FAILURE;
4195 - struct buffer_head *bh;
4196 - kdev_t dev = rdev->dev;
4197 - md_superblock_t *sb;
4204 - * Read the superblock, it's at the end of the disk
4205 + * Make sure nobody else is using this mddev
4206 + * (careful, we rely on the global kernel lock here)
4208 - rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]);
4209 - set_blocksize (dev, MD_SB_BYTES);
4210 - bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
4213 - sb = (md_superblock_t *) bh->b_data;
4214 - if (sb->md_magic != MD_SB_MAGIC) {
4215 - printk (BAD_MAGIC, kdevname(dev),
4216 - sb->md_magic, rdev->sb_offset);
4219 - rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
4221 - printk (OUT_OF_MEM);
4224 - memcpy (rdev->sb, bh->b_data, MD_SB_BYTES);
4225 + while (md_atomic_read(&mddev->resync_sem.count) != 1)
4227 + while (md_atomic_read(&mddev->recovery_sem.count) != 1)
4230 - rdev->size = sb->size;
4232 - printk (NO_DEVICE,kdevname(rdev->dev));
4238 + del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
4239 + md_list_del(&mddev->all_mddevs);
4240 + MD_INIT_LIST_HEAD(&mddev->all_mddevs);
4252 - * Check a full RAID array for plausibility
4254 +struct gendisk * find_gendisk (kdev_t dev)
4256 + struct gendisk *tmp = gendisk_head;
4258 -#define INCONSISTENT KERN_ERR \
4259 -"md: superblock inconsistency -- run ckraid\n"
4260 + while (tmp != NULL) {
4261 + if (tmp->major == MAJOR(dev))
4268 -#define OUT_OF_DATE KERN_ERR \
4269 -"md: superblock update time inconsistenty -- using the most recent one\n"
4270 +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
4272 + mdk_rdev_t * rdev;
4273 + struct md_list_head *tmp;
4275 -#define OLD_VERSION KERN_ALERT \
4276 -"md: %s: unsupported raid array version %d.%d.%d\n"
4277 + ITERATE_RDEV(mddev,rdev,tmp) {
4278 + if (rdev->desc_nr == nr)
4284 -#define NOT_CLEAN KERN_ERR \
4285 -"md: %s: raid array is not clean -- run ckraid\n"
4286 +mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
4288 + struct md_list_head *tmp;
4291 -#define NOT_CLEAN_IGNORE KERN_ERR \
4292 -"md: %s: raid array is not clean -- reconstructing parity\n"
4293 + ITERATE_RDEV(mddev,rdev,tmp) {
4294 + if (rdev->dev == dev)
4300 -#define UNKNOWN_LEVEL KERN_ERR \
4301 -"md: %s: unsupported raid level %d\n"
4302 +static MD_LIST_HEAD(device_names);
4304 -static int analyze_sbs (int minor, int pnum)
4305 +char * partition_name (kdev_t dev)
4307 - struct md_dev *mddev = md_dev + minor;
4308 - int i, N = mddev->nb_dev, out_of_date = 0;
4309 - struct real_dev * disks = mddev->devices;
4310 - md_superblock_t *sb, *freshest = NULL;
4311 + struct gendisk *hd;
4312 + static char nomem [] = "<nomem>";
4313 + dev_name_t *dname;
4314 + struct md_list_head *tmp = device_names.next;
4317 - * RAID-0 and linear don't use a RAID superblock
4319 - if (pnum == RAID0 >> PERSONALITY_SHIFT ||
4320 - pnum == LINEAR >> PERSONALITY_SHIFT)
4321 - return legacy_raid_sb (minor, pnum);
4322 + while (tmp != &device_names) {
4323 + dname = md_list_entry(tmp, dev_name_t, list);
4324 + if (dname->dev == dev)
4325 + return dname->name;
4329 + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
4334 - * Verify the RAID superblock on each real device
4335 + * ok, add this new device name to the list
4337 - for (i = 0; i < N; i++)
4338 - if (analyze_one_sb(disks+i))
4340 + hd = find_gendisk (dev);
4343 + sprintf (dname->name, "[dev %s]", kdevname(dev));
4345 + disk_name (hd, MINOR(dev), dname->name);
4348 + md_list_add(&dname->list, &device_names);
4350 + return dname->name;
4353 +static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
4356 + unsigned int size = 0;
4358 + if (blk_size[MAJOR(dev)])
4359 + size = blk_size[MAJOR(dev)][MINOR(dev)];
4361 + size = MD_NEW_SIZE_BLOCKS(size);
4365 +static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
4367 + unsigned int size;
4369 + size = calc_dev_sboffset(dev, mddev, persistent);
4374 + if (mddev->sb->chunk_size)
4375 + size &= ~(mddev->sb->chunk_size/1024 - 1);
4380 + * We check whether all devices are numbered from 0 to nb_dev-1. The
4381 + * order is guaranteed even after device name changes.
4383 + * Some personalities (raid0, linear) use this. Personalities that
4384 + * provide data have to be able to deal with loss of individual
4385 + * disks, so they do their checking themselves.
4387 +int md_check_ordering (mddev_t *mddev)
4391 + struct md_list_head *tmp;
4394 - * The superblock constant part has to be the same
4395 - * for all disks in the array.
4396 + * First, all devices must be fully functional
4399 - for (i = 0; i < N; i++) {
4407 - disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) {
4408 - printk (INCONSISTENT);
4409 + ITERATE_RDEV(mddev,rdev,tmp) {
4410 + if (rdev->faulty) {
4411 + printk("md: md%d's device %s faulty, aborting.\n",
4412 + mdidx(mddev), partition_name(rdev->dev));
4418 - * OK, we have all disks and the array is ready to run. Let's
4419 - * find the freshest superblock, that one will be the superblock
4420 - * that represents the whole array.
4422 - if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL)
4424 + ITERATE_RDEV(mddev,rdev,tmp) {
4427 + if (c != mddev->nb_dev) {
4431 - for (i = 0; i < N; i++) {
4435 - freshest = disks[i].sb;
4439 - * Find the newest superblock version
4441 - if (disks[i].sb->utime != freshest->utime) {
4443 - if (disks[i].sb->utime > freshest->utime)
4444 - freshest = disks[i].sb;
4448 - printk(OUT_OF_DATE);
4449 - memcpy (sb, freshest, sizeof(*freshest));
4452 - * Check if we can support this RAID array
4454 - if (sb->major_version != MD_MAJOR_VERSION ||
4455 - sb->minor_version > MD_MINOR_VERSION) {
4457 - printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)),
4458 - sb->major_version, sb->minor_version,
4459 - sb->patch_version);
4460 + if (mddev->nb_dev != mddev->sb->raid_disks) {
4461 + printk("md: md%d, array needs %d disks, has %d, aborting.\n",
4462 + mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
4467 - * We need to add this as a superblock option.
4468 + * Now the numbering check
4470 -#if SUPPORT_RECONSTRUCTION
4471 - if (sb->state != (1 << MD_SB_CLEAN)) {
4472 - if (sb->level == 1) {
4473 - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
4474 + for (i = 0; i < mddev->nb_dev; i++) {
4476 + ITERATE_RDEV(mddev,rdev,tmp) {
4477 + if (rdev->desc_nr == i)
4481 + printk("md: md%d, missing disk #%d, aborting.\n",
4485 - printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor)));
4488 - if (sb->state != (1 << MD_SB_CLEAN)) {
4489 - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
4492 -#endif /* SUPPORT_RECONSTRUCTION */
4494 - switch (sb->level) {
4496 - md_size[minor] = sb->size;
4497 - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD;
4501 - md_size[minor] = sb->size * (sb->raid_disks - 1);
4502 - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1);
4505 - printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)),
4509 + printk("md: md%d, too many disks #%d, aborting.\n",
4520 -#undef INCONSISTENT
4526 -int md_update_sb(int minor)
4527 +static unsigned int zoned_raid_size (mddev_t *mddev)
4529 - struct md_dev *mddev = md_dev + minor;
4530 - struct buffer_head *bh;
4531 - md_superblock_t *sb = mddev->sb;
4532 - struct real_dev *realdev;
4536 + unsigned int mask;
4537 + mdk_rdev_t * rdev;
4538 + struct md_list_head *tmp;
4540 - sb->utime = CURRENT_TIME;
4541 - for (i = 0; i < mddev->nb_dev; i++) {
4542 - realdev = mddev->devices + i;
4545 - dev = realdev->dev;
4546 - sb_offset = realdev->sb_offset;
4547 - set_blocksize(dev, MD_SB_BYTES);
4548 - printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset);
4549 - bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
4551 - sb = (md_superblock_t *) bh->b_data;
4552 - memcpy(sb, mddev->sb, MD_SB_BYTES);
4553 - memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4);
4554 - mark_buffer_uptodate(bh, 1);
4555 - mark_buffer_dirty(bh, 1);
4556 - ll_rw_block(WRITE, 1, &bh);
4557 - wait_on_buffer(bh);
4560 - invalidate_buffers(dev);
4562 - printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
4568 + * do size and offset calculations.
4570 + mask = ~(mddev->sb->chunk_size/1024 - 1);
4571 +printk("mask %08x\n", mask);
4573 + ITERATE_RDEV(mddev,rdev,tmp) {
4574 +printk(" rdev->size: %d\n", rdev->size);
4575 + rdev->size &= mask;
4576 +printk(" masked rdev->size: %d\n", rdev->size);
4577 + md_size[mdidx(mddev)] += rdev->size;
4578 +printk(" new md_size: %d\n", md_size[mdidx(mddev)]);
4583 -static int do_md_run (int minor, int repart)
4584 +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
4586 - int pnum, i, min, factor, err;
4587 + if (disk_active(disk)) {
4588 + sb->working_disks--;
4590 + if (disk_spare(disk)) {
4591 + sb->spare_disks--;
4592 + sb->working_disks--;
4594 + sb->failed_disks--;
4600 + mark_disk_removed(disk);
4603 - if (!md_dev[minor].nb_dev)
4606 - if (md_dev[minor].pers)
4608 +#define BAD_MAGIC KERN_ERR \
4609 +"md: invalid raid superblock magic on %s\n"
4611 - md_dev[minor].repartition=repart;
4613 - if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT))
4614 - >= MAX_PERSONALITY)
4617 - /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
4618 - if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){
4619 - for (i = 0; i < md_dev [minor].nb_dev; i++)
4620 - if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR)
4626 - char module_name[80];
4627 - sprintf (module_name, "md-personality-%d", pnum);
4628 - request_module (module_name);
4634 - factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
4636 - for (i=0; i<md_dev[minor].nb_dev; i++)
4637 - if (md_dev[minor].devices[i].size<min)
4639 - printk ("Dev %s smaller than %dk, cannot shrink\n",
4640 - partition_name (md_dev[minor].devices[i].dev), min);
4644 - for (i=0; i<md_dev[minor].nb_dev; i++) {
4645 - fsync_dev(md_dev[minor].devices[i].dev);
4646 - invalidate_buffers(md_dev[minor].devices[i].dev);
4649 - /* Resize devices according to the factor. It is used to align
4650 - partitions size on a given chunk size. */
4654 - * Analyze the raid superblock
4656 - if (analyze_sbs(minor, pnum))
4658 +#define BAD_MINOR KERN_ERR \
4659 +"md: %s: invalid raid minor (%x)\n"
4661 - md_dev[minor].pers=pers[pnum];
4663 - if ((err=md_dev[minor].pers->run (minor, md_dev+minor)))
4665 - md_dev[minor].pers=NULL;
4666 - free_sb(md_dev + minor);
4670 - if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT)
4672 - md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN);
4673 - md_update_sb(minor);
4676 - /* FIXME : We assume here we have blocks
4677 - that are twice as large as sectors.
4678 - THIS MAY NOT BE TRUE !!! */
4679 - md_hd_struct[minor].start_sect=0;
4680 - md_hd_struct[minor].nr_sects=md_size[minor]<<1;
4682 - read_ahead[MD_MAJOR] = 128;
4685 +#define OUT_OF_MEM KERN_ALERT \
4686 +"md: out of memory.\n"
4688 +#define NO_SB KERN_ERR \
4689 +"md: disabled device %s, could not read superblock.\n"
4691 -static int do_md_stop (int minor, struct inode *inode)
4692 +#define BAD_CSUM KERN_WARNING \
4693 +"md: invalid superblock checksum on %s\n"
4695 +static int alloc_array_sb (mddev_t * mddev)
4699 - if (inode->i_count>1 || md_dev[minor].busy>1) {
4701 - * ioctl : one open channel
4703 - printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
4704 - minor, inode->i_count, md_dev[minor].busy);
4708 - if (md_dev[minor].pers) {
4710 - * It is safe to call stop here, it only frees private
4711 - * data. Also, it tells us if a device is unstoppable
4712 - * (eg. resyncing is in progress)
4714 - if (md_dev[minor].pers->stop (minor, md_dev+minor))
4717 - * The device won't exist anymore -> flush it now
4719 - fsync_dev (inode->i_rdev);
4720 - invalidate_buffers (inode->i_rdev);
4721 - if (md_dev[minor].sb) {
4722 - md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
4723 - md_update_sb(minor);
4730 - /* Remove locks. */
4731 - if (md_dev[minor].sb)
4732 - free_sb(md_dev + minor);
4733 - for (i=0; i<md_dev[minor].nb_dev; i++)
4734 - clear_inode (md_dev[minor].devices[i].inode);
4736 - md_dev[minor].nb_dev=md_size[minor]=0;
4737 - md_hd_struct[minor].nr_sects=0;
4738 - md_dev[minor].pers=NULL;
4740 - read_ahead[MD_MAJOR] = 128;
4744 + mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
4747 + md_clear_page((unsigned long)mddev->sb);
4751 -static int do_md_add (int minor, kdev_t dev)
4752 +static int alloc_disk_sb (mdk_rdev_t * rdev)
4756 - struct real_dev *realdev;
4760 - if (md_dev[minor].nb_dev==MAX_REAL)
4761 + rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
4763 + printk (OUT_OF_MEM);
4766 + md_clear_page((unsigned long)rdev->sb);
4768 - if (!fs_may_mount (dev))
4773 - if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) {
4774 - printk("md_add(): zero device size, huh, bailing out.\n");
4776 +static void free_disk_sb (mdk_rdev_t * rdev)
4779 + free_page((unsigned long) rdev->sb);
4781 + rdev->sb_offset = 0;
4784 + if (!rdev->faulty)
4789 - if (md_dev[minor].pers) {
4791 - * The array is already running, hot-add the drive, or
4794 - if (!md_dev[minor].pers->hot_add_disk)
4798 +static void mark_rdev_faulty (mdk_rdev_t * rdev)
4800 + unsigned long flags;
4806 + save_flags(flags);
4808 + free_disk_sb(rdev);
4810 + restore_flags(flags);
4813 +static int read_disk_sb (mdk_rdev_t * rdev)
4815 + int ret = -EINVAL;
4816 + struct buffer_head *bh = NULL;
4817 + kdev_t dev = rdev->dev;
4827 - * Careful. We cannot increase nb_dev for a running array.
4828 + * Calculate the position of the superblock,
4829 + * it's at the end of the disk
4831 - i=md_dev[minor].nb_dev;
4832 - realdev = &md_dev[minor].devices[i];
4835 - /* Lock the device by inserting a dummy inode. This doesn't
4836 - smell very good, but I need to be consistent with the
4837 - mount stuff, specially with fs_may_mount. If someone have
4838 - a better idea, please help ! */
4840 - realdev->inode=get_empty_inode ();
4841 - realdev->inode->i_dev=dev; /* don't care about other fields */
4842 - insert_inode_hash (realdev->inode);
4844 - /* Sizes are now rounded at run time */
4846 -/* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/
4848 - realdev->size=blk_size[MAJOR(dev)][MINOR(dev)];
4849 + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
4850 + rdev->sb_offset = sb_offset;
4851 + printk("(read) %s's sb offset: %d", partition_name(dev),
4854 + set_blocksize (dev, MD_SB_BYTES);
4855 + bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
4859 + sb = (mdp_super_t *) bh->b_data;
4860 + memcpy (rdev->sb, sb, MD_SB_BYTES);
4862 + printk (NO_SB,partition_name(rdev->dev));
4865 + printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events));
4873 +static unsigned int calc_sb_csum (mdp_super_t * sb)
4875 + unsigned int disk_csum, csum;
4877 + disk_csum = sb->sb_csum;
4879 + csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
4880 + sb->sb_csum = disk_csum;
4885 + * Check one RAID superblock for generic plausibility
4888 +static int check_disk_sb (mdk_rdev_t * rdev)
4891 + int ret = -EINVAL;
4899 + if (sb->md_magic != MD_SB_MAGIC) {
4900 + printk (BAD_MAGIC, partition_name(rdev->dev));
4904 + if (sb->md_minor >= MAX_MD_DEVS) {
4905 + printk (BAD_MINOR, partition_name(rdev->dev),
4910 + if (calc_sb_csum(sb) != sb->sb_csum)
4911 + printk(BAD_CSUM, partition_name(rdev->dev));
4917 +static kdev_t dev_unit(kdev_t dev)
4919 + unsigned int mask;
4920 + struct gendisk *hd = find_gendisk(dev);
4924 + mask = ~((1 << hd->minor_shift) - 1);
4926 + return MKDEV(MAJOR(dev), MINOR(dev) & mask);
4929 +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
4931 + struct md_list_head *tmp;
4934 + ITERATE_RDEV(mddev,rdev,tmp)
4935 + if (dev_unit(rdev->dev) == dev_unit(dev))
4941 +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
4943 + struct md_list_head *tmp;
4946 + ITERATE_RDEV(mddev1,rdev,tmp)
4947 + if (match_dev_unit(mddev2, rdev->dev))
4953 +static MD_LIST_HEAD(all_raid_disks);
4954 +static MD_LIST_HEAD(pending_raid_disks);
4956 +static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
4958 + mdk_rdev_t *same_pdev;
4960 + if (rdev->mddev) {
4964 + same_pdev = match_dev_unit(mddev, rdev->dev);
4966 + printk( KERN_WARNING
4967 +"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
4968 +" protection against single-disk failure might be compromised.\n",
4969 + mdidx(mddev), partition_name(rdev->dev),
4970 + partition_name(same_pdev->dev));
4972 + md_list_add(&rdev->same_set, &mddev->disks);
4973 + rdev->mddev = mddev;
4975 + printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
4978 +static void unbind_rdev_from_array (mdk_rdev_t * rdev)
4980 + if (!rdev->mddev) {
4984 + md_list_del(&rdev->same_set);
4985 + MD_INIT_LIST_HEAD(&rdev->same_set);
4986 + rdev->mddev->nb_dev--;
4987 + printk("unbind<%s,%d>\n", partition_name(rdev->dev),
4988 + rdev->mddev->nb_dev);
4989 + rdev->mddev = NULL;
4993 + * prevent the device from being mounted, repartitioned or
4994 + * otherwise reused by a RAID array (or any other kernel
4995 + * subsystem), by opening the device. [simply getting an
4996 + * inode is not enough, the SCSI module usage code needs
4997 + * an explicit open() on the device]
4999 +static int lock_rdev (mdk_rdev_t *rdev)
5004 + * First insert a dummy inode.
5008 + rdev->inode = get_empty_inode();
5010 + * we dont care about any other fields
5012 + rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev;
5013 + insert_inode_hash(rdev->inode);
5015 + memset(&rdev->filp, 0, sizeof(rdev->filp));
5016 + rdev->filp.f_mode = 3; /* read write */
5017 + err = blkdev_open(rdev->inode, &rdev->filp);
5019 + printk("blkdev_open() failed: %d\n", err);
5020 + clear_inode(rdev->inode);
5021 + rdev->inode = NULL;
5026 +static void unlock_rdev (mdk_rdev_t *rdev)
5028 + blkdev_release(rdev->inode);
5031 + clear_inode(rdev->inode);
5032 + rdev->inode = NULL;
5035 +static void export_rdev (mdk_rdev_t * rdev)
5037 + printk("export_rdev(%s)\n",partition_name(rdev->dev));
5040 + unlock_rdev(rdev);
5041 + free_disk_sb(rdev);
5042 + md_list_del(&rdev->all);
5043 + MD_INIT_LIST_HEAD(&rdev->all);
5044 + if (rdev->pending.next != &rdev->pending) {
5045 + printk("(%s was pending)\n",partition_name(rdev->dev));
5046 + md_list_del(&rdev->pending);
5047 + MD_INIT_LIST_HEAD(&rdev->pending);
5054 +static void kick_rdev_from_array (mdk_rdev_t * rdev)
5056 + unbind_rdev_from_array(rdev);
5057 + export_rdev(rdev);
5060 +static void export_array (mddev_t *mddev)
5062 + struct md_list_head *tmp;
5064 + mdp_super_t *sb = mddev->sb;
5068 + free_page((unsigned long) sb);
5071 + ITERATE_RDEV(mddev,rdev,tmp) {
5072 + if (!rdev->mddev) {
5076 + kick_rdev_from_array(rdev);
5078 + if (mddev->nb_dev)
5087 +static void print_desc(mdp_disk_t *desc)
5089 + printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
5090 + partition_name(MKDEV(desc->major,desc->minor)),
5091 + desc->major,desc->minor,desc->raid_disk,desc->state);
5094 +static void print_sb(mdp_super_t *sb)
5098 + printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
5099 + sb->major_version, sb->minor_version, sb->patch_version,
5100 + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
5102 + printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
5103 + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
5104 + sb->layout, sb->chunk_size);
5105 + printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
5106 + sb->utime, sb->state, sb->active_disks, sb->working_disks,
5107 + sb->failed_disks, sb->spare_disks,
5108 + sb->sb_csum, (unsigned long)get_unaligned(&sb->events));
5110 + for (i = 0; i < MD_SB_DISKS; i++) {
5113 + desc = sb->disks + i;
5114 + printk(" D %2d: ", i);
5117 + printk(" THIS: ");
5118 + print_desc(&sb->this_disk);
5122 +static void print_rdev(mdk_rdev_t *rdev)
5124 + printk(" rdev %s: O:%s, SZ:%08d F:%d DN:%d ",
5125 + partition_name(rdev->dev), partition_name(rdev->old_dev),
5126 + rdev->size, rdev->faulty, rdev->desc_nr);
5128 + printk("rdev superblock:\n");
5129 + print_sb(rdev->sb);
5131 + printk("no rdev superblock!\n");
5134 +void md_print_devices (void)
5136 + struct md_list_head *tmp, *tmp2;
5141 + printk(" **********************************\n");
5142 + printk(" * <COMPLETE RAID STATE PRINTOUT> *\n");
5143 + printk(" **********************************\n");
5144 + ITERATE_MDDEV(mddev,tmp) {
5145 + printk("md%d: ", mdidx(mddev));
5147 + ITERATE_RDEV(mddev,rdev,tmp2)
5148 + printk("<%s>", partition_name(rdev->dev));
5151 + printk(" array superblock:\n");
5152 + print_sb(mddev->sb);
5154 + printk(" no array superblock.\n");
5156 + ITERATE_RDEV(mddev,rdev,tmp2)
5159 + printk(" **********************************\n");
5163 +static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
5166 + mdp_super_t *tmp1, *tmp2;
5168 + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
5169 + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
5171 + if (!tmp1 || !tmp2) {
5180 + * nr_disks is not constant
5182 + tmp1->nr_disks = 0;
5183 + tmp2->nr_disks = 0;
5185 + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
5199 +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
5201 + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
5202 + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
5203 + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
5204 + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
5211 +static mdk_rdev_t * find_rdev_all (kdev_t dev)
5213 + struct md_list_head *tmp;
5216 + tmp = all_raid_disks.next;
5217 + while (tmp != &all_raid_disks) {
5218 + rdev = md_list_entry(tmp, mdk_rdev_t, all);
5219 + if (rdev->dev == dev)
5226 +#define GETBLK_FAILED KERN_ERR \
5227 +"md: getblk failed for device %s\n"
5229 +static int write_disk_sb(mdk_rdev_t * rdev)
5231 + struct buffer_head *bh;
5233 + u32 sb_offset, size;
5240 + if (rdev->faulty) {
5244 + if (rdev->sb->md_magic != MD_SB_MAGIC) {
5250 + sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
5251 + if (rdev->sb_offset != sb_offset) {
5252 + printk("%s's sb offset has changed from %d to %d, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
5256 + * If the disk went offline meanwhile and it's just a spare, then
5257 + * its size has changed to zero silently, and the MD code does
5258 + * not yet know that it's faulty.
5260 + size = calc_dev_size(dev, rdev->mddev, 1);
5261 + if (size != rdev->size) {
5262 + printk("%s's size has changed from %d to %d since import, skipping\n", partition_name(dev), rdev->size, size);
5266 + printk("(write) %s's sb offset: %d\n", partition_name(dev), sb_offset);
5268 + set_blocksize(dev, MD_SB_BYTES);
5269 + bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
5271 + printk(GETBLK_FAILED, partition_name(dev));
5274 + memset(bh->b_data,0,bh->b_size);
5275 + sb = (mdp_super_t *) bh->b_data;
5276 + memcpy(sb, rdev->sb, MD_SB_BYTES);
5278 + mark_buffer_uptodate(bh, 1);
5279 + mark_buffer_dirty(bh, 1);
5280 + ll_rw_block(WRITE, 1, &bh);
5281 + wait_on_buffer(bh);
5287 +#undef GETBLK_FAILED
5289 +static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
5294 + for (i = 0; i < MD_SB_DISKS; i++) {
5295 + desc = mddev->sb->disks + i;
5297 + if (disk_faulty(desc)) {
5298 + if (MKDEV(desc->major,desc->minor) == rdev->dev)
5303 + if (MKDEV(desc->major,desc->minor) == rdev->dev) {
5304 + rdev->sb->this_disk = *desc;
5305 + rdev->desc_nr = desc->number;
5316 +static int sync_sbs(mddev_t * mddev)
5320 + struct md_list_head *tmp;
5322 + ITERATE_RDEV(mddev,rdev,tmp) {
5327 + set_this_disk(mddev, rdev);
5328 + sb->sb_csum = calc_sb_csum(sb);
5333 +int md_update_sb(mddev_t * mddev)
5335 + int first, err, count = 100;
5336 + struct md_list_head *tmp;
5341 + mddev->sb->utime = CURRENT_TIME;
5342 + ev = get_unaligned(&mddev->sb->events);
5344 + put_unaligned(ev,&mddev->sb->events);
5345 + if (ev == (__u64)0) {
5347 + * oops, this 64-bit counter should never wrap.
5348 + * Either we are in around ~1 trillion A.C., assuming
5349 + * 1 reboot per second, or we have a bug:
5353 + put_unaligned(ev,&mddev->sb->events);
5358 + * do not write anything to disk if using
5359 + * nonpersistent superblocks
5361 + if (mddev->sb->not_persistent)
5364 + printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
5369 + ITERATE_RDEV(mddev,rdev,tmp) {
5375 + printk("(skipping faulty ");
5376 + printk("%s ", partition_name(rdev->dev));
5377 + if (!rdev->faulty) {
5378 + printk("[events: %08lx]",
5379 + (unsigned long)get_unaligned(&rdev->sb->events));
5380 + err += write_disk_sb(rdev);
5386 + printk("errors occured during superblock update, repeating\n");
5389 + printk("excessive errors occured during superblock update, exiting\n");
5395 + * Import a device. If 'on_disk', then sanity check the superblock
5397 + * mark the device faulty if:
5399 + * - the device is nonexistent (zero size)
5400 + * - the device has no valid superblock
5402 + * a faulty rdev _never_ has rdev->sb set.
5404 +static int md_import_device (kdev_t newdev, int on_disk)
5408 + unsigned int size;
5410 + if (find_rdev_all(newdev))
5413 + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
5415 + printk("could not alloc mem for %s!\n", partition_name(newdev));
5418 + memset(rdev, 0, sizeof(*rdev));
5420 + if (!fs_may_mount(newdev)) {
5421 + printk("md: can not import %s, has active inodes!\n",
5422 + partition_name(newdev));
5427 + if ((err = alloc_disk_sb(rdev)))
5430 + rdev->dev = newdev;
5431 + if (lock_rdev(rdev)) {
5432 + printk("md: could not lock %s, zero-size? Marking faulty.\n",
5433 + partition_name(newdev));
5437 + rdev->desc_nr = -1;
5441 + if (blk_size[MAJOR(newdev)])
5442 + size = blk_size[MAJOR(newdev)][MINOR(newdev)];
5444 + printk("md: %s has zero size, marking faulty!\n",
5445 + partition_name(newdev));
5451 + if ((err = read_disk_sb(rdev))) {
5452 + printk("md: could not read %s's sb, not importing!\n",
5453 + partition_name(newdev));
5456 + if ((err = check_disk_sb(rdev))) {
5457 + printk("md: %s has invalid sb, not importing!\n",
5458 + partition_name(newdev));
5462 + rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
5463 + rdev->sb->this_disk.minor);
5464 + rdev->desc_nr = rdev->sb->this_disk.number;
5466 + md_list_add(&rdev->all, &all_raid_disks);
5467 + MD_INIT_LIST_HEAD(&rdev->pending);
5469 + if (rdev->faulty && rdev->sb)
5470 + free_disk_sb(rdev);
5476 + unlock_rdev(rdev);
5477 + free_disk_sb(rdev);
5484 + * Check a full RAID array for plausibility
5487 +#define INCONSISTENT KERN_ERR \
5488 +"md: fatal superblock inconsistency in %s -- removing from array\n"
5490 +#define OUT_OF_DATE KERN_ERR \
5491 +"md: superblock update time inconsistency -- using the most recent one\n"
5493 +#define OLD_VERSION KERN_ALERT \
5494 +"md: md%d: unsupported raid array version %d.%d.%d\n"
5496 +#define NOT_CLEAN_IGNORE KERN_ERR \
5497 +"md: md%d: raid array is not clean -- starting background reconstruction\n"
5499 +#define UNKNOWN_LEVEL KERN_ERR \
5500 +"md: md%d: unsupported raid level %d\n"
5502 +static int analyze_sbs (mddev_t * mddev)
5504 + int out_of_date = 0, i;
5505 + struct md_list_head *tmp, *tmp2;
5506 + mdk_rdev_t *rdev, *rdev2, *freshest;
5510 + * Verify the RAID superblock on each real device
5512 + ITERATE_RDEV(mddev,rdev,tmp) {
5513 + if (rdev->faulty) {
5521 + if (check_disk_sb(rdev))
5526 + * The superblock constant part has to be the same
5527 + * for all disks in the array.
5531 + ITERATE_RDEV(mddev,rdev,tmp) {
5536 + if (!sb_equal(sb, rdev->sb)) {
5537 + printk (INCONSISTENT, partition_name(rdev->dev));
5538 + kick_rdev_from_array(rdev);
5544 + * OK, we have all disks and the array is ready to run. Let's
5545 + * find the freshest superblock, that one will be the superblock
5546 + * that represents the whole array.
5549 + if (alloc_array_sb(mddev))
5554 + ITERATE_RDEV(mddev,rdev,tmp) {
5557 + * if the checksum is invalid, use the superblock
5558 + * only as a last resort. (decrease its age by
5561 + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
5562 + __u64 ev = get_unaligned(&rdev->sb->events);
5563 + if (ev != (__u64)0) {
5565 + put_unaligned(ev,&rdev->sb->events);
5569 + printk("%s's event counter: %08lx\n", partition_name(rdev->dev),
5570 + (unsigned long)get_unaligned(&rdev->sb->events));
5576 + * Find the newest superblock version
5578 + ev1 = get_unaligned(&rdev->sb->events);
5579 + ev2 = get_unaligned(&freshest->sb->events);
5586 + if (out_of_date) {
5587 + printk(OUT_OF_DATE);
5588 + printk("freshest: %s\n", partition_name(freshest->dev));
5590 + memcpy (sb, freshest->sb, sizeof(*sb));
5593 + * at this point we have picked the 'best' superblock
5594 + * from all available superblocks.
5595 + * now we validate this superblock and kick out possibly
5598 + ITERATE_RDEV(mddev,rdev,tmp) {
5600 + * Kick all non-fresh devices faulty
5603 + ev1 = get_unaligned(&rdev->sb->events);
5604 + ev2 = get_unaligned(&sb->events);
5607 + printk("md: kicking non-fresh %s from array!\n",
5608 + partition_name(rdev->dev));
5609 + kick_rdev_from_array(rdev);
5615 + * Fix up changed device names ... but only if this disk has a
5616 + * recent update time. Use faulty checksum ones too.
5618 + ITERATE_RDEV(mddev,rdev,tmp) {
5619 + __u64 ev1, ev2, ev3;
5620 + if (rdev->faulty) { /* REMOVEME */
5624 + ev1 = get_unaligned(&rdev->sb->events);
5625 + ev2 = get_unaligned(&sb->events);
5628 + if ((rdev->dev != rdev->old_dev) &&
5629 + ((ev1 == ev2) || (ev1 == ev3))) {
5632 + printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev));
5633 + if (rdev->desc_nr == -1) {
5637 + desc = &sb->disks[rdev->desc_nr];
5638 + if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
5642 + desc->major = MAJOR(rdev->dev);
5643 + desc->minor = MINOR(rdev->dev);
5644 + desc = &rdev->sb->this_disk;
5645 + desc->major = MAJOR(rdev->dev);
5646 + desc->minor = MINOR(rdev->dev);
5651 + * Remove unavailable and faulty devices ...
5653 + * note that if an array becomes completely unrunnable due to
5654 + * missing devices, we do not write the superblock back, so the
5655 + * administrator has a chance to fix things up. The removal thus
5656 + * only happens if it's nonfatal to the contents of the array.
5658 + for (i = 0; i < MD_SB_DISKS; i++) {
5663 + desc = sb->disks + i;
5664 + dev = MKDEV(desc->major, desc->minor);
5667 + * We kick faulty devices/descriptors immediately.
5669 + if (disk_faulty(desc)) {
5671 + ITERATE_RDEV(mddev,rdev,tmp) {
5672 + if (rdev->desc_nr != desc->number)
5674 + printk("md%d: kicking faulty %s!\n",
5675 + mdidx(mddev),partition_name(rdev->dev));
5676 + kick_rdev_from_array(rdev);
5681 + if (dev == MKDEV(0,0))
5683 + printk("md%d: removing former faulty %s!\n",
5684 + mdidx(mddev), partition_name(dev));
5686 + remove_descriptor(desc, sb);
5690 + if (dev == MKDEV(0,0))
5693 + * Is this device present in the rdev ring?
5696 + ITERATE_RDEV(mddev,rdev,tmp) {
5697 + if (rdev->desc_nr == desc->number) {
5705 + printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev));
5706 + remove_descriptor(desc, sb);
5710 +	 * Double check whether all devices mentioned in the
5711 + * superblock are in the rdev ring.
5713 + for (i = 0; i < MD_SB_DISKS; i++) {
5717 + desc = sb->disks + i;
5718 + dev = MKDEV(desc->major, desc->minor);
5720 + if (dev == MKDEV(0,0))
5723 + if (disk_faulty(desc)) {
5728 + rdev = find_rdev(mddev, dev);
5736 + * Do a final reality check.
5738 + ITERATE_RDEV(mddev,rdev,tmp) {
5739 + if (rdev->desc_nr == -1) {
5744 + * is the desc_nr unique?
5746 + ITERATE_RDEV(mddev,rdev2,tmp2) {
5747 + if ((rdev2 != rdev) &&
5748 + (rdev2->desc_nr == rdev->desc_nr)) {
5754 + * is the device unique?
5756 + ITERATE_RDEV(mddev,rdev2,tmp2) {
5757 + if ((rdev2 != rdev) &&
5758 + (rdev2->dev == rdev->dev)) {
5766 + * Check if we can support this RAID array
5768 + if (sb->major_version != MD_MAJOR_VERSION ||
5769 + sb->minor_version > MD_MINOR_VERSION) {
5771 + printk (OLD_VERSION, mdidx(mddev), sb->major_version,
5772 + sb->minor_version, sb->patch_version);
5776 + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
5777 + (sb->level == 4) || (sb->level == 5)))
5778 + printk (NOT_CLEAN_IGNORE, mdidx(mddev));
5785 +#undef INCONSISTENT
5790 +static int device_size_calculation (mddev_t * mddev)
5792 + int data_disks = 0, persistent;
5793 + unsigned int readahead;
5794 + mdp_super_t *sb = mddev->sb;
5795 + struct md_list_head *tmp;
5799 + * Do device size calculation. Bail out if too small.
5800 + * (we have to do this after having validated chunk_size,
5801 + * because device size has to be modulo chunk_size)
5803 + persistent = !mddev->sb->not_persistent;
5804 + ITERATE_RDEV(mddev,rdev,tmp) {
5811 + rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
5812 + if (rdev->size < sb->chunk_size / 1024) {
5813 + printk (KERN_WARNING
5814 + "Dev %s smaller than chunk_size: %dk < %dk\n",
5815 + partition_name(rdev->dev),
5816 + rdev->size, sb->chunk_size / 1024);
5821 + switch (sb->level) {
5829 + zoned_raid_size(mddev);
5833 + zoned_raid_size(mddev);
5834 + data_disks = sb->raid_disks;
5841 + data_disks = sb->raid_disks-1;
5844 + printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
5847 + if (!md_size[mdidx(mddev)])
5848 + md_size[mdidx(mddev)] = sb->size * data_disks;
5850 + readahead = MD_READAHEAD;
5851 + if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5))
5852 + readahead = mddev->sb->chunk_size * 4 * data_disks;
5853 + if (readahead < data_disks * MAX_SECTORS*512*2)
5854 + readahead = data_disks * MAX_SECTORS*512*2;
5856 + if (sb->level == -3)
5859 + md_maxreadahead[mdidx(mddev)] = readahead;
5861 + printk(KERN_INFO "md%d: max total readahead window set to %dk\n",
5862 + mdidx(mddev), readahead/1024);
5865 + "md%d: %d data-disks, max readahead per data-disk: %dk\n",
5866 + mdidx(mddev), data_disks, readahead/data_disks/1024);
5873 +#define TOO_BIG_CHUNKSIZE KERN_ERR \
5874 +"too big chunk_size: %d > %d\n"
5876 +#define TOO_SMALL_CHUNKSIZE KERN_ERR \
5877 +"too small chunk_size: %d < %ld\n"
5879 +#define BAD_CHUNKSIZE KERN_ERR \
5880 +"no chunksize specified, see 'man raidtab'\n"
5882 +static int do_md_run (mddev_t * mddev)
5886 + struct md_list_head *tmp;
5890 + if (!mddev->nb_dev) {
5899 + * Resize disks to align partitions size on a given
5902 + md_size[mdidx(mddev)] = 0;
5905 + * Analyze all RAID superblock(s)
5907 + if (analyze_sbs(mddev)) {
5912 + chunk_size = mddev->sb->chunk_size;
5913 + pnum = level_to_pers(mddev->sb->level);
5915 + mddev->param.chunk_size = chunk_size;
5916 + mddev->param.personality = pnum;
5918 + if (chunk_size > MAX_CHUNK_SIZE) {
5919 + printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
5923 + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
5925 + if ( (1 << ffz(~chunk_size)) != chunk_size) {
5929 + if (chunk_size < PAGE_SIZE) {
5930 + printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
5934 + if (pnum >= MAX_PERSONALITY) {
5939 + if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
5941 + * 'default chunksize' in the old md code used to
5942 + * be PAGE_SIZE, baaad.
5943 +		 * we abort here to be on the safe side. We don't
5944 + * want to continue the bad practice.
5946 + printk(BAD_CHUNKSIZE);
5953 + char module_name[80];
5954 + sprintf (module_name, "md-personality-%d", pnum);
5955 + request_module (module_name);
5961 + if (device_size_calculation(mddev))
5965 + * Drop all container device buffers, from now on
5966 + * the only valid external interface is through the md
5969 + ITERATE_RDEV(mddev,rdev,tmp) {
5972 + fsync_dev(rdev->dev);
5973 + invalidate_buffers(rdev->dev);
5976 + mddev->pers = pers[pnum];
5978 + err = mddev->pers->run(mddev);
5980 + printk("pers->run() failed ...\n");
5981 + mddev->pers = NULL;
5985 + mddev->sb->state &= ~(1 << MD_SB_CLEAN);
5986 + md_update_sb(mddev);
5989 + * md_size has units of 1K blocks, which are
5990 + * twice as large as sectors.
5992 + md_hd_struct[mdidx(mddev)].start_sect = 0;
5993 + md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1;
5995 + read_ahead[MD_MAJOR] = 1024;
5999 +#undef TOO_BIG_CHUNKSIZE
6000 +#undef BAD_CHUNKSIZE
6002 +#define OUT(x) do { err = (x); goto out; } while (0)
6004 +static int restart_array (mddev_t *mddev)
6009 + * Complain if it has no devices
6011 + if (!mddev->nb_dev)
6014 + if (mddev->pers) {
6019 + set_device_ro(mddev_to_kdev(mddev), 0);
6022 + "md%d switched to read-write mode.\n", mdidx(mddev));
6024 + * Kick recovery or resync if necessary
6026 + md_recover_arrays();
6027 + if (mddev->pers->restart_resync)
6028 + mddev->pers->restart_resync(mddev);
6036 +#define STILL_MOUNTED KERN_WARNING \
6037 +"md: md%d still mounted.\n"
6039 +static int do_md_stop (mddev_t * mddev, int ro)
6041 + int err = 0, resync_interrupted = 0;
6042 + kdev_t dev = mddev_to_kdev(mddev);
6044 + if (!ro && !fs_may_mount (dev)) {
6045 + printk (STILL_MOUNTED, mdidx(mddev));
6050 + * complain if it's already stopped
6052 + if (!mddev->nb_dev)
6055 + if (mddev->pers) {
6057 + * It is safe to call stop here, it only frees private
6058 + * data. Also, it tells us if a device is unstoppable
6059 + * (eg. resyncing is in progress)
6061 + if (mddev->pers->stop_resync)
6062 + if (mddev->pers->stop_resync(mddev))
6063 + resync_interrupted = 1;
6065 + if (mddev->recovery_running)
6066 + md_interrupt_thread(md_recovery_thread);
6069 + * This synchronizes with signal delivery to the
6070 + * resync or reconstruction thread. It also nicely
6071 + * hangs the process if some reconstruction has not
6074 + down(&mddev->recovery_sem);
6075 + up(&mddev->recovery_sem);
6078 + * sync and invalidate buffers because we cannot kill the
6079 + * main thread with valid IO transfers still around.
6080 + * the kernel lock protects us from new requests being
6081 + * added after invalidate_buffers().
6083 + fsync_dev (mddev_to_kdev(mddev));
6085 + invalidate_buffers (dev);
6093 + set_device_ro(dev, 0);
6094 + if (mddev->pers->stop(mddev)) {
6096 + set_device_ro(dev, 1);
6104 + * mark it clean only if there was no resync
6107 + if (!mddev->recovery_running && !resync_interrupted) {
6108 + printk("marking sb clean...\n");
6109 + mddev->sb->state |= 1 << MD_SB_CLEAN;
6111 + md_update_sb(mddev);
6114 + set_device_ro(dev, 1);
6118 + * Free resources if final stop
6121 + export_array(mddev);
6122 + md_size[mdidx(mddev)] = 0;
6123 + md_hd_struct[mdidx(mddev)].nr_sects = 0;
6124 + free_mddev(mddev);
6126 + printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
6129 + "md%d switched to read-only mode.\n", mdidx(mddev));
6137 + * We have to safely support old arrays too.
6139 +int detect_old_array (mdp_super_t *sb)
6141 + if (sb->major_version > 0)
6143 + if (sb->minor_version >= 90)
6150 +static void autorun_array (mddev_t *mddev)
6153 + struct md_list_head *tmp;
6156 + if (mddev->disks.prev == &mddev->disks) {
6161 + printk("running: ");
6163 + ITERATE_RDEV(mddev,rdev,tmp) {
6164 + printk("<%s>", partition_name(rdev->dev));
6166 + printk("\nnow!\n");
6168 + err = do_md_run (mddev);
6170 + printk("do_md_run() returned %d\n", err);
6172 + * prevent the writeback of an unrunnable array
6174 + mddev->sb_dirty = 0;
6175 + do_md_stop (mddev, 0);
6180 + * lets try to run arrays based on all disks that have arrived
6181 + * until now. (those are in the ->pending list)
6183 + * the method: pick the first pending disk, collect all disks with
6184 + * the same UUID, remove all from the pending list and put them into
6185 + * the 'same_array' list. Then order this list based on superblock
6186 + * update time (freshest comes first), kick out 'old' disks and
6187 + * compare superblocks. If everything's fine then run it.
6189 +static void autorun_devices (void)
6191 + struct md_list_head candidates;
6192 + struct md_list_head *tmp;
6193 + mdk_rdev_t *rdev0, *rdev;
6198 + printk("autorun ...\n");
6199 + while (pending_raid_disks.next != &pending_raid_disks) {
6200 + rdev0 = md_list_entry(pending_raid_disks.next,
6201 + mdk_rdev_t, pending);
6203 + printk("considering %s ...\n", partition_name(rdev0->dev));
6204 + MD_INIT_LIST_HEAD(&candidates);
6205 + ITERATE_RDEV_PENDING(rdev,tmp) {
6206 + if (uuid_equal(rdev0, rdev)) {
6207 + if (!sb_equal(rdev0->sb, rdev->sb)) {
6208 + printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev));
6211 + printk(" adding %s ...\n", partition_name(rdev->dev));
6212 + md_list_del(&rdev->pending);
6213 + md_list_add(&rdev->pending, &candidates);
6217 - * Check the superblock for consistency.
6218 - * The personality itself has to check whether it's getting
6219 - * added with the proper flags. The personality has to be
6221 + * now we have a set of devices, with all of them having
6222 + * mostly sane superblocks. It's time to allocate the
6225 - if (analyze_one_sb (realdev))
6226 + md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
6227 + mddev = kdev_to_mddev(md_kdev);
6229 + printk("md%d already running, cannot run %s\n",
6230 + mdidx(mddev), partition_name(rdev0->dev));
6231 + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
6232 + export_rdev(rdev);
6235 + mddev = alloc_mddev(md_kdev);
6236 + printk("created md%d\n", mdidx(mddev));
6237 + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
6238 + bind_rdev_to_array(rdev, mddev);
6239 + md_list_del(&rdev->pending);
6240 + MD_INIT_LIST_HEAD(&rdev->pending);
6242 + autorun_array(mddev);
6244 + printk("... autorun DONE.\n");
6248 + * import RAID devices based on one partition
6249 + * if possible, the array gets run as well.
6252 +#define BAD_VERSION KERN_ERR \
6253 +"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
6255 +#define OUT_OF_MEM KERN_ALERT \
6256 +"md: out of memory.\n"
6258 +#define NO_DEVICE KERN_ERR \
6259 +"md: disabled device %s\n"
6261 +#define AUTOADD_FAILED KERN_ERR \
6262 +"md: auto-adding devices to md%d FAILED (error %d).\n"
6264 +#define AUTOADD_FAILED_USED KERN_ERR \
6265 +"md: cannot auto-add device %s to md%d, already used.\n"
6267 +#define AUTORUN_FAILED KERN_ERR \
6268 +"md: auto-running md%d FAILED (error %d).\n"
6270 +#define MDDEV_BUSY KERN_ERR \
6271 +"md: cannot auto-add to md%d, already running.\n"
6273 +#define AUTOADDING KERN_INFO \
6274 +"md: auto-adding devices to md%d, based on %s's superblock.\n"
6276 +#define AUTORUNNING KERN_INFO \
6277 +"md: auto-running md%d.\n"
6279 +static int autostart_array (kdev_t startdev)
6281 + int err = -EINVAL, i;
6282 + mdp_super_t *sb = NULL;
6283 + mdk_rdev_t *start_rdev = NULL, *rdev;
6285 + if (md_import_device(startdev, 1)) {
6286 + printk("could not import %s!\n", partition_name(startdev));
6290 + start_rdev = find_rdev_all(startdev);
6291 + if (!start_rdev) {
6295 + if (start_rdev->faulty) {
6296 + printk("can not autostart based on faulty %s!\n",
6297 + partition_name(startdev));
6300 + md_list_add(&start_rdev->pending, &pending_raid_disks);
6302 + sb = start_rdev->sb;
6304 + err = detect_old_array(sb);
6306 + printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
6310 + for (i = 0; i < MD_SB_DISKS; i++) {
6314 + desc = sb->disks + i;
6315 + dev = MKDEV(desc->major, desc->minor);
6317 + if (dev == MKDEV(0,0))
6319 + if (dev == startdev)
6321 + if (md_import_device(dev, 1)) {
6322 + printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev));
6325 + rdev = find_rdev_all(dev);
6330 + md_list_add(&rdev->pending, &pending_raid_disks);
6334 + * possibly return codes
6336 + autorun_devices();
6341 + export_rdev(start_rdev);
6348 +#undef AUTOADD_FAILED_USED
6349 +#undef AUTOADD_FAILED
6350 +#undef AUTORUN_FAILED
6358 +} raid_setup_args md__initdata = { 0, 0 };
6361 + * Searches all registered partitions for autorun RAID arrays
6364 +md__initfunc(void autodetect_raid(void))
6366 +#ifdef CONFIG_AUTODETECT_RAID
6367 + struct gendisk *disk;
6371 + if (raid_setup_args.noautodetect) {
6372 + printk(KERN_INFO "skipping autodetection of RAID arrays\n");
6375 + printk(KERN_INFO "autodetecting RAID arrays\n");
6377 + for (disk = gendisk_head ; disk ; disk = disk->next) {
6378 + for (i = 0; i < disk->max_p*disk->max_nr; i++) {
6379 + kdev_t dev = MKDEV(disk->major,i);
6381 + if (disk->part[i].type == LINUX_OLD_RAID_PARTITION) {
6383 +"md: %s's partition type has to be changed from type 0x86 to type 0xfd\n"
6384 +" to maintain interoperability with other OSs! Autodetection support for\n"
6385 +" type 0x86 will be deleted after some migration timeout. Sorry.\n",
6386 + partition_name(dev));
6387 + disk->part[i].type = LINUX_RAID_PARTITION;
6389 + if (disk->part[i].type != LINUX_RAID_PARTITION)
6392 + if (md_import_device(dev,1)) {
6393 + printk(KERN_ALERT "could not import %s!\n",
6394 + partition_name(dev));
6400 + rdev = find_rdev_all(dev);
6405 + if (rdev->faulty) {
6409 + md_list_add(&rdev->pending, &pending_raid_disks);
6413 + autorun_devices();
6417 +static int get_version (void * arg)
6419 + mdu_version_t ver;
6421 + ver.major = MD_MAJOR_VERSION;
6422 + ver.minor = MD_MINOR_VERSION;
6423 + ver.patchlevel = MD_PATCHLEVEL_VERSION;
6425 + if (md_copy_to_user(arg, &ver, sizeof(ver)))
6431 +#define SET_FROM_SB(x) info.x = mddev->sb->x
6432 +static int get_array_info (mddev_t * mddev, void * arg)
6434 + mdu_array_info_t info;
6439 + SET_FROM_SB(major_version);
6440 + SET_FROM_SB(minor_version);
6441 + SET_FROM_SB(patch_version);
6442 + SET_FROM_SB(ctime);
6443 + SET_FROM_SB(level);
6444 + SET_FROM_SB(size);
6445 + SET_FROM_SB(nr_disks);
6446 + SET_FROM_SB(raid_disks);
6447 + SET_FROM_SB(md_minor);
6448 + SET_FROM_SB(not_persistent);
6450 + SET_FROM_SB(utime);
6451 + SET_FROM_SB(state);
6452 + SET_FROM_SB(active_disks);
6453 + SET_FROM_SB(working_disks);
6454 + SET_FROM_SB(failed_disks);
6455 + SET_FROM_SB(spare_disks);
6457 + SET_FROM_SB(layout);
6458 + SET_FROM_SB(chunk_size);
6460 + if (md_copy_to_user(arg, &info, sizeof(info)))
6467 +#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
6468 +static int get_disk_info (mddev_t * mddev, void * arg)
6470 + mdu_disk_info_t info;
6476 + if (md_copy_from_user(&info, arg, sizeof(info)))
6480 + if (nr >= mddev->sb->nr_disks)
6483 + SET_FROM_SB(major);
6484 + SET_FROM_SB(minor);
6485 + SET_FROM_SB(raid_disk);
6486 + SET_FROM_SB(state);
6488 + if (md_copy_to_user(arg, &info, sizeof(info)))
6495 +#define SET_SB(x) mddev->sb->disks[nr].x = info.x
6497 +static int add_new_disk (mddev_t * mddev, void * arg)
6499 + int err, size, persistent;
6500 + mdu_disk_info_t info;
6508 + if (md_copy_from_user(&info, arg, sizeof(info)))
6512 + if (nr >= mddev->sb->nr_disks)
6515 + dev = MKDEV(info.major,info.minor);
6517 + if (find_rdev_all(dev)) {
6518 + printk("device %s already used in a RAID array!\n",
6519 + partition_name(dev));
6526 + SET_SB(raid_disk);
6529 + if ((info.state & (1<<MD_DISK_FAULTY))==0) {
6530 + err = md_import_device (dev, 0);
6532 + printk("md: error, md_import_device() returned %d\n", err);
6535 + rdev = find_rdev_all(dev);
6541 + rdev->old_dev = dev;
6542 + rdev->desc_nr = info.number;
6544 + bind_rdev_to_array(rdev, mddev);
6546 + persistent = !mddev->sb->not_persistent;
6548 + printk("nonpersistent superblock ...\n");
6549 + if (!mddev->sb->chunk_size)
6550 + printk("no chunksize?\n");
6552 + size = calc_dev_size(dev, mddev, persistent);
6553 + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
6555 + if (!mddev->sb->size || (mddev->sb->size > size))
6556 + mddev->sb->size = size;
6560 + * sync all other superblocks with the main superblock
6568 +static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
6577 + printk("trying to remove %s from md%d ... \n",
6578 + partition_name(dev), mdidx(mddev));
6580 + if (!mddev->pers->diskop) {
6581 + printk("md%d: personality does not support diskops!\n",
6586 + rdev = find_rdev(mddev, dev);
6590 + if (rdev->desc_nr == -1) {
6594 + disk = &mddev->sb->disks[rdev->desc_nr];
6595 + if (disk_active(disk))
6597 + if (disk_removed(disk)) {
6602 + err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
6603 + if (err == -EBUSY)
6610 + remove_descriptor(disk, mddev->sb);
6611 + kick_rdev_from_array(rdev);
6612 + mddev->sb_dirty = 1;
6613 + md_update_sb(mddev);
6617 + printk("cannot remove active disk %s from md%d ... \n",
6618 + partition_name(dev), mdidx(mddev));
6622 +static int hot_add_disk (mddev_t * mddev, kdev_t dev)
6624 + int i, err, persistent;
6625 + unsigned int size;
6632 + printk("trying to hot-add %s to md%d ... \n",
6633 + partition_name(dev), mdidx(mddev));
6635 + if (!mddev->pers->diskop) {
6636 + printk("md%d: personality does not support diskops!\n",
6641 + persistent = !mddev->sb->not_persistent;
6642 + size = calc_dev_size(dev, mddev, persistent);
6644 + if (size < mddev->sb->size) {
6645 + printk("md%d: disk size %d blocks < array size %d\n",
6646 + mdidx(mddev), size, mddev->sb->size);
6650 + rdev = find_rdev(mddev, dev);
6654 + err = md_import_device (dev, 0);
6656 + printk("md: error, md_import_device() returned %d\n", err);
6659 + rdev = find_rdev_all(dev);
6664 + if (rdev->faulty) {
6665 + printk("md: can not hot-add faulty %s disk to md%d!\n",
6666 + partition_name(dev), mdidx(mddev));
6668 + goto abort_export;
6670 + bind_rdev_to_array(rdev, mddev);
6673 + * The rest should better be atomic, we can have disk failures
6674 + * noticed in interrupt contexts ...
6677 + rdev->old_dev = dev;
6678 + rdev->size = size;
6679 + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
6681 + disk = mddev->sb->disks + mddev->sb->raid_disks;
6682 + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
6683 + disk = mddev->sb->disks + i;
6685 + if (!disk->major && !disk->minor)
6687 + if (disk_removed(disk))
6690 + if (i == MD_SB_DISKS) {
6692 + printk("md%d: can not hot-add to full array!\n", mdidx(mddev));
6694 + goto abort_unbind_export;
6697 + if (disk_removed(disk)) {
6699 - * hot_add has to bump up nb_dev itself
6702 - if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) {
6704 - * FIXME: here we should free up the inode and stuff
6706 - printk ("FIXME\n");
6708 + if (disk->number != i) {
6712 + goto abort_unbind_export;
6715 - md_dev[minor].nb_dev++;
6720 - printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
6722 + disk->raid_disk = disk->number;
6723 + disk->major = MAJOR(dev);
6724 + disk->minor = MINOR(dev);
6726 + if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
6730 + goto abort_unbind_export;
6733 + mark_disk_spare(disk);
6734 + mddev->sb->nr_disks++;
6735 + mddev->sb->spare_disks++;
6736 + mddev->sb->working_disks++;
6738 + mddev->sb_dirty = 1;
6741 + md_update_sb(mddev);
6744 + * Kick recovery, maybe this spare has to be added to the
6745 + * array immediately.
6747 + md_recover_arrays();
6751 +abort_unbind_export:
6752 + unbind_rdev_from_array(rdev);
6755 + export_rdev(rdev);
6759 +#define SET_SB(x) mddev->sb->x = info.x
6760 +static int set_array_info (mddev_t * mddev, void * arg)
6762 + mdu_array_info_t info;
6765 + printk("array md%d already has a superblock!\n",
6770 + if (md_copy_from_user(&info, arg, sizeof(info)))
6773 + if (alloc_array_sb(mddev))
6776 + mddev->sb->major_version = MD_MAJOR_VERSION;
6777 + mddev->sb->minor_version = MD_MINOR_VERSION;
6778 + mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
6779 + mddev->sb->ctime = CURRENT_TIME;
6784 + SET_SB(raid_disks);
6786 + SET_SB(not_persistent);
6789 + SET_SB(active_disks);
6790 + SET_SB(working_disks);
6791 + SET_SB(failed_disks);
6792 + SET_SB(spare_disks);
6795 + SET_SB(chunk_size);
6797 + mddev->sb->md_magic = MD_SB_MAGIC;
6800 + * Generate a 128 bit UUID
6802 + get_random_bytes(&mddev->sb->set_uuid0, 4);
6803 + get_random_bytes(&mddev->sb->set_uuid1, 4);
6804 + get_random_bytes(&mddev->sb->set_uuid2, 4);
6805 + get_random_bytes(&mddev->sb->set_uuid3, 4);
6811 +static int set_disk_info (mddev_t * mddev, void * arg)
6813 + printk("not yet");
6817 +static int clear_array (mddev_t * mddev)
6819 + printk("not yet");
6823 +static int write_raid_info (mddev_t * mddev)
6825 + printk("not yet");
6829 +static int protect_array (mddev_t * mddev)
6831 + printk("not yet");
6835 +static int unprotect_array (mddev_t * mddev)
6837 + printk("not yet");
6841 +static int set_disk_faulty (mddev_t *mddev, kdev_t dev)
6845 + fsync_dev(mddev_to_kdev(mddev));
6846 + ret = md_error(mddev_to_kdev(mddev), dev);
6850 static int md_ioctl (struct inode *inode, struct file *file,
6851 unsigned int cmd, unsigned long arg)
6854 - struct hd_geometry *loc = (struct hd_geometry *) arg;
6855 + unsigned int minor;
6857 + struct hd_geometry *loc = (struct hd_geometry *) arg;
6858 + mddev_t *mddev = NULL;
6861 - if (!capable(CAP_SYS_ADMIN))
6863 + if (!md_capable_admin())
6866 - if (((minor=MINOR(inode->i_rdev)) & 0x80) &&
6867 - (minor & 0x7f) < MAX_PERSONALITY &&
6868 - pers[minor & 0x7f] &&
6869 - pers[minor & 0x7f]->ioctl)
6870 - return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg));
6872 - if (minor >= MAX_MD_DEV)
6874 + dev = inode->i_rdev;
6875 + minor = MINOR(dev);
6876 + if (minor >= MAX_MD_DEVS)
6881 - case REGISTER_DEV:
6882 - return do_md_add (minor, to_kdev_t ((dev_t) arg));
6884 + * Commands dealing with the RAID driver but not any
6885 + * particular array:
6889 + case RAID_VERSION:
6890 + err = get_version((void *)arg);
6893 + case PRINT_RAID_DEBUG:
6895 + md_print_devices();
6898 + case BLKGETSIZE: /* Return device size */
6903 + err = md_put_user(md_hd_struct[minor].nr_sects,
6908 - return do_md_run (minor, (int) arg);
6911 + invalidate_buffers(dev);
6915 - return do_md_stop (minor, inode);
6917 - case BLKGETSIZE: /* Return device size */
6918 - if (!arg) return -EINVAL;
6919 - err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg);
6925 - fsync_dev (inode->i_rdev);
6926 - invalidate_buffers (inode->i_rdev);
6932 - read_ahead[MAJOR(inode->i_rdev)] = arg;
6936 - if (!arg) return -EINVAL;
6937 - err = put_user (read_ahead[MAJOR(inode->i_rdev)], (long *) arg);
6942 - /* We have a problem here : there is no easy way to give a CHS
6943 - virtual geometry. We currently pretend that we have a 2 heads
6944 - 4 sectors (with a BIG number of cylinders...). This drives dosfs
6945 - just mad... ;-) */
6948 - if (!loc) return -EINVAL;
6949 - err = put_user (2, (char *) &loc->heads);
6952 - err = put_user (4, (char *) &loc->sectors);
6955 - err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders);
6958 - err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect,
6959 - (long *) &loc->start);
6964 - RO_IOCTLS(inode->i_rdev,arg);
6970 + read_ahead[MAJOR(dev)] = arg;
6981 + err = md_put_user (read_ahead[
6982 + MAJOR(dev)], (long *) arg);
6988 + * Commands creating/starting a new array:
6991 + mddev = kdev_to_mddev(dev);
6995 + case SET_ARRAY_INFO:
6998 + printk("array md%d already exists!\n",
7008 + case SET_ARRAY_INFO:
7009 + mddev = alloc_mddev(dev);
7015 + * alloc_mddev() should possibly self-lock.
7017 + err = lock_mddev(mddev);
7019 + printk("ioctl, reason %d, cmd %d\n", err, cmd);
7022 + err = set_array_info(mddev, (void *)arg);
7024 + printk("couldnt set array info. %d\n", err);
7031 + * possibly make it lock the array ...
7033 + err = autostart_array((kdev_t)arg);
7035 + printk("autostart %s failed!\n",
7036 + partition_name((kdev_t)arg));
7045 + * Commands querying/configuring an existing array:
7052 + err = lock_mddev(mddev);
7054 + printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
7059 + * Commands even a read-only array can execute:
7063 + case GET_ARRAY_INFO:
7064 + err = get_array_info(mddev, (void *)arg);
7067 + case GET_DISK_INFO:
7068 + err = get_disk_info(mddev, (void *)arg);
7071 + case RESTART_ARRAY_RW:
7072 + err = restart_array(mddev);
7076 + err = do_md_stop (mddev, 0);
7079 + case STOP_ARRAY_RO:
7080 + err = do_md_stop (mddev, 1);
7084 + * We have a problem here : there is no easy way to give a CHS
7085 + * virtual geometry. We currently pretend that we have a 2 heads
7086 + * 4 sectors (with a BIG number of cylinders...). This drives
7087 + * dosfs just mad... ;-)
7092 + goto abort_unlock;
7094 + err = md_put_user (2, (char *) &loc->heads);
7096 + goto abort_unlock;
7097 + err = md_put_user (4, (char *) &loc->sectors);
7099 + goto abort_unlock;
7100 + err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
7101 + (short *) &loc->cylinders);
7103 + goto abort_unlock;
7104 + err = md_put_user (md_hd_struct[minor].start_sect,
7105 + (long *) &loc->start);
7111 + * The remaining ioctls are changing the state of the
7112 + * superblock, so we do not allow read-only arrays
7117 + goto abort_unlock;
7123 + err = clear_array(mddev);
7126 + case ADD_NEW_DISK:
7127 + err = add_new_disk(mddev, (void *)arg);
7130 + case HOT_REMOVE_DISK:
7131 + err = hot_remove_disk(mddev, (kdev_t)arg);
7134 + case HOT_ADD_DISK:
7135 + err = hot_add_disk(mddev, (kdev_t)arg);
7138 + case SET_DISK_INFO:
7139 + err = set_disk_info(mddev, (void *)arg);
7142 + case WRITE_RAID_INFO:
7143 + err = write_raid_info(mddev);
7146 + case UNPROTECT_ARRAY:
7147 + err = unprotect_array(mddev);
7150 + case PROTECT_ARRAY:
7151 + err = protect_array(mddev);
7154 + case SET_DISK_FAULTY:
7155 + err = set_disk_faulty(mddev, (kdev_t)arg);
7160 + mdu_param_t param;
7162 + err = md_copy_from_user(¶m, (mdu_param_t *)arg,
7165 + goto abort_unlock;
7167 + err = do_md_run (mddev);
7169 + * we have to clean up the mess if
7170 + * the array cannot be run for some
7174 + mddev->sb_dirty = 0;
7175 + do_md_stop (mddev, 0);
7181 + printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current->comm, current->pid);
7183 + goto abort_unlock;
7189 + unlock_mddev(mddev);
7191 + printk("huh11?\n");
7196 + printk("huh12?\n");
7202 +#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
7204 static int md_open (struct inode *inode, struct file *file)
7206 - int minor=MINOR(inode->i_rdev);
7213 +static void md_release (struct inode *inode, struct file *file)
7215 + sync_dev(inode->i_rdev);
7219 +static int md_read (struct inode *inode, struct file *file,
7220 + char *buf, int count)
7222 + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7224 - md_dev[minor].busy++;
7225 - return (0); /* Always succeed */
7226 + if (!mddev || !mddev->pers)
7229 + return block_read (inode, file, buf, count);
7232 +static int md_write (struct inode *inode, struct file *file,
7233 + const char *buf, int count)
7235 + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7237 + if (!mddev || !mddev->pers)
7240 -static int md_release (struct inode *inode, struct file *file)
7241 + return block_write (inode, file, buf, count);
7244 +static struct file_operations md_fops=
7246 - int minor=MINOR(inode->i_rdev);
7261 - sync_dev (inode->i_rdev);
7262 - md_dev[minor].busy--;
7264 +static int md_open (struct inode *inode, struct file *file)
7272 +static int md_release (struct inode *inode, struct file *file)
7274 + sync_dev(inode->i_rdev);
7278 static ssize_t md_read (struct file *file, char *buf, size_t count,
7281 - int minor=MINOR(file->f_dentry->d_inode->i_rdev);
7282 + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7284 - if (!md_dev[minor].pers) /* Check if device is being run */
7286 + if (!mddev || !mddev->pers)
7289 - return block_read(file, buf, count, ppos);
7290 + return block_read(file, buf, count, ppos);
7293 static ssize_t md_write (struct file *file, const char *buf,
7294 size_t count, loff_t *ppos)
7296 - int minor=MINOR(file->f_dentry->d_inode->i_rdev);
7297 + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7299 - if (!md_dev[minor].pers) /* Check if device is being run */
7301 + if (!mddev || !mddev->pers)
7304 - return block_write(file, buf, count, ppos);
7305 + return block_write(file, buf, count, ppos);
7308 static struct file_operations md_fops=
7334 -int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size)
7337 +int md_map (kdev_t dev, kdev_t *rdev,
7338 + unsigned long *rsector, unsigned long size)
7340 - if ((unsigned int) minor >= MAX_MD_DEV)
7342 - printk ("Bad md device %d\n", minor);
7346 - if (!md_dev[minor].pers)
7348 - printk ("Oops ! md%d not running, giving up !\n", minor);
7352 + mddev_t *mddev = kdev_to_mddev(dev);
7354 - return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size));
7355 + if (!mddev || !mddev->pers) {
7360 + err = mddev->pers->map(mddev, dev, rdev, rsector, size);
7365 -int md_make_request (int minor, int rw, struct buffer_head * bh)
7366 +int md_make_request (struct buffer_head * bh, int rw)
7368 - if (md_dev [minor].pers->make_request) {
7369 - if (buffer_locked(bh))
7372 + mddev_t *mddev = kdev_to_mddev(bh->b_dev);
7374 + if (!mddev || !mddev->pers) {
7379 + if (mddev->pers->make_request) {
7380 + if (buffer_locked(bh)) {
7384 set_bit(BH_Lock, &bh->b_state);
7385 if (rw == WRITE || rw == WRITEA) {
7386 if (!buffer_dirty(bh)) {
7387 - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
7389 + bh->b_end_io(bh, buffer_uptodate(bh));
7394 if (rw == READ || rw == READA) {
7395 if (buffer_uptodate(bh)) {
7396 - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
7398 + bh->b_end_io(bh, buffer_uptodate(bh));
7403 - return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh));
7404 + err = mddev->pers->make_request(mddev, rw, bh);
7406 make_request (MAJOR(bh->b_rdev), rw, bh);
7414 static void do_md_request (void)
7416 - printk ("Got md request, not good...");
7418 + printk(KERN_ALERT "Got md request, not good...");
7422 +int md_thread(void * arg)
7424 + mdk_thread_t *thread = arg;
7428 + exit_files(current);
7435 + sprintf(current->comm, thread->name);
7436 + md_init_signals();
7437 + md_flush_signals();
7438 + thread->tsk = current;
7441 + * md_thread is a 'system-thread', it's priority should be very
7442 + * high. We avoid resource deadlocks individually in each
7443 + * raid personality. (RAID5 does preallocation) We also use RR and
7444 + * the very same RT priority as kswapd, thus we will never get
7445 + * into a priority inversion deadlock.
7447 + * we definitely have to have equal or higher priority than
7448 + * bdflush, otherwise bdflush will deadlock if there are too
7449 + * many dirty RAID5 blocks.
7451 + current->policy = SCHED_OTHER;
7452 + current->priority = 40;
7458 + if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
7461 + interruptible_sleep_on(&thread->wqueue);
7464 + clear_bit(THREAD_WAKEUP, &thread->flags);
7465 + if (thread->run) {
7466 + thread->run(thread->data);
7467 + run_task_queue(&tq_disk);
7469 + if (md_signal_pending(current)) {
7470 + printk("%8s(%d) flushing signals.\n", current->comm,
7472 + md_flush_signals();
7480 -void md_wakeup_thread(struct md_thread *thread)
7481 +void md_wakeup_thread(mdk_thread_t *thread)
7483 set_bit(THREAD_WAKEUP, &thread->flags);
7484 wake_up(&thread->wqueue);
7487 -struct md_thread *md_register_thread (void (*run) (void *), void *data)
7488 +mdk_thread_t *md_register_thread (void (*run) (void *),
7489 + void *data, const char *name)
7491 - struct md_thread *thread = (struct md_thread *)
7492 - kmalloc(sizeof(struct md_thread), GFP_KERNEL);
7493 + mdk_thread_t *thread;
7495 struct semaphore sem = MUTEX_LOCKED;
7497 - if (!thread) return NULL;
7498 + thread = (mdk_thread_t *) kmalloc
7499 + (sizeof(mdk_thread_t), GFP_KERNEL);
7503 - memset(thread, 0, sizeof(struct md_thread));
7504 + memset(thread, 0, sizeof(mdk_thread_t));
7505 init_waitqueue(&thread->wqueue);
7509 thread->data = data;
7510 + thread->name = name;
7511 ret = kernel_thread(md_thread, thread, 0);
7514 @@ -836,270 +3032,407 @@
7518 -void md_unregister_thread (struct md_thread *thread)
7519 +void md_interrupt_thread (mdk_thread_t *thread)
7521 + if (!thread->tsk) {
7525 + printk("interrupting MD-thread pid %d\n", thread->tsk->pid);
7526 + send_sig(SIGKILL, thread->tsk, 1);
7529 +void md_unregister_thread (mdk_thread_t *thread)
7531 struct semaphore sem = MUTEX_LOCKED;
7536 - printk("Killing md_thread %d %p %s\n",
7537 - thread->tsk->pid, thread->tsk, thread->tsk->comm);
7539 - printk("Aiee. md_thread has 0 tsk\n");
7540 - send_sig(SIGKILL, thread->tsk, 1);
7541 - printk("downing on %p\n", &sem);
7542 + thread->name = NULL;
7543 + if (!thread->tsk) {
7547 + md_interrupt_thread(thread);
7551 -#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))
7553 -int md_thread(void * arg)
7554 +void md_recover_arrays (void)
7556 - struct md_thread *thread = arg;
7560 - exit_files(current);
7563 - current->session = 1;
7564 - current->pgrp = 1;
7565 - sprintf(current->comm, "md_thread");
7566 - siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
7567 - thread->tsk = current;
7572 - if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
7574 - spin_lock(&current->sigmask_lock);
7575 - flush_signals(current);
7576 - spin_unlock(&current->sigmask_lock);
7577 - interruptible_sleep_on(&thread->wqueue);
7579 - if (test_bit(THREAD_WAKEUP, &thread->flags))
7581 - if (!thread->run) {
7586 - } while (signal_pending(current));
7589 - clear_bit(THREAD_WAKEUP, &thread->flags);
7590 - if (thread->run) {
7591 - thread->run(thread->data);
7592 - run_task_queue(&tq_disk);
7594 + if (!md_recovery_thread) {
7598 + md_wakeup_thread(md_recovery_thread);
7601 -EXPORT_SYMBOL(md_size);
7602 -EXPORT_SYMBOL(md_maxreadahead);
7603 -EXPORT_SYMBOL(register_md_personality);
7604 -EXPORT_SYMBOL(unregister_md_personality);
7605 -EXPORT_SYMBOL(partition_name);
7606 -EXPORT_SYMBOL(md_dev);
7607 -EXPORT_SYMBOL(md_error);
7608 -EXPORT_SYMBOL(md_register_thread);
7609 -EXPORT_SYMBOL(md_unregister_thread);
7610 -EXPORT_SYMBOL(md_update_sb);
7611 -EXPORT_SYMBOL(md_map);
7612 -EXPORT_SYMBOL(md_wakeup_thread);
7613 -EXPORT_SYMBOL(md_do_sync);
7615 -#ifdef CONFIG_PROC_FS
7616 -static struct proc_dir_entry proc_md = {
7617 - PROC_MD, 6, "mdstat",
7618 - S_IFREG | S_IRUGO, 1, 0, 0,
7619 - 0, &proc_array_inode_operations,
7621 +int md_error (kdev_t dev, kdev_t rdev)
7623 + mddev_t *mddev = kdev_to_mddev(dev);
7624 + mdk_rdev_t * rrdev;
7631 + rrdev = find_rdev(mddev, rdev);
7632 + mark_rdev_faulty(rrdev);
7634 + * if recovery was running, stop it now.
7636 + if (mddev->pers->stop_resync)
7637 + mddev->pers->stop_resync(mddev);
7638 + if (mddev->recovery_running)
7639 + md_interrupt_thread(md_recovery_thread);
7640 + if (mddev->pers->error_handler) {
7641 + rc = mddev->pers->error_handler(mddev, rdev);
7642 + md_recover_arrays();
7647 + * Drop all buffers in the failed array.
7648 + * _not_. This is called from IRQ handlers ...
7650 + invalidate_buffers(rdev);
7655 -static void md_geninit (struct gendisk *gdisk)
7656 +static int status_unused (char * page)
7660 - for(i=0;i<MAX_MD_DEV;i++)
7662 - md_blocksizes[i] = 1024;
7663 - md_maxreadahead[i] = MD_DEFAULT_DISK_READAHEAD;
7664 - md_gendisk.part[i].start_sect=-1; /* avoid partition check */
7665 - md_gendisk.part[i].nr_sects=0;
7666 - md_dev[i].pers=NULL;
7668 + int sz = 0, i = 0;
7670 + struct md_list_head *tmp;
7672 - blksize_size[MD_MAJOR] = md_blocksizes;
7673 - max_readahead[MD_MAJOR] = md_maxreadahead;
7674 + sz += sprintf(page + sz, "unused devices: ");
7676 -#ifdef CONFIG_PROC_FS
7677 - proc_register(&proc_root, &proc_md);
7679 + ITERATE_RDEV_ALL(rdev,tmp) {
7680 + if (!rdev->same_set.next && !rdev->same_set.prev) {
7682 + * The device is not yet used by any array.
7685 + sz += sprintf(page + sz, "%s ",
7686 + partition_name(rdev->dev));
7690 + sz += sprintf(page + sz, "<none>");
7692 + sz += sprintf(page + sz, "\n");
7696 -int md_error (kdev_t mddev, kdev_t rdev)
7698 +static int status_resync (char * page, mddev_t * mddev)
7700 - unsigned int minor = MINOR (mddev);
7703 + unsigned int blocksize, max_blocks, resync, res, dt, tt, et;
7705 - if (MAJOR(mddev) != MD_MAJOR || minor > MAX_MD_DEV)
7706 - panic ("md_error gets unknown device\n");
7707 - if (!md_dev [minor].pers)
7708 - panic ("md_error gets an error for an unknown device\n");
7709 - if (md_dev [minor].pers->error_handler) {
7710 - rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev);
7711 -#if SUPPORT_RECONSTRUCTION
7712 - md_wakeup_thread(md_sync_thread);
7713 -#endif /* SUPPORT_RECONSTRUCTION */
7717 + resync = mddev->curr_resync;
7718 + blocksize = blksize_size[MD_MAJOR][mdidx(mddev)];
7719 + max_blocks = blk_size[MD_MAJOR][mdidx(mddev)] / (blocksize >> 10);
7722 + * Should not happen.
7724 + if (!max_blocks) {
7728 + res = resync*100/max_blocks;
7729 + if (!mddev->recovery_running)
7733 + sz += sprintf(page + sz, " resync=%u%%", res);
7738 + sz += sprintf(page + sz, " recovery=%u%%", res);
7741 + * We do not want to overflow, so the order of operands and
7742 + * the * 100 / 100 trick are important. We do a +1 to be
7743 + * safe against division by zero. We only estimate anyway.
7745 + * dt: time until now
7747 + * et: estimated finish time
7749 + dt = ((jiffies - mddev->resync_start) / HZ);
7750 + tt = (dt * (max_blocks / (resync/100+1)))/100;
7755 + * ignore rounding effects near finish time
7759 + sz += sprintf(page + sz, " finish=%u.%umin", et / 60, (et % 60)/6);
7764 int get_md_status (char *page)
7766 - int sz=0, i, j, size;
7768 - sz+=sprintf( page+sz, "Personalities : ");
7769 - for (i=0; i<MAX_PERSONALITY; i++)
7771 - sz+=sprintf (page+sz, "[%d %s] ", i, pers[i]->name);
7775 - sz+=sprintf (page+sz, "read_ahead ");
7776 - if (read_ahead[MD_MAJOR]==INT_MAX)
7777 - sz+=sprintf (page+sz, "not set\n");
7779 - sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
7780 + int sz = 0, j, size;
7781 + struct md_list_head *tmp, *tmp2;
7785 + sz += sprintf(page + sz, "Personalities : ");
7786 + for (j = 0; j < MAX_PERSONALITY; j++)
7788 + sz += sprintf(page+sz, "[%s] ", pers[j]->name);
7790 + sz += sprintf(page+sz, "\n");
7793 + sz += sprintf(page+sz, "read_ahead ");
7794 + if (read_ahead[MD_MAJOR] == INT_MAX)
7795 + sz += sprintf(page+sz, "not set\n");
7797 + sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
7799 - for (i=0; i<MAX_MD_DEV; i++)
7801 - sz+=sprintf (page+sz, "md%d : %sactive", i, md_dev[i].pers ? "" : "in");
7803 - if (md_dev[i].pers)
7804 - sz+=sprintf (page+sz, " %s", md_dev[i].pers->name);
7805 + ITERATE_MDDEV(mddev,tmp) {
7806 + sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
7807 + mddev->pers ? "" : "in");
7808 + if (mddev->pers) {
7810 + sz += sprintf(page + sz, " (read-only)");
7811 + sz += sprintf(page + sz, " %s", mddev->pers->name);
7815 - for (j=0; j<md_dev[i].nb_dev; j++)
7817 - sz+=sprintf (page+sz, " %s",
7818 - partition_name(md_dev[i].devices[j].dev));
7819 - size+=md_dev[i].devices[j].size;
7822 + ITERATE_RDEV(mddev,rdev,tmp2) {
7823 + sz += sprintf(page + sz, " %s[%d]",
7824 + partition_name(rdev->dev), rdev->desc_nr);
7825 + if (rdev->faulty) {
7826 + sz += sprintf(page + sz, "(F)");
7829 + size += rdev->size;
7832 - if (md_dev[i].nb_dev) {
7833 - if (md_dev[i].pers)
7834 - sz+=sprintf (page+sz, " %d blocks", md_size[i]);
7836 - sz+=sprintf (page+sz, " %d blocks", size);
7838 + if (mddev->nb_dev) {
7840 + sz += sprintf(page + sz, " %d blocks",
7841 + md_size[mdidx(mddev)]);
7843 + sz += sprintf(page + sz, " %d blocks", size);
7846 - if (!md_dev[i].pers)
7848 - sz+=sprintf (page+sz, "\n");
7851 + if (!mddev->pers) {
7852 + sz += sprintf(page+sz, "\n");
7856 - if (md_dev[i].pers->max_invalid_dev)
7857 - sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i));
7858 + sz += mddev->pers->status (page+sz, mddev);
7860 - sz+=md_dev[i].pers->status (page+sz, i, md_dev+i);
7861 - sz+=sprintf (page+sz, "\n");
7863 + if (mddev->curr_resync)
7864 + sz += status_resync (page+sz, mddev);
7866 + if (md_atomic_read(&mddev->resync_sem.count) != 1)
7867 + sz += sprintf(page + sz, " resync=DELAYED");
7869 + sz += sprintf(page + sz, "\n");
7871 + sz += status_unused (page + sz);
7877 -int register_md_personality (int p_num, struct md_personality *p)
7878 +int register_md_personality (int pnum, mdk_personality_t *p)
7880 - int i=(p_num >> PERSONALITY_SHIFT);
7882 - if (i >= MAX_PERSONALITY)
7884 + if (pnum >= MAX_PERSONALITY)
7893 - printk ("%s personality registered\n", p->name);
7896 + printk(KERN_INFO "%s personality registered\n", p->name);
7900 -int unregister_md_personality (int p_num)
7901 +int unregister_md_personality (int pnum)
7903 - int i=(p_num >> PERSONALITY_SHIFT);
7905 - if (i >= MAX_PERSONALITY)
7907 + if (pnum >= MAX_PERSONALITY)
7910 - printk ("%s personality unregistered\n", pers[i]->name);
7913 + printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
7914 + pers[pnum] = NULL;
7918 -static md_descriptor_t *get_spare(struct md_dev *mddev)
7919 +static mdp_disk_t *get_spare(mddev_t *mddev)
7922 - md_superblock_t *sb = mddev->sb;
7923 - md_descriptor_t *descriptor;
7924 - struct real_dev *realdev;
7926 - for (i = 0; i < mddev->nb_dev; i++) {
7927 - realdev = &mddev->devices[i];
7929 + mdp_super_t *sb = mddev->sb;
7932 + struct md_list_head *tmp;
7934 + ITERATE_RDEV(mddev,rdev,tmp) {
7940 - descriptor = &sb->disks[realdev->sb->descriptor.number];
7941 - if (descriptor->state & (1 << MD_FAULTY_DEVICE))
7943 + disk = &sb->disks[rdev->desc_nr];
7944 + if (disk_faulty(disk)) {
7947 - if (descriptor->state & (1 << MD_ACTIVE_DEVICE))
7949 + if (disk_active(disk))
7951 - return descriptor;
7957 +static int is_mddev_idle (mddev_t *mddev)
7959 + mdk_rdev_t * rdev;
7960 + struct md_list_head *tmp;
7962 + unsigned long curr_events;
7965 + ITERATE_RDEV(mddev,rdev,tmp) {
7966 + curr_events = io_events[MAJOR(rdev->dev)];
7968 + if (curr_events != rdev->last_events) {
7969 +// printk("!I(%d)", curr_events-rdev->last_events);
7970 + rdev->last_events = curr_events;
7978 * parallel resyncing thread.
7980 - * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
7981 - * - fix read error handing
7984 -int md_do_sync(struct md_dev *mddev)
7986 + * Determine correct block size for this device.
7988 +unsigned int device_bsize (kdev_t dev)
7990 + unsigned int i, correct_size;
7992 + correct_size = BLOCK_SIZE;
7993 + if (blksize_size[MAJOR(dev)]) {
7994 + i = blksize_size[MAJOR(dev)][MINOR(dev)];
7999 + return correct_size;
8002 +static struct wait_queue *resync_wait = (struct wait_queue *)NULL;
8004 +#define RA_ORDER (1)
8005 +#define RA_PAGE_SIZE (PAGE_SIZE*(1<<RA_ORDER))
8006 +#define MAX_NR_BLOCKS (RA_PAGE_SIZE/sizeof(struct buffer_head *))
8008 +int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
8010 - struct buffer_head *bh;
8011 - int max_blocks, blocksize, curr_bsize, percent=1, j;
8012 - kdev_t read_disk = MKDEV(MD_MAJOR, mddev - md_dev);
8014 + struct buffer_head **bh;
8015 + unsigned int max_blocks, blocksize, curr_bsize,
8016 + i, ii, j, k, chunk, window, nr_blocks, err, serialize;
8017 + kdev_t read_disk = mddev_to_kdev(mddev);
8018 int major = MAJOR(read_disk), minor = MINOR(read_disk);
8019 unsigned long starttime;
8020 + int max_read_errors = 2*MAX_NR_BLOCKS,
8021 + max_write_errors = 2*MAX_NR_BLOCKS;
8022 + struct md_list_head *tmp;
8025 + bh = (struct buffer_head **) md__get_free_pages(GFP_KERNEL, RA_ORDER);
8028 + "could not alloc bh array for reconstruction ... retrying!\n");
8032 + err = down_interruptible(&mddev->resync_sem);
8038 + ITERATE_MDDEV(mddev2,tmp) {
8039 + if (mddev2 == mddev)
8041 + if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
8042 + printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2));
8048 + interruptible_sleep_on(&resync_wait);
8049 + if (md_signal_pending(current)) {
8050 + md_flush_signals();
8057 + mddev->curr_resync = 1;
8059 - blocksize = blksize_size[major][minor];
8060 + blocksize = device_bsize(read_disk);
8061 max_blocks = blk_size[major][minor] / (blocksize >> 10);
8063 - printk("... resync log\n");
8064 - printk(" .... mddev->nb_dev: %d\n", mddev->nb_dev);
8065 - printk(" .... raid array: %s\n", kdevname(read_disk));
8066 - printk(" .... max_blocks: %d blocksize: %d\n", max_blocks, blocksize);
8067 - printk("md: syncing RAID array %s\n", kdevname(read_disk));
8068 + printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
8069 + printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec.\n",
8070 + sysctl_speed_limit);
8071 + printk(KERN_INFO "md: using maximum available idle IO bandwith for reconstruction.\n");
8074 + * Resync has low priority.
8076 + current->priority = 1;
8078 + is_mddev_idle(mddev); /* this also initializes IO event counters */
8079 + starttime = jiffies;
8080 + mddev->resync_start = starttime;
8084 + * Tune reconstruction:
8086 + window = md_maxreadahead[mdidx(mddev)]/1024;
8087 + nr_blocks = window / (blocksize >> 10);
8088 + if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
8089 + nr_blocks = MAX_NR_BLOCKS;
8090 + printk(KERN_INFO "md: using %dk window.\n",window);
8092 - starttime=jiffies;
8093 - for (j = 0; j < max_blocks; j++) {
8094 + for (j = 0; j < max_blocks; j += nr_blocks) {
8097 + mddev->curr_resync = j;
8099 * B careful. When some1 mounts a non-'blocksize' filesystem
8100 * then we get the blocksize changed right under us. Go deal
8101 * with it transparently, recalculate 'blocksize', 'j' and
8104 - curr_bsize = blksize_size[major][minor];
8105 + curr_bsize = device_bsize(read_disk);
8106 if (curr_bsize != blocksize) {
8108 + printk(KERN_INFO "md%d: blocksize changed\n",
8111 if (curr_bsize > blocksize)
8113 * this is safe, rounds downwards.
8114 @@ -1109,114 +3442,384 @@
8115 j *= blocksize/curr_bsize;
8117 blocksize = curr_bsize;
8118 + nr_blocks = window / (blocksize >> 10);
8119 + if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
8120 + nr_blocks = MAX_NR_BLOCKS;
8121 max_blocks = blk_size[major][minor] / (blocksize >> 10);
8123 - if ((bh = breada (read_disk, j, blocksize, j * blocksize,
8124 - max_blocks * blocksize)) != NULL) {
8125 - mark_buffer_dirty(bh, 1);
8128 + printk("nr_blocks changed to %d (blocksize %d, j %d, max_blocks %d)\n",
8129 + nr_blocks, blocksize, j, max_blocks);
8131 - * FIXME: Ugly, but set_blocksize() isnt safe ...
8132 + * We will retry the current block-group
8134 - curr_bsize = blksize_size[major][minor];
8135 - if (curr_bsize != blocksize)
8136 - goto diff_blocksize;
8140 - * It's a real read problem. FIXME, handle this
8143 - printk ( KERN_ALERT
8144 - "read error, stopping reconstruction.\n");
8148 + * Cleanup routines expect this
8150 + for (k = 0; k < nr_blocks; k++)
8153 + chunk = nr_blocks;
8154 + if (chunk > max_blocks-j)
8155 + chunk = max_blocks-j;
8158 + * request buffer heads ...
8160 + for (i = 0; i < chunk; i++) {
8161 + bh[i] = getblk (read_disk, j+i, blocksize);
8164 + if (!buffer_dirty(bh[i]))
8165 + mark_buffer_lowprio(bh[i]);
8169 - * Let's sleep some if we are faster than our speed limit:
8170 + * read buffer heads ...
8172 - while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT)
8174 - current->state = TASK_INTERRUPTIBLE;
8175 - schedule_timeout(1);
8176 + ll_rw_block (READ, chunk, bh);
8177 + run_task_queue(&tq_disk);
8180 + * verify that all of them are OK ...
8182 + for (i = 0; i < chunk; i++) {
8184 + wait_on_buffer(bh[ii]);
8185 + if (!buffer_uptodate(bh[ii]))
8190 + for (i = 0; i < chunk; i++)
8191 + mark_buffer_dirty_lowprio(bh[i]);
8193 + ll_rw_block(WRITE, chunk, bh);
8194 + run_task_queue(&tq_disk);
8196 + for (i = 0; i < chunk; i++) {
8198 + wait_on_buffer(bh[ii]);
8200 + if (spare && disk_faulty(spare)) {
8201 + for (k = 0; k < chunk; k++)
8203 + printk(" <SPARE FAILED!>\n ");
8208 + if (!buffer_uptodate(bh[ii])) {
8209 + curr_bsize = device_bsize(read_disk);
8210 + if (curr_bsize != blocksize) {
8212 + "md%d: blocksize changed during write\n",
8214 + for (k = 0; k < chunk; k++)
8216 + if (buffer_lowprio(bh[k]))
8217 + mark_buffer_clean(bh[k]);
8222 + printk(" BAD WRITE %8d>\n", j);
8224 + * Ouch, write error, retry or bail out.
8226 + if (max_write_errors) {
8227 + max_write_errors--;
8228 + printk ( KERN_WARNING "md%d: write error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
8231 + printk ( KERN_ALERT
8232 + "too many write errors, stopping reconstruction.\n");
8233 + for (k = 0; k < chunk; k++)
8235 + if (buffer_lowprio(bh[k]))
8236 + mark_buffer_clean(bh[k]);
8245 - * FIXME: put this status bar thing into /proc
8246 + * This is the normal 'everything went OK' case
8247 + * do a 'free-behind' logic, we sure dont need
8248 + * this buffer if it was the only user.
8250 - if (!(j%(max_blocks/100))) {
8251 - if (!(percent%10))
8252 - printk (" %03d%% done.\n",percent);
8253 + for (i = 0; i < chunk; i++)
8254 + if (buffer_dirty(bh[i]))
8262 + if (md_signal_pending(current)) {
8264 + * got a signal, exit.
8266 + mddev->curr_resync = 0;
8267 + printk("md_do_sync() got signal ... exiting\n");
8268 + md_flush_signals();
8274 + * this loop exits only if either when we are slower than
8275 + * the 'hard' speed limit, or the system was IO-idle for
8277 + * the system might be non-idle CPU-wise, but we only care
8278 + * about not overloading the IO subsystem. (things like an
8279 + * e2fsck being done on the RAID array should execute fast)
8282 + if (md_need_resched(current))
8285 + if ((blocksize/1024)*j/((jiffies-starttime)/HZ + 1) + 1
8286 + > sysctl_speed_limit) {
8287 + current->priority = 1;
8289 + if (!is_mddev_idle(mddev)) {
8290 + current->state = TASK_INTERRUPTIBLE;
8291 + md_schedule_timeout(HZ/2);
8292 + if (!md_signal_pending(current))
8296 + current->priority = 40;
8298 fsync_dev(read_disk);
8299 - printk("md: %s: sync done.\n", kdevname(read_disk));
8302 + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
8305 + * this also signals 'finished resyncing' to md_stop
8308 + up(&mddev->resync_sem);
8310 + free_pages((unsigned long)bh, RA_ORDER);
8311 + mddev->curr_resync = 0;
8312 + wake_up(&resync_wait);
8317 + * set_blocksize() might change the blocksize. This
8318 + * should not happen often, but it happens when eg.
8319 + * someone mounts a filesystem that has non-1k
8320 + * blocksize. set_blocksize() doesnt touch our
8321 + * buffer, but to avoid aliasing problems we change
8322 + * our internal blocksize too and retry the read.
8324 + curr_bsize = device_bsize(read_disk);
8325 + if (curr_bsize != blocksize) {
8326 + printk(KERN_INFO "md%d: blocksize changed during read\n",
8328 + for (k = 0; k < chunk; k++)
8330 + if (buffer_lowprio(bh[k]))
8331 + mark_buffer_clean(bh[k]);
8338 + * It's a real read problem. We retry and bail out
8339 + * only if it's excessive.
8341 + if (max_read_errors) {
8342 + max_read_errors--;
8343 + printk ( KERN_WARNING "md%d: read error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
8344 + for (k = 0; k < chunk; k++)
8346 + if (buffer_lowprio(bh[k]))
8347 + mark_buffer_clean(bh[k]);
8352 + printk ( KERN_ALERT "too many read errors, stopping reconstruction.\n");
8353 + for (k = 0; k < chunk; k++)
8355 + if (buffer_lowprio(bh[k]))
8356 + mark_buffer_clean(bh[k]);
8363 +#undef MAX_NR_BLOCKS
8366 - * This is a kernel thread which: syncs a spare disk with the active array
8367 + * This is a kernel thread which syncs a spare disk with the active array
8369 * the amount of foolproofing might seem to be a tad excessive, but an
8370 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
8371 * of my root partition with the first 0.5 gigs of my /home partition ... so
8372 * i'm a bit nervous ;)
8374 -void mdsyncd (void *data)
8375 +void md_do_recovery (void *data)
8378 - struct md_dev *mddev;
8379 - md_superblock_t *sb;
8380 - md_descriptor_t *spare;
8384 + mdp_disk_t *spare;
8385 unsigned long flags;
8386 + struct md_list_head *tmp;
8388 - for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) {
8389 - if ((sb = mddev->sb) == NULL)
8390 + printk(KERN_INFO "md: recovery thread got woken up ...\n");
8392 + ITERATE_MDDEV(mddev,tmp) {
8396 + if (mddev->recovery_running)
8398 if (sb->active_disks == sb->raid_disks)
8400 - if (!sb->spare_disks)
8401 + if (!sb->spare_disks) {
8402 + printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
8406 + * now here we get the spare and resync it.
8408 if ((spare = get_spare(mddev)) == NULL)
8410 - if (!mddev->pers->mark_spare)
8411 + printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
8412 + if (!mddev->pers->diskop)
8414 - if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE))
8415 + if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
8417 - if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) {
8418 - mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE);
8419 + down(&mddev->recovery_sem);
8420 + mddev->recovery_running = 1;
8421 + err = md_do_sync(mddev, spare);
8422 + if (err == -EIO) {
8423 + printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
8424 + if (!disk_faulty(spare)) {
8425 + mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
8426 + mark_disk_faulty(spare);
8427 + mark_disk_nonsync(spare);
8428 + mark_disk_inactive(spare);
8429 + sb->spare_disks--;
8430 + sb->working_disks--;
8431 + sb->failed_disks++;
8434 + if (disk_faulty(spare))
8435 + mddev->pers->diskop(mddev, &spare,
8436 + DISKOP_SPARE_INACTIVE);
8437 + if (err == -EINTR) {
8439 + * Recovery got interrupted ...
8440 + * signal back that we have finished using the array.
8442 + mddev->pers->diskop(mddev, &spare,
8443 + DISKOP_SPARE_INACTIVE);
8444 + up(&mddev->recovery_sem);
8445 + mddev->recovery_running = 0;
8448 + mddev->recovery_running = 0;
8449 + up(&mddev->recovery_sem);
8453 - mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE);
8454 - spare->state |= (1 << MD_SYNC_DEVICE);
8455 - spare->state |= (1 << MD_ACTIVE_DEVICE);
8456 - sb->spare_disks--;
8457 - sb->active_disks++;
8458 - mddev->sb_dirty = 1;
8459 - md_update_sb(mddev - md_dev);
8460 + if (!disk_faulty(spare)) {
8462 + * the SPARE_ACTIVE diskop possibly changes the
8465 + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
8466 + mark_disk_sync(spare);
8467 + mark_disk_active(spare);
8468 + sb->active_disks++;
8469 + sb->spare_disks--;
8471 restore_flags(flags);
8472 + mddev->sb_dirty = 1;
8473 + md_update_sb(mddev);
8476 + printk(KERN_INFO "md: recovery thread finished ...\n");
8480 +int md_notify_reboot(struct notifier_block *this,
8481 + unsigned long code, void *x)
8483 + struct md_list_head *tmp;
8486 + if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
8487 + || (code == MD_SYS_POWER_OFF)) {
8489 + printk(KERN_INFO "stopping all md devices.\n");
8491 + ITERATE_MDDEV(mddev,tmp)
8492 + do_md_stop (mddev, 1);
8494 + * certain more exotic SCSI devices are known to be
8495 + * volatile wrt too early system reboots. While the
8496 + * right place to handle this issue is the given
8497 + * driver, we do want to have a safe RAID driver ...
8499 + md_mdelay(1000*1);
8501 + return NOTIFY_DONE;
8504 +struct notifier_block md_notifier = {
8510 +md__initfunc(void raid_setup(char *str, int *ints))
8512 + char tmpline[100];
8513 + int len, pos, nr, i;
8515 + len = strlen(str) + 1;
8519 + for (i = 0; i < len; i++) {
8522 + if (c == ',' || !c) {
8524 + if (!strcmp(tmpline,"noautodetect"))
8525 + raid_setup_args.noautodetect = 1;
8533 + raid_setup_args.set = 1;
8537 #ifdef CONFIG_MD_BOOT
8542 -} md_setup_args __initdata = {
8543 +} md_setup_args md__initdata = {
8547 /* called from init/main.c */
8548 -__initfunc(void md_setup(char *str,int *ints))
8549 +md__initfunc(void md_setup(char *str,int *ints))
8552 for(i=0;i<=ints[0];i++) {
8553 @@ -1228,21 +3831,24 @@
8557 -__initfunc(void do_md_setup(char *str,int *ints))
8558 +md__initfunc(void do_md_setup(char *str,int *ints))
8560 - int minor, pers, factor, fault;
8562 + int minor, pers, chunk_size, fault;
8566 + printk("i plan to phase this out --mingo\n");
8569 - printk ("md: Too few Arguments (%d).\n", ints[0]);
8570 + printk (KERN_WARNING "md: Too few Arguments (%d).\n", ints[0]);
8576 - if (minor >= MAX_MD_DEV) {
8577 - printk ("md: Minor device number too high.\n");
8578 + if ((unsigned int)minor >= MAX_MD_DEVS) {
8579 + printk (KERN_WARNING "md: Minor device number too high.\n");
8583 @@ -1252,18 +3858,20 @@
8585 #ifdef CONFIG_MD_LINEAR
8587 - printk ("md: Setting up md%d as linear device.\n",minor);
8588 + printk (KERN_INFO "md: Setting up md%d as linear device.\n",
8591 - printk ("md: Linear mode not configured."
8592 + printk (KERN_WARNING "md: Linear mode not configured."
8593 "Recompile the kernel with linear mode enabled!\n");
8598 #ifdef CONFIG_MD_STRIPED
8599 - printk ("md: Setting up md%d as a striped device.\n",minor);
8600 + printk (KERN_INFO "md: Setting up md%d as a striped device.\n",
8603 - printk ("md: Striped mode not configured."
8604 + printk (KERN_WARNING "md: Striped mode not configured."
8605 "Recompile the kernel with striped mode enabled!\n");
8608 @@ -1278,79 +3886,145 @@
8612 - printk ("md: Unknown or not supported raid level %d.\n", ints[--i]);
8613 + printk (KERN_WARNING "md: Unknown or not supported raid level %d.\n", ints[--i]);
8620 - factor=ints[i++]; /* Chunksize */
8621 - fault =ints[i++]; /* Faultlevel */
8622 + chunk_size = ints[i++]; /* Chunksize */
8623 + fault = ints[i++]; /* Faultlevel */
8625 - pers=pers | factor | (fault << FAULT_SHIFT);
8626 + pers = pers | chunk_size | (fault << FAULT_SHIFT);
8628 - while( str && (dev = name_to_kdev_t(str))) {
8629 - do_md_add (minor, dev);
8630 - if((str = strchr (str, ',')) != NULL)
8633 + while( str && (dev = name_to_kdev_t(str))) {
8634 + do_md_add (minor, dev);
8635 + if((str = strchr (str, ',')) != NULL)
8639 - do_md_run (minor, pers);
8640 - printk ("md: Loading md%d.\n",minor);
8641 + do_md_run (minor, pers);
8642 + printk (KERN_INFO "md: Loading md%d.\n",minor);
8649 +void hsm_init (void);
8650 +void translucent_init (void);
8651 void linear_init (void);
8652 void raid0_init (void);
8653 void raid1_init (void);
8654 void raid5_init (void);
8656 -__initfunc(int md_init (void))
8657 +md__initfunc(int md_init (void))
8659 - printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
8660 - MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION,
8661 - MAX_MD_DEV, MAX_REAL);
8663 - if (register_blkdev (MD_MAJOR, "md", &md_fops))
8665 - printk ("Unable to get major %d for md\n", MD_MAJOR);
8669 - blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST;
8670 - blk_dev[MD_MAJOR].current_request=NULL;
8671 - read_ahead[MD_MAJOR]=INT_MAX;
8672 - memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev));
8673 - md_gendisk.next=gendisk_head;
8675 - gendisk_head=&md_gendisk;
8677 -#if SUPPORT_RECONSTRUCTION
8678 - if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL)
8679 - printk("md: bug: md_sync_thread == NULL\n");
8680 -#endif /* SUPPORT_RECONSTRUCTION */
8681 + static char * name = "mdrecoveryd";
8683 + printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n",
8684 + MD_MAJOR_VERSION, MD_MINOR_VERSION,
8685 + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL);
8687 + if (register_blkdev (MD_MAJOR, "md", &md_fops))
8689 + printk (KERN_ALERT "Unable to get major %d for md\n", MD_MAJOR);
8693 + blk_dev[MD_MAJOR].request_fn = DEVICE_REQUEST;
8694 + blk_dev[MD_MAJOR].current_request = NULL;
8695 + read_ahead[MD_MAJOR] = INT_MAX;
8696 + md_gendisk.next = gendisk_head;
8698 + gendisk_head = &md_gendisk;
8700 + md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
8701 + if (!md_recovery_thread)
8702 + printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n");
8704 + md_register_reboot_notifier(&md_notifier);
8705 + md_register_sysctl();
8707 +#ifdef CONFIG_MD_HSM
8710 +#ifdef CONFIG_MD_TRANSLUCENT
8711 + translucent_init ();
8713 #ifdef CONFIG_MD_LINEAR
8717 #ifdef CONFIG_MD_STRIPED
8721 #ifdef CONFIG_MD_MIRRORING
8725 #ifdef CONFIG_MD_RAID5
8729 +#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE)
8731 + * pick a XOR routine, runtime.
8733 + calibrate_xor_block();
8740 #ifdef CONFIG_MD_BOOT
8741 -__initfunc(void md_setup_drive(void))
8742 +md__initfunc(void md_setup_drive(void))
8744 if(md_setup_args.set)
8745 do_md_setup(md_setup_args.str, md_setup_args.ints);
8749 +MD_EXPORT_SYMBOL(md_size);
8750 +MD_EXPORT_SYMBOL(register_md_personality);
8751 +MD_EXPORT_SYMBOL(unregister_md_personality);
8752 +MD_EXPORT_SYMBOL(partition_name);
8753 +MD_EXPORT_SYMBOL(md_error);
8754 +MD_EXPORT_SYMBOL(md_recover_arrays);
8755 +MD_EXPORT_SYMBOL(md_register_thread);
8756 +MD_EXPORT_SYMBOL(md_unregister_thread);
8757 +MD_EXPORT_SYMBOL(md_update_sb);
8758 +MD_EXPORT_SYMBOL(md_map);
8759 +MD_EXPORT_SYMBOL(md_wakeup_thread);
8760 +MD_EXPORT_SYMBOL(md_do_sync);
8761 +MD_EXPORT_SYMBOL(md_print_devices);
8762 +MD_EXPORT_SYMBOL(find_rdev_nr);
8763 +MD_EXPORT_SYMBOL(md_check_ordering);
8764 +MD_EXPORT_SYMBOL(md_interrupt_thread);
8765 +MD_EXPORT_SYMBOL(mddev_map);
8767 +#ifdef CONFIG_PROC_FS
8768 +static struct proc_dir_entry proc_md = {
8769 + PROC_MD, 6, "mdstat",
8770 + S_IFREG | S_IRUGO, 1, 0, 0,
8771 + 0, &proc_array_inode_operations,
8775 +static void md_geninit (struct gendisk *gdisk)
8779 + for(i = 0; i < MAX_MD_DEVS; i++) {
8780 + md_blocksizes[i] = 1024;
8781 + md_maxreadahead[i] = MD_READAHEAD;
8782 + md_gendisk.part[i].start_sect = -1; /* avoid partition check */
8783 + md_gendisk.part[i].nr_sects = 0;
8786 + printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
8788 + blksize_size[MD_MAJOR] = md_blocksizes;
8789 + md_set_global_readahead(md_maxreadahead);
8791 +#ifdef CONFIG_PROC_FS
8792 + proc_register(&proc_root, &proc_md);
8796 --- linux/drivers/block/raid0.c.orig Mon Sep 4 19:39:16 2000
8797 +++ linux/drivers/block/raid0.c Tue Jan 16 13:42:04 2001
8801 raid0.c : Multiple Devices driver for Linux
8802 Copyright (C) 1994-96 Marc ZYNGIER
8803 @@ -18,146 +17,201 @@
8806 #include <linux/module.h>
8807 -#include <linux/md.h>
8808 -#include <linux/raid0.h>
8809 -#include <linux/vmalloc.h>
8810 +#include <linux/raid/raid0.h>
8812 #define MAJOR_NR MD_MAJOR
8814 #define MD_PERSONALITY
8816 -static int create_strip_zones (int minor, struct md_dev *mddev)
8817 +static int create_strip_zones (mddev_t *mddev)
8820 - int current_offset=0;
8821 - struct real_dev *smallest_by_zone;
8822 - struct raid0_data *data=(struct raid0_data *) mddev->private;
8824 - data->nr_strip_zones=1;
8826 - for (i=1; i<mddev->nb_dev; i++)
8828 - for (j=0; j<i; j++)
8829 - if (mddev->devices[i].size==mddev->devices[j].size)
8836 - data->nr_strip_zones++;
8841 - if ((data->strip_zone=vmalloc(sizeof(struct strip_zone)*data->nr_strip_zones)) == NULL)
8844 - data->smallest=NULL;
8846 - for (i=0; i<data->nr_strip_zones; i++)
8848 - data->strip_zone[i].dev_offset=current_offset;
8849 - smallest_by_zone=NULL;
8852 - for (j=0; j<mddev->nb_dev; j++)
8853 - if (mddev->devices[j].size>current_offset)
8855 - data->strip_zone[i].dev[c++]=mddev->devices+j;
8856 - if (!smallest_by_zone ||
8857 - smallest_by_zone->size > mddev->devices[j].size)
8858 - smallest_by_zone=mddev->devices+j;
8861 - data->strip_zone[i].nb_dev=c;
8862 - data->strip_zone[i].size=(smallest_by_zone->size-current_offset)*c;
8864 - if (!data->smallest ||
8865 - data->smallest->size > data->strip_zone[i].size)
8866 - data->smallest=data->strip_zone+i;
8868 - data->strip_zone[i].zone_offset=i ? (data->strip_zone[i-1].zone_offset+
8869 - data->strip_zone[i-1].size) : 0;
8870 - current_offset=smallest_by_zone->size;
8873 + int i, c, j, j1, j2;
8874 + int current_offset, curr_zone_offset;
8875 + raid0_conf_t *conf = mddev_to_conf(mddev);
8876 + mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
8879 + * The number of 'same size groups'
8881 + conf->nr_strip_zones = 0;
8883 + ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
8884 + printk("raid0: looking at %s\n", partition_name(rdev1->dev));
8886 + ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
8887 + printk("raid0: comparing %s(%d) with %s(%d)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size);
8888 + if (rdev2 == rdev1) {
8889 + printk("raid0: END\n");
8892 + if (rdev2->size == rdev1->size)
8895 + * Not unique, don't count it as a new
8898 + printk("raid0: EQUAL\n");
8902 + printk("raid0: NOT EQUAL\n");
8905 + printk("raid0: ==> UNIQUE\n");
8906 + conf->nr_strip_zones++;
8907 + printk("raid0: %d zones\n", conf->nr_strip_zones);
8910 + printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
8912 + conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
8913 + conf->nr_strip_zones);
8914 + if (!conf->strip_zone)
8918 + conf->smallest = NULL;
8919 + current_offset = 0;
8920 + curr_zone_offset = 0;
8922 + for (i = 0; i < conf->nr_strip_zones; i++)
8924 + struct strip_zone *zone = conf->strip_zone + i;
8926 + printk("zone %d\n", i);
8927 + zone->dev_offset = current_offset;
8931 + ITERATE_RDEV_ORDERED(mddev,rdev,j) {
8933 + printk(" checking %s ...", partition_name(rdev->dev));
8934 + if (rdev->size > current_offset)
8936 + printk(" contained as device %d\n", c);
8937 + zone->dev[c] = rdev;
8939 + if (!smallest || (rdev->size <smallest->size)) {
8941 + printk(" (%d) is smallest!.\n", rdev->size);
8944 + printk(" nope.\n");
8948 + zone->size = (smallest->size - current_offset) * c;
8949 + printk(" zone->nb_dev: %d, size: %d\n",zone->nb_dev,zone->size);
8951 + if (!conf->smallest || (zone->size < conf->smallest->size))
8952 + conf->smallest = zone;
8954 + zone->zone_offset = curr_zone_offset;
8955 + curr_zone_offset += zone->size;
8957 + current_offset = smallest->size;
8958 + printk("current zone offset: %d\n", current_offset);
8960 + printk("done.\n");
8964 -static int raid0_run (int minor, struct md_dev *mddev)
8965 +static int raid0_run (mddev_t *mddev)
8967 - int cur=0, i=0, size, zone0_size, nb_zone;
8968 - struct raid0_data *data;
8970 - MOD_INC_USE_COUNT;
8971 + int cur=0, i=0, size, zone0_size, nb_zone;
8972 + raid0_conf_t *conf;
8974 - if ((mddev->private=vmalloc (sizeof (struct raid0_data))) == NULL) return 1;
8975 - data=(struct raid0_data *) mddev->private;
8977 - if (create_strip_zones (minor, mddev))
8983 - nb_zone=data->nr_zones=
8984 - md_size[minor]/data->smallest->size +
8985 - (md_size[minor]%data->smallest->size ? 1 : 0);
8987 - printk ("raid0 : Allocating %ld bytes for hash.\n",(long)sizeof(struct raid0_hash)*nb_zone);
8988 - if ((data->hash_table=vmalloc (sizeof (struct raid0_hash)*nb_zone)) == NULL)
8990 - vfree(data->strip_zone);
8994 - size=data->strip_zone[cur].size;
8997 - while (cur<data->nr_strip_zones)
8999 - data->hash_table[i].zone0=data->strip_zone+cur;
9001 - if (size>=data->smallest->size)/* If we completely fill the slot */
9003 - data->hash_table[i++].zone1=NULL;
9004 - size-=data->smallest->size;
9008 - if (++cur==data->nr_strip_zones) continue;
9009 - size=data->strip_zone[cur].size;
9015 - if (++cur==data->nr_strip_zones) /* Last dev, set unit1 as NULL */
9017 - data->hash_table[i].zone1=NULL;
9021 - zone0_size=size; /* Here, we use a 2nd dev to fill the slot */
9022 - size=data->strip_zone[cur].size;
9023 - data->hash_table[i++].zone1=data->strip_zone+cur;
9024 - size-=(data->smallest->size - zone0_size);
9026 + MOD_INC_USE_COUNT;
9029 + conf = vmalloc(sizeof (raid0_conf_t));
9032 + mddev->private = (void *)conf;
9034 + if (md_check_ordering(mddev)) {
9035 + printk("raid0: disks are not ordered, aborting!\n");
9036 + goto out_free_conf;
9039 + if (create_strip_zones (mddev))
9040 + goto out_free_conf;
9042 + printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]);
9043 + printk("raid0 : conf->smallest->size is %d blocks.\n", conf->smallest->size);
9044 + nb_zone = md_size[mdidx(mddev)]/conf->smallest->size +
9045 + (md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
9046 + printk("raid0 : nb_zone is %d.\n", nb_zone);
9047 + conf->nr_zones = nb_zone;
9049 + printk("raid0 : Allocating %d bytes for hash.\n",
9050 + sizeof(struct raid0_hash)*nb_zone);
9052 + conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
9053 + if (!conf->hash_table)
9054 + goto out_free_zone_conf;
9055 + size = conf->strip_zone[cur].size;
9058 + while (cur < conf->nr_strip_zones) {
9059 + conf->hash_table[i].zone0 = conf->strip_zone + cur;
9062 + * If we completely fill the slot
9064 + if (size >= conf->smallest->size) {
9065 + conf->hash_table[i++].zone1 = NULL;
9066 + size -= conf->smallest->size;
9069 + if (++cur == conf->nr_strip_zones)
9071 + size = conf->strip_zone[cur].size;
9075 + if (++cur == conf->nr_strip_zones) {
9077 + * Last dev, set unit1 as NULL
9079 + conf->hash_table[i].zone1=NULL;
9084 + * Here we use a 2nd dev to fill the slot
9086 + zone0_size = size;
9087 + size = conf->strip_zone[cur].size;
9088 + conf->hash_table[i++].zone1 = conf->strip_zone + cur;
9089 + size -= (conf->smallest->size - zone0_size);
9093 +out_free_zone_conf:
9094 + vfree(conf->strip_zone);
9095 + conf->strip_zone = NULL;
9099 + mddev->private = NULL;
9101 + MOD_DEC_USE_COUNT;
9106 -static int raid0_stop (int minor, struct md_dev *mddev)
9107 +static int raid0_stop (mddev_t *mddev)
9109 - struct raid0_data *data=(struct raid0_data *) mddev->private;
9110 + raid0_conf_t *conf = mddev_to_conf(mddev);
9112 - vfree (data->hash_table);
9113 - vfree (data->strip_zone);
9115 + vfree (conf->hash_table);
9116 + conf->hash_table = NULL;
9117 + vfree (conf->strip_zone);
9118 + conf->strip_zone = NULL;
9120 + mddev->private = NULL;
9122 - MOD_DEC_USE_COUNT;
9124 + MOD_DEC_USE_COUNT;
9129 @@ -167,135 +221,140 @@
9130 * Of course, those facts may not be valid anymore (and surely won't...)
9131 * Hey guys, there's some work out there ;-)
9133 -static int raid0_map (struct md_dev *mddev, kdev_t *rdev,
9134 +static int raid0_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
9135 unsigned long *rsector, unsigned long size)
9137 - struct raid0_data *data=(struct raid0_data *) mddev->private;
9138 - static struct raid0_hash *hash;
9139 - struct strip_zone *zone;
9140 - struct real_dev *tmp_dev;
9141 - int blk_in_chunk, factor, chunk, chunk_size;
9142 - long block, rblock;
9144 - factor=FACTOR(mddev);
9145 - chunk_size=(1UL << FACTOR_SHIFT(factor));
9146 - block=*rsector >> 1;
9147 - hash=data->hash_table+(block/data->smallest->size);
9149 - if (hash - data->hash_table > data->nr_zones)
9151 - printk(KERN_DEBUG "raid0_map: invalid block %li\n", block);
9155 - /* Sanity check */
9156 - if ((chunk_size*2)<(*rsector % (chunk_size*2))+size)
9158 - printk ("raid0_convert : can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
9162 - if (block >= (hash->zone0->size +
9163 - hash->zone0->zone_offset))
9167 - printk ("raid0_convert : hash->zone1==NULL for block %ld\n", block);
9175 + raid0_conf_t *conf = mddev_to_conf(mddev);
9176 + struct raid0_hash *hash;
9177 + struct strip_zone *zone;
9178 + mdk_rdev_t *tmp_dev;
9179 + int blk_in_chunk, chunksize_bits, chunk, chunk_size;
9180 + long block, rblock;
9182 + chunk_size = mddev->param.chunk_size >> 10;
9183 + chunksize_bits = ffz(~chunk_size);
9184 + block = *rsector >> 1;
9185 + hash = conf->hash_table + block / conf->smallest->size;
9187 + if (hash - conf->hash_table > conf->nr_zones) {
9188 + printk(KERN_DEBUG "raid0_map: invalid block %lu\n", block);
9192 + /* Sanity check */
9193 + if ((chunk_size * 2) < (*rsector % (chunk_size * 2)) + size)
9202 + if (block >= (hash->zone0->size + hash->zone0->zone_offset)) {
9205 + zone = hash->zone1;
9207 + zone = hash->zone0;
9209 - blk_in_chunk=block & (chunk_size -1);
9210 - chunk=(block - zone->zone_offset) / (zone->nb_dev<<FACTOR_SHIFT(factor));
9211 - tmp_dev=zone->dev[(block >> FACTOR_SHIFT(factor)) % zone->nb_dev];
9212 - rblock=(chunk << FACTOR_SHIFT(factor)) + blk_in_chunk + zone->dev_offset;
9213 + blk_in_chunk = block & (chunk_size -1);
9214 + chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
9215 + tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev];
9216 + rblock = (chunk << chunksize_bits) + blk_in_chunk + zone->dev_offset;
9218 - *rdev=tmp_dev->dev;
9219 - *rsector=rblock<<1;
9220 + *rdev = tmp_dev->dev;
9221 + *rsector = rblock << 1;
9227 + printk ("raid0_map bug: can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
9230 + printk("raid0_map bug: hash==NULL for block %ld\n", block);
9233 + printk ("raid0_map bug: hash->zone0==NULL for block %ld\n", block);
9236 + printk ("raid0_map bug: hash->zone1==NULL for block %ld\n", block);
9241 -static int raid0_status (char *page, int minor, struct md_dev *mddev)
9242 +static int raid0_status (char *page, mddev_t *mddev)
9249 - struct raid0_data *data=(struct raid0_data *) mddev->private;
9251 + raid0_conf_t *conf = mddev_to_conf(mddev);
9253 - sz+=sprintf (page+sz, " ");
9254 - for (j=0; j<data->nr_zones; j++)
9256 - sz+=sprintf (page+sz, "[z%d",
9257 - data->hash_table[j].zone0-data->strip_zone);
9258 - if (data->hash_table[j].zone1)
9259 - sz+=sprintf (page+sz, "/z%d] ",
9260 - data->hash_table[j].zone1-data->strip_zone);
9262 - sz+=sprintf (page+sz, "] ");
9264 + sz += sprintf(page + sz, " ");
9265 + for (j = 0; j < conf->nr_zones; j++) {
9266 + sz += sprintf(page + sz, "[z%d",
9267 + conf->hash_table[j].zone0 - conf->strip_zone);
9268 + if (conf->hash_table[j].zone1)
9269 + sz += sprintf(page+sz, "/z%d] ",
9270 + conf->hash_table[j].zone1 - conf->strip_zone);
9272 + sz += sprintf(page+sz, "] ");
9275 - sz+=sprintf (page+sz, "\n");
9276 + sz += sprintf(page + sz, "\n");
9278 - for (j=0; j<data->nr_strip_zones; j++)
9280 - sz+=sprintf (page+sz, " z%d=[", j);
9281 - for (k=0; k<data->strip_zone[j].nb_dev; k++)
9282 - sz+=sprintf (page+sz, "%s/",
9283 - partition_name(data->strip_zone[j].dev[k]->dev));
9285 - sz+=sprintf (page+sz, "] zo=%d do=%d s=%d\n",
9286 - data->strip_zone[j].zone_offset,
9287 - data->strip_zone[j].dev_offset,
9288 - data->strip_zone[j].size);
9290 + for (j = 0; j < conf->nr_strip_zones; j++) {
9291 + sz += sprintf(page + sz, " z%d=[", j);
9292 + for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
9293 + sz += sprintf (page+sz, "%s/", partition_name(
9294 + conf->strip_zone[j].dev[k]->dev));
9296 + sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
9297 + conf->strip_zone[j].zone_offset,
9298 + conf->strip_zone[j].dev_offset,
9299 + conf->strip_zone[j].size);
9302 - sz+=sprintf (page+sz, " %dk chunks", 1<<FACTOR_SHIFT(FACTOR(mddev)));
9304 + sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
9309 -static struct md_personality raid0_personality=
9310 +static mdk_personality_t raid0_personality=
9314 - NULL, /* no special make_request */
9315 - NULL, /* no special end_request */
9319 - NULL, /* no ioctls */
9321 - NULL, /* no error_handler */
9322 - NULL, /* hot_add_disk */
9323 - NULL, /* hot_remove_disk */
9324 - NULL /* mark_spare */
9327 + NULL, /* no special make_request */
9328 + NULL, /* no special end_request */
9332 + NULL, /* no ioctls */
9334 + NULL, /* no error_handler */
9335 + NULL, /* no diskop */
9336 + NULL, /* no stop resync */
9337 + NULL /* no restart resync */
9343 void raid0_init (void)
9345 - register_md_personality (RAID0, &raid0_personality);
9346 + register_md_personality (RAID0, &raid0_personality);
9351 int init_module (void)
9353 - return (register_md_personality (RAID0, &raid0_personality));
9354 + return (register_md_personality (RAID0, &raid0_personality));
9357 void cleanup_module (void)
9359 - unregister_md_personality (RAID0);
9360 + unregister_md_personality (RAID0);
9365 --- linux/drivers/block/raid1.c.orig Mon Dec 11 01:49:41 2000
9366 +++ linux/drivers/block/raid1.c Tue Jan 16 13:42:04 2001
9368 -/************************************************************************
9370 * raid1.c : Multiple Devices driver for Linux
9371 - * Copyright (C) 1996 Ingo Molnar, Miguel de Icaza, Gadi Oxman
9372 + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
9374 * RAID-1 management functions.
9379 #include <linux/module.h>
9380 -#include <linux/locks.h>
9381 #include <linux/malloc.h>
9382 -#include <linux/md.h>
9383 -#include <linux/raid1.h>
9384 -#include <asm/bitops.h>
9385 +#include <linux/raid/raid1.h>
9386 #include <asm/atomic.h>
9388 #define MAJOR_NR MD_MAJOR
9390 #define MD_PERSONALITY
9393 - * The following can be used to debug the driver
9395 -/*#define RAID1_DEBUG*/
9397 -#define PRINTK(x) do { printk x; } while (0);
9399 -#define PRINTK(x) do { ; } while (0);
9401 +#define MAX_LINEAR_SECTORS 128
9403 #define MAX(a,b) ((a) > (b) ? (a) : (b))
9404 #define MIN(a,b) ((a) < (b) ? (a) : (b))
9406 -static struct md_personality raid1_personality;
9407 -static struct md_thread *raid1_thread = NULL;
9408 +static mdk_personality_t raid1_personality;
9409 struct buffer_head *raid1_retry_list = NULL;
9411 -static int __raid1_map (struct md_dev *mddev, kdev_t *rdev,
9412 +static void * raid1_kmalloc (int size)
9416 + * now we are rather fault tolerant than nice, but
9417 + * there are a couple of places in the RAID code where we
9418 + * simply can not afford to fail an allocation because
9419 + * there is no failure return path (eg. make_request())
9421 + while (!(ptr = kmalloc (sizeof (raid1_conf_t), GFP_BUFFER))) {
9422 + printk ("raid1: out of memory, retrying...\n");
9423 + current->state = TASK_UNINTERRUPTIBLE;
9424 + schedule_timeout(HZ/10);
9427 + memset(ptr, 0, size);
9431 +static int __raid1_map (mddev_t *mddev, kdev_t *rdev,
9432 unsigned long *rsector, unsigned long size)
9434 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9435 - int i, n = raid_conf->raid_disks;
9436 + raid1_conf_t *conf = mddev_to_conf(mddev);
9437 + int i, disks = MD_SB_DISKS;
9440 * Later we do read balancing on the read side
9441 * now we use the first available disk.
9444 - PRINTK(("raid1_map().\n"));
9446 - for (i=0; i<n; i++) {
9447 - if (raid_conf->mirrors[i].operational) {
9448 - *rdev = raid_conf->mirrors[i].dev;
9449 + for (i = 0; i < disks; i++) {
9450 + if (conf->mirrors[i].operational) {
9451 + *rdev = conf->mirrors[i].dev;
9459 -static int raid1_map (struct md_dev *mddev, kdev_t *rdev,
9460 +static int raid1_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
9461 unsigned long *rsector, unsigned long size)
9466 -void raid1_reschedule_retry (struct buffer_head *bh)
9467 +static void raid1_reschedule_retry (struct buffer_head *bh)
9469 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
9471 - PRINTK(("raid1_reschedule_retry().\n"));
9472 + mddev_t *mddev = r1_bh->mddev;
9473 + raid1_conf_t *conf = mddev_to_conf(mddev);
9475 r1_bh->next_retry = raid1_retry_list;
9476 raid1_retry_list = bh;
9477 - md_wakeup_thread(raid1_thread);
9478 + md_wakeup_thread(conf->thread);
9482 - * raid1_end_buffer_io() is called when we have finished servicing a mirrored
9483 + * raid1_end_bh_io() is called when we have finished servicing a mirrored
9484 * operation and are ready to return a success/failure code to the buffer
9487 -static inline void raid1_end_buffer_io(struct raid1_bh *r1_bh, int uptodate)
9488 +static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
9490 struct buffer_head *bh = r1_bh->master_bh;
9496 -int raid1_one_error=0;
9498 void raid1_end_request (struct buffer_head *bh, int uptodate)
9500 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
9501 @@ -106,12 +109,7 @@
9505 - PRINTK(("raid1_end_request().\n"));
9507 - if (raid1_one_error) {
9508 - raid1_one_error=0;
9512 * this branch is our 'one mirror IO has finished' event handler:
9514 @@ -136,15 +134,11 @@
9517 if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
9519 - PRINTK(("raid1_end_request(), read branch.\n"));
9522 * we have only one buffer_head on the read side
9525 - PRINTK(("raid1_end_request(), read branch, uptodate.\n"));
9526 - raid1_end_buffer_io(r1_bh, uptodate);
9527 + raid1_end_bh_io(r1_bh, uptodate);
9528 restore_flags(flags);
9531 @@ -152,71 +146,56 @@
9534 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
9535 - kdevname(bh->b_dev), bh->b_blocknr);
9536 - raid1_reschedule_retry (bh);
9537 + partition_name(bh->b_dev), bh->b_blocknr);
9538 + raid1_reschedule_retry(bh);
9539 restore_flags(flags);
9544 - * WRITE or WRITEA.
9546 - PRINTK(("raid1_end_request(), write branch.\n"));
9551 * Let's see if all mirrored write operations have finished
9552 - * already [we have irqs off, so we can decrease]:
9556 - if (!--r1_bh->remaining) {
9557 - struct md_dev *mddev = r1_bh->mddev;
9558 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9559 - int i, n = raid_conf->raid_disks;
9561 - PRINTK(("raid1_end_request(), remaining == 0.\n"));
9562 + if (atomic_dec_and_test(&r1_bh->remaining)) {
9563 + int i, disks = MD_SB_DISKS;
9565 - for ( i=0; i<n; i++)
9566 - if (r1_bh->mirror_bh[i]) kfree(r1_bh->mirror_bh[i]);
9567 + for ( i = 0; i < disks; i++)
9568 + if (r1_bh->mirror_bh[i])
9569 + kfree(r1_bh->mirror_bh[i]);
9571 - raid1_end_buffer_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
9572 + raid1_end_bh_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
9574 - else PRINTK(("raid1_end_request(), remaining == %u.\n", r1_bh->remaining));
9575 restore_flags(flags);
9578 -/* This routine checks if the undelying device is an md device and in that
9579 - * case it maps the blocks before putting the request on the queue
9581 + * This routine checks if the underlying device is an md device
9582 + * and in that case it maps the blocks before putting the
9583 + * request on the queue
9586 -map_and_make_request (int rw, struct buffer_head *bh)
9587 +static void map_and_make_request (int rw, struct buffer_head *bh)
9589 if (MAJOR (bh->b_rdev) == MD_MAJOR)
9590 - md_map (MINOR (bh->b_rdev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
9591 + md_map (bh->b_rdev, &bh->b_rdev,
9592 + &bh->b_rsector, bh->b_size >> 9);
9593 clear_bit(BH_Lock, &bh->b_state);
9594 make_request (MAJOR (bh->b_rdev), rw, bh);
9598 -raid1_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
9599 +static int raid1_make_request (mddev_t *mddev, int rw,
9600 + struct buffer_head * bh)
9603 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9604 + raid1_conf_t *conf = mddev_to_conf(mddev);
9605 struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req;
9606 struct raid1_bh * r1_bh;
9607 - int n = raid_conf->raid_disks, i, sum_bhs = 0, switch_disks = 0, sectors;
9608 + int disks = MD_SB_DISKS;
9609 + int i, sum_bhs = 0, switch_disks = 0, sectors, lowprio = 0;
9610 struct mirror_info *mirror;
9612 - PRINTK(("raid1_make_request().\n"));
9614 - while (!( /* FIXME: now we are rather fault tolerant than nice */
9615 - r1_bh = kmalloc (sizeof (struct raid1_bh), GFP_BUFFER)
9618 - printk ("raid1_make_request(#1): out of memory\n");
9619 - current->policy |= SCHED_YIELD;
9622 - memset (r1_bh, 0, sizeof (struct raid1_bh));
9623 + r1_bh = raid1_kmalloc (sizeof (struct raid1_bh));
9626 * make_request() can abort the operation when READA or WRITEA are being
9627 @@ -227,43 +206,65 @@
9628 if (rw == READA) rw = READ;
9629 if (rw == WRITEA) rw = WRITE;
9631 - if (rw == WRITE || rw == WRITEA)
9632 - mark_buffer_clean(bh); /* Too early ? */
9633 + if (rw == WRITE) {
9637 + mark_buffer_clean(bh);
9639 + * not too early. we _first_ clean the bh, then we start
9640 + * the IO, then when the IO has finished, we unlock the
9641 + * bh and mark it uptodate. This way we do not miss the
9642 + * case when the bh got dirty again during the IO.
9647 + * special flag for 'lowprio' reconstruction requests ...
9649 + if (buffer_lowprio(bh))
9653 - * i think the read and write branch should be separated completely, since we want
9654 - * to do read balancing on the read side for example. Comments? :) --mingo
9655 + * i think the read and write branch should be separated completely,
9656 + * since we want to do read balancing on the read side for example.
9657 + * Comments? :) --mingo
9660 r1_bh->master_bh=bh;
9664 - if (rw==READ || rw==READA) {
9665 - int last_used = raid_conf->last_used;
9666 - PRINTK(("raid1_make_request(), read branch.\n"));
9667 - mirror = raid_conf->mirrors + last_used;
9669 + int last_used = conf->last_used;
9672 + * read balancing logic:
9674 + mirror = conf->mirrors + last_used;
9675 bh->b_rdev = mirror->dev;
9676 sectors = bh->b_size >> 9;
9677 - if (bh->b_blocknr * sectors == raid_conf->next_sect) {
9678 - raid_conf->sect_count += sectors;
9679 - if (raid_conf->sect_count >= mirror->sect_limit)
9681 + if (bh->b_blocknr * sectors == conf->next_sect) {
9682 + conf->sect_count += sectors;
9683 + if (conf->sect_count >= mirror->sect_limit)
9687 - raid_conf->next_sect = (bh->b_blocknr + 1) * sectors;
9688 - if (switch_disks) {
9689 - PRINTK(("read-balancing: switching %d -> %d (%d sectors)\n", last_used, mirror->next, raid_conf->sect_count));
9690 - raid_conf->sect_count = 0;
9691 - last_used = raid_conf->last_used = mirror->next;
9692 + conf->next_sect = (bh->b_blocknr + 1) * sectors;
9694 + * Do not switch disks if full resync is in progress ...
9696 + if (switch_disks && !conf->resync_mirrors) {
9697 + conf->sect_count = 0;
9698 + last_used = conf->last_used = mirror->next;
9700 - * Do not switch to write-only disks ... resyncing
9702 + * Do not switch to write-only disks ...
9703 + * reconstruction is in progress
9705 - while (raid_conf->mirrors[last_used].write_only)
9706 - raid_conf->last_used = raid_conf->mirrors[last_used].next;
9707 + while (conf->mirrors[last_used].write_only)
9708 + conf->last_used = conf->mirrors[last_used].next;
9710 - PRINTK (("raid1 read queue: %d %d\n", MAJOR (bh->b_rdev), MINOR (bh->b_rdev)));
9711 bh_req = &r1_bh->bh_req;
9712 memcpy(bh_req, bh, sizeof(*bh));
9713 bh_req->b_end_io = raid1_end_request;
9714 @@ -273,13 +274,12 @@
9718 - * WRITE or WRITEA.
9721 - PRINTK(("raid1_make_request(n=%d), write branch.\n",n));
9723 - for (i = 0; i < n; i++) {
9724 + for (i = 0; i < disks; i++) {
9726 - if (!raid_conf->mirrors [i].operational) {
9727 + if (!conf->mirrors[i].operational) {
9729 * the r1_bh->mirror_bh[i] pointer remains NULL
9731 @@ -287,89 +287,91 @@
9736 + * special case for reconstruction ...
9738 + if (lowprio && (i == conf->last_used)) {
9739 + mirror_bh[i] = NULL;
9744 + * We should use a private pool (size depending on NR_REQUEST),
9745 + * to avoid writes filling up the memory with bhs
9747 + * Such pools are much faster than kmalloc anyways (so we waste
9748 + * almost nothing by not using the master bh when writing and
9749 + * win a lot of cleanness) but for now we are cool enough. --mingo
9751 + * It's safe to sleep here, buffer heads cannot be used in a shared
9752 + * manner in the write branch. Look how we lock the buffer at the
9753 + * beginning of this function to grok the difference ;)
9755 + mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head));
9757 + * prepare mirrored bh (fields ordered for max mem throughput):
9759 + mirror_bh[i]->b_blocknr = bh->b_blocknr;
9760 + mirror_bh[i]->b_dev = bh->b_dev;
9761 + mirror_bh[i]->b_rdev = conf->mirrors[i].dev;
9762 + mirror_bh[i]->b_rsector = bh->b_rsector;
9763 + mirror_bh[i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
9765 + mirror_bh[i]->b_state |= (1<<BH_LowPrio);
9767 + mirror_bh[i]->b_count = 1;
9768 + mirror_bh[i]->b_size = bh->b_size;
9769 + mirror_bh[i]->b_data = bh->b_data;
9770 + mirror_bh[i]->b_list = BUF_LOCKED;
9771 + mirror_bh[i]->b_end_io = raid1_end_request;
9772 + mirror_bh[i]->b_dev_id = r1_bh;
9774 + r1_bh->mirror_bh[i] = mirror_bh[i];
9778 + md_atomic_set(&r1_bh->remaining, sum_bhs);
9781 - * We should use a private pool (size depending on NR_REQUEST),
9782 - * to avoid writes filling up the memory with bhs
9784 - * Such pools are much faster than kmalloc anyways (so we waste almost
9785 - * nothing by not using the master bh when writing and win alot of cleanness)
9787 - * but for now we are cool enough. --mingo
9789 - * It's safe to sleep here, buffer heads cannot be used in a shared
9790 - * manner in the write branch. Look how we lock the buffer at the beginning
9791 - * of this function to grok the difference ;)
9793 - while (!( /* FIXME: now we are rather fault tolerant than nice */
9794 - mirror_bh[i] = kmalloc (sizeof (struct buffer_head), GFP_BUFFER)
9797 - printk ("raid1_make_request(#2): out of memory\n");
9798 - current->policy |= SCHED_YIELD;
9801 - memset (mirror_bh[i], 0, sizeof (struct buffer_head));
9804 - * prepare mirrored bh (fields ordered for max mem throughput):
9806 - mirror_bh [i]->b_blocknr = bh->b_blocknr;
9807 - mirror_bh [i]->b_dev = bh->b_dev;
9808 - mirror_bh [i]->b_rdev = raid_conf->mirrors [i].dev;
9809 - mirror_bh [i]->b_rsector = bh->b_rsector;
9810 - mirror_bh [i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
9811 - mirror_bh [i]->b_count = 1;
9812 - mirror_bh [i]->b_size = bh->b_size;
9813 - mirror_bh [i]->b_data = bh->b_data;
9814 - mirror_bh [i]->b_list = BUF_LOCKED;
9815 - mirror_bh [i]->b_end_io = raid1_end_request;
9816 - mirror_bh [i]->b_dev_id = r1_bh;
9818 - r1_bh->mirror_bh[i] = mirror_bh[i];
9822 - r1_bh->remaining = sum_bhs;
9824 - PRINTK(("raid1_make_request(), write branch, sum_bhs=%d.\n",sum_bhs));
9827 - * We have to be a bit careful about the semaphore above, thats why we
9828 - * start the requests separately. Since kmalloc() could fail, sleep and
9829 - * make_request() can sleep too, this is the safer solution. Imagine,
9830 - * end_request decreasing the semaphore before we could have set it up ...
9831 - * We could play tricks with the semaphore (presetting it and correcting
9832 - * at the end if sum_bhs is not 'n' but we have to do end_request by hand
9833 - * if all requests finish until we had a chance to set up the semaphore
9834 - * correctly ... lots of races).
9836 - for (i = 0; i < n; i++)
9837 - if (mirror_bh [i] != NULL)
9838 - map_and_make_request (rw, mirror_bh [i]);
9839 + * We have to be a bit careful about the semaphore above, that's
9840 + * why we start the requests separately. Since kmalloc() could
9841 + * fail, sleep and make_request() can sleep too, this is the
9842 + * safer solution. Imagine, end_request decreasing the semaphore
9843 + * before we could have set it up ... We could play tricks with
9844 + * the semaphore (presetting it and correcting at the end if
9845 + * sum_bhs is not 'n' but we have to do end_request by hand if
9846 + * all requests finish until we had a chance to set up the
9847 + * semaphore correctly ... lots of races).
9849 + for (i = 0; i < disks; i++)
9851 + map_and_make_request(rw, mirror_bh[i]);
9856 -static int raid1_status (char *page, int minor, struct md_dev *mddev)
9857 +static int raid1_status (char *page, mddev_t *mddev)
9859 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9860 + raid1_conf_t *conf = mddev_to_conf(mddev);
9863 - sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
9864 - for (i = 0; i < raid_conf->raid_disks; i++)
9865 - sz += sprintf (page+sz, "%s", raid_conf->mirrors [i].operational ? "U" : "_");
9866 + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
9867 + conf->working_disks);
9868 + for (i = 0; i < conf->raid_disks; i++)
9869 + sz += sprintf (page+sz, "%s",
9870 + conf->mirrors[i].operational ? "U" : "_");
9871 sz += sprintf (page+sz, "]");
9875 -static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index)
9876 +static void unlink_disk (raid1_conf_t *conf, int target)
9878 - int disks = raid_conf->raid_disks;
9880 + int disks = MD_SB_DISKS;
9883 - for (j = 0; j < disks; j++)
9884 - if (raid_conf->mirrors [j].next == failed_index)
9885 - raid_conf->mirrors [j].next = raid_conf->mirrors [failed_index].next;
9886 + for (i = 0; i < disks; i++)
9887 + if (conf->mirrors[i].next == target)
9888 + conf->mirrors[i].next = conf->mirrors[target].next;
9891 #define LAST_DISK KERN_ALERT \
9892 @@ -388,48 +390,53 @@
9893 #define ALREADY_SYNCING KERN_INFO \
9894 "raid1: syncing already in progress.\n"
9896 -static int raid1_error (struct md_dev *mddev, kdev_t dev)
9897 +static void mark_disk_bad (mddev_t *mddev, int failed)
9899 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9900 - struct mirror_info *mirror;
9901 - md_superblock_t *sb = mddev->sb;
9902 - int disks = raid_conf->raid_disks;
9904 + raid1_conf_t *conf = mddev_to_conf(mddev);
9905 + struct mirror_info *mirror = conf->mirrors+failed;
9906 + mdp_super_t *sb = mddev->sb;
9908 + mirror->operational = 0;
9909 + unlink_disk(conf, failed);
9910 + mark_disk_faulty(sb->disks+mirror->number);
9911 + mark_disk_nonsync(sb->disks+mirror->number);
9912 + mark_disk_inactive(sb->disks+mirror->number);
9913 + sb->active_disks--;
9914 + sb->working_disks--;
9915 + sb->failed_disks++;
9916 + mddev->sb_dirty = 1;
9917 + md_wakeup_thread(conf->thread);
9918 + conf->working_disks--;
9919 + printk (DISK_FAILED, partition_name (mirror->dev),
9920 + conf->working_disks);
9923 - PRINTK(("raid1_error called\n"));
9924 +static int raid1_error (mddev_t *mddev, kdev_t dev)
9926 + raid1_conf_t *conf = mddev_to_conf(mddev);
9927 + struct mirror_info * mirrors = conf->mirrors;
9928 + int disks = MD_SB_DISKS;
9931 - if (raid_conf->working_disks == 1) {
9932 + if (conf->working_disks == 1) {
9934 * Uh oh, we can do nothing if this is our last disk, but
9935 * first check if this is a queued request for a device
9936 * which has just failed.
9938 - for (i = 0, mirror = raid_conf->mirrors; i < disks;
9940 - if (mirror->dev == dev && !mirror->operational)
9941 + for (i = 0; i < disks; i++) {
9942 + if (mirrors[i].dev==dev && !mirrors[i].operational)
9947 - /* Mark disk as unusable */
9948 - for (i = 0, mirror = raid_conf->mirrors; i < disks;
9950 - if (mirror->dev == dev && mirror->operational){
9951 - mirror->operational = 0;
9952 - raid1_fix_links (raid_conf, i);
9953 - sb->disks[mirror->number].state |=
9954 - (1 << MD_FAULTY_DEVICE);
9955 - sb->disks[mirror->number].state &=
9956 - ~(1 << MD_SYNC_DEVICE);
9957 - sb->disks[mirror->number].state &=
9958 - ~(1 << MD_ACTIVE_DEVICE);
9959 - sb->active_disks--;
9960 - sb->working_disks--;
9961 - sb->failed_disks++;
9962 - mddev->sb_dirty = 1;
9963 - md_wakeup_thread(raid1_thread);
9964 - raid_conf->working_disks--;
9965 - printk (DISK_FAILED, kdevname (dev),
9966 - raid_conf->working_disks);
9968 + * Mark disk as unusable
9970 + for (i = 0; i < disks; i++) {
9971 + if (mirrors[i].dev==dev && mirrors[i].operational) {
9972 + mark_disk_bad (mddev, i);
9977 @@ -442,219 +449,396 @@
9978 #undef START_SYNCING
9981 - * This is the personality-specific hot-addition routine
9982 + * Insert the spare disk into the drive-ring
9984 +static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror)
9987 + int disks = MD_SB_DISKS;
9988 + struct mirror_info *p = conf->mirrors;
9990 -#define NO_SUPERBLOCK KERN_ERR \
9991 -"raid1: cannot hot-add disk to the array with no RAID superblock\n"
9992 + for (j = 0; j < disks; j++, p++)
9993 + if (p->operational && !p->write_only) {
9995 + p->next = mirror->raid_disk;
9996 + mirror->next = next;
10000 -#define WRONG_LEVEL KERN_ERR \
10001 -"raid1: hot-add: level of disk is not RAID-1\n"
10002 + printk("raid1: bug: no read-operational devices\n");
10005 -#define HOT_ADD_SUCCEEDED KERN_INFO \
10006 -"raid1: device %s hot-added\n"
10007 +static void print_raid1_conf (raid1_conf_t *conf)
10010 + struct mirror_info *tmp;
10012 -static int raid1_hot_add_disk (struct md_dev *mddev, kdev_t dev)
10013 + printk("RAID1 conf printout:\n");
10015 + printk("(conf==NULL)\n");
10018 + printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
10019 + conf->raid_disks, conf->nr_disks);
10021 + for (i = 0; i < MD_SB_DISKS; i++) {
10022 + tmp = conf->mirrors + i;
10023 + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
10024 + i, tmp->spare,tmp->operational,
10025 + tmp->number,tmp->raid_disk,tmp->used_slot,
10026 + partition_name(tmp->dev));
10030 +static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
10033 + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
10034 + raid1_conf_t *conf = mddev->private;
10035 + struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
10036 unsigned long flags;
10037 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
10038 - struct mirror_info *mirror;
10039 - md_superblock_t *sb = mddev->sb;
10040 - struct real_dev * realdev;
10042 + mdp_super_t *sb = mddev->sb;
10043 + mdp_disk_t *failed_desc, *spare_desc, *added_desc;
10045 + save_flags(flags);
10048 + print_raid1_conf(conf);
10050 - * The device has its superblock already read and it was found
10051 - * to be consistent for generic RAID usage. Now we check whether
10052 - * it's usable for RAID-1 hot addition.
10053 + * find the disk ...
10057 - n = mddev->nb_dev++;
10058 - realdev = &mddev->devices[n];
10059 - if (!realdev->sb) {
10060 - printk (NO_SUPERBLOCK);
10063 - if (realdev->sb->level != 1) {
10064 - printk (WRONG_LEVEL);
10066 + case DISKOP_SPARE_ACTIVE:
10069 + * Find the failed disk within the RAID1 configuration ...
10070 + * (this can only be in the first conf->working_disks part)
10072 + for (i = 0; i < conf->raid_disks; i++) {
10073 + tmp = conf->mirrors + i;
10074 + if ((!tmp->operational && !tmp->spare) ||
10075 + !tmp->used_slot) {
10081 + * When we activate a spare disk we _must_ have a disk in
10082 + * the lower (active) part of the array to replace.
10084 + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
10089 + /* fall through */
10091 + case DISKOP_SPARE_WRITE:
10092 + case DISKOP_SPARE_INACTIVE:
10095 + * Find the spare disk ... (can only be in the 'high'
10096 + * area of the array)
10098 + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
10099 + tmp = conf->mirrors + i;
10100 + if (tmp->spare && tmp->number == (*d)->number) {
10105 + if (spare_disk == -1) {
10112 + case DISKOP_HOT_REMOVE_DISK:
10114 + for (i = 0; i < MD_SB_DISKS; i++) {
10115 + tmp = conf->mirrors + i;
10116 + if (tmp->used_slot && (tmp->number == (*d)->number)) {
10117 + if (tmp->operational) {
10121 + removed_disk = i;
10125 + if (removed_disk == -1) {
10132 + case DISKOP_HOT_ADD_DISK:
10134 + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
10135 + tmp = conf->mirrors + i;
10136 + if (!tmp->used_slot) {
10141 + if (added_disk == -1) {
10148 - /* FIXME: are there other things left we could sanity-check? */
10152 - * We have to disable interrupts, as our RAID-1 state is used
10153 - * from irq handlers as well.
10154 + * Switch the spare disk to write-only mode:
10156 - save_flags(flags);
10158 + case DISKOP_SPARE_WRITE:
10159 + sdisk = conf->mirrors + spare_disk;
10160 + sdisk->operational = 1;
10161 + sdisk->write_only = 1;
10164 + * Deactivate a spare disk:
10166 + case DISKOP_SPARE_INACTIVE:
10167 + sdisk = conf->mirrors + spare_disk;
10168 + sdisk->operational = 0;
10169 + sdisk->write_only = 0;
10172 + * Activate (mark read-write) the (now sync) spare disk,
10173 + * which means we switch it's 'raid position' (->raid_disk)
10174 + * with the failed disk. (only the first 'conf->nr_disks'
10175 + * slots are used for 'real' disks and we must preserve this
10178 + case DISKOP_SPARE_ACTIVE:
10180 - raid_conf->raid_disks++;
10181 - mirror = raid_conf->mirrors+n;
10182 + sdisk = conf->mirrors + spare_disk;
10183 + fdisk = conf->mirrors + failed_disk;
10185 - mirror->number=n;
10186 - mirror->raid_disk=n;
10188 - mirror->next=0; /* FIXME */
10189 - mirror->sect_limit=128;
10191 - mirror->operational=0;
10193 - mirror->write_only=0;
10195 - sb->disks[n].state |= (1 << MD_FAULTY_DEVICE);
10196 - sb->disks[n].state &= ~(1 << MD_SYNC_DEVICE);
10197 - sb->disks[n].state &= ~(1 << MD_ACTIVE_DEVICE);
10199 - sb->spare_disks++;
10200 + spare_desc = &sb->disks[sdisk->number];
10201 + failed_desc = &sb->disks[fdisk->number];
10203 - restore_flags(flags);
10204 + if (spare_desc != *d) {
10210 - md_update_sb(MINOR(dev));
10211 + if (spare_desc->raid_disk != sdisk->raid_disk) {
10217 + if (sdisk->raid_disk != spare_disk) {
10223 - printk (HOT_ADD_SUCCEEDED, kdevname(realdev->dev));
10224 + if (failed_desc->raid_disk != fdisk->raid_disk) {
10232 + if (fdisk->raid_disk != failed_disk) {
10238 -#undef NO_SUPERBLOCK
10239 -#undef WRONG_LEVEL
10240 -#undef HOT_ADD_SUCCEEDED
10242 + * do the switch finally
10244 + xchg_values(*spare_desc, *failed_desc);
10245 + xchg_values(*fdisk, *sdisk);
10248 - * Insert the spare disk into the drive-ring
10250 -static void add_ring(struct raid1_data *raid_conf, struct mirror_info *mirror)
10253 - struct mirror_info *p = raid_conf->mirrors;
10255 + * (careful, 'failed' and 'spare' are switched from now on)
10257 + * we want to preserve linear numbering and we want to
10258 + * give the proper raid_disk number to the now activated
10259 + * disk. (this means we switch back these values)
10262 + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
10263 + xchg_values(sdisk->raid_disk, fdisk->raid_disk);
10264 + xchg_values(spare_desc->number, failed_desc->number);
10265 + xchg_values(sdisk->number, fdisk->number);
10267 - for (j = 0; j < raid_conf->raid_disks; j++, p++)
10268 - if (p->operational && !p->write_only) {
10270 - p->next = mirror->raid_disk;
10271 - mirror->next = next;
10274 - printk("raid1: bug: no read-operational devices\n");
10276 + *d = failed_desc;
10278 -static int raid1_mark_spare(struct md_dev *mddev, md_descriptor_t *spare,
10281 - int i = 0, failed_disk = -1;
10282 - struct raid1_data *raid_conf = mddev->private;
10283 - struct mirror_info *mirror = raid_conf->mirrors;
10284 - md_descriptor_t *descriptor;
10285 - unsigned long flags;
10286 + if (sdisk->dev == MKDEV(0,0))
10287 + sdisk->used_slot = 0;
10289 + * this really activates the spare.
10291 + fdisk->spare = 0;
10292 + fdisk->write_only = 0;
10293 + link_disk(conf, fdisk);
10295 - for (i = 0; i < MD_SB_DISKS; i++, mirror++) {
10296 - if (mirror->spare && mirror->number == spare->number)
10301 - for (i = 0, mirror = raid_conf->mirrors; i < raid_conf->raid_disks;
10303 - if (!mirror->operational)
10306 + * if we activate a spare, we definitely replace a
10307 + * non-operational disk slot in the 'low' area of
10308 + * the disk array.
10311 - save_flags(flags);
10314 - case SPARE_WRITE:
10315 - mirror->operational = 1;
10316 - mirror->write_only = 1;
10317 - raid_conf->raid_disks = MAX(raid_conf->raid_disks,
10318 - mirror->raid_disk + 1);
10320 - case SPARE_INACTIVE:
10321 - mirror->operational = 0;
10322 - mirror->write_only = 0;
10324 - case SPARE_ACTIVE:
10325 - mirror->spare = 0;
10326 - mirror->write_only = 0;
10327 - raid_conf->working_disks++;
10328 - add_ring(raid_conf, mirror);
10330 - if (failed_disk != -1) {
10331 - descriptor = &mddev->sb->disks[raid_conf->mirrors[failed_disk].number];
10332 - i = spare->raid_disk;
10333 - spare->raid_disk = descriptor->raid_disk;
10334 - descriptor->raid_disk = i;
10338 - printk("raid1_mark_spare: bug: state == %d\n", state);
10339 - restore_flags(flags);
10341 + conf->working_disks++;
10345 + case DISKOP_HOT_REMOVE_DISK:
10346 + rdisk = conf->mirrors + removed_disk;
10348 + if (rdisk->spare && (removed_disk < conf->raid_disks)) {
10353 + rdisk->dev = MKDEV(0,0);
10354 + rdisk->used_slot = 0;
10355 + conf->nr_disks--;
10358 + case DISKOP_HOT_ADD_DISK:
10359 + adisk = conf->mirrors + added_disk;
10362 + if (added_disk != added_desc->number) {
10368 + adisk->number = added_desc->number;
10369 + adisk->raid_disk = added_desc->raid_disk;
10370 + adisk->dev = MKDEV(added_desc->major,added_desc->minor);
10372 + adisk->operational = 0;
10373 + adisk->write_only = 0;
10374 + adisk->spare = 1;
10375 + adisk->used_slot = 1;
10376 + conf->nr_disks++;
10386 restore_flags(flags);
10388 + print_raid1_conf(conf);
10393 +#define IO_ERROR KERN_ALERT \
10394 +"raid1: %s: unrecoverable I/O read error for block %lu\n"
10396 +#define REDIRECT_SECTOR KERN_ERR \
10397 +"raid1: %s: redirecting sector %lu to another mirror\n"
10400 * This is a kernel thread which:
10402 * 1. Retries failed read operations on working mirrors.
10403 * 2. Updates the raid superblock when problems encounter.
10405 -void raid1d (void *data)
10406 +static void raid1d (void *data)
10408 struct buffer_head *bh;
10410 unsigned long flags;
10411 - struct raid1_bh * r1_bh;
10412 - struct md_dev *mddev;
10413 + struct raid1_bh *r1_bh;
10416 - PRINTK(("raid1d() active\n"));
10417 - save_flags(flags);
10419 while (raid1_retry_list) {
10420 + save_flags(flags);
10422 bh = raid1_retry_list;
10423 r1_bh = (struct raid1_bh *)(bh->b_dev_id);
10424 raid1_retry_list = r1_bh->next_retry;
10425 restore_flags(flags);
10427 - mddev = md_dev + MINOR(bh->b_dev);
10428 + mddev = kdev_to_mddev(bh->b_dev);
10429 if (mddev->sb_dirty) {
10430 - printk("dirty sb detected, updating.\n");
10431 + printk(KERN_INFO "dirty sb detected, updating.\n");
10432 mddev->sb_dirty = 0;
10433 - md_update_sb(MINOR(bh->b_dev));
10434 + md_update_sb(mddev);
10437 - __raid1_map (md_dev + MINOR(bh->b_dev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
10438 + __raid1_map (mddev, &bh->b_rdev, &bh->b_rsector,
10439 + bh->b_size >> 9);
10440 if (bh->b_rdev == dev) {
10441 - printk (KERN_ALERT
10442 - "raid1: %s: unrecoverable I/O read error for block %lu\n",
10443 - kdevname(bh->b_dev), bh->b_blocknr);
10444 - raid1_end_buffer_io(r1_bh, 0);
10445 + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
10446 + raid1_end_bh_io(r1_bh, 0);
10448 - printk (KERN_ERR "raid1: %s: redirecting sector %lu to another mirror\n",
10449 - kdevname(bh->b_dev), bh->b_blocknr);
10450 + printk (REDIRECT_SECTOR,
10451 + partition_name(bh->b_dev), bh->b_blocknr);
10452 map_and_make_request (r1_bh->cmd, bh);
10456 - restore_flags(flags);
10459 +#undef REDIRECT_SECTOR
10462 + * Private kernel thread to reconstruct mirrors after an unclean
10465 +static void raid1syncd (void *data)
10467 + raid1_conf_t *conf = data;
10468 + mddev_t *mddev = conf->mddev;
10470 + if (!conf->resync_mirrors)
10472 + if (conf->resync_mirrors == 2)
10474 + down(&mddev->recovery_sem);
10475 + if (md_do_sync(mddev, NULL)) {
10476 + up(&mddev->recovery_sem);
10480 + * Only if everything went Ok.
10482 + conf->resync_mirrors = 0;
10483 + up(&mddev->recovery_sem);
10488 * This will catch the scenario in which one of the mirrors was
10489 * mounted as a normal device rather than as a part of a raid set.
10491 + * check_consistency is very personality-dependent, eg. RAID5 cannot
10492 + * do this check, it uses another method.
10494 -static int __check_consistency (struct md_dev *mddev, int row)
10495 +static int __check_consistency (mddev_t *mddev, int row)
10497 - struct raid1_data *raid_conf = mddev->private;
10498 + raid1_conf_t *conf = mddev_to_conf(mddev);
10499 + int disks = MD_SB_DISKS;
10501 struct buffer_head *bh = NULL;
10503 char *buffer = NULL;
10505 - for (i = 0; i < raid_conf->raid_disks; i++) {
10506 - if (!raid_conf->mirrors[i].operational)
10507 + for (i = 0; i < disks; i++) {
10508 + printk("(checking disk %d)\n",i);
10509 + if (!conf->mirrors[i].operational)
10511 - dev = raid_conf->mirrors[i].dev;
10512 + printk("(really checking disk %d)\n",i);
10513 + dev = conf->mirrors[i].dev;
10514 set_blocksize(dev, 4096);
10515 if ((bh = bread(dev, row / 4, 4096)) == NULL)
10517 @@ -683,167 +867,342 @@
10521 -static int check_consistency (struct md_dev *mddev)
10522 +static int check_consistency (mddev_t *mddev)
10524 - int size = mddev->sb->size;
10526 + if (__check_consistency(mddev, 0))
10528 + * we do not do this currently, as it's perfectly possible to
10529 + * have an inconsistent array when it's freshly created. Only
10530 + * newly written data has to be consistent.
10534 - for (row = 0; row < size; row += size / 8)
10535 - if (__check_consistency(mddev, row))
10540 -static int raid1_run (int minor, struct md_dev *mddev)
10541 +#define INVALID_LEVEL KERN_WARNING \
10542 +"raid1: md%d: raid level not set to mirroring (%d)\n"
10544 +#define NO_SB KERN_ERR \
10545 +"raid1: disabled mirror %s (couldn't access raid superblock)\n"
10547 +#define ERRORS KERN_ERR \
10548 +"raid1: disabled mirror %s (errors detected)\n"
10550 +#define NOT_IN_SYNC KERN_ERR \
10551 +"raid1: disabled mirror %s (not in sync)\n"
10553 +#define INCONSISTENT KERN_ERR \
10554 +"raid1: disabled mirror %s (inconsistent descriptor)\n"
10556 +#define ALREADY_RUNNING KERN_ERR \
10557 +"raid1: disabled mirror %s (mirror %d already operational)\n"
10559 +#define OPERATIONAL KERN_INFO \
10560 +"raid1: device %s operational as mirror %d\n"
10562 +#define MEM_ERROR KERN_ERR \
10563 +"raid1: couldn't allocate memory for md%d\n"
10565 +#define SPARE KERN_INFO \
10566 +"raid1: spare disk %s\n"
10568 +#define NONE_OPERATIONAL KERN_ERR \
10569 +"raid1: no operational mirrors for md%d\n"
10571 +#define RUNNING_CKRAID KERN_ERR \
10572 +"raid1: detected mirror differences -- running resync\n"
10574 +#define ARRAY_IS_ACTIVE KERN_INFO \
10575 +"raid1: raid set md%d active with %d out of %d mirrors\n"
10577 +#define THREAD_ERROR KERN_ERR \
10578 +"raid1: couldn't allocate thread for md%d\n"
10580 +#define START_RESYNC KERN_WARNING \
10581 +"raid1: raid set md%d not clean; reconstructing mirrors\n"
10583 +static int raid1_run (mddev_t *mddev)
10585 - struct raid1_data *raid_conf;
10586 - int i, j, raid_disk;
10587 - md_superblock_t *sb = mddev->sb;
10588 - md_descriptor_t *descriptor;
10589 - struct real_dev *realdev;
10590 + raid1_conf_t *conf;
10591 + int i, j, disk_idx;
10592 + struct mirror_info *disk;
10593 + mdp_super_t *sb = mddev->sb;
10594 + mdp_disk_t *descriptor;
10595 + mdk_rdev_t *rdev;
10596 + struct md_list_head *tmp;
10597 + int start_recovery = 0;
10601 if (sb->level != 1) {
10602 - printk("raid1: %s: raid level not set to mirroring (%d)\n",
10603 - kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
10604 - MOD_DEC_USE_COUNT;
10608 - * copy the now verified devices into our private RAID1 bookkeeping
10609 - * area. [whatever we allocate in raid1_run(), should be freed in
10611 + printk(INVALID_LEVEL, mdidx(mddev), sb->level);
10615 + * copy the already verified devices into our private RAID1
10616 + * bookkeeping area. [whatever we allocate in raid1_run(),
10617 + * should be freed in raid1_stop()]
10620 - while (!( /* FIXME: now we are rather fault tolerant than nice */
10621 - mddev->private = kmalloc (sizeof (struct raid1_data), GFP_KERNEL)
10624 - printk ("raid1_run(): out of memory\n");
10625 - current->policy |= SCHED_YIELD;
10628 - raid_conf = mddev->private;
10629 - memset(raid_conf, 0, sizeof(*raid_conf));
10631 - PRINTK(("raid1_run(%d) called.\n", minor));
10633 - for (i = 0; i < mddev->nb_dev; i++) {
10634 - realdev = &mddev->devices[i];
10635 - if (!realdev->sb) {
10636 - printk(KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
10637 + conf = raid1_kmalloc(sizeof(raid1_conf_t));
10638 + mddev->private = conf;
10640 + printk(MEM_ERROR, mdidx(mddev));
10644 + ITERATE_RDEV(mddev,rdev,tmp) {
10645 + if (rdev->faulty) {
10646 + printk(ERRORS, partition_name(rdev->dev));
10653 + if (rdev->desc_nr == -1) {
10659 - * This is important -- we are using the descriptor on
10660 - * the disk only to get a pointer to the descriptor on
10661 - * the main superblock, which might be more recent.
10663 - descriptor = &sb->disks[realdev->sb->descriptor.number];
10664 - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
10665 - printk(KERN_ERR "raid1: disabled mirror %s (errors detected)\n", kdevname(realdev->dev));
10666 + descriptor = &sb->disks[rdev->desc_nr];
10667 + disk_idx = descriptor->raid_disk;
10668 + disk = conf->mirrors + disk_idx;
10670 + if (disk_faulty(descriptor)) {
10671 + disk->number = descriptor->number;
10672 + disk->raid_disk = disk_idx;
10673 + disk->dev = rdev->dev;
10674 + disk->sect_limit = MAX_LINEAR_SECTORS;
10675 + disk->operational = 0;
10676 + disk->write_only = 0;
10678 + disk->used_slot = 1;
10681 - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
10682 - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
10683 - printk(KERN_ERR "raid1: disabled mirror %s (not in sync)\n", kdevname(realdev->dev));
10684 + if (disk_active(descriptor)) {
10685 + if (!disk_sync(descriptor)) {
10686 + printk(NOT_IN_SYNC,
10687 + partition_name(rdev->dev));
10690 - raid_disk = descriptor->raid_disk;
10691 - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
10692 - printk(KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n", kdevname(realdev->dev));
10693 + if ((descriptor->number > MD_SB_DISKS) ||
10694 + (disk_idx > sb->raid_disks)) {
10696 + printk(INCONSISTENT,
10697 + partition_name(rdev->dev));
10700 - if (raid_conf->mirrors[raid_disk].operational) {
10701 - printk(KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n", kdevname(realdev->dev), raid_disk);
10702 + if (disk->operational) {
10703 + printk(ALREADY_RUNNING,
10704 + partition_name(rdev->dev),
10708 - printk(KERN_INFO "raid1: device %s operational as mirror %d\n", kdevname(realdev->dev), raid_disk);
10709 - raid_conf->mirrors[raid_disk].number = descriptor->number;
10710 - raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
10711 - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
10712 - raid_conf->mirrors[raid_disk].operational = 1;
10713 - raid_conf->mirrors[raid_disk].sect_limit = 128;
10714 - raid_conf->working_disks++;
10715 + printk(OPERATIONAL, partition_name(rdev->dev),
10717 + disk->number = descriptor->number;
10718 + disk->raid_disk = disk_idx;
10719 + disk->dev = rdev->dev;
10720 + disk->sect_limit = MAX_LINEAR_SECTORS;
10721 + disk->operational = 1;
10722 + disk->write_only = 0;
10724 + disk->used_slot = 1;
10725 + conf->working_disks++;
10728 * Must be a spare disk ..
10730 - printk(KERN_INFO "raid1: spare disk %s\n", kdevname(realdev->dev));
10731 - raid_disk = descriptor->raid_disk;
10732 - raid_conf->mirrors[raid_disk].number = descriptor->number;
10733 - raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
10734 - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
10735 - raid_conf->mirrors[raid_disk].sect_limit = 128;
10737 - raid_conf->mirrors[raid_disk].operational = 0;
10738 - raid_conf->mirrors[raid_disk].write_only = 0;
10739 - raid_conf->mirrors[raid_disk].spare = 1;
10742 - if (!raid_conf->working_disks) {
10743 - printk(KERN_ERR "raid1: no operational mirrors for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
10744 - kfree(raid_conf);
10745 - mddev->private = NULL;
10746 - MOD_DEC_USE_COUNT;
10750 - raid_conf->raid_disks = sb->raid_disks;
10751 - raid_conf->mddev = mddev;
10753 - for (j = 0; !raid_conf->mirrors[j].operational; j++);
10754 - raid_conf->last_used = j;
10755 - for (i = raid_conf->raid_disks - 1; i >= 0; i--) {
10756 - if (raid_conf->mirrors[i].operational) {
10757 - PRINTK(("raid_conf->mirrors[%d].next == %d\n", i, j));
10758 - raid_conf->mirrors[i].next = j;
10759 + printk(SPARE, partition_name(rdev->dev));
10760 + disk->number = descriptor->number;
10761 + disk->raid_disk = disk_idx;
10762 + disk->dev = rdev->dev;
10763 + disk->sect_limit = MAX_LINEAR_SECTORS;
10764 + disk->operational = 0;
10765 + disk->write_only = 0;
10767 + disk->used_slot = 1;
10770 + if (!conf->working_disks) {
10771 + printk(NONE_OPERATIONAL, mdidx(mddev));
10772 + goto out_free_conf;
10775 + conf->raid_disks = sb->raid_disks;
10776 + conf->nr_disks = sb->nr_disks;
10777 + conf->mddev = mddev;
10779 + for (i = 0; i < MD_SB_DISKS; i++) {
10781 + descriptor = sb->disks+i;
10782 + disk_idx = descriptor->raid_disk;
10783 + disk = conf->mirrors + disk_idx;
10785 + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
10786 + !disk->used_slot) {
10788 + disk->number = descriptor->number;
10789 + disk->raid_disk = disk_idx;
10790 + disk->dev = MKDEV(0,0);
10792 + disk->operational = 0;
10793 + disk->write_only = 0;
10795 + disk->used_slot = 1;
10800 + * find the first working one and use it as a starting point
10801 + * to read balancing.
10803 + for (j = 0; !conf->mirrors[j].operational; j++)
10805 + conf->last_used = j;
10808 + * initialize the 'working disks' list.
10810 + for (i = conf->raid_disks - 1; i >= 0; i--) {
10811 + if (conf->mirrors[i].operational) {
10812 + conf->mirrors[i].next = j;
10817 - if (check_consistency(mddev)) {
10818 - printk(KERN_ERR "raid1: detected mirror differences -- run ckraid\n");
10819 - sb->state |= 1 << MD_SB_ERRORS;
10820 - kfree(raid_conf);
10821 - mddev->private = NULL;
10822 - MOD_DEC_USE_COUNT;
10824 + if (conf->working_disks != sb->raid_disks) {
10825 + printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
10826 + start_recovery = 1;
10829 + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
10831 + * we do sanity checks even if the device says
10834 + if (check_consistency(mddev)) {
10835 + printk(RUNNING_CKRAID);
10836 + sb->state &= ~(1 << MD_SB_CLEAN);
10841 + const char * name = "raid1d";
10843 + conf->thread = md_register_thread(raid1d, conf, name);
10844 + if (!conf->thread) {
10845 + printk(THREAD_ERROR, mdidx(mddev));
10846 + goto out_free_conf;
10850 + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
10851 + const char * name = "raid1syncd";
10853 + conf->resync_thread = md_register_thread(raid1syncd, conf,name);
10854 + if (!conf->resync_thread) {
10855 + printk(THREAD_ERROR, mdidx(mddev));
10856 + goto out_free_conf;
10859 + printk(START_RESYNC, mdidx(mddev));
10860 + conf->resync_mirrors = 1;
10861 + md_wakeup_thread(conf->resync_thread);
10865 * Regenerate the "device is in sync with the raid set" bit for
10868 - for (i = 0; i < sb->nr_disks ; i++) {
10869 - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
10870 + for (i = 0; i < MD_SB_DISKS; i++) {
10871 + mark_disk_nonsync(sb->disks+i);
10872 for (j = 0; j < sb->raid_disks; j++) {
10873 - if (!raid_conf->mirrors[j].operational)
10874 + if (!conf->mirrors[j].operational)
10876 - if (sb->disks[i].number == raid_conf->mirrors[j].number)
10877 - sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
10878 + if (sb->disks[i].number == conf->mirrors[j].number)
10879 + mark_disk_sync(sb->disks+i);
10882 - sb->active_disks = raid_conf->working_disks;
10883 + sb->active_disks = conf->working_disks;
10885 - printk("raid1: raid set %s active with %d out of %d mirrors\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks);
10886 - /* Ok, everything is just fine now */
10888 + if (start_recovery)
10889 + md_recover_arrays();
10892 + printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
10894 + * Ok, everything is just fine now
10900 + mddev->private = NULL;
10902 + MOD_DEC_USE_COUNT;
10906 +#undef INVALID_LEVEL
10909 +#undef NOT_IN_SYNC
10910 +#undef INCONSISTENT
10911 +#undef ALREADY_RUNNING
10912 +#undef OPERATIONAL
10914 +#undef NONE_OPERATIONAL
10915 +#undef RUNNING_CKRAID
10916 +#undef ARRAY_IS_ACTIVE
10918 +static int raid1_stop_resync (mddev_t *mddev)
10920 + raid1_conf_t *conf = mddev_to_conf(mddev);
10922 + if (conf->resync_thread) {
10923 + if (conf->resync_mirrors) {
10924 + conf->resync_mirrors = 2;
10925 + md_interrupt_thread(conf->resync_thread);
10926 + printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
10934 +static int raid1_restart_resync (mddev_t *mddev)
10936 + raid1_conf_t *conf = mddev_to_conf(mddev);
10938 + if (conf->resync_mirrors) {
10939 + if (!conf->resync_thread) {
10943 + conf->resync_mirrors = 1;
10944 + md_wakeup_thread(conf->resync_thread);
10950 -static int raid1_stop (int minor, struct md_dev *mddev)
10951 +static int raid1_stop (mddev_t *mddev)
10953 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
10954 + raid1_conf_t *conf = mddev_to_conf(mddev);
10956 - kfree (raid_conf);
10957 + md_unregister_thread(conf->thread);
10958 + if (conf->resync_thread)
10959 + md_unregister_thread(conf->resync_thread);
10961 mddev->private = NULL;
10966 -static struct md_personality raid1_personality=
10967 +static mdk_personality_t raid1_personality=
10971 @@ -855,15 +1214,13 @@
10972 NULL, /* no ioctls */
10975 - raid1_hot_add_disk,
10976 - /* raid1_hot_remove_drive */ NULL,
10979 + raid1_stop_resync,
10980 + raid1_restart_resync
10983 int raid1_init (void)
10985 - if ((raid1_thread = md_register_thread(raid1d, NULL)) == NULL)
10987 return register_md_personality (RAID1, &raid1_personality);
10990 @@ -875,7 +1232,6 @@
10992 void cleanup_module (void)
10994 - md_unregister_thread (raid1_thread);
10995 unregister_md_personality (RAID1);
10998 --- linux/drivers/block/raid5.c.orig Fri May 8 09:17:13 1998
10999 +++ linux/drivers/block/raid5.c Tue Jan 16 13:42:04 2001
11001 -/*****************************************************************************
11003 * raid5.c : Multiple Devices driver for Linux
11004 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
11006 @@ -14,16 +14,15 @@
11007 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
11011 #include <linux/module.h>
11012 #include <linux/locks.h>
11013 #include <linux/malloc.h>
11014 -#include <linux/md.h>
11015 -#include <linux/raid5.h>
11016 +#include <linux/raid/raid5.h>
11017 #include <asm/bitops.h>
11018 #include <asm/atomic.h>
11019 -#include <asm/md.h>
11021 -static struct md_personality raid5_personality;
11022 +static mdk_personality_t raid5_personality;
11027 #define HASH_PAGES_ORDER 0
11028 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
11029 #define HASH_MASK (NR_HASH - 1)
11030 -#define stripe_hash(raid_conf, sect, size) ((raid_conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
11031 +#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
11034 * The following can be used to debug the driver
11036 #define PRINTK(x) do { ; } while (0)
11039 +static void print_raid5_conf (raid5_conf_t *conf);
11041 static inline int stripe_locked(struct stripe_head *sh)
11043 return test_bit(STRIPE_LOCKED, &sh->state);
11044 @@ -61,32 +62,32 @@
11046 static inline void lock_stripe(struct stripe_head *sh)
11048 - struct raid5_data *raid_conf = sh->raid_conf;
11049 - if (!test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
11050 + raid5_conf_t *conf = sh->raid_conf;
11051 + if (!md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
11052 PRINTK(("locking stripe %lu\n", sh->sector));
11053 - raid_conf->nr_locked_stripes++;
11054 + conf->nr_locked_stripes++;
11058 static inline void unlock_stripe(struct stripe_head *sh)
11060 - struct raid5_data *raid_conf = sh->raid_conf;
11061 - if (test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
11062 + raid5_conf_t *conf = sh->raid_conf;
11063 + if (md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
11064 PRINTK(("unlocking stripe %lu\n", sh->sector));
11065 - raid_conf->nr_locked_stripes--;
11066 + conf->nr_locked_stripes--;
11067 wake_up(&sh->wait);
11071 static inline void finish_stripe(struct stripe_head *sh)
11073 - struct raid5_data *raid_conf = sh->raid_conf;
11074 + raid5_conf_t *conf = sh->raid_conf;
11076 sh->cmd = STRIPE_NONE;
11077 sh->phase = PHASE_COMPLETE;
11078 - raid_conf->nr_pending_stripes--;
11079 - raid_conf->nr_cached_stripes++;
11080 - wake_up(&raid_conf->wait_for_stripe);
11081 + conf->nr_pending_stripes--;
11082 + conf->nr_cached_stripes++;
11083 + wake_up(&conf->wait_for_stripe);
11086 void __wait_on_stripe(struct stripe_head *sh)
11087 @@ -114,7 +115,7 @@
11088 __wait_on_stripe(sh);
11091 -static inline void remove_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
11092 +static inline void remove_hash(raid5_conf_t *conf, struct stripe_head *sh)
11094 PRINTK(("remove_hash(), stripe %lu\n", sh->sector));
11096 @@ -123,21 +124,22 @@
11097 sh->hash_next->hash_pprev = sh->hash_pprev;
11098 *sh->hash_pprev = sh->hash_next;
11099 sh->hash_pprev = NULL;
11100 - raid_conf->nr_hashed_stripes--;
11101 + conf->nr_hashed_stripes--;
11105 -static inline void insert_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
11106 +static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
11108 - struct stripe_head **shp = &stripe_hash(raid_conf, sh->sector, sh->size);
11109 + struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size);
11111 - PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", sh->sector, raid_conf->nr_hashed_stripes));
11112 + PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n",
11113 + sh->sector, conf->nr_hashed_stripes));
11115 if ((sh->hash_next = *shp) != NULL)
11116 (*shp)->hash_pprev = &sh->hash_next;
11118 sh->hash_pprev = shp;
11119 - raid_conf->nr_hashed_stripes++;
11120 + conf->nr_hashed_stripes++;
11123 static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size)
11124 @@ -145,13 +147,15 @@
11125 struct buffer_head *bh;
11126 unsigned long flags;
11128 - save_flags(flags);
11130 - if ((bh = sh->buffer_pool) == NULL)
11132 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
11133 + bh = sh->buffer_pool;
11136 sh->buffer_pool = bh->b_next;
11137 bh->b_size = b_size;
11138 - restore_flags(flags);
11140 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11145 @@ -160,12 +164,14 @@
11146 struct buffer_head *bh;
11147 unsigned long flags;
11149 - save_flags(flags);
11151 - if ((bh = sh->bh_pool) == NULL)
11153 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
11154 + bh = sh->bh_pool;
11157 sh->bh_pool = bh->b_next;
11158 - restore_flags(flags);
11160 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11165 @@ -173,54 +179,52 @@
11167 unsigned long flags;
11169 - save_flags(flags);
11171 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
11172 bh->b_next = sh->buffer_pool;
11173 sh->buffer_pool = bh;
11174 - restore_flags(flags);
11175 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11178 static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh)
11180 unsigned long flags;
11182 - save_flags(flags);
11184 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
11185 bh->b_next = sh->bh_pool;
11187 - restore_flags(flags);
11188 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11191 -static struct stripe_head *get_free_stripe(struct raid5_data *raid_conf)
11192 +static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
11194 struct stripe_head *sh;
11195 unsigned long flags;
11199 - if ((sh = raid_conf->free_sh_list) == NULL) {
11200 + if ((sh = conf->free_sh_list) == NULL) {
11201 restore_flags(flags);
11204 - raid_conf->free_sh_list = sh->free_next;
11205 - raid_conf->nr_free_sh--;
11206 - if (!raid_conf->nr_free_sh && raid_conf->free_sh_list)
11207 + conf->free_sh_list = sh->free_next;
11208 + conf->nr_free_sh--;
11209 + if (!conf->nr_free_sh && conf->free_sh_list)
11210 printk ("raid5: bug: free_sh_list != NULL, nr_free_sh == 0\n");
11211 restore_flags(flags);
11212 - if (sh->hash_pprev || sh->nr_pending || sh->count)
11213 + if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || sh->count)
11214 printk("get_free_stripe(): bug\n");
11218 -static void put_free_stripe(struct raid5_data *raid_conf, struct stripe_head *sh)
11219 +static void put_free_stripe(raid5_conf_t *conf, struct stripe_head *sh)
11221 unsigned long flags;
11225 - sh->free_next = raid_conf->free_sh_list;
11226 - raid_conf->free_sh_list = sh;
11227 - raid_conf->nr_free_sh++;
11228 + sh->free_next = conf->free_sh_list;
11229 + conf->free_sh_list = sh;
11230 + conf->nr_free_sh++;
11231 restore_flags(flags);
11234 @@ -324,8 +328,8 @@
11236 static void kfree_stripe(struct stripe_head *sh)
11238 - struct raid5_data *raid_conf = sh->raid_conf;
11239 - int disks = raid_conf->raid_disks, j;
11240 + raid5_conf_t *conf = sh->raid_conf;
11241 + int disks = conf->raid_disks, j;
11243 PRINTK(("kfree_stripe called, stripe %lu\n", sh->sector));
11244 if (sh->phase != PHASE_COMPLETE || stripe_locked(sh) || sh->count) {
11245 @@ -338,19 +342,19 @@
11246 if (sh->bh_new[j] || sh->bh_copy[j])
11247 printk("raid5: bug: sector %lu, new %p, copy %p\n", sh->sector, sh->bh_new[j], sh->bh_copy[j]);
11249 - remove_hash(raid_conf, sh);
11250 - put_free_stripe(raid_conf, sh);
11251 + remove_hash(conf, sh);
11252 + put_free_stripe(conf, sh);
11255 -static int shrink_stripe_cache(struct raid5_data *raid_conf, int nr)
11256 +static int shrink_stripe_cache(raid5_conf_t *conf, int nr)
11258 struct stripe_head *sh;
11261 - PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, raid_conf->nr_hashed_stripes, raid_conf->clock));
11262 + PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, conf->nr_hashed_stripes, conf->clock));
11263 for (i = 0; i < NR_HASH; i++) {
11265 - sh = raid_conf->stripe_hashtbl[(i + raid_conf->clock) & HASH_MASK];
11266 + sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK];
11267 for (; sh; sh = sh->hash_next) {
11268 if (sh->phase != PHASE_COMPLETE)
11270 @@ -360,30 +364,30 @@
11273 if (++count == nr) {
11274 - PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
11275 - raid_conf->clock = (i + raid_conf->clock) & HASH_MASK;
11276 + PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
11277 + conf->clock = (i + conf->clock) & HASH_MASK;
11283 - PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
11284 + PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
11288 -static struct stripe_head *find_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
11289 +static struct stripe_head *find_stripe(raid5_conf_t *conf, unsigned long sector, int size)
11291 struct stripe_head *sh;
11293 - if (raid_conf->buffer_size != size) {
11294 - PRINTK(("switching size, %d --> %d\n", raid_conf->buffer_size, size));
11295 - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
11296 - raid_conf->buffer_size = size;
11297 + if (conf->buffer_size != size) {
11298 + PRINTK(("switching size, %d --> %d\n", conf->buffer_size, size));
11299 + shrink_stripe_cache(conf, conf->max_nr_stripes);
11300 + conf->buffer_size = size;
11303 PRINTK(("find_stripe, sector %lu\n", sector));
11304 - for (sh = stripe_hash(raid_conf, sector, size); sh; sh = sh->hash_next)
11305 - if (sh->sector == sector && sh->raid_conf == raid_conf) {
11306 + for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next)
11307 + if (sh->sector == sector && sh->raid_conf == conf) {
11308 if (sh->size == size) {
11309 PRINTK(("found stripe %lu\n", sector));
11311 @@ -397,7 +401,7 @@
11315 -static int grow_stripes(struct raid5_data *raid_conf, int num, int priority)
11316 +static int grow_stripes(raid5_conf_t *conf, int num, int priority)
11318 struct stripe_head *sh;
11320 @@ -405,62 +409,64 @@
11321 if ((sh = kmalloc(sizeof(struct stripe_head), priority)) == NULL)
11323 memset(sh, 0, sizeof(*sh));
11324 - if (grow_buffers(sh, 2 * raid_conf->raid_disks, PAGE_SIZE, priority)) {
11325 - shrink_buffers(sh, 2 * raid_conf->raid_disks);
11326 + sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED;
11328 + if (grow_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) {
11329 + shrink_buffers(sh, 2 * conf->raid_disks);
11333 - if (grow_bh(sh, raid_conf->raid_disks, priority)) {
11334 - shrink_buffers(sh, 2 * raid_conf->raid_disks);
11335 - shrink_bh(sh, raid_conf->raid_disks);
11336 + if (grow_bh(sh, conf->raid_disks, priority)) {
11337 + shrink_buffers(sh, 2 * conf->raid_disks);
11338 + shrink_bh(sh, conf->raid_disks);
11342 - put_free_stripe(raid_conf, sh);
11343 - raid_conf->nr_stripes++;
11344 + put_free_stripe(conf, sh);
11345 + conf->nr_stripes++;
11350 -static void shrink_stripes(struct raid5_data *raid_conf, int num)
11351 +static void shrink_stripes(raid5_conf_t *conf, int num)
11353 struct stripe_head *sh;
11356 - sh = get_free_stripe(raid_conf);
11357 + sh = get_free_stripe(conf);
11360 - shrink_buffers(sh, raid_conf->raid_disks * 2);
11361 - shrink_bh(sh, raid_conf->raid_disks);
11362 + shrink_buffers(sh, conf->raid_disks * 2);
11363 + shrink_bh(sh, conf->raid_disks);
11365 - raid_conf->nr_stripes--;
11366 + conf->nr_stripes--;
11370 -static struct stripe_head *kmalloc_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
11371 +static struct stripe_head *kmalloc_stripe(raid5_conf_t *conf, unsigned long sector, int size)
11373 struct stripe_head *sh = NULL, *tmp;
11374 struct buffer_head *buffer_pool, *bh_pool;
11376 PRINTK(("kmalloc_stripe called\n"));
11378 - while ((sh = get_free_stripe(raid_conf)) == NULL) {
11379 - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes / 8);
11380 - if ((sh = get_free_stripe(raid_conf)) != NULL)
11381 + while ((sh = get_free_stripe(conf)) == NULL) {
11382 + shrink_stripe_cache(conf, conf->max_nr_stripes / 8);
11383 + if ((sh = get_free_stripe(conf)) != NULL)
11385 - if (!raid_conf->nr_pending_stripes)
11386 + if (!conf->nr_pending_stripes)
11387 printk("raid5: bug: nr_free_sh == 0, nr_pending_stripes == 0\n");
11388 - md_wakeup_thread(raid_conf->thread);
11389 + md_wakeup_thread(conf->thread);
11390 PRINTK(("waiting for some stripes to complete\n"));
11391 - sleep_on(&raid_conf->wait_for_stripe);
11392 + sleep_on(&conf->wait_for_stripe);
11396 * The above might have slept, so perhaps another process
11397 * already created the stripe for us..
11399 - if ((tmp = find_stripe(raid_conf, sector, size)) != NULL) {
11400 - put_free_stripe(raid_conf, sh);
11401 + if ((tmp = find_stripe(conf, sector, size)) != NULL) {
11402 + put_free_stripe(conf, sh);
11403 wait_on_stripe(tmp);
11406 @@ -472,25 +478,25 @@
11407 sh->bh_pool = bh_pool;
11408 sh->phase = PHASE_COMPLETE;
11409 sh->cmd = STRIPE_NONE;
11410 - sh->raid_conf = raid_conf;
11411 + sh->raid_conf = conf;
11412 sh->sector = sector;
11414 - raid_conf->nr_cached_stripes++;
11415 - insert_hash(raid_conf, sh);
11416 + conf->nr_cached_stripes++;
11417 + insert_hash(conf, sh);
11418 } else printk("raid5: bug: kmalloc_stripe() == NULL\n");
11422 -static struct stripe_head *get_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
11423 +static struct stripe_head *get_stripe(raid5_conf_t *conf, unsigned long sector, int size)
11425 struct stripe_head *sh;
11427 PRINTK(("get_stripe, sector %lu\n", sector));
11428 - sh = find_stripe(raid_conf, sector, size);
11429 + sh = find_stripe(conf, sector, size);
11431 wait_on_stripe(sh);
11433 - sh = kmalloc_stripe(raid_conf, sector, size);
11434 + sh = kmalloc_stripe(conf, sector, size);
11438 @@ -523,7 +529,7 @@
11439 bh->b_end_io(bh, uptodate);
11441 printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for "
11442 - "block %lu\n", kdevname(bh->b_dev), bh->b_blocknr);
11443 + "block %lu\n", partition_name(bh->b_dev), bh->b_blocknr);
11446 static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate)
11447 @@ -537,36 +543,35 @@
11448 static void raid5_end_request (struct buffer_head * bh, int uptodate)
11450 struct stripe_head *sh = bh->b_dev_id;
11451 - struct raid5_data *raid_conf = sh->raid_conf;
11452 - int disks = raid_conf->raid_disks, i;
11453 + raid5_conf_t *conf = sh->raid_conf;
11454 + int disks = conf->raid_disks, i;
11455 unsigned long flags;
11457 PRINTK(("end_request %lu, nr_pending %d\n", sh->sector, sh->nr_pending));
11458 - save_flags(flags);
11460 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
11461 raid5_mark_buffer_uptodate(bh, uptodate);
11462 - --sh->nr_pending;
11463 - if (!sh->nr_pending) {
11464 - md_wakeup_thread(raid_conf->thread);
11465 - atomic_inc(&raid_conf->nr_handle);
11466 + if (atomic_dec_and_test(&sh->nr_pending)) {
11467 + md_wakeup_thread(conf->thread);
11468 + atomic_inc(&conf->nr_handle);
11472 md_error(bh->b_dev, bh->b_rdev);
11473 - if (raid_conf->failed_disks) {
11475 + if (conf->failed_disks) {
11476 for (i = 0; i < disks; i++) {
11477 - if (raid_conf->disks[i].operational)
11478 + if (conf->disks[i].operational)
11480 if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i])
11482 - if (bh->b_rdev != raid_conf->disks[i].dev)
11483 + if (bh->b_rdev != conf->disks[i].dev)
11485 set_bit(STRIPE_ERROR, &sh->state);
11488 - restore_flags(flags);
11489 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11492 -static int raid5_map (struct md_dev *mddev, kdev_t *rdev,
11493 +static int raid5_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
11494 unsigned long *rsector, unsigned long size)
11496 /* No complex mapping used: the core of the work is done in the
11497 @@ -577,11 +582,10 @@
11499 static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i)
11501 - struct raid5_data *raid_conf = sh->raid_conf;
11502 - struct md_dev *mddev = raid_conf->mddev;
11503 - int minor = (int) (mddev - md_dev);
11504 + raid5_conf_t *conf = sh->raid_conf;
11505 + mddev_t *mddev = conf->mddev;
11507 - kdev_t dev = MKDEV(MD_MAJOR, minor);
11508 + kdev_t dev = mddev_to_kdev(mddev);
11509 int block = sh->sector / (sh->size >> 9);
11511 b_data = ((volatile struct buffer_head *) bh)->b_data;
11512 @@ -589,7 +593,7 @@
11513 init_buffer(bh, dev, block, raid5_end_request, sh);
11514 ((volatile struct buffer_head *) bh)->b_data = b_data;
11516 - bh->b_rdev = raid_conf->disks[i].dev;
11517 + bh->b_rdev = conf->disks[i].dev;
11518 bh->b_rsector = sh->sector;
11520 bh->b_state = (1 << BH_Req);
11521 @@ -597,33 +601,62 @@
11522 bh->b_list = BUF_LOCKED;
11525 -static int raid5_error (struct md_dev *mddev, kdev_t dev)
11526 +static int raid5_error (mddev_t *mddev, kdev_t dev)
11528 - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
11529 - md_superblock_t *sb = mddev->sb;
11530 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
11531 + mdp_super_t *sb = mddev->sb;
11532 struct disk_info *disk;
11535 PRINTK(("raid5_error called\n"));
11536 - raid_conf->resync_parity = 0;
11537 - for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
11538 + conf->resync_parity = 0;
11539 + for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
11540 if (disk->dev == dev && disk->operational) {
11541 disk->operational = 0;
11542 - sb->disks[disk->number].state |= (1 << MD_FAULTY_DEVICE);
11543 - sb->disks[disk->number].state &= ~(1 << MD_SYNC_DEVICE);
11544 - sb->disks[disk->number].state &= ~(1 << MD_ACTIVE_DEVICE);
11545 + mark_disk_faulty(sb->disks+disk->number);
11546 + mark_disk_nonsync(sb->disks+disk->number);
11547 + mark_disk_inactive(sb->disks+disk->number);
11548 sb->active_disks--;
11549 sb->working_disks--;
11550 sb->failed_disks++;
11551 mddev->sb_dirty = 1;
11552 - raid_conf->working_disks--;
11553 - raid_conf->failed_disks++;
11554 - md_wakeup_thread(raid_conf->thread);
11555 + conf->working_disks--;
11556 + conf->failed_disks++;
11557 + md_wakeup_thread(conf->thread);
11559 - "RAID5: Disk failure on %s, disabling device."
11560 - "Operation continuing on %d devices\n",
11561 - kdevname (dev), raid_conf->working_disks);
11562 + "raid5: Disk failure on %s, disabling device."
11563 + " Operation continuing on %d devices\n",
11564 + partition_name (dev), conf->working_disks);
11569 + * handle errors in spares (during reconstruction)
11571 + if (conf->spare) {
11572 + disk = conf->spare;
11573 + if (disk->dev == dev) {
11574 + printk (KERN_ALERT
11575 + "raid5: Disk failure on spare %s\n",
11576 + partition_name (dev));
11577 + if (!conf->spare->operational) {
11581 + disk->operational = 0;
11582 + disk->write_only = 0;
11583 + conf->spare = NULL;
11584 + mark_disk_faulty(sb->disks+disk->number);
11585 + mark_disk_nonsync(sb->disks+disk->number);
11586 + mark_disk_inactive(sb->disks+disk->number);
11587 + sb->spare_disks--;
11588 + sb->working_disks--;
11589 + sb->failed_disks++;
11598 @@ -634,12 +667,12 @@
11599 static inline unsigned long
11600 raid5_compute_sector (int r_sector, unsigned int raid_disks, unsigned int data_disks,
11601 unsigned int * dd_idx, unsigned int * pd_idx,
11602 - struct raid5_data *raid_conf)
11603 + raid5_conf_t *conf)
11605 unsigned int stripe;
11606 int chunk_number, chunk_offset;
11607 unsigned long new_sector;
11608 - int sectors_per_chunk = raid_conf->chunk_size >> 9;
11609 + int sectors_per_chunk = conf->chunk_size >> 9;
11611 /* First compute the information on this sector */
11613 @@ -662,9 +695,9 @@
11615 * Select the parity disk based on the user selected algorithm.
11617 - if (raid_conf->level == 4)
11618 + if (conf->level == 4)
11619 *pd_idx = data_disks;
11620 - else switch (raid_conf->algorithm) {
11621 + else switch (conf->algorithm) {
11622 case ALGORITHM_LEFT_ASYMMETRIC:
11623 *pd_idx = data_disks - stripe % raid_disks;
11624 if (*dd_idx >= *pd_idx)
11625 @@ -684,7 +717,7 @@
11626 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
11629 - printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
11630 + printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
11634 @@ -705,16 +738,16 @@
11636 static unsigned long compute_blocknr(struct stripe_head *sh, int i)
11638 - struct raid5_data *raid_conf = sh->raid_conf;
11639 - int raid_disks = raid_conf->raid_disks, data_disks = raid_disks - 1;
11640 + raid5_conf_t *conf = sh->raid_conf;
11641 + int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
11642 unsigned long new_sector = sh->sector, check;
11643 - int sectors_per_chunk = raid_conf->chunk_size >> 9;
11644 + int sectors_per_chunk = conf->chunk_size >> 9;
11645 unsigned long stripe = new_sector / sectors_per_chunk;
11646 int chunk_offset = new_sector % sectors_per_chunk;
11647 int chunk_number, dummy1, dummy2, dd_idx = i;
11648 unsigned long r_sector, blocknr;
11650 - switch (raid_conf->algorithm) {
11651 + switch (conf->algorithm) {
11652 case ALGORITHM_LEFT_ASYMMETRIC:
11653 case ALGORITHM_RIGHT_ASYMMETRIC:
11654 if (i > sh->pd_idx)
11655 @@ -727,14 +760,14 @@
11656 i -= (sh->pd_idx + 1);
11659 - printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
11660 + printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
11663 chunk_number = stripe * data_disks + i;
11664 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
11665 blocknr = r_sector / (sh->size >> 9);
11667 - check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, raid_conf);
11668 + check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
11669 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
11670 printk("compute_blocknr: map not correct\n");
11672 @@ -742,36 +775,11 @@
11676 -#ifdef HAVE_ARCH_XORBLOCK
11677 -static void xor_block(struct buffer_head *dest, struct buffer_head *source)
11679 - __xor_block((char *) dest->b_data, (char *) source->b_data, dest->b_size);
11682 -static void xor_block(struct buffer_head *dest, struct buffer_head *source)
11684 - long lines = dest->b_size / (sizeof (long)) / 8, i;
11685 - long *destp = (long *) dest->b_data, *sourcep = (long *) source->b_data;
11687 - for (i = lines; i > 0; i--) {
11688 - *(destp + 0) ^= *(sourcep + 0);
11689 - *(destp + 1) ^= *(sourcep + 1);
11690 - *(destp + 2) ^= *(sourcep + 2);
11691 - *(destp + 3) ^= *(sourcep + 3);
11692 - *(destp + 4) ^= *(sourcep + 4);
11693 - *(destp + 5) ^= *(sourcep + 5);
11694 - *(destp + 6) ^= *(sourcep + 6);
11695 - *(destp + 7) ^= *(sourcep + 7);
11702 static void compute_block(struct stripe_head *sh, int dd_idx)
11704 - struct raid5_data *raid_conf = sh->raid_conf;
11705 - int i, disks = raid_conf->raid_disks;
11706 + raid5_conf_t *conf = sh->raid_conf;
11707 + int i, count, disks = conf->raid_disks;
11708 + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
11710 PRINTK(("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx));
11712 @@ -780,69 +788,100 @@
11713 raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx);
11715 memset(sh->bh_old[dd_idx]->b_data, 0, sh->size);
11716 + bh_ptr[0] = sh->bh_old[dd_idx];
11718 for (i = 0; i < disks; i++) {
11721 if (sh->bh_old[i]) {
11722 - xor_block(sh->bh_old[dd_idx], sh->bh_old[i]);
11725 + bh_ptr[count++] = sh->bh_old[i];
11727 printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
11729 + if (count == MAX_XOR_BLOCKS) {
11730 + xor_block(count, &bh_ptr[0]);
11735 + xor_block(count, &bh_ptr[0]);
11737 raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1);
11740 static void compute_parity(struct stripe_head *sh, int method)
11742 - struct raid5_data *raid_conf = sh->raid_conf;
11743 - int i, pd_idx = sh->pd_idx, disks = raid_conf->raid_disks;
11744 + raid5_conf_t *conf = sh->raid_conf;
11745 + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, lowprio, count;
11746 + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
11748 PRINTK(("compute_parity, stripe %lu, method %d\n", sh->sector, method));
11750 for (i = 0; i < disks; i++) {
11751 if (i == pd_idx || !sh->bh_new[i])
11753 if (!sh->bh_copy[i])
11754 sh->bh_copy[i] = raid5_kmalloc_buffer(sh, sh->size);
11755 raid5_build_block(sh, sh->bh_copy[i], i);
11756 + if (!buffer_lowprio(sh->bh_new[i]))
11759 + mark_buffer_lowprio(sh->bh_copy[i]);
11760 mark_buffer_clean(sh->bh_new[i]);
11761 memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size);
11763 if (sh->bh_copy[pd_idx] == NULL)
11764 sh->bh_copy[pd_idx] = raid5_kmalloc_buffer(sh, sh->size);
11765 raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx);
11767 + mark_buffer_lowprio(sh->bh_copy[pd_idx]);
11769 if (method == RECONSTRUCT_WRITE) {
11770 memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size);
11771 + bh_ptr[0] = sh->bh_copy[pd_idx];
11773 for (i = 0; i < disks; i++) {
11774 if (i == sh->pd_idx)
11776 if (sh->bh_new[i]) {
11777 - xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
11779 + bh_ptr[count++] = sh->bh_copy[i];
11780 + } else if (sh->bh_old[i]) {
11781 + bh_ptr[count++] = sh->bh_old[i];
11783 - if (sh->bh_old[i]) {
11784 - xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
11786 + if (count == MAX_XOR_BLOCKS) {
11787 + xor_block(count, &bh_ptr[0]);
11791 + if (count != 1) {
11792 + xor_block(count, &bh_ptr[0]);
11794 } else if (method == READ_MODIFY_WRITE) {
11795 memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size);
11796 + bh_ptr[0] = sh->bh_copy[pd_idx];
11798 for (i = 0; i < disks; i++) {
11799 if (i == sh->pd_idx)
11801 if (sh->bh_new[i] && sh->bh_old[i]) {
11802 - xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
11803 - xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
11805 + bh_ptr[count++] = sh->bh_copy[i];
11806 + bh_ptr[count++] = sh->bh_old[i];
11808 + if (count >= (MAX_XOR_BLOCKS - 1)) {
11809 + xor_block(count, &bh_ptr[0]);
11813 + if (count != 1) {
11814 + xor_block(count, &bh_ptr[0]);
11817 raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1);
11820 static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
11822 - struct raid5_data *raid_conf = sh->raid_conf;
11823 + raid5_conf_t *conf = sh->raid_conf;
11824 struct buffer_head *bh_req;
11826 if (sh->bh_new[dd_idx]) {
11827 @@ -860,19 +899,22 @@
11828 if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) {
11829 sh->phase = PHASE_BEGIN;
11830 sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE;
11831 - raid_conf->nr_pending_stripes++;
11832 - atomic_inc(&raid_conf->nr_handle);
11833 + conf->nr_pending_stripes++;
11834 + atomic_inc(&conf->nr_handle);
11836 sh->bh_new[dd_idx] = bh;
11837 sh->bh_req[dd_idx] = bh_req;
11838 sh->cmd_new[dd_idx] = rw;
11839 sh->new[dd_idx] = 1;
11841 + if (buffer_lowprio(bh))
11842 + mark_buffer_lowprio(bh_req);
11845 static void complete_stripe(struct stripe_head *sh)
11847 - struct raid5_data *raid_conf = sh->raid_conf;
11848 - int disks = raid_conf->raid_disks;
11849 + raid5_conf_t *conf = sh->raid_conf;
11850 + int disks = conf->raid_disks;
11853 PRINTK(("complete_stripe %lu\n", sh->sector));
11854 @@ -909,6 +951,22 @@
11859 +static int is_stripe_lowprio(struct stripe_head *sh, int disks)
11861 + int i, lowprio = 1;
11863 + for (i = 0; i < disks; i++) {
11864 + if (sh->bh_new[i])
11865 + if (!buffer_lowprio(sh->bh_new[i]))
11867 + if (sh->bh_old[i])
11868 + if (!buffer_lowprio(sh->bh_old[i]))
11875 * handle_stripe() is our main logic routine. Note that:
11877 @@ -919,28 +977,27 @@
11878 * 2. We should be careful to set sh->nr_pending whenever we sleep,
11879 * to prevent re-entry of handle_stripe() for the same sh.
11881 - * 3. raid_conf->failed_disks and disk->operational can be changed
11882 + * 3. conf->failed_disks and disk->operational can be changed
11883 * from an interrupt. This complicates things a bit, but it allows
11884 * us to stop issuing requests for a failed drive as soon as possible.
11886 static void handle_stripe(struct stripe_head *sh)
11888 - struct raid5_data *raid_conf = sh->raid_conf;
11889 - struct md_dev *mddev = raid_conf->mddev;
11890 - int minor = (int) (mddev - md_dev);
11891 + raid5_conf_t *conf = sh->raid_conf;
11892 + mddev_t *mddev = conf->mddev;
11893 struct buffer_head *bh;
11894 - int disks = raid_conf->raid_disks;
11895 - int i, nr = 0, nr_read = 0, nr_write = 0;
11896 + int disks = conf->raid_disks;
11897 + int i, nr = 0, nr_read = 0, nr_write = 0, lowprio;
11898 int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0, parity = 0;
11899 int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
11900 int reading = 0, nr_writing = 0;
11901 int method1 = INT_MAX, method2 = INT_MAX;
11903 unsigned long flags;
11904 - int operational[MD_SB_DISKS], failed_disks = raid_conf->failed_disks;
11905 + int operational[MD_SB_DISKS], failed_disks = conf->failed_disks;
11907 PRINTK(("handle_stripe(), stripe %lu\n", sh->sector));
11908 - if (sh->nr_pending) {
11909 + if (md_atomic_read(&sh->nr_pending)) {
11910 printk("handle_stripe(), stripe %lu, io still pending\n", sh->sector);
11913 @@ -949,9 +1006,9 @@
11917 - atomic_dec(&raid_conf->nr_handle);
11918 + atomic_dec(&conf->nr_handle);
11920 - if (test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
11921 + if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
11922 printk("raid5: restarting stripe %lu\n", sh->sector);
11923 sh->phase = PHASE_BEGIN;
11925 @@ -969,11 +1026,11 @@
11928 for (i = 0; i < disks; i++) {
11929 - operational[i] = raid_conf->disks[i].operational;
11930 - if (i == sh->pd_idx && raid_conf->resync_parity)
11931 + operational[i] = conf->disks[i].operational;
11932 + if (i == sh->pd_idx && conf->resync_parity)
11933 operational[i] = 0;
11935 - failed_disks = raid_conf->failed_disks;
11936 + failed_disks = conf->failed_disks;
11937 restore_flags(flags);
11939 if (failed_disks > 1) {
11940 @@ -1017,7 +1074,7 @@
11943 if (nr_write && nr_read)
11944 - printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
11945 + printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
11949 @@ -1030,7 +1087,7 @@
11952 block = (int) compute_blocknr(sh, i);
11953 - bh = find_buffer(MKDEV(MD_MAJOR, minor), block, sh->size);
11954 + bh = find_buffer(mddev_to_kdev(mddev), block, sh->size);
11955 if (bh && bh->b_count == 0 && buffer_dirty(bh) && !buffer_locked(bh)) {
11956 PRINTK(("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block));
11957 add_stripe_bh(sh, bh, i, WRITE);
11958 @@ -1064,21 +1121,22 @@
11960 if (!method1 || !method2) {
11962 - sh->nr_pending++;
11963 + lowprio = is_stripe_lowprio(sh, disks);
11964 + atomic_inc(&sh->nr_pending);
11965 sh->phase = PHASE_WRITE;
11966 compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
11967 for (i = 0; i < disks; i++) {
11968 - if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
11969 + if (!operational[i] && !conf->spare && !conf->resync_parity)
11971 if (i == sh->pd_idx || sh->bh_new[i])
11975 - sh->nr_pending = nr_writing;
11976 - PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, sh->nr_pending));
11977 + md_atomic_set(&sh->nr_pending, nr_writing);
11978 + PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
11980 for (i = 0; i < disks; i++) {
11981 - if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
11982 + if (!operational[i] && !conf->spare && !conf->resync_parity)
11984 bh = sh->bh_copy[i];
11985 if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL)))
11986 @@ -1089,18 +1147,30 @@
11987 bh->b_state |= (1<<BH_Dirty);
11988 PRINTK(("making request for buffer %d\n", i));
11989 clear_bit(BH_Lock, &bh->b_state);
11990 - if (!operational[i] && !raid_conf->resync_parity) {
11991 - bh->b_rdev = raid_conf->spare->dev;
11992 - make_request(MAJOR(raid_conf->spare->dev), WRITE, bh);
11994 - make_request(MAJOR(raid_conf->disks[i].dev), WRITE, bh);
11995 + if (!operational[i] && !conf->resync_parity) {
11996 + bh->b_rdev = conf->spare->dev;
11997 + make_request(MAJOR(conf->spare->dev), WRITE, bh);
12000 + make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
12002 + if (!lowprio || (i==sh->pd_idx))
12003 + make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
12005 + mark_buffer_clean(bh);
12006 + raid5_end_request(bh,1);
12017 - sh->nr_pending++;
12018 + lowprio = is_stripe_lowprio(sh, disks);
12019 + atomic_inc(&sh->nr_pending);
12020 if (method1 < method2) {
12021 sh->write_method = RECONSTRUCT_WRITE;
12022 for (i = 0; i < disks; i++) {
12023 @@ -1110,6 +1180,8 @@
12025 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
12026 raid5_build_block(sh, sh->bh_old[i], i);
12028 + mark_buffer_lowprio(sh->bh_old[i]);
12032 @@ -1121,19 +1193,21 @@
12034 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
12035 raid5_build_block(sh, sh->bh_old[i], i);
12037 + mark_buffer_lowprio(sh->bh_old[i]);
12041 sh->phase = PHASE_READ_OLD;
12042 - sh->nr_pending = reading;
12043 - PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, sh->nr_pending));
12044 + md_atomic_set(&sh->nr_pending, reading);
12045 + PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)));
12046 for (i = 0; i < disks; i++) {
12047 if (!sh->bh_old[i])
12049 if (buffer_uptodate(sh->bh_old[i]))
12051 clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
12052 - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
12053 + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
12057 @@ -1141,7 +1215,8 @@
12059 method1 = nr_read - nr_cache_overwrite;
12061 - sh->nr_pending++;
12062 + lowprio = is_stripe_lowprio(sh,disks);
12063 + atomic_inc(&sh->nr_pending);
12065 PRINTK(("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1));
12066 if (!method1 || (method1 == 1 && nr_cache == disks - 1)) {
12067 @@ -1149,18 +1224,22 @@
12068 for (i = 0; i < disks; i++) {
12069 if (!sh->bh_new[i])
12071 - if (!sh->bh_old[i])
12072 + if (!sh->bh_old[i]) {
12073 compute_block(sh, i);
12075 + mark_buffer_lowprio
12078 memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
12080 - sh->nr_pending--;
12081 + atomic_dec(&sh->nr_pending);
12082 complete_stripe(sh);
12085 if (nr_failed_overwrite) {
12086 sh->phase = PHASE_READ_OLD;
12087 - sh->nr_pending = (disks - 1) - nr_cache;
12088 - PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, sh->nr_pending));
12089 + md_atomic_set(&sh->nr_pending, (disks - 1) - nr_cache);
12090 + PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
12091 for (i = 0; i < disks; i++) {
12094 @@ -1168,13 +1247,16 @@
12096 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
12097 raid5_build_block(sh, sh->bh_old[i], i);
12099 + mark_buffer_lowprio(sh->bh_old[i]);
12100 clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
12101 - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
12102 + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
12105 sh->phase = PHASE_READ;
12106 - sh->nr_pending = nr_read - nr_cache_overwrite;
12107 - PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, sh->nr_pending));
12108 + md_atomic_set(&sh->nr_pending,
12109 + nr_read - nr_cache_overwrite);
12110 + PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
12111 for (i = 0; i < disks; i++) {
12112 if (!sh->bh_new[i])
12114 @@ -1182,16 +1264,16 @@
12115 memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
12118 - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_req[i]);
12119 + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_req[i]);
12125 -static int raid5_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
12126 +static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
12128 - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
12129 - const unsigned int raid_disks = raid_conf->raid_disks;
12130 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
12131 + const unsigned int raid_disks = conf->raid_disks;
12132 const unsigned int data_disks = raid_disks - 1;
12133 unsigned int dd_idx, pd_idx;
12134 unsigned long new_sector;
12135 @@ -1202,15 +1284,15 @@
12136 if (rw == WRITEA) rw = WRITE;
12138 new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks,
12139 - &dd_idx, &pd_idx, raid_conf);
12140 + &dd_idx, &pd_idx, conf);
12142 PRINTK(("raid5_make_request, sector %lu\n", new_sector));
12144 - sh = get_stripe(raid_conf, new_sector, bh->b_size);
12145 + sh = get_stripe(conf, new_sector, bh->b_size);
12146 if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) {
12147 PRINTK(("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd));
12149 - if (!sh->nr_pending)
12150 + if (!md_atomic_read(&sh->nr_pending))
12154 @@ -1221,24 +1303,24 @@
12155 printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector);
12156 printk("raid5: bh %p, bh_new %p\n", bh, sh->bh_new[dd_idx]);
12158 - md_wakeup_thread(raid_conf->thread);
12159 + md_wakeup_thread(conf->thread);
12160 wait_on_stripe(sh);
12163 add_stripe_bh(sh, bh, dd_idx, rw);
12165 - md_wakeup_thread(raid_conf->thread);
12166 + md_wakeup_thread(conf->thread);
12170 static void unplug_devices(struct stripe_head *sh)
12173 - struct raid5_data *raid_conf = sh->raid_conf;
12174 + raid5_conf_t *conf = sh->raid_conf;
12177 - for (i = 0; i < raid_conf->raid_disks; i++)
12178 - unplug_device(blk_dev + MAJOR(raid_conf->disks[i].dev));
12179 + for (i = 0; i < conf->raid_disks; i++)
12180 + unplug_device(blk_dev + MAJOR(conf->disks[i].dev));
12184 @@ -1252,8 +1334,8 @@
12185 static void raid5d (void *data)
12187 struct stripe_head *sh;
12188 - struct raid5_data *raid_conf = data;
12189 - struct md_dev *mddev = raid_conf->mddev;
12190 + raid5_conf_t *conf = data;
12191 + mddev_t *mddev = conf->mddev;
12192 int i, handled = 0, unplug = 0;
12193 unsigned long flags;
12195 @@ -1261,47 +1343,47 @@
12197 if (mddev->sb_dirty) {
12198 mddev->sb_dirty = 0;
12199 - md_update_sb((int) (mddev - md_dev));
12200 + md_update_sb(mddev);
12202 for (i = 0; i < NR_HASH; i++) {
12204 - sh = raid_conf->stripe_hashtbl[i];
12205 + sh = conf->stripe_hashtbl[i];
12206 for (; sh; sh = sh->hash_next) {
12207 - if (sh->raid_conf != raid_conf)
12208 + if (sh->raid_conf != conf)
12210 if (sh->phase == PHASE_COMPLETE)
12212 - if (sh->nr_pending)
12213 + if (md_atomic_read(&sh->nr_pending))
12215 - if (sh->sector == raid_conf->next_sector) {
12216 - raid_conf->sector_count += (sh->size >> 9);
12217 - if (raid_conf->sector_count >= 128)
12218 + if (sh->sector == conf->next_sector) {
12219 + conf->sector_count += (sh->size >> 9);
12220 + if (conf->sector_count >= 128)
12225 - PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, raid_conf->sector_count));
12226 + PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, conf->sector_count));
12227 unplug_devices(sh);
12229 - raid_conf->sector_count = 0;
12230 + conf->sector_count = 0;
12232 - raid_conf->next_sector = sh->sector + (sh->size >> 9);
12233 + conf->next_sector = sh->sector + (sh->size >> 9);
12240 - PRINTK(("%d stripes handled, nr_handle %d\n", handled, atomic_read(&raid_conf->nr_handle)));
12242 + PRINTK(("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle)));
12245 - if (!atomic_read(&raid_conf->nr_handle))
12246 - clear_bit(THREAD_WAKEUP, &raid_conf->thread->flags);
12247 + if (!md_atomic_read(&conf->nr_handle))
12248 + clear_bit(THREAD_WAKEUP, &conf->thread->flags);
12249 + restore_flags(flags);
12251 PRINTK(("--- raid5d inactive\n"));
12254 -#if SUPPORT_RECONSTRUCTION
12256 * Private kernel thread for parity reconstruction after an unclean
12257 * shutdown. Reconstruction on spare drives in case of a failed drive
12258 @@ -1309,44 +1391,64 @@
12260 static void raid5syncd (void *data)
12262 - struct raid5_data *raid_conf = data;
12263 - struct md_dev *mddev = raid_conf->mddev;
12264 + raid5_conf_t *conf = data;
12265 + mddev_t *mddev = conf->mddev;
12267 - if (!raid_conf->resync_parity)
12268 + if (!conf->resync_parity)
12270 + if (conf->resync_parity == 2)
12272 + down(&mddev->recovery_sem);
12273 + if (md_do_sync(mddev,NULL)) {
12274 + up(&mddev->recovery_sem);
12275 + printk("raid5: resync aborted!\n");
12277 - md_do_sync(mddev);
12278 - raid_conf->resync_parity = 0;
12280 + conf->resync_parity = 0;
12281 + up(&mddev->recovery_sem);
12282 + printk("raid5: resync finished.\n");
12284 -#endif /* SUPPORT_RECONSTRUCTION */
12286 -static int __check_consistency (struct md_dev *mddev, int row)
12287 +static int __check_consistency (mddev_t *mddev, int row)
12289 - struct raid5_data *raid_conf = mddev->private;
12290 + raid5_conf_t *conf = mddev->private;
12292 struct buffer_head *bh[MD_SB_DISKS], tmp;
12293 - int i, rc = 0, nr = 0;
12294 + int i, rc = 0, nr = 0, count;
12295 + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
12297 - if (raid_conf->working_disks != raid_conf->raid_disks)
12298 + if (conf->working_disks != conf->raid_disks)
12301 if ((tmp.b_data = (char *) get_free_page(GFP_KERNEL)) == NULL)
12303 + md_clear_page((unsigned long)tmp.b_data);
12304 memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
12305 - for (i = 0; i < raid_conf->raid_disks; i++) {
12306 - dev = raid_conf->disks[i].dev;
12307 + for (i = 0; i < conf->raid_disks; i++) {
12308 + dev = conf->disks[i].dev;
12309 set_blocksize(dev, 4096);
12310 if ((bh[i] = bread(dev, row / 4, 4096)) == NULL)
12314 - if (nr == raid_conf->raid_disks) {
12315 - for (i = 1; i < nr; i++)
12316 - xor_block(&tmp, bh[i]);
12317 + if (nr == conf->raid_disks) {
12318 + bh_ptr[0] = &tmp;
12320 + for (i = 1; i < nr; i++) {
12321 + bh_ptr[count++] = bh[i];
12322 + if (count == MAX_XOR_BLOCKS) {
12323 + xor_block(count, &bh_ptr[0]);
12327 + if (count != 1) {
12328 + xor_block(count, &bh_ptr[0]);
12330 if (memcmp(tmp.b_data, bh[0]->b_data, 4096))
12333 - for (i = 0; i < raid_conf->raid_disks; i++) {
12334 - dev = raid_conf->disks[i].dev;
12335 + for (i = 0; i < conf->raid_disks; i++) {
12336 + dev = conf->disks[i].dev;
12340 @@ -1358,285 +1460,607 @@
12344 -static int check_consistency (struct md_dev *mddev)
12345 +static int check_consistency (mddev_t *mddev)
12347 - int size = mddev->sb->size;
12349 + if (__check_consistency(mddev, 0))
12351 + * We are not checking this currently, as it's legitimate to have
12352 + * an inconsistent array, at creation time.
12356 - for (row = 0; row < size; row += size / 8)
12357 - if (__check_consistency(mddev, row))
12362 -static int raid5_run (int minor, struct md_dev *mddev)
12363 +static int raid5_run (mddev_t *mddev)
12365 - struct raid5_data *raid_conf;
12366 + raid5_conf_t *conf;
12367 int i, j, raid_disk, memory;
12368 - md_superblock_t *sb = mddev->sb;
12369 - md_descriptor_t *descriptor;
12370 - struct real_dev *realdev;
12371 + mdp_super_t *sb = mddev->sb;
12372 + mdp_disk_t *desc;
12373 + mdk_rdev_t *rdev;
12374 + struct disk_info *disk;
12375 + struct md_list_head *tmp;
12376 + int start_recovery = 0;
12380 if (sb->level != 5 && sb->level != 4) {
12381 - printk("raid5: %s: raid level not set to 4/5 (%d)\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
12382 + printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
12387 - mddev->private = kmalloc (sizeof (struct raid5_data), GFP_KERNEL);
12388 - if ((raid_conf = mddev->private) == NULL)
12389 + mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
12390 + if ((conf = mddev->private) == NULL)
12392 - memset (raid_conf, 0, sizeof (*raid_conf));
12393 - raid_conf->mddev = mddev;
12394 + memset (conf, 0, sizeof (*conf));
12395 + conf->mddev = mddev;
12397 - if ((raid_conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
12398 + if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
12400 - memset(raid_conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
12401 + memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
12403 - init_waitqueue(&raid_conf->wait_for_stripe);
12404 - PRINTK(("raid5_run(%d) called.\n", minor));
12406 - for (i = 0; i < mddev->nb_dev; i++) {
12407 - realdev = &mddev->devices[i];
12408 - if (!realdev->sb) {
12409 - printk(KERN_ERR "raid5: disabled device %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
12412 + init_waitqueue(&conf->wait_for_stripe);
12413 + PRINTK(("raid5_run(md%d) called.\n", mdidx(mddev)));
12415 + ITERATE_RDEV(mddev,rdev,tmp) {
12417 * This is important -- we are using the descriptor on
12418 * the disk only to get a pointer to the descriptor on
12419 * the main superblock, which might be more recent.
12421 - descriptor = &sb->disks[realdev->sb->descriptor.number];
12422 - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
12423 - printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", kdevname(realdev->dev));
12424 + desc = sb->disks + rdev->desc_nr;
12425 + raid_disk = desc->raid_disk;
12426 + disk = conf->disks + raid_disk;
12428 + if (disk_faulty(desc)) {
12429 + printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
12430 + if (!rdev->faulty) {
12434 + disk->number = desc->number;
12435 + disk->raid_disk = raid_disk;
12436 + disk->dev = rdev->dev;
12438 + disk->operational = 0;
12439 + disk->write_only = 0;
12441 + disk->used_slot = 1;
12444 - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
12445 - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
12446 - printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", kdevname(realdev->dev));
12448 + if (disk_active(desc)) {
12449 + if (!disk_sync(desc)) {
12450 + printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
12454 - raid_disk = descriptor->raid_disk;
12455 - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
12456 - printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", kdevname(realdev->dev));
12457 + if (raid_disk > sb->raid_disks) {
12458 + printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
12461 - if (raid_conf->disks[raid_disk].operational) {
12462 - printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", kdevname(realdev->dev), raid_disk);
12463 + if (disk->operational) {
12464 + printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
12467 - printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", kdevname(realdev->dev), raid_disk);
12468 + printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
12470 - raid_conf->disks[raid_disk].number = descriptor->number;
12471 - raid_conf->disks[raid_disk].raid_disk = raid_disk;
12472 - raid_conf->disks[raid_disk].dev = mddev->devices[i].dev;
12473 - raid_conf->disks[raid_disk].operational = 1;
12474 + disk->number = desc->number;
12475 + disk->raid_disk = raid_disk;
12476 + disk->dev = rdev->dev;
12477 + disk->operational = 1;
12478 + disk->used_slot = 1;
12480 - raid_conf->working_disks++;
12481 + conf->working_disks++;
12484 * Must be a spare disk ..
12486 - printk(KERN_INFO "raid5: spare disk %s\n", kdevname(realdev->dev));
12487 - raid_disk = descriptor->raid_disk;
12488 - raid_conf->disks[raid_disk].number = descriptor->number;
12489 - raid_conf->disks[raid_disk].raid_disk = raid_disk;
12490 - raid_conf->disks[raid_disk].dev = mddev->devices [i].dev;
12492 - raid_conf->disks[raid_disk].operational = 0;
12493 - raid_conf->disks[raid_disk].write_only = 0;
12494 - raid_conf->disks[raid_disk].spare = 1;
12497 - raid_conf->raid_disks = sb->raid_disks;
12498 - raid_conf->failed_disks = raid_conf->raid_disks - raid_conf->working_disks;
12499 - raid_conf->mddev = mddev;
12500 - raid_conf->chunk_size = sb->chunk_size;
12501 - raid_conf->level = sb->level;
12502 - raid_conf->algorithm = sb->parity_algorithm;
12503 - raid_conf->max_nr_stripes = NR_STRIPES;
12504 + printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
12505 + disk->number = desc->number;
12506 + disk->raid_disk = raid_disk;
12507 + disk->dev = rdev->dev;
12509 - if (raid_conf->working_disks != sb->raid_disks && sb->state != (1 << MD_SB_CLEAN)) {
12510 - printk(KERN_ALERT "raid5: raid set %s not clean and not all disks are operational -- run ckraid\n", kdevname(MKDEV(MD_MAJOR, minor)));
12512 + disk->operational = 0;
12513 + disk->write_only = 0;
12515 + disk->used_slot = 1;
12518 - if (!raid_conf->chunk_size || raid_conf->chunk_size % 4) {
12519 - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", raid_conf->chunk_size, kdevname(MKDEV(MD_MAJOR, minor)));
12521 + for (i = 0; i < MD_SB_DISKS; i++) {
12522 + desc = sb->disks + i;
12523 + raid_disk = desc->raid_disk;
12524 + disk = conf->disks + raid_disk;
12526 + if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
12527 + !conf->disks[raid_disk].used_slot) {
12529 + disk->number = desc->number;
12530 + disk->raid_disk = raid_disk;
12531 + disk->dev = MKDEV(0,0);
12533 + disk->operational = 0;
12534 + disk->write_only = 0;
12536 + disk->used_slot = 1;
12540 + conf->raid_disks = sb->raid_disks;
12542 + * 0 for a fully functional array, 1 for a degraded array.
12544 + conf->failed_disks = conf->raid_disks - conf->working_disks;
12545 + conf->mddev = mddev;
12546 + conf->chunk_size = sb->chunk_size;
12547 + conf->level = sb->level;
12548 + conf->algorithm = sb->layout;
12549 + conf->max_nr_stripes = NR_STRIPES;
12552 + for (i = 0; i < conf->raid_disks; i++) {
12553 + if (!conf->disks[i].used_slot) {
12559 + if (!conf->chunk_size || conf->chunk_size % 4) {
12560 + printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
12563 - if (raid_conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
12564 - printk(KERN_ERR "raid5: unsupported parity algorithm %d for %s\n", raid_conf->algorithm, kdevname(MKDEV(MD_MAJOR, minor)));
12565 + if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
12566 + printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
12569 - if (raid_conf->failed_disks > 1) {
12570 - printk(KERN_ERR "raid5: not enough operational devices for %s (%d/%d failed)\n", kdevname(MKDEV(MD_MAJOR, minor)), raid_conf->failed_disks, raid_conf->raid_disks);
12571 + if (conf->failed_disks > 1) {
12572 + printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
12576 - if ((sb->state & (1 << MD_SB_CLEAN)) && check_consistency(mddev)) {
12577 - printk(KERN_ERR "raid5: detected raid-5 xor inconsistenty -- run ckraid\n");
12578 - sb->state |= 1 << MD_SB_ERRORS;
12580 + if (conf->working_disks != sb->raid_disks) {
12581 + printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
12582 + start_recovery = 1;
12585 - if ((raid_conf->thread = md_register_thread(raid5d, raid_conf)) == NULL) {
12586 - printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
12588 + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) &&
12589 + check_consistency(mddev)) {
12590 + printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n");
12591 + sb->state &= ~(1 << MD_SB_CLEAN);
12594 -#if SUPPORT_RECONSTRUCTION
12595 - if ((raid_conf->resync_thread = md_register_thread(raid5syncd, raid_conf)) == NULL) {
12596 - printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
12599 + const char * name = "raid5d";
12601 + conf->thread = md_register_thread(raid5d, conf, name);
12602 + if (!conf->thread) {
12603 + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
12607 -#endif /* SUPPORT_RECONSTRUCTION */
12609 - memory = raid_conf->max_nr_stripes * (sizeof(struct stripe_head) +
12610 - raid_conf->raid_disks * (sizeof(struct buffer_head) +
12611 + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
12612 + conf->raid_disks * (sizeof(struct buffer_head) +
12613 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
12614 - if (grow_stripes(raid_conf, raid_conf->max_nr_stripes, GFP_KERNEL)) {
12615 + if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
12616 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
12617 - shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
12618 + shrink_stripes(conf, conf->max_nr_stripes);
12621 - printk(KERN_INFO "raid5: allocated %dkB for %s\n", memory, kdevname(MKDEV(MD_MAJOR, minor)));
12622 + printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
12625 * Regenerate the "device is in sync with the raid set" bit for
12628 - for (i = 0; i < sb->nr_disks ; i++) {
12629 - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
12630 + for (i = 0; i < MD_SB_DISKS ; i++) {
12631 + mark_disk_nonsync(sb->disks + i);
12632 for (j = 0; j < sb->raid_disks; j++) {
12633 - if (!raid_conf->disks[j].operational)
12634 + if (!conf->disks[j].operational)
12636 - if (sb->disks[i].number == raid_conf->disks[j].number)
12637 - sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
12638 + if (sb->disks[i].number == conf->disks[j].number)
12639 + mark_disk_sync(sb->disks + i);
12642 - sb->active_disks = raid_conf->working_disks;
12643 + sb->active_disks = conf->working_disks;
12645 if (sb->active_disks == sb->raid_disks)
12646 - printk("raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
12647 + printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
12649 - printk(KERN_ALERT "raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
12650 + printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
12652 + if (!start_recovery && ((sb->state & (1 << MD_SB_CLEAN))==0)) {
12653 + const char * name = "raid5syncd";
12655 + conf->resync_thread = md_register_thread(raid5syncd, conf,name);
12656 + if (!conf->resync_thread) {
12657 + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
12661 - if ((sb->state & (1 << MD_SB_CLEAN)) == 0) {
12662 - printk("raid5: raid set %s not clean; re-constructing parity\n", kdevname(MKDEV(MD_MAJOR, minor)));
12663 - raid_conf->resync_parity = 1;
12664 -#if SUPPORT_RECONSTRUCTION
12665 - md_wakeup_thread(raid_conf->resync_thread);
12666 -#endif /* SUPPORT_RECONSTRUCTION */
12667 + printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
12668 + conf->resync_parity = 1;
12669 + md_wakeup_thread(conf->resync_thread);
12672 + print_raid5_conf(conf);
12673 + if (start_recovery)
12674 + md_recover_arrays();
12675 + print_raid5_conf(conf);
12677 /* Ok, everything is just fine now */
12681 - if (raid_conf->stripe_hashtbl)
12682 - free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
12683 - kfree(raid_conf);
12685 + print_raid5_conf(conf);
12686 + if (conf->stripe_hashtbl)
12687 + free_pages((unsigned long) conf->stripe_hashtbl,
12688 + HASH_PAGES_ORDER);
12691 mddev->private = NULL;
12692 - printk(KERN_ALERT "raid5: failed to run raid set %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
12693 + printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
12698 -static int raid5_stop (int minor, struct md_dev *mddev)
12699 +static int raid5_stop_resync (mddev_t *mddev)
12701 + raid5_conf_t *conf = mddev_to_conf(mddev);
12702 + mdk_thread_t *thread = conf->resync_thread;
12705 + if (conf->resync_parity) {
12706 + conf->resync_parity = 2;
12707 + md_interrupt_thread(thread);
12708 + printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
12716 +static int raid5_restart_resync (mddev_t *mddev)
12718 - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
12719 + raid5_conf_t *conf = mddev_to_conf(mddev);
12721 - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
12722 - shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
12723 - md_unregister_thread(raid_conf->thread);
12724 -#if SUPPORT_RECONSTRUCTION
12725 - md_unregister_thread(raid_conf->resync_thread);
12726 -#endif /* SUPPORT_RECONSTRUCTION */
12727 - free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
12728 - kfree(raid_conf);
12729 + if (conf->resync_parity) {
12730 + if (!conf->resync_thread) {
12734 + printk("raid5: waking up raid5resync.\n");
12735 + conf->resync_parity = 1;
12736 + md_wakeup_thread(conf->resync_thread);
12739 + printk("raid5: no restart-resync needed.\n");
12744 +static int raid5_stop (mddev_t *mddev)
12746 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
12748 + shrink_stripe_cache(conf, conf->max_nr_stripes);
12749 + shrink_stripes(conf, conf->max_nr_stripes);
12750 + md_unregister_thread(conf->thread);
12751 + if (conf->resync_thread)
12752 + md_unregister_thread(conf->resync_thread);
12753 + free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
12755 mddev->private = NULL;
12760 -static int raid5_status (char *page, int minor, struct md_dev *mddev)
12761 +static int raid5_status (char *page, mddev_t *mddev)
12763 - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
12764 - md_superblock_t *sb = mddev->sb;
12765 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
12766 + mdp_super_t *sb = mddev->sb;
12769 - sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->parity_algorithm);
12770 - sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
12771 - for (i = 0; i < raid_conf->raid_disks; i++)
12772 - sz += sprintf (page+sz, "%s", raid_conf->disks[i].operational ? "U" : "_");
12773 + sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
12774 + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
12775 + for (i = 0; i < conf->raid_disks; i++)
12776 + sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
12777 sz += sprintf (page+sz, "]");
12781 -static int raid5_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, int state)
12782 +static void print_raid5_conf (raid5_conf_t *conf)
12785 + struct disk_info *tmp;
12787 + printk("RAID5 conf printout:\n");
12789 + printk("(conf==NULL)\n");
12792 + printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
12793 + conf->working_disks, conf->failed_disks);
12795 + for (i = 0; i < MD_SB_DISKS; i++) {
12796 + tmp = conf->disks + i;
12797 + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
12798 + i, tmp->spare,tmp->operational,
12799 + tmp->number,tmp->raid_disk,tmp->used_slot,
12800 + partition_name(tmp->dev));
12804 +static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
12806 - int i = 0, failed_disk = -1;
12807 - struct raid5_data *raid_conf = mddev->private;
12808 - struct disk_info *disk = raid_conf->disks;
12810 + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
12811 + raid5_conf_t *conf = mddev->private;
12812 + struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
12813 unsigned long flags;
12814 - md_superblock_t *sb = mddev->sb;
12815 - md_descriptor_t *descriptor;
12816 + mdp_super_t *sb = mddev->sb;
12817 + mdp_disk_t *failed_desc, *spare_desc, *added_desc;
12819 - for (i = 0; i < MD_SB_DISKS; i++, disk++) {
12820 - if (disk->spare && disk->number == spare->number)
12825 - for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
12826 - if (!disk->operational)
12828 - if (failed_disk == -1)
12833 + print_raid5_conf(conf);
12835 + * find the disk ...
12838 - case SPARE_WRITE:
12839 - disk->operational = 1;
12840 - disk->write_only = 1;
12841 - raid_conf->spare = disk;
12843 - case SPARE_INACTIVE:
12844 - disk->operational = 0;
12845 - disk->write_only = 0;
12846 - raid_conf->spare = NULL;
12848 - case SPARE_ACTIVE:
12850 - disk->write_only = 0;
12852 - descriptor = &sb->disks[raid_conf->disks[failed_disk].number];
12853 - i = spare->raid_disk;
12854 - disk->raid_disk = spare->raid_disk = descriptor->raid_disk;
12855 - if (disk->raid_disk != failed_disk)
12856 - printk("raid5: disk->raid_disk != failed_disk");
12857 - descriptor->raid_disk = i;
12859 - raid_conf->spare = NULL;
12860 - raid_conf->working_disks++;
12861 - raid_conf->failed_disks--;
12862 - raid_conf->disks[failed_disk] = *disk;
12865 - printk("raid5_mark_spare: bug: state == %d\n", state);
12866 - restore_flags(flags);
12868 + case DISKOP_SPARE_ACTIVE:
12871 + * Find the failed disk within the RAID5 configuration ...
12872 + * (this can only be in the first conf->raid_disks part)
12874 + for (i = 0; i < conf->raid_disks; i++) {
12875 + tmp = conf->disks + i;
12876 + if ((!tmp->operational && !tmp->spare) ||
12877 + !tmp->used_slot) {
12883 + * When we activate a spare disk we _must_ have a disk in
12884 + * the lower (active) part of the array to replace.
12886 + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
12891 + /* fall through */
12893 + case DISKOP_SPARE_WRITE:
12894 + case DISKOP_SPARE_INACTIVE:
12897 + * Find the spare disk ... (can only be in the 'high'
12898 + * area of the array)
12900 + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
12901 + tmp = conf->disks + i;
12902 + if (tmp->spare && tmp->number == (*d)->number) {
12907 + if (spare_disk == -1) {
12914 + case DISKOP_HOT_REMOVE_DISK:
12916 + for (i = 0; i < MD_SB_DISKS; i++) {
12917 + tmp = conf->disks + i;
12918 + if (tmp->used_slot && (tmp->number == (*d)->number)) {
12919 + if (tmp->operational) {
12923 + removed_disk = i;
12927 + if (removed_disk == -1) {
12934 + case DISKOP_HOT_ADD_DISK:
12936 + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
12937 + tmp = conf->disks + i;
12938 + if (!tmp->used_slot) {
12943 + if (added_disk == -1) {
12953 + * Switch the spare disk to write-only mode:
12955 + case DISKOP_SPARE_WRITE:
12956 + if (conf->spare) {
12961 + sdisk = conf->disks + spare_disk;
12962 + sdisk->operational = 1;
12963 + sdisk->write_only = 1;
12964 + conf->spare = sdisk;
12967 + * Deactivate a spare disk:
12969 + case DISKOP_SPARE_INACTIVE:
12970 + sdisk = conf->disks + spare_disk;
12971 + sdisk->operational = 0;
12972 + sdisk->write_only = 0;
12974 + * Was the spare being resynced?
12976 + if (conf->spare == sdisk)
12977 + conf->spare = NULL;
12980 + * Activate (mark read-write) the (now sync) spare disk,
12981 + * which means we switch its 'raid position' (->raid_disk)
12982 + * with the failed disk. (only the first 'conf->raid_disks'
12983 + * slots are used for 'real' disks and we must preserve this
12986 + case DISKOP_SPARE_ACTIVE:
12987 + if (!conf->spare) {
12992 + sdisk = conf->disks + spare_disk;
12993 + fdisk = conf->disks + failed_disk;
12995 + spare_desc = &sb->disks[sdisk->number];
12996 + failed_desc = &sb->disks[fdisk->number];
12998 + if (spare_desc != *d) {
13004 + if (spare_desc->raid_disk != sdisk->raid_disk) {
13010 + if (sdisk->raid_disk != spare_disk) {
13016 + if (failed_desc->raid_disk != fdisk->raid_disk) {
13022 + if (fdisk->raid_disk != failed_disk) {
13029 + * do the switch finally
13031 + xchg_values(*spare_desc, *failed_desc);
13032 + xchg_values(*fdisk, *sdisk);
13035 + * (careful, 'failed' and 'spare' are switched from now on)
13037 + * we want to preserve linear numbering and we want to
13038 + * give the proper raid_disk number to the now activated
13039 + * disk. (this means we switch back these values)
13042 + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
13043 + xchg_values(sdisk->raid_disk, fdisk->raid_disk);
13044 + xchg_values(spare_desc->number, failed_desc->number);
13045 + xchg_values(sdisk->number, fdisk->number);
13047 + *d = failed_desc;
13049 + if (sdisk->dev == MKDEV(0,0))
13050 + sdisk->used_slot = 0;
13053 + * this really activates the spare.
13055 + fdisk->spare = 0;
13056 + fdisk->write_only = 0;
13059 + * if we activate a spare, we definitely replace a
13060 + * non-operational disk slot in the 'low' area of
13061 + * the disk array.
13063 + conf->failed_disks--;
13064 + conf->working_disks++;
13065 + conf->spare = NULL;
13069 + case DISKOP_HOT_REMOVE_DISK:
13070 + rdisk = conf->disks + removed_disk;
13072 + if (rdisk->spare && (removed_disk < conf->raid_disks)) {
13077 + rdisk->dev = MKDEV(0,0);
13078 + rdisk->used_slot = 0;
13082 + case DISKOP_HOT_ADD_DISK:
13083 + adisk = conf->disks + added_disk;
13086 + if (added_disk != added_desc->number) {
13092 + adisk->number = added_desc->number;
13093 + adisk->raid_disk = added_desc->raid_disk;
13094 + adisk->dev = MKDEV(added_desc->major,added_desc->minor);
13096 + adisk->operational = 0;
13097 + adisk->write_only = 0;
13098 + adisk->spare = 1;
13099 + adisk->used_slot = 1;
13110 restore_flags(flags);
13112 + print_raid5_conf(conf);
13116 -static struct md_personality raid5_personality=
13117 +static mdk_personality_t raid5_personality=
13121 @@ -1648,14 +2072,19 @@
13122 NULL, /* no ioctls */
13125 - /* raid5_hot_add_disk, */ NULL,
13126 - /* raid1_hot_remove_drive */ NULL,
13129 + raid5_stop_resync,
13130 + raid5_restart_resync
13133 int raid5_init (void)
13135 - return register_md_personality (RAID5, &raid5_personality);
13138 + err = register_md_personality (RAID5, &raid5_personality);
13145 --- linux/drivers/block/translucent.c.orig Tue Jan 16 13:42:04 2001
13146 +++ linux/drivers/block/translucent.c Tue Jan 16 13:42:04 2001
13149 + translucent.c : Translucent RAID driver for Linux
13150 + Copyright (C) 1998 Ingo Molnar
13152 + Translucent mode management functions.
13154 + This program is free software; you can redistribute it and/or modify
13155 + it under the terms of the GNU General Public License as published by
13156 + the Free Software Foundation; either version 2, or (at your option)
13157 + any later version.
13159 + You should have received a copy of the GNU General Public License
13160 + (for example /usr/src/linux/COPYING); if not, write to the Free
13161 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13164 +#include <linux/module.h>
13166 +#include <linux/raid/md.h>
13167 +#include <linux/malloc.h>
13169 +#include <linux/raid/translucent.h>
13171 +#define MAJOR_NR MD_MAJOR
13173 +#define MD_PERSONALITY
13175 +static int translucent_run (mddev_t *mddev)
13177 + translucent_conf_t *conf;
13178 + mdk_rdev_t *rdev;
13181 + MOD_INC_USE_COUNT;
13183 + conf = kmalloc (sizeof (*conf), GFP_KERNEL);
13186 + mddev->private = conf;
13188 + if (mddev->nb_dev != 2) {
13189 + printk("translucent: this mode needs 2 disks, aborting!\n");
13193 + if (md_check_ordering(mddev)) {
13194 + printk("translucent: disks are not ordered, aborting!\n");
13198 + ITERATE_RDEV_ORDERED(mddev,rdev,i) {
13199 + dev_info_t *disk = conf->disks + i;
13201 + disk->dev = rdev->dev;
13202 + disk->size = rdev->size;
13211 + MOD_DEC_USE_COUNT;
13215 +static int translucent_stop (mddev_t *mddev)
13217 + translucent_conf_t *conf = mddev_to_conf(mddev);
13221 + MOD_DEC_USE_COUNT;
13227 +static int translucent_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
13228 + unsigned long *rsector, unsigned long size)
13230 + translucent_conf_t *conf = mddev_to_conf(mddev);
13232 + *rdev = conf->disks[0].dev;
13237 +static int translucent_status (char *page, mddev_t *mddev)
13241 + sz += sprintf(page+sz, " %d%% full", 10);
13246 +static mdk_personality_t translucent_personality=
13253 + translucent_stop,
13254 + translucent_status,
13265 +md__initfunc(void translucent_init (void))
13267 + register_md_personality (TRANSLUCENT, &translucent_personality);
13272 +int init_module (void)
13274 + return (register_md_personality (TRANSLUCENT, &translucent_personality));
13277 +void cleanup_module (void)
13279 + unregister_md_personality (TRANSLUCENT);
13284 --- linux/drivers/block/xor.c.orig Tue Jan 16 13:42:04 2001
13285 +++ linux/drivers/block/xor.c Tue Jan 16 13:42:04 2001
13288 + * xor.c : Multiple Devices driver for Linux
13290 + * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek
13293 + * optimized RAID-5 checksumming functions.
13295 + * This program is free software; you can redistribute it and/or modify
13296 + * it under the terms of the GNU General Public License as published by
13297 + * the Free Software Foundation; either version 2, or (at your option)
13298 + * any later version.
13300 + * You should have received a copy of the GNU General Public License
13301 + * (for example /usr/src/linux/COPYING); if not, write to the Free
13302 + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13304 +#include <linux/module.h>
13305 +#include <linux/raid/md.h>
13306 +#ifdef __sparc_v9__
13307 +#include <asm/head.h>
13308 +#include <asm/asi.h>
13309 +#include <asm/visasm.h>
13313 + * we use the 'XOR function template' to register multiple xor
13314 + * functions runtime. The kernel measures their speed upon bootup
13315 + * and decides which one to use. (compile-time registration is
13316 + * not enough as certain CPU features like MMX can only be detected
13319 + * this architecture makes it pretty easy to add new routines
13320 + * that are faster on certain CPUs, without killing other CPU's
13321 + * 'native' routine. Although the current routines are believed
13322 + * to be the physically fastest ones on all CPUs tested,
13323 + * feel free to prove me wrong and add yet another routine =B-)
13327 +#define MAX_XOR_BLOCKS 5
13329 +#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr)
13331 +typedef void (*xor_block_t) XOR_ARGS;
13332 +xor_block_t xor_block = NULL;
13334 +#ifndef __sparc_v9__
13336 +struct xor_block_template;
13338 +struct xor_block_template {
13340 + xor_block_t xor_block;
13342 + struct xor_block_template * next;
13345 +struct xor_block_template * xor_functions = NULL;
13347 +#define XORBLOCK_TEMPLATE(x) \
13348 +static void xor_block_##x XOR_ARGS; \
13349 +static struct xor_block_template t_xor_block_##x = \
13350 + { #x, xor_block_##x, 0, NULL }; \
13351 +static void xor_block_##x XOR_ARGS
13355 +#ifdef CONFIG_X86_XMM
13357 + * Cache avoiding checksumming functions utilizing KNI instructions
13358 + * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
13361 +XORBLOCK_TEMPLATE(pIII_kni)
13363 + char xmm_save[16*4];
13365 + int lines = (bh_ptr[0]->b_size>>8);
13367 + __asm__ __volatile__ (
13368 + "movl %%cr0,%0 ;\n\t"
13370 + "movups %%xmm0,(%1) ;\n\t"
13371 + "movups %%xmm1,0x10(%1) ;\n\t"
13372 + "movups %%xmm2,0x20(%1) ;\n\t"
13373 + "movups %%xmm3,0x30(%1) ;\n\t"
13378 +#define OFFS(x) "8*("#x"*2)"
13380 + " prefetcht0 "OFFS(x)"(%1) ;\n"
13382 + " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
13384 + " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
13386 + " prefetchnta "OFFS(x)"(%2) ;\n"
13388 + " prefetchnta "OFFS(x)"(%3) ;\n"
13390 + " prefetchnta "OFFS(x)"(%4) ;\n"
13392 + " prefetchnta "OFFS(x)"(%5) ;\n"
13394 + " prefetchnta "OFFS(x)"(%6) ;\n"
13395 +#define XO1(x,y) \
13396 + " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
13397 +#define XO2(x,y) \
13398 + " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
13399 +#define XO3(x,y) \
13400 + " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
13401 +#define XO4(x,y) \
13402 + " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
13403 +#define XO5(x,y) \
13404 + " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
13408 + __asm__ __volatile__ (
13410 +#define BLOCK(i) \
13432 + " .align 32,0x90 ;\n"
13440 + " addl $256, %1 ;\n"
13441 + " addl $256, %2 ;\n"
13447 + "r" (bh_ptr[0]->b_data),
13448 + "r" (bh_ptr[1]->b_data)
13452 + __asm__ __volatile__ (
13454 +#define BLOCK(i) \
13482 + " .align 32,0x90 ;\n"
13490 + " addl $256, %1 ;\n"
13491 + " addl $256, %2 ;\n"
13492 + " addl $256, %3 ;\n"
13497 + "r" (bh_ptr[0]->b_data),
13498 + "r" (bh_ptr[1]->b_data),
13499 + "r" (bh_ptr[2]->b_data)
13503 + __asm__ __volatile__ (
13505 +#define BLOCK(i) \
13539 + " .align 32,0x90 ;\n"
13547 + " addl $256, %1 ;\n"
13548 + " addl $256, %2 ;\n"
13549 + " addl $256, %3 ;\n"
13550 + " addl $256, %4 ;\n"
13556 + "r" (bh_ptr[0]->b_data),
13557 + "r" (bh_ptr[1]->b_data),
13558 + "r" (bh_ptr[2]->b_data),
13559 + "r" (bh_ptr[3]->b_data)
13563 + __asm__ __volatile__ (
13565 +#define BLOCK(i) \
13605 + " .align 32,0x90 ;\n"
13613 + " addl $256, %1 ;\n"
13614 + " addl $256, %2 ;\n"
13615 + " addl $256, %3 ;\n"
13616 + " addl $256, %4 ;\n"
13617 + " addl $256, %5 ;\n"
13623 + "r" (bh_ptr[0]->b_data),
13624 + "r" (bh_ptr[1]->b_data),
13625 + "r" (bh_ptr[2]->b_data),
13626 + "r" (bh_ptr[3]->b_data),
13627 + "r" (bh_ptr[4]->b_data)
13632 + __asm__ __volatile__ (
13634 + "movups (%1),%%xmm0 ;\n\t"
13635 + "movups 0x10(%1),%%xmm1 ;\n\t"
13636 + "movups 0x20(%1),%%xmm2 ;\n\t"
13637 + "movups 0x30(%1),%%xmm3 ;\n\t"
13638 + "movl %0,%%cr0 ;\n\t"
13640 + : "r" (cr0), "r" (xmm_save)
13660 +#endif /* CONFIG_X86_XMM */
13663 + * high-speed RAID5 checksumming functions utilizing MMX instructions
13664 + * Copyright (C) 1998 Ingo Molnar
13666 +XORBLOCK_TEMPLATE(pII_mmx)
13668 + char fpu_save[108];
13669 + int lines = (bh_ptr[0]->b_size>>7);
13671 + if (!(current->flags & PF_USEDFPU))
13672 + __asm__ __volatile__ ( " clts;\n");
13674 + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
13677 + " movq 8*("#x")(%1), %%mm"#y" ;\n"
13679 + " movq %%mm"#y", 8*("#x")(%1) ;\n"
13680 +#define XO1(x,y) \
13681 + " pxor 8*("#x")(%2), %%mm"#y" ;\n"
13682 +#define XO2(x,y) \
13683 + " pxor 8*("#x")(%3), %%mm"#y" ;\n"
13684 +#define XO3(x,y) \
13685 + " pxor 8*("#x")(%4), %%mm"#y" ;\n"
13686 +#define XO4(x,y) \
13687 + " pxor 8*("#x")(%5), %%mm"#y" ;\n"
13691 + __asm__ __volatile__ (
13693 +#define BLOCK(i) \
13707 + " .align 32,0x90 ;\n"
13715 + " addl $128, %1 ;\n"
13716 + " addl $128, %2 ;\n"
13721 + "r" (bh_ptr[0]->b_data),
13722 + "r" (bh_ptr[1]->b_data)
13726 + __asm__ __volatile__ (
13728 +#define BLOCK(i) \
13746 + " .align 32,0x90 ;\n"
13754 + " addl $128, %1 ;\n"
13755 + " addl $128, %2 ;\n"
13756 + " addl $128, %3 ;\n"
13761 + "r" (bh_ptr[0]->b_data),
13762 + "r" (bh_ptr[1]->b_data),
13763 + "r" (bh_ptr[2]->b_data)
13767 + __asm__ __volatile__ (
13769 +#define BLOCK(i) \
13791 + " .align 32,0x90 ;\n"
13799 + " addl $128, %1 ;\n"
13800 + " addl $128, %2 ;\n"
13801 + " addl $128, %3 ;\n"
13802 + " addl $128, %4 ;\n"
13807 + "r" (bh_ptr[0]->b_data),
13808 + "r" (bh_ptr[1]->b_data),
13809 + "r" (bh_ptr[2]->b_data),
13810 + "r" (bh_ptr[3]->b_data)
13814 + __asm__ __volatile__ (
13816 +#define BLOCK(i) \
13842 + " .align 32,0x90 ;\n"
13850 + " addl $128, %1 ;\n"
13851 + " addl $128, %2 ;\n"
13852 + " addl $128, %3 ;\n"
13853 + " addl $128, %4 ;\n"
13854 + " addl $128, %5 ;\n"
13859 + "r" (bh_ptr[0]->b_data),
13860 + "r" (bh_ptr[1]->b_data),
13861 + "r" (bh_ptr[2]->b_data),
13862 + "r" (bh_ptr[3]->b_data),
13863 + "r" (bh_ptr[4]->b_data)
13868 + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
13870 + if (!(current->flags & PF_USEDFPU))
13882 +XORBLOCK_TEMPLATE(p5_mmx)
13884 + char fpu_save[108];
13885 + int lines = (bh_ptr[0]->b_size>>6);
13887 + if (!(current->flags & PF_USEDFPU))
13888 + __asm__ __volatile__ ( " clts;\n");
13890 + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
13894 + __asm__ __volatile__ (
13896 + " .align 32,0x90 ;\n"
13898 + " movq (%1), %%mm0 ;\n"
13899 + " movq 8(%1), %%mm1 ;\n"
13900 + " pxor (%2), %%mm0 ;\n"
13901 + " movq 16(%1), %%mm2 ;\n"
13902 + " movq %%mm0, (%1) ;\n"
13903 + " pxor 8(%2), %%mm1 ;\n"
13904 + " movq 24(%1), %%mm3 ;\n"
13905 + " movq %%mm1, 8(%1) ;\n"
13906 + " pxor 16(%2), %%mm2 ;\n"
13907 + " movq 32(%1), %%mm4 ;\n"
13908 + " movq %%mm2, 16(%1) ;\n"
13909 + " pxor 24(%2), %%mm3 ;\n"
13910 + " movq 40(%1), %%mm5 ;\n"
13911 + " movq %%mm3, 24(%1) ;\n"
13912 + " pxor 32(%2), %%mm4 ;\n"
13913 + " movq 48(%1), %%mm6 ;\n"
13914 + " movq %%mm4, 32(%1) ;\n"
13915 + " pxor 40(%2), %%mm5 ;\n"
13916 + " movq 56(%1), %%mm7 ;\n"
13917 + " movq %%mm5, 40(%1) ;\n"
13918 + " pxor 48(%2), %%mm6 ;\n"
13919 + " pxor 56(%2), %%mm7 ;\n"
13920 + " movq %%mm6, 48(%1) ;\n"
13921 + " movq %%mm7, 56(%1) ;\n"
13923 + " addl $64, %1 ;\n"
13924 + " addl $64, %2 ;\n"
13930 + "r" (bh_ptr[0]->b_data),
13931 + "r" (bh_ptr[1]->b_data)
13935 + __asm__ __volatile__ (
13937 + " .align 32,0x90 ;\n"
13939 + " movq (%1), %%mm0 ;\n"
13940 + " movq 8(%1), %%mm1 ;\n"
13941 + " pxor (%2), %%mm0 ;\n"
13942 + " movq 16(%1), %%mm2 ;\n"
13943 + " pxor 8(%2), %%mm1 ;\n"
13944 + " pxor (%3), %%mm0 ;\n"
13945 + " pxor 16(%2), %%mm2 ;\n"
13946 + " movq %%mm0, (%1) ;\n"
13947 + " pxor 8(%3), %%mm1 ;\n"
13948 + " pxor 16(%3), %%mm2 ;\n"
13949 + " movq 24(%1), %%mm3 ;\n"
13950 + " movq %%mm1, 8(%1) ;\n"
13951 + " movq 32(%1), %%mm4 ;\n"
13952 + " movq 40(%1), %%mm5 ;\n"
13953 + " pxor 24(%2), %%mm3 ;\n"
13954 + " movq %%mm2, 16(%1) ;\n"
13955 + " pxor 32(%2), %%mm4 ;\n"
13956 + " pxor 24(%3), %%mm3 ;\n"
13957 + " pxor 40(%2), %%mm5 ;\n"
13958 + " movq %%mm3, 24(%1) ;\n"
13959 + " pxor 32(%3), %%mm4 ;\n"
13960 + " pxor 40(%3), %%mm5 ;\n"
13961 + " movq 48(%1), %%mm6 ;\n"
13962 + " movq %%mm4, 32(%1) ;\n"
13963 + " movq 56(%1), %%mm7 ;\n"
13964 + " pxor 48(%2), %%mm6 ;\n"
13965 + " movq %%mm5, 40(%1) ;\n"
13966 + " pxor 56(%2), %%mm7 ;\n"
13967 + " pxor 48(%3), %%mm6 ;\n"
13968 + " pxor 56(%3), %%mm7 ;\n"
13969 + " movq %%mm6, 48(%1) ;\n"
13970 + " movq %%mm7, 56(%1) ;\n"
13972 + " addl $64, %1 ;\n"
13973 + " addl $64, %2 ;\n"
13974 + " addl $64, %3 ;\n"
13980 + "r" (bh_ptr[0]->b_data),
13981 + "r" (bh_ptr[1]->b_data),
13982 + "r" (bh_ptr[2]->b_data)
13986 + __asm__ __volatile__ (
13988 + " .align 32,0x90 ;\n"
13990 + " movq (%1), %%mm0 ;\n"
13991 + " movq 8(%1), %%mm1 ;\n"
13992 + " pxor (%2), %%mm0 ;\n"
13993 + " movq 16(%1), %%mm2 ;\n"
13994 + " pxor 8(%2), %%mm1 ;\n"
13995 + " pxor (%3), %%mm0 ;\n"
13996 + " pxor 16(%2), %%mm2 ;\n"
13997 + " pxor 8(%3), %%mm1 ;\n"
13998 + " pxor (%4), %%mm0 ;\n"
13999 + " movq 24(%1), %%mm3 ;\n"
14000 + " pxor 16(%3), %%mm2 ;\n"
14001 + " pxor 8(%4), %%mm1 ;\n"
14002 + " movq %%mm0, (%1) ;\n"
14003 + " movq 32(%1), %%mm4 ;\n"
14004 + " pxor 24(%2), %%mm3 ;\n"
14005 + " pxor 16(%4), %%mm2 ;\n"
14006 + " movq %%mm1, 8(%1) ;\n"
14007 + " movq 40(%1), %%mm5 ;\n"
14008 + " pxor 32(%2), %%mm4 ;\n"
14009 + " pxor 24(%3), %%mm3 ;\n"
14010 + " movq %%mm2, 16(%1) ;\n"
14011 + " pxor 40(%2), %%mm5 ;\n"
14012 + " pxor 32(%3), %%mm4 ;\n"
14013 + " pxor 24(%4), %%mm3 ;\n"
14014 + " movq %%mm3, 24(%1) ;\n"
14015 + " movq 56(%1), %%mm7 ;\n"
14016 + " movq 48(%1), %%mm6 ;\n"
14017 + " pxor 40(%3), %%mm5 ;\n"
14018 + " pxor 32(%4), %%mm4 ;\n"
14019 + " pxor 48(%2), %%mm6 ;\n"
14020 + " movq %%mm4, 32(%1) ;\n"
14021 + " pxor 56(%2), %%mm7 ;\n"
14022 + " pxor 40(%4), %%mm5 ;\n"
14023 + " pxor 48(%3), %%mm6 ;\n"
14024 + " pxor 56(%3), %%mm7 ;\n"
14025 + " movq %%mm5, 40(%1) ;\n"
14026 + " pxor 48(%4), %%mm6 ;\n"
14027 + " pxor 56(%4), %%mm7 ;\n"
14028 + " movq %%mm6, 48(%1) ;\n"
14029 + " movq %%mm7, 56(%1) ;\n"
14031 + " addl $64, %1 ;\n"
14032 + " addl $64, %2 ;\n"
14033 + " addl $64, %3 ;\n"
14034 + " addl $64, %4 ;\n"
14040 + "r" (bh_ptr[0]->b_data),
14041 + "r" (bh_ptr[1]->b_data),
14042 + "r" (bh_ptr[2]->b_data),
14043 + "r" (bh_ptr[3]->b_data)
14047 + __asm__ __volatile__ (
14049 + " .align 32,0x90 ;\n"
14051 + " movq (%1), %%mm0 ;\n"
14052 + " movq 8(%1), %%mm1 ;\n"
14053 + " pxor (%2), %%mm0 ;\n"
14054 + " pxor 8(%2), %%mm1 ;\n"
14055 + " movq 16(%1), %%mm2 ;\n"
14056 + " pxor (%3), %%mm0 ;\n"
14057 + " pxor 8(%3), %%mm1 ;\n"
14058 + " pxor 16(%2), %%mm2 ;\n"
14059 + " pxor (%4), %%mm0 ;\n"
14060 + " pxor 8(%4), %%mm1 ;\n"
14061 + " pxor 16(%3), %%mm2 ;\n"
14062 + " movq 24(%1), %%mm3 ;\n"
14063 + " pxor (%5), %%mm0 ;\n"
14064 + " pxor 8(%5), %%mm1 ;\n"
14065 + " movq %%mm0, (%1) ;\n"
14066 + " pxor 16(%4), %%mm2 ;\n"
14067 + " pxor 24(%2), %%mm3 ;\n"
14068 + " movq %%mm1, 8(%1) ;\n"
14069 + " pxor 16(%5), %%mm2 ;\n"
14070 + " pxor 24(%3), %%mm3 ;\n"
14071 + " movq 32(%1), %%mm4 ;\n"
14072 + " movq %%mm2, 16(%1) ;\n"
14073 + " pxor 24(%4), %%mm3 ;\n"
14074 + " pxor 32(%2), %%mm4 ;\n"
14075 + " movq 40(%1), %%mm5 ;\n"
14076 + " pxor 24(%5), %%mm3 ;\n"
14077 + " pxor 32(%3), %%mm4 ;\n"
14078 + " pxor 40(%2), %%mm5 ;\n"
14079 + " movq %%mm3, 24(%1) ;\n"
14080 + " pxor 32(%4), %%mm4 ;\n"
14081 + " pxor 40(%3), %%mm5 ;\n"
14082 + " movq 48(%1), %%mm6 ;\n"
14083 + " movq 56(%1), %%mm7 ;\n"
14084 + " pxor 32(%5), %%mm4 ;\n"
14085 + " pxor 40(%4), %%mm5 ;\n"
14086 + " pxor 48(%2), %%mm6 ;\n"
14087 + " pxor 56(%2), %%mm7 ;\n"
14088 + " movq %%mm4, 32(%1) ;\n"
14089 + " pxor 48(%3), %%mm6 ;\n"
14090 + " pxor 56(%3), %%mm7 ;\n"
14091 + " pxor 40(%5), %%mm5 ;\n"
14092 + " pxor 48(%4), %%mm6 ;\n"
14093 + " pxor 56(%4), %%mm7 ;\n"
14094 + " movq %%mm5, 40(%1) ;\n"
14095 + " pxor 48(%5), %%mm6 ;\n"
14096 + " pxor 56(%5), %%mm7 ;\n"
14097 + " movq %%mm6, 48(%1) ;\n"
14098 + " movq %%mm7, 56(%1) ;\n"
14100 + " addl $64, %1 ;\n"
14101 + " addl $64, %2 ;\n"
14102 + " addl $64, %3 ;\n"
14103 + " addl $64, %4 ;\n"
14104 + " addl $64, %5 ;\n"
14110 + "r" (bh_ptr[0]->b_data),
14111 + "r" (bh_ptr[1]->b_data),
14112 + "r" (bh_ptr[2]->b_data),
14113 + "r" (bh_ptr[3]->b_data),
14114 + "r" (bh_ptr[4]->b_data)
14119 + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
14121 + if (!(current->flags & PF_USEDFPU))
14124 +#endif /* __i386__ */
14125 +#endif /* !__sparc_v9__ */
14127 +#ifdef __sparc_v9__
14129 + * High speed xor_block operation for RAID4/5 utilizing the
14130 + * UltraSparc Visual Instruction Set.
14132 + * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
14135 + * !(((long)dest | (long)sourceN) & (64 - 1)) &&
14136 + * !(len & 127) && len >= 256
14138 + * It is done in pure assembly, as otherwise gcc makes it
14139 + * a non-leaf function, which is not what we want.
14140 + * Also, we don't measure the speeds as on other architectures,
14141 + * as the measuring routine does not take into account cold caches
14142 + * and the fact that xor_block_VIS bypasses the caches.
14143 + * xor_block_32regs might be 5% faster for count 2 if caches are hot
14144 + * and things just right (for count 3 VIS is about as fast as 32regs for
14145 + * hot caches and for count 4 and 5 VIS is faster by good margin always),
14146 + * but I think it is better not to pollute the caches.
14147 + * Actually, if I'd just fight for speed for hot caches, I could
14148 + * write a hybrid VIS/integer routine, which would do always two
14149 + * 64B blocks in VIS and two in IEUs, but I really care more about
14152 +extern void *VISenter(void);
14153 +extern void xor_block_VIS XOR_ARGS;
14155 +void __xor_block_VIS(void)
14158 + .globl xor_block_VIS
14160 + ldx [%%o1 + 0], %%o4
14161 + ldx [%%o1 + 8], %%o3
14162 + ldx [%%o4 + %1], %%g5
14163 + ldx [%%o4 + %0], %%o4
14164 + ldx [%%o3 + %0], %%o3
14166 + andcc %%o5, %2, %%g0
14167 + be,pt %%icc, 297f
14168 + sethi %%hi(%5), %%g1
14169 + jmpl %%g1 + %%lo(%5), %%g7
14170 + add %%g7, 8, %%g7
14171 +297: wr %%g0, %4, %%fprs
14172 + membar #LoadStore|#StoreLoad|#StoreStore
14173 + sub %%g5, 64, %%g5
14174 + ldda [%%o4] %3, %%f0
14175 + ldda [%%o3] %3, %%f16
14177 + bgeu,pt %%xcc, 10f
14181 + sub %%g5, 64, %%g5
14183 + wr %%g0, %3, %%asi
14185 +2: ldda [%%o4 + 64] %%asi, %%f32
14186 + fxor %%f0, %%f16, %%f16
14187 + fxor %%f2, %%f18, %%f18
14188 + fxor %%f4, %%f20, %%f20
14189 + fxor %%f6, %%f22, %%f22
14190 + fxor %%f8, %%f24, %%f24
14191 + fxor %%f10, %%f26, %%f26
14192 + fxor %%f12, %%f28, %%f28
14193 + fxor %%f14, %%f30, %%f30
14194 + stda %%f16, [%%o4] %3
14195 + ldda [%%o3 + 64] %%asi, %%f48
14196 + ldda [%%o4 + 128] %%asi, %%f0
14197 + fxor %%f32, %%f48, %%f48
14198 + fxor %%f34, %%f50, %%f50
14199 + add %%o4, 128, %%o4
14200 + fxor %%f36, %%f52, %%f52
14201 + add %%o3, 128, %%o3
14202 + fxor %%f38, %%f54, %%f54
14203 + subcc %%g5, 128, %%g5
14204 + fxor %%f40, %%f56, %%f56
14205 + fxor %%f42, %%f58, %%f58
14206 + fxor %%f44, %%f60, %%f60
14207 + fxor %%f46, %%f62, %%f62
14208 + stda %%f48, [%%o4 - 64] %%asi
14210 + ldda [%%o3] %3, %%f16
14212 + ldda [%%o4 + 64] %%asi, %%f32
14213 + fxor %%f0, %%f16, %%f16
14214 + fxor %%f2, %%f18, %%f18
14215 + fxor %%f4, %%f20, %%f20
14216 + fxor %%f6, %%f22, %%f22
14217 + fxor %%f8, %%f24, %%f24
14218 + fxor %%f10, %%f26, %%f26
14219 + fxor %%f12, %%f28, %%f28
14220 + fxor %%f14, %%f30, %%f30
14221 + stda %%f16, [%%o4] %3
14222 + ldda [%%o3 + 64] %%asi, %%f48
14224 + fxor %%f32, %%f48, %%f48
14225 + fxor %%f34, %%f50, %%f50
14226 + fxor %%f36, %%f52, %%f52
14227 + fxor %%f38, %%f54, %%f54
14228 + fxor %%f40, %%f56, %%f56
14229 + fxor %%f42, %%f58, %%f58
14230 + fxor %%f44, %%f60, %%f60
14231 + fxor %%f46, %%f62, %%f62
14232 + stda %%f48, [%%o4 + 64] %%asi
14233 + membar #Sync|#StoreStore|#StoreLoad
14234 + wr %%g0, 0, %%fprs
14236 + wr %%g1, %%g0, %%asi
14238 +13: ldx [%%o1 + 16], %%o2
14239 + ldx [%%o2 + %0], %%o2
14241 +3: ldda [%%o2] %3, %%f32
14242 + fxor %%f0, %%f16, %%f48
14243 + fxor %%f2, %%f18, %%f50
14244 + add %%o4, 64, %%o4
14245 + fxor %%f4, %%f20, %%f52
14246 + fxor %%f6, %%f22, %%f54
14247 + add %%o3, 64, %%o3
14248 + fxor %%f8, %%f24, %%f56
14249 + fxor %%f10, %%f26, %%f58
14250 + fxor %%f12, %%f28, %%f60
14251 + fxor %%f14, %%f30, %%f62
14252 + ldda [%%o4] %3, %%f0
14253 + fxor %%f48, %%f32, %%f48
14254 + fxor %%f50, %%f34, %%f50
14255 + fxor %%f52, %%f36, %%f52
14256 + fxor %%f54, %%f38, %%f54
14257 + add %%o2, 64, %%o2
14258 + fxor %%f56, %%f40, %%f56
14259 + fxor %%f58, %%f42, %%f58
14260 + subcc %%g5, 64, %%g5
14261 + fxor %%f60, %%f44, %%f60
14262 + fxor %%f62, %%f46, %%f62
14263 + stda %%f48, [%%o4 + %%g1] %3
14265 + ldda [%%o3] %3, %%f16
14267 + ldda [%%o2] %3, %%f32
14268 + fxor %%f0, %%f16, %%f48
14269 + fxor %%f2, %%f18, %%f50
14270 + fxor %%f4, %%f20, %%f52
14271 + fxor %%f6, %%f22, %%f54
14272 + fxor %%f8, %%f24, %%f56
14273 + fxor %%f10, %%f26, %%f58
14274 + fxor %%f12, %%f28, %%f60
14275 + fxor %%f14, %%f30, %%f62
14277 + fxor %%f48, %%f32, %%f48
14278 + fxor %%f50, %%f34, %%f50
14279 + fxor %%f52, %%f36, %%f52
14280 + fxor %%f54, %%f38, %%f54
14281 + fxor %%f56, %%f40, %%f56
14282 + fxor %%f58, %%f42, %%f58
14283 + fxor %%f60, %%f44, %%f60
14284 + fxor %%f62, %%f46, %%f62
14285 + stda %%f48, [%%o4] %3
14286 + membar #Sync|#StoreStore|#StoreLoad
14288 + wr %%g0, 0, %%fprs
14294 +14: ldx [%%o1 + 16], %%o2
14295 + ldx [%%o1 + 24], %%o0
14296 + ldx [%%o2 + %0], %%o2
14297 + ldx [%%o0 + %0], %%o0
14299 +4: ldda [%%o2] %3, %%f32
14300 + fxor %%f0, %%f16, %%f16
14301 + fxor %%f2, %%f18, %%f18
14302 + add %%o4, 64, %%o4
14303 + fxor %%f4, %%f20, %%f20
14304 + fxor %%f6, %%f22, %%f22
14305 + add %%o3, 64, %%o3
14306 + fxor %%f8, %%f24, %%f24
14307 + fxor %%f10, %%f26, %%f26
14308 + fxor %%f12, %%f28, %%f28
14309 + fxor %%f14, %%f30, %%f30
14310 + ldda [%%o0] %3, %%f48
14311 + fxor %%f16, %%f32, %%f32
14312 + fxor %%f18, %%f34, %%f34
14313 + fxor %%f20, %%f36, %%f36
14314 + fxor %%f22, %%f38, %%f38
14315 + add %%o2, 64, %%o2
14316 + fxor %%f24, %%f40, %%f40
14317 + fxor %%f26, %%f42, %%f42
14318 + fxor %%f28, %%f44, %%f44
14319 + fxor %%f30, %%f46, %%f46
14320 + ldda [%%o4] %3, %%f0
14321 + fxor %%f32, %%f48, %%f48
14322 + fxor %%f34, %%f50, %%f50
14323 + fxor %%f36, %%f52, %%f52
14324 + add %%o0, 64, %%o0
14325 + fxor %%f38, %%f54, %%f54
14326 + fxor %%f40, %%f56, %%f56
14327 + fxor %%f42, %%f58, %%f58
14328 + subcc %%g5, 64, %%g5
14329 + fxor %%f44, %%f60, %%f60
14330 + fxor %%f46, %%f62, %%f62
14331 + stda %%f48, [%%o4 + %%g1] %3
14333 + ldda [%%o3] %3, %%f16
14335 + ldda [%%o2] %3, %%f32
14336 + fxor %%f0, %%f16, %%f16
14337 + fxor %%f2, %%f18, %%f18
14338 + fxor %%f4, %%f20, %%f20
14339 + fxor %%f6, %%f22, %%f22
14340 + fxor %%f8, %%f24, %%f24
14341 + fxor %%f10, %%f26, %%f26
14342 + fxor %%f12, %%f28, %%f28
14343 + fxor %%f14, %%f30, %%f30
14344 + ldda [%%o0] %3, %%f48
14345 + fxor %%f16, %%f32, %%f32
14346 + fxor %%f18, %%f34, %%f34
14347 + fxor %%f20, %%f36, %%f36
14348 + fxor %%f22, %%f38, %%f38
14349 + fxor %%f24, %%f40, %%f40
14350 + fxor %%f26, %%f42, %%f42
14351 + fxor %%f28, %%f44, %%f44
14352 + fxor %%f30, %%f46, %%f46
14354 + fxor %%f32, %%f48, %%f48
14355 + fxor %%f34, %%f50, %%f50
14356 + fxor %%f36, %%f52, %%f52
14357 + fxor %%f38, %%f54, %%f54
14358 + fxor %%f40, %%f56, %%f56
14359 + fxor %%f42, %%f58, %%f58
14360 + fxor %%f44, %%f60, %%f60
14361 + fxor %%f46, %%f62, %%f62
14362 + stda %%f48, [%%o4] %3
14363 + membar #Sync|#StoreStore|#StoreLoad
14365 + wr %%g0, 0, %%fprs
14367 +15: ldx [%%o1 + 16], %%o2
14368 + ldx [%%o1 + 24], %%o0
14369 + ldx [%%o1 + 32], %%o1
14370 + ldx [%%o2 + %0], %%o2
14371 + ldx [%%o0 + %0], %%o0
14372 + ldx [%%o1 + %0], %%o1
14374 +5: ldda [%%o2] %3, %%f32
14375 + fxor %%f0, %%f16, %%f48
14376 + fxor %%f2, %%f18, %%f50
14377 + add %%o4, 64, %%o4
14378 + fxor %%f4, %%f20, %%f52
14379 + fxor %%f6, %%f22, %%f54
14380 + add %%o3, 64, %%o3
14381 + fxor %%f8, %%f24, %%f56
14382 + fxor %%f10, %%f26, %%f58
14383 + fxor %%f12, %%f28, %%f60
14384 + fxor %%f14, %%f30, %%f62
14385 + ldda [%%o0] %3, %%f16
14386 + fxor %%f48, %%f32, %%f48
14387 + fxor %%f50, %%f34, %%f50
14388 + fxor %%f52, %%f36, %%f52
14389 + fxor %%f54, %%f38, %%f54
14390 + add %%o2, 64, %%o2
14391 + fxor %%f56, %%f40, %%f56
14392 + fxor %%f58, %%f42, %%f58
14393 + fxor %%f60, %%f44, %%f60
14394 + fxor %%f62, %%f46, %%f62
14395 + ldda [%%o1] %3, %%f32
14396 + fxor %%f48, %%f16, %%f48
14397 + fxor %%f50, %%f18, %%f50
14398 + add %%o0, 64, %%o0
14399 + fxor %%f52, %%f20, %%f52
14400 + fxor %%f54, %%f22, %%f54
14401 + add %%o1, 64, %%o1
14402 + fxor %%f56, %%f24, %%f56
14403 + fxor %%f58, %%f26, %%f58
14404 + fxor %%f60, %%f28, %%f60
14405 + fxor %%f62, %%f30, %%f62
14406 + ldda [%%o4] %3, %%f0
14407 + fxor %%f48, %%f32, %%f48
14408 + fxor %%f50, %%f34, %%f50
14409 + fxor %%f52, %%f36, %%f52
14410 + fxor %%f54, %%f38, %%f54
14411 + fxor %%f56, %%f40, %%f56
14412 + fxor %%f58, %%f42, %%f58
14413 + subcc %%g5, 64, %%g5
14414 + fxor %%f60, %%f44, %%f60
14415 + fxor %%f62, %%f46, %%f62
14416 + stda %%f48, [%%o4 + %%g1] %3
14418 + ldda [%%o3] %3, %%f16
14420 + ldda [%%o2] %3, %%f32
14421 + fxor %%f0, %%f16, %%f48
14422 + fxor %%f2, %%f18, %%f50
14423 + fxor %%f4, %%f20, %%f52
14424 + fxor %%f6, %%f22, %%f54
14425 + fxor %%f8, %%f24, %%f56
14426 + fxor %%f10, %%f26, %%f58
14427 + fxor %%f12, %%f28, %%f60
14428 + fxor %%f14, %%f30, %%f62
14429 + ldda [%%o0] %3, %%f16
14430 + fxor %%f48, %%f32, %%f48
14431 + fxor %%f50, %%f34, %%f50
14432 + fxor %%f52, %%f36, %%f52
14433 + fxor %%f54, %%f38, %%f54
14434 + fxor %%f56, %%f40, %%f56
14435 + fxor %%f58, %%f42, %%f58
14436 + fxor %%f60, %%f44, %%f60
14437 + fxor %%f62, %%f46, %%f62
14438 + ldda [%%o1] %3, %%f32
14439 + fxor %%f48, %%f16, %%f48
14440 + fxor %%f50, %%f18, %%f50
14441 + fxor %%f52, %%f20, %%f52
14442 + fxor %%f54, %%f22, %%f54
14443 + fxor %%f56, %%f24, %%f56
14444 + fxor %%f58, %%f26, %%f58
14445 + fxor %%f60, %%f28, %%f60
14446 + fxor %%f62, %%f30, %%f62
14448 + fxor %%f48, %%f32, %%f48
14449 + fxor %%f50, %%f34, %%f50
14450 + fxor %%f52, %%f36, %%f52
14451 + fxor %%f54, %%f38, %%f54
14452 + fxor %%f56, %%f40, %%f56
14453 + fxor %%f58, %%f42, %%f58
14454 + fxor %%f60, %%f44, %%f60
14455 + fxor %%f62, %%f46, %%f62
14456 + stda %%f48, [%%o4] %3
14457 + membar #Sync|#StoreStore|#StoreLoad
14459 + wr %%g0, 0, %%fprs
14461 + "i" (&((struct buffer_head *)0)->b_data),
14462 + "i" (&((struct buffer_head *)0)->b_size),
14463 + "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P),
14464 + "i" (FPRS_FEF), "i" (VISenter));
14466 +#endif /* __sparc_v9__ */
14468 +#if defined(__sparc__) && !defined(__sparc_v9__)
14470 + * High speed xor_block operation for RAID4/5 utilizing the
14471 + * ldd/std SPARC instructions.
14473 + * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
14477 +XORBLOCK_TEMPLATE(SPARC)
14479 + int size = bh_ptr[0]->b_size;
14480 + int lines = size / (sizeof (long)) / 8, i;
14481 + long *destp = (long *) bh_ptr[0]->b_data;
14482 + long *source1 = (long *) bh_ptr[1]->b_data;
14483 + long *source2, *source3, *source4;
14487 + for (i = lines; i > 0; i--) {
14488 + __asm__ __volatile__("
14489 + ldd [%0 + 0x00], %%g2
14490 + ldd [%0 + 0x08], %%g4
14491 + ldd [%0 + 0x10], %%o0
14492 + ldd [%0 + 0x18], %%o2
14493 + ldd [%1 + 0x00], %%o4
14494 + ldd [%1 + 0x08], %%l0
14495 + ldd [%1 + 0x10], %%l2
14496 + ldd [%1 + 0x18], %%l4
14497 + xor %%g2, %%o4, %%g2
14498 + xor %%g3, %%o5, %%g3
14499 + xor %%g4, %%l0, %%g4
14500 + xor %%g5, %%l1, %%g5
14501 + xor %%o0, %%l2, %%o0
14502 + xor %%o1, %%l3, %%o1
14503 + xor %%o2, %%l4, %%o2
14504 + xor %%o3, %%l5, %%o3
14505 + std %%g2, [%0 + 0x00]
14506 + std %%g4, [%0 + 0x08]
14507 + std %%o0, [%0 + 0x10]
14508 + std %%o2, [%0 + 0x18]
14509 + " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0",
14510 + "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5");
14516 + source2 = (long *) bh_ptr[2]->b_data;
14517 + for (i = lines; i > 0; i--) {
14518 + __asm__ __volatile__("
14519 + ldd [%0 + 0x00], %%g2
14520 + ldd [%0 + 0x08], %%g4
14521 + ldd [%0 + 0x10], %%o0
14522 + ldd [%0 + 0x18], %%o2
14523 + ldd [%1 + 0x00], %%o4
14524 + ldd [%1 + 0x08], %%l0
14525 + ldd [%1 + 0x10], %%l2
14526 + ldd [%1 + 0x18], %%l4
14527 + xor %%g2, %%o4, %%g2
14528 + xor %%g3, %%o5, %%g3
14529 + ldd [%2 + 0x00], %%o4
14530 + xor %%g4, %%l0, %%g4
14531 + xor %%g5, %%l1, %%g5
14532 + ldd [%2 + 0x08], %%l0
14533 + xor %%o0, %%l2, %%o0
14534 + xor %%o1, %%l3, %%o1
14535 + ldd [%2 + 0x10], %%l2
14536 + xor %%o2, %%l4, %%o2
14537 + xor %%o3, %%l5, %%o3
14538 + ldd [%2 + 0x18], %%l4
14539 + xor %%g2, %%o4, %%g2
14540 + xor %%g3, %%o5, %%g3
14541 + xor %%g4, %%l0, %%g4
14542 + xor %%g5, %%l1, %%g5
14543 + xor %%o0, %%l2, %%o0
14544 + xor %%o1, %%l3, %%o1
14545 + xor %%o2, %%l4, %%o2
14546 + xor %%o3, %%l5, %%o3
14547 + std %%g2, [%0 + 0x00]
14548 + std %%g4, [%0 + 0x08]
14549 + std %%o0, [%0 + 0x10]
14550 + std %%o2, [%0 + 0x18]
14551 + " : : "r" (destp), "r" (source1), "r" (source2)
14552 + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
14553 + "l0", "l1", "l2", "l3", "l4", "l5");
14560 + source2 = (long *) bh_ptr[2]->b_data;
14561 + source3 = (long *) bh_ptr[3]->b_data;
14562 + for (i = lines; i > 0; i--) {
14563 + __asm__ __volatile__("
14564 + ldd [%0 + 0x00], %%g2
14565 + ldd [%0 + 0x08], %%g4
14566 + ldd [%0 + 0x10], %%o0
14567 + ldd [%0 + 0x18], %%o2
14568 + ldd [%1 + 0x00], %%o4
14569 + ldd [%1 + 0x08], %%l0
14570 + ldd [%1 + 0x10], %%l2
14571 + ldd [%1 + 0x18], %%l4
14572 + xor %%g2, %%o4, %%g2
14573 + xor %%g3, %%o5, %%g3
14574 + ldd [%2 + 0x00], %%o4
14575 + xor %%g4, %%l0, %%g4
14576 + xor %%g5, %%l1, %%g5
14577 + ldd [%2 + 0x08], %%l0
14578 + xor %%o0, %%l2, %%o0
14579 + xor %%o1, %%l3, %%o1
14580 + ldd [%2 + 0x10], %%l2
14581 + xor %%o2, %%l4, %%o2
14582 + xor %%o3, %%l5, %%o3
14583 + ldd [%2 + 0x18], %%l4
14584 + xor %%g2, %%o4, %%g2
14585 + xor %%g3, %%o5, %%g3
14586 + ldd [%3 + 0x00], %%o4
14587 + xor %%g4, %%l0, %%g4
14588 + xor %%g5, %%l1, %%g5
14589 + ldd [%3 + 0x08], %%l0
14590 + xor %%o0, %%l2, %%o0
14591 + xor %%o1, %%l3, %%o1
14592 + ldd [%3 + 0x10], %%l2
14593 + xor %%o2, %%l4, %%o2
14594 + xor %%o3, %%l5, %%o3
14595 + ldd [%3 + 0x18], %%l4
14596 + xor %%g2, %%o4, %%g2
14597 + xor %%g3, %%o5, %%g3
14598 + xor %%g4, %%l0, %%g4
14599 + xor %%g5, %%l1, %%g5
14600 + xor %%o0, %%l2, %%o0
14601 + xor %%o1, %%l3, %%o1
14602 + xor %%o2, %%l4, %%o2
14603 + xor %%o3, %%l5, %%o3
14604 + std %%g2, [%0 + 0x00]
14605 + std %%g4, [%0 + 0x08]
14606 + std %%o0, [%0 + 0x10]
14607 + std %%o2, [%0 + 0x18]
14608 + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3)
14609 + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
14610 + "l0", "l1", "l2", "l3", "l4", "l5");
14618 + source2 = (long *) bh_ptr[2]->b_data;
14619 + source3 = (long *) bh_ptr[3]->b_data;
14620 + source4 = (long *) bh_ptr[4]->b_data;
14621 + for (i = lines; i > 0; i--) {
14622 + __asm__ __volatile__("
14623 + ldd [%0 + 0x00], %%g2
14624 + ldd [%0 + 0x08], %%g4
14625 + ldd [%0 + 0x10], %%o0
14626 + ldd [%0 + 0x18], %%o2
14627 + ldd [%1 + 0x00], %%o4
14628 + ldd [%1 + 0x08], %%l0
14629 + ldd [%1 + 0x10], %%l2
14630 + ldd [%1 + 0x18], %%l4
14631 + xor %%g2, %%o4, %%g2
14632 + xor %%g3, %%o5, %%g3
14633 + ldd [%2 + 0x00], %%o4
14634 + xor %%g4, %%l0, %%g4
14635 + xor %%g5, %%l1, %%g5
14636 + ldd [%2 + 0x08], %%l0
14637 + xor %%o0, %%l2, %%o0
14638 + xor %%o1, %%l3, %%o1
14639 + ldd [%2 + 0x10], %%l2
14640 + xor %%o2, %%l4, %%o2
14641 + xor %%o3, %%l5, %%o3
14642 + ldd [%2 + 0x18], %%l4
14643 + xor %%g2, %%o4, %%g2
14644 + xor %%g3, %%o5, %%g3
14645 + ldd [%3 + 0x00], %%o4
14646 + xor %%g4, %%l0, %%g4
14647 + xor %%g5, %%l1, %%g5
14648 + ldd [%3 + 0x08], %%l0
14649 + xor %%o0, %%l2, %%o0
14650 + xor %%o1, %%l3, %%o1
14651 + ldd [%3 + 0x10], %%l2
14652 + xor %%o2, %%l4, %%o2
14653 + xor %%o3, %%l5, %%o3
14654 + ldd [%3 + 0x18], %%l4
14655 + xor %%g2, %%o4, %%g2
14656 + xor %%g3, %%o5, %%g3
14657 + ldd [%4 + 0x00], %%o4
14658 + xor %%g4, %%l0, %%g4
14659 + xor %%g5, %%l1, %%g5
14660 + ldd [%4 + 0x08], %%l0
14661 + xor %%o0, %%l2, %%o0
14662 + xor %%o1, %%l3, %%o1
14663 + ldd [%4 + 0x10], %%l2
14664 + xor %%o2, %%l4, %%o2
14665 + xor %%o3, %%l5, %%o3
14666 + ldd [%4 + 0x18], %%l4
14667 + xor %%g2, %%o4, %%g2
14668 + xor %%g3, %%o5, %%g3
14669 + xor %%g4, %%l0, %%g4
14670 + xor %%g5, %%l1, %%g5
14671 + xor %%o0, %%l2, %%o0
14672 + xor %%o1, %%l3, %%o1
14673 + xor %%o2, %%l4, %%o2
14674 + xor %%o3, %%l5, %%o3
14675 + std %%g2, [%0 + 0x00]
14676 + std %%g4, [%0 + 0x08]
14677 + std %%o0, [%0 + 0x10]
14678 + std %%o2, [%0 + 0x18]
14679 + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4)
14680 + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
14681 + "l0", "l1", "l2", "l3", "l4", "l5");
14691 +#endif /* __sparc_v[78]__ */
14693 +#ifndef __sparc_v9__
14696 + * this one works reasonably on any x86 CPU
14697 + * (send me an assembly version for inclusion if you can make it faster)
14699 + * this one is just as fast as written in pure assembly on x86.
14700 + * the reason for this separate version is that the
14701 + * fast open-coded xor routine "32reg" produces suboptimal code
14702 + * on x86, due to lack of registers.
14704 +XORBLOCK_TEMPLATE(8regs)
14706 + int len = bh_ptr[0]->b_size;
14707 + long *destp = (long *) bh_ptr[0]->b_data;
14708 + long *source1, *source2, *source3, *source4;
14709 + long lines = len / (sizeof (long)) / 8, i;
14713 + source1 = (long *) bh_ptr[1]->b_data;
14714 + for (i = lines; i > 0; i--) {
14715 + *(destp + 0) ^= *(source1 + 0);
14716 + *(destp + 1) ^= *(source1 + 1);
14717 + *(destp + 2) ^= *(source1 + 2);
14718 + *(destp + 3) ^= *(source1 + 3);
14719 + *(destp + 4) ^= *(source1 + 4);
14720 + *(destp + 5) ^= *(source1 + 5);
14721 + *(destp + 6) ^= *(source1 + 6);
14722 + *(destp + 7) ^= *(source1 + 7);
14728 + source2 = (long *) bh_ptr[2]->b_data;
14729 + source1 = (long *) bh_ptr[1]->b_data;
14730 + for (i = lines; i > 0; i--) {
14731 + *(destp + 0) ^= *(source1 + 0);
14732 + *(destp + 0) ^= *(source2 + 0);
14733 + *(destp + 1) ^= *(source1 + 1);
14734 + *(destp + 1) ^= *(source2 + 1);
14735 + *(destp + 2) ^= *(source1 + 2);
14736 + *(destp + 2) ^= *(source2 + 2);
14737 + *(destp + 3) ^= *(source1 + 3);
14738 + *(destp + 3) ^= *(source2 + 3);
14739 + *(destp + 4) ^= *(source1 + 4);
14740 + *(destp + 4) ^= *(source2 + 4);
14741 + *(destp + 5) ^= *(source1 + 5);
14742 + *(destp + 5) ^= *(source2 + 5);
14743 + *(destp + 6) ^= *(source1 + 6);
14744 + *(destp + 6) ^= *(source2 + 6);
14745 + *(destp + 7) ^= *(source1 + 7);
14746 + *(destp + 7) ^= *(source2 + 7);
14753 + source3 = (long *) bh_ptr[3]->b_data;
14754 + source2 = (long *) bh_ptr[2]->b_data;
14755 + source1 = (long *) bh_ptr[1]->b_data;
14756 + for (i = lines; i > 0; i--) {
14757 + *(destp + 0) ^= *(source1 + 0);
14758 + *(destp + 0) ^= *(source2 + 0);
14759 + *(destp + 0) ^= *(source3 + 0);
14760 + *(destp + 1) ^= *(source1 + 1);
14761 + *(destp + 1) ^= *(source2 + 1);
14762 + *(destp + 1) ^= *(source3 + 1);
14763 + *(destp + 2) ^= *(source1 + 2);
14764 + *(destp + 2) ^= *(source2 + 2);
14765 + *(destp + 2) ^= *(source3 + 2);
14766 + *(destp + 3) ^= *(source1 + 3);
14767 + *(destp + 3) ^= *(source2 + 3);
14768 + *(destp + 3) ^= *(source3 + 3);
14769 + *(destp + 4) ^= *(source1 + 4);
14770 + *(destp + 4) ^= *(source2 + 4);
14771 + *(destp + 4) ^= *(source3 + 4);
14772 + *(destp + 5) ^= *(source1 + 5);
14773 + *(destp + 5) ^= *(source2 + 5);
14774 + *(destp + 5) ^= *(source3 + 5);
14775 + *(destp + 6) ^= *(source1 + 6);
14776 + *(destp + 6) ^= *(source2 + 6);
14777 + *(destp + 6) ^= *(source3 + 6);
14778 + *(destp + 7) ^= *(source1 + 7);
14779 + *(destp + 7) ^= *(source2 + 7);
14780 + *(destp + 7) ^= *(source3 + 7);
14788 + source4 = (long *) bh_ptr[4]->b_data;
14789 + source3 = (long *) bh_ptr[3]->b_data;
14790 + source2 = (long *) bh_ptr[2]->b_data;
14791 + source1 = (long *) bh_ptr[1]->b_data;
14792 + for (i = lines; i > 0; i--) {
14793 + *(destp + 0) ^= *(source1 + 0);
14794 + *(destp + 0) ^= *(source2 + 0);
14795 + *(destp + 0) ^= *(source3 + 0);
14796 + *(destp + 0) ^= *(source4 + 0);
14797 + *(destp + 1) ^= *(source1 + 1);
14798 + *(destp + 1) ^= *(source2 + 1);
14799 + *(destp + 1) ^= *(source3 + 1);
14800 + *(destp + 1) ^= *(source4 + 1);
14801 + *(destp + 2) ^= *(source1 + 2);
14802 + *(destp + 2) ^= *(source2 + 2);
14803 + *(destp + 2) ^= *(source3 + 2);
14804 + *(destp + 2) ^= *(source4 + 2);
14805 + *(destp + 3) ^= *(source1 + 3);
14806 + *(destp + 3) ^= *(source2 + 3);
14807 + *(destp + 3) ^= *(source3 + 3);
14808 + *(destp + 3) ^= *(source4 + 3);
14809 + *(destp + 4) ^= *(source1 + 4);
14810 + *(destp + 4) ^= *(source2 + 4);
14811 + *(destp + 4) ^= *(source3 + 4);
14812 + *(destp + 4) ^= *(source4 + 4);
14813 + *(destp + 5) ^= *(source1 + 5);
14814 + *(destp + 5) ^= *(source2 + 5);
14815 + *(destp + 5) ^= *(source3 + 5);
14816 + *(destp + 5) ^= *(source4 + 5);
14817 + *(destp + 6) ^= *(source1 + 6);
14818 + *(destp + 6) ^= *(source2 + 6);
14819 + *(destp + 6) ^= *(source3 + 6);
14820 + *(destp + 6) ^= *(source4 + 6);
14821 + *(destp + 7) ^= *(source1 + 7);
14822 + *(destp + 7) ^= *(source2 + 7);
14823 + *(destp + 7) ^= *(source3 + 7);
14824 + *(destp + 7) ^= *(source4 + 7);
14836 + * platform independent RAID5 checksum calculation, this should
14837 + * be very fast on any platform that has a decent amount of
14838 + * registers. (32 or more)
14840 +XORBLOCK_TEMPLATE(32regs)
14842 + int size = bh_ptr[0]->b_size;
14843 + int lines = size / (sizeof (long)) / 8, i;
14844 + long *destp = (long *) bh_ptr[0]->b_data;
14845 + long *source1, *source2, *source3, *source4;
14847 + /* LOTS of registers available...
14848 + We do explicit loop-unrolling here for code which
14849 + favours RISC machines. In fact this is almost direct
14850 + RISC assembly on Alpha and SPARC :-) */
14855 + source1 = (long *) bh_ptr[1]->b_data;
14856 + for (i = lines; i > 0; i--) {
14857 + register long d0, d1, d2, d3, d4, d5, d6, d7;
14858 + d0 = destp[0]; /* Pull the stuff into registers */
14859 + d1 = destp[1]; /* ... in bursts, if possible. */
14866 + d0 ^= source1[0];
14867 + d1 ^= source1[1];
14868 + d2 ^= source1[2];
14869 + d3 ^= source1[3];
14870 + d4 ^= source1[4];
14871 + d5 ^= source1[5];
14872 + d6 ^= source1[6];
14873 + d7 ^= source1[7];
14874 + destp[0] = d0; /* Store the result (in bursts) */
14878 + destp[4] = d4; /* Store the result (in bursts) */
14887 + source2 = (long *) bh_ptr[2]->b_data;
14888 + source1 = (long *) bh_ptr[1]->b_data;
14889 + for (i = lines; i > 0; i--) {
14890 + register long d0, d1, d2, d3, d4, d5, d6, d7;
14891 + d0 = destp[0]; /* Pull the stuff into registers */
14892 + d1 = destp[1]; /* ... in bursts, if possible. */
14899 + d0 ^= source1[0];
14900 + d1 ^= source1[1];
14901 + d2 ^= source1[2];
14902 + d3 ^= source1[3];
14903 + d4 ^= source1[4];
14904 + d5 ^= source1[5];
14905 + d6 ^= source1[6];
14906 + d7 ^= source1[7];
14907 + d0 ^= source2[0];
14908 + d1 ^= source2[1];
14909 + d2 ^= source2[2];
14910 + d3 ^= source2[3];
14911 + d4 ^= source2[4];
14912 + d5 ^= source2[5];
14913 + d6 ^= source2[6];
14914 + d7 ^= source2[7];
14915 + destp[0] = d0; /* Store the result (in bursts) */
14919 + destp[4] = d4; /* Store the result (in bursts) */
14929 + source3 = (long *) bh_ptr[3]->b_data;
14930 + source2 = (long *) bh_ptr[2]->b_data;
14931 + source1 = (long *) bh_ptr[1]->b_data;
14932 + for (i = lines; i > 0; i--) {
14933 + register long d0, d1, d2, d3, d4, d5, d6, d7;
14934 + d0 = destp[0]; /* Pull the stuff into registers */
14935 + d1 = destp[1]; /* ... in bursts, if possible. */
14942 + d0 ^= source1[0];
14943 + d1 ^= source1[1];
14944 + d2 ^= source1[2];
14945 + d3 ^= source1[3];
14946 + d4 ^= source1[4];
14947 + d5 ^= source1[5];
14948 + d6 ^= source1[6];
14949 + d7 ^= source1[7];
14950 + d0 ^= source2[0];
14951 + d1 ^= source2[1];
14952 + d2 ^= source2[2];
14953 + d3 ^= source2[3];
14954 + d4 ^= source2[4];
14955 + d5 ^= source2[5];
14956 + d6 ^= source2[6];
14957 + d7 ^= source2[7];
14958 + d0 ^= source3[0];
14959 + d1 ^= source3[1];
14960 + d2 ^= source3[2];
14961 + d3 ^= source3[3];
14962 + d4 ^= source3[4];
14963 + d5 ^= source3[5];
14964 + d6 ^= source3[6];
14965 + d7 ^= source3[7];
14966 + destp[0] = d0; /* Store the result (in bursts) */
14970 + destp[4] = d4; /* Store the result (in bursts) */
14981 + source4 = (long *) bh_ptr[4]->b_data;
14982 + source3 = (long *) bh_ptr[3]->b_data;
14983 + source2 = (long *) bh_ptr[2]->b_data;
14984 + source1 = (long *) bh_ptr[1]->b_data;
14985 + for (i = lines; i > 0; i--) {
14986 + register long d0, d1, d2, d3, d4, d5, d6, d7;
14987 + d0 = destp[0]; /* Pull the stuff into registers */
14988 + d1 = destp[1]; /* ... in bursts, if possible. */
14995 + d0 ^= source1[0];
14996 + d1 ^= source1[1];
14997 + d2 ^= source1[2];
14998 + d3 ^= source1[3];
14999 + d4 ^= source1[4];
15000 + d5 ^= source1[5];
15001 + d6 ^= source1[6];
15002 + d7 ^= source1[7];
15003 + d0 ^= source2[0];
15004 + d1 ^= source2[1];
15005 + d2 ^= source2[2];
15006 + d3 ^= source2[3];
15007 + d4 ^= source2[4];
15008 + d5 ^= source2[5];
15009 + d6 ^= source2[6];
15010 + d7 ^= source2[7];
15011 + d0 ^= source3[0];
15012 + d1 ^= source3[1];
15013 + d2 ^= source3[2];
15014 + d3 ^= source3[3];
15015 + d4 ^= source3[4];
15016 + d5 ^= source3[5];
15017 + d6 ^= source3[6];
15018 + d7 ^= source3[7];
15019 + d0 ^= source4[0];
15020 + d1 ^= source4[1];
15021 + d2 ^= source4[2];
15022 + d3 ^= source4[3];
15023 + d4 ^= source4[4];
15024 + d5 ^= source4[5];
15025 + d6 ^= source4[6];
15026 + d7 ^= source4[7];
15027 + destp[0] = d0; /* Store the result (in bursts) */
15031 + destp[4] = d4; /* Store the result (in bursts) */
15046 + * (the -6*32 shift factor colors the cache)
15048 +#define SIZE (PAGE_SIZE-6*32)
15050 +static void xor_speed ( struct xor_block_template * func,
15051 + struct buffer_head *b1, struct buffer_head *b2)
15054 + unsigned long now;
15055 + int i, count, max;
15056 + struct buffer_head *bh_ptr[6];
15058 + func->next = xor_functions;
15059 + xor_functions = func;
15064 + * count the number of XORs done during a whole jiffy.
15065 + * calculate the speed of checksumming from this.
15066 + * (we use a 2-page allocation to have guaranteed
15067 + * color L1-cache layout)
15070 + for (i = 0; i < 5; i++) {
15073 + while (jiffies == now) {
15075 + func->xor_block(2,bh_ptr);
15084 + speed = max * (HZ*SIZE/1024);
15085 + func->speed = speed;
15087 + printk( " %-10s: %5d.%03d MB/sec\n", func->name,
15088 + speed / 1000, speed % 1000);
15091 +static inline void pick_fastest_function(void)
15093 + struct xor_block_template *f, *fastest;
15095 + fastest = xor_functions;
15096 + for (f = fastest; f; f = f->next) {
15097 + if (f->speed > fastest->speed)
15100 +#ifdef CONFIG_X86_XMM
15101 + if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
15102 + fastest = &t_xor_block_pIII_kni;
15105 + xor_block = fastest->xor_block;
15106 + printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
15107 + fastest->speed / 1000, fastest->speed % 1000);
15111 +void calibrate_xor_block(void)
15113 + struct buffer_head b1, b2;
15115 + memset(&b1,0,sizeof(b1));
15118 + b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2);
15119 + if (!b1.b_data) {
15120 + pick_fastest_function();
15123 + b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE;
15125 + b1.b_size = SIZE;
15127 + printk(KERN_INFO "raid5: measuring checksumming speed\n");
15129 + sti(); /* should be safe */
15131 +#if defined(__sparc__) && !defined(__sparc_v9__)
15132 + printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n");
15133 + xor_speed(&t_xor_block_SPARC,&b1,&b2);
15136 +#ifdef CONFIG_X86_XMM
15137 + if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
15139 + "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n");
15140 + /* We force the use of the KNI xor block because it
15141 + can write around the L2 cache. We may also be able
15142 + to load into the L1 cache only, depending on how
15143 + the CPU deals with a load to a line that is
15144 + being prefetched.
15146 + xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
15148 +#endif /* CONFIG_X86_XMM */
15152 + if (md_cpu_has_mmx()) {
15154 + "raid5: MMX detected, trying high-speed MMX checksum routines\n");
15155 + xor_speed(&t_xor_block_pII_mmx,&b1,&b2);
15156 + xor_speed(&t_xor_block_p5_mmx,&b1,&b2);
15159 +#endif /* __i386__ */
15162 + xor_speed(&t_xor_block_8regs,&b1,&b2);
15163 + xor_speed(&t_xor_block_32regs,&b1,&b2);
15165 + free_pages((unsigned long)b1.b_data,2);
15166 + pick_fastest_function();
15169 +#else /* __sparc_v9__ */
15171 +void calibrate_xor_block(void)
15173 + printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n");
15174 + xor_block = xor_block_VIS;
15177 +#endif /* __sparc_v9__ */
15179 +MD_EXPORT_SYMBOL(xor_block);
15181 --- linux/arch/i386/defconfig.orig Mon Dec 11 01:49:41 2000
15182 +++ linux/arch/i386/defconfig Tue Jan 16 13:42:04 2001
15185 # CONFIG_BLK_DEV_LOOP is not set
15186 # CONFIG_BLK_DEV_NBD is not set
15187 -# CONFIG_BLK_DEV_MD is not set
15188 +CONFIG_BLK_DEV_MD=y
15189 +CONFIG_AUTODETECT_RAID=y
15190 +CONFIG_MD_TRANSLUCENT=y
15191 +CONFIG_MD_LINEAR=y
15192 +CONFIG_MD_STRIPED=y
15193 +CONFIG_MD_MIRRORING=y
15196 +CONFIG_BLK_DEV_HSM=y
15197 # CONFIG_BLK_DEV_RAM is not set
15198 # CONFIG_BLK_DEV_XD is not set
15199 # CONFIG_BLK_DEV_DAC960 is not set
15200 --- linux/arch/sparc/config.in.orig Mon Dec 11 01:49:41 2000
15201 +++ linux/arch/sparc/config.in Tue Jan 16 13:42:04 2001
15202 @@ -88,10 +88,16 @@
15204 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
15205 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
15206 + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
15207 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
15208 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
15209 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
15210 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
15211 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
15212 + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
15214 +if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
15215 + bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
15218 tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
15219 --- linux/arch/sparc/defconfig.orig Tue Jan 16 13:30:06 2001
15220 +++ linux/arch/sparc/defconfig Tue Jan 16 13:42:04 2001
15221 @@ -89,10 +89,13 @@
15223 CONFIG_BLK_DEV_FD=y
15224 CONFIG_BLK_DEV_MD=y
15225 +# CONFIG_AUTODETECT_RAID is not set
15227 CONFIG_MD_STRIPED=m
15228 CONFIG_MD_MIRRORING=m
15230 +# CONFIG_MD_TRANSLUCENT is not set
15231 +# CONFIG_MD_HSM is not set
15232 CONFIG_BLK_DEV_RAM=y
15233 CONFIG_BLK_DEV_RAM_SIZE=4096
15234 CONFIG_BLK_DEV_INITRD=y
15235 --- linux/arch/sparc64/config.in.orig Mon Dec 11 01:49:41 2000
15236 +++ linux/arch/sparc64/config.in Tue Jan 16 13:42:04 2001
15237 @@ -102,10 +102,16 @@
15239 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
15240 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
15241 + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
15242 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
15243 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
15244 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
15245 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
15246 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
15247 + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
15249 +if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
15250 + bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
15253 tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
15254 --- linux/arch/sparc64/defconfig.orig Tue Jan 16 13:30:06 2001
15255 +++ linux/arch/sparc64/defconfig Tue Jan 16 13:42:04 2001
15256 @@ -106,10 +106,13 @@
15258 CONFIG_BLK_DEV_FD=y
15259 CONFIG_BLK_DEV_MD=y
15260 +# CONFIG_AUTODETECT_RAID is not set
15262 CONFIG_MD_STRIPED=m
15263 CONFIG_MD_MIRRORING=m
15265 +# CONFIG_MD_TRANSLUCENT is not set
15266 +# CONFIG_MD_HSM is not set
15267 CONFIG_BLK_DEV_RAM=y
15268 CONFIG_BLK_DEV_RAM_SIZE=4096
15269 CONFIG_BLK_DEV_INITRD=y
15270 --- linux/Documentation/Configure.help.orig Tue Jan 16 13:30:06 2001
15271 +++ linux/Documentation/Configure.help Tue Jan 16 13:42:04 2001
15272 @@ -1015,6 +1015,13 @@
15276 +Autodetect RAID partitions
15277 +CONFIG_AUTODETECT_RAID
15278 + This feature lets the kernel detect RAID partitions on bootup.
15279 + An autodetected RAID partition is a normal partition with partition
15280 + type 0xfd. Use this if you want to boot from RAID devices, or want
15281 + them to be started automatically.
15283 Linear (append) mode
15285 If you say Y here, then your multiple devices driver will be able to
15286 @@ -1093,6 +1100,21 @@
15287 Documentation/modules.txt.
15291 +Translucent Block Device Support (EXPERIMENTAL)
15292 +CONFIG_MD_TRANSLUCENT
15293 + DO NOT USE THIS STUFF YET!
15295 + Currently there is only a placeholder as the implementation
15296 + is not yet usable.
15298 +Hierarchical Storage Management support (EXPERIMENTAL)
15300 + DO NOT USE THIS STUFF YET!
15302 + I have released this so people can comment on the architecture,
15303 + but the user-space tools are still unusable so there is not much
15304 + you can do with this.
15306 Boot support (linear, striped)