1 --- linux/init/main.c.orig Fri Nov 23 11:17:45 2001
2 +++ linux/init/main.c Fri Nov 23 11:18:53 2001
4 #include <linux/utsname.h>
5 #include <linux/ioport.h>
6 #include <linux/init.h>
7 +#include <linux/raid/md.h>
8 #include <linux/smp_lock.h>
10 #include <linux/hdreg.h>
11 @@ -1012,9 +1013,12 @@
13 { "ftape=", ftape_setup},
15 -#ifdef CONFIG_MD_BOOT
16 +#if defined(CONFIG_MD_BOOT) || defined(CONFIG_AUTODETECT_RAID)
19 +#ifdef CONFIG_BLK_DEV_MD
20 + { "raid=", raid_setup},
22 #ifdef CONFIG_ADBMOUSE
23 { "adb_buttons=", adb_mouse_setup },
26 while (pid != wait(&i));
27 if (MAJOR(real_root_dev) != RAMDISK_MAJOR
28 || MINOR(real_root_dev) != 0) {
29 +#ifdef CONFIG_BLK_DEV_MD
32 error = change_root(real_root_dev,"/initrd");
34 printk(KERN_ERR "Change root to /initrd: "
35 --- linux/include/linux/raid/hsm.h.orig Fri Nov 23 11:18:15 2001
36 +++ linux/include/linux/raid/hsm.h Fri Nov 23 11:18:15 2001
41 +#include <linux/raid/md.h>
44 +#error fix cpu_addr on Alpha first
47 +#include <linux/raid/hsm_p.h>
49 +#define index_pv(lv,index) ((lv)->vg->pv_array+(index)->data.phys_nr)
50 +#define index_dev(lv,index) index_pv((lv),(index))->dev
51 +#define index_block(lv,index) (index)->data.phys_block
52 +#define index_child(index) ((lv_lptr_t *)((index)->cpu_addr))
54 +#define ptr_to_cpuaddr(ptr) ((__u32) (ptr))
57 +typedef struct pv_bg_desc_s {
58 + unsigned int free_blocks;
59 + pv_block_group_t *bg;
62 +typedef struct pv_s pv_t;
63 +typedef struct vg_s vg_t;
64 +typedef struct lv_s lv_t;
71 + pv_bg_desc_t *bg_array;
79 + unsigned int max_indices;
80 + unsigned int free_indices;
81 + lv_lptr_t root_index;
89 + pv_t pv_array [MD_SB_DISKS];
92 + lv_t lv_array [HSM_MAX_LVS_PER_VG];
98 +#define kdev_to_lv(dev) ((lv_t *) mddev_map[MINOR(dev)].data)
99 +#define mddev_to_vg(mddev) ((vg_t *) mddev->private)
103 --- linux/include/linux/raid/hsm_p.h.orig Fri Nov 23 11:18:15 2001
104 +++ linux/include/linux/raid/hsm_p.h Fri Nov 23 11:18:15 2001
109 +#define HSM_BLOCKSIZE 4096
110 +#define HSM_BLOCKSIZE_WORDS (HSM_BLOCKSIZE/4)
111 +#define PACKED __attribute__ ((packed))
114 + * Identifies a block in physical space
116 +typedef struct phys_idx_s {
120 +} PACKED phys_idx_t;
123 + * Identifies a block in logical space
125 +typedef struct log_idx_s {
134 +#define HSM_PV_SB_MAGIC 0xf091ae9fU
136 +#define HSM_PV_SB_GENERIC_WORDS 32
137 +#define HSM_PV_SB_RESERVED_WORDS \
138 + (HSM_BLOCKSIZE_WORDS - HSM_PV_SB_GENERIC_WORDS)
141 + * On-disk PV identification data, on block 0 in any PV.
143 +typedef struct pv_sb_s
145 + __u32 pv_magic; /* 0 */
147 + __u32 pv_uuid0; /* 1 */
148 + __u32 pv_uuid1; /* 2 */
149 + __u32 pv_uuid2; /* 3 */
150 + __u32 pv_uuid3; /* 4 */
152 + __u32 pv_major; /* 5 */
153 + __u32 pv_minor; /* 6 */
154 + __u32 pv_patch; /* 7 */
156 + __u32 pv_ctime; /* 8 Creation time */
158 + __u32 pv_total_size; /* 9 size of this PV, in blocks */
159 + __u32 pv_first_free; /* 10 first free block */
160 + __u32 pv_first_used; /* 11 first used block */
161 + __u32 pv_blocks_left; /* 12 unallocated blocks */
162 + __u32 pv_bg_size; /* 13 size of a block group, in blocks */
163 + __u32 pv_block_size; /* 14 size of blocks, in bytes */
164 + __u32 pv_pptr_size; /* 15 size of block descriptor, in bytes */
165 + __u32 pv_block_groups; /* 16 number of block groups */
167 + __u32 __reserved1[HSM_PV_SB_GENERIC_WORDS - 17];
172 + __u32 __reserved2[HSM_PV_SB_RESERVED_WORDS];
177 + * this is pretty much arbitrary, but has to be less than ~64
179 +#define HSM_MAX_LVS_PER_VG 32
181 +#define HSM_VG_SB_GENERIC_WORDS 32
183 +#define LV_DESCRIPTOR_WORDS 8
184 +#define HSM_VG_SB_RESERVED_WORDS (HSM_BLOCKSIZE_WORDS - \
185 + LV_DESCRIPTOR_WORDS*HSM_MAX_LVS_PER_VG - HSM_VG_SB_GENERIC_WORDS)
187 +#if (HSM_PV_SB_RESERVED_WORDS < 0)
188 +#error you messed this one up dude ...
191 +typedef struct lv_descriptor_s
193 + __u32 lv_id; /* 0 */
194 + phys_idx_t lv_root_idx; /* 1 */
195 + __u16 __reserved; /* 2 */
196 + __u32 lv_max_indices; /* 3 */
197 + __u32 lv_free_indices; /* 4 */
198 + __u32 md_id; /* 5 */
200 + __u32 reserved[LV_DESCRIPTOR_WORDS - 6];
202 +} PACKED lv_descriptor_t;
204 +#define HSM_VG_SB_MAGIC 0x98320d7aU
206 + * On-disk VG identification data, in block 1 on all PVs
208 +typedef struct vg_sb_s
210 + __u32 vg_magic; /* 0 */
211 + __u32 nr_lvs; /* 1 */
213 + __u32 __reserved1[HSM_VG_SB_GENERIC_WORDS - 2];
215 + lv_descriptor_t lv_array [HSM_MAX_LVS_PER_VG];
219 + __u32 __reserved2[HSM_VG_SB_RESERVED_WORDS];
227 +#define HSM_LV_SB_MAGIC 0xe182bd8aU
229 +/* do we need lv_sb_t? */
231 +typedef struct lv_sb_s
234 + * On-disk LV identifier
236 + __u32 lv_magic; /* 0 LV identifier */
237 + __u32 lv_uuid0; /* 1 */
238 + __u32 lv_uuid1; /* 2 */
239 + __u32 lv_uuid2; /* 3 */
240 + __u32 lv_uuid3; /* 4 */
242 + __u32 lv_major; /* 5 PV identifier */
243 + __u32 lv_minor; /* 6 PV identifier */
244 + __u32 lv_patch; /* 7 PV identifier */
246 + __u32 ctime; /* 8 Creation time */
247 + __u32 size; /* 9 size of this LV, in blocks */
248 + phys_idx_t start; /* 10 position of root index block */
249 + log_idx_t first_free; /* 11-12 first free index */
254 + __u32 reserved[HSM_BLOCKSIZE_WORDS-13];
259 + * Pointer pointing from the physical space, points to
260 + * the LV owning this block. It also contains various
261 + * statistics about the physical block.
263 +typedef struct pv_pptr_s
269 + log_idx_t predicted;
270 + __u32 last_referenced;
283 +static __inline__ int pv_pptr_free (const pv_pptr_t * pptr)
285 + return !pptr->u.free.log_id;
289 +#define DATA_BLOCKS_PER_BG ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1))
291 +#define TOTAL_BLOCKS_PER_BG (DATA_BLOCKS_PER_BG+1)
293 + * A table of pointers filling up a single block, managing
294 + * the next DATA_BLOCKS_PER_BG physical blocks. Such block
295 + * groups form the physical space of blocks.
297 +typedef struct pv_block_group_s
299 + __u8 used_bitmap[(DATA_BLOCKS_PER_BG+7)/8];
301 + pv_pptr_t blocks[DATA_BLOCKS_PER_BG];
303 +} PACKED pv_block_group_t;
306 + * Pointer from the logical space, points to
307 + * the (PV,block) containing this logical block
309 +typedef struct lv_lptr_s
318 +static __inline__ int index_free (const lv_lptr_t * index)
320 + return !index->data.phys_block;
323 +static __inline__ int index_present (const lv_lptr_t * index)
325 + return index->cpu_addr;
329 +#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t))
331 + * A table of pointers filling up a single block, managing
332 + * HSM_LPTRS_PER_BLOCK logical blocks. Such block groups form
333 + * the logical space of blocks.
335 +typedef struct lv_index_block_s
337 + lv_lptr_t blocks[HSM_LPTRS_PER_BLOCK];
339 +} PACKED lv_index_block_t;
343 --- linux/include/linux/raid/linear.h.orig Fri Nov 23 11:18:15 2001
344 +++ linux/include/linux/raid/linear.h Fri Nov 23 11:18:15 2001
349 +#include <linux/raid/md.h>
354 + unsigned int offset;
357 +typedef struct dev_info dev_info_t;
361 + dev_info_t *dev0, *dev1;
364 +struct linear_private_data
366 + struct linear_hash *hash_table;
367 + dev_info_t disks[MD_SB_DISKS];
368 + dev_info_t *smallest;
373 +typedef struct linear_private_data linear_conf_t;
375 +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
378 --- linux/include/linux/raid/md.h.orig Fri Nov 23 11:18:15 2001
379 +++ linux/include/linux/raid/md.h Fri Nov 23 11:18:15 2001
382 + md.h : Multiple Devices driver for Linux
383 + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
384 + Copyright (C) 1994-96 Marc ZYNGIER
385 + <zyngier@ufr-info-p7.ibp.fr> or
386 + <maz@gloups.fdn.fr>
388 + This program is free software; you can redistribute it and/or modify
389 + it under the terms of the GNU General Public License as published by
390 + the Free Software Foundation; either version 2, or (at your option)
393 + You should have received a copy of the GNU General Public License
394 + (for example /usr/src/linux/COPYING); if not, write to the Free
395 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
401 +#include <linux/mm.h>
402 +#include <linux/fs.h>
403 +#include <linux/blkdev.h>
404 +#include <asm/semaphore.h>
405 +#include <linux/major.h>
406 +#include <linux/ioctl.h>
407 +#include <linux/types.h>
408 +#include <asm/bitops.h>
409 +#include <linux/module.h>
410 +#include <linux/mm.h>
411 +#include <linux/hdreg.h>
412 +#include <linux/sysctl.h>
413 +#include <linux/fs.h>
414 +#include <linux/proc_fs.h>
415 +#include <linux/smp_lock.h>
416 +#include <linux/delay.h>
417 +#include <net/checksum.h>
418 +#include <linux/random.h>
419 +#include <linux/locks.h>
422 +#include <linux/raid/md_compatible.h>
424 + * 'md_p.h' holds the 'physical' layout of RAID devices
425 + * 'md_u.h' holds the user <=> kernel API
427 + * 'md_k.h' holds kernel internal definitions
430 +#include <linux/raid/md_p.h>
431 +#include <linux/raid/md_u.h>
432 +#include <linux/raid/md_k.h>
435 + * Different major versions are not compatible.
436 + * Different minor versions are only downward compatible.
437 + * Different patchlevel versions are downward and upward compatible.
439 +#define MD_MAJOR_VERSION 0
440 +#define MD_MINOR_VERSION 90
441 +#define MD_PATCHLEVEL_VERSION 0
443 +extern int md_size[MAX_MD_DEVS];
444 +extern struct hd_struct md_hd_struct[MAX_MD_DEVS];
446 +extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
447 +extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev);
448 +extern char * partition_name (kdev_t dev);
449 +extern int register_md_personality (int p_num, mdk_personality_t *p);
450 +extern int unregister_md_personality (int p_num);
451 +extern mdk_thread_t * md_register_thread (void (*run) (void *data),
452 + void *data, const char *name);
453 +extern void md_unregister_thread (mdk_thread_t *thread);
454 +extern void md_wakeup_thread(mdk_thread_t *thread);
455 +extern void md_interrupt_thread (mdk_thread_t *thread);
456 +extern int md_update_sb (mddev_t *mddev);
457 +extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
458 +extern void md_recover_arrays (void);
459 +extern int md_check_ordering (mddev_t *mddev);
460 +extern void autodetect_raid(void);
461 +extern struct gendisk * find_gendisk (kdev_t dev);
462 +extern int md_notify_reboot(struct notifier_block *this,
463 + unsigned long code, void *x);
464 +#ifdef CONFIG_BLK_DEV_MD
465 +extern void raid_setup(char *str,int *ints) md__init;
467 +#ifdef CONFIG_MD_BOOT
468 +extern void md_setup(char *str,int *ints) md__init;
471 +extern void md_print_devices (void);
473 +#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
477 --- linux/include/linux/raid/md_compatible.h.orig Fri Nov 23 11:18:16 2001
478 +++ linux/include/linux/raid/md_compatible.h Fri Nov 23 11:18:16 2001
482 + md.h : Multiple Devices driver compatibility layer for Linux 2.0/2.2
483 + Copyright (C) 1998 Ingo Molnar
485 + This program is free software; you can redistribute it and/or modify
486 + it under the terms of the GNU General Public License as published by
487 + the Free Software Foundation; either version 2, or (at your option)
490 + You should have received a copy of the GNU General Public License
491 + (for example /usr/src/linux/COPYING); if not, write to the Free
492 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
495 +#include <linux/version.h>
497 +#ifndef _MD_COMPATIBLE_H
498 +#define _MD_COMPATIBLE_H
500 +#define LinuxVersionCode(v, p, s) (((v)<<16)+((p)<<8)+(s))
502 +#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
505 +#define md__get_free_pages(x,y) __get_free_pages(x,y,GFP_KERNEL)
509 +extern __inline__ int md_cpu_has_mmx(void)
511 + return x86_capability & 0x00800000;
516 +#define md_clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
520 + * someone please suggest a sane compatibility layer for modules
522 +#define MD_EXPORT_SYMBOL(x)
525 +static inline unsigned long
526 +md_copy_from_user(void *to, const void *from, unsigned long n)
530 + err = verify_area(VERIFY_READ,from,n);
532 + memcpy_fromfs(to, from, n);
537 +extern inline unsigned long
538 +md_copy_to_user(void *to, const void *from, unsigned long n)
542 + err = verify_area(VERIFY_WRITE,to,n);
544 + memcpy_tofs(to, from, n);
549 +#define md_put_user(x,ptr) \
553 + __err = verify_area(VERIFY_WRITE,ptr,sizeof(*ptr)); \
560 +extern inline int md_capable_admin(void)
566 +#define MD_FILE_TO_INODE(file) ((file)->f_inode)
569 +extern inline void md_flush_signals (void)
571 + current->signal = 0;
575 +#define __S(nr) (1<<((nr)-1))
576 +extern inline void md_init_signals (void)
578 + current->exit_signal = SIGCHLD;
579 + current->blocked = ~(__S(SIGKILL));
584 +extern inline unsigned long md_signal_pending (struct task_struct * tsk)
586 + return (tsk->signal & ~tsk->blocked);
590 +#define md_set_global_readahead(x) read_ahead[MD_MAJOR] = MD_READAHEAD
593 +#define md_mdelay(n) (\
594 + {unsigned long msec=(n); while (msec--) udelay(1000);})
597 +#define MD_SYS_DOWN 0
598 +#define MD_SYS_HALT 0
599 +#define MD_SYS_POWER_OFF 0
602 +#define md_register_reboot_notifier(x)
605 +extern __inline__ unsigned long
606 +md_test_and_set_bit(int nr, void * addr)
608 + unsigned long flags;
609 + unsigned long oldbit;
613 + oldbit = test_bit(nr,addr);
615 + restore_flags(flags);
620 +extern __inline__ unsigned long
621 +md_test_and_clear_bit(int nr, void * addr)
623 + unsigned long flags;
624 + unsigned long oldbit;
628 + oldbit = test_bit(nr,addr);
629 + clear_bit(nr,addr);
630 + restore_flags(flags);
635 +#define md_atomic_read(x) (*(volatile int *)(x))
636 +#define md_atomic_set(x,y) (*(volatile int *)(x) = (y))
639 +extern __inline__ void md_lock_kernel (void)
647 +extern __inline__ void md_unlock_kernel (void)
657 +#define md__initdata
658 +#define md__initfunc(__arginit) __arginit
664 +struct md_list_head {
665 + struct md_list_head *next, *prev;
668 +#define MD_LIST_HEAD(name) \
669 + struct md_list_head name = { &name, &name }
671 +#define MD_INIT_LIST_HEAD(ptr) do { \
672 + (ptr)->next = (ptr); (ptr)->prev = (ptr); \
675 +static __inline__ void md__list_add(struct md_list_head * new,
676 + struct md_list_head * prev,
677 + struct md_list_head * next)
685 +static __inline__ void md_list_add(struct md_list_head *new,
686 + struct md_list_head *head)
688 + md__list_add(new, head, head->next);
691 +static __inline__ void md__list_del(struct md_list_head * prev,
692 + struct md_list_head * next)
698 +static __inline__ void md_list_del(struct md_list_head *entry)
700 + md__list_del(entry->prev, entry->next);
703 +static __inline__ int md_list_empty(struct md_list_head *head)
705 + return head->next == head;
708 +#define md_list_entry(ptr, type, member) \
709 + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
713 +static __inline__ signed long md_schedule_timeout(signed long timeout)
715 + current->timeout = jiffies + timeout;
721 +#define md_need_resched(tsk) (need_resched)
724 +typedef struct { int gcc_is_buggy; } md_spinlock_t;
725 +#define MD_SPIN_LOCK_UNLOCKED (md_spinlock_t) { 0 }
727 +#define md_spin_lock_irq cli
728 +#define md_spin_unlock_irq sti
729 +#define md_spin_unlock_irqrestore(x,flags) restore_flags(flags)
730 +#define md_spin_lock_irqsave(x,flags) do { save_flags(flags); cli(); } while (0)
736 +#include <linux/reboot.h>
737 +#include <linux/vmalloc.h>
740 +#define md__get_free_pages(x,y) __get_free_pages(x,y)
744 +extern __inline__ int md_cpu_has_mmx(void)
746 + return boot_cpu_data.x86_capability & X86_FEATURE_MMX;
751 +#define md_clear_page(page) clear_page(page)
754 +#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x)
757 +#define md_copy_to_user(x,y,z) copy_to_user(x,y,z)
760 +#define md_copy_from_user(x,y,z) copy_from_user(x,y,z)
763 +#define md_put_user put_user
766 +extern inline int md_capable_admin(void)
768 + return capable(CAP_SYS_ADMIN);
772 +#define MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode)
775 +extern inline void md_flush_signals (void)
777 +	spin_lock(&current->sigmask_lock);
778 + flush_signals(current);
779 +	spin_unlock(&current->sigmask_lock);
783 +extern inline void md_init_signals (void)
785 + current->exit_signal = SIGCHLD;
786 +	siginitsetinv(&current->blocked, sigmask(SIGKILL));
790 +#define md_signal_pending signal_pending
793 +extern inline void md_set_global_readahead(int * table)
795 + max_readahead[MD_MAJOR] = table;
799 +#define md_mdelay(x) mdelay(x)
802 +#define MD_SYS_DOWN SYS_DOWN
803 +#define MD_SYS_HALT SYS_HALT
804 +#define MD_SYS_POWER_OFF SYS_POWER_OFF
807 +#define md_register_reboot_notifier register_reboot_notifier
810 +#define md_test_and_set_bit test_and_set_bit
813 +#define md_test_and_clear_bit test_and_clear_bit
816 +#define md_atomic_read atomic_read
817 +#define md_atomic_set atomic_set
820 +#define md_lock_kernel lock_kernel
821 +#define md_unlock_kernel unlock_kernel
825 +#include <linux/init.h>
827 +#define md__init __init
828 +#define md__initdata __initdata
829 +#define md__initfunc(__arginit) __initfunc(__arginit)
836 +#define md_list_head list_head
837 +#define MD_LIST_HEAD(name) LIST_HEAD(name)
838 +#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr)
839 +#define md_list_add list_add
840 +#define md_list_del list_del
841 +#define md_list_empty list_empty
843 +#define md_list_entry(ptr, type, member) list_entry(ptr, type, member)
847 +#define md_schedule_timeout schedule_timeout
850 +#define md_need_resched(tsk) ((tsk)->need_resched)
853 +#define md_spinlock_t spinlock_t
854 +#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED
856 +#define md_spin_lock_irq spin_lock_irq
857 +#define md_spin_unlock_irq spin_unlock_irq
858 +#define md_spin_unlock_irqrestore spin_unlock_irqrestore
859 +#define md_spin_lock_irqsave spin_lock_irqsave
865 +#endif /* _MD_COMPATIBLE_H */
867 --- linux/include/linux/raid/md_k.h.orig Fri Nov 23 11:18:16 2001
868 +++ linux/include/linux/raid/md_k.h Fri Nov 23 11:18:16 2001
871 + md_k.h : kernel internal structure of the Linux MD driver
872 + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
874 + This program is free software; you can redistribute it and/or modify
875 + it under the terms of the GNU General Public License as published by
876 + the Free Software Foundation; either version 2, or (at your option)
879 + You should have received a copy of the GNU General Public License
880 + (for example /usr/src/linux/COPYING); if not, write to the Free
881 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
887 +#define MD_RESERVED 0UL
890 +#define RAID0 STRIPED
893 +#define TRANSLUCENT 5UL
895 +#define MAX_PERSONALITY 7UL
897 +extern inline int pers_to_level (int pers)
900 + case HSM: return -3;
901 + case TRANSLUCENT: return -2;
902 + case LINEAR: return -1;
903 + case RAID0: return 0;
904 + case RAID1: return 1;
905 + case RAID5: return 5;
907 + panic("pers_to_level()");
910 +extern inline int level_to_pers (int level)
913 + case -3: return HSM;
914 + case -2: return TRANSLUCENT;
915 + case -1: return LINEAR;
916 + case 0: return RAID0;
917 + case 1: return RAID1;
919 + case 5: return RAID5;
921 + return MD_RESERVED;
924 +typedef struct mddev_s mddev_t;
925 +typedef struct mdk_rdev_s mdk_rdev_t;
927 +#if (MINORBITS != 8)
928 +#error MD doesnt handle bigger kdev yet
931 +#define MAX_REAL 12 /* Max number of disks per md dev */
932 +#define MAX_MD_DEVS (1<<MINORBITS) /* Max number of md dev */
935 + * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
936 + * the personality. (eg. HSM uses this to identify individual LVs)
938 +typedef struct dev_mapping_s {
943 +extern dev_mapping_t mddev_map [MAX_MD_DEVS];
945 +extern inline mddev_t * kdev_to_mddev (kdev_t dev)
947 + return mddev_map[MINOR(dev)].mddev;
951 + * options passed in raidrun:
954 +#define MAX_CHUNK_SIZE (4096*1024)
957 + * default readahead
959 +#define MD_READAHEAD (256 * 512)
961 +extern inline int disk_faulty(mdp_disk_t * d)
963 + return d->state & (1 << MD_DISK_FAULTY);
966 +extern inline int disk_active(mdp_disk_t * d)
968 + return d->state & (1 << MD_DISK_ACTIVE);
971 +extern inline int disk_sync(mdp_disk_t * d)
973 + return d->state & (1 << MD_DISK_SYNC);
976 +extern inline int disk_spare(mdp_disk_t * d)
978 + return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
981 +extern inline int disk_removed(mdp_disk_t * d)
983 + return d->state & (1 << MD_DISK_REMOVED);
986 +extern inline void mark_disk_faulty(mdp_disk_t * d)
988 + d->state |= (1 << MD_DISK_FAULTY);
991 +extern inline void mark_disk_active(mdp_disk_t * d)
993 + d->state |= (1 << MD_DISK_ACTIVE);
996 +extern inline void mark_disk_sync(mdp_disk_t * d)
998 + d->state |= (1 << MD_DISK_SYNC);
1001 +extern inline void mark_disk_spare(mdp_disk_t * d)
1006 +extern inline void mark_disk_removed(mdp_disk_t * d)
1008 + d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
1011 +extern inline void mark_disk_inactive(mdp_disk_t * d)
1013 + d->state &= ~(1 << MD_DISK_ACTIVE);
1016 +extern inline void mark_disk_nonsync(mdp_disk_t * d)
1018 + d->state &= ~(1 << MD_DISK_SYNC);
1022 + * MD's 'extended' device
1026 + struct md_list_head same_set; /* RAID devices within the same set */
1027 + struct md_list_head all; /* all RAID devices */
1028 + struct md_list_head pending; /* undetected RAID devices */
1030 + kdev_t dev; /* Device number */
1031 + kdev_t old_dev; /* "" when it was last imported */
1032 + int size; /* Device size (in blocks) */
1033 + mddev_t *mddev; /* RAID array if running */
1034 + unsigned long last_events; /* IO event timestamp */
1036 + struct inode *inode; /* Lock inode */
1037 + struct file filp; /* Lock file */
1042 + int faulty; /* if faulty do not issue IO requests */
1043 + int desc_nr; /* descriptor index in the superblock */
1048 + * disk operations in a working array:
1050 +#define DISKOP_SPARE_INACTIVE 0
1051 +#define DISKOP_SPARE_WRITE 1
1052 +#define DISKOP_SPARE_ACTIVE 2
1053 +#define DISKOP_HOT_REMOVE_DISK 3
1054 +#define DISKOP_HOT_ADD_DISK 4
1056 +typedef struct mdk_personality_s mdk_personality_t;
1061 + mdk_personality_t *pers;
1065 + struct md_list_head disks;
1067 + mdu_param_t param;
1069 + unsigned int curr_resync;
1070 + unsigned long resync_start;
1072 + int recovery_running;
1073 + struct semaphore reconfig_sem;
1074 + struct semaphore recovery_sem;
1075 + struct semaphore resync_sem;
1076 + struct md_list_head all_mddevs;
1079 +struct mdk_personality_s
1082 + int (*map)(mddev_t *mddev, kdev_t dev, kdev_t *rdev,
1083 + unsigned long *rsector, unsigned long size);
1084 + int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh);
1085 + void (*end_request)(struct buffer_head * bh, int uptodate);
1086 + int (*run)(mddev_t *mddev);
1087 + int (*stop)(mddev_t *mddev);
1088 + int (*status)(char *page, mddev_t *mddev);
1089 + int (*ioctl)(struct inode *inode, struct file *file,
1090 + unsigned int cmd, unsigned long arg);
1091 + int max_invalid_dev;
1092 + int (*error_handler)(mddev_t *mddev, kdev_t dev);
1095 + * Some personalities (RAID-1, RAID-5) can have disks hot-added and
1096 + * hot-removed. Hot removal is different from failure. (failure marks
1097 + * a disk inactive, but the disk is still part of the array) The interface
1098 + * to such operations is the 'pers->diskop()' function, can be NULL.
1100 + * the diskop function can change the pointer pointing to the incoming
1101 + * descriptor, but must do so very carefully. (currently only
1102 + * SPARE_ACTIVE expects such a change)
1104 + int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
1106 + int (*stop_resync)(mddev_t *mddev);
1107 + int (*restart_resync)(mddev_t *mddev);
1112 + * Currently we index md_array directly, based on the minor
1113 + * number. This will have to change to dynamic allocation
1114 + * once we start supporting partitioning of md devices.
1116 +extern inline int mdidx (mddev_t * mddev)
1118 + return mddev->__minor;
1121 +extern inline kdev_t mddev_to_kdev(mddev_t * mddev)
1123 + return MKDEV(MD_MAJOR, mdidx(mddev));
1126 +extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev);
1127 +extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
1130 + * iterates through some rdev ringlist. It's safe to remove the
1131 + * current 'rdev'. Dont touch 'tmp' though.
1133 +#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \
1135 + for (tmp = head.next; \
1136 + rdev = md_list_entry(tmp, mdk_rdev_t, field), \
1137 + tmp = tmp->next, tmp->prev != &head \
1140 + * iterates through the 'same array disks' ringlist
1142 +#define ITERATE_RDEV(mddev,rdev,tmp) \
1143 + ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
1146 + * Same as above, but assumes that the device has rdev->desc_nr numbered
1147 + * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order.
1149 +#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \
1150 + for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
1154 + * Iterates through all 'RAID managed disks'
1156 +#define ITERATE_RDEV_ALL(rdev,tmp) \
1157 + ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
1160 + * Iterates through 'pending RAID disks'
1162 +#define ITERATE_RDEV_PENDING(rdev,tmp) \
1163 + ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
1166 + * iterates through all used mddevs in the system.
1168 +#define ITERATE_MDDEV(mddev,tmp) \
1170 + for (tmp = all_mddevs.next; \
1171 + mddev = md_list_entry(tmp, mddev_t, all_mddevs), \
1172 + tmp = tmp->next, tmp->prev != &all_mddevs \
1175 +extern inline int lock_mddev (mddev_t * mddev)
1177 + return down_interruptible(&mddev->reconfig_sem);
1180 +extern inline void unlock_mddev (mddev_t * mddev)
1182 + up(&mddev->reconfig_sem);
1185 +#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
1186 + x = y; y = __tmp; } while (0)
1188 +typedef struct mdk_thread_s {
1189 + void (*run) (void *data);
1191 + struct wait_queue *wqueue;
1192 + unsigned long flags;
1193 + struct semaphore *sem;
1194 + struct task_struct *tsk;
1198 +#define THREAD_WAKEUP 0
1200 +typedef struct dev_name_s {
1201 + struct md_list_head list;
1203 + char name [MAX_DISKNAME_LEN];
1208 --- linux/include/linux/raid/md_p.h.orig Fri Nov 23 11:18:16 2001
1209 +++ linux/include/linux/raid/md_p.h Fri Nov 23 11:18:16 2001
1212 + md_p.h : physical layout of Linux RAID devices
1213 + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
1215 + This program is free software; you can redistribute it and/or modify
1216 + it under the terms of the GNU General Public License as published by
1217 + the Free Software Foundation; either version 2, or (at your option)
1218 + any later version.
1220 + You should have received a copy of the GNU General Public License
1221 + (for example /usr/src/linux/COPYING); if not, write to the Free
1222 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1229 + * RAID superblock.
1231 + * The RAID superblock maintains some statistics on each RAID configuration.
1232 + * Each real device in the RAID set contains it near the end of the device.
1233 + * Some of the ideas are copied from the ext2fs implementation.
1235 + * We currently use 4096 bytes as follows:
1237 + * word offset function
1239 + * 0 - 31 Constant generic RAID device information.
1240 + * 32 - 63 Generic state information.
1241 + * 64 - 127 Personality specific information.
1242 + * 128 - 511 12 32-words descriptors of the disks in the raid set.
1243 + * 512 - 911 Reserved.
1244 + * 912 - 1023 Disk specific descriptor.
1248 + * If x is the real device size in bytes, we return an apparent size of:
1250 + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
1252 + * and place the 4kB superblock at offset y.
1254 +#define MD_RESERVED_BYTES (64 * 1024)
1255 +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
1256 +#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
1258 +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
1259 +#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
1261 +#define MD_SB_BYTES 4096
1262 +#define MD_SB_WORDS (MD_SB_BYTES / 4)
1263 +#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
1264 +#define MD_SB_SECTORS (MD_SB_BYTES / 512)
1267 + * The following are counted in 32-bit words
1269 +#define MD_SB_GENERIC_OFFSET 0
1270 +#define MD_SB_PERSONALITY_OFFSET 64
1271 +#define MD_SB_DISKS_OFFSET 128
1272 +#define MD_SB_DESCRIPTOR_OFFSET 992
1274 +#define MD_SB_GENERIC_CONSTANT_WORDS 32
1275 +#define MD_SB_GENERIC_STATE_WORDS 32
1276 +#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
1277 +#define MD_SB_PERSONALITY_WORDS 64
1278 +#define MD_SB_DISKS_WORDS 384
1279 +#define MD_SB_DESCRIPTOR_WORDS 32
1280 +#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
1281 +#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
1282 +#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
1285 + * Device "operational" state bits
1287 +#define MD_DISK_FAULTY 0 /* disk is faulty / operational */
1288 +#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */
1289 +#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
1290 +#define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */
1292 +typedef struct mdp_device_descriptor_s {
1293 + __u32 number; /* 0 Device number in the entire set */
1294 + __u32 major; /* 1 Device major number */
1295 + __u32 minor; /* 2 Device minor number */
1296 + __u32 raid_disk; /* 3 The role of the device in the raid set */
1297 + __u32 state; /* 4 Operational state */
1298 + __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
1301 +#define MD_SB_MAGIC 0xa92b4efc
1304 + * Superblock state bits
1306 +#define MD_SB_CLEAN 0
1307 +#define MD_SB_ERRORS 1
1309 +typedef struct mdp_superblock_s {
1311 + * Constant generic information
1313 + __u32 md_magic; /* 0 MD identifier */
1314 + __u32 major_version; /* 1 major version to which the set conforms */
1315 + __u32 minor_version; /* 2 minor version ... */
1316 + __u32 patch_version; /* 3 patchlevel version ... */
1317 + __u32 gvalid_words; /* 4 Number of used words in this section */
1318 + __u32 set_uuid0; /* 5 Raid set identifier */
1319 + __u32 ctime; /* 6 Creation time */
1320 + __u32 level; /* 7 Raid personality */
1321 + __u32 size; /* 8 Apparent size of each individual disk */
1322 + __u32 nr_disks; /* 9 total disks in the raid set */
1323 + __u32 raid_disks; /* 10 disks in a fully functional raid set */
1324 + __u32 md_minor; /* 11 preferred MD minor device number */
1325 + __u32 not_persistent; /* 12 does it have a persistent superblock */
1326 + __u32 set_uuid1; /* 13 Raid set identifier #2 */
1327 + __u32 set_uuid2; /* 14 Raid set identifier #3 */
1328 + __u32 set_uuid3; /* 14 Raid set identifier #4 */
1329 + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
1332 + * Generic state information
1334 + __u32 utime; /* 0 Superblock update time */
1335 + __u32 state; /* 1 State bits (clean, ...) */
1336 + __u32 active_disks; /* 2 Number of currently active disks */
1337 + __u32 working_disks; /* 3 Number of working disks */
1338 + __u32 failed_disks; /* 4 Number of failed disks */
1339 + __u32 spare_disks; /* 5 Number of spare disks */
1340 + __u32 sb_csum; /* 6 checksum of the whole superblock */
1341 + __u64 events; /* 7 number of superblock updates (64-bit!) */
1342 + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9];
1345 + * Personality information
1347 + __u32 layout; /* 0 the array's physical layout */
1348 + __u32 chunk_size; /* 1 chunk size in bytes */
1349 + __u32 root_pv; /* 2 LV root PV */
1350 + __u32 root_block; /* 3 LV root block */
1351 + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
1354 + * Disks information
1356 + mdp_disk_t disks[MD_SB_DISKS];
1361 + __u32 reserved[MD_SB_RESERVED_WORDS];
1364 + * Active descriptor
1366 + mdp_disk_t this_disk;
1372 --- linux/include/linux/raid/md_u.h.orig Fri Nov 23 11:18:16 2001
1373 +++ linux/include/linux/raid/md_u.h Fri Nov 23 11:18:16 2001
1376 + md_u.h : user <=> kernel API between Linux raidtools and RAID drivers
1377 + Copyright (C) 1998 Ingo Molnar
1379 + This program is free software; you can redistribute it and/or modify
1380 + it under the terms of the GNU General Public License as published by
1381 + the Free Software Foundation; either version 2, or (at your option)
1382 + any later version.
1384 + You should have received a copy of the GNU General Public License
1385 + (for example /usr/src/linux/COPYING); if not, write to the Free
1386 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1395 +#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t)
1396 +#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t)
1397 +#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t)
1398 +#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13)
1400 +/* configuration */
1401 +#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20)
1402 +#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t)
1403 +#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22)
1404 +#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t)
1405 +#define SET_DISK_INFO _IO (MD_MAJOR, 0x24)
1406 +#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25)
1407 +#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26)
1408 +#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27)
1409 +#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28)
1410 +#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29)
1413 +#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t)
1414 +#define START_ARRAY _IO (MD_MAJOR, 0x31)
1415 +#define STOP_ARRAY _IO (MD_MAJOR, 0x32)
1416 +#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33)
1417 +#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34)
1419 +typedef struct mdu_version_s {
1425 +typedef struct mdu_array_info_s {
1427 + * Generic constant information
1429 + int major_version;
1430 + int minor_version;
1431 + int patch_version;
1438 + int not_persistent;
1441 + * Generic state information
1443 + int utime; /* 0 Superblock update time */
1444 + int state; /* 1 State bits (clean, ...) */
1445 + int active_disks; /* 2 Number of currently active disks */
1446 + int working_disks; /* 3 Number of working disks */
1447 + int failed_disks; /* 4 Number of failed disks */
1448 + int spare_disks; /* 5 Number of spare disks */
1451 + * Personality information
1453 + int layout; /* 0 the array's physical layout */
1454 + int chunk_size; /* 1 chunk size in bytes */
1456 +} mdu_array_info_t;
1458 +typedef struct mdu_disk_info_s {
1460 + * configuration/status of one particular disk
1470 +typedef struct mdu_start_info_s {
1472 + * configuration/status of one particular disk
1479 +} mdu_start_info_t;
1481 +typedef struct mdu_param_s
1483 + int personality; /* 1,2,3,4 */
1484 + int chunk_size; /* in bytes */
1485 + int max_fault; /* unused for now */
1490 --- linux/include/linux/raid/raid0.h.orig Fri Nov 23 11:18:16 2001
1491 +++ linux/include/linux/raid/raid0.h Fri Nov 23 11:18:16 2001
1496 +#include <linux/raid/md.h>
1500 + int zone_offset; /* Zone offset in md_dev */
1501 + int dev_offset; /* Zone offset in real dev */
1502 + int size; /* Zone size */
1503 + int nb_dev; /* # of devices attached to the zone */
1504 + mdk_rdev_t *dev[MAX_REAL]; /* Devices attached to the zone */
1509 + struct strip_zone *zone0, *zone1;
1512 +struct raid0_private_data
1514 + struct raid0_hash *hash_table; /* Dynamically allocated */
1515 + struct strip_zone *strip_zone; /* This one too */
1516 + int nr_strip_zones;
1517 + struct strip_zone *smallest;
1521 +typedef struct raid0_private_data raid0_conf_t;
1523 +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
1526 --- linux/include/linux/raid/raid1.h.orig Fri Nov 23 11:18:16 2001
1527 +++ linux/include/linux/raid/raid1.h Fri Nov 23 11:18:16 2001
1532 +#include <linux/raid/md.h>
1534 +struct mirror_info {
1551 +struct raid1_private_data {
1553 + struct mirror_info mirrors[MD_SB_DISKS];
1556 + int working_disks;
1558 + unsigned long next_sect;
1560 + mdk_thread_t *thread, *resync_thread;
1561 + int resync_mirrors;
1562 + struct mirror_info *spare;
1565 +typedef struct raid1_private_data raid1_conf_t;
1568 + * this is the only point in the RAID code where we violate
1569 + * C type safety. mddev->private is an 'opaque' pointer.
1571 +#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private)
1574 + * this is our 'private' 'collective' RAID1 buffer head.
1575 + * it contains information about what kind of IO operations were started
1576 + * for this RAID1 operation, and about their status:
1580 + atomic_t remaining; /* 'have we finished' count,
1581 + * used from IRQ handlers
1584 + unsigned long state;
1586 + struct buffer_head *master_bh;
1587 + struct buffer_head *mirror_bh [MD_SB_DISKS];
1588 + struct buffer_head bh_req;
1589 + struct buffer_head *next_retry;
1593 --- linux/include/linux/raid/raid5.h.orig Fri Nov 23 11:18:16 2001
1594 +++ linux/include/linux/raid/raid5.h Fri Nov 23 11:18:16 2001
1599 +#include <linux/raid/md.h>
1600 +#include <linux/raid/xor.h>
1612 +struct stripe_head {
1613 + md_spinlock_t stripe_lock;
1614 + struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
1615 + struct stripe_head *free_next; /* pool of free sh's */
1616 + struct buffer_head *buffer_pool; /* pool of free buffers */
1617 + struct buffer_head *bh_pool; /* pool of free bh's */
1618 + struct raid5_private_data *raid_conf;
1619 + struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
1620 + struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
1621 + struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
1622 + struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
1623 + int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
1624 + int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
1625 + unsigned long sector; /* sector of this row */
1626 + int size; /* buffers size */
1627 + int pd_idx; /* parity disk index */
1628 + atomic_t nr_pending; /* nr of pending cmds */
1629 + unsigned long state; /* state flags */
1630 + int cmd; /* stripe cmd */
1631 + int count; /* nr of waiters */
1632 + int write_method; /* reconstruct-write / read-modify-write */
1633 + int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
1634 + struct wait_queue *wait; /* processes waiting for this stripe */
1640 +#define PHASE_BEGIN 0
1641 +#define PHASE_READ_OLD 1
1642 +#define PHASE_WRITE 2
1643 +#define PHASE_READ 3
1644 +#define PHASE_COMPLETE 4
1649 +#define METHOD_NONE 0
1650 +#define RECONSTRUCT_WRITE 1
1651 +#define READ_MODIFY_WRITE 2
1656 +#define STRIPE_LOCKED 0
1657 +#define STRIPE_ERROR 1
1662 +#define STRIPE_NONE 0
1663 +#define STRIPE_WRITE 1
1664 +#define STRIPE_READ 2
1666 +struct raid5_private_data {
1667 + struct stripe_head **stripe_hashtbl;
1669 + mdk_thread_t *thread, *resync_thread;
1670 + struct disk_info disks[MD_SB_DISKS];
1671 + struct disk_info *spare;
1673 + int chunk_size, level, algorithm;
1674 + int raid_disks, working_disks, failed_disks;
1676 + unsigned long next_sector;
1677 + atomic_t nr_handle;
1678 + struct stripe_head *next_free_stripe;
1680 + int resync_parity;
1681 + int max_nr_stripes;
1683 + int nr_hashed_stripes;
1684 + int nr_locked_stripes;
1685 + int nr_pending_stripes;
1686 + int nr_cached_stripes;
1689 + * Free stripes pool
1692 + struct stripe_head *free_sh_list;
1693 + struct wait_queue *wait_for_stripe;
1696 +typedef struct raid5_private_data raid5_conf_t;
1698 +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
1701 + * Our supported algorithms
1703 +#define ALGORITHM_LEFT_ASYMMETRIC 0
1704 +#define ALGORITHM_RIGHT_ASYMMETRIC 1
1705 +#define ALGORITHM_LEFT_SYMMETRIC 2
1706 +#define ALGORITHM_RIGHT_SYMMETRIC 3
1709 --- linux/include/linux/raid/translucent.h.orig Fri Nov 23 11:18:16 2001
1710 +++ linux/include/linux/raid/translucent.h Fri Nov 23 11:18:16 2001
1712 +#ifndef _TRANSLUCENT_H
1713 +#define _TRANSLUCENT_H
1715 +#include <linux/raid/md.h>
1717 +typedef struct dev_info dev_info_t;
1724 +struct translucent_private_data
1726 + dev_info_t disks[MD_SB_DISKS];
1730 +typedef struct translucent_private_data translucent_conf_t;
1732 +#define mddev_to_conf(mddev) ((translucent_conf_t *) mddev->private)
1735 --- linux/include/linux/raid/xor.h.orig Fri Nov 23 11:18:16 2001
1736 +++ linux/include/linux/raid/xor.h Fri Nov 23 11:18:16 2001
1741 +#include <linux/raid/md.h>
1743 +#define MAX_XOR_BLOCKS 5
1745 +extern void calibrate_xor_block(void);
1746 +extern void (*xor_block)(unsigned int count,
1747 + struct buffer_head **bh_ptr);
1750 --- linux/include/linux/fs.h.orig Fri Nov 23 11:17:45 2001
1751 +++ linux/include/linux/fs.h Fri Nov 23 11:18:16 2001
1753 #define BH_Req 3 /* 0 if the buffer has been invalidated */
1754 #define BH_Protected 6 /* 1 if the buffer is protected */
1755 #define BH_Wait_IO 7 /* 1 if we should throttle on this buffer */
1756 +#define BH_LowPrio 8 /* 1 if the buffer is lowprio */
1759 * Try to keep the most commonly used fields in single cache lines (16
1761 extern void refile_buffer(struct buffer_head * buf);
1762 extern void set_writetime(struct buffer_head * buf, int flag);
1763 extern int try_to_free_buffers(struct page *, int);
1764 +extern void cache_drop_behind(struct buffer_head *bh);
1766 extern int nr_buffers;
1767 extern long buffermem;
1768 @@ -814,6 +816,25 @@
1772 +extern inline void mark_buffer_highprio(struct buffer_head * bh)
1774 + clear_bit(BH_LowPrio, &bh->b_state);
1777 +extern inline void mark_buffer_lowprio(struct buffer_head * bh)
1780 + * dirty buffers cannot be marked lowprio.
1782 + if (!buffer_dirty(bh))
1783 + set_bit(BH_LowPrio, &bh->b_state);
1786 +static inline int buffer_lowprio(struct buffer_head * bh)
1788 + return test_bit(BH_LowPrio, &bh->b_state);
1791 extern inline void mark_buffer_dirty(struct buffer_head * bh, int flag)
1793 if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1794 @@ -821,6 +842,23 @@
1795 if (bh->b_list != BUF_DIRTY)
1799 + * if a buffer gets marked dirty then it has to lose
1800 + * it's lowprio state.
1802 + mark_buffer_highprio(bh);
1805 +extern inline void mark_buffer_dirty_lowprio(struct buffer_head * bh)
1807 + if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1808 + if (bh->b_list != BUF_DIRTY)
1809 + refile_buffer(bh);
1811 + * Mark it lowprio only if it was not dirty before!
1813 + set_bit(BH_LowPrio, &bh->b_state);
1817 extern int check_disk_change(kdev_t dev);
1819 extern struct buffer_head * find_buffer(kdev_t dev, int block, int size);
1820 extern void ll_rw_block(int, int, struct buffer_head * bh[]);
1821 extern int is_read_only(kdev_t);
1822 +extern int is_device_idle(kdev_t);
1823 extern void __brelse(struct buffer_head *);
1824 extern inline void brelse(struct buffer_head *buf)
1826 @@ -914,8 +953,12 @@
1827 extern void set_blocksize(kdev_t dev, int size);
1828 extern unsigned int get_hardblocksize(kdev_t dev);
1829 extern struct buffer_head * bread(kdev_t dev, int block, int size);
1830 +extern struct buffer_head * buffer_ready (kdev_t dev, int block, int size);
1831 +extern void bread_ahead (kdev_t dev, int block, int size);
1832 extern struct buffer_head * breada(kdev_t dev,int block, int size,
1833 unsigned int pos, unsigned int filesize);
1834 +extern struct buffer_head * breada_blocks(kdev_t dev,int block,
1835 + int size, int blocks);
1837 extern int brw_page(int, struct page *, kdev_t, int [], int, int);
1839 --- linux/include/linux/blkdev.h.orig Mon Dec 11 01:49:44 2000
1840 +++ linux/include/linux/blkdev.h Fri Nov 23 11:18:16 2001
1842 extern void make_request(int major,int rw, struct buffer_head * bh);
1844 /* md needs this function to remap requests */
1845 -extern int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size);
1846 -extern int md_make_request (int minor, int rw, struct buffer_head * bh);
1847 +extern int md_map (kdev_t dev, kdev_t *rdev,
1848 + unsigned long *rsector, unsigned long size);
1849 +extern int md_make_request (struct buffer_head * bh, int rw);
1850 extern int md_error (kdev_t mddev, kdev_t rdev);
1852 extern int * blk_size[MAX_BLKDEV];
1853 --- linux/include/linux/md.h.orig Fri May 8 09:17:13 1998
1854 +++ linux/include/linux/md.h Fri Nov 23 11:18:18 2001
1857 - md.h : Multiple Devices driver for Linux
1858 - Copyright (C) 1994-96 Marc ZYNGIER
1859 - <zyngier@ufr-info-p7.ibp.fr> or
1860 - <maz@gloups.fdn.fr>
1862 - This program is free software; you can redistribute it and/or modify
1863 - it under the terms of the GNU General Public License as published by
1864 - the Free Software Foundation; either version 2, or (at your option)
1865 - any later version.
1867 - You should have received a copy of the GNU General Public License
1868 - (for example /usr/src/linux/COPYING); if not, write to the Free
1869 - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1875 -#include <linux/major.h>
1876 -#include <linux/ioctl.h>
1877 -#include <linux/types.h>
1880 - * Different major versions are not compatible.
1881 - * Different minor versions are only downward compatible.
1882 - * Different patchlevel versions are downward and upward compatible.
1884 -#define MD_MAJOR_VERSION 0
1885 -#define MD_MINOR_VERSION 36
1886 -#define MD_PATCHLEVEL_VERSION 6
1888 -#define MD_DEFAULT_DISK_READAHEAD (256 * 1024)
1891 -#define REGISTER_DEV _IO (MD_MAJOR, 1)
1892 -#define START_MD _IO (MD_MAJOR, 2)
1893 -#define STOP_MD _IO (MD_MAJOR, 3)
1894 -#define REGISTER_DEV_NEW _IO (MD_MAJOR, 4)
1898 - Byte 0 : Chunk size factor
1899 - Byte 1 : Fault tolerance count for each physical device
1900 - ( 0 means no fault tolerance,
1901 - 0xFF means always tolerate faults), not used by now.
1902 - Byte 2 : Personality
1903 - Byte 3 : Reserved.
1906 -#define FAULT_SHIFT 8
1907 -#define PERSONALITY_SHIFT 16
1909 -#define FACTOR_MASK 0x000000FFUL
1910 -#define FAULT_MASK 0x0000FF00UL
1911 -#define PERSONALITY_MASK 0x00FF0000UL
1913 -#define MD_RESERVED 0 /* Not used by now */
1914 -#define LINEAR (1UL << PERSONALITY_SHIFT)
1915 -#define STRIPED (2UL << PERSONALITY_SHIFT)
1916 -#define RAID0 STRIPED
1917 -#define RAID1 (3UL << PERSONALITY_SHIFT)
1918 -#define RAID5 (4UL << PERSONALITY_SHIFT)
1919 -#define MAX_PERSONALITY 5
1924 - * The MD superblock maintains some statistics on each MD configuration.
1925 - * Each real device in the MD set contains it near the end of the device.
1926 - * Some of the ideas are copied from the ext2fs implementation.
1928 - * We currently use 4096 bytes as follows:
1930 - * word offset function
1932 - * 0 - 31 Constant generic MD device information.
1933 - * 32 - 63 Generic state information.
1934 - * 64 - 127 Personality specific information.
1935 - * 128 - 511 12 32-words descriptors of the disks in the raid set.
1936 - * 512 - 911 Reserved.
1937 - * 912 - 1023 Disk specific descriptor.
1941 - * If x is the real device size in bytes, we return an apparent size of:
1943 - * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
1945 - * and place the 4kB superblock at offset y.
1947 -#define MD_RESERVED_BYTES (64 * 1024)
1948 -#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
1949 -#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
1951 -#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
1952 -#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
1954 -#define MD_SB_BYTES 4096
1955 -#define MD_SB_WORDS (MD_SB_BYTES / 4)
1956 -#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
1957 -#define MD_SB_SECTORS (MD_SB_BYTES / 512)
1960 - * The following are counted in 32-bit words
1962 -#define MD_SB_GENERIC_OFFSET 0
1963 -#define MD_SB_PERSONALITY_OFFSET 64
1964 -#define MD_SB_DISKS_OFFSET 128
1965 -#define MD_SB_DESCRIPTOR_OFFSET 992
1967 -#define MD_SB_GENERIC_CONSTANT_WORDS 32
1968 -#define MD_SB_GENERIC_STATE_WORDS 32
1969 -#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
1970 -#define MD_SB_PERSONALITY_WORDS 64
1971 -#define MD_SB_DISKS_WORDS 384
1972 -#define MD_SB_DESCRIPTOR_WORDS 32
1973 -#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
1974 -#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
1975 -#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS)
1978 - * Device "operational" state bits
1980 -#define MD_FAULTY_DEVICE 0 /* Device is faulty / operational */
1981 -#define MD_ACTIVE_DEVICE 1 /* Device is a part or the raid set / spare disk */
1982 -#define MD_SYNC_DEVICE 2 /* Device is in sync with the raid set */
1984 -typedef struct md_device_descriptor_s {
1985 - __u32 number; /* 0 Device number in the entire set */
1986 - __u32 major; /* 1 Device major number */
1987 - __u32 minor; /* 2 Device minor number */
1988 - __u32 raid_disk; /* 3 The role of the device in the raid set */
1989 - __u32 state; /* 4 Operational state */
1990 - __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
1993 -#define MD_SB_MAGIC 0xa92b4efc
1996 - * Superblock state bits
1998 -#define MD_SB_CLEAN 0
1999 -#define MD_SB_ERRORS 1
2001 -typedef struct md_superblock_s {
2004 - * Constant generic information
2006 - __u32 md_magic; /* 0 MD identifier */
2007 - __u32 major_version; /* 1 major version to which the set conforms */
2008 - __u32 minor_version; /* 2 minor version to which the set conforms */
2009 - __u32 patch_version; /* 3 patchlevel version to which the set conforms */
2010 - __u32 gvalid_words; /* 4 Number of non-reserved words in this section */
2011 - __u32 set_magic; /* 5 Raid set identifier */
2012 - __u32 ctime; /* 6 Creation time */
2013 - __u32 level; /* 7 Raid personality (mirroring, raid5, ...) */
2014 - __u32 size; /* 8 Apparent size of each individual disk, in kB */
2015 - __u32 nr_disks; /* 9 Number of total disks in the raid set */
2016 - __u32 raid_disks; /* 10 Number of disks in a fully functional raid set */
2017 - __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 11];
2020 - * Generic state information
2022 - __u32 utime; /* 0 Superblock update time */
2023 - __u32 state; /* 1 State bits (clean, ...) */
2024 - __u32 active_disks; /* 2 Number of currently active disks (some non-faulty disks might not be in sync) */
2025 - __u32 working_disks; /* 3 Number of working disks */
2026 - __u32 failed_disks; /* 4 Number of failed disks */
2027 - __u32 spare_disks; /* 5 Number of spare disks */
2028 - __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 6];
2031 - * Personality information
2033 - __u32 parity_algorithm;
2035 - __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 2];
2038 - * Disks information
2040 - md_descriptor_t disks[MD_SB_DISKS];
2045 - __u32 reserved[MD_SB_RESERVED_WORDS];
2048 - * Active descriptor
2050 - md_descriptor_t descriptor;
2055 -#include <linux/mm.h>
2056 -#include <linux/fs.h>
2057 -#include <linux/blkdev.h>
2058 -#include <asm/semaphore.h>
2061 - * Kernel-based reconstruction is mostly working, but still requires
2062 - * some additional work.
2064 -#define SUPPORT_RECONSTRUCTION 0
2066 -#define MAX_REAL 8 /* Max number of physical dev per md dev */
2067 -#define MAX_MD_DEV 4 /* Max number of md dev */
2069 -#define FACTOR(a) ((a)->repartition & FACTOR_MASK)
2070 -#define MAX_FAULT(a) (((a)->repartition & FAULT_MASK)>>8)
2071 -#define PERSONALITY(a) ((a)->repartition & PERSONALITY_MASK)
2073 -#define FACTOR_SHIFT(a) (PAGE_SHIFT + (a) - 10)
2077 - kdev_t dev; /* Device number */
2078 - int size; /* Device size (in blocks) */
2079 - int offset; /* Real device offset (in blocks) in md dev
2080 - (only used in linear mode) */
2081 - struct inode *inode; /* Lock inode */
2082 - md_superblock_t *sb;
2088 -#define SPARE_INACTIVE 0
2089 -#define SPARE_WRITE 1
2090 -#define SPARE_ACTIVE 2
2092 -struct md_personality
2095 - int (*map)(struct md_dev *mddev, kdev_t *rdev,
2096 - unsigned long *rsector, unsigned long size);
2097 - int (*make_request)(struct md_dev *mddev, int rw, struct buffer_head * bh);
2098 - void (*end_request)(struct buffer_head * bh, int uptodate);
2099 - int (*run)(int minor, struct md_dev *mddev);
2100 - int (*stop)(int minor, struct md_dev *mddev);
2101 - int (*status)(char *page, int minor, struct md_dev *mddev);
2102 - int (*ioctl)(struct inode *inode, struct file *file,
2103 - unsigned int cmd, unsigned long arg);
2104 - int max_invalid_dev;
2105 - int (*error_handler)(struct md_dev *mddev, kdev_t dev);
2108 - * Some personalities (RAID-1, RAID-5) can get disks hot-added and
2109 - * hot-removed. Hot removal is different from failure. (failure marks
2110 - * a disk inactive, but the disk is still part of the array)
2112 - int (*hot_add_disk) (struct md_dev *mddev, kdev_t dev);
2113 - int (*hot_remove_disk) (struct md_dev *mddev, kdev_t dev);
2114 - int (*mark_spare) (struct md_dev *mddev, md_descriptor_t *descriptor, int state);
2119 - struct real_dev devices[MAX_REAL];
2120 - struct md_personality *pers;
2121 - md_superblock_t *sb;
2130 - void (*run) (void *data);
2132 - struct wait_queue *wqueue;
2133 - unsigned long flags;
2134 - struct semaphore *sem;
2135 - struct task_struct *tsk;
2138 -#define THREAD_WAKEUP 0
2140 -extern struct md_dev md_dev[MAX_MD_DEV];
2141 -extern int md_size[MAX_MD_DEV];
2142 -extern int md_maxreadahead[MAX_MD_DEV];
2144 -extern char *partition_name (kdev_t dev);
2146 -extern int register_md_personality (int p_num, struct md_personality *p);
2147 -extern int unregister_md_personality (int p_num);
2148 -extern struct md_thread *md_register_thread (void (*run) (void *data), void *data);
2149 -extern void md_unregister_thread (struct md_thread *thread);
2150 -extern void md_wakeup_thread(struct md_thread *thread);
2151 -extern int md_update_sb (int minor);
2152 -extern int md_do_sync(struct md_dev *mddev);
2156 --- linux/include/linux/raid0.h.orig Tue Oct 29 14:20:24 1996
2157 +++ linux/include/linux/raid0.h Fri Nov 23 11:18:18 2001
2164 - int zone_offset; /* Zone offset in md_dev */
2165 - int dev_offset; /* Zone offset in real dev */
2166 - int size; /* Zone size */
2167 - int nb_dev; /* Number of devices attached to the zone */
2168 - struct real_dev *dev[MAX_REAL]; /* Devices attached to the zone */
2173 - struct strip_zone *zone0, *zone1;
2178 - struct raid0_hash *hash_table; /* Dynamically allocated */
2179 - struct strip_zone *strip_zone; /* This one too */
2180 - int nr_strip_zones;
2181 - struct strip_zone *smallest;
2186 --- linux/include/linux/raid1.h.orig Fri May 8 09:17:13 1998
2187 +++ linux/include/linux/raid1.h Fri Nov 23 11:18:18 2001
2192 -#include <linux/md.h>
2194 -struct mirror_info {
2209 -struct raid1_data {
2210 - struct md_dev *mddev;
2211 - struct mirror_info mirrors[MD_SB_DISKS]; /* RAID1 devices, 2 to MD_SB_DISKS */
2213 - int working_disks; /* Number of working disks */
2215 - unsigned long next_sect;
2217 - int resync_running;
2221 - * this is our 'private' 'collective' RAID1 buffer head.
2222 - * it contains information about what kind of IO operations were started
2223 - * for this RAID5 operation, and about their status:
2227 - unsigned int remaining;
2229 - unsigned long state;
2230 - struct md_dev *mddev;
2231 - struct buffer_head *master_bh;
2232 - struct buffer_head *mirror_bh [MD_SB_DISKS];
2233 - struct buffer_head bh_req;
2234 - struct buffer_head *next_retry;
2238 --- linux/include/linux/raid5.h.orig Fri May 8 09:17:13 1998
2239 +++ linux/include/linux/raid5.h Fri Nov 23 11:18:18 2001
2245 -#include <linux/md.h>
2246 -#include <asm/atomic.h>
2257 -struct stripe_head {
2258 - struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
2259 - struct stripe_head *free_next; /* pool of free sh's */
2260 - struct buffer_head *buffer_pool; /* pool of free buffers */
2261 - struct buffer_head *bh_pool; /* pool of free bh's */
2262 - struct raid5_data *raid_conf;
2263 - struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */
2264 - struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */
2265 - struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */
2266 - struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */
2267 - int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */
2268 - int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */
2269 - unsigned long sector; /* sector of this row */
2270 - int size; /* buffers size */
2271 - int pd_idx; /* parity disk index */
2272 - int nr_pending; /* nr of pending cmds */
2273 - unsigned long state; /* state flags */
2274 - int cmd; /* stripe cmd */
2275 - int count; /* nr of waiters */
2276 - int write_method; /* reconstruct-write / read-modify-write */
2277 - int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */
2278 - struct wait_queue *wait; /* processes waiting for this stripe */
2284 -#define PHASE_BEGIN 0
2285 -#define PHASE_READ_OLD 1
2286 -#define PHASE_WRITE 2
2287 -#define PHASE_READ 3
2288 -#define PHASE_COMPLETE 4
2293 -#define METHOD_NONE 0
2294 -#define RECONSTRUCT_WRITE 1
2295 -#define READ_MODIFY_WRITE 2
2300 -#define STRIPE_LOCKED 0
2301 -#define STRIPE_ERROR 1
2306 -#define STRIPE_NONE 0
2307 -#define STRIPE_WRITE 1
2308 -#define STRIPE_READ 2
2310 -struct raid5_data {
2311 - struct stripe_head **stripe_hashtbl;
2312 - struct md_dev *mddev;
2313 - struct md_thread *thread, *resync_thread;
2314 - struct disk_info disks[MD_SB_DISKS];
2315 - struct disk_info *spare;
2317 - int chunk_size, level, algorithm;
2318 - int raid_disks, working_disks, failed_disks;
2320 - unsigned long next_sector;
2321 - atomic_t nr_handle;
2322 - struct stripe_head *next_free_stripe;
2324 - int resync_parity;
2325 - int max_nr_stripes;
2327 - int nr_hashed_stripes;
2328 - int nr_locked_stripes;
2329 - int nr_pending_stripes;
2330 - int nr_cached_stripes;
2333 - * Free stripes pool
2336 - struct stripe_head *free_sh_list;
2337 - struct wait_queue *wait_for_stripe;
2343 - * Our supported algorithms
2345 -#define ALGORITHM_LEFT_ASYMMETRIC 0
2346 -#define ALGORITHM_RIGHT_ASYMMETRIC 1
2347 -#define ALGORITHM_LEFT_SYMMETRIC 2
2348 -#define ALGORITHM_RIGHT_SYMMETRIC 3
2351 --- linux/include/linux/sysctl.h.orig Fri Nov 23 11:17:44 2001
2352 +++ linux/include/linux/sysctl.h Fri Nov 23 11:18:18 2001
2361 @@ -446,6 +447,11 @@
2364 DEV_CDROM_CHECK_MEDIA=6
2367 +/* /proc/sys/dev/md */
2369 + DEV_MD_SPEED_LIMIT=1
2372 /* /proc/sys/dev/mac_hid */
2373 --- linux/include/asm-i386/md.h.orig Fri May 8 09:17:13 1998
2374 +++ linux/include/asm-i386/md.h Fri Nov 23 11:18:18 2001
2377 - * md.h: High speed xor_block operation for RAID4/5
2384 -/* #define HAVE_ARCH_XORBLOCK */
2386 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2388 -#endif /* __ASM_MD_H */
2389 --- linux/include/asm-alpha/md.h.orig Fri May 8 09:17:13 1998
2390 +++ linux/include/asm-alpha/md.h Fri Nov 23 11:18:18 2001
2393 - * md.h: High speed xor_block operation for RAID4/5
2400 -/* #define HAVE_ARCH_XORBLOCK */
2402 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2404 -#endif /* __ASM_MD_H */
2405 --- linux/include/asm-m68k/md.h.orig Fri May 8 09:15:22 1998
2406 +++ linux/include/asm-m68k/md.h Fri Nov 23 11:18:18 2001
2409 - * md.h: High speed xor_block operation for RAID4/5
2416 -/* #define HAVE_ARCH_XORBLOCK */
2418 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2420 -#endif /* __ASM_MD_H */
2421 --- linux/include/asm-sparc/md.h.orig Tue Jan 13 00:15:54 1998
2422 +++ linux/include/asm-sparc/md.h Fri Nov 23 11:18:18 2001
2425 - * md.h: High speed xor_block operation for RAID4/5
2432 -/* #define HAVE_ARCH_XORBLOCK */
2434 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2436 -#endif /* __ASM_MD_H */
2437 --- linux/include/asm-ppc/md.h.orig Wed Oct 27 02:53:42 1999
2438 +++ linux/include/asm-ppc/md.h Fri Nov 23 11:18:18 2001
2441 - * md.h: High speed xor_block operation for RAID4/5
2448 -/* #define HAVE_ARCH_XORBLOCK */
2450 -#define MD_XORBLOCK_ALIGNMENT sizeof(long)
2452 -#endif /* __ASM_MD_H */
2453 --- linux/include/asm-sparc64/md.h.orig Tue Jan 13 00:15:58 1998
2454 +++ linux/include/asm-sparc64/md.h Fri Nov 23 11:18:18 2001
2457 - * md.h: High speed xor_block operation for RAID4/5
2458 - * utilizing the UltraSparc Visual Instruction Set.
2460 - * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
2466 -#include <asm/head.h>
2467 -#include <asm/asi.h>
2469 -#define HAVE_ARCH_XORBLOCK
2471 -#define MD_XORBLOCK_ALIGNMENT 64
2473 -/* void __xor_block (char *dest, char *src, long len)
2475 - * while (len--) *dest++ ^= *src++;
2479 - * !(((long)dest | (long)src) & (MD_XORBLOCK_ALIGNMENT - 1)) &&
2480 - * !(len & 127) && len >= 256
2483 -static inline void __xor_block (char *dest, char *src, long len)
2485 - __asm__ __volatile__ ("
2486 - wr %%g0, %3, %%fprs
2487 - wr %%g0, %4, %%asi
2488 - membar #LoadStore|#StoreLoad|#StoreStore
2490 - ldda [%0] %4, %%f0
2491 - ldda [%1] %4, %%f16
2492 -1: ldda [%0 + 64] %%asi, %%f32
2493 - fxor %%f0, %%f16, %%f16
2494 - fxor %%f2, %%f18, %%f18
2495 - fxor %%f4, %%f20, %%f20
2496 - fxor %%f6, %%f22, %%f22
2497 - fxor %%f8, %%f24, %%f24
2498 - fxor %%f10, %%f26, %%f26
2499 - fxor %%f12, %%f28, %%f28
2500 - fxor %%f14, %%f30, %%f30
2501 - stda %%f16, [%0] %4
2502 - ldda [%1 + 64] %%asi, %%f48
2503 - ldda [%0 + 128] %%asi, %%f0
2504 - fxor %%f32, %%f48, %%f48
2505 - fxor %%f34, %%f50, %%f50
2507 - fxor %%f36, %%f52, %%f52
2509 - fxor %%f38, %%f54, %%f54
2511 - fxor %%f40, %%f56, %%f56
2512 - fxor %%f42, %%f58, %%f58
2513 - fxor %%f44, %%f60, %%f60
2514 - fxor %%f46, %%f62, %%f62
2515 - stda %%f48, [%0 - 64] %%asi
2517 - ldda [%1] %4, %%f16
2518 - ldda [%0 + 64] %%asi, %%f32
2519 - fxor %%f0, %%f16, %%f16
2520 - fxor %%f2, %%f18, %%f18
2521 - fxor %%f4, %%f20, %%f20
2522 - fxor %%f6, %%f22, %%f22
2523 - fxor %%f8, %%f24, %%f24
2524 - fxor %%f10, %%f26, %%f26
2525 - fxor %%f12, %%f28, %%f28
2526 - fxor %%f14, %%f30, %%f30
2527 - stda %%f16, [%0] %4
2528 - ldda [%1 + 64] %%asi, %%f48
2530 - fxor %%f32, %%f48, %%f48
2531 - fxor %%f34, %%f50, %%f50
2532 - fxor %%f36, %%f52, %%f52
2533 - fxor %%f38, %%f54, %%f54
2534 - fxor %%f40, %%f56, %%f56
2535 - fxor %%f42, %%f58, %%f58
2536 - fxor %%f44, %%f60, %%f60
2537 - fxor %%f46, %%f62, %%f62
2538 - stda %%f48, [%0 + 64] %%asi
2539 - membar #Sync|#StoreStore|#StoreLoad
2540 - wr %%g0, 0, %%fprs
2542 - "r" (dest), "r" (src), "r" (len), "i" (FPRS_FEF), "i" (ASI_BLK_P) :
2546 -#endif /* __ASM_MD_H */
2547 --- linux/drivers/block/Config.in.orig Fri Nov 23 11:17:44 2001
2548 +++ linux/drivers/block/Config.in Fri Nov 23 11:18:18 2001
2549 @@ -103,10 +103,13 @@
2551 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
2552 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
2553 + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
2554 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
2555 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
2556 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
2557 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
2558 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
2559 + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
2561 if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
2562 bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
2563 --- linux/drivers/block/Makefile.orig Mon Dec 11 01:49:41 2000
2564 +++ linux/drivers/block/Makefile Fri Nov 23 11:18:18 2001
2565 @@ -294,10 +294,28 @@
2568 ifeq ($(CONFIG_MD_RAID5),y)
2572 ifeq ($(CONFIG_MD_RAID5),m)
2578 +ifeq ($(CONFIG_MD_TRANSLUCENT),y)
2579 +L_OBJS += translucent.o
2581 + ifeq ($(CONFIG_MD_TRANSLUCENT),m)
2582 + M_OBJS += translucent.o
2586 +ifeq ($(CONFIG_MD_HSM),y)
2589 + ifeq ($(CONFIG_MD_HSM),m)
2594 --- linux/drivers/block/genhd.c.orig Fri Nov 23 11:17:44 2001
2595 +++ linux/drivers/block/genhd.c Fri Nov 23 11:18:18 2001
2597 #include <linux/string.h>
2598 #include <linux/blk.h>
2599 #include <linux/init.h>
2600 +#include <linux/raid/md.h>
2602 #ifdef CONFIG_ARCH_S390
2603 #include <asm/dasd.h>
2604 @@ -1786,6 +1787,9 @@
2609 +#ifdef CONFIG_BLK_DEV_MD
2610 + autodetect_raid();
2612 #ifdef CONFIG_MD_BOOT
2614 --- linux/drivers/block/hsm.c.orig Fri Nov 23 11:18:18 2001
2615 +++ linux/drivers/block/hsm.c Fri Nov 23 11:18:18 2001
2618 + hsm.c : HSM RAID driver for Linux
2619 + Copyright (C) 1998 Ingo Molnar
2621 + HSM mode management functions.
2623 + This program is free software; you can redistribute it and/or modify
2624 + it under the terms of the GNU General Public License as published by
2625 + the Free Software Foundation; either version 2, or (at your option)
2626 + any later version.
2628 + You should have received a copy of the GNU General Public License
2629 + (for example /usr/src/linux/COPYING); if not, write to the Free
2630 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
2633 +#include <linux/module.h>
2635 +#include <linux/raid/md.h>
2636 +#include <linux/malloc.h>
2638 +#include <linux/raid/hsm.h>
2639 +#include <linux/blk.h>
2641 +#define MAJOR_NR MD_MAJOR
2643 +#define MD_PERSONALITY
2646 +#define DEBUG_HSM 1
2649 +#define dprintk(x,y...) printk(x,##y)
2651 +#define dprintk(x,y...) do { } while (0)
2654 +void print_bh(struct buffer_head *bh)
2656 + dprintk("bh %p: %lx %lx %x %x %lx %p %lx %p %x %p %x %lx\n", bh,
2657 + bh->b_blocknr, bh->b_size, bh->b_dev, bh->b_rdev,
2658 + bh->b_rsector, bh->b_this_page, bh->b_state,
2659 + bh->b_next_free, bh->b_count, bh->b_data,
2660 + bh->b_list, bh->b_flushtime
2664 +static int check_bg (pv_t *pv, pv_block_group_t * bg)
2668 + dprintk("checking bg ...\n");
2670 + for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
2671 + if (pv_pptr_free(bg->blocks + i)) {
2673 + if (test_bit(i, bg->used_bitmap)) {
2674 + printk("hm, bit %d set?\n", i);
2677 + if (!test_bit(i, bg->used_bitmap)) {
2678 + printk("hm, bit %d not set?\n", i);
2682 + dprintk("%d free blocks in bg ...\n", free);
2686 +static void get_bg (pv_t *pv, pv_bg_desc_t *desc, int nr)
2688 + unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
2689 + struct buffer_head *bh;
2691 + dprintk("... getting BG at %u ...\n", bg_pos);
2693 + bh = bread (pv->dev, bg_pos, HSM_BLOCKSIZE);
2698 + desc->bg = (pv_block_group_t *) bh->b_data;
2699 + desc->free_blocks = check_bg(pv, desc->bg);
2702 +static int find_free_block (lv_t *lv, pv_t *pv, pv_bg_desc_t *desc, int nr,
2703 + unsigned int lblock, lv_lptr_t * index)
2707 + for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) {
2708 + pv_pptr_t * bptr = desc->bg->blocks + i;
2709 + if (pv_pptr_free(bptr)) {
2710 + unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2;
2712 + if (test_bit(i, desc->bg->used_bitmap)) {
2716 + bptr->u.used.owner.log_id = lv->log_id;
2717 + bptr->u.used.owner.log_index = lblock;
2718 + index->data.phys_nr = pv->phys_nr;
2719 + index->data.phys_block = bg_pos + i + 1;
2720 + set_bit(i, desc->bg->used_bitmap);
2721 + desc->free_blocks--;
2722 + dprintk(".....free blocks left in bg %p: %d\n",
2723 + desc->bg, desc->free_blocks);
2730 +static int __get_free_block (lv_t *lv, pv_t *pv,
2731 + unsigned int lblock, lv_lptr_t * index)
2735 + dprintk("trying to get free block for lblock %d ...\n", lblock);
2737 + for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
2738 + pv_bg_desc_t *desc = pv->bg_array + i;
2740 + dprintk("looking at desc #%d (%p)...\n", i, desc->bg);
2742 + get_bg(pv, desc, i);
2744 + if (desc->bg && desc->free_blocks)
2745 + return find_free_block(lv, pv, desc, i,
2748 + dprintk("hsm: pv %s full!\n", partition_name(pv->dev));
2752 +static int get_free_block (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
2756 + if (!lv->free_indices)
2760 + err = __get_free_block(lv, lv->vg->pv_array + 0, lblock, index);
2762 + if (err || !index->data.phys_block) {
2767 + lv->free_indices--;
2773 + * fix me: wordsize assumptions ...
2775 +#define INDEX_BITS 8
2776 +#define INDEX_DEPTH (32/INDEX_BITS)
2777 +#define INDEX_MASK ((1<<INDEX_BITS) - 1)
2779 +static void print_index_list (lv_t *lv, lv_lptr_t *index)
2784 + dprintk("... block <%u,%u,%x> [.", index->data.phys_nr,
2785 + index->data.phys_block, index->cpu_addr);
2787 + tmp = index_child(index);
2788 + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
2789 + if (index_block(lv, tmp))
2790 + dprintk("(%d->%d)", i, index_block(lv, tmp));
2796 +static int read_index_group (lv_t *lv, lv_lptr_t *index)
2798 + lv_lptr_t *index_group, *tmp;
2799 + struct buffer_head *bh;
2802 + dprintk("reading index group <%s:%d>\n",
2803 + partition_name(index_dev(lv, index)), index_block(lv, index));
2805 + bh = bread(index_dev(lv, index), index_block(lv, index), HSM_BLOCKSIZE);
2810 + if (!buffer_uptodate(bh))
2813 + index_group = (lv_lptr_t *) bh->b_data;
2814 + tmp = index_group;
2815 + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
2816 + if (index_block(lv, tmp)) {
2817 + dprintk("index group has BLOCK %d, non-present.\n", i);
2818 + tmp->cpu_addr = 0;
2822 + index->cpu_addr = ptr_to_cpuaddr(index_group);
2824 + dprintk("have read index group %p at block %d.\n",
2825 + index_group, index_block(lv, index));
2826 + print_index_list(lv, index);
2831 +static int alloc_index_group (lv_t *lv, unsigned int lblock, lv_lptr_t * index)
2833 + struct buffer_head *bh;
2834 + lv_lptr_t * index_group;
2836 + if (get_free_block(lv, lblock, index))
2839 + dprintk("creating block for index group <%s:%d>\n",
2840 + partition_name(index_dev(lv, index)), index_block(lv, index));
2842 + bh = getblk(index_dev(lv, index),
2843 + index_block(lv, index), HSM_BLOCKSIZE);
2845 + index_group = (lv_lptr_t *) bh->b_data;
2846 + md_clear_page(index_group);
2847 + mark_buffer_uptodate(bh, 1);
2849 + index->cpu_addr = ptr_to_cpuaddr(index_group);
2851 + dprintk("allocated index group %p at block %d.\n",
2852 + index_group, index_block(lv, index));
2856 +static lv_lptr_t * alloc_fixed_index (lv_t *lv, unsigned int lblock)
2858 + lv_lptr_t * index = index_child(&lv->root_index);
2861 + for (l = INDEX_DEPTH-1; l >= 0; l--) {
2862 + idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
2866 + if (!index_present(index)) {
2867 + dprintk("no group, level %u, pos %u\n", l, idx);
2868 + if (alloc_index_group(lv, lblock, index))
2871 + index = index_child(index);
2873 + if (!index_block(lv,index)) {
2874 + dprintk("no data, pos %u\n", idx);
2875 + if (get_free_block(lv, lblock, index))
2883 +static lv_lptr_t * find_index (lv_t *lv, unsigned int lblock)
2885 + lv_lptr_t * index = index_child(&lv->root_index);
2888 + for (l = INDEX_DEPTH-1; l >= 0; l--) {
2889 + idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK;
2893 + if (index_free(index))
2895 + if (!index_present(index))
2896 + read_index_group(lv, index);
2897 + if (!index_present(index)) {
2901 + index = index_child(index);
2903 + if (!index_block(lv,index))
2908 +static int read_root_index(lv_t *lv)
2911 + lv_lptr_t *index = &lv->root_index;
2913 + if (!index_block(lv, index)) {
2914 + printk("LV has no root index yet, creating.\n");
2916 + err = alloc_index_group (lv, 0, index);
2918 + printk("could not create index group, err:%d\n", err);
2921 + lv->vg->vg_sb->lv_array[lv->log_id].lv_root_idx =
2922 + lv->root_index.data;
2924 + printk("LV already has a root index.\n");
2925 + printk("... at <%s:%d>.\n",
2926 + partition_name(index_dev(lv, index)),
2927 + index_block(lv, index));
2929 + read_index_group(lv, index);
2934 +static int init_pv(pv_t *pv)
2936 + struct buffer_head *bh;
2939 + bh = bread (pv->dev, 0, HSM_BLOCKSIZE);
2945 + pv_sb = (pv_sb_t *) bh->b_data;
2946 + pv->pv_sb = pv_sb;
2948 + if (pv_sb->pv_magic != HSM_PV_SB_MAGIC) {
2949 + printk("%s is not a PV, has magic %x instead of %x!\n",
2950 + partition_name(pv->dev), pv_sb->pv_magic,
2954 + printk("%s detected as a valid PV (#%d).\n", partition_name(pv->dev),
2956 + printk("... created under HSM version %d.%d.%d, at %x.\n",
2957 + pv_sb->pv_major, pv_sb->pv_minor, pv_sb->pv_patch, pv_sb->pv_ctime);
2958 + printk("... total # of blocks: %d (%d left unallocated).\n",
2959 + pv_sb->pv_total_size, pv_sb->pv_blocks_left);
2961 + printk("... block size: %d bytes.\n", pv_sb->pv_block_size);
2962 + printk("... block descriptor size: %d bytes.\n", pv_sb->pv_pptr_size);
2963 + printk("... block group size: %d blocks.\n", pv_sb->pv_bg_size);
2964 + printk("... # of block groups: %d.\n", pv_sb->pv_block_groups);
2966 + if (pv_sb->pv_block_groups*sizeof(pv_bg_desc_t) > PAGE_SIZE) {
2970 + pv->bg_array = (pv_bg_desc_t *)__get_free_page(GFP_KERNEL);
2971 + if (!pv->bg_array) {
2975 + memset(pv->bg_array, 0, PAGE_SIZE);
2980 +static int free_pv(pv_t *pv)
2982 + struct buffer_head *bh;
2984 + dprintk("freeing PV %d ...\n", pv->phys_nr);
2986 + if (pv->bg_array) {
2989 + dprintk(".... freeing BGs ...\n");
2990 + for (i = 0; i < pv->pv_sb->pv_block_groups; i++) {
2991 + unsigned int bg_pos = i * pv->pv_sb->pv_bg_size + 2;
2992 + pv_bg_desc_t *desc = pv->bg_array + i;
2995 + dprintk(".... freeing BG %d ...\n", i);
2996 + bh = getblk (pv->dev, bg_pos, HSM_BLOCKSIZE);
2997 + mark_buffer_dirty(bh, 1);
3002 + free_page((unsigned long)pv->bg_array);
3006 + bh = getblk (pv->dev, 0, HSM_BLOCKSIZE);
3011 + mark_buffer_dirty(bh, 1);
3018 +struct semaphore hsm_sem = MUTEX;
3020 +#define HSM_SECTORS (HSM_BLOCKSIZE/512)
3022 +static int hsm_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
3023 + unsigned long *rsector, unsigned long bsectors)
3025 + lv_t *lv = kdev_to_lv(dev);
3027 + unsigned int lblock = *rsector / HSM_SECTORS;
3028 + unsigned int offset = *rsector % HSM_SECTORS;
3032 + printk("HSM: md%d not a Logical Volume!\n", mdidx(mddev));
3035 + if (offset + bsectors > HSM_SECTORS) {
3040 + index = find_index(lv, lblock);
3042 + printk("no block %u yet ... allocating\n", lblock);
3043 + index = alloc_fixed_index(lv, lblock);
3048 + printk(" %u <%s : %ld(%ld)> -> ", lblock,
3049 + partition_name(*rdev), *rsector, bsectors);
3051 + *rdev = index_dev(lv, index);
3052 + *rsector = index_block(lv, index) * HSM_SECTORS + offset;
3054 + printk(" <%s : %ld> %u\n",
3055 + partition_name(*rdev), *rsector, index_block(lv, index));
3062 +static void free_index (lv_t *lv, lv_lptr_t * index)
3064 + struct buffer_head *bh;
3066 + printk("tryin to get cached block for index group <%s:%d>\n",
3067 + partition_name(index_dev(lv, index)), index_block(lv, index));
3069 + bh = getblk(index_dev(lv, index), index_block(lv, index),HSM_BLOCKSIZE);
3071 + printk("....FREEING ");
3072 + print_index_list(lv, index);
3075 + if (!buffer_uptodate(bh))
3077 + if ((lv_lptr_t *)bh->b_data != index_child(index)) {
3078 + printk("huh? b_data is %p, index content is %p.\n",
3079 + bh->b_data, index_child(index));
3081 + printk("good, b_data == index content == %p.\n",
3082 + index_child(index));
3083 + printk("b_count == %d, writing.\n", bh->b_count);
3084 + mark_buffer_dirty(bh, 1);
3087 + printk("done.\n");
3089 + printk("FAILED!\n");
3091 + print_index_list(lv, index);
3092 + index_child(index) = NULL;
3095 +static void free_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
3098 + lv_lptr_t * index;
3101 + nr_dots = (INDEX_DEPTH-level)*3;
3102 + memcpy(dots,"...............",nr_dots);
3103 + dots[nr_dots] = 0;
3105 + dprintk("%s level %d index group block:\n", dots, level);
3109 + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
3110 + if (index->data.phys_block) {
3111 + dprintk("%s block <%u,%u,%x>\n", dots,
3112 + index->data.phys_nr,
3113 + index->data.phys_block,
3115 + if (level && index_present(index)) {
3116 + dprintk("%s==> deeper one level\n", dots);
3117 + free_index_group(lv, level-1,
3118 + index_child(index));
3119 + dprintk("%s freeing index group block %p ...",
3120 + dots, index_child(index));
3121 + free_index(lv, index);
3126 + dprintk("%s DONE: level %d index group block.\n", dots, level);
3129 +static void free_lv_indextree (lv_t *lv)
3131 + dprintk("freeing LV %d ...\n", lv->log_id);
3132 + dprintk("..root index: %p\n", index_child(&lv->root_index));
3133 + dprintk("..INDEX TREE:\n");
3134 + free_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
3135 + dprintk("..freeing root index %p ...", index_child(&lv->root_index));
3136 + dprintk("root block <%u,%u,%x>\n", lv->root_index.data.phys_nr,
3137 + lv->root_index.data.phys_block, lv->root_index.cpu_addr);
3138 + free_index(lv, &lv->root_index);
3139 + dprintk("..INDEX TREE done.\n");
3140 + fsync_dev(lv->vg->pv_array[0].dev); /* fix me */
3141 + lv->vg->vg_sb->lv_array[lv->log_id].lv_free_indices = lv->free_indices;
3144 +static void print_index_group (lv_t *lv, int level, lv_lptr_t * index_0)
3147 + lv_lptr_t * index;
3150 + nr_dots = (INDEX_DEPTH-level)*3;
3151 + memcpy(dots,"...............",nr_dots);
3152 + dots[nr_dots] = 0;
3154 + dprintk("%s level %d index group block:\n", dots, level);
3157 + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) {
3158 + index = index_0 + i;
3159 + if (index->data.phys_block) {
3160 + dprintk("%s block <%u,%u,%x>\n", dots,
3161 + index->data.phys_nr,
3162 + index->data.phys_block,
3164 + if (level && index_present(index)) {
3165 + dprintk("%s==> deeper one level\n", dots);
3166 + print_index_group(lv, level-1,
3167 + index_child(index));
3171 + dprintk("%s DONE: level %d index group block.\n", dots, level);
3174 +static void print_lv (lv_t *lv)
3176 + dprintk("printing LV %d ...\n", lv->log_id);
3177 + dprintk("..root index: %p\n", index_child(&lv->root_index));
3178 + dprintk("..INDEX TREE:\n");
3179 + print_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index));
3180 + dprintk("..INDEX TREE done.\n");
3183 +static int map_lv (lv_t *lv)
3185 + kdev_t dev = lv->dev;
3186 + unsigned int nr = MINOR(dev);
3187 + mddev_t *mddev = lv->vg->mddev;
3189 + if (MAJOR(dev) != MD_MAJOR) {
3193 + if (kdev_to_mddev(dev)) {
3197 + md_hd_struct[nr].start_sect = 0;
3198 + md_hd_struct[nr].nr_sects = md_size[mdidx(mddev)] << 1;
3199 + md_size[nr] = md_size[mdidx(mddev)];
3200 + add_mddev_mapping(mddev, dev, lv);
3205 +static int unmap_lv (lv_t *lv)
3207 + kdev_t dev = lv->dev;
3208 + unsigned int nr = MINOR(dev);
3210 + if (MAJOR(dev) != MD_MAJOR) {
3214 + md_hd_struct[nr].start_sect = 0;
3215 + md_hd_struct[nr].nr_sects = 0;
3217 + del_mddev_mapping(lv->vg->mddev, dev);
3222 +static int init_vg (vg_t *vg)
3228 + struct buffer_head *bh;
3229 + lv_descriptor_t *lv_desc;
3232 + * fix me: read all PVs and compare the SB
3234 + dev = vg->pv_array[0].dev;
3235 + bh = bread (dev, 1, HSM_BLOCKSIZE);
3241 + vg_sb = (vg_sb_t *) bh->b_data;
3242 + vg->vg_sb = vg_sb;
3244 + if (vg_sb->vg_magic != HSM_VG_SB_MAGIC) {
3245 + printk("%s is not a valid VG, has magic %x instead of %x!\n",
3246 + partition_name(dev), vg_sb->vg_magic,
3252 + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
3254 + lv_desc = vg->vg_sb->lv_array + i;
3256 + id = lv_desc->lv_id;
3258 + printk("... LV desc %d empty\n", i);
3261 + if (id >= HSM_MAX_LVS_PER_VG) {
3266 + lv = vg->lv_array + id;
3273 + lv->max_indices = lv_desc->lv_max_indices;
3274 + lv->free_indices = lv_desc->lv_free_indices;
3275 + lv->root_index.data = lv_desc->lv_root_idx;
3276 + lv->dev = MKDEV(MD_MAJOR, lv_desc->md_id);
3281 + if (read_root_index(lv)) {
3284 + memset(lv, 0, sizeof(*lv));
3287 + if (vg->nr_lv != vg_sb->nr_lvs)
3293 +static int hsm_run (mddev_t *mddev)
3299 + MOD_INC_USE_COUNT;
3301 + vg = kmalloc (sizeof (*vg), GFP_KERNEL);
3304 + memset(vg, 0, sizeof(*vg));
3305 + mddev->private = vg;
3306 + vg->mddev = mddev;
3308 + if (md_check_ordering(mddev)) {
3309 + printk("hsm: disks are not ordered, aborting!\n");
3313 + set_blocksize (mddev_to_kdev(mddev), HSM_BLOCKSIZE);
3315 + vg->nr_pv = mddev->nb_dev;
3316 + ITERATE_RDEV_ORDERED(mddev,rdev,i) {
3317 + pv_t *pv = vg->pv_array + i;
3319 + pv->dev = rdev->dev;
3320 + fsync_dev (pv->dev);
3321 + set_blocksize (pv->dev, HSM_BLOCKSIZE);
3334 + mddev->private = NULL;
3336 + MOD_DEC_USE_COUNT;
3341 +static int hsm_stop (mddev_t *mddev)
3347 + vg = mddev_to_vg(mddev);
3349 + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
3350 + lv = vg->lv_array + i;
3354 + free_lv_indextree(lv);
3357 + for (i = 0; i < vg->nr_pv; i++)
3358 + free_pv(vg->pv_array + i);
3362 + MOD_DEC_USE_COUNT;
3368 +static int hsm_status (char *page, mddev_t *mddev)
3374 + vg = mddev_to_vg(mddev);
3376 + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) {
3377 + lv = vg->lv_array + i;
3380 + sz += sprintf(page+sz, "<LV%d %d/%d blocks used> ", lv->log_id,
3381 + lv->max_indices - lv->free_indices, lv->max_indices);
3387 +static mdk_personality_t hsm_personality=
3406 +md__initfunc(void hsm_init (void))
3408 + register_md_personality (HSM, &hsm_personality);
3413 +int init_module (void)
3415 + return (register_md_personality (HSM, &hsm_personality));
3418 +void cleanup_module (void)
3420 + unregister_md_personality (HSM);
3426 + * This Linus-trick catches bugs via the linker.
3429 +extern void __BUG__in__hsm_dot_c_1(void);
3430 +extern void __BUG__in__hsm_dot_c_2(void);
3431 +extern void __BUG__in__hsm_dot_c_3(void);
3432 +extern void __BUG__in__hsm_dot_c_4(void);
3433 +extern void __BUG__in__hsm_dot_c_5(void);
3434 +extern void __BUG__in__hsm_dot_c_6(void);
3435 +extern void __BUG__in__hsm_dot_c_7(void);
3437 +void bugcatcher (void)
3439 + if (sizeof(pv_block_group_t) != HSM_BLOCKSIZE)
3440 + __BUG__in__hsm_dot_c_1();
3441 + if (sizeof(lv_index_block_t) != HSM_BLOCKSIZE)
3442 + __BUG__in__hsm_dot_c_2();
3444 + if (sizeof(pv_sb_t) != HSM_BLOCKSIZE)
3445 + __BUG__in__hsm_dot_c_4();
3446 + if (sizeof(lv_sb_t) != HSM_BLOCKSIZE)
3447 + __BUG__in__hsm_dot_c_3();
3448 + if (sizeof(vg_sb_t) != HSM_BLOCKSIZE)
3449 + __BUG__in__hsm_dot_c_6();
3451 + if (sizeof(lv_lptr_t) != 16)
3452 + __BUG__in__hsm_dot_c_5();
3453 + if (sizeof(pv_pptr_t) != 16)
3454 + __BUG__in__hsm_dot_c_6();
3457 --- linux/drivers/block/linear.c.orig Sat Nov 8 20:39:12 1997
3458 +++ linux/drivers/block/linear.c Fri Nov 23 11:18:18 2001
3462 linear.c : Multiple Devices driver for Linux
3463 Copyright (C) 1994-96 Marc ZYNGIER
3464 @@ -19,186 +18,207 @@
3466 #include <linux/module.h>
3468 -#include <linux/md.h>
3469 +#include <linux/raid/md.h>
3470 #include <linux/malloc.h>
3471 -#include <linux/init.h>
3473 -#include "linear.h"
3474 +#include <linux/raid/linear.h>
3476 #define MAJOR_NR MD_MAJOR
3478 #define MD_PERSONALITY
3480 -static int linear_run (int minor, struct md_dev *mddev)
3481 +static int linear_run (mddev_t *mddev)
3483 - int cur=0, i, size, dev0_size, nb_zone;
3484 - struct linear_data *data;
3486 - MOD_INC_USE_COUNT;
3488 - mddev->private=kmalloc (sizeof (struct linear_data), GFP_KERNEL);
3489 - data=(struct linear_data *) mddev->private;
3492 - Find out the smallest device. This was previously done
3493 - at registry time, but since it violates modularity,
3494 - I moved it here... Any comment ? ;-)
3497 - data->smallest=mddev->devices;
3498 - for (i=1; i<mddev->nb_dev; i++)
3499 - if (data->smallest->size > mddev->devices[i].size)
3500 - data->smallest=mddev->devices+i;
3502 - nb_zone=data->nr_zones=
3503 - md_size[minor]/data->smallest->size +
3504 - (md_size[minor]%data->smallest->size ? 1 : 0);
3506 - data->hash_table=kmalloc (sizeof (struct linear_hash)*nb_zone, GFP_KERNEL);
3508 - size=mddev->devices[cur].size;
3509 + linear_conf_t *conf;
3510 + struct linear_hash *table;
3512 + int size, i, j, nb_zone;
3513 + unsigned int curr_offset;
3515 + MOD_INC_USE_COUNT;
3517 + conf = kmalloc (sizeof (*conf), GFP_KERNEL);
3520 + mddev->private = conf;
3522 + if (md_check_ordering(mddev)) {
3523 + printk("linear: disks are not ordered, aborting!\n");
3527 + * Find the smallest device.
3530 + conf->smallest = NULL;
3532 + ITERATE_RDEV_ORDERED(mddev,rdev,j) {
3533 + dev_info_t *disk = conf->disks + j;
3535 + disk->dev = rdev->dev;
3536 + disk->size = rdev->size;
3537 + disk->offset = curr_offset;
3539 + curr_offset += disk->size;
3541 + if (!conf->smallest || (disk->size < conf->smallest->size))
3542 + conf->smallest = disk;
3545 + nb_zone = conf->nr_zones =
3546 + md_size[mdidx(mddev)] / conf->smallest->size +
3547 + ((md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
3549 + conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
3551 + if (!conf->hash_table)
3555 + * Here we generate the linear hash table
3557 + table = conf->hash_table;
3560 + for (j = 0; j < mddev->nb_dev; j++) {
3561 + dev_info_t *disk = conf->disks + j;
3564 + table->dev1 = disk;
3567 + size += disk->size;
3570 + table->dev0 = disk;
3571 + size -= conf->smallest->size;
3574 + table->dev1 = NULL;
3578 + table->dev1 = NULL;
3585 + MOD_DEC_USE_COUNT;
3589 +static int linear_stop (mddev_t *mddev)
3591 + linear_conf_t *conf = mddev_to_conf(mddev);
3593 + kfree(conf->hash_table);
3597 - while (cur<mddev->nb_dev)
3599 - data->hash_table[i].dev0=mddev->devices+cur;
3600 + MOD_DEC_USE_COUNT;
3602 - if (size>=data->smallest->size) /* If we completely fill the slot */
3604 - data->hash_table[i++].dev1=NULL;
3605 - size-=data->smallest->size;
3609 - if (++cur==mddev->nb_dev) continue;
3610 - size=mddev->devices[cur].size;
3616 - if (++cur==mddev->nb_dev) /* Last dev, set dev1 as NULL */
3618 - data->hash_table[i].dev1=NULL;
3622 - dev0_size=size; /* Here, we use a 2nd dev to fill the slot */
3623 - size=mddev->devices[cur].size;
3624 - data->hash_table[i++].dev1=mddev->devices+cur;
3625 - size-=(data->smallest->size - dev0_size);
3631 -static int linear_stop (int minor, struct md_dev *mddev)
3633 - struct linear_data *data=(struct linear_data *) mddev->private;
3635 - kfree (data->hash_table);
3638 - MOD_DEC_USE_COUNT;
3645 -static int linear_map (struct md_dev *mddev, kdev_t *rdev,
3646 +static int linear_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
3647 unsigned long *rsector, unsigned long size)
3649 - struct linear_data *data=(struct linear_data *) mddev->private;
3650 - struct linear_hash *hash;
3651 - struct real_dev *tmp_dev;
3654 - block=*rsector >> 1;
3655 - hash=data->hash_table+(block/data->smallest->size);
3657 - if (block >= (hash->dev0->size + hash->dev0->offset))
3661 - printk ("linear_map : hash->dev1==NULL for block %ld\n", block);
3665 - tmp_dev=hash->dev1;
3668 - tmp_dev=hash->dev0;
3669 + linear_conf_t *conf = mddev_to_conf(mddev);
3670 + struct linear_hash *hash;
3671 + dev_info_t *tmp_dev;
3674 + block = *rsector >> 1;
3675 + hash = conf->hash_table + (block / conf->smallest->size);
3677 + if (block >= (hash->dev0->size + hash->dev0->offset))
3681 + printk ("linear_map : hash->dev1==NULL for block %ld\n",
3685 + tmp_dev = hash->dev1;
3687 + tmp_dev = hash->dev0;
3689 - if (block >= (tmp_dev->size + tmp_dev->offset) || block < tmp_dev->offset)
3690 - printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
3691 - block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
3692 + if (block >= (tmp_dev->size + tmp_dev->offset)
3693 + || block < tmp_dev->offset)
3694 + printk ("Block %ld out of bounds on dev %s size %d offset %d\n",
3695 + block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset);
3697 - *rdev=tmp_dev->dev;
3698 - *rsector=(block-(tmp_dev->offset)) << 1;
3699 + *rdev = tmp_dev->dev;
3700 + *rsector = (block - tmp_dev->offset) << 1;
3706 -static int linear_status (char *page, int minor, struct md_dev *mddev)
3707 +static int linear_status (char *page, mddev_t *mddev)
3715 - struct linear_data *data=(struct linear_data *) mddev->private;
3717 + linear_conf_t *conf = mddev_to_conf(mddev);
3719 - sz+=sprintf (page+sz, " ");
3720 - for (j=0; j<data->nr_zones; j++)
3722 - sz+=sprintf (page+sz, "[%s",
3723 - partition_name (data->hash_table[j].dev0->dev));
3725 - if (data->hash_table[j].dev1)
3726 - sz+=sprintf (page+sz, "/%s] ",
3727 - partition_name(data->hash_table[j].dev1->dev));
3729 - sz+=sprintf (page+sz, "] ");
3732 - sz+=sprintf (page+sz, "\n");
3733 + sz += sprintf(page+sz, " ");
3734 + for (j = 0; j < conf->nr_zones; j++)
3736 + sz += sprintf(page+sz, "[%s",
3737 + partition_name(conf->hash_table[j].dev0->dev));
3739 + if (conf->hash_table[j].dev1)
3740 + sz += sprintf(page+sz, "/%s] ",
3741 + partition_name(conf->hash_table[j].dev1->dev));
3743 + sz += sprintf(page+sz, "] ");
3745 + sz += sprintf(page+sz, "\n");
3747 - sz+=sprintf (page+sz, " %dk rounding", 1<<FACTOR_SHIFT(FACTOR(mddev)));
3749 + sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
3754 -static struct md_personality linear_personality=
3755 +static mdk_personality_t linear_personality=
3764 - NULL, /* no ioctls */
3784 -__initfunc(void linear_init (void))
3785 +md__initfunc(void linear_init (void))
3787 - register_md_personality (LINEAR, &linear_personality);
3788 + register_md_personality (LINEAR, &linear_personality);
3793 int init_module (void)
3795 - return (register_md_personality (LINEAR, &linear_personality));
3796 + return (register_md_personality (LINEAR, &linear_personality));
3799 void cleanup_module (void)
3801 - unregister_md_personality (LINEAR);
3802 + unregister_md_personality (LINEAR);
3807 --- linux/drivers/block/linear.h.orig Fri Nov 22 15:07:23 1996
3808 +++ linux/drivers/block/linear.h Fri Nov 23 11:18:18 2001
3815 - struct real_dev *dev0, *dev1;
3820 - struct linear_hash *hash_table; /* Dynamically allocated */
3821 - struct real_dev *smallest;
3826 --- linux/drivers/block/ll_rw_blk.c.orig Fri Nov 23 11:17:44 2001
3827 +++ linux/drivers/block/ll_rw_blk.c Fri Nov 23 11:18:18 2001
3830 #include <asm/uaccess.h>
3831 #include <linux/blk.h>
3832 +#include <linux/raid/md.h>
3834 #ifdef CONFIG_POWERMAC
3835 #include <asm/ide.h>
3837 spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
3840 + * per-major idle-IO detection
3842 +unsigned long io_events[MAX_BLKDEV] = {0, };
3845 * used to wait on when there are no free requests
3847 struct wait_queue * wait_for_request;
3850 /* Maybe the above fixes it, and maybe it doesn't boot. Life is interesting */
3852 + if (!buffer_lowprio(bh))
3853 + io_events[major]++;
3855 if (blk_size[major]) {
3856 unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
3858 bh[i]->b_rsector=bh[i]->b_blocknr*(bh[i]->b_size >> 9);
3859 #ifdef CONFIG_BLK_DEV_MD
3860 if (major==MD_MAJOR &&
3861 - md_map (MINOR(bh[i]->b_dev), &bh[i]->b_rdev,
3862 + md_map (bh[i]->b_dev, &bh[i]->b_rdev,
3863 &bh[i]->b_rsector, bh[i]->b_size >> 9)) {
3865 "Bad md_map in ll_rw_block\n");
3867 set_bit(BH_Req, &bh[i]->b_state);
3868 #ifdef CONFIG_BLK_DEV_MD
3869 if (MAJOR(bh[i]->b_dev) == MD_MAJOR) {
3870 - md_make_request(MINOR (bh[i]->b_dev), rw, bh[i]);
3871 + md_make_request(bh[i], rw);
3875 --- linux/drivers/block/md.c.orig Mon Sep 4 19:39:16 2000
3876 +++ linux/drivers/block/md.c Fri Nov 23 11:18:18 2001
3880 md.c : Multiple Devices driver for Linux
3881 - Copyright (C) 1994-96 Marc ZYNGIER
3882 - <zyngier@ufr-info-p7.ibp.fr> or
3883 - <maz@gloups.fdn.fr>
3884 + Copyright (C) 1998, 1999 Ingo Molnar
3886 - A lot of inspiration came from hd.c ...
3887 + completely rewritten, based on the MD driver code from Marc Zyngier
3889 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
3890 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
3893 - RAID-1/RAID-5 extensions by:
3894 - Ingo Molnar, Miguel de Icaza, Gadi Oxman
3895 + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
3896 + - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
3897 + - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
3898 + - kmod support by: Cyrus Durgin
3899 + - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
3901 - Changes for kmod by:
3904 This program is free software; you can redistribute it and/or modify
3905 it under the terms of the GNU General Public License as published by
3906 the Free Software Foundation; either version 2, or (at your option)
3907 @@ -26,807 +22,3007 @@
3908 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3912 - * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
3913 - * the extra system load does not show up that much. Increase it if your
3914 - * system can take more.
3916 -#define SPEED_LIMIT 1024
3917 +#include <linux/raid/md.h>
3918 +#include <linux/raid/xor.h>
3920 -#include <linux/config.h>
3921 -#include <linux/module.h>
3922 -#include <linux/version.h>
3923 -#include <linux/malloc.h>
3924 -#include <linux/mm.h>
3925 -#include <linux/md.h>
3926 -#include <linux/hdreg.h>
3927 -#include <linux/stat.h>
3928 -#include <linux/fs.h>
3929 -#include <linux/proc_fs.h>
3930 -#include <linux/blkdev.h>
3931 -#include <linux/genhd.h>
3932 -#include <linux/smp_lock.h>
3934 #include <linux/kmod.h>
3936 -#include <linux/errno.h>
3937 -#include <linux/init.h>
3939 #define __KERNEL_SYSCALLS__
3940 #include <linux/unistd.h>
3942 +#include <asm/unaligned.h>
3944 +extern asmlinkage int sys_sched_yield(void);
3945 +extern asmlinkage int sys_setsid(void);
3947 +extern unsigned long io_events[MAX_BLKDEV];
3949 #define MAJOR_NR MD_MAJOR
3952 #include <linux/blk.h>
3953 -#include <asm/uaccess.h>
3954 -#include <asm/bitops.h>
3955 -#include <asm/atomic.h>
3957 #ifdef CONFIG_MD_BOOT
3958 -extern kdev_t name_to_kdev_t(char *line) __init;
3959 +extern kdev_t name_to_kdev_t(char *line) md__init;
3962 -static struct hd_struct md_hd_struct[MAX_MD_DEV];
3963 -static int md_blocksizes[MAX_MD_DEV];
3964 -int md_maxreadahead[MAX_MD_DEV];
3965 -#if SUPPORT_RECONSTRUCTION
3966 -static struct md_thread *md_sync_thread = NULL;
3967 -#endif /* SUPPORT_RECONSTRUCTION */
3968 +static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, };
3971 + * these have to be allocated separately because external
3972 + * subsystems want to have a pre-defined structure
3974 +struct hd_struct md_hd_struct[MAX_MD_DEVS];
3975 +static int md_blocksizes[MAX_MD_DEVS];
3976 +static int md_maxreadahead[MAX_MD_DEVS];
3977 +static mdk_thread_t *md_recovery_thread = NULL;
3979 -int md_size[MAX_MD_DEV]={0, };
3980 +int md_size[MAX_MD_DEVS] = {0, };
3982 static void md_geninit (struct gendisk *);
3984 static struct gendisk md_gendisk=
4010 -static struct md_personality *pers[MAX_PERSONALITY]={NULL, };
4011 -struct md_dev md_dev[MAX_MD_DEV];
4013 -int md_thread(void * arg);
4015 + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
4016 + * is 100 KB/sec, so the extra system load does not show up that much.
4017 + * Increase it if you want to have more _guaranteed_ speed. Note that
4018 + * the RAID driver will use the maximum available bandwith if the IO
4019 + * subsystem is idle.
4021 + * you can change it via /proc/sys/dev/speed-limit
4024 -static struct gendisk *find_gendisk (kdev_t dev)
4026 - struct gendisk *tmp=gendisk_head;
4027 +static int sysctl_speed_limit = 100;
4029 - while (tmp != NULL)
4031 - if (tmp->major==MAJOR(dev))
4036 +static struct ctl_table_header *md_table_header;
4040 +static ctl_table md_table[] = {
4041 + {DEV_MD_SPEED_LIMIT, "speed-limit",
4042 + &sysctl_speed_limit, sizeof(int), 0644, NULL, &proc_dointvec},
4046 -char *partition_name (kdev_t dev)
4048 - static char name[40]; /* This should be long
4049 - enough for a device name ! */
4050 - struct gendisk *hd = find_gendisk (dev);
4051 +static ctl_table md_dir_table[] = {
4052 + {DEV_MD, "md", NULL, 0, 0555, md_table},
4058 - sprintf (name, "[dev %s]", kdevname(dev));
4061 +static ctl_table md_root_table[] = {
4062 + {CTL_DEV, "dev", NULL, 0, 0555, md_dir_table},
4066 - return disk_name (hd, MINOR(dev), name); /* routine in genhd.c */
4067 +static void md_register_sysctl(void)
4069 + md_table_header = register_sysctl_table(md_root_table, 1);
4072 -static int legacy_raid_sb (int minor, int pnum)
4073 +void md_unregister_sysctl(void)
4076 + unregister_sysctl_table(md_table_header);
4080 + * The mapping between kdev and mddev is not necessary a simple
4081 + * one! Eg. HSM uses several sub-devices to implement Logical
4082 + * Volumes. All these sub-devices map to the same mddev.
4084 +dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, };
4086 - factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
4087 +void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
4089 + unsigned int minor = MINOR(dev);
4092 - * do size and offset calculations.
4094 - for (i=0; i<md_dev[minor].nb_dev; i++) {
4095 - md_dev[minor].devices[i].size &= ~(factor - 1);
4096 - md_size[minor] += md_dev[minor].devices[i].size;
4097 - md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset +
4098 - md_dev[minor].devices[i-1].size) : 0;
4099 + if (MAJOR(dev) != MD_MAJOR) {
4103 - if (pnum == RAID0 >> PERSONALITY_SHIFT)
4104 - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev;
4106 + if (mddev_map[minor].mddev != NULL) {
4110 + mddev_map[minor].mddev = mddev;
4111 + mddev_map[minor].data = data;
4114 -static void free_sb (struct md_dev *mddev)
4115 +void del_mddev_mapping (mddev_t * mddev, kdev_t dev)
4118 - struct real_dev *realdev;
4119 + unsigned int minor = MINOR(dev);
4122 - free_page((unsigned long) mddev->sb);
4124 + if (MAJOR(dev) != MD_MAJOR) {
4128 - for (i = 0; i <mddev->nb_dev; i++) {
4129 - realdev = mddev->devices + i;
4130 - if (realdev->sb) {
4131 - free_page((unsigned long) realdev->sb);
4132 - realdev->sb = NULL;
4134 + if (mddev_map[minor].mddev != mddev) {
4138 + mddev_map[minor].mddev = NULL;
4139 + mddev_map[minor].data = NULL;
4143 - * Check one RAID superblock for generic plausibility
4144 + * Enables to iterate over all existing md arrays
4146 +static MD_LIST_HEAD(all_mddevs);
4148 -#define BAD_MAGIC KERN_ERR \
4149 -"md: %s: invalid raid superblock magic (%x) on block %u\n"
4150 +static mddev_t * alloc_mddev (kdev_t dev)
4154 -#define OUT_OF_MEM KERN_ALERT \
4155 -"md: out of memory.\n"
4156 + if (MAJOR(dev) != MD_MAJOR) {
4160 + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
4164 + memset(mddev, 0, sizeof(*mddev));
4166 -#define NO_DEVICE KERN_ERR \
4167 -"md: disabled device %s\n"
4168 + mddev->__minor = MINOR(dev);
4169 + mddev->reconfig_sem = MUTEX;
4170 + mddev->recovery_sem = MUTEX;
4171 + mddev->resync_sem = MUTEX;
4172 + MD_INIT_LIST_HEAD(&mddev->disks);
4174 + * The 'base' mddev is the one with data NULL.
4175 + * personalities can create additional mddevs
4178 + add_mddev_mapping(mddev, dev, 0);
4179 + md_list_add(&mddev->all_mddevs, &all_mddevs);
4186 -static int analyze_one_sb (struct real_dev * rdev)
4187 +static void free_mddev (mddev_t *mddev)
4189 - int ret = FAILURE;
4190 - struct buffer_head *bh;
4191 - kdev_t dev = rdev->dev;
4192 - md_superblock_t *sb;
4199 - * Read the superblock, it's at the end of the disk
4200 + * Make sure nobody else is using this mddev
4201 + * (careful, we rely on the global kernel lock here)
4203 - rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]);
4204 - set_blocksize (dev, MD_SB_BYTES);
4205 - bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
4208 - sb = (md_superblock_t *) bh->b_data;
4209 - if (sb->md_magic != MD_SB_MAGIC) {
4210 - printk (BAD_MAGIC, kdevname(dev),
4211 - sb->md_magic, rdev->sb_offset);
4214 - rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
4216 - printk (OUT_OF_MEM);
4219 - memcpy (rdev->sb, bh->b_data, MD_SB_BYTES);
4220 + while (md_atomic_read(&mddev->resync_sem.count) != 1)
4222 + while (md_atomic_read(&mddev->recovery_sem.count) != 1)
4225 - rdev->size = sb->size;
4227 - printk (NO_DEVICE,kdevname(rdev->dev));
4233 + del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
4234 + md_list_del(&mddev->all_mddevs);
4235 + MD_INIT_LIST_HEAD(&mddev->all_mddevs);
4247 - * Check a full RAID array for plausibility
4249 +struct gendisk * find_gendisk (kdev_t dev)
4251 + struct gendisk *tmp = gendisk_head;
4253 -#define INCONSISTENT KERN_ERR \
4254 -"md: superblock inconsistency -- run ckraid\n"
4255 + while (tmp != NULL) {
4256 + if (tmp->major == MAJOR(dev))
4263 -#define OUT_OF_DATE KERN_ERR \
4264 -"md: superblock update time inconsistenty -- using the most recent one\n"
4265 +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
4267 + mdk_rdev_t * rdev;
4268 + struct md_list_head *tmp;
4270 -#define OLD_VERSION KERN_ALERT \
4271 -"md: %s: unsupported raid array version %d.%d.%d\n"
4272 + ITERATE_RDEV(mddev,rdev,tmp) {
4273 + if (rdev->desc_nr == nr)
4279 -#define NOT_CLEAN KERN_ERR \
4280 -"md: %s: raid array is not clean -- run ckraid\n"
4281 +mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
4283 + struct md_list_head *tmp;
4286 -#define NOT_CLEAN_IGNORE KERN_ERR \
4287 -"md: %s: raid array is not clean -- reconstructing parity\n"
4288 + ITERATE_RDEV(mddev,rdev,tmp) {
4289 + if (rdev->dev == dev)
4295 -#define UNKNOWN_LEVEL KERN_ERR \
4296 -"md: %s: unsupported raid level %d\n"
4297 +static MD_LIST_HEAD(device_names);
4299 -static int analyze_sbs (int minor, int pnum)
4300 +char * partition_name (kdev_t dev)
4302 - struct md_dev *mddev = md_dev + minor;
4303 - int i, N = mddev->nb_dev, out_of_date = 0;
4304 - struct real_dev * disks = mddev->devices;
4305 - md_superblock_t *sb, *freshest = NULL;
4306 + struct gendisk *hd;
4307 + static char nomem [] = "<nomem>";
4308 + dev_name_t *dname;
4309 + struct md_list_head *tmp = device_names.next;
4312 - * RAID-0 and linear don't use a RAID superblock
4314 - if (pnum == RAID0 >> PERSONALITY_SHIFT ||
4315 - pnum == LINEAR >> PERSONALITY_SHIFT)
4316 - return legacy_raid_sb (minor, pnum);
4317 + while (tmp != &device_names) {
4318 + dname = md_list_entry(tmp, dev_name_t, list);
4319 + if (dname->dev == dev)
4320 + return dname->name;
4324 + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
4329 - * Verify the RAID superblock on each real device
4330 + * ok, add this new device name to the list
4332 - for (i = 0; i < N; i++)
4333 - if (analyze_one_sb(disks+i))
4335 + hd = find_gendisk (dev);
4338 + sprintf (dname->name, "[dev %s]", kdevname(dev));
4340 + disk_name (hd, MINOR(dev), dname->name);
4343 + md_list_add(&dname->list, &device_names);
4345 + return dname->name;
4348 +static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
4351 + unsigned int size = 0;
4353 + if (blk_size[MAJOR(dev)])
4354 + size = blk_size[MAJOR(dev)][MINOR(dev)];
4356 + size = MD_NEW_SIZE_BLOCKS(size);
4360 +static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
4362 + unsigned int size;
4364 + size = calc_dev_sboffset(dev, mddev, persistent);
4369 + if (mddev->sb->chunk_size)
4370 + size &= ~(mddev->sb->chunk_size/1024 - 1);
4375 + * We check whether all devices are numbered from 0 to nb_dev-1. The
4376 + * order is guaranteed even after device name changes.
4378 + * Some personalities (raid0, linear) use this. Personalities that
4379 + * provide data have to be able to deal with loss of individual
4380 + * disks, so they do their checking themselves.
4382 +int md_check_ordering (mddev_t *mddev)
4386 + struct md_list_head *tmp;
4389 - * The superblock constant part has to be the same
4390 - * for all disks in the array.
4391 + * First, all devices must be fully functional
4394 - for (i = 0; i < N; i++) {
4402 - disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) {
4403 - printk (INCONSISTENT);
4404 + ITERATE_RDEV(mddev,rdev,tmp) {
4405 + if (rdev->faulty) {
4406 + printk("md: md%d's device %s faulty, aborting.\n",
4407 + mdidx(mddev), partition_name(rdev->dev));
4413 - * OK, we have all disks and the array is ready to run. Let's
4414 - * find the freshest superblock, that one will be the superblock
4415 - * that represents the whole array.
4417 - if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL)
4419 + ITERATE_RDEV(mddev,rdev,tmp) {
4422 + if (c != mddev->nb_dev) {
4426 - for (i = 0; i < N; i++) {
4430 - freshest = disks[i].sb;
4434 - * Find the newest superblock version
4436 - if (disks[i].sb->utime != freshest->utime) {
4438 - if (disks[i].sb->utime > freshest->utime)
4439 - freshest = disks[i].sb;
4443 - printk(OUT_OF_DATE);
4444 - memcpy (sb, freshest, sizeof(*freshest));
4447 - * Check if we can support this RAID array
4449 - if (sb->major_version != MD_MAJOR_VERSION ||
4450 - sb->minor_version > MD_MINOR_VERSION) {
4452 - printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)),
4453 - sb->major_version, sb->minor_version,
4454 - sb->patch_version);
4455 + if (mddev->nb_dev != mddev->sb->raid_disks) {
4456 + printk("md: md%d, array needs %d disks, has %d, aborting.\n",
4457 + mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
4462 - * We need to add this as a superblock option.
4463 + * Now the numbering check
4465 -#if SUPPORT_RECONSTRUCTION
4466 - if (sb->state != (1 << MD_SB_CLEAN)) {
4467 - if (sb->level == 1) {
4468 - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
4469 + for (i = 0; i < mddev->nb_dev; i++) {
4471 + ITERATE_RDEV(mddev,rdev,tmp) {
4472 + if (rdev->desc_nr == i)
4476 + printk("md: md%d, missing disk #%d, aborting.\n",
4480 - printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor)));
4483 - if (sb->state != (1 << MD_SB_CLEAN)) {
4484 - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
4487 -#endif /* SUPPORT_RECONSTRUCTION */
4489 - switch (sb->level) {
4491 - md_size[minor] = sb->size;
4492 - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD;
4496 - md_size[minor] = sb->size * (sb->raid_disks - 1);
4497 - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1);
4500 - printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)),
4504 + printk("md: md%d, too many disks #%d, aborting.\n",
4515 -#undef INCONSISTENT
4521 -int md_update_sb(int minor)
4522 +static unsigned int zoned_raid_size (mddev_t *mddev)
4524 - struct md_dev *mddev = md_dev + minor;
4525 - struct buffer_head *bh;
4526 - md_superblock_t *sb = mddev->sb;
4527 - struct real_dev *realdev;
4531 + unsigned int mask;
4532 + mdk_rdev_t * rdev;
4533 + struct md_list_head *tmp;
4535 - sb->utime = CURRENT_TIME;
4536 - for (i = 0; i < mddev->nb_dev; i++) {
4537 - realdev = mddev->devices + i;
4540 - dev = realdev->dev;
4541 - sb_offset = realdev->sb_offset;
4542 - set_blocksize(dev, MD_SB_BYTES);
4543 - printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset);
4544 - bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
4546 - sb = (md_superblock_t *) bh->b_data;
4547 - memcpy(sb, mddev->sb, MD_SB_BYTES);
4548 - memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4);
4549 - mark_buffer_uptodate(bh, 1);
4550 - mark_buffer_dirty(bh, 1);
4551 - ll_rw_block(WRITE, 1, &bh);
4552 - wait_on_buffer(bh);
4555 - invalidate_buffers(dev);
4557 - printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
4563 + * do size and offset calculations.
4565 + mask = ~(mddev->sb->chunk_size/1024 - 1);
4566 +printk("mask %08x\n", mask);
4568 + ITERATE_RDEV(mddev,rdev,tmp) {
4569 +printk(" rdev->size: %d\n", rdev->size);
4570 + rdev->size &= mask;
4571 +printk(" masked rdev->size: %d\n", rdev->size);
4572 + md_size[mdidx(mddev)] += rdev->size;
4573 +printk(" new md_size: %d\n", md_size[mdidx(mddev)]);
4578 -static int do_md_run (int minor, int repart)
4579 +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
4581 - int pnum, i, min, factor, err;
4582 + if (disk_active(disk)) {
4583 + sb->working_disks--;
4585 + if (disk_spare(disk)) {
4586 + sb->spare_disks--;
4587 + sb->working_disks--;
4589 + sb->failed_disks--;
4595 + mark_disk_removed(disk);
4598 - if (!md_dev[minor].nb_dev)
4601 - if (md_dev[minor].pers)
4603 +#define BAD_MAGIC KERN_ERR \
4604 +"md: invalid raid superblock magic on %s\n"
4606 - md_dev[minor].repartition=repart;
4608 - if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT))
4609 - >= MAX_PERSONALITY)
4612 - /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
4613 - if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){
4614 - for (i = 0; i < md_dev [minor].nb_dev; i++)
4615 - if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR)
4621 - char module_name[80];
4622 - sprintf (module_name, "md-personality-%d", pnum);
4623 - request_module (module_name);
4629 - factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
4631 - for (i=0; i<md_dev[minor].nb_dev; i++)
4632 - if (md_dev[minor].devices[i].size<min)
4634 - printk ("Dev %s smaller than %dk, cannot shrink\n",
4635 - partition_name (md_dev[minor].devices[i].dev), min);
4639 - for (i=0; i<md_dev[minor].nb_dev; i++) {
4640 - fsync_dev(md_dev[minor].devices[i].dev);
4641 - invalidate_buffers(md_dev[minor].devices[i].dev);
4644 - /* Resize devices according to the factor. It is used to align
4645 - partitions size on a given chunk size. */
4649 - * Analyze the raid superblock
4651 - if (analyze_sbs(minor, pnum))
4653 +#define BAD_MINOR KERN_ERR \
4654 +"md: %s: invalid raid minor (%x)\n"
4656 - md_dev[minor].pers=pers[pnum];
4658 - if ((err=md_dev[minor].pers->run (minor, md_dev+minor)))
4660 - md_dev[minor].pers=NULL;
4661 - free_sb(md_dev + minor);
4665 - if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT)
4667 - md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN);
4668 - md_update_sb(minor);
4671 - /* FIXME : We assume here we have blocks
4672 - that are twice as large as sectors.
4673 - THIS MAY NOT BE TRUE !!! */
4674 - md_hd_struct[minor].start_sect=0;
4675 - md_hd_struct[minor].nr_sects=md_size[minor]<<1;
4677 - read_ahead[MD_MAJOR] = 128;
4680 +#define OUT_OF_MEM KERN_ALERT \
4681 +"md: out of memory.\n"
4683 +#define NO_SB KERN_ERR \
4684 +"md: disabled device %s, could not read superblock.\n"
4686 -static int do_md_stop (int minor, struct inode *inode)
4687 +#define BAD_CSUM KERN_WARNING \
4688 +"md: invalid superblock checksum on %s\n"
4690 +static int alloc_array_sb (mddev_t * mddev)
4694 - if (inode->i_count>1 || md_dev[minor].busy>1) {
4696 - * ioctl : one open channel
4698 - printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
4699 - minor, inode->i_count, md_dev[minor].busy);
4703 - if (md_dev[minor].pers) {
4705 - * It is safe to call stop here, it only frees private
4706 - * data. Also, it tells us if a device is unstoppable
4707 - * (eg. resyncing is in progress)
4709 - if (md_dev[minor].pers->stop (minor, md_dev+minor))
4712 - * The device won't exist anymore -> flush it now
4714 - fsync_dev (inode->i_rdev);
4715 - invalidate_buffers (inode->i_rdev);
4716 - if (md_dev[minor].sb) {
4717 - md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
4718 - md_update_sb(minor);
4725 - /* Remove locks. */
4726 - if (md_dev[minor].sb)
4727 - free_sb(md_dev + minor);
4728 - for (i=0; i<md_dev[minor].nb_dev; i++)
4729 - clear_inode (md_dev[minor].devices[i].inode);
4731 - md_dev[minor].nb_dev=md_size[minor]=0;
4732 - md_hd_struct[minor].nr_sects=0;
4733 - md_dev[minor].pers=NULL;
4735 - read_ahead[MD_MAJOR] = 128;
4739 + mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
4742 + md_clear_page((unsigned long)mddev->sb);
4746 -static int do_md_add (int minor, kdev_t dev)
4747 +static int alloc_disk_sb (mdk_rdev_t * rdev)
4751 - struct real_dev *realdev;
4755 - if (md_dev[minor].nb_dev==MAX_REAL)
4756 + rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
4758 + printk (OUT_OF_MEM);
4761 + md_clear_page((unsigned long)rdev->sb);
4763 - if (!fs_may_mount (dev))
4768 - if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) {
4769 - printk("md_add(): zero device size, huh, bailing out.\n");
4771 +static void free_disk_sb (mdk_rdev_t * rdev)
4774 + free_page((unsigned long) rdev->sb);
4776 + rdev->sb_offset = 0;
4779 + if (!rdev->faulty)
4784 - if (md_dev[minor].pers) {
4786 - * The array is already running, hot-add the drive, or
4789 - if (!md_dev[minor].pers->hot_add_disk)
4793 +static void mark_rdev_faulty (mdk_rdev_t * rdev)
4795 + unsigned long flags;
4801 + save_flags(flags);
4803 + free_disk_sb(rdev);
4805 + restore_flags(flags);
4808 +static int read_disk_sb (mdk_rdev_t * rdev)
4810 + int ret = -EINVAL;
4811 + struct buffer_head *bh = NULL;
4812 + kdev_t dev = rdev->dev;
4822 - * Careful. We cannot increase nb_dev for a running array.
4823 + * Calculate the position of the superblock,
4824 + * it's at the end of the disk
4826 - i=md_dev[minor].nb_dev;
4827 - realdev = &md_dev[minor].devices[i];
4830 - /* Lock the device by inserting a dummy inode. This doesn't
4831 - smell very good, but I need to be consistent with the
4832 - mount stuff, specially with fs_may_mount. If someone have
4833 - a better idea, please help ! */
4835 - realdev->inode=get_empty_inode ();
4836 - realdev->inode->i_dev=dev; /* don't care about other fields */
4837 - insert_inode_hash (realdev->inode);
4839 - /* Sizes are now rounded at run time */
4841 -/* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/
4843 - realdev->size=blk_size[MAJOR(dev)][MINOR(dev)];
4844 + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
4845 + rdev->sb_offset = sb_offset;
4846 + printk("(read) %s's sb offset: %d", partition_name(dev),
4849 + set_blocksize (dev, MD_SB_BYTES);
4850 + bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
4854 + sb = (mdp_super_t *) bh->b_data;
4855 + memcpy (rdev->sb, sb, MD_SB_BYTES);
4857 + printk (NO_SB,partition_name(rdev->dev));
4860 + printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events));
4868 +static unsigned int calc_sb_csum (mdp_super_t * sb)
4870 + unsigned int disk_csum, csum;
4872 + disk_csum = sb->sb_csum;
4874 + csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
4875 + sb->sb_csum = disk_csum;
4880 + * Check one RAID superblock for generic plausibility
4883 +static int check_disk_sb (mdk_rdev_t * rdev)
4886 + int ret = -EINVAL;
4894 + if (sb->md_magic != MD_SB_MAGIC) {
4895 + printk (BAD_MAGIC, partition_name(rdev->dev));
4899 + if (sb->md_minor >= MAX_MD_DEVS) {
4900 + printk (BAD_MINOR, partition_name(rdev->dev),
4905 + if (calc_sb_csum(sb) != sb->sb_csum)
4906 + printk(BAD_CSUM, partition_name(rdev->dev));
4912 +static kdev_t dev_unit(kdev_t dev)
4914 + unsigned int mask;
4915 + struct gendisk *hd = find_gendisk(dev);
4919 + mask = ~((1 << hd->minor_shift) - 1);
4921 + return MKDEV(MAJOR(dev), MINOR(dev) & mask);
4924 +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
4926 + struct md_list_head *tmp;
4929 + ITERATE_RDEV(mddev,rdev,tmp)
4930 + if (dev_unit(rdev->dev) == dev_unit(dev))
4936 +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
4938 + struct md_list_head *tmp;
4941 + ITERATE_RDEV(mddev1,rdev,tmp)
4942 + if (match_dev_unit(mddev2, rdev->dev))
4948 +static MD_LIST_HEAD(all_raid_disks);
4949 +static MD_LIST_HEAD(pending_raid_disks);
4951 +static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
4953 + mdk_rdev_t *same_pdev;
4955 + if (rdev->mddev) {
4959 + same_pdev = match_dev_unit(mddev, rdev->dev);
4961 + printk( KERN_WARNING
4962 +"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
4963 +" protection against single-disk failure might be compromised.\n",
4964 + mdidx(mddev), partition_name(rdev->dev),
4965 + partition_name(same_pdev->dev));
4967 + md_list_add(&rdev->same_set, &mddev->disks);
4968 + rdev->mddev = mddev;
4970 + printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
4973 +static void unbind_rdev_from_array (mdk_rdev_t * rdev)
4975 + if (!rdev->mddev) {
4979 + md_list_del(&rdev->same_set);
4980 + MD_INIT_LIST_HEAD(&rdev->same_set);
4981 + rdev->mddev->nb_dev--;
4982 + printk("unbind<%s,%d>\n", partition_name(rdev->dev),
4983 + rdev->mddev->nb_dev);
4984 + rdev->mddev = NULL;
4988 + * prevent the device from being mounted, repartitioned or
4989 + * otherwise reused by a RAID array (or any other kernel
4990 + * subsystem), by opening the device. [simply getting an
4991 + * inode is not enough, the SCSI module usage code needs
4992 + * an explicit open() on the device]
4994 +static int lock_rdev (mdk_rdev_t *rdev)
4999 + * First insert a dummy inode.
5003 + rdev->inode = get_empty_inode();
5005 + * we dont care about any other fields
5007 + rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev;
5008 + insert_inode_hash(rdev->inode);
5010 + memset(&rdev->filp, 0, sizeof(rdev->filp));
5011 + rdev->filp.f_mode = 3; /* read write */
5012 + err = blkdev_open(rdev->inode, &rdev->filp);
5014 + printk("blkdev_open() failed: %d\n", err);
5015 + clear_inode(rdev->inode);
5016 + rdev->inode = NULL;
5021 +static void unlock_rdev (mdk_rdev_t *rdev)
5023 + blkdev_release(rdev->inode);
5026 + clear_inode(rdev->inode);
5027 + rdev->inode = NULL;
5030 +static void export_rdev (mdk_rdev_t * rdev)
5032 + printk("export_rdev(%s)\n",partition_name(rdev->dev));
5035 + unlock_rdev(rdev);
5036 + free_disk_sb(rdev);
5037 + md_list_del(&rdev->all);
5038 + MD_INIT_LIST_HEAD(&rdev->all);
5039 + if (rdev->pending.next != &rdev->pending) {
5040 + printk("(%s was pending)\n",partition_name(rdev->dev));
5041 + md_list_del(&rdev->pending);
5042 + MD_INIT_LIST_HEAD(&rdev->pending);
5049 +static void kick_rdev_from_array (mdk_rdev_t * rdev)
5051 + unbind_rdev_from_array(rdev);
5052 + export_rdev(rdev);
5055 +static void export_array (mddev_t *mddev)
5057 + struct md_list_head *tmp;
5059 + mdp_super_t *sb = mddev->sb;
5063 + free_page((unsigned long) sb);
5066 + ITERATE_RDEV(mddev,rdev,tmp) {
5067 + if (!rdev->mddev) {
5071 + kick_rdev_from_array(rdev);
5073 + if (mddev->nb_dev)
5082 +static void print_desc(mdp_disk_t *desc)
5084 + printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
5085 + partition_name(MKDEV(desc->major,desc->minor)),
5086 + desc->major,desc->minor,desc->raid_disk,desc->state);
5089 +static void print_sb(mdp_super_t *sb)
5093 + printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
5094 + sb->major_version, sb->minor_version, sb->patch_version,
5095 + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
5097 + printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
5098 + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
5099 + sb->layout, sb->chunk_size);
5100 + printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
5101 + sb->utime, sb->state, sb->active_disks, sb->working_disks,
5102 + sb->failed_disks, sb->spare_disks,
5103 + sb->sb_csum, (unsigned long)get_unaligned(&sb->events));
5105 + for (i = 0; i < MD_SB_DISKS; i++) {
5108 + desc = sb->disks + i;
5109 + printk(" D %2d: ", i);
5112 + printk(" THIS: ");
5113 + print_desc(&sb->this_disk);
5117 +static void print_rdev(mdk_rdev_t *rdev)
5119 + printk(" rdev %s: O:%s, SZ:%08d F:%d DN:%d ",
5120 + partition_name(rdev->dev), partition_name(rdev->old_dev),
5121 + rdev->size, rdev->faulty, rdev->desc_nr);
5123 + printk("rdev superblock:\n");
5124 + print_sb(rdev->sb);
5126 + printk("no rdev superblock!\n");
5129 +void md_print_devices (void)
5131 + struct md_list_head *tmp, *tmp2;
5136 + printk(" **********************************\n");
5137 + printk(" * <COMPLETE RAID STATE PRINTOUT> *\n");
5138 + printk(" **********************************\n");
5139 + ITERATE_MDDEV(mddev,tmp) {
5140 + printk("md%d: ", mdidx(mddev));
5142 + ITERATE_RDEV(mddev,rdev,tmp2)
5143 + printk("<%s>", partition_name(rdev->dev));
5146 + printk(" array superblock:\n");
5147 + print_sb(mddev->sb);
5149 + printk(" no array superblock.\n");
5151 + ITERATE_RDEV(mddev,rdev,tmp2)
5154 + printk(" **********************************\n");
5158 +static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
5161 + mdp_super_t *tmp1, *tmp2;
5163 + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
5164 + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
5166 + if (!tmp1 || !tmp2) {
5175 + * nr_disks is not constant
5177 + tmp1->nr_disks = 0;
5178 + tmp2->nr_disks = 0;
5180 + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
5194 +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
5196 + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
5197 + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
5198 + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
5199 + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
5206 +static mdk_rdev_t * find_rdev_all (kdev_t dev)
5208 + struct md_list_head *tmp;
5211 + tmp = all_raid_disks.next;
5212 + while (tmp != &all_raid_disks) {
5213 + rdev = md_list_entry(tmp, mdk_rdev_t, all);
5214 + if (rdev->dev == dev)
5221 +#define GETBLK_FAILED KERN_ERR \
5222 +"md: getblk failed for device %s\n"
5224 +static int write_disk_sb(mdk_rdev_t * rdev)
5226 + struct buffer_head *bh;
5228 + u32 sb_offset, size;
5235 + if (rdev->faulty) {
5239 + if (rdev->sb->md_magic != MD_SB_MAGIC) {
5245 + sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
5246 + if (rdev->sb_offset != sb_offset) {
5247 + printk("%s's sb offset has changed from %d to %d, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
5251 + * If the disk went offline meanwhile and it's just a spare, then
5252 + * its size has changed to zero silently, and the MD code does
5253 + * not yet know that it's faulty.
5255 + size = calc_dev_size(dev, rdev->mddev, 1);
5256 + if (size != rdev->size) {
5257 + printk("%s's size has changed from %d to %d since import, skipping\n", partition_name(dev), rdev->size, size);
5261 + printk("(write) %s's sb offset: %d\n", partition_name(dev), sb_offset);
5263 + set_blocksize(dev, MD_SB_BYTES);
5264 + bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
5266 + printk(GETBLK_FAILED, partition_name(dev));
5269 + memset(bh->b_data,0,bh->b_size);
5270 + sb = (mdp_super_t *) bh->b_data;
5271 + memcpy(sb, rdev->sb, MD_SB_BYTES);
5273 + mark_buffer_uptodate(bh, 1);
5274 + mark_buffer_dirty(bh, 1);
5275 + ll_rw_block(WRITE, 1, &bh);
5276 + wait_on_buffer(bh);
5282 +#undef GETBLK_FAILED
5284 +static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
5289 + for (i = 0; i < MD_SB_DISKS; i++) {
5290 + desc = mddev->sb->disks + i;
5292 + if (disk_faulty(desc)) {
5293 + if (MKDEV(desc->major,desc->minor) == rdev->dev)
5298 + if (MKDEV(desc->major,desc->minor) == rdev->dev) {
5299 + rdev->sb->this_disk = *desc;
5300 + rdev->desc_nr = desc->number;
5311 +static int sync_sbs(mddev_t * mddev)
5315 + struct md_list_head *tmp;
5317 + ITERATE_RDEV(mddev,rdev,tmp) {
5322 + set_this_disk(mddev, rdev);
5323 + sb->sb_csum = calc_sb_csum(sb);
5328 +int md_update_sb(mddev_t * mddev)
5330 + int first, err, count = 100;
5331 + struct md_list_head *tmp;
5336 + mddev->sb->utime = CURRENT_TIME;
5337 + ev = get_unaligned(&mddev->sb->events);
5339 + put_unaligned(ev,&mddev->sb->events);
5340 + if (ev == (__u64)0) {
5342 + * oops, this 64-bit counter should never wrap.
5343 + * Either we are in around ~1 trillion A.C., assuming
5344 + * 1 reboot per second, or we have a bug:
5348 + put_unaligned(ev,&mddev->sb->events);
5353 + * do not write anything to disk if using
5354 + * nonpersistent superblocks
5356 + if (mddev->sb->not_persistent)
5359 + printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
5364 + ITERATE_RDEV(mddev,rdev,tmp) {
5370 + printk("(skipping faulty ");
5371 + printk("%s ", partition_name(rdev->dev));
5372 + if (!rdev->faulty) {
5373 + printk("[events: %08lx]",
5374 + (unsigned long)get_unaligned(&rdev->sb->events));
5375 + err += write_disk_sb(rdev);
5381 +			printk("errors occurred during superblock update, repeating\n");
5384 +			printk("excessive errors occurred during superblock update, exiting\n");
5390 + * Import a device. If 'on_disk', then sanity check the superblock
5392 + * mark the device faulty if:
5394 + * - the device is nonexistent (zero size)
5395 + * - the device has no valid superblock
5397 + * a faulty rdev _never_ has rdev->sb set.
5399 +static int md_import_device (kdev_t newdev, int on_disk)
5403 + unsigned int size;
5405 + if (find_rdev_all(newdev))
5408 + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
5410 + printk("could not alloc mem for %s!\n", partition_name(newdev));
5413 + memset(rdev, 0, sizeof(*rdev));
5415 + if (!fs_may_mount(newdev)) {
5416 + printk("md: can not import %s, has active inodes!\n",
5417 + partition_name(newdev));
5422 + if ((err = alloc_disk_sb(rdev)))
5425 + rdev->dev = newdev;
5426 + if (lock_rdev(rdev)) {
5427 + printk("md: could not lock %s, zero-size? Marking faulty.\n",
5428 + partition_name(newdev));
5432 + rdev->desc_nr = -1;
5436 + if (blk_size[MAJOR(newdev)])
5437 + size = blk_size[MAJOR(newdev)][MINOR(newdev)];
5439 + printk("md: %s has zero size, marking faulty!\n",
5440 + partition_name(newdev));
5446 + if ((err = read_disk_sb(rdev))) {
5447 + printk("md: could not read %s's sb, not importing!\n",
5448 + partition_name(newdev));
5451 + if ((err = check_disk_sb(rdev))) {
5452 + printk("md: %s has invalid sb, not importing!\n",
5453 + partition_name(newdev));
5457 + rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
5458 + rdev->sb->this_disk.minor);
5459 + rdev->desc_nr = rdev->sb->this_disk.number;
5461 + md_list_add(&rdev->all, &all_raid_disks);
5462 + MD_INIT_LIST_HEAD(&rdev->pending);
5464 + if (rdev->faulty && rdev->sb)
5465 + free_disk_sb(rdev);
5471 + unlock_rdev(rdev);
5472 + free_disk_sb(rdev);
5479 + * Check a full RAID array for plausibility
5482 +#define INCONSISTENT KERN_ERR \
5483 +"md: fatal superblock inconsistency in %s -- removing from array\n"
5485 +#define OUT_OF_DATE KERN_ERR \
5486 +"md: superblock update time inconsistency -- using the most recent one\n"
5488 +#define OLD_VERSION KERN_ALERT \
5489 +"md: md%d: unsupported raid array version %d.%d.%d\n"
5491 +#define NOT_CLEAN_IGNORE KERN_ERR \
5492 +"md: md%d: raid array is not clean -- starting background reconstruction\n"
5494 +#define UNKNOWN_LEVEL KERN_ERR \
5495 +"md: md%d: unsupported raid level %d\n"
5497 +static int analyze_sbs (mddev_t * mddev)
5499 + int out_of_date = 0, i;
5500 + struct md_list_head *tmp, *tmp2;
5501 + mdk_rdev_t *rdev, *rdev2, *freshest;
5505 + * Verify the RAID superblock on each real device
5507 + ITERATE_RDEV(mddev,rdev,tmp) {
5508 + if (rdev->faulty) {
5516 + if (check_disk_sb(rdev))
5521 + * The superblock constant part has to be the same
5522 + * for all disks in the array.
5526 + ITERATE_RDEV(mddev,rdev,tmp) {
5531 + if (!sb_equal(sb, rdev->sb)) {
5532 + printk (INCONSISTENT, partition_name(rdev->dev));
5533 + kick_rdev_from_array(rdev);
5539 + * OK, we have all disks and the array is ready to run. Let's
5540 + * find the freshest superblock, that one will be the superblock
5541 + * that represents the whole array.
5544 + if (alloc_array_sb(mddev))
5549 + ITERATE_RDEV(mddev,rdev,tmp) {
5552 + * if the checksum is invalid, use the superblock
5553 +		 * only as a last resort. (decrease its age by
5556 + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
5557 + __u64 ev = get_unaligned(&rdev->sb->events);
5558 + if (ev != (__u64)0) {
5560 + put_unaligned(ev,&rdev->sb->events);
5564 + printk("%s's event counter: %08lx\n", partition_name(rdev->dev),
5565 + (unsigned long)get_unaligned(&rdev->sb->events));
5571 + * Find the newest superblock version
5573 + ev1 = get_unaligned(&rdev->sb->events);
5574 + ev2 = get_unaligned(&freshest->sb->events);
5581 + if (out_of_date) {
5582 + printk(OUT_OF_DATE);
5583 + printk("freshest: %s\n", partition_name(freshest->dev));
5585 + memcpy (sb, freshest->sb, sizeof(*sb));
5588 + * at this point we have picked the 'best' superblock
5589 + * from all available superblocks.
5590 + * now we validate this superblock and kick out possibly
5593 + ITERATE_RDEV(mddev,rdev,tmp) {
5595 + * Kick all non-fresh devices faulty
5598 + ev1 = get_unaligned(&rdev->sb->events);
5599 + ev2 = get_unaligned(&sb->events);
5602 + printk("md: kicking non-fresh %s from array!\n",
5603 + partition_name(rdev->dev));
5604 + kick_rdev_from_array(rdev);
5610 + * Fix up changed device names ... but only if this disk has a
5611 + * recent update time. Use faulty checksum ones too.
5613 + ITERATE_RDEV(mddev,rdev,tmp) {
5614 + __u64 ev1, ev2, ev3;
5615 + if (rdev->faulty) { /* REMOVEME */
5619 + ev1 = get_unaligned(&rdev->sb->events);
5620 + ev2 = get_unaligned(&sb->events);
5623 + if ((rdev->dev != rdev->old_dev) &&
5624 + ((ev1 == ev2) || (ev1 == ev3))) {
5627 + printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev));
5628 + if (rdev->desc_nr == -1) {
5632 + desc = &sb->disks[rdev->desc_nr];
5633 + if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
5637 + desc->major = MAJOR(rdev->dev);
5638 + desc->minor = MINOR(rdev->dev);
5639 + desc = &rdev->sb->this_disk;
5640 + desc->major = MAJOR(rdev->dev);
5641 + desc->minor = MINOR(rdev->dev);
5646 + * Remove unavailable and faulty devices ...
5648 + * note that if an array becomes completely unrunnable due to
5649 + * missing devices, we do not write the superblock back, so the
5650 + * administrator has a chance to fix things up. The removal thus
5651 + * only happens if it's nonfatal to the contents of the array.
5653 + for (i = 0; i < MD_SB_DISKS; i++) {
5658 + desc = sb->disks + i;
5659 + dev = MKDEV(desc->major, desc->minor);
5662 + * We kick faulty devices/descriptors immediately.
5664 + if (disk_faulty(desc)) {
5666 + ITERATE_RDEV(mddev,rdev,tmp) {
5667 + if (rdev->desc_nr != desc->number)
5669 + printk("md%d: kicking faulty %s!\n",
5670 + mdidx(mddev),partition_name(rdev->dev));
5671 + kick_rdev_from_array(rdev);
5676 + if (dev == MKDEV(0,0))
5678 + printk("md%d: removing former faulty %s!\n",
5679 + mdidx(mddev), partition_name(dev));
5681 + remove_descriptor(desc, sb);
5685 + if (dev == MKDEV(0,0))
5688 + * Is this device present in the rdev ring?
5691 + ITERATE_RDEV(mddev,rdev,tmp) {
5692 + if (rdev->desc_nr == desc->number) {
5700 + printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev));
5701 + remove_descriptor(desc, sb);
5705 +	 * Double check whether all devices mentioned in the
5706 + * superblock are in the rdev ring.
5708 + for (i = 0; i < MD_SB_DISKS; i++) {
5712 + desc = sb->disks + i;
5713 + dev = MKDEV(desc->major, desc->minor);
5715 + if (dev == MKDEV(0,0))
5718 + if (disk_faulty(desc)) {
5723 + rdev = find_rdev(mddev, dev);
5731 + * Do a final reality check.
5733 + ITERATE_RDEV(mddev,rdev,tmp) {
5734 + if (rdev->desc_nr == -1) {
5739 + * is the desc_nr unique?
5741 + ITERATE_RDEV(mddev,rdev2,tmp2) {
5742 + if ((rdev2 != rdev) &&
5743 + (rdev2->desc_nr == rdev->desc_nr)) {
5749 + * is the device unique?
5751 + ITERATE_RDEV(mddev,rdev2,tmp2) {
5752 + if ((rdev2 != rdev) &&
5753 + (rdev2->dev == rdev->dev)) {
5761 + * Check if we can support this RAID array
5763 + if (sb->major_version != MD_MAJOR_VERSION ||
5764 + sb->minor_version > MD_MINOR_VERSION) {
5766 + printk (OLD_VERSION, mdidx(mddev), sb->major_version,
5767 + sb->minor_version, sb->patch_version);
5771 + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
5772 + (sb->level == 4) || (sb->level == 5)))
5773 + printk (NOT_CLEAN_IGNORE, mdidx(mddev));
5780 +#undef INCONSISTENT
5785 +static int device_size_calculation (mddev_t * mddev)
5787 + int data_disks = 0, persistent;
5788 + unsigned int readahead;
5789 + mdp_super_t *sb = mddev->sb;
5790 + struct md_list_head *tmp;
5794 + * Do device size calculation. Bail out if too small.
5795 + * (we have to do this after having validated chunk_size,
5796 + * because device size has to be modulo chunk_size)
5798 + persistent = !mddev->sb->not_persistent;
5799 + ITERATE_RDEV(mddev,rdev,tmp) {
5806 + rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
5807 + if (rdev->size < sb->chunk_size / 1024) {
5808 + printk (KERN_WARNING
5809 + "Dev %s smaller than chunk_size: %dk < %dk\n",
5810 + partition_name(rdev->dev),
5811 + rdev->size, sb->chunk_size / 1024);
5816 + switch (sb->level) {
5824 + zoned_raid_size(mddev);
5828 + zoned_raid_size(mddev);
5829 + data_disks = sb->raid_disks;
5836 + data_disks = sb->raid_disks-1;
5839 + printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
5842 + if (!md_size[mdidx(mddev)])
5843 + md_size[mdidx(mddev)] = sb->size * data_disks;
5845 + readahead = MD_READAHEAD;
5846 + if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5))
5847 + readahead = mddev->sb->chunk_size * 4 * data_disks;
5848 + if (readahead < data_disks * MAX_SECTORS*512*2)
5849 + readahead = data_disks * MAX_SECTORS*512*2;
5851 + if (sb->level == -3)
5854 + md_maxreadahead[mdidx(mddev)] = readahead;
5856 + printk(KERN_INFO "md%d: max total readahead window set to %dk\n",
5857 + mdidx(mddev), readahead/1024);
5860 + "md%d: %d data-disks, max readahead per data-disk: %dk\n",
5861 + mdidx(mddev), data_disks, readahead/data_disks/1024);
5868 +#define TOO_BIG_CHUNKSIZE KERN_ERR \
5869 +"too big chunk_size: %d > %d\n"
5871 +#define TOO_SMALL_CHUNKSIZE KERN_ERR \
5872 +"too small chunk_size: %d < %ld\n"
5874 +#define BAD_CHUNKSIZE KERN_ERR \
5875 +"no chunksize specified, see 'man raidtab'\n"
5877 +static int do_md_run (mddev_t * mddev)
5881 + struct md_list_head *tmp;
5885 + if (!mddev->nb_dev) {
5894 + * Resize disks to align partitions size on a given
5897 + md_size[mdidx(mddev)] = 0;
5900 + * Analyze all RAID superblock(s)
5902 + if (analyze_sbs(mddev)) {
5907 + chunk_size = mddev->sb->chunk_size;
5908 + pnum = level_to_pers(mddev->sb->level);
5910 + mddev->param.chunk_size = chunk_size;
5911 + mddev->param.personality = pnum;
5913 + if (chunk_size > MAX_CHUNK_SIZE) {
5914 + printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
5918 + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
5920 + if ( (1 << ffz(~chunk_size)) != chunk_size) {
5924 + if (chunk_size < PAGE_SIZE) {
5925 + printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
5929 + if (pnum >= MAX_PERSONALITY) {
5934 + if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
5936 + * 'default chunksize' in the old md code used to
5937 + * be PAGE_SIZE, baaad.
5938 +		 * we abort here to be on the safe side. We don't
5939 + * want to continue the bad practice.
5941 + printk(BAD_CHUNKSIZE);
5948 + char module_name[80];
5949 + sprintf (module_name, "md-personality-%d", pnum);
5950 + request_module (module_name);
5956 + if (device_size_calculation(mddev))
5960 + * Drop all container device buffers, from now on
5961 + * the only valid external interface is through the md
5964 + ITERATE_RDEV(mddev,rdev,tmp) {
5967 + fsync_dev(rdev->dev);
5968 + invalidate_buffers(rdev->dev);
5971 + mddev->pers = pers[pnum];
5973 + err = mddev->pers->run(mddev);
5975 + printk("pers->run() failed ...\n");
5976 + mddev->pers = NULL;
5980 + mddev->sb->state &= ~(1 << MD_SB_CLEAN);
5981 + md_update_sb(mddev);
5984 + * md_size has units of 1K blocks, which are
5985 + * twice as large as sectors.
5987 + md_hd_struct[mdidx(mddev)].start_sect = 0;
5988 + md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1;
5990 + read_ahead[MD_MAJOR] = 1024;
5994 +#undef TOO_BIG_CHUNKSIZE
5995 +#undef BAD_CHUNKSIZE
5997 +#define OUT(x) do { err = (x); goto out; } while (0)
5999 +static int restart_array (mddev_t *mddev)
6004 + * Complain if it has no devices
6006 + if (!mddev->nb_dev)
6009 + if (mddev->pers) {
6014 + set_device_ro(mddev_to_kdev(mddev), 0);
6017 + "md%d switched to read-write mode.\n", mdidx(mddev));
6019 + * Kick recovery or resync if necessary
6021 + md_recover_arrays();
6022 + if (mddev->pers->restart_resync)
6023 + mddev->pers->restart_resync(mddev);
6031 +#define STILL_MOUNTED KERN_WARNING \
6032 +"md: md%d still mounted.\n"
6034 +static int do_md_stop (mddev_t * mddev, int ro)
6036 + int err = 0, resync_interrupted = 0;
6037 + kdev_t dev = mddev_to_kdev(mddev);
6039 + if (!ro && !fs_may_mount (dev)) {
6040 + printk (STILL_MOUNTED, mdidx(mddev));
6045 + * complain if it's already stopped
6047 + if (!mddev->nb_dev)
6050 + if (mddev->pers) {
6052 + * It is safe to call stop here, it only frees private
6053 + * data. Also, it tells us if a device is unstoppable
6054 + * (eg. resyncing is in progress)
6056 + if (mddev->pers->stop_resync)
6057 + if (mddev->pers->stop_resync(mddev))
6058 + resync_interrupted = 1;
6060 + if (mddev->recovery_running)
6061 + md_interrupt_thread(md_recovery_thread);
6064 + * This synchronizes with signal delivery to the
6065 + * resync or reconstruction thread. It also nicely
6066 + * hangs the process if some reconstruction has not
6069 + down(&mddev->recovery_sem);
6070 + up(&mddev->recovery_sem);
6073 + * sync and invalidate buffers because we cannot kill the
6074 + * main thread with valid IO transfers still around.
6075 + * the kernel lock protects us from new requests being
6076 + * added after invalidate_buffers().
6078 + fsync_dev (mddev_to_kdev(mddev));
6080 + invalidate_buffers (dev);
6088 + set_device_ro(dev, 0);
6089 + if (mddev->pers->stop(mddev)) {
6091 + set_device_ro(dev, 1);
6099 + * mark it clean only if there was no resync
6102 + if (!mddev->recovery_running && !resync_interrupted) {
6103 + printk("marking sb clean...\n");
6104 + mddev->sb->state |= 1 << MD_SB_CLEAN;
6106 + md_update_sb(mddev);
6109 + set_device_ro(dev, 1);
6113 + * Free resources if final stop
6116 + export_array(mddev);
6117 + md_size[mdidx(mddev)] = 0;
6118 + md_hd_struct[mdidx(mddev)].nr_sects = 0;
6119 + free_mddev(mddev);
6121 + printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
6124 + "md%d switched to read-only mode.\n", mdidx(mddev));
6132 + * We have to safely support old arrays too.
6134 +int detect_old_array (mdp_super_t *sb)
6136 + if (sb->major_version > 0)
6138 + if (sb->minor_version >= 90)
6145 +static void autorun_array (mddev_t *mddev)
6148 + struct md_list_head *tmp;
6151 + if (mddev->disks.prev == &mddev->disks) {
6156 + printk("running: ");
6158 + ITERATE_RDEV(mddev,rdev,tmp) {
6159 + printk("<%s>", partition_name(rdev->dev));
6161 + printk("\nnow!\n");
6163 + err = do_md_run (mddev);
6165 + printk("do_md_run() returned %d\n", err);
6167 + * prevent the writeback of an unrunnable array
6169 + mddev->sb_dirty = 0;
6170 + do_md_stop (mddev, 0);
6175 + * let's try to run arrays based on all disks that have arrived
6176 + * until now. (those are in the ->pending list)
6178 + * the method: pick the first pending disk, collect all disks with
6179 + * the same UUID, remove all from the pending list and put them into
6180 + * the 'same_array' list. Then order this list based on superblock
6181 + * update time (freshest comes first), kick out 'old' disks and
6182 + * compare superblocks. If everything's fine then run it.
6184 +static void autorun_devices (void)
6186 + struct md_list_head candidates;
6187 + struct md_list_head *tmp;
6188 + mdk_rdev_t *rdev0, *rdev;
6193 + printk("autorun ...\n");
6194 + while (pending_raid_disks.next != &pending_raid_disks) {
6195 + rdev0 = md_list_entry(pending_raid_disks.next,
6196 + mdk_rdev_t, pending);
6198 + printk("considering %s ...\n", partition_name(rdev0->dev));
6199 + MD_INIT_LIST_HEAD(&candidates);
6200 + ITERATE_RDEV_PENDING(rdev,tmp) {
6201 + if (uuid_equal(rdev0, rdev)) {
6202 + if (!sb_equal(rdev0->sb, rdev->sb)) {
6203 + printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev));
6206 + printk(" adding %s ...\n", partition_name(rdev->dev));
6207 + md_list_del(&rdev->pending);
6208 + md_list_add(&rdev->pending, &candidates);
6212 - * Check the superblock for consistency.
6213 - * The personality itself has to check whether it's getting
6214 - * added with the proper flags. The personality has to be
6216 + * now we have a set of devices, with all of them having
6217 + * mostly sane superblocks. It's time to allocate the
6220 - if (analyze_one_sb (realdev))
6221 + md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
6222 + mddev = kdev_to_mddev(md_kdev);
6224 + printk("md%d already running, cannot run %s\n",
6225 + mdidx(mddev), partition_name(rdev0->dev));
6226 + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
6227 + export_rdev(rdev);
6230 + mddev = alloc_mddev(md_kdev);
6231 + printk("created md%d\n", mdidx(mddev));
6232 + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
6233 + bind_rdev_to_array(rdev, mddev);
6234 + md_list_del(&rdev->pending);
6235 + MD_INIT_LIST_HEAD(&rdev->pending);
6237 + autorun_array(mddev);
6239 + printk("... autorun DONE.\n");
6243 + * import RAID devices based on one partition
6244 + * if possible, the array gets run as well.
6247 +#define BAD_VERSION KERN_ERR \
6248 +"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
6250 +#define OUT_OF_MEM KERN_ALERT \
6251 +"md: out of memory.\n"
6253 +#define NO_DEVICE KERN_ERR \
6254 +"md: disabled device %s\n"
6256 +#define AUTOADD_FAILED KERN_ERR \
6257 +"md: auto-adding devices to md%d FAILED (error %d).\n"
6259 +#define AUTOADD_FAILED_USED KERN_ERR \
6260 +"md: cannot auto-add device %s to md%d, already used.\n"
6262 +#define AUTORUN_FAILED KERN_ERR \
6263 +"md: auto-running md%d FAILED (error %d).\n"
6265 +#define MDDEV_BUSY KERN_ERR \
6266 +"md: cannot auto-add to md%d, already running.\n"
6268 +#define AUTOADDING KERN_INFO \
6269 +"md: auto-adding devices to md%d, based on %s's superblock.\n"
6271 +#define AUTORUNNING KERN_INFO \
6272 +"md: auto-running md%d.\n"
6274 +static int autostart_array (kdev_t startdev)
6276 + int err = -EINVAL, i;
6277 + mdp_super_t *sb = NULL;
6278 + mdk_rdev_t *start_rdev = NULL, *rdev;
6280 + if (md_import_device(startdev, 1)) {
6281 + printk("could not import %s!\n", partition_name(startdev));
6285 + start_rdev = find_rdev_all(startdev);
6286 + if (!start_rdev) {
6290 + if (start_rdev->faulty) {
6291 + printk("can not autostart based on faulty %s!\n",
6292 + partition_name(startdev));
6295 + md_list_add(&start_rdev->pending, &pending_raid_disks);
6297 + sb = start_rdev->sb;
6299 + err = detect_old_array(sb);
6301 + printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
6305 + for (i = 0; i < MD_SB_DISKS; i++) {
6309 + desc = sb->disks + i;
6310 + dev = MKDEV(desc->major, desc->minor);
6312 + if (dev == MKDEV(0,0))
6314 + if (dev == startdev)
6316 + if (md_import_device(dev, 1)) {
6317 + printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev));
6320 + rdev = find_rdev_all(dev);
6325 + md_list_add(&rdev->pending, &pending_raid_disks);
6329 + * possibly return codes
6331 + autorun_devices();
6336 + export_rdev(start_rdev);
6343 +#undef AUTOADD_FAILED_USED
6344 +#undef AUTOADD_FAILED
6345 +#undef AUTORUN_FAILED
6353 +} raid_setup_args md__initdata = { 0, 0 };
6356 + * Searches all registered partitions for autorun RAID arrays
6359 +md__initfunc(void autodetect_raid(void))
6361 +#ifdef CONFIG_AUTODETECT_RAID
6362 + struct gendisk *disk;
6366 + if (raid_setup_args.noautodetect) {
6367 + printk(KERN_INFO "skipping autodetection of RAID arrays\n");
6370 + printk(KERN_INFO "autodetecting RAID arrays\n");
6372 + for (disk = gendisk_head ; disk ; disk = disk->next) {
6373 + for (i = 0; i < disk->max_p*disk->max_nr; i++) {
6374 + kdev_t dev = MKDEV(disk->major,i);
6376 + if (disk->part[i].type == LINUX_OLD_RAID_PARTITION) {
6378 +"md: %s's partition type has to be changed from type 0x86 to type 0xfd\n"
6379 +" to maintain interoperability with other OSs! Autodetection support for\n"
6380 +" type 0x86 will be deleted after some migration timeout. Sorry.\n",
6381 + partition_name(dev));
6382 + disk->part[i].type = LINUX_RAID_PARTITION;
6384 + if (disk->part[i].type != LINUX_RAID_PARTITION)
6387 + if (md_import_device(dev,1)) {
6388 + printk(KERN_ALERT "could not import %s!\n",
6389 + partition_name(dev));
6395 + rdev = find_rdev_all(dev);
6400 + if (rdev->faulty) {
6404 + md_list_add(&rdev->pending, &pending_raid_disks);
6408 + autorun_devices();
6412 +static int get_version (void * arg)
6414 + mdu_version_t ver;
6416 + ver.major = MD_MAJOR_VERSION;
6417 + ver.minor = MD_MINOR_VERSION;
6418 + ver.patchlevel = MD_PATCHLEVEL_VERSION;
6420 + if (md_copy_to_user(arg, &ver, sizeof(ver)))
6426 +#define SET_FROM_SB(x) info.x = mddev->sb->x
6427 +static int get_array_info (mddev_t * mddev, void * arg)
6429 + mdu_array_info_t info;
6434 + SET_FROM_SB(major_version);
6435 + SET_FROM_SB(minor_version);
6436 + SET_FROM_SB(patch_version);
6437 + SET_FROM_SB(ctime);
6438 + SET_FROM_SB(level);
6439 + SET_FROM_SB(size);
6440 + SET_FROM_SB(nr_disks);
6441 + SET_FROM_SB(raid_disks);
6442 + SET_FROM_SB(md_minor);
6443 + SET_FROM_SB(not_persistent);
6445 + SET_FROM_SB(utime);
6446 + SET_FROM_SB(state);
6447 + SET_FROM_SB(active_disks);
6448 + SET_FROM_SB(working_disks);
6449 + SET_FROM_SB(failed_disks);
6450 + SET_FROM_SB(spare_disks);
6452 + SET_FROM_SB(layout);
6453 + SET_FROM_SB(chunk_size);
6455 + if (md_copy_to_user(arg, &info, sizeof(info)))
6462 +#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
6463 +static int get_disk_info (mddev_t * mddev, void * arg)
6465 + mdu_disk_info_t info;
6471 + if (md_copy_from_user(&info, arg, sizeof(info)))
6475 + if (nr >= mddev->sb->nr_disks)
6478 + SET_FROM_SB(major);
6479 + SET_FROM_SB(minor);
6480 + SET_FROM_SB(raid_disk);
6481 + SET_FROM_SB(state);
6483 + if (md_copy_to_user(arg, &info, sizeof(info)))
6490 +#define SET_SB(x) mddev->sb->disks[nr].x = info.x
6492 +static int add_new_disk (mddev_t * mddev, void * arg)
6494 + int err, size, persistent;
6495 + mdu_disk_info_t info;
6503 + if (md_copy_from_user(&info, arg, sizeof(info)))
6507 + if (nr >= mddev->sb->nr_disks)
6510 + dev = MKDEV(info.major,info.minor);
6512 + if (find_rdev_all(dev)) {
6513 + printk("device %s already used in a RAID array!\n",
6514 + partition_name(dev));
6521 + SET_SB(raid_disk);
6524 + if ((info.state & (1<<MD_DISK_FAULTY))==0) {
6525 + err = md_import_device (dev, 0);
6527 + printk("md: error, md_import_device() returned %d\n", err);
6530 + rdev = find_rdev_all(dev);
6536 + rdev->old_dev = dev;
6537 + rdev->desc_nr = info.number;
6539 + bind_rdev_to_array(rdev, mddev);
6541 + persistent = !mddev->sb->not_persistent;
6543 + printk("nonpersistent superblock ...\n");
6544 + if (!mddev->sb->chunk_size)
6545 + printk("no chunksize?\n");
6547 + size = calc_dev_size(dev, mddev, persistent);
6548 + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
6550 + if (!mddev->sb->size || (mddev->sb->size > size))
6551 + mddev->sb->size = size;
6555 + * sync all other superblocks with the main superblock
6563 +static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
6572 + printk("trying to remove %s from md%d ... \n",
6573 + partition_name(dev), mdidx(mddev));
6575 + if (!mddev->pers->diskop) {
6576 + printk("md%d: personality does not support diskops!\n",
6581 + rdev = find_rdev(mddev, dev);
6585 + if (rdev->desc_nr == -1) {
6589 + disk = &mddev->sb->disks[rdev->desc_nr];
6590 + if (disk_active(disk))
6592 + if (disk_removed(disk)) {
6597 + err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
6598 + if (err == -EBUSY)
6605 + remove_descriptor(disk, mddev->sb);
6606 + kick_rdev_from_array(rdev);
6607 + mddev->sb_dirty = 1;
6608 + md_update_sb(mddev);
6612 + printk("cannot remove active disk %s from md%d ... \n",
6613 + partition_name(dev), mdidx(mddev));
6617 +static int hot_add_disk (mddev_t * mddev, kdev_t dev)
6619 + int i, err, persistent;
6620 + unsigned int size;
6627 + printk("trying to hot-add %s to md%d ... \n",
6628 + partition_name(dev), mdidx(mddev));
6630 + if (!mddev->pers->diskop) {
6631 + printk("md%d: personality does not support diskops!\n",
6636 + persistent = !mddev->sb->not_persistent;
6637 + size = calc_dev_size(dev, mddev, persistent);
6639 + if (size < mddev->sb->size) {
6640 + printk("md%d: disk size %d blocks < array size %d\n",
6641 + mdidx(mddev), size, mddev->sb->size);
6645 + rdev = find_rdev(mddev, dev);
6649 + err = md_import_device (dev, 0);
6651 + printk("md: error, md_import_device() returned %d\n", err);
6654 + rdev = find_rdev_all(dev);
6659 + if (rdev->faulty) {
6660 + printk("md: can not hot-add faulty %s disk to md%d!\n",
6661 + partition_name(dev), mdidx(mddev));
6663 + goto abort_export;
6665 + bind_rdev_to_array(rdev, mddev);
6668 + * The rest should better be atomic, we can have disk failures
6669 + * noticed in interrupt contexts ...
6672 + rdev->old_dev = dev;
6673 + rdev->size = size;
6674 + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
6676 + disk = mddev->sb->disks + mddev->sb->raid_disks;
6677 + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
6678 + disk = mddev->sb->disks + i;
6680 + if (!disk->major && !disk->minor)
6682 + if (disk_removed(disk))
6685 + if (i == MD_SB_DISKS) {
6687 + printk("md%d: can not hot-add to full array!\n", mdidx(mddev));
6689 + goto abort_unbind_export;
6692 + if (disk_removed(disk)) {
6694 - * hot_add has to bump up nb_dev itself
6697 - if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) {
6699 - * FIXME: here we should free up the inode and stuff
6701 - printk ("FIXME\n");
6703 + if (disk->number != i) {
6707 + goto abort_unbind_export;
6710 - md_dev[minor].nb_dev++;
6715 - printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
6717 + disk->raid_disk = disk->number;
6718 + disk->major = MAJOR(dev);
6719 + disk->minor = MINOR(dev);
6721 + if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
6725 + goto abort_unbind_export;
6728 + mark_disk_spare(disk);
6729 + mddev->sb->nr_disks++;
6730 + mddev->sb->spare_disks++;
6731 + mddev->sb->working_disks++;
6733 + mddev->sb_dirty = 1;
6736 + md_update_sb(mddev);
6739 + * Kick recovery, maybe this spare has to be added to the
6740 + * array immediately.
6742 + md_recover_arrays();
6746 +abort_unbind_export:
6747 + unbind_rdev_from_array(rdev);
6750 + export_rdev(rdev);
6754 +#define SET_SB(x) mddev->sb->x = info.x
6755 +static int set_array_info (mddev_t * mddev, void * arg)
6757 + mdu_array_info_t info;
6760 + printk("array md%d already has a superblock!\n",
6765 + if (md_copy_from_user(&info, arg, sizeof(info)))
6768 + if (alloc_array_sb(mddev))
6771 + mddev->sb->major_version = MD_MAJOR_VERSION;
6772 + mddev->sb->minor_version = MD_MINOR_VERSION;
6773 + mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
6774 + mddev->sb->ctime = CURRENT_TIME;
6779 + SET_SB(raid_disks);
6781 + SET_SB(not_persistent);
6784 + SET_SB(active_disks);
6785 + SET_SB(working_disks);
6786 + SET_SB(failed_disks);
6787 + SET_SB(spare_disks);
6790 + SET_SB(chunk_size);
6792 + mddev->sb->md_magic = MD_SB_MAGIC;
6795 + * Generate a 128 bit UUID
6797 + get_random_bytes(&mddev->sb->set_uuid0, 4);
6798 + get_random_bytes(&mddev->sb->set_uuid1, 4);
6799 + get_random_bytes(&mddev->sb->set_uuid2, 4);
6800 + get_random_bytes(&mddev->sb->set_uuid3, 4);
6806 +static int set_disk_info (mddev_t * mddev, void * arg)
6808 + printk("not yet");
6812 +static int clear_array (mddev_t * mddev)
6814 + printk("not yet");
6818 +static int write_raid_info (mddev_t * mddev)
6820 + printk("not yet");
6824 +static int protect_array (mddev_t * mddev)
6826 + printk("not yet");
6830 +static int unprotect_array (mddev_t * mddev)
6832 + printk("not yet");
6836 +static int set_disk_faulty (mddev_t *mddev, kdev_t dev)
6840 + fsync_dev(mddev_to_kdev(mddev));
6841 + ret = md_error(mddev_to_kdev(mddev), dev);
6845 static int md_ioctl (struct inode *inode, struct file *file,
6846 unsigned int cmd, unsigned long arg)
6849 - struct hd_geometry *loc = (struct hd_geometry *) arg;
6850 + unsigned int minor;
6852 + struct hd_geometry *loc = (struct hd_geometry *) arg;
6853 + mddev_t *mddev = NULL;
6856 - if (!capable(CAP_SYS_ADMIN))
6858 + if (!md_capable_admin())
6861 - if (((minor=MINOR(inode->i_rdev)) & 0x80) &&
6862 - (minor & 0x7f) < MAX_PERSONALITY &&
6863 - pers[minor & 0x7f] &&
6864 - pers[minor & 0x7f]->ioctl)
6865 - return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg));
6867 - if (minor >= MAX_MD_DEV)
6869 + dev = inode->i_rdev;
6870 + minor = MINOR(dev);
6871 + if (minor >= MAX_MD_DEVS)
6876 - case REGISTER_DEV:
6877 - return do_md_add (minor, to_kdev_t ((dev_t) arg));
6879 + * Commands dealing with the RAID driver but not any
6880 + * particular array:
6884 + case RAID_VERSION:
6885 + err = get_version((void *)arg);
6888 + case PRINT_RAID_DEBUG:
6890 + md_print_devices();
6893 + case BLKGETSIZE: /* Return device size */
6898 + err = md_put_user(md_hd_struct[minor].nr_sects,
6903 - return do_md_run (minor, (int) arg);
6906 + invalidate_buffers(dev);
6910 - return do_md_stop (minor, inode);
6912 - case BLKGETSIZE: /* Return device size */
6913 - if (!arg) return -EINVAL;
6914 - err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg);
6920 - fsync_dev (inode->i_rdev);
6921 - invalidate_buffers (inode->i_rdev);
6927 - read_ahead[MAJOR(inode->i_rdev)] = arg;
6931 - if (!arg) return -EINVAL;
6932 - err = put_user (read_ahead[MAJOR(inode->i_rdev)], (long *) arg);
6937 - /* We have a problem here : there is no easy way to give a CHS
6938 - virtual geometry. We currently pretend that we have a 2 heads
6939 - 4 sectors (with a BIG number of cylinders...). This drives dosfs
6940 - just mad... ;-) */
6943 - if (!loc) return -EINVAL;
6944 - err = put_user (2, (char *) &loc->heads);
6947 - err = put_user (4, (char *) &loc->sectors);
6950 - err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders);
6953 - err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect,
6954 - (long *) &loc->start);
6959 - RO_IOCTLS(inode->i_rdev,arg);
6965 + read_ahead[MAJOR(dev)] = arg;
6976 + err = md_put_user (read_ahead[
6977 + MAJOR(dev)], (long *) arg);
6983 + * Commands creating/starting a new array:
6986 + mddev = kdev_to_mddev(dev);
6990 + case SET_ARRAY_INFO:
6993 + printk("array md%d already exists!\n",
7003 + case SET_ARRAY_INFO:
7004 + mddev = alloc_mddev(dev);
7010 + * alloc_mddev() should possibly self-lock.
7012 + err = lock_mddev(mddev);
7014 + printk("ioctl, reason %d, cmd %d\n", err, cmd);
7017 + err = set_array_info(mddev, (void *)arg);
7019 + printk("couldnt set array info. %d\n", err);
7026 + * possibly make it lock the array ...
7028 + err = autostart_array((kdev_t)arg);
7030 + printk("autostart %s failed!\n",
7031 + partition_name((kdev_t)arg));
7040 + * Commands querying/configuring an existing array:
7047 + err = lock_mddev(mddev);
7049 + printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
7054 + * Commands even a read-only array can execute:
7058 + case GET_ARRAY_INFO:
7059 + err = get_array_info(mddev, (void *)arg);
7062 + case GET_DISK_INFO:
7063 + err = get_disk_info(mddev, (void *)arg);
7066 + case RESTART_ARRAY_RW:
7067 + err = restart_array(mddev);
7071 + err = do_md_stop (mddev, 0);
7074 + case STOP_ARRAY_RO:
7075 + err = do_md_stop (mddev, 1);
7079 + * We have a problem here : there is no easy way to give a CHS
7080 + * virtual geometry. We currently pretend that we have a 2 heads
7081 + * 4 sectors (with a BIG number of cylinders...). This drives
7082 + * dosfs just mad... ;-)
7087 + goto abort_unlock;
7089 + err = md_put_user (2, (char *) &loc->heads);
7091 + goto abort_unlock;
7092 + err = md_put_user (4, (char *) &loc->sectors);
7094 + goto abort_unlock;
7095 + err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
7096 + (short *) &loc->cylinders);
7098 + goto abort_unlock;
7099 + err = md_put_user (md_hd_struct[minor].start_sect,
7100 + (long *) &loc->start);
7106 + * The remaining ioctls are changing the state of the
7107 + * superblock, so we do not allow read-only arrays
7112 + goto abort_unlock;
7118 + err = clear_array(mddev);
7121 + case ADD_NEW_DISK:
7122 + err = add_new_disk(mddev, (void *)arg);
7125 + case HOT_REMOVE_DISK:
7126 + err = hot_remove_disk(mddev, (kdev_t)arg);
7129 + case HOT_ADD_DISK:
7130 + err = hot_add_disk(mddev, (kdev_t)arg);
7133 + case SET_DISK_INFO:
7134 + err = set_disk_info(mddev, (void *)arg);
7137 + case WRITE_RAID_INFO:
7138 + err = write_raid_info(mddev);
7141 + case UNPROTECT_ARRAY:
7142 + err = unprotect_array(mddev);
7145 + case PROTECT_ARRAY:
7146 + err = protect_array(mddev);
7149 + case SET_DISK_FAULTY:
7150 + err = set_disk_faulty(mddev, (kdev_t)arg);
7155 + mdu_param_t param;
7157 + err = md_copy_from_user(¶m, (mdu_param_t *)arg,
7160 + goto abort_unlock;
7162 + err = do_md_run (mddev);
7164 + * we have to clean up the mess if
7165 + * the array cannot be run for some
7169 + mddev->sb_dirty = 0;
7170 + do_md_stop (mddev, 0);
7176 + printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current->comm, current->pid);
7178 + goto abort_unlock;
7184 + unlock_mddev(mddev);
7186 + printk("huh11?\n");
7191 + printk("huh12?\n");
7197 +#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
7199 static int md_open (struct inode *inode, struct file *file)
7201 - int minor=MINOR(inode->i_rdev);
7208 +static void md_release (struct inode *inode, struct file *file)
7210 + sync_dev(inode->i_rdev);
7214 +static int md_read (struct inode *inode, struct file *file,
7215 + char *buf, int count)
7217 + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7219 - md_dev[minor].busy++;
7220 - return (0); /* Always succeed */
7221 + if (!mddev || !mddev->pers)
7224 + return block_read (inode, file, buf, count);
7227 +static int md_write (struct inode *inode, struct file *file,
7228 + const char *buf, int count)
7230 + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7232 + if (!mddev || !mddev->pers)
7235 -static int md_release (struct inode *inode, struct file *file)
7236 + return block_write (inode, file, buf, count);
7239 +static struct file_operations md_fops=
7241 - int minor=MINOR(inode->i_rdev);
7256 - sync_dev (inode->i_rdev);
7257 - md_dev[minor].busy--;
7259 +static int md_open (struct inode *inode, struct file *file)
7267 +static int md_release (struct inode *inode, struct file *file)
7269 + sync_dev(inode->i_rdev);
7273 static ssize_t md_read (struct file *file, char *buf, size_t count,
7276 - int minor=MINOR(file->f_dentry->d_inode->i_rdev);
7277 + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7279 - if (!md_dev[minor].pers) /* Check if device is being run */
7281 + if (!mddev || !mddev->pers)
7284 - return block_read(file, buf, count, ppos);
7285 + return block_read(file, buf, count, ppos);
7288 static ssize_t md_write (struct file *file, const char *buf,
7289 size_t count, loff_t *ppos)
7291 - int minor=MINOR(file->f_dentry->d_inode->i_rdev);
7292 + mddev_t *mddev = kdev_to_mddev(MD_FILE_TO_INODE(file)->i_rdev);
7294 - if (!md_dev[minor].pers) /* Check if device is being run */
7296 + if (!mddev || !mddev->pers)
7299 - return block_write(file, buf, count, ppos);
7300 + return block_write(file, buf, count, ppos);
7303 static struct file_operations md_fops=
7329 -int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size)
7332 +int md_map (kdev_t dev, kdev_t *rdev,
7333 + unsigned long *rsector, unsigned long size)
7335 - if ((unsigned int) minor >= MAX_MD_DEV)
7337 - printk ("Bad md device %d\n", minor);
7341 - if (!md_dev[minor].pers)
7343 - printk ("Oops ! md%d not running, giving up !\n", minor);
7347 + mddev_t *mddev = kdev_to_mddev(dev);
7349 - return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size));
7350 + if (!mddev || !mddev->pers) {
7355 + err = mddev->pers->map(mddev, dev, rdev, rsector, size);
7360 -int md_make_request (int minor, int rw, struct buffer_head * bh)
7361 +int md_make_request (struct buffer_head * bh, int rw)
7363 - if (md_dev [minor].pers->make_request) {
7364 - if (buffer_locked(bh))
7367 + mddev_t *mddev = kdev_to_mddev(bh->b_dev);
7369 + if (!mddev || !mddev->pers) {
7374 + if (mddev->pers->make_request) {
7375 + if (buffer_locked(bh)) {
7379 set_bit(BH_Lock, &bh->b_state);
7380 if (rw == WRITE || rw == WRITEA) {
7381 if (!buffer_dirty(bh)) {
7382 - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
7384 + bh->b_end_io(bh, buffer_uptodate(bh));
7389 if (rw == READ || rw == READA) {
7390 if (buffer_uptodate(bh)) {
7391 - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
7393 + bh->b_end_io(bh, buffer_uptodate(bh));
7398 - return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh));
7399 + err = mddev->pers->make_request(mddev, rw, bh);
7401 make_request (MAJOR(bh->b_rdev), rw, bh);
7409 static void do_md_request (void)
7411 - printk ("Got md request, not good...");
7413 + printk(KERN_ALERT "Got md request, not good...");
7417 +int md_thread(void * arg)
7419 + mdk_thread_t *thread = arg;
7423 + exit_files(current);
7430 + sprintf(current->comm, thread->name);
7431 + md_init_signals();
7432 + md_flush_signals();
7433 + thread->tsk = current;
7436 + * md_thread is a 'system-thread', it's priority should be very
7437 + * high. We avoid resource deadlocks individually in each
7438 + * raid personality. (RAID5 does preallocation) We also use RR and
7439 + * the very same RT priority as kswapd, thus we will never get
7440 + * into a priority inversion deadlock.
7442 + * we definitely have to have equal or higher priority than
7443 + * bdflush, otherwise bdflush will deadlock if there are too
7444 + * many dirty RAID5 blocks.
7446 + current->policy = SCHED_OTHER;
7447 + current->priority = 40;
7453 + if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
7456 + interruptible_sleep_on(&thread->wqueue);
7459 + clear_bit(THREAD_WAKEUP, &thread->flags);
7460 + if (thread->run) {
7461 + thread->run(thread->data);
7462 + run_task_queue(&tq_disk);
7464 + if (md_signal_pending(current)) {
7465 + printk("%8s(%d) flushing signals.\n", current->comm,
7467 + md_flush_signals();
7475 -void md_wakeup_thread(struct md_thread *thread)
7476 +void md_wakeup_thread(mdk_thread_t *thread)
7478 set_bit(THREAD_WAKEUP, &thread->flags);
7479 wake_up(&thread->wqueue);
7482 -struct md_thread *md_register_thread (void (*run) (void *), void *data)
7483 +mdk_thread_t *md_register_thread (void (*run) (void *),
7484 + void *data, const char *name)
7486 - struct md_thread *thread = (struct md_thread *)
7487 - kmalloc(sizeof(struct md_thread), GFP_KERNEL);
7488 + mdk_thread_t *thread;
7490 struct semaphore sem = MUTEX_LOCKED;
7492 - if (!thread) return NULL;
7493 + thread = (mdk_thread_t *) kmalloc
7494 + (sizeof(mdk_thread_t), GFP_KERNEL);
7498 - memset(thread, 0, sizeof(struct md_thread));
7499 + memset(thread, 0, sizeof(mdk_thread_t));
7500 init_waitqueue(&thread->wqueue);
7504 thread->data = data;
7505 + thread->name = name;
7506 ret = kernel_thread(md_thread, thread, 0);
7509 @@ -836,270 +3032,407 @@
7513 -void md_unregister_thread (struct md_thread *thread)
7514 +void md_interrupt_thread (mdk_thread_t *thread)
7516 + if (!thread->tsk) {
7520 + printk("interrupting MD-thread pid %d\n", thread->tsk->pid);
7521 + send_sig(SIGKILL, thread->tsk, 1);
7524 +void md_unregister_thread (mdk_thread_t *thread)
7526 struct semaphore sem = MUTEX_LOCKED;
7531 - printk("Killing md_thread %d %p %s\n",
7532 - thread->tsk->pid, thread->tsk, thread->tsk->comm);
7534 - printk("Aiee. md_thread has 0 tsk\n");
7535 - send_sig(SIGKILL, thread->tsk, 1);
7536 - printk("downing on %p\n", &sem);
7537 + thread->name = NULL;
7538 + if (!thread->tsk) {
7542 + md_interrupt_thread(thread);
7546 -#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))
7548 -int md_thread(void * arg)
7549 +void md_recover_arrays (void)
7551 - struct md_thread *thread = arg;
7555 - exit_files(current);
7558 - current->session = 1;
7559 - current->pgrp = 1;
7560 - sprintf(current->comm, "md_thread");
7561 - siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
7562 - thread->tsk = current;
7567 - if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
7569 - spin_lock(&current->sigmask_lock);
7570 - flush_signals(current);
7571 - spin_unlock(&current->sigmask_lock);
7572 - interruptible_sleep_on(&thread->wqueue);
7574 - if (test_bit(THREAD_WAKEUP, &thread->flags))
7576 - if (!thread->run) {
7581 - } while (signal_pending(current));
7584 - clear_bit(THREAD_WAKEUP, &thread->flags);
7585 - if (thread->run) {
7586 - thread->run(thread->data);
7587 - run_task_queue(&tq_disk);
7589 + if (!md_recovery_thread) {
7593 + md_wakeup_thread(md_recovery_thread);
7596 -EXPORT_SYMBOL(md_size);
7597 -EXPORT_SYMBOL(md_maxreadahead);
7598 -EXPORT_SYMBOL(register_md_personality);
7599 -EXPORT_SYMBOL(unregister_md_personality);
7600 -EXPORT_SYMBOL(partition_name);
7601 -EXPORT_SYMBOL(md_dev);
7602 -EXPORT_SYMBOL(md_error);
7603 -EXPORT_SYMBOL(md_register_thread);
7604 -EXPORT_SYMBOL(md_unregister_thread);
7605 -EXPORT_SYMBOL(md_update_sb);
7606 -EXPORT_SYMBOL(md_map);
7607 -EXPORT_SYMBOL(md_wakeup_thread);
7608 -EXPORT_SYMBOL(md_do_sync);
7610 -#ifdef CONFIG_PROC_FS
7611 -static struct proc_dir_entry proc_md = {
7612 - PROC_MD, 6, "mdstat",
7613 - S_IFREG | S_IRUGO, 1, 0, 0,
7614 - 0, &proc_array_inode_operations,
7616 +int md_error (kdev_t dev, kdev_t rdev)
7618 + mddev_t *mddev = kdev_to_mddev(dev);
7619 + mdk_rdev_t * rrdev;
7626 + rrdev = find_rdev(mddev, rdev);
7627 + mark_rdev_faulty(rrdev);
7629 + * if recovery was running, stop it now.
7631 + if (mddev->pers->stop_resync)
7632 + mddev->pers->stop_resync(mddev);
7633 + if (mddev->recovery_running)
7634 + md_interrupt_thread(md_recovery_thread);
7635 + if (mddev->pers->error_handler) {
7636 + rc = mddev->pers->error_handler(mddev, rdev);
7637 + md_recover_arrays();
7642 + * Drop all buffers in the failed array.
7643 + * _not_. This is called from IRQ handlers ...
7645 + invalidate_buffers(rdev);
7650 -static void md_geninit (struct gendisk *gdisk)
7651 +static int status_unused (char * page)
7655 - for(i=0;i<MAX_MD_DEV;i++)
7657 - md_blocksizes[i] = 1024;
7658 - md_maxreadahead[i] = MD_DEFAULT_DISK_READAHEAD;
7659 - md_gendisk.part[i].start_sect=-1; /* avoid partition check */
7660 - md_gendisk.part[i].nr_sects=0;
7661 - md_dev[i].pers=NULL;
7663 + int sz = 0, i = 0;
7665 + struct md_list_head *tmp;
7667 - blksize_size[MD_MAJOR] = md_blocksizes;
7668 - max_readahead[MD_MAJOR] = md_maxreadahead;
7669 + sz += sprintf(page + sz, "unused devices: ");
7671 -#ifdef CONFIG_PROC_FS
7672 - proc_register(&proc_root, &proc_md);
7674 + ITERATE_RDEV_ALL(rdev,tmp) {
7675 + if (!rdev->same_set.next && !rdev->same_set.prev) {
7677 + * The device is not yet used by any array.
7680 + sz += sprintf(page + sz, "%s ",
7681 + partition_name(rdev->dev));
7685 + sz += sprintf(page + sz, "<none>");
7687 + sz += sprintf(page + sz, "\n");
7691 -int md_error (kdev_t mddev, kdev_t rdev)
7693 +static int status_resync (char * page, mddev_t * mddev)
7695 - unsigned int minor = MINOR (mddev);
7698 + unsigned int blocksize, max_blocks, resync, res, dt, tt, et;
7700 - if (MAJOR(mddev) != MD_MAJOR || minor > MAX_MD_DEV)
7701 - panic ("md_error gets unknown device\n");
7702 - if (!md_dev [minor].pers)
7703 - panic ("md_error gets an error for an unknown device\n");
7704 - if (md_dev [minor].pers->error_handler) {
7705 - rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev);
7706 -#if SUPPORT_RECONSTRUCTION
7707 - md_wakeup_thread(md_sync_thread);
7708 -#endif /* SUPPORT_RECONSTRUCTION */
7712 + resync = mddev->curr_resync;
7713 + blocksize = blksize_size[MD_MAJOR][mdidx(mddev)];
7714 + max_blocks = blk_size[MD_MAJOR][mdidx(mddev)] / (blocksize >> 10);
7717 + * Should not happen.
7719 + if (!max_blocks) {
7723 + res = resync*100/max_blocks;
7724 + if (!mddev->recovery_running)
7728 + sz += sprintf(page + sz, " resync=%u%%", res);
7733 + sz += sprintf(page + sz, " recovery=%u%%", res);
7736 + * We do not want to overflow, so the order of operands and
7737 + * the * 100 / 100 trick are important. We do a +1 to be
7738 + * safe against division by zero. We only estimate anyway.
7740 + * dt: time until now
7742 + * et: estimated finish time
7744 + dt = ((jiffies - mddev->resync_start) / HZ);
7745 + tt = (dt * (max_blocks / (resync/100+1)))/100;
7750 + * ignore rounding effects near finish time
7754 + sz += sprintf(page + sz, " finish=%u.%umin", et / 60, (et % 60)/6);
7759 int get_md_status (char *page)
7761 - int sz=0, i, j, size;
7763 - sz+=sprintf( page+sz, "Personalities : ");
7764 - for (i=0; i<MAX_PERSONALITY; i++)
7766 - sz+=sprintf (page+sz, "[%d %s] ", i, pers[i]->name);
7770 - sz+=sprintf (page+sz, "read_ahead ");
7771 - if (read_ahead[MD_MAJOR]==INT_MAX)
7772 - sz+=sprintf (page+sz, "not set\n");
7774 - sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
7775 + int sz = 0, j, size;
7776 + struct md_list_head *tmp, *tmp2;
7780 + sz += sprintf(page + sz, "Personalities : ");
7781 + for (j = 0; j < MAX_PERSONALITY; j++)
7783 + sz += sprintf(page+sz, "[%s] ", pers[j]->name);
7785 + sz += sprintf(page+sz, "\n");
7788 + sz += sprintf(page+sz, "read_ahead ");
7789 + if (read_ahead[MD_MAJOR] == INT_MAX)
7790 + sz += sprintf(page+sz, "not set\n");
7792 + sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
7794 - for (i=0; i<MAX_MD_DEV; i++)
7796 - sz+=sprintf (page+sz, "md%d : %sactive", i, md_dev[i].pers ? "" : "in");
7798 - if (md_dev[i].pers)
7799 - sz+=sprintf (page+sz, " %s", md_dev[i].pers->name);
7800 + ITERATE_MDDEV(mddev,tmp) {
7801 + sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
7802 + mddev->pers ? "" : "in");
7803 + if (mddev->pers) {
7805 + sz += sprintf(page + sz, " (read-only)");
7806 + sz += sprintf(page + sz, " %s", mddev->pers->name);
7810 - for (j=0; j<md_dev[i].nb_dev; j++)
7812 - sz+=sprintf (page+sz, " %s",
7813 - partition_name(md_dev[i].devices[j].dev));
7814 - size+=md_dev[i].devices[j].size;
7817 + ITERATE_RDEV(mddev,rdev,tmp2) {
7818 + sz += sprintf(page + sz, " %s[%d]",
7819 + partition_name(rdev->dev), rdev->desc_nr);
7820 + if (rdev->faulty) {
7821 + sz += sprintf(page + sz, "(F)");
7824 + size += rdev->size;
7827 - if (md_dev[i].nb_dev) {
7828 - if (md_dev[i].pers)
7829 - sz+=sprintf (page+sz, " %d blocks", md_size[i]);
7831 - sz+=sprintf (page+sz, " %d blocks", size);
7833 + if (mddev->nb_dev) {
7835 + sz += sprintf(page + sz, " %d blocks",
7836 + md_size[mdidx(mddev)]);
7838 + sz += sprintf(page + sz, " %d blocks", size);
7841 - if (!md_dev[i].pers)
7843 - sz+=sprintf (page+sz, "\n");
7846 + if (!mddev->pers) {
7847 + sz += sprintf(page+sz, "\n");
7851 - if (md_dev[i].pers->max_invalid_dev)
7852 - sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i));
7853 + sz += mddev->pers->status (page+sz, mddev);
7855 - sz+=md_dev[i].pers->status (page+sz, i, md_dev+i);
7856 - sz+=sprintf (page+sz, "\n");
7858 + if (mddev->curr_resync)
7859 + sz += status_resync (page+sz, mddev);
7861 + if (md_atomic_read(&mddev->resync_sem.count) != 1)
7862 + sz += sprintf(page + sz, " resync=DELAYED");
7864 + sz += sprintf(page + sz, "\n");
7866 + sz += status_unused (page + sz);
7872 -int register_md_personality (int p_num, struct md_personality *p)
7873 +int register_md_personality (int pnum, mdk_personality_t *p)
7875 - int i=(p_num >> PERSONALITY_SHIFT);
7877 - if (i >= MAX_PERSONALITY)
7879 + if (pnum >= MAX_PERSONALITY)
7888 - printk ("%s personality registered\n", p->name);
7891 + printk(KERN_INFO "%s personality registered\n", p->name);
7895 -int unregister_md_personality (int p_num)
7896 +int unregister_md_personality (int pnum)
7898 - int i=(p_num >> PERSONALITY_SHIFT);
7900 - if (i >= MAX_PERSONALITY)
7902 + if (pnum >= MAX_PERSONALITY)
7905 - printk ("%s personality unregistered\n", pers[i]->name);
7908 + printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
7909 + pers[pnum] = NULL;
7913 -static md_descriptor_t *get_spare(struct md_dev *mddev)
7914 +static mdp_disk_t *get_spare(mddev_t *mddev)
7917 - md_superblock_t *sb = mddev->sb;
7918 - md_descriptor_t *descriptor;
7919 - struct real_dev *realdev;
7921 - for (i = 0; i < mddev->nb_dev; i++) {
7922 - realdev = &mddev->devices[i];
7924 + mdp_super_t *sb = mddev->sb;
7927 + struct md_list_head *tmp;
7929 + ITERATE_RDEV(mddev,rdev,tmp) {
7935 - descriptor = &sb->disks[realdev->sb->descriptor.number];
7936 - if (descriptor->state & (1 << MD_FAULTY_DEVICE))
7938 + disk = &sb->disks[rdev->desc_nr];
7939 + if (disk_faulty(disk)) {
7942 - if (descriptor->state & (1 << MD_ACTIVE_DEVICE))
7944 + if (disk_active(disk))
7946 - return descriptor;
7952 +static int is_mddev_idle (mddev_t *mddev)
7954 + mdk_rdev_t * rdev;
7955 + struct md_list_head *tmp;
7957 + unsigned long curr_events;
7960 + ITERATE_RDEV(mddev,rdev,tmp) {
7961 + curr_events = io_events[MAJOR(rdev->dev)];
7963 + if (curr_events != rdev->last_events) {
7964 +// printk("!I(%d)", curr_events-rdev->last_events);
7965 + rdev->last_events = curr_events;
7973 * parallel resyncing thread.
7975 - * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
7976 - * - fix read error handing
7979 -int md_do_sync(struct md_dev *mddev)
7981 + * Determine correct block size for this device.
7983 +unsigned int device_bsize (kdev_t dev)
7985 + unsigned int i, correct_size;
7987 + correct_size = BLOCK_SIZE;
7988 + if (blksize_size[MAJOR(dev)]) {
7989 + i = blksize_size[MAJOR(dev)][MINOR(dev)];
7994 + return correct_size;
7997 +static struct wait_queue *resync_wait = (struct wait_queue *)NULL;
7999 +#define RA_ORDER (1)
8000 +#define RA_PAGE_SIZE (PAGE_SIZE*(1<<RA_ORDER))
8001 +#define MAX_NR_BLOCKS (RA_PAGE_SIZE/sizeof(struct buffer_head *))
8003 +int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
8005 - struct buffer_head *bh;
8006 - int max_blocks, blocksize, curr_bsize, percent=1, j;
8007 - kdev_t read_disk = MKDEV(MD_MAJOR, mddev - md_dev);
8009 + struct buffer_head **bh;
8010 + unsigned int max_blocks, blocksize, curr_bsize,
8011 + i, ii, j, k, chunk, window, nr_blocks, err, serialize;
8012 + kdev_t read_disk = mddev_to_kdev(mddev);
8013 int major = MAJOR(read_disk), minor = MINOR(read_disk);
8014 unsigned long starttime;
8015 + int max_read_errors = 2*MAX_NR_BLOCKS,
8016 + max_write_errors = 2*MAX_NR_BLOCKS;
8017 + struct md_list_head *tmp;
8020 + bh = (struct buffer_head **) md__get_free_pages(GFP_KERNEL, RA_ORDER);
8023 + "could not alloc bh array for reconstruction ... retrying!\n");
8027 + err = down_interruptible(&mddev->resync_sem);
8033 + ITERATE_MDDEV(mddev2,tmp) {
8034 + if (mddev2 == mddev)
8036 + if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
8037 + printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2));
8043 + interruptible_sleep_on(&resync_wait);
8044 + if (md_signal_pending(current)) {
8045 + md_flush_signals();
8052 + mddev->curr_resync = 1;
8054 - blocksize = blksize_size[major][minor];
8055 + blocksize = device_bsize(read_disk);
8056 max_blocks = blk_size[major][minor] / (blocksize >> 10);
8058 - printk("... resync log\n");
8059 - printk(" .... mddev->nb_dev: %d\n", mddev->nb_dev);
8060 - printk(" .... raid array: %s\n", kdevname(read_disk));
8061 - printk(" .... max_blocks: %d blocksize: %d\n", max_blocks, blocksize);
8062 - printk("md: syncing RAID array %s\n", kdevname(read_disk));
8063 + printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
8064 + printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec.\n",
8065 + sysctl_speed_limit);
8066 + printk(KERN_INFO "md: using maximum available idle IO bandwith for reconstruction.\n");
8069 + * Resync has low priority.
8071 + current->priority = 1;
8073 + is_mddev_idle(mddev); /* this also initializes IO event counters */
8074 + starttime = jiffies;
8075 + mddev->resync_start = starttime;
8079 + * Tune reconstruction:
8081 + window = md_maxreadahead[mdidx(mddev)]/1024;
8082 + nr_blocks = window / (blocksize >> 10);
8083 + if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
8084 + nr_blocks = MAX_NR_BLOCKS;
8085 + printk(KERN_INFO "md: using %dk window.\n",window);
8087 - starttime=jiffies;
8088 - for (j = 0; j < max_blocks; j++) {
8089 + for (j = 0; j < max_blocks; j += nr_blocks) {
8092 + mddev->curr_resync = j;
8094 * B careful. When some1 mounts a non-'blocksize' filesystem
8095 * then we get the blocksize changed right under us. Go deal
8096 * with it transparently, recalculate 'blocksize', 'j' and
8099 - curr_bsize = blksize_size[major][minor];
8100 + curr_bsize = device_bsize(read_disk);
8101 if (curr_bsize != blocksize) {
8103 + printk(KERN_INFO "md%d: blocksize changed\n",
8106 if (curr_bsize > blocksize)
8108 * this is safe, rounds downwards.
8109 @@ -1109,114 +3442,384 @@
8110 j *= blocksize/curr_bsize;
8112 blocksize = curr_bsize;
8113 + nr_blocks = window / (blocksize >> 10);
8114 + if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS))
8115 + nr_blocks = MAX_NR_BLOCKS;
8116 max_blocks = blk_size[major][minor] / (blocksize >> 10);
8118 - if ((bh = breada (read_disk, j, blocksize, j * blocksize,
8119 - max_blocks * blocksize)) != NULL) {
8120 - mark_buffer_dirty(bh, 1);
8123 + printk("nr_blocks changed to %d (blocksize %d, j %d, max_blocks %d)\n",
8124 + nr_blocks, blocksize, j, max_blocks);
8126 - * FIXME: Ugly, but set_blocksize() isnt safe ...
8127 + * We will retry the current block-group
8129 - curr_bsize = blksize_size[major][minor];
8130 - if (curr_bsize != blocksize)
8131 - goto diff_blocksize;
8135 - * It's a real read problem. FIXME, handle this
8138 - printk ( KERN_ALERT
8139 - "read error, stopping reconstruction.\n");
8143 + * Cleanup routines expect this
8145 + for (k = 0; k < nr_blocks; k++)
8148 + chunk = nr_blocks;
8149 + if (chunk > max_blocks-j)
8150 + chunk = max_blocks-j;
8153 + * request buffer heads ...
8155 + for (i = 0; i < chunk; i++) {
8156 + bh[i] = getblk (read_disk, j+i, blocksize);
8159 + if (!buffer_dirty(bh[i]))
8160 + mark_buffer_lowprio(bh[i]);
8164 - * Let's sleep some if we are faster than our speed limit:
8165 + * read buffer heads ...
8167 - while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT)
8169 - current->state = TASK_INTERRUPTIBLE;
8170 - schedule_timeout(1);
8171 + ll_rw_block (READ, chunk, bh);
8172 + run_task_queue(&tq_disk);
8175 + * verify that all of them are OK ...
8177 + for (i = 0; i < chunk; i++) {
8179 + wait_on_buffer(bh[ii]);
8180 + if (!buffer_uptodate(bh[ii]))
8185 + for (i = 0; i < chunk; i++)
8186 + mark_buffer_dirty_lowprio(bh[i]);
8188 + ll_rw_block(WRITE, chunk, bh);
8189 + run_task_queue(&tq_disk);
8191 + for (i = 0; i < chunk; i++) {
8193 + wait_on_buffer(bh[ii]);
8195 + if (spare && disk_faulty(spare)) {
8196 + for (k = 0; k < chunk; k++)
8198 + printk(" <SPARE FAILED!>\n ");
8203 + if (!buffer_uptodate(bh[ii])) {
8204 + curr_bsize = device_bsize(read_disk);
8205 + if (curr_bsize != blocksize) {
8207 + "md%d: blocksize changed during write\n",
8209 + for (k = 0; k < chunk; k++)
8211 + if (buffer_lowprio(bh[k]))
8212 + mark_buffer_clean(bh[k]);
8217 + printk(" BAD WRITE %8d>\n", j);
8219 + * Ouch, write error, retry or bail out.
8221 + if (max_write_errors) {
8222 + max_write_errors--;
8223 + printk ( KERN_WARNING "md%d: write error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
8226 + printk ( KERN_ALERT
8227 + "too many write errors, stopping reconstruction.\n");
8228 + for (k = 0; k < chunk; k++)
8230 + if (buffer_lowprio(bh[k]))
8231 + mark_buffer_clean(bh[k]);
8240 - * FIXME: put this status bar thing into /proc
8241 + * This is the normal 'everything went OK' case
8242 + * do a 'free-behind' logic, we sure dont need
8243 + * this buffer if it was the only user.
8245 - if (!(j%(max_blocks/100))) {
8246 - if (!(percent%10))
8247 - printk (" %03d%% done.\n",percent);
8248 + for (i = 0; i < chunk; i++)
8249 + if (buffer_dirty(bh[i]))
8257 + if (md_signal_pending(current)) {
8259 + * got a signal, exit.
8261 + mddev->curr_resync = 0;
8262 + printk("md_do_sync() got signal ... exiting\n");
8263 + md_flush_signals();
8269 + * this loop exits only if either when we are slower than
8270 + * the 'hard' speed limit, or the system was IO-idle for
8272 + * the system might be non-idle CPU-wise, but we only care
8273 + * about not overloading the IO subsystem. (things like an
8274 + * e2fsck being done on the RAID array should execute fast)
8277 + if (md_need_resched(current))
8280 + if ((blocksize/1024)*j/((jiffies-starttime)/HZ + 1) + 1
8281 + > sysctl_speed_limit) {
8282 + current->priority = 1;
8284 + if (!is_mddev_idle(mddev)) {
8285 + current->state = TASK_INTERRUPTIBLE;
8286 + md_schedule_timeout(HZ/2);
8287 + if (!md_signal_pending(current))
8291 + current->priority = 40;
8293 fsync_dev(read_disk);
8294 - printk("md: %s: sync done.\n", kdevname(read_disk));
8297 + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
8300 + * this also signals 'finished resyncing' to md_stop
8303 + up(&mddev->resync_sem);
8305 + free_pages((unsigned long)bh, RA_ORDER);
8306 + mddev->curr_resync = 0;
8307 + wake_up(&resync_wait);
8312 + * set_blocksize() might change the blocksize. This
8313 + * should not happen often, but it happens when eg.
8314 + * someone mounts a filesystem that has non-1k
8315 + * blocksize. set_blocksize() doesnt touch our
8316 + * buffer, but to avoid aliasing problems we change
8317 + * our internal blocksize too and retry the read.
8319 + curr_bsize = device_bsize(read_disk);
8320 + if (curr_bsize != blocksize) {
8321 + printk(KERN_INFO "md%d: blocksize changed during read\n",
8323 + for (k = 0; k < chunk; k++)
8325 + if (buffer_lowprio(bh[k]))
8326 + mark_buffer_clean(bh[k]);
8333 + * It's a real read problem. We retry and bail out
8334 + * only if it's excessive.
8336 + if (max_read_errors) {
8337 + max_read_errors--;
8338 + printk ( KERN_WARNING "md%d: read error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize);
8339 + for (k = 0; k < chunk; k++)
8341 + if (buffer_lowprio(bh[k]))
8342 + mark_buffer_clean(bh[k]);
8347 + printk ( KERN_ALERT "too many read errors, stopping reconstruction.\n");
8348 + for (k = 0; k < chunk; k++)
8350 + if (buffer_lowprio(bh[k]))
8351 + mark_buffer_clean(bh[k]);
8358 +#undef MAX_NR_BLOCKS
8361 - * This is a kernel thread which: syncs a spare disk with the active array
8362 + * This is a kernel thread which syncs a spare disk with the active array
8364 * the amount of foolproofing might seem to be a tad excessive, but an
8365 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
8366 * of my root partition with the first 0.5 gigs of my /home partition ... so
8367 * i'm a bit nervous ;)
8369 -void mdsyncd (void *data)
8370 +void md_do_recovery (void *data)
8373 - struct md_dev *mddev;
8374 - md_superblock_t *sb;
8375 - md_descriptor_t *spare;
8379 + mdp_disk_t *spare;
8380 unsigned long flags;
8381 + struct md_list_head *tmp;
8383 - for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) {
8384 - if ((sb = mddev->sb) == NULL)
8385 + printk(KERN_INFO "md: recovery thread got woken up ...\n");
8387 + ITERATE_MDDEV(mddev,tmp) {
8391 + if (mddev->recovery_running)
8393 if (sb->active_disks == sb->raid_disks)
8395 - if (!sb->spare_disks)
8396 + if (!sb->spare_disks) {
8397 + printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
8401 + * now here we get the spare and resync it.
8403 if ((spare = get_spare(mddev)) == NULL)
8405 - if (!mddev->pers->mark_spare)
8406 + printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
8407 + if (!mddev->pers->diskop)
8409 - if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE))
8410 + if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
8412 - if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) {
8413 - mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE);
8414 + down(&mddev->recovery_sem);
8415 + mddev->recovery_running = 1;
8416 + err = md_do_sync(mddev, spare);
8417 + if (err == -EIO) {
8418 + printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
8419 + if (!disk_faulty(spare)) {
8420 + mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
8421 + mark_disk_faulty(spare);
8422 + mark_disk_nonsync(spare);
8423 + mark_disk_inactive(spare);
8424 + sb->spare_disks--;
8425 + sb->working_disks--;
8426 + sb->failed_disks++;
8429 + if (disk_faulty(spare))
8430 + mddev->pers->diskop(mddev, &spare,
8431 + DISKOP_SPARE_INACTIVE);
8432 + if (err == -EINTR) {
8434 + * Recovery got interrupted ...
8435 + * signal back that we have finished using the array.
8437 + mddev->pers->diskop(mddev, &spare,
8438 + DISKOP_SPARE_INACTIVE);
8439 + up(&mddev->recovery_sem);
8440 + mddev->recovery_running = 0;
8443 + mddev->recovery_running = 0;
8444 + up(&mddev->recovery_sem);
8448 - mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE);
8449 - spare->state |= (1 << MD_SYNC_DEVICE);
8450 - spare->state |= (1 << MD_ACTIVE_DEVICE);
8451 - sb->spare_disks--;
8452 - sb->active_disks++;
8453 - mddev->sb_dirty = 1;
8454 - md_update_sb(mddev - md_dev);
8455 + if (!disk_faulty(spare)) {
8457 + * the SPARE_ACTIVE diskop possibly changes the
8460 + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
8461 + mark_disk_sync(spare);
8462 + mark_disk_active(spare);
8463 + sb->active_disks++;
8464 + sb->spare_disks--;
8466 restore_flags(flags);
8467 + mddev->sb_dirty = 1;
8468 + md_update_sb(mddev);
8471 + printk(KERN_INFO "md: recovery thread finished ...\n");
8475 +int md_notify_reboot(struct notifier_block *this,
8476 + unsigned long code, void *x)
8478 + struct md_list_head *tmp;
8481 + if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
8482 + || (code == MD_SYS_POWER_OFF)) {
8484 + printk(KERN_INFO "stopping all md devices.\n");
8486 + ITERATE_MDDEV(mddev,tmp)
8487 + do_md_stop (mddev, 1);
8489 + * certain more exotic SCSI devices are known to be
8490 + * volatile wrt too early system reboots. While the
8491 + * right place to handle this issue is the given
8492 + * driver, we do want to have a safe RAID driver ...
8494 + md_mdelay(1000*1);
8496 + return NOTIFY_DONE;
8499 +struct notifier_block md_notifier = {
8505 +md__initfunc(void raid_setup(char *str, int *ints))
8507 + char tmpline[100];
8508 + int len, pos, nr, i;
8510 + len = strlen(str) + 1;
8514 + for (i = 0; i < len; i++) {
8517 + if (c == ',' || !c) {
8519 + if (!strcmp(tmpline,"noautodetect"))
8520 + raid_setup_args.noautodetect = 1;
8528 + raid_setup_args.set = 1;
8532 #ifdef CONFIG_MD_BOOT
8537 -} md_setup_args __initdata = {
8538 +} md_setup_args md__initdata = {
8542 /* called from init/main.c */
8543 -__initfunc(void md_setup(char *str,int *ints))
8544 +md__initfunc(void md_setup(char *str,int *ints))
8547 for(i=0;i<=ints[0];i++) {
8548 @@ -1228,21 +3831,24 @@
8552 -__initfunc(void do_md_setup(char *str,int *ints))
8553 +md__initfunc(void do_md_setup(char *str,int *ints))
8555 - int minor, pers, factor, fault;
8557 + int minor, pers, chunk_size, fault;
8561 + printk("i plan to phase this out --mingo\n");
8564 - printk ("md: Too few Arguments (%d).\n", ints[0]);
8565 + printk (KERN_WARNING "md: Too few Arguments (%d).\n", ints[0]);
8571 - if (minor >= MAX_MD_DEV) {
8572 - printk ("md: Minor device number too high.\n");
8573 + if ((unsigned int)minor >= MAX_MD_DEVS) {
8574 + printk (KERN_WARNING "md: Minor device number too high.\n");
8578 @@ -1252,18 +3858,20 @@
8580 #ifdef CONFIG_MD_LINEAR
8582 - printk ("md: Setting up md%d as linear device.\n",minor);
8583 + printk (KERN_INFO "md: Setting up md%d as linear device.\n",
8586 - printk ("md: Linear mode not configured."
8587 + printk (KERN_WARNING "md: Linear mode not configured."
8588 "Recompile the kernel with linear mode enabled!\n");
8593 #ifdef CONFIG_MD_STRIPED
8594 - printk ("md: Setting up md%d as a striped device.\n",minor);
8595 + printk (KERN_INFO "md: Setting up md%d as a striped device.\n",
8598 - printk ("md: Striped mode not configured."
8599 + printk (KERN_WARNING "md: Striped mode not configured."
8600 "Recompile the kernel with striped mode enabled!\n");
8603 @@ -1278,79 +3886,145 @@
8607 - printk ("md: Unknown or not supported raid level %d.\n", ints[--i]);
8608 + printk (KERN_WARNING "md: Unknown or not supported raid level %d.\n", ints[--i]);
8615 - factor=ints[i++]; /* Chunksize */
8616 - fault =ints[i++]; /* Faultlevel */
8617 + chunk_size = ints[i++]; /* Chunksize */
8618 + fault = ints[i++]; /* Faultlevel */
8620 - pers=pers | factor | (fault << FAULT_SHIFT);
8621 + pers = pers | chunk_size | (fault << FAULT_SHIFT);
8623 - while( str && (dev = name_to_kdev_t(str))) {
8624 - do_md_add (minor, dev);
8625 - if((str = strchr (str, ',')) != NULL)
8628 + while( str && (dev = name_to_kdev_t(str))) {
8629 + do_md_add (minor, dev);
8630 + if((str = strchr (str, ',')) != NULL)
8634 - do_md_run (minor, pers);
8635 - printk ("md: Loading md%d.\n",minor);
8636 + do_md_run (minor, pers);
8637 + printk (KERN_INFO "md: Loading md%d.\n",minor);
8644 +void hsm_init (void);
8645 +void translucent_init (void);
8646 void linear_init (void);
8647 void raid0_init (void);
8648 void raid1_init (void);
8649 void raid5_init (void);
8651 -__initfunc(int md_init (void))
8652 +md__initfunc(int md_init (void))
8654 - printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
8655 - MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION,
8656 - MAX_MD_DEV, MAX_REAL);
8658 - if (register_blkdev (MD_MAJOR, "md", &md_fops))
8660 - printk ("Unable to get major %d for md\n", MD_MAJOR);
8664 - blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST;
8665 - blk_dev[MD_MAJOR].current_request=NULL;
8666 - read_ahead[MD_MAJOR]=INT_MAX;
8667 - memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev));
8668 - md_gendisk.next=gendisk_head;
8670 - gendisk_head=&md_gendisk;
8672 -#if SUPPORT_RECONSTRUCTION
8673 - if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL)
8674 - printk("md: bug: md_sync_thread == NULL\n");
8675 -#endif /* SUPPORT_RECONSTRUCTION */
8676 + static char * name = "mdrecoveryd";
8678 + printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n",
8679 + MD_MAJOR_VERSION, MD_MINOR_VERSION,
8680 + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL);
8682 + if (register_blkdev (MD_MAJOR, "md", &md_fops))
8684 + printk (KERN_ALERT "Unable to get major %d for md\n", MD_MAJOR);
8688 + blk_dev[MD_MAJOR].request_fn = DEVICE_REQUEST;
8689 + blk_dev[MD_MAJOR].current_request = NULL;
8690 + read_ahead[MD_MAJOR] = INT_MAX;
8691 + md_gendisk.next = gendisk_head;
8693 + gendisk_head = &md_gendisk;
8695 + md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
8696 + if (!md_recovery_thread)
8697 + printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n");
8699 + md_register_reboot_notifier(&md_notifier);
8700 + md_register_sysctl();
8702 +#ifdef CONFIG_MD_HSM
8705 +#ifdef CONFIG_MD_TRANSLUCENT
8706 + translucent_init ();
8708 #ifdef CONFIG_MD_LINEAR
8712 #ifdef CONFIG_MD_STRIPED
8716 #ifdef CONFIG_MD_MIRRORING
8720 #ifdef CONFIG_MD_RAID5
8724 +#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE)
8726 + * pick a XOR routine, runtime.
8728 + calibrate_xor_block();
8735 #ifdef CONFIG_MD_BOOT
8736 -__initfunc(void md_setup_drive(void))
8737 +md__initfunc(void md_setup_drive(void))
8739 if(md_setup_args.set)
8740 do_md_setup(md_setup_args.str, md_setup_args.ints);
8744 +MD_EXPORT_SYMBOL(md_size);
8745 +MD_EXPORT_SYMBOL(register_md_personality);
8746 +MD_EXPORT_SYMBOL(unregister_md_personality);
8747 +MD_EXPORT_SYMBOL(partition_name);
8748 +MD_EXPORT_SYMBOL(md_error);
8749 +MD_EXPORT_SYMBOL(md_recover_arrays);
8750 +MD_EXPORT_SYMBOL(md_register_thread);
8751 +MD_EXPORT_SYMBOL(md_unregister_thread);
8752 +MD_EXPORT_SYMBOL(md_update_sb);
8753 +MD_EXPORT_SYMBOL(md_map);
8754 +MD_EXPORT_SYMBOL(md_wakeup_thread);
8755 +MD_EXPORT_SYMBOL(md_do_sync);
8756 +MD_EXPORT_SYMBOL(md_print_devices);
8757 +MD_EXPORT_SYMBOL(find_rdev_nr);
8758 +MD_EXPORT_SYMBOL(md_check_ordering);
8759 +MD_EXPORT_SYMBOL(md_interrupt_thread);
8760 +MD_EXPORT_SYMBOL(mddev_map);
8762 +#ifdef CONFIG_PROC_FS
8763 +static struct proc_dir_entry proc_md = {
8764 + PROC_MD, 6, "mdstat",
8765 + S_IFREG | S_IRUGO, 1, 0, 0,
8766 + 0, &proc_array_inode_operations,
8770 +static void md_geninit (struct gendisk *gdisk)
8774 + for(i = 0; i < MAX_MD_DEVS; i++) {
8775 + md_blocksizes[i] = 1024;
8776 + md_maxreadahead[i] = MD_READAHEAD;
8777 + md_gendisk.part[i].start_sect = -1; /* avoid partition check */
8778 + md_gendisk.part[i].nr_sects = 0;
8781 + printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
8783 + blksize_size[MD_MAJOR] = md_blocksizes;
8784 + md_set_global_readahead(md_maxreadahead);
8786 +#ifdef CONFIG_PROC_FS
8787 + proc_register(&proc_root, &proc_md);
8791 --- linux/drivers/block/raid0.c.orig Mon Sep 4 19:39:16 2000
8792 +++ linux/drivers/block/raid0.c Fri Nov 23 11:18:18 2001
8796 raid0.c : Multiple Devices driver for Linux
8797 Copyright (C) 1994-96 Marc ZYNGIER
8798 @@ -18,146 +17,201 @@
8801 #include <linux/module.h>
8802 -#include <linux/md.h>
8803 -#include <linux/raid0.h>
8804 -#include <linux/vmalloc.h>
8805 +#include <linux/raid/raid0.h>
8807 #define MAJOR_NR MD_MAJOR
8809 #define MD_PERSONALITY
8811 -static int create_strip_zones (int minor, struct md_dev *mddev)
8812 +static int create_strip_zones (mddev_t *mddev)
8815 - int current_offset=0;
8816 - struct real_dev *smallest_by_zone;
8817 - struct raid0_data *data=(struct raid0_data *) mddev->private;
8819 - data->nr_strip_zones=1;
8821 - for (i=1; i<mddev->nb_dev; i++)
8823 - for (j=0; j<i; j++)
8824 - if (mddev->devices[i].size==mddev->devices[j].size)
8831 - data->nr_strip_zones++;
8836 - if ((data->strip_zone=vmalloc(sizeof(struct strip_zone)*data->nr_strip_zones)) == NULL)
8839 - data->smallest=NULL;
8841 - for (i=0; i<data->nr_strip_zones; i++)
8843 - data->strip_zone[i].dev_offset=current_offset;
8844 - smallest_by_zone=NULL;
8847 - for (j=0; j<mddev->nb_dev; j++)
8848 - if (mddev->devices[j].size>current_offset)
8850 - data->strip_zone[i].dev[c++]=mddev->devices+j;
8851 - if (!smallest_by_zone ||
8852 - smallest_by_zone->size > mddev->devices[j].size)
8853 - smallest_by_zone=mddev->devices+j;
8856 - data->strip_zone[i].nb_dev=c;
8857 - data->strip_zone[i].size=(smallest_by_zone->size-current_offset)*c;
8859 - if (!data->smallest ||
8860 - data->smallest->size > data->strip_zone[i].size)
8861 - data->smallest=data->strip_zone+i;
8863 - data->strip_zone[i].zone_offset=i ? (data->strip_zone[i-1].zone_offset+
8864 - data->strip_zone[i-1].size) : 0;
8865 - current_offset=smallest_by_zone->size;
8868 + int i, c, j, j1, j2;
8869 + int current_offset, curr_zone_offset;
8870 + raid0_conf_t *conf = mddev_to_conf(mddev);
8871 + mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
8874 + * The number of 'same size groups'
8876 + conf->nr_strip_zones = 0;
8878 + ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
8879 + printk("raid0: looking at %s\n", partition_name(rdev1->dev));
8881 + ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
8882 + printk("raid0: comparing %s(%d) with %s(%d)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size);
8883 + if (rdev2 == rdev1) {
8884 + printk("raid0: END\n");
8887 + if (rdev2->size == rdev1->size)
8890 + * Not unique, don't count it as a new
8893 + printk("raid0: EQUAL\n");
8897 + printk("raid0: NOT EQUAL\n");
8900 + printk("raid0: ==> UNIQUE\n");
8901 + conf->nr_strip_zones++;
8902 + printk("raid0: %d zones\n", conf->nr_strip_zones);
8905 + printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
8907 + conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
8908 + conf->nr_strip_zones);
8909 + if (!conf->strip_zone)
8913 + conf->smallest = NULL;
8914 + current_offset = 0;
8915 + curr_zone_offset = 0;
8917 + for (i = 0; i < conf->nr_strip_zones; i++)
8919 + struct strip_zone *zone = conf->strip_zone + i;
8921 + printk("zone %d\n", i);
8922 + zone->dev_offset = current_offset;
8926 + ITERATE_RDEV_ORDERED(mddev,rdev,j) {
8928 + printk(" checking %s ...", partition_name(rdev->dev));
8929 + if (rdev->size > current_offset)
8931 + printk(" contained as device %d\n", c);
8932 + zone->dev[c] = rdev;
8934 + if (!smallest || (rdev->size <smallest->size)) {
8936 + printk(" (%d) is smallest!.\n", rdev->size);
8939 + printk(" nope.\n");
8943 + zone->size = (smallest->size - current_offset) * c;
8944 + printk(" zone->nb_dev: %d, size: %d\n",zone->nb_dev,zone->size);
8946 + if (!conf->smallest || (zone->size < conf->smallest->size))
8947 + conf->smallest = zone;
8949 + zone->zone_offset = curr_zone_offset;
8950 + curr_zone_offset += zone->size;
8952 + current_offset = smallest->size;
8953 + printk("current zone offset: %d\n", current_offset);
8955 + printk("done.\n");
8959 -static int raid0_run (int minor, struct md_dev *mddev)
8960 +static int raid0_run (mddev_t *mddev)
8962 - int cur=0, i=0, size, zone0_size, nb_zone;
8963 - struct raid0_data *data;
8965 - MOD_INC_USE_COUNT;
8966 + int cur=0, i=0, size, zone0_size, nb_zone;
8967 + raid0_conf_t *conf;
8969 - if ((mddev->private=vmalloc (sizeof (struct raid0_data))) == NULL) return 1;
8970 - data=(struct raid0_data *) mddev->private;
8972 - if (create_strip_zones (minor, mddev))
8978 - nb_zone=data->nr_zones=
8979 - md_size[minor]/data->smallest->size +
8980 - (md_size[minor]%data->smallest->size ? 1 : 0);
8982 - printk ("raid0 : Allocating %ld bytes for hash.\n",(long)sizeof(struct raid0_hash)*nb_zone);
8983 - if ((data->hash_table=vmalloc (sizeof (struct raid0_hash)*nb_zone)) == NULL)
8985 - vfree(data->strip_zone);
8989 - size=data->strip_zone[cur].size;
8992 - while (cur<data->nr_strip_zones)
8994 - data->hash_table[i].zone0=data->strip_zone+cur;
8996 - if (size>=data->smallest->size)/* If we completely fill the slot */
8998 - data->hash_table[i++].zone1=NULL;
8999 - size-=data->smallest->size;
9003 - if (++cur==data->nr_strip_zones) continue;
9004 - size=data->strip_zone[cur].size;
9010 - if (++cur==data->nr_strip_zones) /* Last dev, set unit1 as NULL */
9012 - data->hash_table[i].zone1=NULL;
9016 - zone0_size=size; /* Here, we use a 2nd dev to fill the slot */
9017 - size=data->strip_zone[cur].size;
9018 - data->hash_table[i++].zone1=data->strip_zone+cur;
9019 - size-=(data->smallest->size - zone0_size);
9021 + MOD_INC_USE_COUNT;
9024 + conf = vmalloc(sizeof (raid0_conf_t));
9027 + mddev->private = (void *)conf;
9029 + if (md_check_ordering(mddev)) {
9030 + printk("raid0: disks are not ordered, aborting!\n");
9031 + goto out_free_conf;
9034 + if (create_strip_zones (mddev))
9035 + goto out_free_conf;
9037 + printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]);
9038 + printk("raid0 : conf->smallest->size is %d blocks.\n", conf->smallest->size);
9039 + nb_zone = md_size[mdidx(mddev)]/conf->smallest->size +
9040 + (md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
9041 + printk("raid0 : nb_zone is %d.\n", nb_zone);
9042 + conf->nr_zones = nb_zone;
9044 + printk("raid0 : Allocating %d bytes for hash.\n",
9045 + sizeof(struct raid0_hash)*nb_zone);
9047 + conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
9048 + if (!conf->hash_table)
9049 + goto out_free_zone_conf;
9050 + size = conf->strip_zone[cur].size;
9053 + while (cur < conf->nr_strip_zones) {
9054 + conf->hash_table[i].zone0 = conf->strip_zone + cur;
9057 + * If we completely fill the slot
9059 + if (size >= conf->smallest->size) {
9060 + conf->hash_table[i++].zone1 = NULL;
9061 + size -= conf->smallest->size;
9064 + if (++cur == conf->nr_strip_zones)
9066 + size = conf->strip_zone[cur].size;
9070 + if (++cur == conf->nr_strip_zones) {
9072 + * Last dev, set unit1 as NULL
9074 + conf->hash_table[i].zone1=NULL;
9079 + * Here we use a 2nd dev to fill the slot
9081 + zone0_size = size;
9082 + size = conf->strip_zone[cur].size;
9083 + conf->hash_table[i++].zone1 = conf->strip_zone + cur;
9084 + size -= (conf->smallest->size - zone0_size);
9088 +out_free_zone_conf:
9089 + vfree(conf->strip_zone);
9090 + conf->strip_zone = NULL;
9094 + mddev->private = NULL;
9096 + MOD_DEC_USE_COUNT;
9101 -static int raid0_stop (int minor, struct md_dev *mddev)
9102 +static int raid0_stop (mddev_t *mddev)
9104 - struct raid0_data *data=(struct raid0_data *) mddev->private;
9105 + raid0_conf_t *conf = mddev_to_conf(mddev);
9107 - vfree (data->hash_table);
9108 - vfree (data->strip_zone);
9110 + vfree (conf->hash_table);
9111 + conf->hash_table = NULL;
9112 + vfree (conf->strip_zone);
9113 + conf->strip_zone = NULL;
9115 + mddev->private = NULL;
9117 - MOD_DEC_USE_COUNT;
9119 + MOD_DEC_USE_COUNT;
9124 @@ -167,135 +221,140 @@
9125 * Of course, those facts may not be valid anymore (and surely won't...)
9126 * Hey guys, there's some work out there ;-)
9128 -static int raid0_map (struct md_dev *mddev, kdev_t *rdev,
9129 +static int raid0_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
9130 unsigned long *rsector, unsigned long size)
9132 - struct raid0_data *data=(struct raid0_data *) mddev->private;
9133 - static struct raid0_hash *hash;
9134 - struct strip_zone *zone;
9135 - struct real_dev *tmp_dev;
9136 - int blk_in_chunk, factor, chunk, chunk_size;
9137 - long block, rblock;
9139 - factor=FACTOR(mddev);
9140 - chunk_size=(1UL << FACTOR_SHIFT(factor));
9141 - block=*rsector >> 1;
9142 - hash=data->hash_table+(block/data->smallest->size);
9144 - if (hash - data->hash_table > data->nr_zones)
9146 - printk(KERN_DEBUG "raid0_map: invalid block %li\n", block);
9150 - /* Sanity check */
9151 - if ((chunk_size*2)<(*rsector % (chunk_size*2))+size)
9153 - printk ("raid0_convert : can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
9157 - if (block >= (hash->zone0->size +
9158 - hash->zone0->zone_offset))
9162 - printk ("raid0_convert : hash->zone1==NULL for block %ld\n", block);
9170 + raid0_conf_t *conf = mddev_to_conf(mddev);
9171 + struct raid0_hash *hash;
9172 + struct strip_zone *zone;
9173 + mdk_rdev_t *tmp_dev;
9174 + int blk_in_chunk, chunksize_bits, chunk, chunk_size;
9175 + long block, rblock;
9177 + chunk_size = mddev->param.chunk_size >> 10;
9178 + chunksize_bits = ffz(~chunk_size);
9179 + block = *rsector >> 1;
9180 + hash = conf->hash_table + block / conf->smallest->size;
9182 + if (hash - conf->hash_table > conf->nr_zones) {
9183 + printk(KERN_DEBUG "raid0_map: invalid block %lu\n", block);
9187 + /* Sanity check */
9188 + if ((chunk_size * 2) < (*rsector % (chunk_size * 2)) + size)
9197 + if (block >= (hash->zone0->size + hash->zone0->zone_offset)) {
9200 + zone = hash->zone1;
9202 + zone = hash->zone0;
9204 - blk_in_chunk=block & (chunk_size -1);
9205 - chunk=(block - zone->zone_offset) / (zone->nb_dev<<FACTOR_SHIFT(factor));
9206 - tmp_dev=zone->dev[(block >> FACTOR_SHIFT(factor)) % zone->nb_dev];
9207 - rblock=(chunk << FACTOR_SHIFT(factor)) + blk_in_chunk + zone->dev_offset;
9208 + blk_in_chunk = block & (chunk_size -1);
9209 + chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
9210 + tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev];
9211 + rblock = (chunk << chunksize_bits) + blk_in_chunk + zone->dev_offset;
9213 - *rdev=tmp_dev->dev;
9214 - *rsector=rblock<<1;
9215 + *rdev = tmp_dev->dev;
9216 + *rsector = rblock << 1;
9222 + printk ("raid0_map bug: can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size);
9225 + printk("raid0_map bug: hash==NULL for block %ld\n", block);
9228 + printk ("raid0_map bug: hash->zone0==NULL for block %ld\n", block);
9231 + printk ("raid0_map bug: hash->zone1==NULL for block %ld\n", block);
9236 -static int raid0_status (char *page, int minor, struct md_dev *mddev)
9237 +static int raid0_status (char *page, mddev_t *mddev)
9244 - struct raid0_data *data=(struct raid0_data *) mddev->private;
9246 + raid0_conf_t *conf = mddev_to_conf(mddev);
9248 - sz+=sprintf (page+sz, " ");
9249 - for (j=0; j<data->nr_zones; j++)
9251 - sz+=sprintf (page+sz, "[z%d",
9252 - data->hash_table[j].zone0-data->strip_zone);
9253 - if (data->hash_table[j].zone1)
9254 - sz+=sprintf (page+sz, "/z%d] ",
9255 - data->hash_table[j].zone1-data->strip_zone);
9257 - sz+=sprintf (page+sz, "] ");
9259 + sz += sprintf(page + sz, " ");
9260 + for (j = 0; j < conf->nr_zones; j++) {
9261 + sz += sprintf(page + sz, "[z%d",
9262 + conf->hash_table[j].zone0 - conf->strip_zone);
9263 + if (conf->hash_table[j].zone1)
9264 + sz += sprintf(page+sz, "/z%d] ",
9265 + conf->hash_table[j].zone1 - conf->strip_zone);
9267 + sz += sprintf(page+sz, "] ");
9270 - sz+=sprintf (page+sz, "\n");
9271 + sz += sprintf(page + sz, "\n");
9273 - for (j=0; j<data->nr_strip_zones; j++)
9275 - sz+=sprintf (page+sz, " z%d=[", j);
9276 - for (k=0; k<data->strip_zone[j].nb_dev; k++)
9277 - sz+=sprintf (page+sz, "%s/",
9278 - partition_name(data->strip_zone[j].dev[k]->dev));
9280 - sz+=sprintf (page+sz, "] zo=%d do=%d s=%d\n",
9281 - data->strip_zone[j].zone_offset,
9282 - data->strip_zone[j].dev_offset,
9283 - data->strip_zone[j].size);
9285 + for (j = 0; j < conf->nr_strip_zones; j++) {
9286 + sz += sprintf(page + sz, " z%d=[", j);
9287 + for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
9288 + sz += sprintf (page+sz, "%s/", partition_name(
9289 + conf->strip_zone[j].dev[k]->dev));
9291 + sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
9292 + conf->strip_zone[j].zone_offset,
9293 + conf->strip_zone[j].dev_offset,
9294 + conf->strip_zone[j].size);
9297 - sz+=sprintf (page+sz, " %dk chunks", 1<<FACTOR_SHIFT(FACTOR(mddev)));
9299 + sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
9304 -static struct md_personality raid0_personality=
9305 +static mdk_personality_t raid0_personality=
9309 - NULL, /* no special make_request */
9310 - NULL, /* no special end_request */
9314 - NULL, /* no ioctls */
9316 - NULL, /* no error_handler */
9317 - NULL, /* hot_add_disk */
9318 - NULL, /* hot_remove_disk */
9319 - NULL /* mark_spare */
9322 + NULL, /* no special make_request */
9323 + NULL, /* no special end_request */
9327 + NULL, /* no ioctls */
9329 + NULL, /* no error_handler */
9330 + NULL, /* no diskop */
9331 + NULL, /* no stop resync */
9332 + NULL /* no restart resync */
9338 void raid0_init (void)
9340 - register_md_personality (RAID0, &raid0_personality);
9341 + register_md_personality (RAID0, &raid0_personality);
9346 int init_module (void)
9348 - return (register_md_personality (RAID0, &raid0_personality));
9349 + return (register_md_personality (RAID0, &raid0_personality));
9352 void cleanup_module (void)
9354 - unregister_md_personality (RAID0);
9355 + unregister_md_personality (RAID0);
9360 --- linux/drivers/block/raid1.c.orig Mon Dec 11 01:49:41 2000
9361 +++ linux/drivers/block/raid1.c Fri Nov 23 11:18:18 2001
9363 -/************************************************************************
9365 * raid1.c : Multiple Devices driver for Linux
9366 - * Copyright (C) 1996 Ingo Molnar, Miguel de Icaza, Gadi Oxman
9367 + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
9369 * RAID-1 management functions.
9374 #include <linux/module.h>
9375 -#include <linux/locks.h>
9376 #include <linux/malloc.h>
9377 -#include <linux/md.h>
9378 -#include <linux/raid1.h>
9379 -#include <asm/bitops.h>
9380 +#include <linux/raid/raid1.h>
9381 #include <asm/atomic.h>
9383 #define MAJOR_NR MD_MAJOR
9385 #define MD_PERSONALITY
9388 - * The following can be used to debug the driver
9390 -/*#define RAID1_DEBUG*/
9392 -#define PRINTK(x) do { printk x; } while (0);
9394 -#define PRINTK(x) do { ; } while (0);
9396 +#define MAX_LINEAR_SECTORS 128
9398 #define MAX(a,b) ((a) > (b) ? (a) : (b))
9399 #define MIN(a,b) ((a) < (b) ? (a) : (b))
9401 -static struct md_personality raid1_personality;
9402 -static struct md_thread *raid1_thread = NULL;
9403 +static mdk_personality_t raid1_personality;
9404 struct buffer_head *raid1_retry_list = NULL;
9406 -static int __raid1_map (struct md_dev *mddev, kdev_t *rdev,
9407 +static void * raid1_kmalloc (int size)
9411 + * now we are rather fault tolerant than nice, but
9412 + * there are a couple of places in the RAID code where we
9413 + * simply can not afford to fail an allocation because
9414 + * there is no failure return path (eg. make_request())
9416 + while (!(ptr = kmalloc (sizeof (raid1_conf_t), GFP_BUFFER))) {
9417 + printk ("raid1: out of memory, retrying...\n");
9418 + current->state = TASK_UNINTERRUPTIBLE;
9419 + schedule_timeout(HZ/10);
9422 + memset(ptr, 0, size);
9426 +static int __raid1_map (mddev_t *mddev, kdev_t *rdev,
9427 unsigned long *rsector, unsigned long size)
9429 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9430 - int i, n = raid_conf->raid_disks;
9431 + raid1_conf_t *conf = mddev_to_conf(mddev);
9432 + int i, disks = MD_SB_DISKS;
9435 * Later we do read balancing on the read side
9436 * now we use the first available disk.
9439 - PRINTK(("raid1_map().\n"));
9441 - for (i=0; i<n; i++) {
9442 - if (raid_conf->mirrors[i].operational) {
9443 - *rdev = raid_conf->mirrors[i].dev;
9444 + for (i = 0; i < disks; i++) {
9445 + if (conf->mirrors[i].operational) {
9446 + *rdev = conf->mirrors[i].dev;
9454 -static int raid1_map (struct md_dev *mddev, kdev_t *rdev,
9455 +static int raid1_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
9456 unsigned long *rsector, unsigned long size)
9461 -void raid1_reschedule_retry (struct buffer_head *bh)
9462 +static void raid1_reschedule_retry (struct buffer_head *bh)
9464 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
9466 - PRINTK(("raid1_reschedule_retry().\n"));
9467 + mddev_t *mddev = r1_bh->mddev;
9468 + raid1_conf_t *conf = mddev_to_conf(mddev);
9470 r1_bh->next_retry = raid1_retry_list;
9471 raid1_retry_list = bh;
9472 - md_wakeup_thread(raid1_thread);
9473 + md_wakeup_thread(conf->thread);
9477 - * raid1_end_buffer_io() is called when we have finished servicing a mirrored
9478 + * raid1_end_bh_io() is called when we have finished servicing a mirrored
9479 * operation and are ready to return a success/failure code to the buffer
9482 -static inline void raid1_end_buffer_io(struct raid1_bh *r1_bh, int uptodate)
9483 +static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
9485 struct buffer_head *bh = r1_bh->master_bh;
9491 -int raid1_one_error=0;
9493 void raid1_end_request (struct buffer_head *bh, int uptodate)
9495 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
9496 @@ -106,12 +109,7 @@
9500 - PRINTK(("raid1_end_request().\n"));
9502 - if (raid1_one_error) {
9503 - raid1_one_error=0;
9507 * this branch is our 'one mirror IO has finished' event handler:
9509 @@ -136,15 +134,11 @@
9512 if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
9514 - PRINTK(("raid1_end_request(), read branch.\n"));
9517 * we have only one buffer_head on the read side
9520 - PRINTK(("raid1_end_request(), read branch, uptodate.\n"));
9521 - raid1_end_buffer_io(r1_bh, uptodate);
9522 + raid1_end_bh_io(r1_bh, uptodate);
9523 restore_flags(flags);
9526 @@ -152,71 +146,56 @@
9529 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
9530 - kdevname(bh->b_dev), bh->b_blocknr);
9531 - raid1_reschedule_retry (bh);
9532 + partition_name(bh->b_dev), bh->b_blocknr);
9533 + raid1_reschedule_retry(bh);
9534 restore_flags(flags);
9539 - * WRITE or WRITEA.
9541 - PRINTK(("raid1_end_request(), write branch.\n"));
9546 * Let's see if all mirrored write operations have finished
9547 - * already [we have irqs off, so we can decrease]:
9551 - if (!--r1_bh->remaining) {
9552 - struct md_dev *mddev = r1_bh->mddev;
9553 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9554 - int i, n = raid_conf->raid_disks;
9556 - PRINTK(("raid1_end_request(), remaining == 0.\n"));
9557 + if (atomic_dec_and_test(&r1_bh->remaining)) {
9558 + int i, disks = MD_SB_DISKS;
9560 - for ( i=0; i<n; i++)
9561 - if (r1_bh->mirror_bh[i]) kfree(r1_bh->mirror_bh[i]);
9562 + for ( i = 0; i < disks; i++)
9563 + if (r1_bh->mirror_bh[i])
9564 + kfree(r1_bh->mirror_bh[i]);
9566 - raid1_end_buffer_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
9567 + raid1_end_bh_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state));
9569 - else PRINTK(("raid1_end_request(), remaining == %u.\n", r1_bh->remaining));
9570 restore_flags(flags);
9573 -/* This routine checks if the undelying device is an md device and in that
9574 - * case it maps the blocks before putting the request on the queue
9576 + * This routine checks if the underlying device
9577 + * and in that case it maps the blocks before putting the
9578 + * request on the queue
9581 -map_and_make_request (int rw, struct buffer_head *bh)
9582 +static void map_and_make_request (int rw, struct buffer_head *bh)
9584 if (MAJOR (bh->b_rdev) == MD_MAJOR)
9585 - md_map (MINOR (bh->b_rdev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
9586 + md_map (bh->b_rdev, &bh->b_rdev,
9587 + &bh->b_rsector, bh->b_size >> 9);
9588 clear_bit(BH_Lock, &bh->b_state);
9589 make_request (MAJOR (bh->b_rdev), rw, bh);
9593 -raid1_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
9594 +static int raid1_make_request (mddev_t *mddev, int rw,
9595 + struct buffer_head * bh)
9598 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9599 + raid1_conf_t *conf = mddev_to_conf(mddev);
9600 struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req;
9601 struct raid1_bh * r1_bh;
9602 - int n = raid_conf->raid_disks, i, sum_bhs = 0, switch_disks = 0, sectors;
9603 + int disks = MD_SB_DISKS;
9604 + int i, sum_bhs = 0, switch_disks = 0, sectors, lowprio = 0;
9605 struct mirror_info *mirror;
9607 - PRINTK(("raid1_make_request().\n"));
9609 - while (!( /* FIXME: now we are rather fault tolerant than nice */
9610 - r1_bh = kmalloc (sizeof (struct raid1_bh), GFP_BUFFER)
9613 - printk ("raid1_make_request(#1): out of memory\n");
9614 - current->policy |= SCHED_YIELD;
9617 - memset (r1_bh, 0, sizeof (struct raid1_bh));
9618 + r1_bh = raid1_kmalloc (sizeof (struct raid1_bh));
9621 * make_request() can abort the operation when READA or WRITEA are being
9622 @@ -227,43 +206,65 @@
9623 if (rw == READA) rw = READ;
9624 if (rw == WRITEA) rw = WRITE;
9626 - if (rw == WRITE || rw == WRITEA)
9627 - mark_buffer_clean(bh); /* Too early ? */
9628 + if (rw == WRITE) {
9632 + mark_buffer_clean(bh);
9634 + * not too early. we _first_ clean the bh, then we start
9635 + * the IO, then when the IO has finished, we unlock the
9636 + * bh and mark it uptodate. This way we do not miss the
9637 + * case when the bh got dirty again during the IO.
9642 + * special flag for 'lowprio' reconstruction requests ...
9644 + if (buffer_lowprio(bh))
9648 - * i think the read and write branch should be separated completely, since we want
9649 - * to do read balancing on the read side for example. Comments? :) --mingo
9650 + * i think the read and write branch should be separated completely,
9651 + * since we want to do read balancing on the read side for example.
9652 + * Comments? :) --mingo
9655 r1_bh->master_bh=bh;
9659 - if (rw==READ || rw==READA) {
9660 - int last_used = raid_conf->last_used;
9661 - PRINTK(("raid1_make_request(), read branch.\n"));
9662 - mirror = raid_conf->mirrors + last_used;
9664 + int last_used = conf->last_used;
9667 + * read balancing logic:
9669 + mirror = conf->mirrors + last_used;
9670 bh->b_rdev = mirror->dev;
9671 sectors = bh->b_size >> 9;
9672 - if (bh->b_blocknr * sectors == raid_conf->next_sect) {
9673 - raid_conf->sect_count += sectors;
9674 - if (raid_conf->sect_count >= mirror->sect_limit)
9676 + if (bh->b_blocknr * sectors == conf->next_sect) {
9677 + conf->sect_count += sectors;
9678 + if (conf->sect_count >= mirror->sect_limit)
9682 - raid_conf->next_sect = (bh->b_blocknr + 1) * sectors;
9683 - if (switch_disks) {
9684 - PRINTK(("read-balancing: switching %d -> %d (%d sectors)\n", last_used, mirror->next, raid_conf->sect_count));
9685 - raid_conf->sect_count = 0;
9686 - last_used = raid_conf->last_used = mirror->next;
9687 + conf->next_sect = (bh->b_blocknr + 1) * sectors;
9689 + * Do not switch disks if full resync is in progress ...
9691 + if (switch_disks && !conf->resync_mirrors) {
9692 + conf->sect_count = 0;
9693 + last_used = conf->last_used = mirror->next;
9695 - * Do not switch to write-only disks ... resyncing
9697 + * Do not switch to write-only disks ...
9698 + * reconstruction is in progress
9700 - while (raid_conf->mirrors[last_used].write_only)
9701 - raid_conf->last_used = raid_conf->mirrors[last_used].next;
9702 + while (conf->mirrors[last_used].write_only)
9703 + conf->last_used = conf->mirrors[last_used].next;
9705 - PRINTK (("raid1 read queue: %d %d\n", MAJOR (bh->b_rdev), MINOR (bh->b_rdev)));
9706 bh_req = &r1_bh->bh_req;
9707 memcpy(bh_req, bh, sizeof(*bh));
9708 bh_req->b_end_io = raid1_end_request;
9709 @@ -273,13 +274,12 @@
9713 - * WRITE or WRITEA.
9716 - PRINTK(("raid1_make_request(n=%d), write branch.\n",n));
9718 - for (i = 0; i < n; i++) {
9719 + for (i = 0; i < disks; i++) {
9721 - if (!raid_conf->mirrors [i].operational) {
9722 + if (!conf->mirrors[i].operational) {
9724 * the r1_bh->mirror_bh[i] pointer remains NULL
9726 @@ -287,89 +287,91 @@
9731 + * special case for reconstruction ...
9733 + if (lowprio && (i == conf->last_used)) {
9734 + mirror_bh[i] = NULL;
9739 + * We should use a private pool (size depending on NR_REQUEST),
9740 + * to avoid writes filling up the memory with bhs
9742 + * Such pools are much faster than kmalloc anyways (so we waste
9743 + * almost nothing by not using the master bh when writing and
9744 + * win a lot of cleanness) but for now we are cool enough. --mingo
9746 + * It's safe to sleep here, buffer heads cannot be used in a shared
9747 + * manner in the write branch. Look how we lock the buffer at the
9748 + * beginning of this function to grok the difference ;)
9750 + mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head));
9752 + * prepare mirrored bh (fields ordered for max mem throughput):
9754 + mirror_bh[i]->b_blocknr = bh->b_blocknr;
9755 + mirror_bh[i]->b_dev = bh->b_dev;
9756 + mirror_bh[i]->b_rdev = conf->mirrors[i].dev;
9757 + mirror_bh[i]->b_rsector = bh->b_rsector;
9758 + mirror_bh[i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
9760 + mirror_bh[i]->b_state |= (1<<BH_LowPrio);
9762 + mirror_bh[i]->b_count = 1;
9763 + mirror_bh[i]->b_size = bh->b_size;
9764 + mirror_bh[i]->b_data = bh->b_data;
9765 + mirror_bh[i]->b_list = BUF_LOCKED;
9766 + mirror_bh[i]->b_end_io = raid1_end_request;
9767 + mirror_bh[i]->b_dev_id = r1_bh;
9769 + r1_bh->mirror_bh[i] = mirror_bh[i];
9773 + md_atomic_set(&r1_bh->remaining, sum_bhs);
9776 - * We should use a private pool (size depending on NR_REQUEST),
9777 - * to avoid writes filling up the memory with bhs
9779 - * Such pools are much faster than kmalloc anyways (so we waste almost
9780 - * nothing by not using the master bh when writing and win alot of cleanness)
9782 - * but for now we are cool enough. --mingo
9784 - * It's safe to sleep here, buffer heads cannot be used in a shared
9785 - * manner in the write branch. Look how we lock the buffer at the beginning
9786 - * of this function to grok the difference ;)
9788 - while (!( /* FIXME: now we are rather fault tolerant than nice */
9789 - mirror_bh[i] = kmalloc (sizeof (struct buffer_head), GFP_BUFFER)
9792 - printk ("raid1_make_request(#2): out of memory\n");
9793 - current->policy |= SCHED_YIELD;
9796 - memset (mirror_bh[i], 0, sizeof (struct buffer_head));
9799 - * prepare mirrored bh (fields ordered for max mem throughput):
9801 - mirror_bh [i]->b_blocknr = bh->b_blocknr;
9802 - mirror_bh [i]->b_dev = bh->b_dev;
9803 - mirror_bh [i]->b_rdev = raid_conf->mirrors [i].dev;
9804 - mirror_bh [i]->b_rsector = bh->b_rsector;
9805 - mirror_bh [i]->b_state = (1<<BH_Req) | (1<<BH_Dirty);
9806 - mirror_bh [i]->b_count = 1;
9807 - mirror_bh [i]->b_size = bh->b_size;
9808 - mirror_bh [i]->b_data = bh->b_data;
9809 - mirror_bh [i]->b_list = BUF_LOCKED;
9810 - mirror_bh [i]->b_end_io = raid1_end_request;
9811 - mirror_bh [i]->b_dev_id = r1_bh;
9813 - r1_bh->mirror_bh[i] = mirror_bh[i];
9817 - r1_bh->remaining = sum_bhs;
9819 - PRINTK(("raid1_make_request(), write branch, sum_bhs=%d.\n",sum_bhs));
9822 - * We have to be a bit careful about the semaphore above, thats why we
9823 - * start the requests separately. Since kmalloc() could fail, sleep and
9824 - * make_request() can sleep too, this is the safer solution. Imagine,
9825 - * end_request decreasing the semaphore before we could have set it up ...
9826 - * We could play tricks with the semaphore (presetting it and correcting
9827 - * at the end if sum_bhs is not 'n' but we have to do end_request by hand
9828 - * if all requests finish until we had a chance to set up the semaphore
9829 - * correctly ... lots of races).
9831 - for (i = 0; i < n; i++)
9832 - if (mirror_bh [i] != NULL)
9833 - map_and_make_request (rw, mirror_bh [i]);
9834 + * We have to be a bit careful about the semaphore above, that's
9835 + * why we start the requests separately. Since kmalloc() could
9836 + * fail, sleep and make_request() can sleep too, this is the
9837 + * safer solution. Imagine, end_request decreasing the semaphore
9838 + * before we could have set it up ... We could play tricks with
9839 + * the semaphore (presetting it and correcting at the end if
9840 + * sum_bhs is not 'n' but we have to do end_request by hand if
9841 + * all requests finish until we had a chance to set up the
9842 + * semaphore correctly ... lots of races).
9844 + for (i = 0; i < disks; i++)
9846 + map_and_make_request(rw, mirror_bh[i]);
9851 -static int raid1_status (char *page, int minor, struct md_dev *mddev)
9852 +static int raid1_status (char *page, mddev_t *mddev)
9854 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9855 + raid1_conf_t *conf = mddev_to_conf(mddev);
9858 - sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
9859 - for (i = 0; i < raid_conf->raid_disks; i++)
9860 - sz += sprintf (page+sz, "%s", raid_conf->mirrors [i].operational ? "U" : "_");
9861 + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
9862 + conf->working_disks);
9863 + for (i = 0; i < conf->raid_disks; i++)
9864 + sz += sprintf (page+sz, "%s",
9865 + conf->mirrors[i].operational ? "U" : "_");
9866 sz += sprintf (page+sz, "]");
9870 -static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index)
9871 +static void unlink_disk (raid1_conf_t *conf, int target)
9873 - int disks = raid_conf->raid_disks;
9875 + int disks = MD_SB_DISKS;
9878 - for (j = 0; j < disks; j++)
9879 - if (raid_conf->mirrors [j].next == failed_index)
9880 - raid_conf->mirrors [j].next = raid_conf->mirrors [failed_index].next;
9881 + for (i = 0; i < disks; i++)
9882 + if (conf->mirrors[i].next == target)
9883 + conf->mirrors[i].next = conf->mirrors[target].next;
9886 #define LAST_DISK KERN_ALERT \
9887 @@ -388,48 +390,53 @@
9888 #define ALREADY_SYNCING KERN_INFO \
9889 "raid1: syncing already in progress.\n"
9891 -static int raid1_error (struct md_dev *mddev, kdev_t dev)
9892 +static void mark_disk_bad (mddev_t *mddev, int failed)
9894 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
9895 - struct mirror_info *mirror;
9896 - md_superblock_t *sb = mddev->sb;
9897 - int disks = raid_conf->raid_disks;
9899 + raid1_conf_t *conf = mddev_to_conf(mddev);
9900 + struct mirror_info *mirror = conf->mirrors+failed;
9901 + mdp_super_t *sb = mddev->sb;
9903 + mirror->operational = 0;
9904 + unlink_disk(conf, failed);
9905 + mark_disk_faulty(sb->disks+mirror->number);
9906 + mark_disk_nonsync(sb->disks+mirror->number);
9907 + mark_disk_inactive(sb->disks+mirror->number);
9908 + sb->active_disks--;
9909 + sb->working_disks--;
9910 + sb->failed_disks++;
9911 + mddev->sb_dirty = 1;
9912 + md_wakeup_thread(conf->thread);
9913 + conf->working_disks--;
9914 + printk (DISK_FAILED, partition_name (mirror->dev),
9915 + conf->working_disks);
9918 - PRINTK(("raid1_error called\n"));
9919 +static int raid1_error (mddev_t *mddev, kdev_t dev)
9921 + raid1_conf_t *conf = mddev_to_conf(mddev);
9922 + struct mirror_info * mirrors = conf->mirrors;
9923 + int disks = MD_SB_DISKS;
9926 - if (raid_conf->working_disks == 1) {
9927 + if (conf->working_disks == 1) {
9929 * Uh oh, we can do nothing if this is our last disk, but
9930 * first check if this is a queued request for a device
9931 * which has just failed.
9933 - for (i = 0, mirror = raid_conf->mirrors; i < disks;
9935 - if (mirror->dev == dev && !mirror->operational)
9936 + for (i = 0; i < disks; i++) {
9937 + if (mirrors[i].dev==dev && !mirrors[i].operational)
9942 - /* Mark disk as unusable */
9943 - for (i = 0, mirror = raid_conf->mirrors; i < disks;
9945 - if (mirror->dev == dev && mirror->operational){
9946 - mirror->operational = 0;
9947 - raid1_fix_links (raid_conf, i);
9948 - sb->disks[mirror->number].state |=
9949 - (1 << MD_FAULTY_DEVICE);
9950 - sb->disks[mirror->number].state &=
9951 - ~(1 << MD_SYNC_DEVICE);
9952 - sb->disks[mirror->number].state &=
9953 - ~(1 << MD_ACTIVE_DEVICE);
9954 - sb->active_disks--;
9955 - sb->working_disks--;
9956 - sb->failed_disks++;
9957 - mddev->sb_dirty = 1;
9958 - md_wakeup_thread(raid1_thread);
9959 - raid_conf->working_disks--;
9960 - printk (DISK_FAILED, kdevname (dev),
9961 - raid_conf->working_disks);
9963 + * Mark disk as unusable
9965 + for (i = 0; i < disks; i++) {
9966 + if (mirrors[i].dev==dev && mirrors[i].operational) {
9967 + mark_disk_bad (mddev, i);
9972 @@ -442,219 +449,396 @@
9973 #undef START_SYNCING
9976 - * This is the personality-specific hot-addition routine
9977 + * Insert the spare disk into the drive-ring
9979 +static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror)
9982 + int disks = MD_SB_DISKS;
9983 + struct mirror_info *p = conf->mirrors;
9985 -#define NO_SUPERBLOCK KERN_ERR \
9986 -"raid1: cannot hot-add disk to the array with no RAID superblock\n"
9987 + for (j = 0; j < disks; j++, p++)
9988 + if (p->operational && !p->write_only) {
9990 + p->next = mirror->raid_disk;
9991 + mirror->next = next;
9995 -#define WRONG_LEVEL KERN_ERR \
9996 -"raid1: hot-add: level of disk is not RAID-1\n"
9997 + printk("raid1: bug: no read-operational devices\n");
10000 -#define HOT_ADD_SUCCEEDED KERN_INFO \
10001 -"raid1: device %s hot-added\n"
10002 +static void print_raid1_conf (raid1_conf_t *conf)
10005 + struct mirror_info *tmp;
10007 -static int raid1_hot_add_disk (struct md_dev *mddev, kdev_t dev)
10008 + printk("RAID1 conf printout:\n");
10010 + printk("(conf==NULL)\n");
10013 + printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
10014 + conf->raid_disks, conf->nr_disks);
10016 + for (i = 0; i < MD_SB_DISKS; i++) {
10017 + tmp = conf->mirrors + i;
10018 + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
10019 + i, tmp->spare,tmp->operational,
10020 + tmp->number,tmp->raid_disk,tmp->used_slot,
10021 + partition_name(tmp->dev));
10025 +static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
10028 + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
10029 + raid1_conf_t *conf = mddev->private;
10030 + struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
10031 unsigned long flags;
10032 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
10033 - struct mirror_info *mirror;
10034 - md_superblock_t *sb = mddev->sb;
10035 - struct real_dev * realdev;
10037 + mdp_super_t *sb = mddev->sb;
10038 + mdp_disk_t *failed_desc, *spare_desc, *added_desc;
10040 + save_flags(flags);
10043 + print_raid1_conf(conf);
10045 - * The device has its superblock already read and it was found
10046 - * to be consistent for generic RAID usage. Now we check whether
10047 - * it's usable for RAID-1 hot addition.
10048 + * find the disk ...
10052 - n = mddev->nb_dev++;
10053 - realdev = &mddev->devices[n];
10054 - if (!realdev->sb) {
10055 - printk (NO_SUPERBLOCK);
10058 - if (realdev->sb->level != 1) {
10059 - printk (WRONG_LEVEL);
10061 + case DISKOP_SPARE_ACTIVE:
10064 + * Find the failed disk within the RAID1 configuration ...
10065 + * (this can only be in the first conf->working_disks part)
10067 + for (i = 0; i < conf->raid_disks; i++) {
10068 + tmp = conf->mirrors + i;
10069 + if ((!tmp->operational && !tmp->spare) ||
10070 + !tmp->used_slot) {
10076 + * When we activate a spare disk we _must_ have a disk in
10077 + * the lower (active) part of the array to replace.
10079 + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
10084 + /* fall through */
10086 + case DISKOP_SPARE_WRITE:
10087 + case DISKOP_SPARE_INACTIVE:
10090 + * Find the spare disk ... (can only be in the 'high'
10091 + * area of the array)
10093 + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
10094 + tmp = conf->mirrors + i;
10095 + if (tmp->spare && tmp->number == (*d)->number) {
10100 + if (spare_disk == -1) {
10107 + case DISKOP_HOT_REMOVE_DISK:
10109 + for (i = 0; i < MD_SB_DISKS; i++) {
10110 + tmp = conf->mirrors + i;
10111 + if (tmp->used_slot && (tmp->number == (*d)->number)) {
10112 + if (tmp->operational) {
10116 + removed_disk = i;
10120 + if (removed_disk == -1) {
10127 + case DISKOP_HOT_ADD_DISK:
10129 + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
10130 + tmp = conf->mirrors + i;
10131 + if (!tmp->used_slot) {
10136 + if (added_disk == -1) {
10143 - /* FIXME: are there other things left we could sanity-check? */
10147 - * We have to disable interrupts, as our RAID-1 state is used
10148 - * from irq handlers as well.
10149 + * Switch the spare disk to write-only mode:
10151 - save_flags(flags);
10153 + case DISKOP_SPARE_WRITE:
10154 + sdisk = conf->mirrors + spare_disk;
10155 + sdisk->operational = 1;
10156 + sdisk->write_only = 1;
10159 + * Deactivate a spare disk:
10161 + case DISKOP_SPARE_INACTIVE:
10162 + sdisk = conf->mirrors + spare_disk;
10163 + sdisk->operational = 0;
10164 + sdisk->write_only = 0;
10167 + * Activate (mark read-write) the (now sync) spare disk,
10168 + * which means we switch it's 'raid position' (->raid_disk)
10169 + * with the failed disk. (only the first 'conf->nr_disks'
10170 + * slots are used for 'real' disks and we must preserve this
10173 + case DISKOP_SPARE_ACTIVE:
10175 - raid_conf->raid_disks++;
10176 - mirror = raid_conf->mirrors+n;
10177 + sdisk = conf->mirrors + spare_disk;
10178 + fdisk = conf->mirrors + failed_disk;
10180 - mirror->number=n;
10181 - mirror->raid_disk=n;
10183 - mirror->next=0; /* FIXME */
10184 - mirror->sect_limit=128;
10186 - mirror->operational=0;
10188 - mirror->write_only=0;
10190 - sb->disks[n].state |= (1 << MD_FAULTY_DEVICE);
10191 - sb->disks[n].state &= ~(1 << MD_SYNC_DEVICE);
10192 - sb->disks[n].state &= ~(1 << MD_ACTIVE_DEVICE);
10194 - sb->spare_disks++;
10195 + spare_desc = &sb->disks[sdisk->number];
10196 + failed_desc = &sb->disks[fdisk->number];
10198 - restore_flags(flags);
10199 + if (spare_desc != *d) {
10205 - md_update_sb(MINOR(dev));
10206 + if (spare_desc->raid_disk != sdisk->raid_disk) {
10212 + if (sdisk->raid_disk != spare_disk) {
10218 - printk (HOT_ADD_SUCCEEDED, kdevname(realdev->dev));
10219 + if (failed_desc->raid_disk != fdisk->raid_disk) {
10227 + if (fdisk->raid_disk != failed_disk) {
10233 -#undef NO_SUPERBLOCK
10234 -#undef WRONG_LEVEL
10235 -#undef HOT_ADD_SUCCEEDED
10237 + * do the switch finally
10239 + xchg_values(*spare_desc, *failed_desc);
10240 + xchg_values(*fdisk, *sdisk);
10243 - * Insert the spare disk into the drive-ring
10245 -static void add_ring(struct raid1_data *raid_conf, struct mirror_info *mirror)
10248 - struct mirror_info *p = raid_conf->mirrors;
10250 + * (careful, 'failed' and 'spare' are switched from now on)
10252 + * we want to preserve linear numbering and we want to
10253 + * give the proper raid_disk number to the now activated
10254 + * disk. (this means we switch back these values)
10257 + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
10258 + xchg_values(sdisk->raid_disk, fdisk->raid_disk);
10259 + xchg_values(spare_desc->number, failed_desc->number);
10260 + xchg_values(sdisk->number, fdisk->number);
10262 - for (j = 0; j < raid_conf->raid_disks; j++, p++)
10263 - if (p->operational && !p->write_only) {
10265 - p->next = mirror->raid_disk;
10266 - mirror->next = next;
10269 - printk("raid1: bug: no read-operational devices\n");
10271 + *d = failed_desc;
10273 -static int raid1_mark_spare(struct md_dev *mddev, md_descriptor_t *spare,
10276 - int i = 0, failed_disk = -1;
10277 - struct raid1_data *raid_conf = mddev->private;
10278 - struct mirror_info *mirror = raid_conf->mirrors;
10279 - md_descriptor_t *descriptor;
10280 - unsigned long flags;
10281 + if (sdisk->dev == MKDEV(0,0))
10282 + sdisk->used_slot = 0;
10284 + * this really activates the spare.
10286 + fdisk->spare = 0;
10287 + fdisk->write_only = 0;
10288 + link_disk(conf, fdisk);
10290 - for (i = 0; i < MD_SB_DISKS; i++, mirror++) {
10291 - if (mirror->spare && mirror->number == spare->number)
10296 - for (i = 0, mirror = raid_conf->mirrors; i < raid_conf->raid_disks;
10298 - if (!mirror->operational)
10301 + * if we activate a spare, we definitely replace a
10302 + * non-operational disk slot in the 'low' area of
10303 + * the disk array.
10306 - save_flags(flags);
10309 - case SPARE_WRITE:
10310 - mirror->operational = 1;
10311 - mirror->write_only = 1;
10312 - raid_conf->raid_disks = MAX(raid_conf->raid_disks,
10313 - mirror->raid_disk + 1);
10315 - case SPARE_INACTIVE:
10316 - mirror->operational = 0;
10317 - mirror->write_only = 0;
10319 - case SPARE_ACTIVE:
10320 - mirror->spare = 0;
10321 - mirror->write_only = 0;
10322 - raid_conf->working_disks++;
10323 - add_ring(raid_conf, mirror);
10325 - if (failed_disk != -1) {
10326 - descriptor = &mddev->sb->disks[raid_conf->mirrors[failed_disk].number];
10327 - i = spare->raid_disk;
10328 - spare->raid_disk = descriptor->raid_disk;
10329 - descriptor->raid_disk = i;
10333 - printk("raid1_mark_spare: bug: state == %d\n", state);
10334 - restore_flags(flags);
10336 + conf->working_disks++;
10340 + case DISKOP_HOT_REMOVE_DISK:
10341 + rdisk = conf->mirrors + removed_disk;
10343 + if (rdisk->spare && (removed_disk < conf->raid_disks)) {
10348 + rdisk->dev = MKDEV(0,0);
10349 + rdisk->used_slot = 0;
10350 + conf->nr_disks--;
10353 + case DISKOP_HOT_ADD_DISK:
10354 + adisk = conf->mirrors + added_disk;
10357 + if (added_disk != added_desc->number) {
10363 + adisk->number = added_desc->number;
10364 + adisk->raid_disk = added_desc->raid_disk;
10365 + adisk->dev = MKDEV(added_desc->major,added_desc->minor);
10367 + adisk->operational = 0;
10368 + adisk->write_only = 0;
10369 + adisk->spare = 1;
10370 + adisk->used_slot = 1;
10371 + conf->nr_disks++;
10381 restore_flags(flags);
10383 + print_raid1_conf(conf);
10388 +#define IO_ERROR KERN_ALERT \
10389 +"raid1: %s: unrecoverable I/O read error for block %lu\n"
10391 +#define REDIRECT_SECTOR KERN_ERR \
10392 +"raid1: %s: redirecting sector %lu to another mirror\n"
10395 * This is a kernel thread which:
10397 * 1. Retries failed read operations on working mirrors.
10398 * 2. Updates the raid superblock when problems encounter.
10400 -void raid1d (void *data)
10401 +static void raid1d (void *data)
10403 struct buffer_head *bh;
10405 unsigned long flags;
10406 - struct raid1_bh * r1_bh;
10407 - struct md_dev *mddev;
10408 + struct raid1_bh *r1_bh;
10411 - PRINTK(("raid1d() active\n"));
10412 - save_flags(flags);
10414 while (raid1_retry_list) {
10415 + save_flags(flags);
10417 bh = raid1_retry_list;
10418 r1_bh = (struct raid1_bh *)(bh->b_dev_id);
10419 raid1_retry_list = r1_bh->next_retry;
10420 restore_flags(flags);
10422 - mddev = md_dev + MINOR(bh->b_dev);
10423 + mddev = kdev_to_mddev(bh->b_dev);
10424 if (mddev->sb_dirty) {
10425 - printk("dirty sb detected, updating.\n");
10426 + printk(KERN_INFO "dirty sb detected, updating.\n");
10427 mddev->sb_dirty = 0;
10428 - md_update_sb(MINOR(bh->b_dev));
10429 + md_update_sb(mddev);
10432 - __raid1_map (md_dev + MINOR(bh->b_dev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9);
10433 + __raid1_map (mddev, &bh->b_rdev, &bh->b_rsector,
10434 + bh->b_size >> 9);
10435 if (bh->b_rdev == dev) {
10436 - printk (KERN_ALERT
10437 - "raid1: %s: unrecoverable I/O read error for block %lu\n",
10438 - kdevname(bh->b_dev), bh->b_blocknr);
10439 - raid1_end_buffer_io(r1_bh, 0);
10440 + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
10441 + raid1_end_bh_io(r1_bh, 0);
10443 - printk (KERN_ERR "raid1: %s: redirecting sector %lu to another mirror\n",
10444 - kdevname(bh->b_dev), bh->b_blocknr);
10445 + printk (REDIRECT_SECTOR,
10446 + partition_name(bh->b_dev), bh->b_blocknr);
10447 map_and_make_request (r1_bh->cmd, bh);
10451 - restore_flags(flags);
10454 +#undef REDIRECT_SECTOR
10457 + * Private kernel thread to reconstruct mirrors after an unclean
10460 +static void raid1syncd (void *data)
10462 + raid1_conf_t *conf = data;
10463 + mddev_t *mddev = conf->mddev;
10465 + if (!conf->resync_mirrors)
10467 + if (conf->resync_mirrors == 2)
10469 + down(&mddev->recovery_sem);
10470 + if (md_do_sync(mddev, NULL)) {
10471 + up(&mddev->recovery_sem);
10475 + * Only if everything went Ok.
10477 + conf->resync_mirrors = 0;
10478 + up(&mddev->recovery_sem);
10483 * This will catch the scenario in which one of the mirrors was
10484 * mounted as a normal device rather than as a part of a raid set.
10486 + * check_consistency is very personality-dependent, eg. RAID5 cannot
10487 + * do this check, it uses another method.
10489 -static int __check_consistency (struct md_dev *mddev, int row)
10490 +static int __check_consistency (mddev_t *mddev, int row)
10492 - struct raid1_data *raid_conf = mddev->private;
10493 + raid1_conf_t *conf = mddev_to_conf(mddev);
10494 + int disks = MD_SB_DISKS;
10496 struct buffer_head *bh = NULL;
10498 char *buffer = NULL;
10500 - for (i = 0; i < raid_conf->raid_disks; i++) {
10501 - if (!raid_conf->mirrors[i].operational)
10502 + for (i = 0; i < disks; i++) {
10503 + printk("(checking disk %d)\n",i);
10504 + if (!conf->mirrors[i].operational)
10506 - dev = raid_conf->mirrors[i].dev;
10507 + printk("(really checking disk %d)\n",i);
10508 + dev = conf->mirrors[i].dev;
10509 set_blocksize(dev, 4096);
10510 if ((bh = bread(dev, row / 4, 4096)) == NULL)
10512 @@ -683,167 +867,342 @@
10516 -static int check_consistency (struct md_dev *mddev)
10517 +static int check_consistency (mddev_t *mddev)
10519 - int size = mddev->sb->size;
10521 + if (__check_consistency(mddev, 0))
10523 + * we do not do this currently, as it's perfectly possible to
10524 + * have an inconsistent array when it's freshly created. Only
10525 + * newly written data has to be consistent.
10529 - for (row = 0; row < size; row += size / 8)
10530 - if (__check_consistency(mddev, row))
10535 -static int raid1_run (int minor, struct md_dev *mddev)
10536 +#define INVALID_LEVEL KERN_WARNING \
10537 +"raid1: md%d: raid level not set to mirroring (%d)\n"
10539 +#define NO_SB KERN_ERR \
10540 +"raid1: disabled mirror %s (couldn't access raid superblock)\n"
10542 +#define ERRORS KERN_ERR \
10543 +"raid1: disabled mirror %s (errors detected)\n"
10545 +#define NOT_IN_SYNC KERN_ERR \
10546 +"raid1: disabled mirror %s (not in sync)\n"
10548 +#define INCONSISTENT KERN_ERR \
10549 +"raid1: disabled mirror %s (inconsistent descriptor)\n"
10551 +#define ALREADY_RUNNING KERN_ERR \
10552 +"raid1: disabled mirror %s (mirror %d already operational)\n"
10554 +#define OPERATIONAL KERN_INFO \
10555 +"raid1: device %s operational as mirror %d\n"
10557 +#define MEM_ERROR KERN_ERR \
10558 +"raid1: couldn't allocate memory for md%d\n"
10560 +#define SPARE KERN_INFO \
10561 +"raid1: spare disk %s\n"
10563 +#define NONE_OPERATIONAL KERN_ERR \
10564 +"raid1: no operational mirrors for md%d\n"
10566 +#define RUNNING_CKRAID KERN_ERR \
10567 +"raid1: detected mirror differences -- running resync\n"
10569 +#define ARRAY_IS_ACTIVE KERN_INFO \
10570 +"raid1: raid set md%d active with %d out of %d mirrors\n"
10572 +#define THREAD_ERROR KERN_ERR \
10573 +"raid1: couldn't allocate thread for md%d\n"
10575 +#define START_RESYNC KERN_WARNING \
10576 +"raid1: raid set md%d not clean; reconstructing mirrors\n"
10578 +static int raid1_run (mddev_t *mddev)
10580 - struct raid1_data *raid_conf;
10581 - int i, j, raid_disk;
10582 - md_superblock_t *sb = mddev->sb;
10583 - md_descriptor_t *descriptor;
10584 - struct real_dev *realdev;
10585 + raid1_conf_t *conf;
10586 + int i, j, disk_idx;
10587 + struct mirror_info *disk;
10588 + mdp_super_t *sb = mddev->sb;
10589 + mdp_disk_t *descriptor;
10590 + mdk_rdev_t *rdev;
10591 + struct md_list_head *tmp;
10592 + int start_recovery = 0;
10596 if (sb->level != 1) {
10597 - printk("raid1: %s: raid level not set to mirroring (%d)\n",
10598 - kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
10599 - MOD_DEC_USE_COUNT;
10603 - * copy the now verified devices into our private RAID1 bookkeeping
10604 - * area. [whatever we allocate in raid1_run(), should be freed in
10606 + printk(INVALID_LEVEL, mdidx(mddev), sb->level);
10610 + * copy the already verified devices into our private RAID1
10611 + * bookkeeping area. [whatever we allocate in raid1_run(),
10612 + * should be freed in raid1_stop()]
10615 - while (!( /* FIXME: now we are rather fault tolerant than nice */
10616 - mddev->private = kmalloc (sizeof (struct raid1_data), GFP_KERNEL)
10619 - printk ("raid1_run(): out of memory\n");
10620 - current->policy |= SCHED_YIELD;
10623 - raid_conf = mddev->private;
10624 - memset(raid_conf, 0, sizeof(*raid_conf));
10626 - PRINTK(("raid1_run(%d) called.\n", minor));
10628 - for (i = 0; i < mddev->nb_dev; i++) {
10629 - realdev = &mddev->devices[i];
10630 - if (!realdev->sb) {
10631 - printk(KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
10632 + conf = raid1_kmalloc(sizeof(raid1_conf_t));
10633 + mddev->private = conf;
10635 + printk(MEM_ERROR, mdidx(mddev));
10639 + ITERATE_RDEV(mddev,rdev,tmp) {
10640 + if (rdev->faulty) {
10641 + printk(ERRORS, partition_name(rdev->dev));
10648 + if (rdev->desc_nr == -1) {
10654 - * This is important -- we are using the descriptor on
10655 - * the disk only to get a pointer to the descriptor on
10656 - * the main superblock, which might be more recent.
10658 - descriptor = &sb->disks[realdev->sb->descriptor.number];
10659 - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
10660 - printk(KERN_ERR "raid1: disabled mirror %s (errors detected)\n", kdevname(realdev->dev));
10661 + descriptor = &sb->disks[rdev->desc_nr];
10662 + disk_idx = descriptor->raid_disk;
10663 + disk = conf->mirrors + disk_idx;
10665 + if (disk_faulty(descriptor)) {
10666 + disk->number = descriptor->number;
10667 + disk->raid_disk = disk_idx;
10668 + disk->dev = rdev->dev;
10669 + disk->sect_limit = MAX_LINEAR_SECTORS;
10670 + disk->operational = 0;
10671 + disk->write_only = 0;
10673 + disk->used_slot = 1;
10676 - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
10677 - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
10678 - printk(KERN_ERR "raid1: disabled mirror %s (not in sync)\n", kdevname(realdev->dev));
10679 + if (disk_active(descriptor)) {
10680 + if (!disk_sync(descriptor)) {
10681 + printk(NOT_IN_SYNC,
10682 + partition_name(rdev->dev));
10685 - raid_disk = descriptor->raid_disk;
10686 - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
10687 - printk(KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n", kdevname(realdev->dev));
10688 + if ((descriptor->number > MD_SB_DISKS) ||
10689 + (disk_idx > sb->raid_disks)) {
10691 + printk(INCONSISTENT,
10692 + partition_name(rdev->dev));
10695 - if (raid_conf->mirrors[raid_disk].operational) {
10696 - printk(KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n", kdevname(realdev->dev), raid_disk);
10697 + if (disk->operational) {
10698 + printk(ALREADY_RUNNING,
10699 + partition_name(rdev->dev),
10703 - printk(KERN_INFO "raid1: device %s operational as mirror %d\n", kdevname(realdev->dev), raid_disk);
10704 - raid_conf->mirrors[raid_disk].number = descriptor->number;
10705 - raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
10706 - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
10707 - raid_conf->mirrors[raid_disk].operational = 1;
10708 - raid_conf->mirrors[raid_disk].sect_limit = 128;
10709 - raid_conf->working_disks++;
10710 + printk(OPERATIONAL, partition_name(rdev->dev),
10712 + disk->number = descriptor->number;
10713 + disk->raid_disk = disk_idx;
10714 + disk->dev = rdev->dev;
10715 + disk->sect_limit = MAX_LINEAR_SECTORS;
10716 + disk->operational = 1;
10717 + disk->write_only = 0;
10719 + disk->used_slot = 1;
10720 + conf->working_disks++;
10723 * Must be a spare disk ..
10725 - printk(KERN_INFO "raid1: spare disk %s\n", kdevname(realdev->dev));
10726 - raid_disk = descriptor->raid_disk;
10727 - raid_conf->mirrors[raid_disk].number = descriptor->number;
10728 - raid_conf->mirrors[raid_disk].raid_disk = raid_disk;
10729 - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev;
10730 - raid_conf->mirrors[raid_disk].sect_limit = 128;
10732 - raid_conf->mirrors[raid_disk].operational = 0;
10733 - raid_conf->mirrors[raid_disk].write_only = 0;
10734 - raid_conf->mirrors[raid_disk].spare = 1;
10737 - if (!raid_conf->working_disks) {
10738 - printk(KERN_ERR "raid1: no operational mirrors for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
10739 - kfree(raid_conf);
10740 - mddev->private = NULL;
10741 - MOD_DEC_USE_COUNT;
10745 - raid_conf->raid_disks = sb->raid_disks;
10746 - raid_conf->mddev = mddev;
10748 - for (j = 0; !raid_conf->mirrors[j].operational; j++);
10749 - raid_conf->last_used = j;
10750 - for (i = raid_conf->raid_disks - 1; i >= 0; i--) {
10751 - if (raid_conf->mirrors[i].operational) {
10752 - PRINTK(("raid_conf->mirrors[%d].next == %d\n", i, j));
10753 - raid_conf->mirrors[i].next = j;
10754 + printk(SPARE, partition_name(rdev->dev));
10755 + disk->number = descriptor->number;
10756 + disk->raid_disk = disk_idx;
10757 + disk->dev = rdev->dev;
10758 + disk->sect_limit = MAX_LINEAR_SECTORS;
10759 + disk->operational = 0;
10760 + disk->write_only = 0;
10762 + disk->used_slot = 1;
10765 + if (!conf->working_disks) {
10766 + printk(NONE_OPERATIONAL, mdidx(mddev));
10767 + goto out_free_conf;
10770 + conf->raid_disks = sb->raid_disks;
10771 + conf->nr_disks = sb->nr_disks;
10772 + conf->mddev = mddev;
10774 + for (i = 0; i < MD_SB_DISKS; i++) {
10776 + descriptor = sb->disks+i;
10777 + disk_idx = descriptor->raid_disk;
10778 + disk = conf->mirrors + disk_idx;
10780 + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
10781 + !disk->used_slot) {
10783 + disk->number = descriptor->number;
10784 + disk->raid_disk = disk_idx;
10785 + disk->dev = MKDEV(0,0);
10787 + disk->operational = 0;
10788 + disk->write_only = 0;
10790 + disk->used_slot = 1;
10795 + * find the first working one and use it as a starting point
10796 + * to read balancing.
10798 + for (j = 0; !conf->mirrors[j].operational; j++)
10800 + conf->last_used = j;
10803 + * initialize the 'working disks' list.
10805 + for (i = conf->raid_disks - 1; i >= 0; i--) {
10806 + if (conf->mirrors[i].operational) {
10807 + conf->mirrors[i].next = j;
10812 - if (check_consistency(mddev)) {
10813 - printk(KERN_ERR "raid1: detected mirror differences -- run ckraid\n");
10814 - sb->state |= 1 << MD_SB_ERRORS;
10815 - kfree(raid_conf);
10816 - mddev->private = NULL;
10817 - MOD_DEC_USE_COUNT;
10819 + if (conf->working_disks != sb->raid_disks) {
10820 + printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
10821 + start_recovery = 1;
10824 + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
10826 + * we do sanity checks even if the device says
10829 + if (check_consistency(mddev)) {
10830 + printk(RUNNING_CKRAID);
10831 + sb->state &= ~(1 << MD_SB_CLEAN);
10836 + const char * name = "raid1d";
10838 + conf->thread = md_register_thread(raid1d, conf, name);
10839 + if (!conf->thread) {
10840 + printk(THREAD_ERROR, mdidx(mddev));
10841 + goto out_free_conf;
10845 + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
10846 + const char * name = "raid1syncd";
10848 + conf->resync_thread = md_register_thread(raid1syncd, conf,name);
10849 + if (!conf->resync_thread) {
10850 + printk(THREAD_ERROR, mdidx(mddev));
10851 + goto out_free_conf;
10854 + printk(START_RESYNC, mdidx(mddev));
10855 + conf->resync_mirrors = 1;
10856 + md_wakeup_thread(conf->resync_thread);
10860 * Regenerate the "device is in sync with the raid set" bit for
10863 - for (i = 0; i < sb->nr_disks ; i++) {
10864 - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
10865 + for (i = 0; i < MD_SB_DISKS; i++) {
10866 + mark_disk_nonsync(sb->disks+i);
10867 for (j = 0; j < sb->raid_disks; j++) {
10868 - if (!raid_conf->mirrors[j].operational)
10869 + if (!conf->mirrors[j].operational)
10871 - if (sb->disks[i].number == raid_conf->mirrors[j].number)
10872 - sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
10873 + if (sb->disks[i].number == conf->mirrors[j].number)
10874 + mark_disk_sync(sb->disks+i);
10877 - sb->active_disks = raid_conf->working_disks;
10878 + sb->active_disks = conf->working_disks;
10880 - printk("raid1: raid set %s active with %d out of %d mirrors\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks);
10881 - /* Ok, everything is just fine now */
10883 + if (start_recovery)
10884 + md_recover_arrays();
10887 + printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
10889 + * Ok, everything is just fine now
10895 + mddev->private = NULL;
10897 + MOD_DEC_USE_COUNT;
10901 +#undef INVALID_LEVEL
10904 +#undef NOT_IN_SYNC
10905 +#undef INCONSISTENT
10906 +#undef ALREADY_RUNNING
10907 +#undef OPERATIONAL
10909 +#undef NONE_OPERATIONAL
10910 +#undef RUNNING_CKRAID
10911 +#undef ARRAY_IS_ACTIVE
10913 +static int raid1_stop_resync (mddev_t *mddev)
10915 + raid1_conf_t *conf = mddev_to_conf(mddev);
10917 + if (conf->resync_thread) {
10918 + if (conf->resync_mirrors) {
10919 + conf->resync_mirrors = 2;
10920 + md_interrupt_thread(conf->resync_thread);
10921 + printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
10929 +static int raid1_restart_resync (mddev_t *mddev)
10931 + raid1_conf_t *conf = mddev_to_conf(mddev);
10933 + if (conf->resync_mirrors) {
10934 + if (!conf->resync_thread) {
10938 + conf->resync_mirrors = 1;
10939 + md_wakeup_thread(conf->resync_thread);
10945 -static int raid1_stop (int minor, struct md_dev *mddev)
10946 +static int raid1_stop (mddev_t *mddev)
10948 - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private;
10949 + raid1_conf_t *conf = mddev_to_conf(mddev);
10951 - kfree (raid_conf);
10952 + md_unregister_thread(conf->thread);
10953 + if (conf->resync_thread)
10954 + md_unregister_thread(conf->resync_thread);
10956 mddev->private = NULL;
10961 -static struct md_personality raid1_personality=
10962 +static mdk_personality_t raid1_personality=
10966 @@ -855,15 +1214,13 @@
10967 NULL, /* no ioctls */
10970 - raid1_hot_add_disk,
10971 - /* raid1_hot_remove_drive */ NULL,
10974 + raid1_stop_resync,
10975 + raid1_restart_resync
10978 int raid1_init (void)
10980 - if ((raid1_thread = md_register_thread(raid1d, NULL)) == NULL)
10982 return register_md_personality (RAID1, &raid1_personality);
10985 @@ -875,7 +1232,6 @@
10987 void cleanup_module (void)
10989 - md_unregister_thread (raid1_thread);
10990 unregister_md_personality (RAID1);
10993 --- linux/drivers/block/raid5.c.orig Fri May 8 09:17:13 1998
10994 +++ linux/drivers/block/raid5.c Fri Nov 23 11:18:20 2001
10996 -/*****************************************************************************
10998 * raid5.c : Multiple Devices driver for Linux
10999 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
11001 @@ -14,16 +14,15 @@
11002 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
11006 #include <linux/module.h>
11007 #include <linux/locks.h>
11008 #include <linux/malloc.h>
11009 -#include <linux/md.h>
11010 -#include <linux/raid5.h>
11011 +#include <linux/raid/raid5.h>
11012 #include <asm/bitops.h>
11013 #include <asm/atomic.h>
11014 -#include <asm/md.h>
11016 -static struct md_personality raid5_personality;
11017 +static mdk_personality_t raid5_personality;
11022 #define HASH_PAGES_ORDER 0
11023 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
11024 #define HASH_MASK (NR_HASH - 1)
11025 -#define stripe_hash(raid_conf, sect, size) ((raid_conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
11026 +#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
11029 * The following can be used to debug the driver
11031 #define PRINTK(x) do { ; } while (0)
11034 +static void print_raid5_conf (raid5_conf_t *conf);
11036 static inline int stripe_locked(struct stripe_head *sh)
11038 return test_bit(STRIPE_LOCKED, &sh->state);
11039 @@ -61,32 +62,32 @@
11041 static inline void lock_stripe(struct stripe_head *sh)
11043 - struct raid5_data *raid_conf = sh->raid_conf;
11044 - if (!test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
11045 + raid5_conf_t *conf = sh->raid_conf;
11046 + if (!md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
11047 PRINTK(("locking stripe %lu\n", sh->sector));
11048 - raid_conf->nr_locked_stripes++;
11049 + conf->nr_locked_stripes++;
11053 static inline void unlock_stripe(struct stripe_head *sh)
11055 - struct raid5_data *raid_conf = sh->raid_conf;
11056 - if (test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
11057 + raid5_conf_t *conf = sh->raid_conf;
11058 + if (md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
11059 PRINTK(("unlocking stripe %lu\n", sh->sector));
11060 - raid_conf->nr_locked_stripes--;
11061 + conf->nr_locked_stripes--;
11062 wake_up(&sh->wait);
11066 static inline void finish_stripe(struct stripe_head *sh)
11068 - struct raid5_data *raid_conf = sh->raid_conf;
11069 + raid5_conf_t *conf = sh->raid_conf;
11071 sh->cmd = STRIPE_NONE;
11072 sh->phase = PHASE_COMPLETE;
11073 - raid_conf->nr_pending_stripes--;
11074 - raid_conf->nr_cached_stripes++;
11075 - wake_up(&raid_conf->wait_for_stripe);
11076 + conf->nr_pending_stripes--;
11077 + conf->nr_cached_stripes++;
11078 + wake_up(&conf->wait_for_stripe);
11081 void __wait_on_stripe(struct stripe_head *sh)
11082 @@ -114,7 +115,7 @@
11083 __wait_on_stripe(sh);
11086 -static inline void remove_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
11087 +static inline void remove_hash(raid5_conf_t *conf, struct stripe_head *sh)
11089 PRINTK(("remove_hash(), stripe %lu\n", sh->sector));
11091 @@ -123,21 +124,22 @@
11092 sh->hash_next->hash_pprev = sh->hash_pprev;
11093 *sh->hash_pprev = sh->hash_next;
11094 sh->hash_pprev = NULL;
11095 - raid_conf->nr_hashed_stripes--;
11096 + conf->nr_hashed_stripes--;
11100 -static inline void insert_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
11101 +static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
11103 - struct stripe_head **shp = &stripe_hash(raid_conf, sh->sector, sh->size);
11104 + struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size);
11106 - PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", sh->sector, raid_conf->nr_hashed_stripes));
11107 + PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n",
11108 + sh->sector, conf->nr_hashed_stripes));
11110 if ((sh->hash_next = *shp) != NULL)
11111 (*shp)->hash_pprev = &sh->hash_next;
11113 sh->hash_pprev = shp;
11114 - raid_conf->nr_hashed_stripes++;
11115 + conf->nr_hashed_stripes++;
11118 static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size)
11119 @@ -145,13 +147,15 @@
11120 struct buffer_head *bh;
11121 unsigned long flags;
11123 - save_flags(flags);
11125 - if ((bh = sh->buffer_pool) == NULL)
11127 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
11128 + bh = sh->buffer_pool;
11131 sh->buffer_pool = bh->b_next;
11132 bh->b_size = b_size;
11133 - restore_flags(flags);
11135 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11140 @@ -160,12 +164,14 @@
11141 struct buffer_head *bh;
11142 unsigned long flags;
11144 - save_flags(flags);
11146 - if ((bh = sh->bh_pool) == NULL)
11148 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
11149 + bh = sh->bh_pool;
11152 sh->bh_pool = bh->b_next;
11153 - restore_flags(flags);
11155 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11160 @@ -173,54 +179,52 @@
11162 unsigned long flags;
11164 - save_flags(flags);
11166 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
11167 bh->b_next = sh->buffer_pool;
11168 sh->buffer_pool = bh;
11169 - restore_flags(flags);
11170 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11173 static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh)
11175 unsigned long flags;
11177 - save_flags(flags);
11179 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
11180 bh->b_next = sh->bh_pool;
11182 - restore_flags(flags);
11183 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11186 -static struct stripe_head *get_free_stripe(struct raid5_data *raid_conf)
11187 +static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
11189 struct stripe_head *sh;
11190 unsigned long flags;
11194 - if ((sh = raid_conf->free_sh_list) == NULL) {
11195 + if ((sh = conf->free_sh_list) == NULL) {
11196 restore_flags(flags);
11199 - raid_conf->free_sh_list = sh->free_next;
11200 - raid_conf->nr_free_sh--;
11201 - if (!raid_conf->nr_free_sh && raid_conf->free_sh_list)
11202 + conf->free_sh_list = sh->free_next;
11203 + conf->nr_free_sh--;
11204 + if (!conf->nr_free_sh && conf->free_sh_list)
11205 printk ("raid5: bug: free_sh_list != NULL, nr_free_sh == 0\n");
11206 restore_flags(flags);
11207 - if (sh->hash_pprev || sh->nr_pending || sh->count)
11208 + if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || sh->count)
11209 printk("get_free_stripe(): bug\n");
11213 -static void put_free_stripe(struct raid5_data *raid_conf, struct stripe_head *sh)
11214 +static void put_free_stripe(raid5_conf_t *conf, struct stripe_head *sh)
11216 unsigned long flags;
11220 - sh->free_next = raid_conf->free_sh_list;
11221 - raid_conf->free_sh_list = sh;
11222 - raid_conf->nr_free_sh++;
11223 + sh->free_next = conf->free_sh_list;
11224 + conf->free_sh_list = sh;
11225 + conf->nr_free_sh++;
11226 restore_flags(flags);
11229 @@ -324,8 +328,8 @@
11231 static void kfree_stripe(struct stripe_head *sh)
11233 - struct raid5_data *raid_conf = sh->raid_conf;
11234 - int disks = raid_conf->raid_disks, j;
11235 + raid5_conf_t *conf = sh->raid_conf;
11236 + int disks = conf->raid_disks, j;
11238 PRINTK(("kfree_stripe called, stripe %lu\n", sh->sector));
11239 if (sh->phase != PHASE_COMPLETE || stripe_locked(sh) || sh->count) {
11240 @@ -338,19 +342,19 @@
11241 if (sh->bh_new[j] || sh->bh_copy[j])
11242 printk("raid5: bug: sector %lu, new %p, copy %p\n", sh->sector, sh->bh_new[j], sh->bh_copy[j]);
11244 - remove_hash(raid_conf, sh);
11245 - put_free_stripe(raid_conf, sh);
11246 + remove_hash(conf, sh);
11247 + put_free_stripe(conf, sh);
11250 -static int shrink_stripe_cache(struct raid5_data *raid_conf, int nr)
11251 +static int shrink_stripe_cache(raid5_conf_t *conf, int nr)
11253 struct stripe_head *sh;
11256 - PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, raid_conf->nr_hashed_stripes, raid_conf->clock));
11257 + PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, conf->nr_hashed_stripes, conf->clock));
11258 for (i = 0; i < NR_HASH; i++) {
11260 - sh = raid_conf->stripe_hashtbl[(i + raid_conf->clock) & HASH_MASK];
11261 + sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK];
11262 for (; sh; sh = sh->hash_next) {
11263 if (sh->phase != PHASE_COMPLETE)
11265 @@ -360,30 +364,30 @@
11268 if (++count == nr) {
11269 - PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
11270 - raid_conf->clock = (i + raid_conf->clock) & HASH_MASK;
11271 + PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
11272 + conf->clock = (i + conf->clock) & HASH_MASK;
11278 - PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes));
11279 + PRINTK(("shrink completed, nr_hashed_stripes %d\n", conf->nr_hashed_stripes));
11283 -static struct stripe_head *find_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
11284 +static struct stripe_head *find_stripe(raid5_conf_t *conf, unsigned long sector, int size)
11286 struct stripe_head *sh;
11288 - if (raid_conf->buffer_size != size) {
11289 - PRINTK(("switching size, %d --> %d\n", raid_conf->buffer_size, size));
11290 - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
11291 - raid_conf->buffer_size = size;
11292 + if (conf->buffer_size != size) {
11293 + PRINTK(("switching size, %d --> %d\n", conf->buffer_size, size));
11294 + shrink_stripe_cache(conf, conf->max_nr_stripes);
11295 + conf->buffer_size = size;
11298 PRINTK(("find_stripe, sector %lu\n", sector));
11299 - for (sh = stripe_hash(raid_conf, sector, size); sh; sh = sh->hash_next)
11300 - if (sh->sector == sector && sh->raid_conf == raid_conf) {
11301 + for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next)
11302 + if (sh->sector == sector && sh->raid_conf == conf) {
11303 if (sh->size == size) {
11304 PRINTK(("found stripe %lu\n", sector));
11306 @@ -397,7 +401,7 @@
11310 -static int grow_stripes(struct raid5_data *raid_conf, int num, int priority)
11311 +static int grow_stripes(raid5_conf_t *conf, int num, int priority)
11313 struct stripe_head *sh;
11315 @@ -405,62 +409,64 @@
11316 if ((sh = kmalloc(sizeof(struct stripe_head), priority)) == NULL)
11318 memset(sh, 0, sizeof(*sh));
11319 - if (grow_buffers(sh, 2 * raid_conf->raid_disks, PAGE_SIZE, priority)) {
11320 - shrink_buffers(sh, 2 * raid_conf->raid_disks);
11321 + sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED;
11323 + if (grow_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) {
11324 + shrink_buffers(sh, 2 * conf->raid_disks);
11328 - if (grow_bh(sh, raid_conf->raid_disks, priority)) {
11329 - shrink_buffers(sh, 2 * raid_conf->raid_disks);
11330 - shrink_bh(sh, raid_conf->raid_disks);
11331 + if (grow_bh(sh, conf->raid_disks, priority)) {
11332 + shrink_buffers(sh, 2 * conf->raid_disks);
11333 + shrink_bh(sh, conf->raid_disks);
11337 - put_free_stripe(raid_conf, sh);
11338 - raid_conf->nr_stripes++;
11339 + put_free_stripe(conf, sh);
11340 + conf->nr_stripes++;
11345 -static void shrink_stripes(struct raid5_data *raid_conf, int num)
11346 +static void shrink_stripes(raid5_conf_t *conf, int num)
11348 struct stripe_head *sh;
11351 - sh = get_free_stripe(raid_conf);
11352 + sh = get_free_stripe(conf);
11355 - shrink_buffers(sh, raid_conf->raid_disks * 2);
11356 - shrink_bh(sh, raid_conf->raid_disks);
11357 + shrink_buffers(sh, conf->raid_disks * 2);
11358 + shrink_bh(sh, conf->raid_disks);
11360 - raid_conf->nr_stripes--;
11361 + conf->nr_stripes--;
11365 -static struct stripe_head *kmalloc_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
11366 +static struct stripe_head *kmalloc_stripe(raid5_conf_t *conf, unsigned long sector, int size)
11368 struct stripe_head *sh = NULL, *tmp;
11369 struct buffer_head *buffer_pool, *bh_pool;
11371 PRINTK(("kmalloc_stripe called\n"));
11373 - while ((sh = get_free_stripe(raid_conf)) == NULL) {
11374 - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes / 8);
11375 - if ((sh = get_free_stripe(raid_conf)) != NULL)
11376 + while ((sh = get_free_stripe(conf)) == NULL) {
11377 + shrink_stripe_cache(conf, conf->max_nr_stripes / 8);
11378 + if ((sh = get_free_stripe(conf)) != NULL)
11380 - if (!raid_conf->nr_pending_stripes)
11381 + if (!conf->nr_pending_stripes)
11382 printk("raid5: bug: nr_free_sh == 0, nr_pending_stripes == 0\n");
11383 - md_wakeup_thread(raid_conf->thread);
11384 + md_wakeup_thread(conf->thread);
11385 PRINTK(("waiting for some stripes to complete\n"));
11386 - sleep_on(&raid_conf->wait_for_stripe);
11387 + sleep_on(&conf->wait_for_stripe);
11391 * The above might have slept, so perhaps another process
11392 * already created the stripe for us..
11394 - if ((tmp = find_stripe(raid_conf, sector, size)) != NULL) {
11395 - put_free_stripe(raid_conf, sh);
11396 + if ((tmp = find_stripe(conf, sector, size)) != NULL) {
11397 + put_free_stripe(conf, sh);
11398 wait_on_stripe(tmp);
11401 @@ -472,25 +478,25 @@
11402 sh->bh_pool = bh_pool;
11403 sh->phase = PHASE_COMPLETE;
11404 sh->cmd = STRIPE_NONE;
11405 - sh->raid_conf = raid_conf;
11406 + sh->raid_conf = conf;
11407 sh->sector = sector;
11409 - raid_conf->nr_cached_stripes++;
11410 - insert_hash(raid_conf, sh);
11411 + conf->nr_cached_stripes++;
11412 + insert_hash(conf, sh);
11413 } else printk("raid5: bug: kmalloc_stripe() == NULL\n");
11417 -static struct stripe_head *get_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
11418 +static struct stripe_head *get_stripe(raid5_conf_t *conf, unsigned long sector, int size)
11420 struct stripe_head *sh;
11422 PRINTK(("get_stripe, sector %lu\n", sector));
11423 - sh = find_stripe(raid_conf, sector, size);
11424 + sh = find_stripe(conf, sector, size);
11426 wait_on_stripe(sh);
11428 - sh = kmalloc_stripe(raid_conf, sector, size);
11429 + sh = kmalloc_stripe(conf, sector, size);
11433 @@ -523,7 +529,7 @@
11434 bh->b_end_io(bh, uptodate);
11436 printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for "
11437 - "block %lu\n", kdevname(bh->b_dev), bh->b_blocknr);
11438 + "block %lu\n", partition_name(bh->b_dev), bh->b_blocknr);
11441 static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate)
11442 @@ -537,36 +543,35 @@
11443 static void raid5_end_request (struct buffer_head * bh, int uptodate)
11445 struct stripe_head *sh = bh->b_dev_id;
11446 - struct raid5_data *raid_conf = sh->raid_conf;
11447 - int disks = raid_conf->raid_disks, i;
11448 + raid5_conf_t *conf = sh->raid_conf;
11449 + int disks = conf->raid_disks, i;
11450 unsigned long flags;
11452 PRINTK(("end_request %lu, nr_pending %d\n", sh->sector, sh->nr_pending));
11453 - save_flags(flags);
11455 + md_spin_lock_irqsave(&sh->stripe_lock, flags);
11456 raid5_mark_buffer_uptodate(bh, uptodate);
11457 - --sh->nr_pending;
11458 - if (!sh->nr_pending) {
11459 - md_wakeup_thread(raid_conf->thread);
11460 - atomic_inc(&raid_conf->nr_handle);
11461 + if (atomic_dec_and_test(&sh->nr_pending)) {
11462 + md_wakeup_thread(conf->thread);
11463 + atomic_inc(&conf->nr_handle);
11467 md_error(bh->b_dev, bh->b_rdev);
11468 - if (raid_conf->failed_disks) {
11470 + if (conf->failed_disks) {
11471 for (i = 0; i < disks; i++) {
11472 - if (raid_conf->disks[i].operational)
11473 + if (conf->disks[i].operational)
11475 if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i])
11477 - if (bh->b_rdev != raid_conf->disks[i].dev)
11478 + if (bh->b_rdev != conf->disks[i].dev)
11480 set_bit(STRIPE_ERROR, &sh->state);
11483 - restore_flags(flags);
11484 + md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
11487 -static int raid5_map (struct md_dev *mddev, kdev_t *rdev,
11488 +static int raid5_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
11489 unsigned long *rsector, unsigned long size)
11491 /* No complex mapping used: the core of the work is done in the
11492 @@ -577,11 +582,10 @@
11494 static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i)
11496 - struct raid5_data *raid_conf = sh->raid_conf;
11497 - struct md_dev *mddev = raid_conf->mddev;
11498 - int minor = (int) (mddev - md_dev);
11499 + raid5_conf_t *conf = sh->raid_conf;
11500 + mddev_t *mddev = conf->mddev;
11502 - kdev_t dev = MKDEV(MD_MAJOR, minor);
11503 + kdev_t dev = mddev_to_kdev(mddev);
11504 int block = sh->sector / (sh->size >> 9);
11506 b_data = ((volatile struct buffer_head *) bh)->b_data;
11507 @@ -589,7 +593,7 @@
11508 init_buffer(bh, dev, block, raid5_end_request, sh);
11509 ((volatile struct buffer_head *) bh)->b_data = b_data;
11511 - bh->b_rdev = raid_conf->disks[i].dev;
11512 + bh->b_rdev = conf->disks[i].dev;
11513 bh->b_rsector = sh->sector;
11515 bh->b_state = (1 << BH_Req);
11516 @@ -597,33 +601,62 @@
11517 bh->b_list = BUF_LOCKED;
11520 -static int raid5_error (struct md_dev *mddev, kdev_t dev)
11521 +static int raid5_error (mddev_t *mddev, kdev_t dev)
11523 - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
11524 - md_superblock_t *sb = mddev->sb;
11525 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
11526 + mdp_super_t *sb = mddev->sb;
11527 struct disk_info *disk;
11530 PRINTK(("raid5_error called\n"));
11531 - raid_conf->resync_parity = 0;
11532 - for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
11533 + conf->resync_parity = 0;
11534 + for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
11535 if (disk->dev == dev && disk->operational) {
11536 disk->operational = 0;
11537 - sb->disks[disk->number].state |= (1 << MD_FAULTY_DEVICE);
11538 - sb->disks[disk->number].state &= ~(1 << MD_SYNC_DEVICE);
11539 - sb->disks[disk->number].state &= ~(1 << MD_ACTIVE_DEVICE);
11540 + mark_disk_faulty(sb->disks+disk->number);
11541 + mark_disk_nonsync(sb->disks+disk->number);
11542 + mark_disk_inactive(sb->disks+disk->number);
11543 sb->active_disks--;
11544 sb->working_disks--;
11545 sb->failed_disks++;
11546 mddev->sb_dirty = 1;
11547 - raid_conf->working_disks--;
11548 - raid_conf->failed_disks++;
11549 - md_wakeup_thread(raid_conf->thread);
11550 + conf->working_disks--;
11551 + conf->failed_disks++;
11552 + md_wakeup_thread(conf->thread);
11554 - "RAID5: Disk failure on %s, disabling device."
11555 - "Operation continuing on %d devices\n",
11556 - kdevname (dev), raid_conf->working_disks);
11557 + "raid5: Disk failure on %s, disabling device."
11558 + " Operation continuing on %d devices\n",
11559 + partition_name (dev), conf->working_disks);
11564 + * handle errors in spares (during reconstruction)
11566 + if (conf->spare) {
11567 + disk = conf->spare;
11568 + if (disk->dev == dev) {
11569 + printk (KERN_ALERT
11570 + "raid5: Disk failure on spare %s\n",
11571 + partition_name (dev));
11572 + if (!conf->spare->operational) {
11576 + disk->operational = 0;
11577 + disk->write_only = 0;
11578 + conf->spare = NULL;
11579 + mark_disk_faulty(sb->disks+disk->number);
11580 + mark_disk_nonsync(sb->disks+disk->number);
11581 + mark_disk_inactive(sb->disks+disk->number);
11582 + sb->spare_disks--;
11583 + sb->working_disks--;
11584 + sb->failed_disks++;
11593 @@ -634,12 +667,12 @@
11594 static inline unsigned long
11595 raid5_compute_sector (int r_sector, unsigned int raid_disks, unsigned int data_disks,
11596 unsigned int * dd_idx, unsigned int * pd_idx,
11597 - struct raid5_data *raid_conf)
11598 + raid5_conf_t *conf)
11600 unsigned int stripe;
11601 int chunk_number, chunk_offset;
11602 unsigned long new_sector;
11603 - int sectors_per_chunk = raid_conf->chunk_size >> 9;
11604 + int sectors_per_chunk = conf->chunk_size >> 9;
11606 /* First compute the information on this sector */
11608 @@ -662,9 +695,9 @@
11610 * Select the parity disk based on the user selected algorithm.
11612 - if (raid_conf->level == 4)
11613 + if (conf->level == 4)
11614 *pd_idx = data_disks;
11615 - else switch (raid_conf->algorithm) {
11616 + else switch (conf->algorithm) {
11617 case ALGORITHM_LEFT_ASYMMETRIC:
11618 *pd_idx = data_disks - stripe % raid_disks;
11619 if (*dd_idx >= *pd_idx)
11620 @@ -684,7 +717,7 @@
11621 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
11624 - printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
11625 + printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
11629 @@ -705,16 +738,16 @@
11631 static unsigned long compute_blocknr(struct stripe_head *sh, int i)
11633 - struct raid5_data *raid_conf = sh->raid_conf;
11634 - int raid_disks = raid_conf->raid_disks, data_disks = raid_disks - 1;
11635 + raid5_conf_t *conf = sh->raid_conf;
11636 + int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
11637 unsigned long new_sector = sh->sector, check;
11638 - int sectors_per_chunk = raid_conf->chunk_size >> 9;
11639 + int sectors_per_chunk = conf->chunk_size >> 9;
11640 unsigned long stripe = new_sector / sectors_per_chunk;
11641 int chunk_offset = new_sector % sectors_per_chunk;
11642 int chunk_number, dummy1, dummy2, dd_idx = i;
11643 unsigned long r_sector, blocknr;
11645 - switch (raid_conf->algorithm) {
11646 + switch (conf->algorithm) {
11647 case ALGORITHM_LEFT_ASYMMETRIC:
11648 case ALGORITHM_RIGHT_ASYMMETRIC:
11649 if (i > sh->pd_idx)
11650 @@ -727,14 +760,14 @@
11651 i -= (sh->pd_idx + 1);
11654 - printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
11655 + printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
11658 chunk_number = stripe * data_disks + i;
11659 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
11660 blocknr = r_sector / (sh->size >> 9);
11662 - check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, raid_conf);
11663 + check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
11664 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
11665 printk("compute_blocknr: map not correct\n");
11667 @@ -742,36 +775,11 @@
11671 -#ifdef HAVE_ARCH_XORBLOCK
11672 -static void xor_block(struct buffer_head *dest, struct buffer_head *source)
11674 - __xor_block((char *) dest->b_data, (char *) source->b_data, dest->b_size);
11677 -static void xor_block(struct buffer_head *dest, struct buffer_head *source)
11679 - long lines = dest->b_size / (sizeof (long)) / 8, i;
11680 - long *destp = (long *) dest->b_data, *sourcep = (long *) source->b_data;
11682 - for (i = lines; i > 0; i--) {
11683 - *(destp + 0) ^= *(sourcep + 0);
11684 - *(destp + 1) ^= *(sourcep + 1);
11685 - *(destp + 2) ^= *(sourcep + 2);
11686 - *(destp + 3) ^= *(sourcep + 3);
11687 - *(destp + 4) ^= *(sourcep + 4);
11688 - *(destp + 5) ^= *(sourcep + 5);
11689 - *(destp + 6) ^= *(sourcep + 6);
11690 - *(destp + 7) ^= *(sourcep + 7);
11697 static void compute_block(struct stripe_head *sh, int dd_idx)
11699 - struct raid5_data *raid_conf = sh->raid_conf;
11700 - int i, disks = raid_conf->raid_disks;
11701 + raid5_conf_t *conf = sh->raid_conf;
11702 + int i, count, disks = conf->raid_disks;
11703 + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
11705 PRINTK(("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx));
11707 @@ -780,69 +788,100 @@
11708 raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx);
11710 memset(sh->bh_old[dd_idx]->b_data, 0, sh->size);
11711 + bh_ptr[0] = sh->bh_old[dd_idx];
11713 for (i = 0; i < disks; i++) {
11716 if (sh->bh_old[i]) {
11717 - xor_block(sh->bh_old[dd_idx], sh->bh_old[i]);
11720 + bh_ptr[count++] = sh->bh_old[i];
11722 printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
11724 + if (count == MAX_XOR_BLOCKS) {
11725 + xor_block(count, &bh_ptr[0]);
11730 + xor_block(count, &bh_ptr[0]);
11732 raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1);
11735 static void compute_parity(struct stripe_head *sh, int method)
11737 - struct raid5_data *raid_conf = sh->raid_conf;
11738 - int i, pd_idx = sh->pd_idx, disks = raid_conf->raid_disks;
11739 + raid5_conf_t *conf = sh->raid_conf;
11740 + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, lowprio, count;
11741 + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
11743 PRINTK(("compute_parity, stripe %lu, method %d\n", sh->sector, method));
11745 for (i = 0; i < disks; i++) {
11746 if (i == pd_idx || !sh->bh_new[i])
11748 if (!sh->bh_copy[i])
11749 sh->bh_copy[i] = raid5_kmalloc_buffer(sh, sh->size);
11750 raid5_build_block(sh, sh->bh_copy[i], i);
11751 + if (!buffer_lowprio(sh->bh_new[i]))
11754 + mark_buffer_lowprio(sh->bh_copy[i]);
11755 mark_buffer_clean(sh->bh_new[i]);
11756 memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size);
11758 if (sh->bh_copy[pd_idx] == NULL)
11759 sh->bh_copy[pd_idx] = raid5_kmalloc_buffer(sh, sh->size);
11760 raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx);
11762 + mark_buffer_lowprio(sh->bh_copy[pd_idx]);
11764 if (method == RECONSTRUCT_WRITE) {
11765 memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size);
11766 + bh_ptr[0] = sh->bh_copy[pd_idx];
11768 for (i = 0; i < disks; i++) {
11769 if (i == sh->pd_idx)
11771 if (sh->bh_new[i]) {
11772 - xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
11774 + bh_ptr[count++] = sh->bh_copy[i];
11775 + } else if (sh->bh_old[i]) {
11776 + bh_ptr[count++] = sh->bh_old[i];
11778 - if (sh->bh_old[i]) {
11779 - xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
11781 + if (count == MAX_XOR_BLOCKS) {
11782 + xor_block(count, &bh_ptr[0]);
11786 + if (count != 1) {
11787 + xor_block(count, &bh_ptr[0]);
11789 } else if (method == READ_MODIFY_WRITE) {
11790 memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size);
11791 + bh_ptr[0] = sh->bh_copy[pd_idx];
11793 for (i = 0; i < disks; i++) {
11794 if (i == sh->pd_idx)
11796 if (sh->bh_new[i] && sh->bh_old[i]) {
11797 - xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
11798 - xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
11800 + bh_ptr[count++] = sh->bh_copy[i];
11801 + bh_ptr[count++] = sh->bh_old[i];
11803 + if (count >= (MAX_XOR_BLOCKS - 1)) {
11804 + xor_block(count, &bh_ptr[0]);
11808 + if (count != 1) {
11809 + xor_block(count, &bh_ptr[0]);
11812 raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1);
11815 static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
11817 - struct raid5_data *raid_conf = sh->raid_conf;
11818 + raid5_conf_t *conf = sh->raid_conf;
11819 struct buffer_head *bh_req;
11821 if (sh->bh_new[dd_idx]) {
11822 @@ -860,19 +899,22 @@
11823 if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) {
11824 sh->phase = PHASE_BEGIN;
11825 sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE;
11826 - raid_conf->nr_pending_stripes++;
11827 - atomic_inc(&raid_conf->nr_handle);
11828 + conf->nr_pending_stripes++;
11829 + atomic_inc(&conf->nr_handle);
11831 sh->bh_new[dd_idx] = bh;
11832 sh->bh_req[dd_idx] = bh_req;
11833 sh->cmd_new[dd_idx] = rw;
11834 sh->new[dd_idx] = 1;
11836 + if (buffer_lowprio(bh))
11837 + mark_buffer_lowprio(bh_req);
11840 static void complete_stripe(struct stripe_head *sh)
11842 - struct raid5_data *raid_conf = sh->raid_conf;
11843 - int disks = raid_conf->raid_disks;
11844 + raid5_conf_t *conf = sh->raid_conf;
11845 + int disks = conf->raid_disks;
11848 PRINTK(("complete_stripe %lu\n", sh->sector));
11849 @@ -909,6 +951,22 @@
11854 +static int is_stripe_lowprio(struct stripe_head *sh, int disks)
11856 + int i, lowprio = 1;
11858 + for (i = 0; i < disks; i++) {
11859 + if (sh->bh_new[i])
11860 + if (!buffer_lowprio(sh->bh_new[i]))
11862 + if (sh->bh_old[i])
11863 + if (!buffer_lowprio(sh->bh_old[i]))
11870 * handle_stripe() is our main logic routine. Note that:
11872 @@ -919,28 +977,27 @@
11873 * 2. We should be careful to set sh->nr_pending whenever we sleep,
11874 * to prevent re-entry of handle_stripe() for the same sh.
11876 - * 3. raid_conf->failed_disks and disk->operational can be changed
11877 + * 3. conf->failed_disks and disk->operational can be changed
11878 * from an interrupt. This complicates things a bit, but it allows
11879 * us to stop issuing requests for a failed drive as soon as possible.
11881 static void handle_stripe(struct stripe_head *sh)
11883 - struct raid5_data *raid_conf = sh->raid_conf;
11884 - struct md_dev *mddev = raid_conf->mddev;
11885 - int minor = (int) (mddev - md_dev);
11886 + raid5_conf_t *conf = sh->raid_conf;
11887 + mddev_t *mddev = conf->mddev;
11888 struct buffer_head *bh;
11889 - int disks = raid_conf->raid_disks;
11890 - int i, nr = 0, nr_read = 0, nr_write = 0;
11891 + int disks = conf->raid_disks;
11892 + int i, nr = 0, nr_read = 0, nr_write = 0, lowprio;
11893 int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0, parity = 0;
11894 int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
11895 int reading = 0, nr_writing = 0;
11896 int method1 = INT_MAX, method2 = INT_MAX;
11898 unsigned long flags;
11899 - int operational[MD_SB_DISKS], failed_disks = raid_conf->failed_disks;
11900 + int operational[MD_SB_DISKS], failed_disks = conf->failed_disks;
11902 PRINTK(("handle_stripe(), stripe %lu\n", sh->sector));
11903 - if (sh->nr_pending) {
11904 + if (md_atomic_read(&sh->nr_pending)) {
11905 printk("handle_stripe(), stripe %lu, io still pending\n", sh->sector);
11908 @@ -949,9 +1006,9 @@
11912 - atomic_dec(&raid_conf->nr_handle);
11913 + atomic_dec(&conf->nr_handle);
11915 - if (test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
11916 + if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
11917 printk("raid5: restarting stripe %lu\n", sh->sector);
11918 sh->phase = PHASE_BEGIN;
11920 @@ -969,11 +1026,11 @@
11923 for (i = 0; i < disks; i++) {
11924 - operational[i] = raid_conf->disks[i].operational;
11925 - if (i == sh->pd_idx && raid_conf->resync_parity)
11926 + operational[i] = conf->disks[i].operational;
11927 + if (i == sh->pd_idx && conf->resync_parity)
11928 operational[i] = 0;
11930 - failed_disks = raid_conf->failed_disks;
11931 + failed_disks = conf->failed_disks;
11932 restore_flags(flags);
11934 if (failed_disks > 1) {
11935 @@ -1017,7 +1074,7 @@
11938 if (nr_write && nr_read)
11939 - printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
11940 + printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);
11944 @@ -1030,7 +1087,7 @@
11947 block = (int) compute_blocknr(sh, i);
11948 - bh = find_buffer(MKDEV(MD_MAJOR, minor), block, sh->size);
11949 + bh = find_buffer(mddev_to_kdev(mddev), block, sh->size);
11950 if (bh && bh->b_count == 0 && buffer_dirty(bh) && !buffer_locked(bh)) {
11951 PRINTK(("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block));
11952 add_stripe_bh(sh, bh, i, WRITE);
11953 @@ -1064,21 +1121,22 @@
11955 if (!method1 || !method2) {
11957 - sh->nr_pending++;
11958 + lowprio = is_stripe_lowprio(sh, disks);
11959 + atomic_inc(&sh->nr_pending);
11960 sh->phase = PHASE_WRITE;
11961 compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
11962 for (i = 0; i < disks; i++) {
11963 - if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
11964 + if (!operational[i] && !conf->spare && !conf->resync_parity)
11966 if (i == sh->pd_idx || sh->bh_new[i])
11970 - sh->nr_pending = nr_writing;
11971 - PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, sh->nr_pending));
11972 + md_atomic_set(&sh->nr_pending, nr_writing);
11973 + PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
11975 for (i = 0; i < disks; i++) {
11976 - if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
11977 + if (!operational[i] && !conf->spare && !conf->resync_parity)
11979 bh = sh->bh_copy[i];
11980 if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL)))
11981 @@ -1089,18 +1147,30 @@
11982 bh->b_state |= (1<<BH_Dirty);
11983 PRINTK(("making request for buffer %d\n", i));
11984 clear_bit(BH_Lock, &bh->b_state);
11985 - if (!operational[i] && !raid_conf->resync_parity) {
11986 - bh->b_rdev = raid_conf->spare->dev;
11987 - make_request(MAJOR(raid_conf->spare->dev), WRITE, bh);
11989 - make_request(MAJOR(raid_conf->disks[i].dev), WRITE, bh);
11990 + if (!operational[i] && !conf->resync_parity) {
11991 + bh->b_rdev = conf->spare->dev;
11992 + make_request(MAJOR(conf->spare->dev), WRITE, bh);
11995 + make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
11997 + if (!lowprio || (i==sh->pd_idx))
11998 + make_request(MAJOR(conf->disks[i].dev), WRITE, bh);
12000 + mark_buffer_clean(bh);
12001 + raid5_end_request(bh,1);
12012 - sh->nr_pending++;
12013 + lowprio = is_stripe_lowprio(sh, disks);
12014 + atomic_inc(&sh->nr_pending);
12015 if (method1 < method2) {
12016 sh->write_method = RECONSTRUCT_WRITE;
12017 for (i = 0; i < disks; i++) {
12018 @@ -1110,6 +1180,8 @@
12020 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
12021 raid5_build_block(sh, sh->bh_old[i], i);
12023 + mark_buffer_lowprio(sh->bh_old[i]);
12027 @@ -1121,19 +1193,21 @@
12029 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
12030 raid5_build_block(sh, sh->bh_old[i], i);
12032 + mark_buffer_lowprio(sh->bh_old[i]);
12036 sh->phase = PHASE_READ_OLD;
12037 - sh->nr_pending = reading;
12038 - PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, sh->nr_pending));
12039 + md_atomic_set(&sh->nr_pending, reading);
12040 + PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)));
12041 for (i = 0; i < disks; i++) {
12042 if (!sh->bh_old[i])
12044 if (buffer_uptodate(sh->bh_old[i]))
12046 clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
12047 - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
12048 + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
12052 @@ -1141,7 +1215,8 @@
12054 method1 = nr_read - nr_cache_overwrite;
12056 - sh->nr_pending++;
12057 + lowprio = is_stripe_lowprio(sh,disks);
12058 + atomic_inc(&sh->nr_pending);
12060 PRINTK(("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1));
12061 if (!method1 || (method1 == 1 && nr_cache == disks - 1)) {
12062 @@ -1149,18 +1224,22 @@
12063 for (i = 0; i < disks; i++) {
12064 if (!sh->bh_new[i])
12066 - if (!sh->bh_old[i])
12067 + if (!sh->bh_old[i]) {
12068 compute_block(sh, i);
12070 + mark_buffer_lowprio
12073 memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
12075 - sh->nr_pending--;
12076 + atomic_dec(&sh->nr_pending);
12077 complete_stripe(sh);
12080 if (nr_failed_overwrite) {
12081 sh->phase = PHASE_READ_OLD;
12082 - sh->nr_pending = (disks - 1) - nr_cache;
12083 - PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, sh->nr_pending));
12084 + md_atomic_set(&sh->nr_pending, (disks - 1) - nr_cache);
12085 + PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
12086 for (i = 0; i < disks; i++) {
12089 @@ -1168,13 +1247,16 @@
12091 sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
12092 raid5_build_block(sh, sh->bh_old[i], i);
12094 + mark_buffer_lowprio(sh->bh_old[i]);
12095 clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
12096 - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
12097 + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_old[i]);
12100 sh->phase = PHASE_READ;
12101 - sh->nr_pending = nr_read - nr_cache_overwrite;
12102 - PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, sh->nr_pending));
12103 + md_atomic_set(&sh->nr_pending,
12104 + nr_read - nr_cache_overwrite);
12105 + PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)));
12106 for (i = 0; i < disks; i++) {
12107 if (!sh->bh_new[i])
12109 @@ -1182,16 +1264,16 @@
12110 memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
12113 - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_req[i]);
12114 + make_request(MAJOR(conf->disks[i].dev), READ, sh->bh_req[i]);
12120 -static int raid5_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
12121 +static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
12123 - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
12124 - const unsigned int raid_disks = raid_conf->raid_disks;
12125 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
12126 + const unsigned int raid_disks = conf->raid_disks;
12127 const unsigned int data_disks = raid_disks - 1;
12128 unsigned int dd_idx, pd_idx;
12129 unsigned long new_sector;
12130 @@ -1202,15 +1284,15 @@
12131 if (rw == WRITEA) rw = WRITE;
12133 new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks,
12134 - &dd_idx, &pd_idx, raid_conf);
12135 + &dd_idx, &pd_idx, conf);
12137 PRINTK(("raid5_make_request, sector %lu\n", new_sector));
12139 - sh = get_stripe(raid_conf, new_sector, bh->b_size);
12140 + sh = get_stripe(conf, new_sector, bh->b_size);
12141 if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) {
12142 PRINTK(("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd));
12144 - if (!sh->nr_pending)
12145 + if (!md_atomic_read(&sh->nr_pending))
12149 @@ -1221,24 +1303,24 @@
12150 printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector);
12151 printk("raid5: bh %p, bh_new %p\n", bh, sh->bh_new[dd_idx]);
12153 - md_wakeup_thread(raid_conf->thread);
12154 + md_wakeup_thread(conf->thread);
12155 wait_on_stripe(sh);
12158 add_stripe_bh(sh, bh, dd_idx, rw);
12160 - md_wakeup_thread(raid_conf->thread);
12161 + md_wakeup_thread(conf->thread);
12165 static void unplug_devices(struct stripe_head *sh)
12168 - struct raid5_data *raid_conf = sh->raid_conf;
12169 + raid5_conf_t *conf = sh->raid_conf;
12172 - for (i = 0; i < raid_conf->raid_disks; i++)
12173 - unplug_device(blk_dev + MAJOR(raid_conf->disks[i].dev));
12174 + for (i = 0; i < conf->raid_disks; i++)
12175 + unplug_device(blk_dev + MAJOR(conf->disks[i].dev));
12179 @@ -1252,8 +1334,8 @@
12180 static void raid5d (void *data)
12182 struct stripe_head *sh;
12183 - struct raid5_data *raid_conf = data;
12184 - struct md_dev *mddev = raid_conf->mddev;
12185 + raid5_conf_t *conf = data;
12186 + mddev_t *mddev = conf->mddev;
12187 int i, handled = 0, unplug = 0;
12188 unsigned long flags;
12190 @@ -1261,47 +1343,47 @@
12192 if (mddev->sb_dirty) {
12193 mddev->sb_dirty = 0;
12194 - md_update_sb((int) (mddev - md_dev));
12195 + md_update_sb(mddev);
12197 for (i = 0; i < NR_HASH; i++) {
12199 - sh = raid_conf->stripe_hashtbl[i];
12200 + sh = conf->stripe_hashtbl[i];
12201 for (; sh; sh = sh->hash_next) {
12202 - if (sh->raid_conf != raid_conf)
12203 + if (sh->raid_conf != conf)
12205 if (sh->phase == PHASE_COMPLETE)
12207 - if (sh->nr_pending)
12208 + if (md_atomic_read(&sh->nr_pending))
12210 - if (sh->sector == raid_conf->next_sector) {
12211 - raid_conf->sector_count += (sh->size >> 9);
12212 - if (raid_conf->sector_count >= 128)
12213 + if (sh->sector == conf->next_sector) {
12214 + conf->sector_count += (sh->size >> 9);
12215 + if (conf->sector_count >= 128)
12220 - PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, raid_conf->sector_count));
12221 + PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, conf->sector_count));
12222 unplug_devices(sh);
12224 - raid_conf->sector_count = 0;
12225 + conf->sector_count = 0;
12227 - raid_conf->next_sector = sh->sector + (sh->size >> 9);
12228 + conf->next_sector = sh->sector + (sh->size >> 9);
12235 - PRINTK(("%d stripes handled, nr_handle %d\n", handled, atomic_read(&raid_conf->nr_handle)));
12237 + PRINTK(("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle)));
12240 - if (!atomic_read(&raid_conf->nr_handle))
12241 - clear_bit(THREAD_WAKEUP, &raid_conf->thread->flags);
12242 + if (!md_atomic_read(&conf->nr_handle))
12243 + clear_bit(THREAD_WAKEUP, &conf->thread->flags);
12244 + restore_flags(flags);
12246 PRINTK(("--- raid5d inactive\n"));
12249 -#if SUPPORT_RECONSTRUCTION
12251 * Private kernel thread for parity reconstruction after an unclean
12252 * shutdown. Reconstruction on spare drives in case of a failed drive
12253 @@ -1309,44 +1391,64 @@
12255 static void raid5syncd (void *data)
12257 - struct raid5_data *raid_conf = data;
12258 - struct md_dev *mddev = raid_conf->mddev;
12259 + raid5_conf_t *conf = data;
12260 + mddev_t *mddev = conf->mddev;
12262 - if (!raid_conf->resync_parity)
12263 + if (!conf->resync_parity)
12265 + if (conf->resync_parity == 2)
12267 + down(&mddev->recovery_sem);
12268 + if (md_do_sync(mddev,NULL)) {
12269 + up(&mddev->recovery_sem);
12270 + printk("raid5: resync aborted!\n");
12272 - md_do_sync(mddev);
12273 - raid_conf->resync_parity = 0;
12275 + conf->resync_parity = 0;
12276 + up(&mddev->recovery_sem);
12277 + printk("raid5: resync finished.\n");
12279 -#endif /* SUPPORT_RECONSTRUCTION */
12281 -static int __check_consistency (struct md_dev *mddev, int row)
12282 +static int __check_consistency (mddev_t *mddev, int row)
12284 - struct raid5_data *raid_conf = mddev->private;
12285 + raid5_conf_t *conf = mddev->private;
12287 struct buffer_head *bh[MD_SB_DISKS], tmp;
12288 - int i, rc = 0, nr = 0;
12289 + int i, rc = 0, nr = 0, count;
12290 + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
12292 - if (raid_conf->working_disks != raid_conf->raid_disks)
12293 + if (conf->working_disks != conf->raid_disks)
12296 if ((tmp.b_data = (char *) get_free_page(GFP_KERNEL)) == NULL)
12298 + md_clear_page((unsigned long)tmp.b_data);
12299 memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
12300 - for (i = 0; i < raid_conf->raid_disks; i++) {
12301 - dev = raid_conf->disks[i].dev;
12302 + for (i = 0; i < conf->raid_disks; i++) {
12303 + dev = conf->disks[i].dev;
12304 set_blocksize(dev, 4096);
12305 if ((bh[i] = bread(dev, row / 4, 4096)) == NULL)
12309 - if (nr == raid_conf->raid_disks) {
12310 - for (i = 1; i < nr; i++)
12311 - xor_block(&tmp, bh[i]);
12312 + if (nr == conf->raid_disks) {
12313 + bh_ptr[0] = &tmp;
12315 + for (i = 1; i < nr; i++) {
12316 + bh_ptr[count++] = bh[i];
12317 + if (count == MAX_XOR_BLOCKS) {
12318 + xor_block(count, &bh_ptr[0]);
12322 + if (count != 1) {
12323 + xor_block(count, &bh_ptr[0]);
12325 if (memcmp(tmp.b_data, bh[0]->b_data, 4096))
12328 - for (i = 0; i < raid_conf->raid_disks; i++) {
12329 - dev = raid_conf->disks[i].dev;
12330 + for (i = 0; i < conf->raid_disks; i++) {
12331 + dev = conf->disks[i].dev;
12335 @@ -1358,285 +1460,607 @@
12339 -static int check_consistency (struct md_dev *mddev)
12340 +static int check_consistency (mddev_t *mddev)
12342 - int size = mddev->sb->size;
12344 + if (__check_consistency(mddev, 0))
12346 + * We are not checking this currently, as it's legitimate to have
12347 + * an inconsistent array, at creation time.
12351 - for (row = 0; row < size; row += size / 8)
12352 - if (__check_consistency(mddev, row))
12357 -static int raid5_run (int minor, struct md_dev *mddev)
12358 +static int raid5_run (mddev_t *mddev)
12360 - struct raid5_data *raid_conf;
12361 + raid5_conf_t *conf;
12362 int i, j, raid_disk, memory;
12363 - md_superblock_t *sb = mddev->sb;
12364 - md_descriptor_t *descriptor;
12365 - struct real_dev *realdev;
12366 + mdp_super_t *sb = mddev->sb;
12367 + mdp_disk_t *desc;
12368 + mdk_rdev_t *rdev;
12369 + struct disk_info *disk;
12370 + struct md_list_head *tmp;
12371 + int start_recovery = 0;
12375 if (sb->level != 5 && sb->level != 4) {
12376 - printk("raid5: %s: raid level not set to 4/5 (%d)\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
12377 + printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
12382 - mddev->private = kmalloc (sizeof (struct raid5_data), GFP_KERNEL);
12383 - if ((raid_conf = mddev->private) == NULL)
12384 + mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
12385 + if ((conf = mddev->private) == NULL)
12387 - memset (raid_conf, 0, sizeof (*raid_conf));
12388 - raid_conf->mddev = mddev;
12389 + memset (conf, 0, sizeof (*conf));
12390 + conf->mddev = mddev;
12392 - if ((raid_conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
12393 + if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
12395 - memset(raid_conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
12396 + memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
12398 - init_waitqueue(&raid_conf->wait_for_stripe);
12399 - PRINTK(("raid5_run(%d) called.\n", minor));
12401 - for (i = 0; i < mddev->nb_dev; i++) {
12402 - realdev = &mddev->devices[i];
12403 - if (!realdev->sb) {
12404 - printk(KERN_ERR "raid5: disabled device %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
12407 + init_waitqueue(&conf->wait_for_stripe);
12408 + PRINTK(("raid5_run(md%d) called.\n", mdidx(mddev)));
12410 + ITERATE_RDEV(mddev,rdev,tmp) {
12412 * This is important -- we are using the descriptor on
12413 * the disk only to get a pointer to the descriptor on
12414 * the main superblock, which might be more recent.
12416 - descriptor = &sb->disks[realdev->sb->descriptor.number];
12417 - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
12418 - printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", kdevname(realdev->dev));
12419 + desc = sb->disks + rdev->desc_nr;
12420 + raid_disk = desc->raid_disk;
12421 + disk = conf->disks + raid_disk;
12423 + if (disk_faulty(desc)) {
12424 + printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
12425 + if (!rdev->faulty) {
12429 + disk->number = desc->number;
12430 + disk->raid_disk = raid_disk;
12431 + disk->dev = rdev->dev;
12433 + disk->operational = 0;
12434 + disk->write_only = 0;
12436 + disk->used_slot = 1;
12439 - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
12440 - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
12441 - printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", kdevname(realdev->dev));
12443 + if (disk_active(desc)) {
12444 + if (!disk_sync(desc)) {
12445 + printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
12449 - raid_disk = descriptor->raid_disk;
12450 - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
12451 - printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", kdevname(realdev->dev));
12452 + if (raid_disk > sb->raid_disks) {
12453 + printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
12456 - if (raid_conf->disks[raid_disk].operational) {
12457 - printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", kdevname(realdev->dev), raid_disk);
12458 + if (disk->operational) {
12459 + printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
12462 - printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", kdevname(realdev->dev), raid_disk);
12463 + printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
12465 - raid_conf->disks[raid_disk].number = descriptor->number;
12466 - raid_conf->disks[raid_disk].raid_disk = raid_disk;
12467 - raid_conf->disks[raid_disk].dev = mddev->devices[i].dev;
12468 - raid_conf->disks[raid_disk].operational = 1;
12469 + disk->number = desc->number;
12470 + disk->raid_disk = raid_disk;
12471 + disk->dev = rdev->dev;
12472 + disk->operational = 1;
12473 + disk->used_slot = 1;
12475 - raid_conf->working_disks++;
12476 + conf->working_disks++;
12479 * Must be a spare disk ..
12481 - printk(KERN_INFO "raid5: spare disk %s\n", kdevname(realdev->dev));
12482 - raid_disk = descriptor->raid_disk;
12483 - raid_conf->disks[raid_disk].number = descriptor->number;
12484 - raid_conf->disks[raid_disk].raid_disk = raid_disk;
12485 - raid_conf->disks[raid_disk].dev = mddev->devices [i].dev;
12487 - raid_conf->disks[raid_disk].operational = 0;
12488 - raid_conf->disks[raid_disk].write_only = 0;
12489 - raid_conf->disks[raid_disk].spare = 1;
12492 - raid_conf->raid_disks = sb->raid_disks;
12493 - raid_conf->failed_disks = raid_conf->raid_disks - raid_conf->working_disks;
12494 - raid_conf->mddev = mddev;
12495 - raid_conf->chunk_size = sb->chunk_size;
12496 - raid_conf->level = sb->level;
12497 - raid_conf->algorithm = sb->parity_algorithm;
12498 - raid_conf->max_nr_stripes = NR_STRIPES;
12499 + printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
12500 + disk->number = desc->number;
12501 + disk->raid_disk = raid_disk;
12502 + disk->dev = rdev->dev;
12504 - if (raid_conf->working_disks != sb->raid_disks && sb->state != (1 << MD_SB_CLEAN)) {
12505 - printk(KERN_ALERT "raid5: raid set %s not clean and not all disks are operational -- run ckraid\n", kdevname(MKDEV(MD_MAJOR, minor)));
12507 + disk->operational = 0;
12508 + disk->write_only = 0;
12510 + disk->used_slot = 1;
12513 - if (!raid_conf->chunk_size || raid_conf->chunk_size % 4) {
12514 - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", raid_conf->chunk_size, kdevname(MKDEV(MD_MAJOR, minor)));
12516 + for (i = 0; i < MD_SB_DISKS; i++) {
12517 + desc = sb->disks + i;
12518 + raid_disk = desc->raid_disk;
12519 + disk = conf->disks + raid_disk;
12521 + if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
12522 + !conf->disks[raid_disk].used_slot) {
12524 + disk->number = desc->number;
12525 + disk->raid_disk = raid_disk;
12526 + disk->dev = MKDEV(0,0);
12528 + disk->operational = 0;
12529 + disk->write_only = 0;
12531 + disk->used_slot = 1;
12535 + conf->raid_disks = sb->raid_disks;
12537 + * 0 for a fully functional array, 1 for a degraded array.
12539 + conf->failed_disks = conf->raid_disks - conf->working_disks;
12540 + conf->mddev = mddev;
12541 + conf->chunk_size = sb->chunk_size;
12542 + conf->level = sb->level;
12543 + conf->algorithm = sb->layout;
12544 + conf->max_nr_stripes = NR_STRIPES;
12547 + for (i = 0; i < conf->raid_disks; i++) {
12548 + if (!conf->disks[i].used_slot) {
12554 + if (!conf->chunk_size || conf->chunk_size % 4) {
12555 + printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
12558 - if (raid_conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
12559 - printk(KERN_ERR "raid5: unsupported parity algorithm %d for %s\n", raid_conf->algorithm, kdevname(MKDEV(MD_MAJOR, minor)));
12560 + if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
12561 + printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
12564 - if (raid_conf->failed_disks > 1) {
12565 - printk(KERN_ERR "raid5: not enough operational devices for %s (%d/%d failed)\n", kdevname(MKDEV(MD_MAJOR, minor)), raid_conf->failed_disks, raid_conf->raid_disks);
12566 + if (conf->failed_disks > 1) {
12567 + printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
12571 - if ((sb->state & (1 << MD_SB_CLEAN)) && check_consistency(mddev)) {
12572 - printk(KERN_ERR "raid5: detected raid-5 xor inconsistenty -- run ckraid\n");
12573 - sb->state |= 1 << MD_SB_ERRORS;
12575 + if (conf->working_disks != sb->raid_disks) {
12576 + printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
12577 + start_recovery = 1;
12580 - if ((raid_conf->thread = md_register_thread(raid5d, raid_conf)) == NULL) {
12581 - printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
12583 + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) &&
12584 + check_consistency(mddev)) {
12585 + printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n");
12586 + sb->state &= ~(1 << MD_SB_CLEAN);
12589 -#if SUPPORT_RECONSTRUCTION
12590 - if ((raid_conf->resync_thread = md_register_thread(raid5syncd, raid_conf)) == NULL) {
12591 - printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
12594 + const char * name = "raid5d";
12596 + conf->thread = md_register_thread(raid5d, conf, name);
12597 + if (!conf->thread) {
12598 + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
12602 -#endif /* SUPPORT_RECONSTRUCTION */
12604 - memory = raid_conf->max_nr_stripes * (sizeof(struct stripe_head) +
12605 - raid_conf->raid_disks * (sizeof(struct buffer_head) +
12606 + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
12607 + conf->raid_disks * (sizeof(struct buffer_head) +
12608 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
12609 - if (grow_stripes(raid_conf, raid_conf->max_nr_stripes, GFP_KERNEL)) {
12610 + if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
12611 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
12612 - shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
12613 + shrink_stripes(conf, conf->max_nr_stripes);
12616 - printk(KERN_INFO "raid5: allocated %dkB for %s\n", memory, kdevname(MKDEV(MD_MAJOR, minor)));
12617 + printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
12620 * Regenerate the "device is in sync with the raid set" bit for
12623 - for (i = 0; i < sb->nr_disks ; i++) {
12624 - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
12625 + for (i = 0; i < MD_SB_DISKS ; i++) {
12626 + mark_disk_nonsync(sb->disks + i);
12627 for (j = 0; j < sb->raid_disks; j++) {
12628 - if (!raid_conf->disks[j].operational)
12629 + if (!conf->disks[j].operational)
12631 - if (sb->disks[i].number == raid_conf->disks[j].number)
12632 - sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
12633 + if (sb->disks[i].number == conf->disks[j].number)
12634 + mark_disk_sync(sb->disks + i);
12637 - sb->active_disks = raid_conf->working_disks;
12638 + sb->active_disks = conf->working_disks;
12640 if (sb->active_disks == sb->raid_disks)
12641 - printk("raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
12642 + printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
12644 - printk(KERN_ALERT "raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
12645 + printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
12647 + if (!start_recovery && ((sb->state & (1 << MD_SB_CLEAN))==0)) {
12648 + const char * name = "raid5syncd";
12650 + conf->resync_thread = md_register_thread(raid5syncd, conf,name);
12651 + if (!conf->resync_thread) {
12652 + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
12656 - if ((sb->state & (1 << MD_SB_CLEAN)) == 0) {
12657 - printk("raid5: raid set %s not clean; re-constructing parity\n", kdevname(MKDEV(MD_MAJOR, minor)));
12658 - raid_conf->resync_parity = 1;
12659 -#if SUPPORT_RECONSTRUCTION
12660 - md_wakeup_thread(raid_conf->resync_thread);
12661 -#endif /* SUPPORT_RECONSTRUCTION */
12662 + printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
12663 + conf->resync_parity = 1;
12664 + md_wakeup_thread(conf->resync_thread);
12667 + print_raid5_conf(conf);
12668 + if (start_recovery)
12669 + md_recover_arrays();
12670 + print_raid5_conf(conf);
12672 /* Ok, everything is just fine now */
12676 - if (raid_conf->stripe_hashtbl)
12677 - free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
12678 - kfree(raid_conf);
12680 + print_raid5_conf(conf);
12681 + if (conf->stripe_hashtbl)
12682 + free_pages((unsigned long) conf->stripe_hashtbl,
12683 + HASH_PAGES_ORDER);
12686 mddev->private = NULL;
12687 - printk(KERN_ALERT "raid5: failed to run raid set %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
12688 + printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
12693 -static int raid5_stop (int minor, struct md_dev *mddev)
12694 +static int raid5_stop_resync (mddev_t *mddev)
12696 + raid5_conf_t *conf = mddev_to_conf(mddev);
12697 + mdk_thread_t *thread = conf->resync_thread;
12700 + if (conf->resync_parity) {
12701 + conf->resync_parity = 2;
12702 + md_interrupt_thread(thread);
12703 + printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
12711 +static int raid5_restart_resync (mddev_t *mddev)
12713 - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
12714 + raid5_conf_t *conf = mddev_to_conf(mddev);
12716 - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
12717 - shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
12718 - md_unregister_thread(raid_conf->thread);
12719 -#if SUPPORT_RECONSTRUCTION
12720 - md_unregister_thread(raid_conf->resync_thread);
12721 -#endif /* SUPPORT_RECONSTRUCTION */
12722 - free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
12723 - kfree(raid_conf);
12724 + if (conf->resync_parity) {
12725 + if (!conf->resync_thread) {
12729 + printk("raid5: waking up raid5resync.\n");
12730 + conf->resync_parity = 1;
12731 + md_wakeup_thread(conf->resync_thread);
12734 + printk("raid5: no restart-resync needed.\n");
12739 +static int raid5_stop (mddev_t *mddev)
12741 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
12743 + shrink_stripe_cache(conf, conf->max_nr_stripes);
12744 + shrink_stripes(conf, conf->max_nr_stripes);
12745 + md_unregister_thread(conf->thread);
12746 + if (conf->resync_thread)
12747 + md_unregister_thread(conf->resync_thread);
12748 + free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
12750 mddev->private = NULL;
12755 -static int raid5_status (char *page, int minor, struct md_dev *mddev)
12756 +static int raid5_status (char *page, mddev_t *mddev)
12758 - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
12759 - md_superblock_t *sb = mddev->sb;
12760 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
12761 + mdp_super_t *sb = mddev->sb;
12764 - sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->parity_algorithm);
12765 - sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
12766 - for (i = 0; i < raid_conf->raid_disks; i++)
12767 - sz += sprintf (page+sz, "%s", raid_conf->disks[i].operational ? "U" : "_");
12768 + sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
12769 + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
12770 + for (i = 0; i < conf->raid_disks; i++)
12771 + sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
12772 sz += sprintf (page+sz, "]");
12776 -static int raid5_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, int state)
12777 +static void print_raid5_conf (raid5_conf_t *conf)
12780 + struct disk_info *tmp;
12782 + printk("RAID5 conf printout:\n");
12784 + printk("(conf==NULL)\n");
12787 + printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
12788 + conf->working_disks, conf->failed_disks);
12790 + for (i = 0; i < MD_SB_DISKS; i++) {
12791 + tmp = conf->disks + i;
12792 + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
12793 + i, tmp->spare,tmp->operational,
12794 + tmp->number,tmp->raid_disk,tmp->used_slot,
12795 + partition_name(tmp->dev));
12799 +static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
12801 - int i = 0, failed_disk = -1;
12802 - struct raid5_data *raid_conf = mddev->private;
12803 - struct disk_info *disk = raid_conf->disks;
12805 + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
12806 + raid5_conf_t *conf = mddev->private;
12807 + struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
12808 unsigned long flags;
12809 - md_superblock_t *sb = mddev->sb;
12810 - md_descriptor_t *descriptor;
12811 + mdp_super_t *sb = mddev->sb;
12812 + mdp_disk_t *failed_desc, *spare_desc, *added_desc;
12814 - for (i = 0; i < MD_SB_DISKS; i++, disk++) {
12815 - if (disk->spare && disk->number == spare->number)
12820 - for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
12821 - if (!disk->operational)
12823 - if (failed_disk == -1)
12828 + print_raid5_conf(conf);
12830 + * find the disk ...
12833 - case SPARE_WRITE:
12834 - disk->operational = 1;
12835 - disk->write_only = 1;
12836 - raid_conf->spare = disk;
12838 - case SPARE_INACTIVE:
12839 - disk->operational = 0;
12840 - disk->write_only = 0;
12841 - raid_conf->spare = NULL;
12843 - case SPARE_ACTIVE:
12845 - disk->write_only = 0;
12847 - descriptor = &sb->disks[raid_conf->disks[failed_disk].number];
12848 - i = spare->raid_disk;
12849 - disk->raid_disk = spare->raid_disk = descriptor->raid_disk;
12850 - if (disk->raid_disk != failed_disk)
12851 - printk("raid5: disk->raid_disk != failed_disk");
12852 - descriptor->raid_disk = i;
12854 - raid_conf->spare = NULL;
12855 - raid_conf->working_disks++;
12856 - raid_conf->failed_disks--;
12857 - raid_conf->disks[failed_disk] = *disk;
12860 - printk("raid5_mark_spare: bug: state == %d\n", state);
12861 - restore_flags(flags);
12863 + case DISKOP_SPARE_ACTIVE:
12866 + * Find the failed disk within the RAID5 configuration ...
12867 + * (this can only be in the first conf->raid_disks part)
12869 + for (i = 0; i < conf->raid_disks; i++) {
12870 + tmp = conf->disks + i;
12871 + if ((!tmp->operational && !tmp->spare) ||
12872 + !tmp->used_slot) {
12878 + * When we activate a spare disk we _must_ have a disk in
12879 + * the lower (active) part of the array to replace.
12881 + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
12886 + /* fall through */
12888 + case DISKOP_SPARE_WRITE:
12889 + case DISKOP_SPARE_INACTIVE:
12892 + * Find the spare disk ... (can only be in the 'high'
12893 + * area of the array)
12895 + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
12896 + tmp = conf->disks + i;
12897 + if (tmp->spare && tmp->number == (*d)->number) {
12902 + if (spare_disk == -1) {
12909 + case DISKOP_HOT_REMOVE_DISK:
12911 + for (i = 0; i < MD_SB_DISKS; i++) {
12912 + tmp = conf->disks + i;
12913 + if (tmp->used_slot && (tmp->number == (*d)->number)) {
12914 + if (tmp->operational) {
12918 + removed_disk = i;
12922 + if (removed_disk == -1) {
12929 + case DISKOP_HOT_ADD_DISK:
12931 + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
12932 + tmp = conf->disks + i;
12933 + if (!tmp->used_slot) {
12938 + if (added_disk == -1) {
12948 + * Switch the spare disk to write-only mode:
12950 + case DISKOP_SPARE_WRITE:
12951 + if (conf->spare) {
12956 + sdisk = conf->disks + spare_disk;
12957 + sdisk->operational = 1;
12958 + sdisk->write_only = 1;
12959 + conf->spare = sdisk;
12962 + * Deactivate a spare disk:
12964 + case DISKOP_SPARE_INACTIVE:
12965 + sdisk = conf->disks + spare_disk;
12966 + sdisk->operational = 0;
12967 + sdisk->write_only = 0;
12969 + * Was the spare being resynced?
12971 + if (conf->spare == sdisk)
12972 + conf->spare = NULL;
12975 + * Activate (mark read-write) the (now sync) spare disk,
12976 + * which means we switch its 'raid position' (->raid_disk)
12977 + * with the failed disk. (only the first 'conf->raid_disks'
12978 + * slots are used for 'real' disks and we must preserve this
12981 + case DISKOP_SPARE_ACTIVE:
12982 + if (!conf->spare) {
12987 + sdisk = conf->disks + spare_disk;
12988 + fdisk = conf->disks + failed_disk;
12990 + spare_desc = &sb->disks[sdisk->number];
12991 + failed_desc = &sb->disks[fdisk->number];
12993 + if (spare_desc != *d) {
12999 + if (spare_desc->raid_disk != sdisk->raid_disk) {
13005 + if (sdisk->raid_disk != spare_disk) {
13011 + if (failed_desc->raid_disk != fdisk->raid_disk) {
13017 + if (fdisk->raid_disk != failed_disk) {
13024 + * do the switch finally
13026 + xchg_values(*spare_desc, *failed_desc);
13027 + xchg_values(*fdisk, *sdisk);
13030 + * (careful, 'failed' and 'spare' are switched from now on)
13032 + * we want to preserve linear numbering and we want to
13033 + * give the proper raid_disk number to the now activated
13034 + * disk. (this means we switch back these values)
13037 + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
13038 + xchg_values(sdisk->raid_disk, fdisk->raid_disk);
13039 + xchg_values(spare_desc->number, failed_desc->number);
13040 + xchg_values(sdisk->number, fdisk->number);
13042 + *d = failed_desc;
13044 + if (sdisk->dev == MKDEV(0,0))
13045 + sdisk->used_slot = 0;
13048 + * this really activates the spare.
13050 + fdisk->spare = 0;
13051 + fdisk->write_only = 0;
13054 + * if we activate a spare, we definitely replace a
13055 + * non-operational disk slot in the 'low' area of
13056 + * the disk array.
13058 + conf->failed_disks--;
13059 + conf->working_disks++;
13060 + conf->spare = NULL;
13064 + case DISKOP_HOT_REMOVE_DISK:
13065 + rdisk = conf->disks + removed_disk;
13067 + if (rdisk->spare && (removed_disk < conf->raid_disks)) {
13072 + rdisk->dev = MKDEV(0,0);
13073 + rdisk->used_slot = 0;
13077 + case DISKOP_HOT_ADD_DISK:
13078 + adisk = conf->disks + added_disk;
13081 + if (added_disk != added_desc->number) {
13087 + adisk->number = added_desc->number;
13088 + adisk->raid_disk = added_desc->raid_disk;
13089 + adisk->dev = MKDEV(added_desc->major,added_desc->minor);
13091 + adisk->operational = 0;
13092 + adisk->write_only = 0;
13093 + adisk->spare = 1;
13094 + adisk->used_slot = 1;
13105 restore_flags(flags);
13107 + print_raid5_conf(conf);
13111 -static struct md_personality raid5_personality=
13112 +static mdk_personality_t raid5_personality=
13116 @@ -1648,14 +2072,19 @@
13117 NULL, /* no ioctls */
13120 - /* raid5_hot_add_disk, */ NULL,
13121 - /* raid1_hot_remove_drive */ NULL,
13124 + raid5_stop_resync,
13125 + raid5_restart_resync
13128 int raid5_init (void)
13130 - return register_md_personality (RAID5, &raid5_personality);
13133 + err = register_md_personality (RAID5, &raid5_personality);
13140 --- linux/drivers/block/translucent.c.orig Fri Nov 23 11:18:20 2001
13141 +++ linux/drivers/block/translucent.c Fri Nov 23 11:18:20 2001
13144 + translucent.c : Translucent RAID driver for Linux
13145 + Copyright (C) 1998 Ingo Molnar
13147 + Translucent mode management functions.
13149 + This program is free software; you can redistribute it and/or modify
13150 + it under the terms of the GNU General Public License as published by
13151 + the Free Software Foundation; either version 2, or (at your option)
13152 + any later version.
13154 + You should have received a copy of the GNU General Public License
13155 + (for example /usr/src/linux/COPYING); if not, write to the Free
13156 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13159 +#include <linux/module.h>
13161 +#include <linux/raid/md.h>
13162 +#include <linux/malloc.h>
13164 +#include <linux/raid/translucent.h>
13166 +#define MAJOR_NR MD_MAJOR
13168 +#define MD_PERSONALITY
13170 +static int translucent_run (mddev_t *mddev)
13172 + translucent_conf_t *conf;
13173 + mdk_rdev_t *rdev;
13176 + MOD_INC_USE_COUNT;
13178 + conf = kmalloc (sizeof (*conf), GFP_KERNEL);
13181 + mddev->private = conf;
13183 + if (mddev->nb_dev != 2) {
13184 + printk("translucent: this mode needs 2 disks, aborting!\n");
13188 + if (md_check_ordering(mddev)) {
13189 + printk("translucent: disks are not ordered, aborting!\n");
13193 + ITERATE_RDEV_ORDERED(mddev,rdev,i) {
13194 + dev_info_t *disk = conf->disks + i;
13196 + disk->dev = rdev->dev;
13197 + disk->size = rdev->size;
13206 + MOD_DEC_USE_COUNT;
13210 +static int translucent_stop (mddev_t *mddev)
13212 + translucent_conf_t *conf = mddev_to_conf(mddev);
13216 + MOD_DEC_USE_COUNT;
13222 +static int translucent_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev,
13223 + unsigned long *rsector, unsigned long size)
13225 + translucent_conf_t *conf = mddev_to_conf(mddev);
13227 + *rdev = conf->disks[0].dev;
13232 +static int translucent_status (char *page, mddev_t *mddev)
13236 + sz += sprintf(page+sz, " %d%% full", 10);
13241 +static mdk_personality_t translucent_personality=
13248 + translucent_stop,
13249 + translucent_status,
13260 +md__initfunc(void translucent_init (void))
13262 + register_md_personality (TRANSLUCENT, &translucent_personality);
13267 +int init_module (void)
13269 + return (register_md_personality (TRANSLUCENT, &translucent_personality));
13272 +void cleanup_module (void)
13274 + unregister_md_personality (TRANSLUCENT);
13279 --- linux/drivers/block/xor.c.orig Fri Nov 23 11:18:20 2001
13280 +++ linux/drivers/block/xor.c Fri Nov 23 11:18:20 2001
13283 + * xor.c : Multiple Devices driver for Linux
13285 + * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek
13288 + * optimized RAID-5 checksumming functions.
13290 + * This program is free software; you can redistribute it and/or modify
13291 + * it under the terms of the GNU General Public License as published by
13292 + * the Free Software Foundation; either version 2, or (at your option)
13293 + * any later version.
13295 + * You should have received a copy of the GNU General Public License
13296 + * (for example /usr/src/linux/COPYING); if not, write to the Free
13297 + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13299 +#include <linux/module.h>
13300 +#include <linux/raid/md.h>
13301 +#ifdef __sparc_v9__
13302 +#include <asm/head.h>
13303 +#include <asm/asi.h>
13304 +#include <asm/visasm.h>
13308 + * we use the 'XOR function template' to register multiple xor
13309 + * functions runtime. The kernel measures their speed upon bootup
13310 + * and decides which one to use. (compile-time registration is
13311 + * not enough as certain CPU features like MMX can only be detected
13314 + * this architecture makes it pretty easy to add new routines
13315 + * that are faster on certain CPUs, without killing other CPU's
13316 + * 'native' routine. Although the current routines are believed
13317 + * to be the physically fastest ones on all CPUs tested, but
13318 + * feel free to prove me wrong and add yet another routine =B-)
13322 +#define MAX_XOR_BLOCKS 5
13324 +#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr)
13326 +typedef void (*xor_block_t) XOR_ARGS;
13327 +xor_block_t xor_block = NULL;
13329 +#ifndef __sparc_v9__
13331 +struct xor_block_template;
13333 +struct xor_block_template {
13335 + xor_block_t xor_block;
13337 + struct xor_block_template * next;
13340 +struct xor_block_template * xor_functions = NULL;
13342 +#define XORBLOCK_TEMPLATE(x) \
13343 +static void xor_block_##x XOR_ARGS; \
13344 +static struct xor_block_template t_xor_block_##x = \
13345 + { #x, xor_block_##x, 0, NULL }; \
13346 +static void xor_block_##x XOR_ARGS
13350 +#ifdef CONFIG_X86_XMM
13352 + * Cache avoiding checksumming functions utilizing KNI instructions
13353 + * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
13356 +XORBLOCK_TEMPLATE(pIII_kni)
13358 + char xmm_save[16*4];
13360 + int lines = (bh_ptr[0]->b_size>>8);
13362 + __asm__ __volatile__ (
13363 + "movl %%cr0,%0 ;\n\t"
13365 + "movups %%xmm0,(%1) ;\n\t"
13366 + "movups %%xmm1,0x10(%1) ;\n\t"
13367 + "movups %%xmm2,0x20(%1) ;\n\t"
13368 + "movups %%xmm3,0x30(%1) ;\n\t"
13373 +#define OFFS(x) "8*("#x"*2)"
13375 + " prefetcht0 "OFFS(x)"(%1) ;\n"
13377 + " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
13379 + " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
13381 + " prefetchnta "OFFS(x)"(%2) ;\n"
13383 + " prefetchnta "OFFS(x)"(%3) ;\n"
13385 + " prefetchnta "OFFS(x)"(%4) ;\n"
13387 + " prefetchnta "OFFS(x)"(%5) ;\n"
13389 + " prefetchnta "OFFS(x)"(%6) ;\n"
13390 +#define XO1(x,y) \
13391 + " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
13392 +#define XO2(x,y) \
13393 + " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
13394 +#define XO3(x,y) \
13395 + " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
13396 +#define XO4(x,y) \
13397 + " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
13398 +#define XO5(x,y) \
13399 + " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
13403 + __asm__ __volatile__ (
13405 +#define BLOCK(i) \
13427 + " .align 32,0x90 ;\n"
13435 + " addl $256, %1 ;\n"
13436 + " addl $256, %2 ;\n"
13442 + "r" (bh_ptr[0]->b_data),
13443 + "r" (bh_ptr[1]->b_data)
13447 + __asm__ __volatile__ (
13449 +#define BLOCK(i) \
13477 + " .align 32,0x90 ;\n"
13485 + " addl $256, %1 ;\n"
13486 + " addl $256, %2 ;\n"
13487 + " addl $256, %3 ;\n"
13492 + "r" (bh_ptr[0]->b_data),
13493 + "r" (bh_ptr[1]->b_data),
13494 + "r" (bh_ptr[2]->b_data)
13498 + __asm__ __volatile__ (
13500 +#define BLOCK(i) \
13534 + " .align 32,0x90 ;\n"
13542 + " addl $256, %1 ;\n"
13543 + " addl $256, %2 ;\n"
13544 + " addl $256, %3 ;\n"
13545 + " addl $256, %4 ;\n"
13551 + "r" (bh_ptr[0]->b_data),
13552 + "r" (bh_ptr[1]->b_data),
13553 + "r" (bh_ptr[2]->b_data),
13554 + "r" (bh_ptr[3]->b_data)
13558 + __asm__ __volatile__ (
13560 +#define BLOCK(i) \
13600 + " .align 32,0x90 ;\n"
13608 + " addl $256, %1 ;\n"
13609 + " addl $256, %2 ;\n"
13610 + " addl $256, %3 ;\n"
13611 + " addl $256, %4 ;\n"
13612 + " addl $256, %5 ;\n"
13618 + "r" (bh_ptr[0]->b_data),
13619 + "r" (bh_ptr[1]->b_data),
13620 + "r" (bh_ptr[2]->b_data),
13621 + "r" (bh_ptr[3]->b_data),
13622 + "r" (bh_ptr[4]->b_data)
13627 + __asm__ __volatile__ (
13629 + "movups (%1),%%xmm0 ;\n\t"
13630 + "movups 0x10(%1),%%xmm1 ;\n\t"
13631 + "movups 0x20(%1),%%xmm2 ;\n\t"
13632 + "movups 0x30(%1),%%xmm3 ;\n\t"
13633 + "movl %0,%%cr0 ;\n\t"
13635 + : "r" (cr0), "r" (xmm_save)
13655 +#endif /* CONFIG_X86_XMM */
13658 + * high-speed RAID5 checksumming functions utilizing MMX instructions
13659 + * Copyright (C) 1998 Ingo Molnar
13661 +XORBLOCK_TEMPLATE(pII_mmx)
13663 + char fpu_save[108];
13664 + int lines = (bh_ptr[0]->b_size>>7);
13666 + if (!(current->flags & PF_USEDFPU))
13667 + __asm__ __volatile__ ( " clts;\n");
13669 + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
13672 + " movq 8*("#x")(%1), %%mm"#y" ;\n"
13674 + " movq %%mm"#y", 8*("#x")(%1) ;\n"
13675 +#define XO1(x,y) \
13676 + " pxor 8*("#x")(%2), %%mm"#y" ;\n"
13677 +#define XO2(x,y) \
13678 + " pxor 8*("#x")(%3), %%mm"#y" ;\n"
13679 +#define XO3(x,y) \
13680 + " pxor 8*("#x")(%4), %%mm"#y" ;\n"
13681 +#define XO4(x,y) \
13682 + " pxor 8*("#x")(%5), %%mm"#y" ;\n"
13686 + __asm__ __volatile__ (
13688 +#define BLOCK(i) \
13702 + " .align 32,0x90 ;\n"
13710 + " addl $128, %1 ;\n"
13711 + " addl $128, %2 ;\n"
13716 + "r" (bh_ptr[0]->b_data),
13717 + "r" (bh_ptr[1]->b_data)
13721 + __asm__ __volatile__ (
13723 +#define BLOCK(i) \
13741 + " .align 32,0x90 ;\n"
13749 + " addl $128, %1 ;\n"
13750 + " addl $128, %2 ;\n"
13751 + " addl $128, %3 ;\n"
13756 + "r" (bh_ptr[0]->b_data),
13757 + "r" (bh_ptr[1]->b_data),
13758 + "r" (bh_ptr[2]->b_data)
13762 + __asm__ __volatile__ (
13764 +#define BLOCK(i) \
13786 + " .align 32,0x90 ;\n"
13794 + " addl $128, %1 ;\n"
13795 + " addl $128, %2 ;\n"
13796 + " addl $128, %3 ;\n"
13797 + " addl $128, %4 ;\n"
13802 + "r" (bh_ptr[0]->b_data),
13803 + "r" (bh_ptr[1]->b_data),
13804 + "r" (bh_ptr[2]->b_data),
13805 + "r" (bh_ptr[3]->b_data)
13809 + __asm__ __volatile__ (
13811 +#define BLOCK(i) \
13837 + " .align 32,0x90 ;\n"
13845 + " addl $128, %1 ;\n"
13846 + " addl $128, %2 ;\n"
13847 + " addl $128, %3 ;\n"
13848 + " addl $128, %4 ;\n"
13849 + " addl $128, %5 ;\n"
13854 + "r" (bh_ptr[0]->b_data),
13855 + "r" (bh_ptr[1]->b_data),
13856 + "r" (bh_ptr[2]->b_data),
13857 + "r" (bh_ptr[3]->b_data),
13858 + "r" (bh_ptr[4]->b_data)
13863 + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
13865 + if (!(current->flags & PF_USEDFPU))
13877 +XORBLOCK_TEMPLATE(p5_mmx)
13879 + char fpu_save[108];
13880 + int lines = (bh_ptr[0]->b_size>>6);
13882 + if (!(current->flags & PF_USEDFPU))
13883 + __asm__ __volatile__ ( " clts;\n");
13885 + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
13889 + __asm__ __volatile__ (
13891 + " .align 32,0x90 ;\n"
13893 + " movq (%1), %%mm0 ;\n"
13894 + " movq 8(%1), %%mm1 ;\n"
13895 + " pxor (%2), %%mm0 ;\n"
13896 + " movq 16(%1), %%mm2 ;\n"
13897 + " movq %%mm0, (%1) ;\n"
13898 + " pxor 8(%2), %%mm1 ;\n"
13899 + " movq 24(%1), %%mm3 ;\n"
13900 + " movq %%mm1, 8(%1) ;\n"
13901 + " pxor 16(%2), %%mm2 ;\n"
13902 + " movq 32(%1), %%mm4 ;\n"
13903 + " movq %%mm2, 16(%1) ;\n"
13904 + " pxor 24(%2), %%mm3 ;\n"
13905 + " movq 40(%1), %%mm5 ;\n"
13906 + " movq %%mm3, 24(%1) ;\n"
13907 + " pxor 32(%2), %%mm4 ;\n"
13908 + " movq 48(%1), %%mm6 ;\n"
13909 + " movq %%mm4, 32(%1) ;\n"
13910 + " pxor 40(%2), %%mm5 ;\n"
13911 + " movq 56(%1), %%mm7 ;\n"
13912 + " movq %%mm5, 40(%1) ;\n"
13913 + " pxor 48(%2), %%mm6 ;\n"
13914 + " pxor 56(%2), %%mm7 ;\n"
13915 + " movq %%mm6, 48(%1) ;\n"
13916 + " movq %%mm7, 56(%1) ;\n"
13918 + " addl $64, %1 ;\n"
13919 + " addl $64, %2 ;\n"
13925 + "r" (bh_ptr[0]->b_data),
13926 + "r" (bh_ptr[1]->b_data)
13930 + __asm__ __volatile__ (
13932 + " .align 32,0x90 ;\n"
13934 + " movq (%1), %%mm0 ;\n"
13935 + " movq 8(%1), %%mm1 ;\n"
13936 + " pxor (%2), %%mm0 ;\n"
13937 + " movq 16(%1), %%mm2 ;\n"
13938 + " pxor 8(%2), %%mm1 ;\n"
13939 + " pxor (%3), %%mm0 ;\n"
13940 + " pxor 16(%2), %%mm2 ;\n"
13941 + " movq %%mm0, (%1) ;\n"
13942 + " pxor 8(%3), %%mm1 ;\n"
13943 + " pxor 16(%3), %%mm2 ;\n"
13944 + " movq 24(%1), %%mm3 ;\n"
13945 + " movq %%mm1, 8(%1) ;\n"
13946 + " movq 32(%1), %%mm4 ;\n"
13947 + " movq 40(%1), %%mm5 ;\n"
13948 + " pxor 24(%2), %%mm3 ;\n"
13949 + " movq %%mm2, 16(%1) ;\n"
13950 + " pxor 32(%2), %%mm4 ;\n"
13951 + " pxor 24(%3), %%mm3 ;\n"
13952 + " pxor 40(%2), %%mm5 ;\n"
13953 + " movq %%mm3, 24(%1) ;\n"
13954 + " pxor 32(%3), %%mm4 ;\n"
13955 + " pxor 40(%3), %%mm5 ;\n"
13956 + " movq 48(%1), %%mm6 ;\n"
13957 + " movq %%mm4, 32(%1) ;\n"
13958 + " movq 56(%1), %%mm7 ;\n"
13959 + " pxor 48(%2), %%mm6 ;\n"
13960 + " movq %%mm5, 40(%1) ;\n"
13961 + " pxor 56(%2), %%mm7 ;\n"
13962 + " pxor 48(%3), %%mm6 ;\n"
13963 + " pxor 56(%3), %%mm7 ;\n"
13964 + " movq %%mm6, 48(%1) ;\n"
13965 + " movq %%mm7, 56(%1) ;\n"
13967 + " addl $64, %1 ;\n"
13968 + " addl $64, %2 ;\n"
13969 + " addl $64, %3 ;\n"
13975 + "r" (bh_ptr[0]->b_data),
13976 + "r" (bh_ptr[1]->b_data),
13977 + "r" (bh_ptr[2]->b_data)
13981 + __asm__ __volatile__ (
13983 + " .align 32,0x90 ;\n"
13985 + " movq (%1), %%mm0 ;\n"
13986 + " movq 8(%1), %%mm1 ;\n"
13987 + " pxor (%2), %%mm0 ;\n"
13988 + " movq 16(%1), %%mm2 ;\n"
13989 + " pxor 8(%2), %%mm1 ;\n"
13990 + " pxor (%3), %%mm0 ;\n"
13991 + " pxor 16(%2), %%mm2 ;\n"
13992 + " pxor 8(%3), %%mm1 ;\n"
13993 + " pxor (%4), %%mm0 ;\n"
13994 + " movq 24(%1), %%mm3 ;\n"
13995 + " pxor 16(%3), %%mm2 ;\n"
13996 + " pxor 8(%4), %%mm1 ;\n"
13997 + " movq %%mm0, (%1) ;\n"
13998 + " movq 32(%1), %%mm4 ;\n"
13999 + " pxor 24(%2), %%mm3 ;\n"
14000 + " pxor 16(%4), %%mm2 ;\n"
14001 + " movq %%mm1, 8(%1) ;\n"
14002 + " movq 40(%1), %%mm5 ;\n"
14003 + " pxor 32(%2), %%mm4 ;\n"
14004 + " pxor 24(%3), %%mm3 ;\n"
14005 + " movq %%mm2, 16(%1) ;\n"
14006 + " pxor 40(%2), %%mm5 ;\n"
14007 + " pxor 32(%3), %%mm4 ;\n"
14008 + " pxor 24(%4), %%mm3 ;\n"
14009 + " movq %%mm3, 24(%1) ;\n"
14010 + " movq 56(%1), %%mm7 ;\n"
14011 + " movq 48(%1), %%mm6 ;\n"
14012 + " pxor 40(%3), %%mm5 ;\n"
14013 + " pxor 32(%4), %%mm4 ;\n"
14014 + " pxor 48(%2), %%mm6 ;\n"
14015 + " movq %%mm4, 32(%1) ;\n"
14016 + " pxor 56(%2), %%mm7 ;\n"
14017 + " pxor 40(%4), %%mm5 ;\n"
14018 + " pxor 48(%3), %%mm6 ;\n"
14019 + " pxor 56(%3), %%mm7 ;\n"
14020 + " movq %%mm5, 40(%1) ;\n"
14021 + " pxor 48(%4), %%mm6 ;\n"
14022 + " pxor 56(%4), %%mm7 ;\n"
14023 + " movq %%mm6, 48(%1) ;\n"
14024 + " movq %%mm7, 56(%1) ;\n"
14026 + " addl $64, %1 ;\n"
14027 + " addl $64, %2 ;\n"
14028 + " addl $64, %3 ;\n"
14029 + " addl $64, %4 ;\n"
14035 + "r" (bh_ptr[0]->b_data),
14036 + "r" (bh_ptr[1]->b_data),
14037 + "r" (bh_ptr[2]->b_data),
14038 + "r" (bh_ptr[3]->b_data)
14042 + __asm__ __volatile__ (
14044 + " .align 32,0x90 ;\n"
14046 + " movq (%1), %%mm0 ;\n"
14047 + " movq 8(%1), %%mm1 ;\n"
14048 + " pxor (%2), %%mm0 ;\n"
14049 + " pxor 8(%2), %%mm1 ;\n"
14050 + " movq 16(%1), %%mm2 ;\n"
14051 + " pxor (%3), %%mm0 ;\n"
14052 + " pxor 8(%3), %%mm1 ;\n"
14053 + " pxor 16(%2), %%mm2 ;\n"
14054 + " pxor (%4), %%mm0 ;\n"
14055 + " pxor 8(%4), %%mm1 ;\n"
14056 + " pxor 16(%3), %%mm2 ;\n"
14057 + " movq 24(%1), %%mm3 ;\n"
14058 + " pxor (%5), %%mm0 ;\n"
14059 + " pxor 8(%5), %%mm1 ;\n"
14060 + " movq %%mm0, (%1) ;\n"
14061 + " pxor 16(%4), %%mm2 ;\n"
14062 + " pxor 24(%2), %%mm3 ;\n"
14063 + " movq %%mm1, 8(%1) ;\n"
14064 + " pxor 16(%5), %%mm2 ;\n"
14065 + " pxor 24(%3), %%mm3 ;\n"
14066 + " movq 32(%1), %%mm4 ;\n"
14067 + " movq %%mm2, 16(%1) ;\n"
14068 + " pxor 24(%4), %%mm3 ;\n"
14069 + " pxor 32(%2), %%mm4 ;\n"
14070 + " movq 40(%1), %%mm5 ;\n"
14071 + " pxor 24(%5), %%mm3 ;\n"
14072 + " pxor 32(%3), %%mm4 ;\n"
14073 + " pxor 40(%2), %%mm5 ;\n"
14074 + " movq %%mm3, 24(%1) ;\n"
14075 + " pxor 32(%4), %%mm4 ;\n"
14076 + " pxor 40(%3), %%mm5 ;\n"
14077 + " movq 48(%1), %%mm6 ;\n"
14078 + " movq 56(%1), %%mm7 ;\n"
14079 + " pxor 32(%5), %%mm4 ;\n"
14080 + " pxor 40(%4), %%mm5 ;\n"
14081 + " pxor 48(%2), %%mm6 ;\n"
14082 + " pxor 56(%2), %%mm7 ;\n"
14083 + " movq %%mm4, 32(%1) ;\n"
14084 + " pxor 48(%3), %%mm6 ;\n"
14085 + " pxor 56(%3), %%mm7 ;\n"
14086 + " pxor 40(%5), %%mm5 ;\n"
14087 + " pxor 48(%4), %%mm6 ;\n"
14088 + " pxor 56(%4), %%mm7 ;\n"
14089 + " movq %%mm5, 40(%1) ;\n"
14090 + " pxor 48(%5), %%mm6 ;\n"
14091 + " pxor 56(%5), %%mm7 ;\n"
14092 + " movq %%mm6, 48(%1) ;\n"
14093 + " movq %%mm7, 56(%1) ;\n"
14095 + " addl $64, %1 ;\n"
14096 + " addl $64, %2 ;\n"
14097 + " addl $64, %3 ;\n"
14098 + " addl $64, %4 ;\n"
14099 + " addl $64, %5 ;\n"
14105 + "r" (bh_ptr[0]->b_data),
14106 + "r" (bh_ptr[1]->b_data),
14107 + "r" (bh_ptr[2]->b_data),
14108 + "r" (bh_ptr[3]->b_data),
14109 + "r" (bh_ptr[4]->b_data)
14114 + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
14116 + if (!(current->flags & PF_USEDFPU))
14119 +#endif /* __i386__ */
14120 +#endif /* !__sparc_v9__ */
14122 +#ifdef __sparc_v9__
14124 + * High speed xor_block operation for RAID4/5 utilizing the
14125 + * UltraSparc Visual Instruction Set.
14127 + * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
14130 + * !(((long)dest | (long)sourceN) & (64 - 1)) &&
14131 + * !(len & 127) && len >= 256
14133 + * It is done in pure assembly, as otherwise gcc makes it
14134 + * a non-leaf function, which is not what we want.
14135 + * Also, we don't measure the speeds as on other architectures,
14136 + * as the measuring routine does not take into account cold caches
14137 + * and the fact that xor_block_VIS bypasses the caches.
14138 + * xor_block_32regs might be 5% faster for count 2 if caches are hot
14139 + * and things just right (for count 3 VIS is about as fast as 32regs for
14140 + * hot caches and for count 4 and 5 VIS is faster by good margin always),
14141 + * but I think it is better not to pollute the caches.
14142 + * Actually, if I'd just fight for speed for hot caches, I could
14143 + * write a hybrid VIS/integer routine, which would do always two
14144 + * 64B blocks in VIS and two in IEUs, but I really care more about
14147 +extern void *VISenter(void);
14148 +extern void xor_block_VIS XOR_ARGS;
14150 +void __xor_block_VIS(void)
14153 + .globl xor_block_VIS
14155 + ldx [%%o1 + 0], %%o4
14156 + ldx [%%o1 + 8], %%o3
14157 + ldx [%%o4 + %1], %%g5
14158 + ldx [%%o4 + %0], %%o4
14159 + ldx [%%o3 + %0], %%o3
14161 + andcc %%o5, %2, %%g0
14162 + be,pt %%icc, 297f
14163 + sethi %%hi(%5), %%g1
14164 + jmpl %%g1 + %%lo(%5), %%g7
14165 + add %%g7, 8, %%g7
14166 +297: wr %%g0, %4, %%fprs
14167 + membar #LoadStore|#StoreLoad|#StoreStore
14168 + sub %%g5, 64, %%g5
14169 + ldda [%%o4] %3, %%f0
14170 + ldda [%%o3] %3, %%f16
14172 + bgeu,pt %%xcc, 10f
14176 + sub %%g5, 64, %%g5
14178 + wr %%g0, %3, %%asi
14180 +2: ldda [%%o4 + 64] %%asi, %%f32
14181 + fxor %%f0, %%f16, %%f16
14182 + fxor %%f2, %%f18, %%f18
14183 + fxor %%f4, %%f20, %%f20
14184 + fxor %%f6, %%f22, %%f22
14185 + fxor %%f8, %%f24, %%f24
14186 + fxor %%f10, %%f26, %%f26
14187 + fxor %%f12, %%f28, %%f28
14188 + fxor %%f14, %%f30, %%f30
14189 + stda %%f16, [%%o4] %3
14190 + ldda [%%o3 + 64] %%asi, %%f48
14191 + ldda [%%o4 + 128] %%asi, %%f0
14192 + fxor %%f32, %%f48, %%f48
14193 + fxor %%f34, %%f50, %%f50
14194 + add %%o4, 128, %%o4
14195 + fxor %%f36, %%f52, %%f52
14196 + add %%o3, 128, %%o3
14197 + fxor %%f38, %%f54, %%f54
14198 + subcc %%g5, 128, %%g5
14199 + fxor %%f40, %%f56, %%f56
14200 + fxor %%f42, %%f58, %%f58
14201 + fxor %%f44, %%f60, %%f60
14202 + fxor %%f46, %%f62, %%f62
14203 + stda %%f48, [%%o4 - 64] %%asi
14205 + ldda [%%o3] %3, %%f16
14207 + ldda [%%o4 + 64] %%asi, %%f32
14208 + fxor %%f0, %%f16, %%f16
14209 + fxor %%f2, %%f18, %%f18
14210 + fxor %%f4, %%f20, %%f20
14211 + fxor %%f6, %%f22, %%f22
14212 + fxor %%f8, %%f24, %%f24
14213 + fxor %%f10, %%f26, %%f26
14214 + fxor %%f12, %%f28, %%f28
14215 + fxor %%f14, %%f30, %%f30
14216 + stda %%f16, [%%o4] %3
14217 + ldda [%%o3 + 64] %%asi, %%f48
14219 + fxor %%f32, %%f48, %%f48
14220 + fxor %%f34, %%f50, %%f50
14221 + fxor %%f36, %%f52, %%f52
14222 + fxor %%f38, %%f54, %%f54
14223 + fxor %%f40, %%f56, %%f56
14224 + fxor %%f42, %%f58, %%f58
14225 + fxor %%f44, %%f60, %%f60
14226 + fxor %%f46, %%f62, %%f62
14227 + stda %%f48, [%%o4 + 64] %%asi
14228 + membar #Sync|#StoreStore|#StoreLoad
14229 + wr %%g0, 0, %%fprs
14231 + wr %%g1, %%g0, %%asi
14233 +13: ldx [%%o1 + 16], %%o2
14234 + ldx [%%o2 + %0], %%o2
14236 +3: ldda [%%o2] %3, %%f32
14237 + fxor %%f0, %%f16, %%f48
14238 + fxor %%f2, %%f18, %%f50
14239 + add %%o4, 64, %%o4
14240 + fxor %%f4, %%f20, %%f52
14241 + fxor %%f6, %%f22, %%f54
14242 + add %%o3, 64, %%o3
14243 + fxor %%f8, %%f24, %%f56
14244 + fxor %%f10, %%f26, %%f58
14245 + fxor %%f12, %%f28, %%f60
14246 + fxor %%f14, %%f30, %%f62
14247 + ldda [%%o4] %3, %%f0
14248 + fxor %%f48, %%f32, %%f48
14249 + fxor %%f50, %%f34, %%f50
14250 + fxor %%f52, %%f36, %%f52
14251 + fxor %%f54, %%f38, %%f54
14252 + add %%o2, 64, %%o2
14253 + fxor %%f56, %%f40, %%f56
14254 + fxor %%f58, %%f42, %%f58
14255 + subcc %%g5, 64, %%g5
14256 + fxor %%f60, %%f44, %%f60
14257 + fxor %%f62, %%f46, %%f62
14258 + stda %%f48, [%%o4 + %%g1] %3
14260 + ldda [%%o3] %3, %%f16
14262 + ldda [%%o2] %3, %%f32
14263 + fxor %%f0, %%f16, %%f48
14264 + fxor %%f2, %%f18, %%f50
14265 + fxor %%f4, %%f20, %%f52
14266 + fxor %%f6, %%f22, %%f54
14267 + fxor %%f8, %%f24, %%f56
14268 + fxor %%f10, %%f26, %%f58
14269 + fxor %%f12, %%f28, %%f60
14270 + fxor %%f14, %%f30, %%f62
14272 + fxor %%f48, %%f32, %%f48
14273 + fxor %%f50, %%f34, %%f50
14274 + fxor %%f52, %%f36, %%f52
14275 + fxor %%f54, %%f38, %%f54
14276 + fxor %%f56, %%f40, %%f56
14277 + fxor %%f58, %%f42, %%f58
14278 + fxor %%f60, %%f44, %%f60
14279 + fxor %%f62, %%f46, %%f62
14280 + stda %%f48, [%%o4] %3
14281 + membar #Sync|#StoreStore|#StoreLoad
14283 + wr %%g0, 0, %%fprs
14289 +14: ldx [%%o1 + 16], %%o2
14290 + ldx [%%o1 + 24], %%o0
14291 + ldx [%%o2 + %0], %%o2
14292 + ldx [%%o0 + %0], %%o0
14294 +4: ldda [%%o2] %3, %%f32
14295 + fxor %%f0, %%f16, %%f16
14296 + fxor %%f2, %%f18, %%f18
14297 + add %%o4, 64, %%o4
14298 + fxor %%f4, %%f20, %%f20
14299 + fxor %%f6, %%f22, %%f22
14300 + add %%o3, 64, %%o3
14301 + fxor %%f8, %%f24, %%f24
14302 + fxor %%f10, %%f26, %%f26
14303 + fxor %%f12, %%f28, %%f28
14304 + fxor %%f14, %%f30, %%f30
14305 + ldda [%%o0] %3, %%f48
14306 + fxor %%f16, %%f32, %%f32
14307 + fxor %%f18, %%f34, %%f34
14308 + fxor %%f20, %%f36, %%f36
14309 + fxor %%f22, %%f38, %%f38
14310 + add %%o2, 64, %%o2
14311 + fxor %%f24, %%f40, %%f40
14312 + fxor %%f26, %%f42, %%f42
14313 + fxor %%f28, %%f44, %%f44
14314 + fxor %%f30, %%f46, %%f46
14315 + ldda [%%o4] %3, %%f0
14316 + fxor %%f32, %%f48, %%f48
14317 + fxor %%f34, %%f50, %%f50
14318 + fxor %%f36, %%f52, %%f52
14319 + add %%o0, 64, %%o0
14320 + fxor %%f38, %%f54, %%f54
14321 + fxor %%f40, %%f56, %%f56
14322 + fxor %%f42, %%f58, %%f58
14323 + subcc %%g5, 64, %%g5
14324 + fxor %%f44, %%f60, %%f60
14325 + fxor %%f46, %%f62, %%f62
14326 + stda %%f48, [%%o4 + %%g1] %3
14328 + ldda [%%o3] %3, %%f16
14330 + ldda [%%o2] %3, %%f32
14331 + fxor %%f0, %%f16, %%f16
14332 + fxor %%f2, %%f18, %%f18
14333 + fxor %%f4, %%f20, %%f20
14334 + fxor %%f6, %%f22, %%f22
14335 + fxor %%f8, %%f24, %%f24
14336 + fxor %%f10, %%f26, %%f26
14337 + fxor %%f12, %%f28, %%f28
14338 + fxor %%f14, %%f30, %%f30
14339 + ldda [%%o0] %3, %%f48
14340 + fxor %%f16, %%f32, %%f32
14341 + fxor %%f18, %%f34, %%f34
14342 + fxor %%f20, %%f36, %%f36
14343 + fxor %%f22, %%f38, %%f38
14344 + fxor %%f24, %%f40, %%f40
14345 + fxor %%f26, %%f42, %%f42
14346 + fxor %%f28, %%f44, %%f44
14347 + fxor %%f30, %%f46, %%f46
14349 + fxor %%f32, %%f48, %%f48
14350 + fxor %%f34, %%f50, %%f50
14351 + fxor %%f36, %%f52, %%f52
14352 + fxor %%f38, %%f54, %%f54
14353 + fxor %%f40, %%f56, %%f56
14354 + fxor %%f42, %%f58, %%f58
14355 + fxor %%f44, %%f60, %%f60
14356 + fxor %%f46, %%f62, %%f62
14357 + stda %%f48, [%%o4] %3
14358 + membar #Sync|#StoreStore|#StoreLoad
14360 + wr %%g0, 0, %%fprs
14362 +15: ldx [%%o1 + 16], %%o2
14363 + ldx [%%o1 + 24], %%o0
14364 + ldx [%%o1 + 32], %%o1
14365 + ldx [%%o2 + %0], %%o2
14366 + ldx [%%o0 + %0], %%o0
14367 + ldx [%%o1 + %0], %%o1
14369 +5: ldda [%%o2] %3, %%f32
14370 + fxor %%f0, %%f16, %%f48
14371 + fxor %%f2, %%f18, %%f50
14372 + add %%o4, 64, %%o4
14373 + fxor %%f4, %%f20, %%f52
14374 + fxor %%f6, %%f22, %%f54
14375 + add %%o3, 64, %%o3
14376 + fxor %%f8, %%f24, %%f56
14377 + fxor %%f10, %%f26, %%f58
14378 + fxor %%f12, %%f28, %%f60
14379 + fxor %%f14, %%f30, %%f62
14380 + ldda [%%o0] %3, %%f16
14381 + fxor %%f48, %%f32, %%f48
14382 + fxor %%f50, %%f34, %%f50
14383 + fxor %%f52, %%f36, %%f52
14384 + fxor %%f54, %%f38, %%f54
14385 + add %%o2, 64, %%o2
14386 + fxor %%f56, %%f40, %%f56
14387 + fxor %%f58, %%f42, %%f58
14388 + fxor %%f60, %%f44, %%f60
14389 + fxor %%f62, %%f46, %%f62
14390 + ldda [%%o1] %3, %%f32
14391 + fxor %%f48, %%f16, %%f48
14392 + fxor %%f50, %%f18, %%f50
14393 + add %%o0, 64, %%o0
14394 + fxor %%f52, %%f20, %%f52
14395 + fxor %%f54, %%f22, %%f54
14396 + add %%o1, 64, %%o1
14397 + fxor %%f56, %%f24, %%f56
14398 + fxor %%f58, %%f26, %%f58
14399 + fxor %%f60, %%f28, %%f60
14400 + fxor %%f62, %%f30, %%f62
14401 + ldda [%%o4] %3, %%f0
14402 + fxor %%f48, %%f32, %%f48
14403 + fxor %%f50, %%f34, %%f50
14404 + fxor %%f52, %%f36, %%f52
14405 + fxor %%f54, %%f38, %%f54
14406 + fxor %%f56, %%f40, %%f56
14407 + fxor %%f58, %%f42, %%f58
14408 + subcc %%g5, 64, %%g5
14409 + fxor %%f60, %%f44, %%f60
14410 + fxor %%f62, %%f46, %%f62
14411 + stda %%f48, [%%o4 + %%g1] %3
14413 + ldda [%%o3] %3, %%f16
14415 + ldda [%%o2] %3, %%f32
14416 + fxor %%f0, %%f16, %%f48
14417 + fxor %%f2, %%f18, %%f50
14418 + fxor %%f4, %%f20, %%f52
14419 + fxor %%f6, %%f22, %%f54
14420 + fxor %%f8, %%f24, %%f56
14421 + fxor %%f10, %%f26, %%f58
14422 + fxor %%f12, %%f28, %%f60
14423 + fxor %%f14, %%f30, %%f62
14424 + ldda [%%o0] %3, %%f16
14425 + fxor %%f48, %%f32, %%f48
14426 + fxor %%f50, %%f34, %%f50
14427 + fxor %%f52, %%f36, %%f52
14428 + fxor %%f54, %%f38, %%f54
14429 + fxor %%f56, %%f40, %%f56
14430 + fxor %%f58, %%f42, %%f58
14431 + fxor %%f60, %%f44, %%f60
14432 + fxor %%f62, %%f46, %%f62
14433 + ldda [%%o1] %3, %%f32
14434 + fxor %%f48, %%f16, %%f48
14435 + fxor %%f50, %%f18, %%f50
14436 + fxor %%f52, %%f20, %%f52
14437 + fxor %%f54, %%f22, %%f54
14438 + fxor %%f56, %%f24, %%f56
14439 + fxor %%f58, %%f26, %%f58
14440 + fxor %%f60, %%f28, %%f60
14441 + fxor %%f62, %%f30, %%f62
14443 + fxor %%f48, %%f32, %%f48
14444 + fxor %%f50, %%f34, %%f50
14445 + fxor %%f52, %%f36, %%f52
14446 + fxor %%f54, %%f38, %%f54
14447 + fxor %%f56, %%f40, %%f56
14448 + fxor %%f58, %%f42, %%f58
14449 + fxor %%f60, %%f44, %%f60
14450 + fxor %%f62, %%f46, %%f62
14451 + stda %%f48, [%%o4] %3
14452 + membar #Sync|#StoreStore|#StoreLoad
14454 + wr %%g0, 0, %%fprs
14456 + "i" (&((struct buffer_head *)0)->b_data),
14457 + "i" (&((struct buffer_head *)0)->b_size),
14458 + "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P),
14459 + "i" (FPRS_FEF), "i" (VISenter));
14461 +#endif /* __sparc_v9__ */
14463 +#if defined(__sparc__) && !defined(__sparc_v9__)
14465 + * High speed xor_block operation for RAID4/5 utilizing the
14466 + * ldd/std SPARC instructions.
14468 + * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
14472 +XORBLOCK_TEMPLATE(SPARC)
14474 + int size = bh_ptr[0]->b_size;
14475 + int lines = size / (sizeof (long)) / 8, i;
14476 + long *destp = (long *) bh_ptr[0]->b_data;
14477 + long *source1 = (long *) bh_ptr[1]->b_data;
14478 + long *source2, *source3, *source4;
14482 + for (i = lines; i > 0; i--) {
14483 + __asm__ __volatile__("
14484 + ldd [%0 + 0x00], %%g2
14485 + ldd [%0 + 0x08], %%g4
14486 + ldd [%0 + 0x10], %%o0
14487 + ldd [%0 + 0x18], %%o2
14488 + ldd [%1 + 0x00], %%o4
14489 + ldd [%1 + 0x08], %%l0
14490 + ldd [%1 + 0x10], %%l2
14491 + ldd [%1 + 0x18], %%l4
14492 + xor %%g2, %%o4, %%g2
14493 + xor %%g3, %%o5, %%g3
14494 + xor %%g4, %%l0, %%g4
14495 + xor %%g5, %%l1, %%g5
14496 + xor %%o0, %%l2, %%o0
14497 + xor %%o1, %%l3, %%o1
14498 + xor %%o2, %%l4, %%o2
14499 + xor %%o3, %%l5, %%o3
14500 + std %%g2, [%0 + 0x00]
14501 + std %%g4, [%0 + 0x08]
14502 + std %%o0, [%0 + 0x10]
14503 + std %%o2, [%0 + 0x18]
14504 + " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0",
14505 + "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5");
14511 + source2 = (long *) bh_ptr[2]->b_data;
14512 + for (i = lines; i > 0; i--) {
14513 + __asm__ __volatile__("
14514 + ldd [%0 + 0x00], %%g2
14515 + ldd [%0 + 0x08], %%g4
14516 + ldd [%0 + 0x10], %%o0
14517 + ldd [%0 + 0x18], %%o2
14518 + ldd [%1 + 0x00], %%o4
14519 + ldd [%1 + 0x08], %%l0
14520 + ldd [%1 + 0x10], %%l2
14521 + ldd [%1 + 0x18], %%l4
14522 + xor %%g2, %%o4, %%g2
14523 + xor %%g3, %%o5, %%g3
14524 + ldd [%2 + 0x00], %%o4
14525 + xor %%g4, %%l0, %%g4
14526 + xor %%g5, %%l1, %%g5
14527 + ldd [%2 + 0x08], %%l0
14528 + xor %%o0, %%l2, %%o0
14529 + xor %%o1, %%l3, %%o1
14530 + ldd [%2 + 0x10], %%l2
14531 + xor %%o2, %%l4, %%o2
14532 + xor %%o3, %%l5, %%o3
14533 + ldd [%2 + 0x18], %%l4
14534 + xor %%g2, %%o4, %%g2
14535 + xor %%g3, %%o5, %%g3
14536 + xor %%g4, %%l0, %%g4
14537 + xor %%g5, %%l1, %%g5
14538 + xor %%o0, %%l2, %%o0
14539 + xor %%o1, %%l3, %%o1
14540 + xor %%o2, %%l4, %%o2
14541 + xor %%o3, %%l5, %%o3
14542 + std %%g2, [%0 + 0x00]
14543 + std %%g4, [%0 + 0x08]
14544 + std %%o0, [%0 + 0x10]
14545 + std %%o2, [%0 + 0x18]
14546 + " : : "r" (destp), "r" (source1), "r" (source2)
14547 + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
14548 + "l0", "l1", "l2", "l3", "l4", "l5");
14555 + source2 = (long *) bh_ptr[2]->b_data;
14556 + source3 = (long *) bh_ptr[3]->b_data;
14557 + for (i = lines; i > 0; i--) {
14558 + __asm__ __volatile__("
14559 + ldd [%0 + 0x00], %%g2
14560 + ldd [%0 + 0x08], %%g4
14561 + ldd [%0 + 0x10], %%o0
14562 + ldd [%0 + 0x18], %%o2
14563 + ldd [%1 + 0x00], %%o4
14564 + ldd [%1 + 0x08], %%l0
14565 + ldd [%1 + 0x10], %%l2
14566 + ldd [%1 + 0x18], %%l4
14567 + xor %%g2, %%o4, %%g2
14568 + xor %%g3, %%o5, %%g3
14569 + ldd [%2 + 0x00], %%o4
14570 + xor %%g4, %%l0, %%g4
14571 + xor %%g5, %%l1, %%g5
14572 + ldd [%2 + 0x08], %%l0
14573 + xor %%o0, %%l2, %%o0
14574 + xor %%o1, %%l3, %%o1
14575 + ldd [%2 + 0x10], %%l2
14576 + xor %%o2, %%l4, %%o2
14577 + xor %%o3, %%l5, %%o3
14578 + ldd [%2 + 0x18], %%l4
14579 + xor %%g2, %%o4, %%g2
14580 + xor %%g3, %%o5, %%g3
14581 + ldd [%3 + 0x00], %%o4
14582 + xor %%g4, %%l0, %%g4
14583 + xor %%g5, %%l1, %%g5
14584 + ldd [%3 + 0x08], %%l0
14585 + xor %%o0, %%l2, %%o0
14586 + xor %%o1, %%l3, %%o1
14587 + ldd [%3 + 0x10], %%l2
14588 + xor %%o2, %%l4, %%o2
14589 + xor %%o3, %%l5, %%o3
14590 + ldd [%3 + 0x18], %%l4
14591 + xor %%g2, %%o4, %%g2
14592 + xor %%g3, %%o5, %%g3
14593 + xor %%g4, %%l0, %%g4
14594 + xor %%g5, %%l1, %%g5
14595 + xor %%o0, %%l2, %%o0
14596 + xor %%o1, %%l3, %%o1
14597 + xor %%o2, %%l4, %%o2
14598 + xor %%o3, %%l5, %%o3
14599 + std %%g2, [%0 + 0x00]
14600 + std %%g4, [%0 + 0x08]
14601 + std %%o0, [%0 + 0x10]
14602 + std %%o2, [%0 + 0x18]
14603 + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3)
14604 + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
14605 + "l0", "l1", "l2", "l3", "l4", "l5");
14613 + source2 = (long *) bh_ptr[2]->b_data;
14614 + source3 = (long *) bh_ptr[3]->b_data;
14615 + source4 = (long *) bh_ptr[4]->b_data;
14616 + for (i = lines; i > 0; i--) {
14617 + __asm__ __volatile__("
14618 + ldd [%0 + 0x00], %%g2
14619 + ldd [%0 + 0x08], %%g4
14620 + ldd [%0 + 0x10], %%o0
14621 + ldd [%0 + 0x18], %%o2
14622 + ldd [%1 + 0x00], %%o4
14623 + ldd [%1 + 0x08], %%l0
14624 + ldd [%1 + 0x10], %%l2
14625 + ldd [%1 + 0x18], %%l4
14626 + xor %%g2, %%o4, %%g2
14627 + xor %%g3, %%o5, %%g3
14628 + ldd [%2 + 0x00], %%o4
14629 + xor %%g4, %%l0, %%g4
14630 + xor %%g5, %%l1, %%g5
14631 + ldd [%2 + 0x08], %%l0
14632 + xor %%o0, %%l2, %%o0
14633 + xor %%o1, %%l3, %%o1
14634 + ldd [%2 + 0x10], %%l2
14635 + xor %%o2, %%l4, %%o2
14636 + xor %%o3, %%l5, %%o3
14637 + ldd [%2 + 0x18], %%l4
14638 + xor %%g2, %%o4, %%g2
14639 + xor %%g3, %%o5, %%g3
14640 + ldd [%3 + 0x00], %%o4
14641 + xor %%g4, %%l0, %%g4
14642 + xor %%g5, %%l1, %%g5
14643 + ldd [%3 + 0x08], %%l0
14644 + xor %%o0, %%l2, %%o0
14645 + xor %%o1, %%l3, %%o1
14646 + ldd [%3 + 0x10], %%l2
14647 + xor %%o2, %%l4, %%o2
14648 + xor %%o3, %%l5, %%o3
14649 + ldd [%3 + 0x18], %%l4
14650 + xor %%g2, %%o4, %%g2
14651 + xor %%g3, %%o5, %%g3
14652 + ldd [%4 + 0x00], %%o4
14653 + xor %%g4, %%l0, %%g4
14654 + xor %%g5, %%l1, %%g5
14655 + ldd [%4 + 0x08], %%l0
14656 + xor %%o0, %%l2, %%o0
14657 + xor %%o1, %%l3, %%o1
14658 + ldd [%4 + 0x10], %%l2
14659 + xor %%o2, %%l4, %%o2
14660 + xor %%o3, %%l5, %%o3
14661 + ldd [%4 + 0x18], %%l4
14662 + xor %%g2, %%o4, %%g2
14663 + xor %%g3, %%o5, %%g3
14664 + xor %%g4, %%l0, %%g4
14665 + xor %%g5, %%l1, %%g5
14666 + xor %%o0, %%l2, %%o0
14667 + xor %%o1, %%l3, %%o1
14668 + xor %%o2, %%l4, %%o2
14669 + xor %%o3, %%l5, %%o3
14670 + std %%g2, [%0 + 0x00]
14671 + std %%g4, [%0 + 0x08]
14672 + std %%o0, [%0 + 0x10]
14673 + std %%o2, [%0 + 0x18]
14674 + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4)
14675 + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
14676 + "l0", "l1", "l2", "l3", "l4", "l5");
14686 +#endif /* __sparc_v[78]__ */
14688 +#ifndef __sparc_v9__
14691 + * this one works reasonably on any x86 CPU
14692 + * (send me an assembly version for inclusion if you can make it faster)
14694 + * this one is just as fast as written in pure assembly on x86.
14695 + * the reason for this separate version is that the
14696 + * fast open-coded xor routine "32reg" produces suboptimal code
14697 + * on x86, due to lack of registers.
14699 +XORBLOCK_TEMPLATE(8regs)
14701 + int len = bh_ptr[0]->b_size;
14702 + long *destp = (long *) bh_ptr[0]->b_data;
14703 + long *source1, *source2, *source3, *source4;
14704 + long lines = len / (sizeof (long)) / 8, i;
14708 + source1 = (long *) bh_ptr[1]->b_data;
14709 + for (i = lines; i > 0; i--) {
14710 + *(destp + 0) ^= *(source1 + 0);
14711 + *(destp + 1) ^= *(source1 + 1);
14712 + *(destp + 2) ^= *(source1 + 2);
14713 + *(destp + 3) ^= *(source1 + 3);
14714 + *(destp + 4) ^= *(source1 + 4);
14715 + *(destp + 5) ^= *(source1 + 5);
14716 + *(destp + 6) ^= *(source1 + 6);
14717 + *(destp + 7) ^= *(source1 + 7);
14723 + source2 = (long *) bh_ptr[2]->b_data;
14724 + source1 = (long *) bh_ptr[1]->b_data;
14725 + for (i = lines; i > 0; i--) {
14726 + *(destp + 0) ^= *(source1 + 0);
14727 + *(destp + 0) ^= *(source2 + 0);
14728 + *(destp + 1) ^= *(source1 + 1);
14729 + *(destp + 1) ^= *(source2 + 1);
14730 + *(destp + 2) ^= *(source1 + 2);
14731 + *(destp + 2) ^= *(source2 + 2);
14732 + *(destp + 3) ^= *(source1 + 3);
14733 + *(destp + 3) ^= *(source2 + 3);
14734 + *(destp + 4) ^= *(source1 + 4);
14735 + *(destp + 4) ^= *(source2 + 4);
14736 + *(destp + 5) ^= *(source1 + 5);
14737 + *(destp + 5) ^= *(source2 + 5);
14738 + *(destp + 6) ^= *(source1 + 6);
14739 + *(destp + 6) ^= *(source2 + 6);
14740 + *(destp + 7) ^= *(source1 + 7);
14741 + *(destp + 7) ^= *(source2 + 7);
14748 + source3 = (long *) bh_ptr[3]->b_data;
14749 + source2 = (long *) bh_ptr[2]->b_data;
14750 + source1 = (long *) bh_ptr[1]->b_data;
14751 + for (i = lines; i > 0; i--) {
14752 + *(destp + 0) ^= *(source1 + 0);
14753 + *(destp + 0) ^= *(source2 + 0);
14754 + *(destp + 0) ^= *(source3 + 0);
14755 + *(destp + 1) ^= *(source1 + 1);
14756 + *(destp + 1) ^= *(source2 + 1);
14757 + *(destp + 1) ^= *(source3 + 1);
14758 + *(destp + 2) ^= *(source1 + 2);
14759 + *(destp + 2) ^= *(source2 + 2);
14760 + *(destp + 2) ^= *(source3 + 2);
14761 + *(destp + 3) ^= *(source1 + 3);
14762 + *(destp + 3) ^= *(source2 + 3);
14763 + *(destp + 3) ^= *(source3 + 3);
14764 + *(destp + 4) ^= *(source1 + 4);
14765 + *(destp + 4) ^= *(source2 + 4);
14766 + *(destp + 4) ^= *(source3 + 4);
14767 + *(destp + 5) ^= *(source1 + 5);
14768 + *(destp + 5) ^= *(source2 + 5);
14769 + *(destp + 5) ^= *(source3 + 5);
14770 + *(destp + 6) ^= *(source1 + 6);
14771 + *(destp + 6) ^= *(source2 + 6);
14772 + *(destp + 6) ^= *(source3 + 6);
14773 + *(destp + 7) ^= *(source1 + 7);
14774 + *(destp + 7) ^= *(source2 + 7);
14775 + *(destp + 7) ^= *(source3 + 7);
14783 + source4 = (long *) bh_ptr[4]->b_data;
14784 + source3 = (long *) bh_ptr[3]->b_data;
14785 + source2 = (long *) bh_ptr[2]->b_data;
14786 + source1 = (long *) bh_ptr[1]->b_data;
14787 + for (i = lines; i > 0; i--) {
14788 + *(destp + 0) ^= *(source1 + 0);
14789 + *(destp + 0) ^= *(source2 + 0);
14790 + *(destp + 0) ^= *(source3 + 0);
14791 + *(destp + 0) ^= *(source4 + 0);
14792 + *(destp + 1) ^= *(source1 + 1);
14793 + *(destp + 1) ^= *(source2 + 1);
14794 + *(destp + 1) ^= *(source3 + 1);
14795 + *(destp + 1) ^= *(source4 + 1);
14796 + *(destp + 2) ^= *(source1 + 2);
14797 + *(destp + 2) ^= *(source2 + 2);
14798 + *(destp + 2) ^= *(source3 + 2);
14799 + *(destp + 2) ^= *(source4 + 2);
14800 + *(destp + 3) ^= *(source1 + 3);
14801 + *(destp + 3) ^= *(source2 + 3);
14802 + *(destp + 3) ^= *(source3 + 3);
14803 + *(destp + 3) ^= *(source4 + 3);
14804 + *(destp + 4) ^= *(source1 + 4);
14805 + *(destp + 4) ^= *(source2 + 4);
14806 + *(destp + 4) ^= *(source3 + 4);
14807 + *(destp + 4) ^= *(source4 + 4);
14808 + *(destp + 5) ^= *(source1 + 5);
14809 + *(destp + 5) ^= *(source2 + 5);
14810 + *(destp + 5) ^= *(source3 + 5);
14811 + *(destp + 5) ^= *(source4 + 5);
14812 + *(destp + 6) ^= *(source1 + 6);
14813 + *(destp + 6) ^= *(source2 + 6);
14814 + *(destp + 6) ^= *(source3 + 6);
14815 + *(destp + 6) ^= *(source4 + 6);
14816 + *(destp + 7) ^= *(source1 + 7);
14817 + *(destp + 7) ^= *(source2 + 7);
14818 + *(destp + 7) ^= *(source3 + 7);
14819 + *(destp + 7) ^= *(source4 + 7);
14831 + * platform-independent RAID5 checksum calculation; this should
14832 + * be very fast on any platform that has a decent amount of
14833 + * registers. (32 or more)
14835 +XORBLOCK_TEMPLATE(32regs)
14837 + int size = bh_ptr[0]->b_size;
14838 + int lines = size / (sizeof (long)) / 8, i;
14839 + long *destp = (long *) bh_ptr[0]->b_data;
14840 + long *source1, *source2, *source3, *source4;
14842 + /* LOTS of registers available...
14843 + We do explicit loop-unrolling here for code which
14844 + favours RISC machines. In fact this is almost direct
14845 + RISC assembly on Alpha and SPARC :-) */
14850 + source1 = (long *) bh_ptr[1]->b_data;
14851 + for (i = lines; i > 0; i--) {
14852 + register long d0, d1, d2, d3, d4, d5, d6, d7;
14853 + d0 = destp[0]; /* Pull the stuff into registers */
14854 + d1 = destp[1]; /* ... in bursts, if possible. */
14861 + d0 ^= source1[0];
14862 + d1 ^= source1[1];
14863 + d2 ^= source1[2];
14864 + d3 ^= source1[3];
14865 + d4 ^= source1[4];
14866 + d5 ^= source1[5];
14867 + d6 ^= source1[6];
14868 + d7 ^= source1[7];
14869 + destp[0] = d0; /* Store the result (in bursts) */
14873 + destp[4] = d4; /* Store the result (in bursts) */
14882 + source2 = (long *) bh_ptr[2]->b_data;
14883 + source1 = (long *) bh_ptr[1]->b_data;
14884 + for (i = lines; i > 0; i--) {
14885 + register long d0, d1, d2, d3, d4, d5, d6, d7;
14886 + d0 = destp[0]; /* Pull the stuff into registers */
14887 + d1 = destp[1]; /* ... in bursts, if possible. */
14894 + d0 ^= source1[0];
14895 + d1 ^= source1[1];
14896 + d2 ^= source1[2];
14897 + d3 ^= source1[3];
14898 + d4 ^= source1[4];
14899 + d5 ^= source1[5];
14900 + d6 ^= source1[6];
14901 + d7 ^= source1[7];
14902 + d0 ^= source2[0];
14903 + d1 ^= source2[1];
14904 + d2 ^= source2[2];
14905 + d3 ^= source2[3];
14906 + d4 ^= source2[4];
14907 + d5 ^= source2[5];
14908 + d6 ^= source2[6];
14909 + d7 ^= source2[7];
14910 + destp[0] = d0; /* Store the result (in bursts) */
14914 + destp[4] = d4; /* Store the result (in bursts) */
14924 + source3 = (long *) bh_ptr[3]->b_data;
14925 + source2 = (long *) bh_ptr[2]->b_data;
14926 + source1 = (long *) bh_ptr[1]->b_data;
14927 + for (i = lines; i > 0; i--) {
14928 + register long d0, d1, d2, d3, d4, d5, d6, d7;
14929 + d0 = destp[0]; /* Pull the stuff into registers */
14930 + d1 = destp[1]; /* ... in bursts, if possible. */
14937 + d0 ^= source1[0];
14938 + d1 ^= source1[1];
14939 + d2 ^= source1[2];
14940 + d3 ^= source1[3];
14941 + d4 ^= source1[4];
14942 + d5 ^= source1[5];
14943 + d6 ^= source1[6];
14944 + d7 ^= source1[7];
14945 + d0 ^= source2[0];
14946 + d1 ^= source2[1];
14947 + d2 ^= source2[2];
14948 + d3 ^= source2[3];
14949 + d4 ^= source2[4];
14950 + d5 ^= source2[5];
14951 + d6 ^= source2[6];
14952 + d7 ^= source2[7];
14953 + d0 ^= source3[0];
14954 + d1 ^= source3[1];
14955 + d2 ^= source3[2];
14956 + d3 ^= source3[3];
14957 + d4 ^= source3[4];
14958 + d5 ^= source3[5];
14959 + d6 ^= source3[6];
14960 + d7 ^= source3[7];
14961 + destp[0] = d0; /* Store the result (in bursts) */
14965 + destp[4] = d4; /* Store the result (in bursts) */
14976 + source4 = (long *) bh_ptr[4]->b_data;
14977 + source3 = (long *) bh_ptr[3]->b_data;
14978 + source2 = (long *) bh_ptr[2]->b_data;
14979 + source1 = (long *) bh_ptr[1]->b_data;
14980 + for (i = lines; i > 0; i--) {
14981 + register long d0, d1, d2, d3, d4, d5, d6, d7;
14982 + d0 = destp[0]; /* Pull the stuff into registers */
14983 + d1 = destp[1]; /* ... in bursts, if possible. */
14990 + d0 ^= source1[0];
14991 + d1 ^= source1[1];
14992 + d2 ^= source1[2];
14993 + d3 ^= source1[3];
14994 + d4 ^= source1[4];
14995 + d5 ^= source1[5];
14996 + d6 ^= source1[6];
14997 + d7 ^= source1[7];
14998 + d0 ^= source2[0];
14999 + d1 ^= source2[1];
15000 + d2 ^= source2[2];
15001 + d3 ^= source2[3];
15002 + d4 ^= source2[4];
15003 + d5 ^= source2[5];
15004 + d6 ^= source2[6];
15005 + d7 ^= source2[7];
15006 + d0 ^= source3[0];
15007 + d1 ^= source3[1];
15008 + d2 ^= source3[2];
15009 + d3 ^= source3[3];
15010 + d4 ^= source3[4];
15011 + d5 ^= source3[5];
15012 + d6 ^= source3[6];
15013 + d7 ^= source3[7];
15014 + d0 ^= source4[0];
15015 + d1 ^= source4[1];
15016 + d2 ^= source4[2];
15017 + d3 ^= source4[3];
15018 + d4 ^= source4[4];
15019 + d5 ^= source4[5];
15020 + d6 ^= source4[6];
15021 + d7 ^= source4[7];
15022 + destp[0] = d0; /* Store the result (in bursts) */
15026 + destp[4] = d4; /* Store the result (in bursts) */
15041 + * (the -6*32 shift factor colors the cache)
15043 +#define SIZE (PAGE_SIZE-6*32)
15045 +static void xor_speed ( struct xor_block_template * func,
15046 + struct buffer_head *b1, struct buffer_head *b2)
15049 + unsigned long now;
15050 + int i, count, max;
15051 + struct buffer_head *bh_ptr[6];
15053 + func->next = xor_functions;
15054 + xor_functions = func;
15059 + * count the number of XORs done during a whole jiffy.
15060 + * calculate the speed of checksumming from this.
15061 + * (we use a 2-page allocation to have guaranteed
15062 + * color L1-cache layout)
15065 + for (i = 0; i < 5; i++) {
15068 + while (jiffies == now) {
15070 + func->xor_block(2,bh_ptr);
15079 + speed = max * (HZ*SIZE/1024);
15080 + func->speed = speed;
15082 + printk( " %-10s: %5d.%03d MB/sec\n", func->name,
15083 + speed / 1000, speed % 1000);
15086 +static inline void pick_fastest_function(void)
15088 + struct xor_block_template *f, *fastest;
15090 + fastest = xor_functions;
15091 + for (f = fastest; f; f = f->next) {
15092 + if (f->speed > fastest->speed)
15095 +#ifdef CONFIG_X86_XMM
15096 + if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
15097 + fastest = &t_xor_block_pIII_kni;
15100 + xor_block = fastest->xor_block;
15101 + printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
15102 + fastest->speed / 1000, fastest->speed % 1000);
15106 +void calibrate_xor_block(void)
15108 + struct buffer_head b1, b2;
15110 + memset(&b1,0,sizeof(b1));
15113 + b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2);
15114 + if (!b1.b_data) {
15115 + pick_fastest_function();
15118 + b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE;
15120 + b1.b_size = SIZE;
15122 + printk(KERN_INFO "raid5: measuring checksumming speed\n");
15124 + sti(); /* should be safe */
15126 +#if defined(__sparc__) && !defined(__sparc_v9__)
15127 + printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n");
15128 + xor_speed(&t_xor_block_SPARC,&b1,&b2);
15131 +#ifdef CONFIG_X86_XMM
15132 + if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
15134 + "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n");
15135 + /* we force the use of the KNI xor block because it
15136 + can write around l2. we may also be able
15137 + to load into the l1 only depending on how
15138 + the cpu deals with a load to a line that is
15139 + being prefetched.
15141 + xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
15143 +#endif /* CONFIG_X86_XMM */
15147 + if (md_cpu_has_mmx()) {
15149 + "raid5: MMX detected, trying high-speed MMX checksum routines\n");
15150 + xor_speed(&t_xor_block_pII_mmx,&b1,&b2);
15151 + xor_speed(&t_xor_block_p5_mmx,&b1,&b2);
15154 +#endif /* __i386__ */
15157 + xor_speed(&t_xor_block_8regs,&b1,&b2);
15158 + xor_speed(&t_xor_block_32regs,&b1,&b2);
15160 + free_pages((unsigned long)b1.b_data,2);
15161 + pick_fastest_function();
15164 +#else /* __sparc_v9__ */
15166 +void calibrate_xor_block(void)
15168 + printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n");
15169 + xor_block = xor_block_VIS;
15172 +#endif /* __sparc_v9__ */
15174 +MD_EXPORT_SYMBOL(xor_block);
15176 --- linux/arch/i386/defconfig.orig Fri Nov 23 11:17:42 2001
15177 +++ linux/arch/i386/defconfig Fri Nov 23 11:18:20 2001
15180 # CONFIG_BLK_DEV_LOOP is not set
15181 # CONFIG_BLK_DEV_NBD is not set
15182 -# CONFIG_BLK_DEV_MD is not set
15183 +CONFIG_BLK_DEV_MD=y
15184 +CONFIG_AUTODETECT_RAID=y
15185 +CONFIG_MD_TRANSLUCENT=y
15186 +CONFIG_MD_LINEAR=y
15187 +CONFIG_MD_STRIPED=y
15188 +CONFIG_MD_MIRRORING=y
15191 +CONFIG_BLK_DEV_HSM=y
15192 # CONFIG_BLK_DEV_RAM is not set
15193 # CONFIG_BLK_DEV_XD is not set
15194 # CONFIG_BLK_DEV_DAC960 is not set
15195 --- linux/arch/sparc/config.in.orig Fri Nov 23 11:17:42 2001
15196 +++ linux/arch/sparc/config.in Fri Nov 23 11:18:20 2001
15197 @@ -88,10 +88,16 @@
15199 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
15200 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
15201 + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
15202 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
15203 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
15204 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
15205 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
15206 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
15207 + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
15209 +if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
15210 + bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
15213 tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
15214 --- linux/arch/sparc/defconfig.orig Fri Nov 23 11:17:42 2001
15215 +++ linux/arch/sparc/defconfig Fri Nov 23 11:18:20 2001
15216 @@ -89,10 +89,13 @@
15218 CONFIG_BLK_DEV_FD=y
15219 CONFIG_BLK_DEV_MD=y
15220 +# CONFIG_AUTODETECT_RAID is not set
15222 CONFIG_MD_STRIPED=m
15223 CONFIG_MD_MIRRORING=m
15225 +# CONFIG_MD_TRANSLUCENT is not set
15226 +# CONFIG_MD_HSM is not set
15227 CONFIG_BLK_DEV_RAM=y
15228 CONFIG_BLK_DEV_RAM_SIZE=4096
15229 CONFIG_BLK_DEV_INITRD=y
15230 --- linux/arch/sparc64/config.in.orig Fri Nov 23 11:17:42 2001
15231 +++ linux/arch/sparc64/config.in Fri Nov 23 11:18:20 2001
15232 @@ -103,10 +103,16 @@
15234 bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD
15235 if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then
15236 + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID
15237 tristate ' Linear (append) mode' CONFIG_MD_LINEAR
15238 tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED
15239 tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING
15240 tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5
15241 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT
15242 + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM
15244 +if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then
15245 + bool ' Boot support (linear, striped)' CONFIG_MD_BOOT
15248 tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
15249 --- linux/arch/sparc64/defconfig.orig Fri Nov 23 11:17:42 2001
15250 +++ linux/arch/sparc64/defconfig Fri Nov 23 11:18:20 2001
15251 @@ -107,10 +107,13 @@
15253 CONFIG_BLK_DEV_FD=y
15254 CONFIG_BLK_DEV_MD=y
15255 +# CONFIG_AUTODETECT_RAID is not set
15257 CONFIG_MD_STRIPED=m
15258 CONFIG_MD_MIRRORING=m
15260 +# CONFIG_MD_TRANSLUCENT is not set
15261 +# CONFIG_MD_HSM is not set
15262 CONFIG_BLK_DEV_RAM=y
15263 CONFIG_BLK_DEV_RAM_SIZE=4096
15264 CONFIG_BLK_DEV_INITRD=y
15265 --- linux/Documentation/Configure.help.orig Fri Nov 23 11:17:44 2001
15266 +++ linux/Documentation/Configure.help Fri Nov 23 11:18:20 2001
15267 @@ -1054,6 +1054,13 @@
15271 +Autodetect RAID partitions
15272 +CONFIG_AUTODETECT_RAID
15273 + This feature lets the kernel detect RAID partitions on bootup.
15274 + An autodetect RAID partition is a normal partition with partition
15275 + type 0xfd. Use this if you want to boot RAID devices, or want to
15276 + run them automatically.
15278 Linear (append) mode
15280 If you say Y here, then your multiple devices driver will be able to
15281 @@ -1132,6 +1139,21 @@
15282 Documentation/modules.txt.
15286 +Translucent Block Device Support (EXPERIMENTAL)
15287 +CONFIG_MD_TRANSLUCENT
15288 + DO NOT USE THIS STUFF YET!
15290 + Currently there is only a placeholder, as the implementation
15291 + is not yet usable.
15293 +Hierarchical Storage Management support (EXPERIMENTAL)
15295 + DO NOT USE THIS STUFF YET!
15297 + I have released this so people can comment on the architecture,
15298 + but the user-space tools are still unusable, so there is not much
15299 + you can do with this yet.
15301 Boot support (linear, striped)