1 cvs dm 1.0
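Device-mapper is a low-level volume manager driven from userspace through ioctls
on a control device.  The diff below adds the driver under drivers/md, registers
the DM_* commands for 32-bit ioctl emulation on the 64-bit architectures, and
frees buffer_head->b_private for dm-io by moving JBD onto a new b_journal_head
field.  As a rough sketch of how the interface added below is used (the struct
field and macro names are assumed from this patch's linux/dm-ioctl.h, and a
/dev/mapper/control node is assumed to exist; real tools go through
libdevmapper/dmsetup rather than raw ioctls), userspace can ask the driver which
ioctl version it speaks roughly like this:

    /* Illustrative sketch only: query the device-mapper ioctl version. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/dm-ioctl.h>

    int main(void)
    {
        struct dm_ioctl dmi;
        int fd = open("/dev/mapper/control", O_RDWR);   /* needs root */

        if (fd < 0)
            return 1;

        memset(&dmi, 0, sizeof(dmi));
        /* Tell the kernel which interface version this program was built for;
         * the DM_VERSION call fills the fields back in with the kernel's own. */
        dmi.version[0] = DM_VERSION_MAJOR;
        dmi.version[1] = DM_VERSION_MINOR;
        dmi.version[2] = DM_VERSION_PATCHLEVEL;
        dmi.data_size = sizeof(dmi);

        if (ioctl(fd, DM_VERSION, &dmi) < 0)
            return 1;

        printf("dm ioctl interface %u.%u.%u\n",
               dmi.version[0], dmi.version[1], dmi.version[2]);
        return 0;
    }
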
2 --- diff/Documentation/Configure.help   2003-10-10 23:39:03.000000000 +0100
3 +++ source/Documentation/Configure.help 2003-10-16 10:44:23.000000000 +0100
4 @@ -1912,6 +1912,20 @@
5    want), say M here and read <file:Documentation/modules.txt>.  The
6    module will be called lvm-mod.o.
7  
8 +Device-mapper support
9 +CONFIG_BLK_DEV_DM
10 +  Device-mapper is a low level volume manager.  It works by allowing
11 +  people to specify mappings for ranges of logical sectors.  Various
12 +  mapping types are available, in addition people may write their own
13 +  modules containing custom mappings if they wish.
14 +
15 +  Higher level volume managers such as LVM2 use this driver.
16 +
17 +  If you want to compile this as a module, say M here and read 
18 +  <file:Documentation/modules.txt>.  The module will be called dm-mod.o.
19 +
20 +  If unsure, say N.
21 +
22  Multiple devices driver support (RAID and LVM)
23  CONFIG_MD
24    Support multiple physical spindles through a single logical device.
25 --- diff/MAINTAINERS    2003-10-10 23:39:03.000000000 +0100
26 +++ source/MAINTAINERS  2003-10-16 10:44:23.000000000 +0100
27 @@ -572,6 +572,13 @@
28  W:     http://www.debian.org/~dz/i8k/
29  S:     Maintained
30  
31 +DEVICE MAPPER
32 +P:     Joe Thornber
33 +M:     dm@uk.sistina.com
34 +L:     linux-LVM@sistina.com
35 +W:     http://www.sistina.com/lvm
36 +S:     Maintained
37 +
38  DEVICE NUMBER REGISTRY
39  P:     H. Peter Anvin
40  M:     hpa@zytor.com
41 --- diff/arch/mips64/kernel/ioctl32.c   2003-08-26 13:50:03.000000000 +0100
42 +++ source/arch/mips64/kernel/ioctl32.c 2003-10-16 10:44:23.000000000 +0100
43 @@ -36,6 +36,7 @@
44  #include <linux/soundcard.h>
45  
46  #include <linux/mtd/mtd.h>
47 +#include <linux/dm-ioctl.h>
48  #include <linux/serial.h>
49  
50  #ifdef CONFIG_SIBYTE_TBPROF
51 @@ -1228,6 +1229,22 @@
52         IOCTL32_DEFAULT(SBPROF_ZBWAITFULL),
53  #endif /* CONFIG_SIBYTE_TBPROF */
54  
55 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
56 +       IOCTL32_DEFAULT(DM_VERSION),
57 +       IOCTL32_DEFAULT(DM_REMOVE_ALL),
58 +       IOCTL32_DEFAULT(DM_DEV_CREATE),
59 +       IOCTL32_DEFAULT(DM_DEV_REMOVE),
60 +       IOCTL32_DEFAULT(DM_TABLE_LOAD),
61 +       IOCTL32_DEFAULT(DM_DEV_SUSPEND),
62 +       IOCTL32_DEFAULT(DM_DEV_RENAME),
63 +       IOCTL32_DEFAULT(DM_TABLE_DEPS),
64 +       IOCTL32_DEFAULT(DM_DEV_STATUS),
65 +       IOCTL32_DEFAULT(DM_TABLE_STATUS),
66 +       IOCTL32_DEFAULT(DM_DEV_WAIT),
67 +       IOCTL32_DEFAULT(DM_LIST_DEVICES),
68 +       IOCTL32_DEFAULT(DM_TABLE_CLEAR),
69 +#endif /* CONFIG_BLK_DEV_DM */
70 +
71         IOCTL32_DEFAULT(MTIOCTOP),                      /* mtio.h ioctls  */
72         IOCTL32_HANDLER(MTIOCGET32, mt_ioctl_trans),
73         IOCTL32_HANDLER(MTIOCPOS32, mt_ioctl_trans),
74 --- diff/arch/parisc/kernel/ioctl32.c   2003-08-26 13:50:03.000000000 +0100
75 +++ source/arch/parisc/kernel/ioctl32.c 2003-10-16 10:44:23.000000000 +0100
76 @@ -55,6 +55,7 @@
77  #define max max */
78  #include <linux/lvm.h>
79  #endif /* LVM */
80 +#include <linux/dm-ioctl.h>
81  
82  #include <scsi/scsi.h>
83  /* Ugly hack. */
84 @@ -3423,6 +3424,22 @@
85  COMPATIBLE_IOCTL(LV_BMAP)
86  COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
87  #endif /* LVM */
88 +/* Device-Mapper */
89 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
90 +COMPATIBLE_IOCTL(DM_VERSION)
91 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
92 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
93 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
94 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
95 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
96 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
97 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
98 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
99 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
100 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
101 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
102 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
103 +#endif /* CONFIG_BLK_DEV_DM */
104  #if defined(CONFIG_DRM) || defined(CONFIG_DRM_MODULE)
105  COMPATIBLE_IOCTL(DRM_IOCTL_GET_MAGIC)
106  COMPATIBLE_IOCTL(DRM_IOCTL_IRQ_BUSID)
107 --- diff/arch/ppc64/kernel/ioctl32.c    2003-08-26 13:50:04.000000000 +0100
108 +++ source/arch/ppc64/kernel/ioctl32.c  2003-10-16 10:44:23.000000000 +0100
109 @@ -66,6 +66,7 @@
110  #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
111  #include <linux/lvm.h>
112  #endif /* LVM */
113 +#include <linux/dm-ioctl.h>
114  
115  #include <scsi/scsi.h>
116  /* Ugly hack. */
117 @@ -4435,6 +4436,22 @@
118  COMPATIBLE_IOCTL(NBD_PRINT_DEBUG),
119  COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS),
120  COMPATIBLE_IOCTL(NBD_DISCONNECT),
121 +/* device-mapper */
122 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
123 +COMPATIBLE_IOCTL(DM_VERSION),
124 +COMPATIBLE_IOCTL(DM_REMOVE_ALL),
125 +COMPATIBLE_IOCTL(DM_DEV_CREATE),
126 +COMPATIBLE_IOCTL(DM_DEV_REMOVE),
127 +COMPATIBLE_IOCTL(DM_TABLE_LOAD),
128 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND),
129 +COMPATIBLE_IOCTL(DM_DEV_RENAME),
130 +COMPATIBLE_IOCTL(DM_TABLE_DEPS),
131 +COMPATIBLE_IOCTL(DM_DEV_STATUS),
132 +COMPATIBLE_IOCTL(DM_TABLE_STATUS),
133 +COMPATIBLE_IOCTL(DM_DEV_WAIT),
134 +COMPATIBLE_IOCTL(DM_LIST_DEVICES),
135 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR),
136 +#endif /* CONFIG_BLK_DEV_DM */
137  /* Remove *PRIVATE in 2.5 */
138  COMPATIBLE_IOCTL(SIOCDEVPRIVATE),
139  COMPATIBLE_IOCTL(SIOCDEVPRIVATE+1),
140 --- diff/arch/s390x/kernel/ioctl32.c    2003-08-26 13:50:04.000000000 +0100
141 +++ source/arch/s390x/kernel/ioctl32.c  2003-10-16 10:44:23.000000000 +0100
142 @@ -30,6 +30,7 @@
143  #include <linux/blk.h>
144  #include <linux/elevator.h>
145  #include <linux/raw.h>
146 +#include <linux/dm-ioctl.h>
147  #include <asm/types.h>
148  #include <asm/uaccess.h>
149  #include <asm/dasd.h>
150 @@ -627,6 +628,20 @@
151  
152         IOCTL32_DEFAULT(SIOCGSTAMP),
153  
154 +       IOCTL32_DEFAULT(DM_VERSION),
155 +       IOCTL32_DEFAULT(DM_REMOVE_ALL),
156 +       IOCTL32_DEFAULT(DM_DEV_CREATE),
157 +       IOCTL32_DEFAULT(DM_DEV_REMOVE),
158 +       IOCTL32_DEFAULT(DM_TABLE_LOAD),
159 +       IOCTL32_DEFAULT(DM_DEV_SUSPEND),
160 +       IOCTL32_DEFAULT(DM_DEV_RENAME),
161 +       IOCTL32_DEFAULT(DM_TABLE_DEPS),
162 +       IOCTL32_DEFAULT(DM_DEV_STATUS),
163 +       IOCTL32_DEFAULT(DM_TABLE_STATUS),
164 +       IOCTL32_DEFAULT(DM_DEV_WAIT),
165 +       IOCTL32_DEFAULT(DM_LIST_DEVICES),
166 +       IOCTL32_DEFAULT(DM_TABLE_CLEAR),
167 +
168         IOCTL32_DEFAULT(LOOP_SET_FD),
169         IOCTL32_DEFAULT(LOOP_CLR_FD),
170  
171 --- diff/arch/sparc64/kernel/ioctl32.c  2003-10-10 23:39:05.000000000 +0100
172 +++ source/arch/sparc64/kernel/ioctl32.c        2003-10-16 10:44:23.000000000 +0100
173 @@ -56,6 +56,7 @@
174  #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
175  #include <linux/lvm.h>
176  #endif /* LVM */
177 +#include <linux/dm-ioctl.h>
178  
179  #include <scsi/scsi.h>
180  /* Ugly hack. */
181 @@ -5086,6 +5087,22 @@
182  COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
183  COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS)
184  COMPATIBLE_IOCTL(NBD_DISCONNECT)
185 +/* device-mapper */
186 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
187 +COMPATIBLE_IOCTL(DM_VERSION)
188 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
189 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
190 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
191 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
192 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
193 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
194 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
195 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
196 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
197 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
198 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
199 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
200 +#endif /* CONFIG_BLK_DEV_DM */
201  /* Linux-1394 */
202  #if defined(CONFIG_IEEE1394) || defined(CONFIG_IEEE1394_MODULE)
203  COMPATIBLE_IOCTL(AMDTP_IOC_CHANNEL)
204 --- diff/arch/x86_64/ia32/ia32_ioctl.c  2003-10-10 23:39:05.000000000 +0100
205 +++ source/arch/x86_64/ia32/ia32_ioctl.c        2003-10-16 10:44:23.000000000 +0100
206 @@ -67,6 +67,7 @@
207  #define max max
208  #include <linux/lvm.h>
209  #endif /* LVM */
210 +#include <linux/dm-ioctl.h>
211  
212  #include <scsi/scsi.h>
213  /* Ugly hack. */
214 @@ -4051,6 +4052,22 @@
215  COMPATIBLE_IOCTL(LV_BMAP)
216  COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
217  #endif /* LVM */
218 +/* Device-Mapper */
219 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
220 +COMPATIBLE_IOCTL(DM_VERSION)
221 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
222 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
223 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
224 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
225 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
226 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
227 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
228 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
229 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
230 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
231 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
232 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
233 +#endif /* CONFIG_BLK_DEV_DM */
234  #ifdef CONFIG_AUTOFS_FS
235  COMPATIBLE_IOCTL(AUTOFS_IOC_READY)
236  COMPATIBLE_IOCTL(AUTOFS_IOC_FAIL)
237 --- diff/drivers/md/Config.in   2001-09-26 16:15:05.000000000 +0100
238 +++ source/drivers/md/Config.in 2003-10-16 10:44:23.000000000 +0100
239 @@ -14,5 +14,9 @@
240  dep_tristate '  Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
241  
242  dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
243 +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
244 +   dep_tristate ' Device-mapper support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM $CONFIG_MD
245 +   dep_tristate '  Mirror (RAID-1) support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM
246 +fi
247  
248  endmenu
249 --- diff/drivers/md/Makefile    2002-01-17 10:07:52.000000000 +0000
250 +++ source/drivers/md/Makefile  2003-10-16 10:44:23.000000000 +0100
251 @@ -4,24 +4,41 @@
252  
253  O_TARGET       := mddev.o
254  
255 -export-objs    := md.o xor.o
256 -list-multi     := lvm-mod.o
257 +export-objs    := md.o xor.o dm-table.o dm-target.o kcopyd.o dm-daemon.o \
258 +                  dm-log.o dm-io.o dm.o
259 +
260 +list-multi     := lvm-mod.o dm-mod.o dm-mirror-mod.o
261  lvm-mod-objs   := lvm.o lvm-snap.o lvm-fs.o
262 +dm-mod-objs    := dm.o dm-table.o dm-target.o dm-ioctl.o \
263 +                  dm-linear.o dm-stripe.o dm-snapshot.o dm-exception-store.o \
264 +                  kcopyd.o dm-daemon.o dm-io.o
265 +dm-mirror-mod-objs := dm-raid1.o dm-log.o
266  
267  # Note: link order is important.  All raid personalities
268  # and xor.o must come before md.o, as they each initialise 
269  # themselves, and md.o may use the personalities when it 
270  # auto-initialised.
271  
272 -obj-$(CONFIG_MD_LINEAR)                += linear.o
273 -obj-$(CONFIG_MD_RAID0)         += raid0.o
274 -obj-$(CONFIG_MD_RAID1)         += raid1.o
275 -obj-$(CONFIG_MD_RAID5)         += raid5.o xor.o
276 -obj-$(CONFIG_MD_MULTIPATH)     += multipath.o
277 -obj-$(CONFIG_BLK_DEV_MD)       += md.o
278 -obj-$(CONFIG_BLK_DEV_LVM)      += lvm-mod.o
279 +obj-$(CONFIG_MD_LINEAR)                        += linear.o
280 +obj-$(CONFIG_MD_RAID0)                 += raid0.o
281 +obj-$(CONFIG_MD_RAID1)                 += raid1.o
282 +obj-$(CONFIG_MD_RAID5)                 += raid5.o xor.o
283 +obj-$(CONFIG_MD_MULTIPATH)             += multipath.o
284 +obj-$(CONFIG_BLK_DEV_MD)               += md.o
285 +
286 +obj-$(CONFIG_BLK_DEV_LVM)              += lvm-mod.o
287 +
288 +obj-$(CONFIG_BLK_DEV_DM)               += dm-mod.o
289 +obj-$(CONFIG_BLK_DEV_DM_MIRROR)                += dm-mirror.o
290  
291  include $(TOPDIR)/Rules.make
292  
293  lvm-mod.o: $(lvm-mod-objs)
294         $(LD) -r -o $@ $(lvm-mod-objs)
295 +
296 +dm-mod.o: $(dm-mod-objs)
297 +       $(LD) -r -o $@ $(dm-mod-objs)
298 +
299 +dm-mirror.o: $(dm-mirror-mod-objs)
300 +       $(LD) -r -o $@ $(dm-mirror-mod-objs)
301 +
302 --- diff/fs/buffer.c    2003-10-10 23:39:08.000000000 +0100
303 +++ source/fs/buffer.c  2003-10-16 10:44:23.000000000 +0100
304 @@ -763,6 +763,7 @@
305         bh->b_list = BUF_CLEAN;
306         bh->b_end_io = handler;
307         bh->b_private = private;
308 +       bh->b_journal_head = NULL;
309  }
310  
311  static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
312 --- diff/fs/jbd/journal.c       2003-10-10 23:39:08.000000000 +0100
313 +++ source/fs/jbd/journal.c     2003-10-16 10:44:23.000000000 +0100
314 @@ -1802,9 +1802,9 @@
315  
316                 if (buffer_jbd(bh)) {
317                         /* Someone did it for us! */
318 -                       J_ASSERT_BH(bh, bh->b_private != NULL);
319 +                       J_ASSERT_BH(bh, bh->b_journal_head != NULL);
320                         journal_free_journal_head(jh);
321 -                       jh = bh->b_private;
322 +                       jh = bh->b_journal_head;
323                 } else {
324                         /*
325                          * We actually don't need jh_splice_lock when
326 @@ -1812,7 +1812,7 @@
327                          */
328                         spin_lock(&jh_splice_lock);
329                         set_bit(BH_JBD, &bh->b_state);
330 -                       bh->b_private = jh;
331 +                       bh->b_journal_head = jh;
332                         jh->b_bh = bh;
333                         atomic_inc(&bh->b_count);
334                         spin_unlock(&jh_splice_lock);
335 @@ -1821,7 +1821,7 @@
336         }
337         jh->b_jcount++;
338         spin_unlock(&journal_datalist_lock);
339 -       return bh->b_private;
340 +       return bh->b_journal_head;
341  }
342  
343  /*
344 @@ -1854,7 +1854,7 @@
345                         J_ASSERT_BH(bh, jh2bh(jh) == bh);
346                         BUFFER_TRACE(bh, "remove journal_head");
347                         spin_lock(&jh_splice_lock);
348 -                       bh->b_private = NULL;
349 +                       bh->b_journal_head = NULL;
350                         jh->b_bh = NULL;        /* debug, really */
351                         clear_bit(BH_JBD, &bh->b_state);
352                         __brelse(bh);
353 --- diff/include/linux/fs.h     2003-10-10 23:39:08.000000000 +0100
354 +++ source/include/linux/fs.h   2003-10-16 10:44:23.000000000 +0100
355 @@ -265,7 +265,7 @@
356         struct page *b_page;            /* the page this bh is mapped to */
357         void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
358         void *b_private;                /* reserved for b_end_io */
359 -
360 +       void *b_journal_head;           /* ext3 journal_heads */
361         unsigned long b_rsector;        /* Real buffer location on disk */
362         wait_queue_head_t b_wait;
363  
364 --- diff/include/linux/jbd.h    2003-06-16 09:56:12.000000000 +0100
365 +++ source/include/linux/jbd.h  2003-10-16 10:44:23.000000000 +0100
366 @@ -311,7 +311,7 @@
367  
368  static inline struct journal_head *bh2jh(struct buffer_head *bh)
369  {
370 -       return bh->b_private;
371 +       return bh->b_journal_head;
372  }
373  
374  #define HAVE_JOURNAL_CALLBACK_STATUS
375 --- diff/include/linux/vmalloc.h        2003-08-26 13:50:14.000000000 +0100
376 +++ source/include/linux/vmalloc.h      2003-10-16 10:44:23.000000000 +0100
377 @@ -29,6 +29,7 @@
378  extern void vmfree_area_pages(unsigned long address, unsigned long size);
379  extern int vmalloc_area_pages(unsigned long address, unsigned long size,
380                                int gfp_mask, pgprot_t prot);
381 +extern void *vcalloc(unsigned long nmemb, unsigned long elem_size);
382  
383  /*
384   *     Allocate any pages
385 --- diff/kernel/ksyms.c 2003-10-10 23:39:08.000000000 +0100
386 +++ source/kernel/ksyms.c       2003-10-16 10:44:23.000000000 +0100
387 @@ -114,6 +114,7 @@
388  EXPORT_SYMBOL(__vmalloc);
389  EXPORT_SYMBOL(vmap);
390  EXPORT_SYMBOL(vmalloc_to_page);
391 +EXPORT_SYMBOL(vcalloc);
392  EXPORT_SYMBOL(mem_map);
393  EXPORT_SYMBOL(remap_page_range);
394  EXPORT_SYMBOL(max_mapnr);
395 --- diff/mm/Makefile    2002-08-05 14:57:44.000000000 +0100
396 +++ source/mm/Makefile  2003-10-16 10:44:23.000000000 +0100
397 @@ -9,12 +9,12 @@
398  
399  O_TARGET := mm.o
400  
401 -export-objs := shmem.o filemap.o memory.o page_alloc.o
402 +export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o
403  
404  obj-y   := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
405             vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
406             page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
407 -           shmem.o
408 +           shmem.o mempool.o
409  
410  obj-$(CONFIG_HIGHMEM) += highmem.o
411  
412 --- diff/mm/filemap.c   2003-10-10 23:39:08.000000000 +0100
413 +++ source/mm/filemap.c 2003-10-16 10:44:23.000000000 +0100
414 @@ -1753,7 +1753,8 @@
415                 }
416                 up(&inode->i_sem);
417                 up_read(&inode->i_alloc_sem);
418 -               UPDATE_ATIME(filp->f_dentry->d_inode);
419 +               if (!S_ISBLK(inode->i_mode))
420 +                       UPDATE_ATIME(filp->f_dentry->d_inode);
421                 goto out;
422         }
423  }
424 @@ -3131,8 +3132,12 @@
425                 goto out;
426  
427         remove_suid(inode);
428 -       inode->i_ctime = inode->i_mtime = CURRENT_TIME;
429 -       mark_inode_dirty_sync(inode);
430 +
431 +       /* Don't update times for block devices using O_DIRECT */
432 +       if (!(file->f_flags & O_DIRECT) || !S_ISBLK(inode->i_mode)) {
433 +               inode->i_ctime = inode->i_mtime = CURRENT_TIME;
434 +               mark_inode_dirty_sync(inode);
435 +       }
436  
437         do {
438                 unsigned long index, offset;
439 --- diff/mm/vmalloc.c   2003-08-26 13:50:14.000000000 +0100
440 +++ source/mm/vmalloc.c 2003-10-16 10:44:23.000000000 +0100
441 @@ -374,3 +374,22 @@
442         read_unlock(&vmlist_lock);
443         return buf - buf_start;
444  }
445 +
446 +void *vcalloc(unsigned long nmemb, unsigned long elem_size)
447 +{
448 +       unsigned long size;
449 +       void *addr;
450 +
451 +       /*
452 +        * Check that we're not going to overflow.
453 +        */
454 +       if (nmemb > (ULONG_MAX / elem_size))
455 +               return NULL;
456 +
457 +       size = nmemb * elem_size;
458 +       addr = vmalloc(size);
459 +       if (addr)
460 +               memset(addr, 0, size);
461 +
462 +       return addr;
463 +}
464 --- diff/drivers/md/dm-daemon.c 1970-01-01 01:00:00.000000000 +0100
465 +++ source/drivers/md/dm-daemon.c       2003-10-16 10:44:23.000000000 +0100
466 @@ -0,0 +1,113 @@
467 +/*
468 + * Copyright (C) 2003 Sistina Software
469 + *
470 + * This file is released under the LGPL.
471 + */
472 +
473 +#include "dm.h"
474 +#include "dm-daemon.h"
475 +
476 +#include <linux/module.h>
477 +#include <linux/sched.h>
478 +
479 +static int daemon(void *arg)
480 +{
481 +       struct dm_daemon *dd = (struct dm_daemon *) arg;
482 +       DECLARE_WAITQUEUE(wq, current);
483 +
484 +       daemonize();
485 +       reparent_to_init();
486 +
487 +       /* block all signals */
488 +       spin_lock_irq(&current->sigmask_lock);
489 +       sigfillset(&current->blocked);
490 +       flush_signals(current);
491 +       spin_unlock_irq(&current->sigmask_lock);
492 +
493 +       strcpy(current->comm, dd->name);
494 +       atomic_set(&dd->please_die, 0);
495 +
496 +       add_wait_queue(&dd->job_queue, &wq);
497 +
498 +       down(&dd->run_lock);
499 +       up(&dd->start_lock);
500 +
501 +       /*
502 +        * dd->fn() could do anything, very likely it will
503 +        * suspend.  So we can't set the state to
504 +        * TASK_INTERRUPTIBLE before calling it.  In order to
505 +        * prevent a race with a waking thread we do this little
506 +        * dance with the dd->woken variable.
507 +        */
508 +       while (1) {
509 +               do {
510 +                       set_current_state(TASK_RUNNING);
511 +
512 +                       if (atomic_read(&dd->please_die))
513 +                               goto out;
514 +
515 +                       atomic_set(&dd->woken, 0);
516 +                       dd->fn();
517 +                       yield();
518 +
519 +                       set_current_state(TASK_INTERRUPTIBLE);
520 +               } while (atomic_read(&dd->woken));
521 +
522 +               schedule();
523 +       }
524 +
525 + out:
526 +       remove_wait_queue(&dd->job_queue, &wq);
527 +       up(&dd->run_lock);
528 +       return 0;
529 +}
530 +
531 +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void))
532 +{
533 +       pid_t pid = 0;
534 +
535 +       /*
536 +        * Initialise the dm_daemon.
537 +        */
538 +       dd->fn = fn;
539 +       strncpy(dd->name, name, sizeof(dd->name) - 1);
540 +       sema_init(&dd->start_lock, 1);
541 +       sema_init(&dd->run_lock, 1);
542 +       init_waitqueue_head(&dd->job_queue);
543 +
544 +       /*
545 +        * Start the new thread.
546 +        */
547 +       down(&dd->start_lock);
548 +       pid = kernel_thread(daemon, dd, 0);
549 +       if (pid <= 0) {
550 +               DMERR("Failed to start kcopyd thread");
551 +               return -EAGAIN;
552 +       }
553 +
554 +       /*
555 +        * wait for the daemon to up this mutex.
556 +        */
557 +       down(&dd->start_lock);
558 +       up(&dd->start_lock);
559 +
560 +       return 0;
561 +}
562 +
563 +void dm_daemon_stop(struct dm_daemon *dd)
564 +{
565 +       atomic_set(&dd->please_die, 1);
566 +       dm_daemon_wake(dd);
567 +       down(&dd->run_lock);
568 +       up(&dd->run_lock);
569 +}
570 +
571 +void dm_daemon_wake(struct dm_daemon *dd)
572 +{
573 +       atomic_set(&dd->woken, 1);
574 +       wake_up_interruptible(&dd->job_queue);
575 +}
576 +
577 +EXPORT_SYMBOL(dm_daemon_start);
578 +EXPORT_SYMBOL(dm_daemon_stop);
579 +EXPORT_SYMBOL(dm_daemon_wake);
580 --- diff/drivers/md/dm-daemon.h 1970-01-01 01:00:00.000000000 +0100
581 +++ source/drivers/md/dm-daemon.h       2003-10-16 10:44:23.000000000 +0100
582 @@ -0,0 +1,29 @@
583 +/*
584 + * Copyright (C) 2003 Sistina Software
585 + *
586 + * This file is released under the LGPL.
587 + */
588 +
589 +#ifndef DM_DAEMON_H
590 +#define DM_DAEMON_H
591 +
592 +#include <asm/atomic.h>
593 +#include <asm/semaphore.h>
594 +
595 +struct dm_daemon {
596 +       void (*fn)(void);
597 +       char name[16];
598 +       atomic_t please_die;
599 +       struct semaphore start_lock;
600 +       struct semaphore run_lock;
601 +
602 +       atomic_t woken;
603 +       wait_queue_head_t job_queue;
604 +};
605 +
606 +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void));
607 +void dm_daemon_stop(struct dm_daemon *dd);
608 +void dm_daemon_wake(struct dm_daemon *dd);
609 +int dm_daemon_running(struct dm_daemon *dd);
610 +
611 +#endif
612 --- diff/drivers/md/dm-exception-store.c        1970-01-01 01:00:00.000000000 +0100
613 +++ source/drivers/md/dm-exception-store.c      2003-10-16 10:44:23.000000000 +0100
614 @@ -0,0 +1,675 @@
615 +/*
616 + * dm-snapshot.c
617 + *
618 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
619 + *
620 + * This file is released under the GPL.
621 + */
622 +
623 +#include "dm-snapshot.h"
624 +#include "dm-io.h"
625 +#include "kcopyd.h"
626 +
627 +#include <linux/mm.h>
628 +#include <linux/pagemap.h>
629 +#include <linux/vmalloc.h>
630 +#include <linux/slab.h>
631 +
632 +/*-----------------------------------------------------------------
633 + * Persistent snapshots, by persistent we mean that the snapshot
634 + * will survive a reboot.
635 + *---------------------------------------------------------------*/
636 +
637 +/*
638 + * We need to store a record of which parts of the origin have
639 + * been copied to the snapshot device.  The snapshot code
640 + * requires that we copy exception chunks to chunk aligned areas
641 + * of the COW store.  It makes sense therefore, to store the
642 + * metadata in chunk size blocks.
643 + *
644 + * There is no backward or forward compatibility implemented,
645 + * snapshots with different disk versions than the kernel will
646 + * not be usable.  It is expected that "lvcreate" will blank out
647 + * the start of a fresh COW device before calling the snapshot
648 + * constructor.
649 + *
650 + * The first chunk of the COW device just contains the header.
651 + * After this there is a chunk filled with exception metadata,
652 + * followed by as many exception chunks as can fit in the
653 + * metadata areas.
654 + *
655 + * All on disk structures are in little-endian format.  The end
656 + * of the exceptions info is indicated by an exception with a
657 + * new_chunk of 0, which is invalid since it would point to the
658 + * header chunk.
659 + */
660 +
661 +/*
662 + * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
663 + */
664 +#define SNAP_MAGIC 0x70416e53
665 +
666 +/*
667 + * The on-disk version of the metadata.
668 + */
669 +#define SNAPSHOT_DISK_VERSION 1
670 +
671 +struct disk_header {
672 +       uint32_t magic;
673 +
674 +       /*
675 +        * Is this snapshot valid.  There is no way of recovering
676 +        * an invalid snapshot.
677 +        */
678 +       uint32_t valid;
679 +
680 +       /*
681 +        * Simple, incrementing version. no backward
682 +        * compatibility.
683 +        */
684 +       uint32_t version;
685 +
686 +       /* In sectors */
687 +       uint32_t chunk_size;
688 +};
689 +
690 +struct disk_exception {
691 +       uint64_t old_chunk;
692 +       uint64_t new_chunk;
693 +};
694 +
695 +struct commit_callback {
696 +       void (*callback)(void *, int success);
697 +       void *context;
698 +};
699 +
700 +/*
701 + * The top level structure for a persistent exception store.
702 + */
703 +struct pstore {
704 +       struct dm_snapshot *snap;       /* up pointer to my snapshot */
705 +       int version;
706 +       int valid;
707 +       uint32_t chunk_size;
708 +       uint32_t exceptions_per_area;
709 +
710 +       /*
711 +        * Now that we have an asynchronous kcopyd there is no
712 +        * need for large chunk sizes, so it wont hurt to have a
713 +        * whole chunks worth of metadata in memory at once.
714 +        */
715 +       void *area;
716 +
717 +       /*
718 +        * Used to keep track of which metadata area the data in
719 +        * 'chunk' refers to.
720 +        */
721 +       uint32_t current_area;
722 +
723 +       /*
724 +        * The next free chunk for an exception.
725 +        */
726 +       uint32_t next_free;
727 +
728 +       /*
729 +        * The index of next free exception in the current
730 +        * metadata area.
731 +        */
732 +       uint32_t current_committed;
733 +
734 +       atomic_t pending_count;
735 +       uint32_t callback_count;
736 +       struct commit_callback *callbacks;
737 +};
738 +
739 +static inline unsigned int sectors_to_pages(unsigned int sectors)
740 +{
741 +       return sectors / (PAGE_SIZE / SECTOR_SIZE);
742 +}
743 +
744 +static int alloc_area(struct pstore *ps)
745 +{
746 +       int r = -ENOMEM;
747 +       size_t i, len, nr_pages;
748 +       struct page *page, *last = NULL;
749 +
750 +       len = ps->chunk_size << SECTOR_SHIFT;
751 +
752 +       /*
753 +        * Allocate the chunk_size block of memory that will hold
754 +        * a single metadata area.
755 +        */
756 +       ps->area = vmalloc(len);
757 +       if (!ps->area)
758 +               return r;
759 +
760 +       nr_pages = sectors_to_pages(ps->chunk_size);
761 +
762 +       /*
763 +        * We lock the pages for ps->area into memory since
764 +        * they'll be doing a lot of io.  We also chain them
765 +        * together ready for dm-io.
766 +        */
767 +       for (i = 0; i < nr_pages; i++) {
768 +               page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
769 +               LockPage(page);
770 +               if (last)
771 +                       last->list.next = &page->list;
772 +               last = page;
773 +       }
774 +
775 +       return 0;
776 +}
777 +
778 +static void free_area(struct pstore *ps)
779 +{
780 +       size_t i, nr_pages;
781 +       struct page *page;
782 +
783 +       nr_pages = sectors_to_pages(ps->chunk_size);
784 +       for (i = 0; i < nr_pages; i++) {
785 +               page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
786 +               page->list.next = NULL;
787 +               UnlockPage(page);
788 +       }
789 +
790 +       vfree(ps->area);
791 +}
792 +
793 +/*
794 + * Read or write a chunk aligned and sized block of data from a device.
795 + */
796 +static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
797 +{
798 +       struct io_region where;
799 +       unsigned int bits;
800 +
801 +       where.dev = ps->snap->cow->dev;
802 +       where.sector = ps->chunk_size * chunk;
803 +       where.count = ps->chunk_size;
804 +
805 +       return dm_io_sync(1, &where, rw, vmalloc_to_page(ps->area), 0, &bits);
806 +}
807 +
808 +/*
809 + * Read or write a metadata area.  Remembering to skip the first
810 + * chunk which holds the header.
811 + */
812 +static int area_io(struct pstore *ps, uint32_t area, int rw)
813 +{
814 +       int r;
815 +       uint32_t chunk;
816 +
817 +       /* convert a metadata area index to a chunk index */
818 +       chunk = 1 + ((ps->exceptions_per_area + 1) * area);
819 +
820 +       r = chunk_io(ps, chunk, rw);
821 +       if (r)
822 +               return r;
823 +
824 +       ps->current_area = area;
825 +       return 0;
826 +}
827 +
828 +static int zero_area(struct pstore *ps, uint32_t area)
829 +{
830 +       memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
831 +       return area_io(ps, area, WRITE);
832 +}
833 +
834 +static int read_header(struct pstore *ps, int *new_snapshot)
835 +{
836 +       int r;
837 +       struct disk_header *dh;
838 +
839 +       r = chunk_io(ps, 0, READ);
840 +       if (r)
841 +               return r;
842 +
843 +       dh = (struct disk_header *) ps->area;
844 +
845 +       if (le32_to_cpu(dh->magic) == 0) {
846 +               *new_snapshot = 1;
847 +
848 +       } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
849 +               *new_snapshot = 0;
850 +               ps->valid = le32_to_cpu(dh->valid);
851 +               ps->version = le32_to_cpu(dh->version);
852 +               ps->chunk_size = le32_to_cpu(dh->chunk_size);
853 +
854 +       } else {
855 +               DMWARN("Invalid/corrupt snapshot");
856 +               r = -ENXIO;
857 +       }
858 +
859 +       return r;
860 +}
861 +
862 +static int write_header(struct pstore *ps)
863 +{
864 +       struct disk_header *dh;
865 +
866 +       memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
867 +
868 +       dh = (struct disk_header *) ps->area;
869 +       dh->magic = cpu_to_le32(SNAP_MAGIC);
870 +       dh->valid = cpu_to_le32(ps->valid);
871 +       dh->version = cpu_to_le32(ps->version);
872 +       dh->chunk_size = cpu_to_le32(ps->chunk_size);
873 +
874 +       return chunk_io(ps, 0, WRITE);
875 +}
876 +
877 +/*
878 + * Access functions for the disk exceptions, these do the endian conversions.
879 + */
880 +static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
881 +{
882 +       if (index >= ps->exceptions_per_area)
883 +               return NULL;
884 +
885 +       return ((struct disk_exception *) ps->area) + index;
886 +}
887 +
888 +static int read_exception(struct pstore *ps,
889 +                         uint32_t index, struct disk_exception *result)
890 +{
891 +       struct disk_exception *e;
892 +
893 +       e = get_exception(ps, index);
894 +       if (!e)
895 +               return -EINVAL;
896 +
897 +       /* copy it */
898 +       result->old_chunk = le64_to_cpu(e->old_chunk);
899 +       result->new_chunk = le64_to_cpu(e->new_chunk);
900 +
901 +       return 0;
902 +}
903 +
904 +static int write_exception(struct pstore *ps,
905 +                          uint32_t index, struct disk_exception *de)
906 +{
907 +       struct disk_exception *e;
908 +
909 +       e = get_exception(ps, index);
910 +       if (!e)
911 +               return -EINVAL;
912 +
913 +       /* copy it */
914 +       e->old_chunk = cpu_to_le64(de->old_chunk);
915 +       e->new_chunk = cpu_to_le64(de->new_chunk);
916 +
917 +       return 0;
918 +}
919 +
920 +/*
921 + * Registers the exceptions that are present in the current area.
922 + * 'full' is filled in to indicate if the area has been
923 + * filled.
924 + */
925 +static int insert_exceptions(struct pstore *ps, int *full)
926 +{
927 +       int r;
928 +       unsigned int i;
929 +       struct disk_exception de;
930 +
931 +       /* presume the area is full */
932 +       *full = 1;
933 +
934 +       for (i = 0; i < ps->exceptions_per_area; i++) {
935 +               r = read_exception(ps, i, &de);
936 +
937 +               if (r)
938 +                       return r;
939 +
940 +               /*
941 +                * If the new_chunk is pointing at the start of
942 +                * the COW device, where the first metadata area
943 +                * is we know that we've hit the end of the
944 +                * exceptions.  Therefore the area is not full.
945 +                */
946 +               if (de.new_chunk == 0LL) {
947 +                       ps->current_committed = i;
948 +                       *full = 0;
949 +                       break;
950 +               }
951 +
952 +               /*
953 +                * Keep track of the start of the free chunks.
954 +                */
955 +               if (ps->next_free <= de.new_chunk)
956 +                       ps->next_free = de.new_chunk + 1;
957 +
958 +               /*
959 +                * Otherwise we add the exception to the snapshot.
960 +                */
961 +               r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
962 +               if (r)
963 +                       return r;
964 +       }
965 +
966 +       return 0;
967 +}
968 +
969 +static int read_exceptions(struct pstore *ps)
970 +{
971 +       uint32_t area;
972 +       int r, full = 1;
973 +
974 +       /*
975 +        * Keeping reading chunks and inserting exceptions until
976 +        * we find a partially full area.
977 +        */
978 +       for (area = 0; full; area++) {
979 +               r = area_io(ps, area, READ);
980 +               if (r)
981 +                       return r;
982 +
983 +               r = insert_exceptions(ps, &full);
984 +               if (r)
985 +                       return r;
986 +
987 +               area++;
988 +       }
989 +
990 +       return 0;
991 +}
992 +
993 +static inline struct pstore *get_info(struct exception_store *store)
994 +{
995 +       return (struct pstore *) store->context;
996 +}
997 +
998 +static void persistent_fraction_full(struct exception_store *store,
999 +                                    sector_t *numerator, sector_t *denominator)
1000 +{
1001 +       *numerator = get_info(store)->next_free * store->snap->chunk_size;
1002 +       *denominator = get_dev_size(store->snap->cow->dev);
1003 +}
1004 +
1005 +static void persistent_destroy(struct exception_store *store)
1006 +{
1007 +       struct pstore *ps = get_info(store);
1008 +
1009 +       dm_io_put(sectors_to_pages(ps->chunk_size));
1010 +       vfree(ps->callbacks);
1011 +       free_area(ps);
1012 +       kfree(ps);
1013 +}
1014 +
1015 +static int persistent_read_metadata(struct exception_store *store)
1016 +{
1017 +       int r, new_snapshot;
1018 +       struct pstore *ps = get_info(store);
1019 +
1020 +       /*
1021 +        * Read the snapshot header.
1022 +        */
1023 +       r = read_header(ps, &new_snapshot);
1024 +       if (r)
1025 +               return r;
1026 +
1027 +       /*
1028 +        * Do we need to setup a new snapshot ?
1029 +        */
1030 +       if (new_snapshot) {
1031 +               r = write_header(ps);
1032 +               if (r) {
1033 +                       DMWARN("write_header failed");
1034 +                       return r;
1035 +               }
1036 +
1037 +               r = zero_area(ps, 0);
1038 +               if (r) {
1039 +                       DMWARN("zero_area(0) failed");
1040 +                       return r;
1041 +               }
1042 +
1043 +       } else {
1044 +               /*
1045 +                * Sanity checks.
1046 +                */
1047 +               if (!ps->valid) {
1048 +                       DMWARN("snapshot is marked invalid");
1049 +                       return -EINVAL;
1050 +               }
1051 +
1052 +               if (ps->version != SNAPSHOT_DISK_VERSION) {
1053 +                       DMWARN("unable to handle snapshot disk version %d",
1054 +                              ps->version);
1055 +                       return -EINVAL;
1056 +               }
1057 +
1058 +               /*
1059 +                * Read the metadata.
1060 +                */
1061 +               r = read_exceptions(ps);
1062 +               if (r)
1063 +                       return r;
1064 +       }
1065 +
1066 +       return 0;
1067 +}
1068 +
1069 +static int persistent_prepare(struct exception_store *store,
1070 +                             struct exception *e)
1071 +{
1072 +       struct pstore *ps = get_info(store);
1073 +       uint32_t stride;
1074 +       sector_t size = get_dev_size(store->snap->cow->dev);
1075 +
1076 +       /* Is there enough room ? */
1077 +       if (size < ((ps->next_free + 1) * store->snap->chunk_size))
1078 +               return -ENOSPC;
1079 +
1080 +       e->new_chunk = ps->next_free;
1081 +
1082 +       /*
1083 +        * Move onto the next free pending, making sure to take
1084 +        * into account the location of the metadata chunks.
1085 +        */
1086 +       stride = (ps->exceptions_per_area + 1);
1087 +       if ((++ps->next_free % stride) == 1)
1088 +               ps->next_free++;
1089 +
1090 +       atomic_inc(&ps->pending_count);
1091 +       return 0;
1092 +}
1093 +
1094 +static void persistent_commit(struct exception_store *store,
1095 +                             struct exception *e,
1096 +                             void (*callback) (void *, int success),
1097 +                             void *callback_context)
1098 +{
1099 +       int r;
1100 +       unsigned int i;
1101 +       struct pstore *ps = get_info(store);
1102 +       struct disk_exception de;
1103 +       struct commit_callback *cb;
1104 +
1105 +       de.old_chunk = e->old_chunk;
1106 +       de.new_chunk = e->new_chunk;
1107 +       write_exception(ps, ps->current_committed++, &de);
1108 +
1109 +       /*
1110 +        * Add the callback to the back of the array.  This code
1111 +        * is the only place where the callback array is
1112 +        * manipulated, and we know that it will never be called
1113 +        * multiple times concurrently.
1114 +        */
1115 +       cb = ps->callbacks + ps->callback_count++;
1116 +       cb->callback = callback;
1117 +       cb->context = callback_context;
1118 +
1119 +       /*
1120 +        * If there are no more exceptions in flight, or we have
1121 +        * filled this metadata area we commit the exceptions to
1122 +        * disk.
1123 +        */
1124 +       if (atomic_dec_and_test(&ps->pending_count) ||
1125 +           (ps->current_committed == ps->exceptions_per_area)) {
1126 +               r = area_io(ps, ps->current_area, WRITE);
1127 +               if (r)
1128 +                       ps->valid = 0;
1129 +
1130 +               for (i = 0; i < ps->callback_count; i++) {
1131 +                       cb = ps->callbacks + i;
1132 +                       cb->callback(cb->context, r == 0 ? 1 : 0);
1133 +               }
1134 +
1135 +               ps->callback_count = 0;
1136 +       }
1137 +
1138 +       /*
1139 +        * Have we completely filled the current area ?
1140 +        */
1141 +       if (ps->current_committed == ps->exceptions_per_area) {
1142 +               ps->current_committed = 0;
1143 +               r = zero_area(ps, ps->current_area + 1);
1144 +               if (r)
1145 +                       ps->valid = 0;
1146 +       }
1147 +}
1148 +
1149 +static void persistent_drop(struct exception_store *store)
1150 +{
1151 +       struct pstore *ps = get_info(store);
1152 +
1153 +       ps->valid = 0;
1154 +       if (write_header(ps))
1155 +               DMWARN("write header failed");
1156 +}
1157 +
1158 +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
1159 +{
1160 +       int r;
1161 +       struct pstore *ps;
1162 +
1163 +       r = dm_io_get(sectors_to_pages(chunk_size));
1164 +       if (r)
1165 +               return r;
1166 +
1167 +       /* allocate the pstore */
1168 +       ps = kmalloc(sizeof(*ps), GFP_KERNEL);
1169 +       if (!ps) {
1170 +               r = -ENOMEM;
1171 +               goto bad;
1172 +       }
1173 +
1174 +       ps->snap = store->snap;
1175 +       ps->valid = 1;
1176 +       ps->version = SNAPSHOT_DISK_VERSION;
1177 +       ps->chunk_size = chunk_size;
1178 +       ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
1179 +           sizeof(struct disk_exception);
1180 +       ps->next_free = 2;      /* skipping the header and first area */
1181 +       ps->current_committed = 0;
1182 +
1183 +       r = alloc_area(ps);
1184 +       if (r)
1185 +               goto bad;
1186 +
1187 +       /*
1188 +        * Allocate space for all the callbacks.
1189 +        */
1190 +       ps->callback_count = 0;
1191 +       atomic_set(&ps->pending_count, 0);
1192 +       ps->callbacks = vcalloc(ps->exceptions_per_area,
1193 +                               sizeof(*ps->callbacks));
1194 +
1195 +       if (!ps->callbacks) {
1196 +               r = -ENOMEM;
1197 +               goto bad;
1198 +       }
1199 +
1200 +       store->destroy = persistent_destroy;
1201 +       store->read_metadata = persistent_read_metadata;
1202 +       store->prepare_exception = persistent_prepare;
1203 +       store->commit_exception = persistent_commit;
1204 +       store->drop_snapshot = persistent_drop;
1205 +       store->fraction_full = persistent_fraction_full;
1206 +       store->context = ps;
1207 +
1208 +       return 0;
1209 +
1210 +      bad:
1211 +       dm_io_put(sectors_to_pages(chunk_size));
1212 +       if (ps) {
1213 +               if (ps->callbacks)
1214 +                       vfree(ps->callbacks);
1215 +
1216 +               kfree(ps);
1217 +       }
1218 +       return r;
1219 +}
1220 +
1221 +/*-----------------------------------------------------------------
1222 + * Implementation of the store for non-persistent snapshots.
1223 + *---------------------------------------------------------------*/
1224 +struct transient_c {
1225 +       sector_t next_free;
1226 +};
1227 +
1228 +void transient_destroy(struct exception_store *store)
1229 +{
1230 +       kfree(store->context);
1231 +}
1232 +
1233 +int transient_read_metadata(struct exception_store *store)
1234 +{
1235 +       return 0;
1236 +}
1237 +
1238 +int transient_prepare(struct exception_store *store, struct exception *e)
1239 +{
1240 +       struct transient_c *tc = (struct transient_c *) store->context;
1241 +       sector_t size = get_dev_size(store->snap->cow->dev);
1242 +
1243 +       if (size < (tc->next_free + store->snap->chunk_size))
1244 +               return -1;
1245 +
1246 +       e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
1247 +       tc->next_free += store->snap->chunk_size;
1248 +
1249 +       return 0;
1250 +}
1251 +
1252 +void transient_commit(struct exception_store *store,
1253 +                     struct exception *e,
1254 +                     void (*callback) (void *, int success),
1255 +                     void *callback_context)
1256 +{
1257 +       /* Just succeed */
1258 +       callback(callback_context, 1);
1259 +}
1260 +
1261 +static void transient_fraction_full(struct exception_store *store,
1262 +                                   sector_t *numerator, sector_t *denominator)
1263 +{
1264 +       *numerator = ((struct transient_c *) store->context)->next_free;
1265 +       *denominator = get_dev_size(store->snap->cow->dev);
1266 +}
1267 +
1268 +int dm_create_transient(struct exception_store *store,
1269 +                       struct dm_snapshot *s, int blocksize)
1270 +{
1271 +       struct transient_c *tc;
1272 +
1273 +       memset(store, 0, sizeof(*store));
1274 +       store->destroy = transient_destroy;
1275 +       store->read_metadata = transient_read_metadata;
1276 +       store->prepare_exception = transient_prepare;
1277 +       store->commit_exception = transient_commit;
1278 +       store->fraction_full = transient_fraction_full;
1279 +       store->snap = s;
1280 +
1281 +       tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
1282 +       if (!tc)
1283 +               return -ENOMEM;
1284 +
1285 +       tc->next_free = 0;
1286 +       store->context = tc;
1287 +
1288 +       return 0;
1289 +}
1290 --- diff/drivers/md/dm-io.c     1970-01-01 01:00:00.000000000 +0100
1291 +++ source/drivers/md/dm-io.c   2003-10-16 10:44:23.000000000 +0100
1292 @@ -0,0 +1,344 @@
1293 +/*
1294 + * Copyright (C) 2003 Sistina Software
1295 + *
1296 + * This file is released under the GPL.
1297 + */
1298 +
1299 +#include "dm-io.h"
1300 +
1301 +#include <linux/mempool.h>
1302 +#include <linux/module.h>
1303 +#include <linux/slab.h>
1304 +#include <linux/sched.h>
1305 +
1306 +/* FIXME: can we shrink this ? */
1307 +struct io_context {
1308 +       int rw;
1309 +       unsigned int error;
1310 +       atomic_t count;
1311 +       struct task_struct *sleeper;
1312 +       io_notify_fn callback;
1313 +       void *context;
1314 +};
1315 +
1316 +/*
1317 + * We maintain a pool of buffer heads for dispatching the io.
1318 + */
1319 +static unsigned int _num_bhs;
1320 +static mempool_t *_buffer_pool;
1321 +
1322 +/*
1323 + * io contexts are only dynamically allocated for asynchronous
1324 + * io.  Since async io is likely to be the majority of io we'll
1325 + * have the same number of io contexts as buffer heads ! (FIXME:
1326 + * must reduce this).
1327 + */
1328 +mempool_t *_io_pool;
1329 +
1330 +static void *alloc_bh(int gfp_mask, void *pool_data)
1331 +{
1332 +       struct buffer_head *bh;
1333 +
1334 +       bh = kmem_cache_alloc(bh_cachep, gfp_mask);
1335 +       if (bh) {
1336 +               bh->b_reqnext = NULL;
1337 +               init_waitqueue_head(&bh->b_wait);
1338 +               INIT_LIST_HEAD(&bh->b_inode_buffers);
1339 +       }
1340 +
1341 +       return bh;
1342 +}
1343 +
1344 +static void *alloc_io(int gfp_mask, void *pool_data)
1345 +{
1346 +       return kmalloc(sizeof(struct io_context), gfp_mask);
1347 +}
1348 +
1349 +static void free_io(void *element, void *pool_data)
1350 +{
1351 +       kfree(element);
1352 +}
1353 +
1354 +static unsigned int pages_to_buffers(unsigned int pages)
1355 +{
1356 +       return 4 * pages;       /* too many ? */
1357 +}
1358 +
1359 +static int resize_pool(unsigned int new_bhs)
1360 +{
1361 +       int r = 0;
1362 +
1363 +       if (_buffer_pool) {
1364 +               if (new_bhs == 0) {
1365 +                       /* free off the pools */
1366 +                       mempool_destroy(_buffer_pool);
1367 +                       mempool_destroy(_io_pool);
1368 +                       _buffer_pool = _io_pool = NULL;
1369 +               } else {
1370 +                       /* resize the pools */
1371 +                       r = mempool_resize(_buffer_pool, new_bhs, GFP_KERNEL);
1372 +                       if (!r)
1373 +                               r = mempool_resize(_io_pool,
1374 +                                                  new_bhs, GFP_KERNEL);
1375 +               }
1376 +       } else {
1377 +               /* create new pools */
1378 +               _buffer_pool = mempool_create(new_bhs, alloc_bh,
1379 +                                             mempool_free_slab, bh_cachep);
1380 +               if (!_buffer_pool)
1381 +                       r = -ENOMEM;
1382 +
1383 +               _io_pool = mempool_create(new_bhs, alloc_io, free_io, NULL);
1384 +               if (!_io_pool) {
1385 +                       mempool_destroy(_buffer_pool);
1386 +                       _buffer_pool = NULL;
1387 +                       r = -ENOMEM;
1388 +               }
1389 +       }
1390 +
1391 +       if (!r)
1392 +               _num_bhs = new_bhs;
1393 +
1394 +       return r;
1395 +}
1396 +
1397 +int dm_io_get(unsigned int num_pages)
1398 +{
1399 +       return resize_pool(_num_bhs + pages_to_buffers(num_pages));
1400 +}
1401 +
1402 +void dm_io_put(unsigned int num_pages)
1403 +{
1404 +       resize_pool(_num_bhs - pages_to_buffers(num_pages));
1405 +}
1406 +
1407 +/*-----------------------------------------------------------------
1408 + * We need to keep track of which region a buffer is doing io
1409 + * for.  In order to save a memory allocation we store this in an
1410 + * unused field of the buffer head, and provide these access
1411 + * functions.
1412 + *
1413 + * FIXME: add compile time check that an unsigned int can fit
1414 + * into a pointer.
1415 + *
1416 + *---------------------------------------------------------------*/
1417 +static inline void bh_set_region(struct buffer_head *bh, unsigned int region)
1418 +{
1419 +       bh->b_journal_head = (void *) region;
1420 +}
1421 +
1422 +static inline int bh_get_region(struct buffer_head *bh)
1423 +{
1424 +       return (unsigned int) bh->b_journal_head;
1425 +}
1426 +
1427 +/*-----------------------------------------------------------------
1428 + * We need an io object to keep track of the number of bhs that
1429 + * have been dispatched for a particular io.
1430 + *---------------------------------------------------------------*/
1431 +static void dec_count(struct io_context *io, unsigned int region, int error)
1432 +{
1433 +       if (error)
1434 +               set_bit(region, &io->error);
1435 +
1436 +       if (atomic_dec_and_test(&io->count)) {
1437 +               if (io->sleeper)
1438 +                       wake_up_process(io->sleeper);
1439 +
1440 +               else {
1441 +                       int r = io->error;
1442 +                       io_notify_fn fn = io->callback;
1443 +                       void *context = io->context;
1444 +
1445 +                       mempool_free(io, _io_pool);
1446 +                       fn(r, context);
1447 +               }
1448 +       }
1449 +}
1450 +
1451 +static void endio(struct buffer_head *bh, int uptodate)
1452 +{
1453 +       struct io_context *io = (struct io_context *) bh->b_private;
1454 +
1455 +       if (!uptodate && io->rw != WRITE) {
1456 +               /*
1457 +                * We need to zero this region, otherwise people
1458 +                * like kcopyd may write the arbitrary contents
1459 +                * of the page.
1460 +                */
1461 +               memset(bh->b_data, 0, bh->b_size);
1462 +       }
1463 +
1464 +       dec_count((struct io_context *) bh->b_private,
1465 +                 bh_get_region(bh), !uptodate);
1466 +       mempool_free(bh, _buffer_pool);
1467 +}
1468 +
1469 +/*
1470 + * Primitives for alignment calculations.
1471 + */
1472 +int fls(unsigned n)
1473 +{
1474 +       return generic_fls32(n);
1475 +}
1476 +
1477 +static inline int log2_floor(unsigned n)
1478 +{
1479 +       return ffs(n) - 1;
1480 +}
1481 +
1482 +static inline int log2_align(unsigned n)
1483 +{
1484 +       return fls(n) - 1;
1485 +}
1486 +
1487 +/*
1488 + * Returns the next block for io.
1489 + */
1490 +static int do_page(kdev_t dev, sector_t *block, sector_t end_block,
1491 +                  unsigned int block_size,
1492 +                  struct page *p, unsigned int offset,
1493 +                  unsigned int region, struct io_context *io)
1494 +{
1495 +       struct buffer_head *bh;
1496 +       sector_t b = *block;
1497 +       sector_t blocks_per_page = PAGE_SIZE / block_size;
1498 +       unsigned int this_size; /* holds the size of the current io */
1499 +       unsigned int len;
1500 +
1501 +       while ((offset < PAGE_SIZE) && (b != end_block)) {
1502 +               bh = mempool_alloc(_buffer_pool, GFP_NOIO);
1503 +               init_buffer(bh, endio, io);
1504 +               bh_set_region(bh, region);
1505 +
1506 +               /*
1507 +                * Block size must be a power of 2 and aligned
1508 +                * correctly.
1509 +                */
1510 +               len = end_block - b;
1511 +               this_size = min((sector_t) 1 << log2_floor(b), blocks_per_page);
1512 +               if (this_size > len)
1513 +                       this_size = 1 << log2_align(len);
1514 +
1515 +               /*
1516 +                * Add in the job offset.
1517 +                */
1518 +               bh->b_blocknr = (b / this_size);
1519 +               bh->b_size = block_size * this_size;
1520 +               set_bh_page(bh, p, offset);
1521 +               bh->b_this_page = bh;
1522 +
1523 +               bh->b_dev = dev;
1524 +               atomic_set(&bh->b_count, 1);
1525 +
1526 +               bh->b_state = ((1 << BH_Uptodate) | (1 << BH_Mapped) |
1527 +                              (1 << BH_Lock));
1528 +
1529 +               if (io->rw == WRITE)
1530 +                       clear_bit(BH_Dirty, &bh->b_state);
1531 +
1532 +               atomic_inc(&io->count);
1533 +               submit_bh(io->rw, bh);
1534 +
1535 +               b += this_size;
1536 +               offset += block_size * this_size;
1537 +       }
1538 +
1539 +       *block = b;
1540 +       return (b == end_block);
1541 +}
1542 +
1543 +static void do_region(unsigned int region, struct io_region *where,
1544 +                     struct page *page, unsigned int offset,
1545 +                     struct io_context *io)
1546 +{
1547 +       unsigned int block_size = get_hardsect_size(where->dev);
1548 +       unsigned int sblock_size = block_size >> 9;
1549 +       sector_t block = where->sector / sblock_size;
1550 +       sector_t end_block = (where->sector + where->count) / sblock_size;
1551 +
1552 +       while (1) {
1553 +               if (do_page(where->dev, &block, end_block, block_size,
1554 +                           page, offset, region, io))
1555 +                       break;
1556 +
1557 +               offset = 0;     /* only offset the first page */
1558 +
1559 +               page = list_entry(page->list.next, struct page, list);
1560 +       }
1561 +}
1562 +
1563 +static void dispatch_io(unsigned int num_regions, struct io_region *where,
1564 +                       struct page *pages, unsigned int offset,
1565 +                       struct io_context *io)
1566 +{
1567 +       int i;
1568 +
1569 +       for (i = 0; i < num_regions; i++)
1570 +               if (where[i].count)
1571 +                       do_region(i, where + i, pages, offset, io);
1572 +
1573 +       /*
1574 +        * Drop the extra refence that we were holding to avoid
1575 +        * the io being completed too early.
1576 +        */
1577 +       dec_count(io, 0, 0);
1578 +}
1579 +
1580 +/*
1581 + * Synchronous io
1582 + */
1583 +int dm_io_sync(unsigned int num_regions, struct io_region *where,
1584 +              int rw, struct page *pages, unsigned int offset,
1585 +              unsigned int *error_bits)
1586 +{
1587 +       struct io_context io;
1588 +
1589 +       BUG_ON(num_regions > 1 && rw != WRITE);
1590 +
1591 +       io.rw = rw;
1592 +       io.error = 0;
1593 +       atomic_set(&io.count, 1); /* see dispatch_io() */
1594 +       io.sleeper = current;
1595 +
1596 +       dispatch_io(num_regions, where, pages, offset, &io);
1597 +       run_task_queue(&tq_disk);
1598 +
1599 +       while (1) {
1600 +               set_current_state(TASK_UNINTERRUPTIBLE);
1601 +
1602 +               if (!atomic_read(&io.count))
1603 +                       break;
1604 +
1605 +               schedule();
1606 +       }
1607 +       set_current_state(TASK_RUNNING);
1608 +
1609 +       *error_bits = io.error;
1610 +       return io.error ? -EIO : 0;
1611 +}
1612 +
1613 +/*
1614 + * Asynchronous io
1615 + */
1616 +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
1617 +               struct page *pages, unsigned int offset,
1618 +               io_notify_fn fn, void *context)
1619 +{
1620 +       struct io_context *io = mempool_alloc(_io_pool, GFP_NOIO);
1621 +
1622 +       io->rw = rw;
1623 +       io->error = 0;
1624 +       atomic_set(&io->count, 1); /* see dispatch_io() */
1625 +       io->sleeper = NULL;
1626 +       io->callback = fn;
1627 +       io->context = context;
1628 +
1629 +       dispatch_io(num_regions, where, pages, offset, io);
1630 +       return 0;
1631 +}
1632 +
1633 +EXPORT_SYMBOL(dm_io_get);
1634 +EXPORT_SYMBOL(dm_io_put);
1635 +EXPORT_SYMBOL(dm_io_sync);
1636 +EXPORT_SYMBOL(dm_io_async);
1637 --- diff/drivers/md/dm-io.h     1970-01-01 01:00:00.000000000 +0100
1638 +++ source/drivers/md/dm-io.h   2003-10-16 10:44:23.000000000 +0100
1639 @@ -0,0 +1,86 @@
1640 +/*
1641 + * Copyright (C) 2003 Sistina Software
1642 + *
1643 + * This file is released under the GPL.
1644 + */
1645 +
1646 +#ifndef _DM_IO_H
1647 +#define _DM_IO_H
1648 +
1649 +#include "dm.h"
1650 +
1651 +#include <linux/list.h>
1652 +
1653 +/* Move these to bitops.h eventually */
1654 +/* Improved generic_fls algorithm (in 2.4 there is no generic_fls so far) */
1655 +/* (c) 2002, D.Phillips and Sistina Software */
1656 +/* Licensed under Version 2 of the GPL */
1657 +
1658 +static inline unsigned generic_fls8(unsigned n)
1659 +{
1660 +       return n & 0xf0 ?
1661 +           n & 0xc0 ? (n >> 7) + 7 : (n >> 5) + 5 :
1662 +           n & 0x0c ? (n >> 3) + 3 : n - ((n + 1) >> 2);
1663 +}
1664 +
1665 +static inline unsigned generic_fls16(unsigned n)
1666 +{
1667 +       return n & 0xff00 ? generic_fls8(n >> 8) + 8 : generic_fls8(n);
1668 +}
1669 +
1670 +static inline unsigned generic_fls32(unsigned n)
1671 +{
1672 +       return n & 0xffff0000 ? generic_fls16(n >> 16) + 16 : generic_fls16(n);
1673 +}
1674 +
1675 +/* FIXME make this configurable */
1676 +#define DM_MAX_IO_REGIONS 8
1677 +
1678 +struct io_region {
1679 +       kdev_t dev;
1680 +       sector_t sector;
1681 +       sector_t count;
1682 +};
1683 +
1684 +
1685 +/*
1686 + * 'error' is a bitset, with each bit indicating whether an error
1687 + * occurred doing io to the corresponding region.
1688 + */
1689 +typedef void (*io_notify_fn)(unsigned int error, void *context);
1690 +
1691 +
1692 +/*
1693 + * Before anyone uses the IO interface they should call
1694 + * dm_io_get(), specifying roughly how many pages they are
1695 + * expecting to perform io on concurrently.
1696 + *
1697 + * This function may block.
1698 + */
1699 +int dm_io_get(unsigned int num_pages);
1700 +void dm_io_put(unsigned int num_pages);
1701 +
1702 +
1703 +/*
1704 + * Synchronous IO.
1705 + *
1706 + * Please ensure that the rw flag in the next two functions is
1707 + * either READ or WRITE, ie. we don't take READA.  Any
1708 + * regions with a zero count field will be ignored.
1709 + */
1710 +int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
1711 +              struct page *pages, unsigned int offset,
1712 +              unsigned int *error_bits);
1713 +
1714 +
1715 +/*
1716 + * Asynchronous IO.
1717 + *
1718 + * The 'where' array may be safely allocated on the stack since
1719 + * the function takes a copy.
1720 + */
1721 +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
1722 +               struct page *pages, unsigned int offset,
1723 +               io_notify_fn fn, void *context);
1724 +
1725 +#endif
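The generic_fls* helpers above compute "find last set": the 1-based index of the highest set bit, with 0 returned for an input of 0, halving the width examined at each level. For instance:

	generic_fls32(0x00010000)
	    = generic_fls16(0x0001) + 16	/* upper 16 bits are non-zero */
	    = generic_fls8(0x01) + 16		/* 0x0001 & 0xff00 == 0 */
	    = 1 + 16
	    = 17

and generic_fls32(0) evaluates to 0 via the n - ((n + 1) >> 2) branch of generic_fls8().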
1726 --- diff/drivers/md/dm-ioctl.c  1970-01-01 01:00:00.000000000 +0100
1727 +++ source/drivers/md/dm-ioctl.c        2003-10-16 10:44:23.000000000 +0100
1728 @@ -0,0 +1,1272 @@
1729 +/*
1730 + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
1731 + *
1732 + * This file is released under the GPL.
1733 + */
1734 +
1735 +#include "dm.h"
1736 +
1737 +#include <linux/module.h>
1738 +#include <linux/vmalloc.h>
1739 +#include <linux/miscdevice.h>
1740 +#include <linux/dm-ioctl.h>
1741 +#include <linux/init.h>
1742 +#include <linux/wait.h>
1743 +#include <linux/blk.h>
1744 +#include <linux/slab.h>
1745 +
1746 +#include <asm/uaccess.h>
1747 +
1748 +#define DM_DRIVER_EMAIL "dm@uk.sistina.com"
1749 +
1750 +/*-----------------------------------------------------------------
1751 + * The ioctl interface needs to be able to look up devices by
1752 + * name or uuid.
1753 + *---------------------------------------------------------------*/
1754 +struct hash_cell {
1755 +       struct list_head name_list;
1756 +       struct list_head uuid_list;
1757 +
1758 +       char *name;
1759 +       char *uuid;
1760 +       struct mapped_device *md;
1761 +       struct dm_table *new_map;
1762 +
1763 +       /* I hate devfs */
1764 +       devfs_handle_t devfs_entry;
1765 +};
1766 +
1767 +#define NUM_BUCKETS 64
1768 +#define MASK_BUCKETS (NUM_BUCKETS - 1)
1769 +static struct list_head _name_buckets[NUM_BUCKETS];
1770 +static struct list_head _uuid_buckets[NUM_BUCKETS];
1771 +
1772 +static devfs_handle_t _dev_dir;
1773 +void dm_hash_remove_all(void);
1774 +
1775 +/*
1776 + * Guards access to both hash tables.
1777 + */
1778 +static DECLARE_RWSEM(_hash_lock);
1779 +
1780 +static void init_buckets(struct list_head *buckets)
1781 +{
1782 +       unsigned int i;
1783 +
1784 +       for (i = 0; i < NUM_BUCKETS; i++)
1785 +               INIT_LIST_HEAD(buckets + i);
1786 +}
1787 +
1788 +int dm_hash_init(void)
1789 +{
1790 +       init_buckets(_name_buckets);
1791 +       init_buckets(_uuid_buckets);
1792 +       _dev_dir = devfs_mk_dir(0, DM_DIR, NULL);
1793 +       return 0;
1794 +}
1795 +
1796 +void dm_hash_exit(void)
1797 +{
1798 +       dm_hash_remove_all();
1799 +       devfs_unregister(_dev_dir);
1800 +}
1801 +
1802 +/*-----------------------------------------------------------------
1803 + * Hash function:
1804 + * We're not really concerned with the str hash function being
1805 + * fast since it's only used by the ioctl interface.
1806 + *---------------------------------------------------------------*/
1807 +static unsigned int hash_str(const char *str)
1808 +{
1809 +       const unsigned int hash_mult = 2654435387U;
1810 +       unsigned int h = 0;
1811 +
1812 +       while (*str)
1813 +               h = (h + (unsigned int) *str++) * hash_mult;
1814 +
1815 +       return h & MASK_BUCKETS;
1816 +}
1817 +
1818 +/*-----------------------------------------------------------------
1819 + * Code for looking up a device by name
1820 + *---------------------------------------------------------------*/
1821 +static struct hash_cell *__get_name_cell(const char *str)
1822 +{
1823 +       struct list_head *tmp;
1824 +       struct hash_cell *hc;
1825 +       unsigned int h = hash_str(str);
1826 +
1827 +       list_for_each (tmp, _name_buckets + h) {
1828 +               hc = list_entry(tmp, struct hash_cell, name_list);
1829 +               if (!strcmp(hc->name, str))
1830 +                       return hc;
1831 +       }
1832 +
1833 +       return NULL;
1834 +}
1835 +
1836 +static struct hash_cell *__get_uuid_cell(const char *str)
1837 +{
1838 +       struct list_head *tmp;
1839 +       struct hash_cell *hc;
1840 +       unsigned int h = hash_str(str);
1841 +
1842 +       list_for_each (tmp, _uuid_buckets + h) {
1843 +               hc = list_entry(tmp, struct hash_cell, uuid_list);
1844 +               if (!strcmp(hc->uuid, str))
1845 +                       return hc;
1846 +       }
1847 +
1848 +       return NULL;
1849 +}
1850 +
1851 +/*-----------------------------------------------------------------
1852 + * Inserting, removing and renaming a device.
1853 + *---------------------------------------------------------------*/
1854 +static inline char *kstrdup(const char *str)
1855 +{
1856 +       char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
1857 +       if (r)
1858 +               strcpy(r, str);
1859 +       return r;
1860 +}
1861 +
1862 +static struct hash_cell *alloc_cell(const char *name, const char *uuid,
1863 +                                   struct mapped_device *md)
1864 +{
1865 +       struct hash_cell *hc;
1866 +
1867 +       hc = kmalloc(sizeof(*hc), GFP_KERNEL);
1868 +       if (!hc)
1869 +               return NULL;
1870 +
1871 +       hc->name = kstrdup(name);
1872 +       if (!hc->name) {
1873 +               kfree(hc);
1874 +               return NULL;
1875 +       }
1876 +
1877 +       if (!uuid)
1878 +               hc->uuid = NULL;
1879 +
1880 +       else {
1881 +               hc->uuid = kstrdup(uuid);
1882 +               if (!hc->uuid) {
1883 +                       kfree(hc->name);
1884 +                       kfree(hc);
1885 +                       return NULL;
1886 +               }
1887 +       }
1888 +
1889 +       INIT_LIST_HEAD(&hc->name_list);
1890 +       INIT_LIST_HEAD(&hc->uuid_list);
1891 +       hc->md = md;
1892 +       hc->new_map = NULL;
1893 +       return hc;
1894 +}
1895 +
1896 +static void free_cell(struct hash_cell *hc)
1897 +{
1898 +       if (hc) {
1899 +               kfree(hc->name);
1900 +               kfree(hc->uuid);
1901 +               kfree(hc);
1902 +       }
1903 +}
1904 +
1905 +/*
1906 + * devfs stuff.
1907 + */
1908 +static int register_with_devfs(struct hash_cell *hc)
1909 +{
1910 +       kdev_t dev = dm_kdev(hc->md);
1911 +
1912 +       hc->devfs_entry =
1913 +           devfs_register(_dev_dir, hc->name, DEVFS_FL_CURRENT_OWNER,
1914 +                          major(dev), minor(dev),
1915 +                          S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
1916 +                          &dm_blk_dops, NULL);
1917 +
1918 +       return 0;
1919 +}
1920 +
1921 +static int unregister_with_devfs(struct hash_cell *hc)
1922 +{
1923 +       devfs_unregister(hc->devfs_entry);
1924 +       return 0;
1925 +}
1926 +
1927 +/*
1928 + * The kdev_t and uuid of a device can never change once it is
1929 + * initially inserted.
1930 + */
1931 +int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
1932 +{
1933 +       struct hash_cell *cell;
1934 +
1935 +       /*
1936 +        * Allocate the new cells.
1937 +        */
1938 +       cell = alloc_cell(name, uuid, md);
1939 +       if (!cell)
1940 +               return -ENOMEM;
1941 +
1942 +       /*
1943 +        * Insert the cell into both hash tables.
1944 +        */
1945 +       down_write(&_hash_lock);
1946 +       if (__get_name_cell(name))
1947 +               goto bad;
1948 +
1949 +       list_add(&cell->name_list, _name_buckets + hash_str(name));
1950 +
1951 +       if (uuid) {
1952 +               if (__get_uuid_cell(uuid)) {
1953 +                       list_del(&cell->name_list);
1954 +                       goto bad;
1955 +               }
1956 +               list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
1957 +       }
1958 +       register_with_devfs(cell);
1959 +       dm_get(md);
1960 +       up_write(&_hash_lock);
1961 +
1962 +       return 0;
1963 +
1964 +      bad:
1965 +       up_write(&_hash_lock);
1966 +       free_cell(cell);
1967 +       return -EBUSY;
1968 +}
1969 +
1970 +void __hash_remove(struct hash_cell *hc)
1971 +{
1972 +       /* remove from the dev hash */
1973 +       list_del(&hc->uuid_list);
1974 +       list_del(&hc->name_list);
1975 +       unregister_with_devfs(hc);
1976 +       dm_put(hc->md);
1977 +       if (hc->new_map)
1978 +               dm_table_put(hc->new_map);
1979 +       free_cell(hc);
1980 +}
1981 +
1982 +void dm_hash_remove_all(void)
1983 +{
1984 +       int i;
1985 +       struct hash_cell *hc;
1986 +       struct list_head *tmp, *n;
1987 +
1988 +       down_write(&_hash_lock);
1989 +       for (i = 0; i < NUM_BUCKETS; i++) {
1990 +               list_for_each_safe (tmp, n, _name_buckets + i) {
1991 +                       hc = list_entry(tmp, struct hash_cell, name_list);
1992 +                       __hash_remove(hc);
1993 +               }
1994 +       }
1995 +       up_write(&_hash_lock);
1996 +}
1997 +
1998 +int dm_hash_rename(const char *old, const char *new)
1999 +{
2000 +       char *new_name, *old_name;
2001 +       struct hash_cell *hc;
2002 +
2003 +       /*
2004 +        * duplicate new.
2005 +        */
2006 +       new_name = kstrdup(new);
2007 +       if (!new_name)
2008 +               return -ENOMEM;
2009 +
2010 +       down_write(&_hash_lock);
2011 +
2012 +       /*
2013 +        * Is new free ?
2014 +        */
2015 +       hc = __get_name_cell(new);
2016 +       if (hc) {
2017 +               DMWARN("asked to rename to an already existing name %s -> %s",
2018 +                      old, new);
2019 +               up_write(&_hash_lock);
2020 +               kfree(new_name);
2021 +               return -EBUSY;
2022 +       }
2023 +
2024 +       /*
2025 +        * Is there such a device as 'old' ?
2026 +        */
2027 +       hc = __get_name_cell(old);
2028 +       if (!hc) {
2029 +               DMWARN("asked to rename a non-existent device %s -> %s",
2030 +                      old, new);
2031 +               up_write(&_hash_lock);
2032 +               kfree(new_name);
2033 +               return -ENXIO;
2034 +       }
2035 +
2036 +       /*
2037 +        * rename and move the name cell.
2038 +        */
2039 +       list_del(&hc->name_list);
2040 +       old_name = hc->name;
2041 +       hc->name = new_name;
2042 +       list_add(&hc->name_list, _name_buckets + hash_str(new_name));
2043 +
2044 +       /* rename the device node in devfs */
2045 +       unregister_with_devfs(hc);
2046 +       register_with_devfs(hc);
2047 +
2048 +       up_write(&_hash_lock);
2049 +       kfree(old_name);
2050 +       return 0;
2051 +}
2052 +
2053 +/*-----------------------------------------------------------------
2054 + * Implementation of the ioctl commands
2055 + *---------------------------------------------------------------*/
2056 +/*
2057 + * All the ioctl commands get dispatched to functions with this
2058 + * prototype.
2059 + */
2060 +typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
2061 +
2062 +static int remove_all(struct dm_ioctl *param, size_t param_size)
2063 +{
2064 +       dm_hash_remove_all();
2065 +       param->data_size = 0;
2066 +       return 0;
2067 +}
2068 +
2069 +/*
2070 + * Round up the ptr to an 8-byte boundary.
2071 + */
2072 +#define ALIGN_MASK 7
2073 +static inline void *align_ptr(void *ptr)
2074 +{
2075 +       return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK);
2076 +}
2077 +
2078 +/*
2079 + * Retrieves the data payload buffer from an already allocated
2080 + * struct dm_ioctl.
2081 + */
2082 +static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
2083 +                              size_t *len)
2084 +{
2085 +       param->data_start = align_ptr(param + 1) - (void *) param;
2086 +
2087 +       if (param->data_start < param_size)
2088 +               *len = param_size - param->data_start;
2089 +       else
2090 +               *len = 0;
2091 +
2092 +       return ((void *) param) + param->data_start;
2093 +}
2094 +
2095 +static int list_devices(struct dm_ioctl *param, size_t param_size)
2096 +{
2097 +       unsigned int i;
2098 +       struct hash_cell *hc;
2099 +       size_t len, needed = 0;
2100 +       struct dm_name_list *nl, *old_nl = NULL;
2101 +
2102 +       down_write(&_hash_lock);
2103 +
2104 +       /*
2105 +        * Loop through all the devices working out how much
2106 +        * space we need.
2107 +        */
2108 +       for (i = 0; i < NUM_BUCKETS; i++) {
2109 +               list_for_each_entry (hc, _name_buckets + i, name_list) {
2110 +                       needed += sizeof(struct dm_name_list);
2111 +                       needed += strlen(hc->name);
2112 +                       needed += ALIGN_MASK;
2113 +               }
2114 +       }
2115 +
2116 +       /*
2117 +        * Grab our output buffer.
2118 +        */
2119 +       nl = get_result_buffer(param, param_size, &len);
2120 +       if (len < needed) {
2121 +               param->flags |= DM_BUFFER_FULL_FLAG;
2122 +               goto out;
2123 +       }
2124 +       param->data_size = param->data_start + needed;
2125 +
2126 +       nl->dev = 0;    /* Flags no data */
2127 +
2128 +       /*
2129 +        * Now loop through filling out the names.
2130 +        */
2131 +       for (i = 0; i < NUM_BUCKETS; i++) {
2132 +               list_for_each_entry (hc, _name_buckets + i, name_list) {
2133 +                       if (old_nl)
2134 +                               old_nl->next = (uint32_t) ((void *) nl -
2135 +                                                          (void *) old_nl);
2136 +
2137 +                       nl->dev = dm_kdev(hc->md);
2138 +                       nl->next = 0;
2139 +                       strcpy(nl->name, hc->name);
2140 +
2141 +                       old_nl = nl;
2142 +                       nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1);
2143 +               }
2144 +       }
2145 +
2146 + out:
2147 +       up_write(&_hash_lock);
2148 +       return 0;
2149 +}
2150 +
2151 +static int check_name(const char *name)
2152 +{
2153 +       if (strchr(name, '/')) {
2154 +               DMWARN("invalid device name");
2155 +               return -EINVAL;
2156 +       }
2157 +
2158 +       return 0;
2159 +}
2160 +
2161 +/*
2162 + * Fills in a dm_ioctl structure, ready for sending back to
2163 + * userland.
2164 + */
2165 +static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
2166 +{
2167 +       kdev_t dev = dm_kdev(md);
2168 +       struct dm_table *table;
2169 +       struct block_device *bdev;
2170 +
2171 +       param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
2172 +                         DM_ACTIVE_PRESENT_FLAG);
2173 +
2174 +       if (dm_suspended(md))
2175 +               param->flags |= DM_SUSPEND_FLAG;
2176 +
2177 +       param->dev = kdev_t_to_nr(dev);
2178 +
2179 +       if (is_read_only(dev))
2180 +               param->flags |= DM_READONLY_FLAG;
2181 +
2182 +       param->event_nr = dm_get_event_nr(md);
2183 +
2184 +       table = dm_get_table(md);
2185 +       if (table) {
2186 +               param->flags |= DM_ACTIVE_PRESENT_FLAG;
2187 +               param->target_count = dm_table_get_num_targets(table);
2188 +               dm_table_put(table);
2189 +       } else
2190 +               param->target_count = 0;
2191 +
2192 +       bdev = bdget(param->dev);
2193 +       if (!bdev)
2194 +               return -ENXIO;
2195 +       param->open_count = bdev->bd_openers;
2196 +       bdput(bdev);
2197 +
2198 +       return 0;
2199 +}
2200 +
2201 +static int dev_create(struct dm_ioctl *param, size_t param_size)
2202 +{
2203 +       int r;
2204 +       kdev_t dev = 0;
2205 +       struct mapped_device *md;
2206 +
2207 +       r = check_name(param->name);
2208 +       if (r)
2209 +               return r;
2210 +
2211 +       if (param->flags & DM_PERSISTENT_DEV_FLAG)
2212 +               dev = to_kdev_t(param->dev);
2213 +
2214 +       r = dm_create(dev, &md);
2215 +       if (r)
2216 +               return r;
2217 +
2218 +       r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
2219 +       if (r) {
2220 +               dm_put(md);
2221 +               return r;
2222 +       }
2223 +
2224 +       param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2225 +
2226 +       r = __dev_status(md, param);
2227 +       dm_put(md);
2228 +
2229 +       return r;
2230 +}
2231 +
2232 +/*
2233 + * Always use UUID for lookups if it's present, otherwise use name.
2234 + */
2235 +static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
2236 +{
2237 +       return *param->uuid ?
2238 +           __get_uuid_cell(param->uuid) : __get_name_cell(param->name);
2239 +}
2240 +
2241 +static inline struct mapped_device *find_device(struct dm_ioctl *param)
2242 +{
2243 +       struct hash_cell *hc;
2244 +       struct mapped_device *md = NULL;
2245 +
2246 +       down_read(&_hash_lock);
2247 +       hc = __find_device_hash_cell(param);
2248 +       if (hc) {
2249 +               md = hc->md;
2250 +
2251 +               /*
2252 +                * Sneakily write in both the name and the uuid
2253 +                * while we have the cell.
2254 +                */
2255 +               strncpy(param->name, hc->name, sizeof(param->name));
2256 +               if (hc->uuid)
2257 +                       strncpy(param->uuid, hc->uuid, sizeof(param->uuid) - 1);
2258 +               else
2259 +                       param->uuid[0] = '\0';
2260 +
2261 +               if (hc->new_map)
2262 +                       param->flags |= DM_INACTIVE_PRESENT_FLAG;
2263 +               else
2264 +                       param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2265 +
2266 +               dm_get(md);
2267 +       }
2268 +       up_read(&_hash_lock);
2269 +
2270 +       return md;
2271 +}
2272 +
2273 +static int dev_remove(struct dm_ioctl *param, size_t param_size)
2274 +{
2275 +       struct hash_cell *hc;
2276 +
2277 +       down_write(&_hash_lock);
2278 +       hc = __find_device_hash_cell(param);
2279 +
2280 +       if (!hc) {
2281 +               DMWARN("device doesn't appear to be in the dev hash table.");
2282 +               up_write(&_hash_lock);
2283 +               return -ENXIO;
2284 +       }
2285 +
2286 +       __hash_remove(hc);
2287 +       up_write(&_hash_lock);
2288 +       param->data_size = 0;
2289 +       return 0;
2290 +}
2291 +
2292 +/*
2293 + * Check a string doesn't overrun the chunk of
2294 + * memory we copied from userland.
2295 + */
2296 +static int invalid_str(char *str, void *end)
2297 +{
2298 +       while ((void *) str < end)
2299 +               if (!*str++)
2300 +                       return 0;
2301 +
2302 +       return -EINVAL;
2303 +}
2304 +
2305 +static int dev_rename(struct dm_ioctl *param, size_t param_size)
2306 +{
2307 +       int r;
2308 +       char *new_name = (char *) param + param->data_start;
2309 +
2310 +       if (new_name < (char *) (param + 1) ||
2311 +           invalid_str(new_name, (void *) param + param_size)) {
2312 +               DMWARN("Invalid new logical volume name supplied.");
2313 +               return -EINVAL;
2314 +       }
2315 +
2316 +       r = check_name(new_name);
2317 +       if (r)
2318 +               return r;
2319 +
2320 +       param->data_size = 0;
2321 +       return dm_hash_rename(param->name, new_name);
2322 +}
2323 +
2324 +static int suspend(struct dm_ioctl *param)
2325 +{
2326 +       int r = 0;
2327 +       struct mapped_device *md;
2328 +
2329 +       md = find_device(param);
2330 +       if (!md)
2331 +               return -ENXIO;
2332 +
2333 +       if (!dm_suspended(md))
2334 +               r = dm_suspend(md);
2335 +
2336 +       if (!r)
2337 +               r = __dev_status(md, param);
2338 +
2339 +       dm_put(md);
2340 +       return r;
2341 +}
2342 +
2343 +static int resume(struct dm_ioctl *param)
2344 +{
2345 +       int r = 0;
2346 +       struct hash_cell *hc;
2347 +       struct mapped_device *md;
2348 +       struct dm_table *new_map;
2349 +
2350 +       down_write(&_hash_lock);
2351 +
2352 +       hc = __find_device_hash_cell(param);
2353 +       if (!hc) {
2354 +               DMWARN("device doesn't appear to be in the dev hash table.");
2355 +               up_write(&_hash_lock);
2356 +               return -ENXIO;
2357 +       }
2358 +
2359 +       md = hc->md;
2360 +       dm_get(md);
2361 +
2362 +       new_map = hc->new_map;
2363 +       hc->new_map = NULL;
2364 +       param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2365 +
2366 +       up_write(&_hash_lock);
2367 +
2368 +       /* Do we need to load a new map ? */
2369 +       if (new_map) {
2370 +               /* Suspend if it isn't already suspended */
2371 +               if (!dm_suspended(md))
2372 +                       dm_suspend(md);
2373 +
2374 +               r = dm_swap_table(md, new_map);
2375 +               if (r) {
2376 +                       dm_put(md);
2377 +                       dm_table_put(new_map);
2378 +                       return r;
2379 +               }
2380 +
2381 +               if (dm_table_get_mode(new_map) & FMODE_WRITE)
2382 +                       set_device_ro(dm_kdev(md), 0);
2383 +               else
2384 +                       set_device_ro(dm_kdev(md), 1);
2385 +
2386 +               dm_table_put(new_map);
2387 +       }
2388 +
2389 +       if (dm_suspended(md))
2390 +               r = dm_resume(md);
2391 +
2392 +       if (!r)
2393 +               r = __dev_status(md, param);
2394 +
2395 +       dm_put(md);
2396 +       return r;
2397 +}
2398 +
2399 +/*
2400 + * Set or unset the suspension state of a device.
2401 + * If the device already is in the requested state we just return its status.
2402 + */
2403 +static int dev_suspend(struct dm_ioctl *param, size_t param_size)
2404 +{
2405 +       if (param->flags & DM_SUSPEND_FLAG)
2406 +               return suspend(param);
2407 +
2408 +       return resume(param);
2409 +}
2410 +
2411 +/*
2412 + * Copies device info back to user space, used by
2413 + * the create and info ioctls.
2414 + */
2415 +static int dev_status(struct dm_ioctl *param, size_t param_size)
2416 +{
2417 +       int r;
2418 +       struct mapped_device *md;
2419 +
2420 +       md = find_device(param);
2421 +       if (!md)
2422 +               return -ENXIO;
2423 +
2424 +       r = __dev_status(md, param);
2425 +       dm_put(md);
2426 +       return r;
2427 +}
2428 +
2429 +/*
2430 + * Wait for a device to report an event
2431 + */
2432 +static int dev_wait(struct dm_ioctl *param, size_t param_size)
2433 +{
2434 +       int r;
2435 +       struct mapped_device *md;
2436 +       DECLARE_WAITQUEUE(wq, current);
2437 +
2438 +       md = find_device(param);
2439 +       if (!md)
2440 +               return -ENXIO;
2441 +
2442 +       /*
2443 +        * Wait for a notification event
2444 +        */
2445 +       set_current_state(TASK_INTERRUPTIBLE);
2446 +       if (!dm_add_wait_queue(md, &wq, param->event_nr)) {
2447 +               schedule();
2448 +               dm_remove_wait_queue(md, &wq);
2449 +       }
2450 +       set_current_state(TASK_RUNNING);
2451 +
2452 +       /*
2453 +        * The userland program is going to want to know what
2454 +        * changed to trigger the event, so we may as well tell
2455 +        * it and save an ioctl.
2456 +        */
2457 +       r = __dev_status(md, param);
2458 +
2459 +       dm_put(md);
2460 +       return r;
2461 +}
2462 +
2463 +static inline int get_mode(struct dm_ioctl *param)
2464 +{
2465 +       int mode = FMODE_READ | FMODE_WRITE;
2466 +
2467 +       if (param->flags & DM_READONLY_FLAG)
2468 +               mode = FMODE_READ;
2469 +
2470 +       return mode;
2471 +}
2472 +
2473 +static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
2474 +                      struct dm_target_spec **spec, char **target_params)
2475 +{
2476 +       *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
2477 +       *target_params = (char *) (*spec + 1);
2478 +
2479 +       if (*spec < (last + 1))
2480 +               return -EINVAL;
2481 +
2482 +       return invalid_str(*target_params, end);
2483 +}
2484 +
2485 +static int populate_table(struct dm_table *table, struct dm_ioctl *param,
2486 +                         size_t param_size)
2487 +{
2488 +       int r;
2489 +       unsigned int i = 0;
2490 +       struct dm_target_spec *spec = (struct dm_target_spec *) param;
2491 +       uint32_t next = param->data_start;
2492 +       void *end = (void *) param + param_size;
2493 +       char *target_params;
2494 +
2495 +       if (!param->target_count) {
2496 +               DMWARN("populate_table: no targets specified");
2497 +               return -EINVAL;
2498 +       }
2499 +
2500 +       for (i = 0; i < param->target_count; i++) {
2501 +
2502 +               r = next_target(spec, next, end, &spec, &target_params);
2503 +               if (r) {
2504 +                       DMWARN("unable to find target");
2505 +                       return r;
2506 +               }
2507 +
2508 +               r = dm_table_add_target(table, spec->target_type,
2509 +                                       (sector_t) spec->sector_start,
2510 +                                       (sector_t) spec->length,
2511 +                                       target_params);
2512 +               if (r) {
2513 +                       DMWARN("error adding target to table");
2514 +                       return r;
2515 +               }
2516 +
2517 +               next = spec->next;
2518 +       }
2519 +
2520 +       return dm_table_complete(table);
2521 +}
2522 +
2523 +static int table_load(struct dm_ioctl *param, size_t param_size)
2524 +{
2525 +       int r;
2526 +       struct hash_cell *hc;
2527 +       struct dm_table *t;
2528 +
2529 +       r = dm_table_create(&t, get_mode(param));
2530 +       if (r)
2531 +               return r;
2532 +
2533 +       r = populate_table(t, param, param_size);
2534 +       if (r) {
2535 +               dm_table_put(t);
2536 +               return r;
2537 +       }
2538 +
2539 +       down_write(&_hash_lock);
2540 +       hc = __find_device_hash_cell(param);
2541 +       if (!hc) {
2542 +               DMWARN("device doesn't appear to be in the dev hash table.");
2543 +               up_write(&_hash_lock);
2544 +               return -ENXIO;
2545 +       }
2546 +
2547 +       hc->new_map = t;
2548 +       param->flags |= DM_INACTIVE_PRESENT_FLAG;
2549 +
2550 +       r = __dev_status(hc->md, param);
2551 +       up_write(&_hash_lock);
2552 +       return r;
2553 +}
2554 +
2555 +static int table_clear(struct dm_ioctl *param, size_t param_size)
2556 +{
2557 +       int r;
2558 +       struct hash_cell *hc;
2559 +
2560 +       down_write(&_hash_lock);
2561 +
2562 +       hc = __find_device_hash_cell(param);
2563 +       if (!hc) {
2564 +               DMWARN("device doesn't appear to be in the dev hash table.");
2565 +               up_write(&_hash_lock);
2566 +               return -ENXIO;
2567 +       }
2568 +
2569 +       if (hc->new_map) {
2570 +               dm_table_put(hc->new_map);
2571 +               hc->new_map = NULL;
2572 +       }
2573 +
2574 +       param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2575 +
2576 +       r = __dev_status(hc->md, param);
2577 +       up_write(&_hash_lock);
2578 +       return r;
2579 +}
2580 +
2581 +/*
2582 + * Retrieves a list of devices used by a particular dm device.
2583 + */
2584 +static void retrieve_deps(struct dm_table *table, struct dm_ioctl *param,
2585 +                         size_t param_size)
2586 +{
2587 +       unsigned int count = 0;
2588 +       struct list_head *tmp;
2589 +       size_t len, needed;
2590 +       struct dm_target_deps *deps;
2591 +
2592 +       deps = get_result_buffer(param, param_size, &len);
2593 +
2594 +       /*
2595 +        * Count the devices.
2596 +        */
2597 +       list_for_each(tmp, dm_table_get_devices(table))
2598 +               count++;
2599 +
2600 +       /*
2601 +        * Check we have enough space.
2602 +        */
2603 +       needed = sizeof(*deps) + (sizeof(*deps->dev) * count);
2604 +       if (len < needed) {
2605 +               param->flags |= DM_BUFFER_FULL_FLAG;
2606 +               return;
2607 +       }
2608 +
2609 +       /*
2610 +        * Fill in the devices.
2611 +        */
2612 +       deps->count = count;
2613 +       count = 0;
2614 +       list_for_each(tmp, dm_table_get_devices(table)) {
2615 +               struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
2616 +               deps->dev[count++] = dd->bdev->bd_dev;
2617 +       }
2618 +
2619 +       param->data_size = param->data_start + needed;
2620 +}
2621 +
2622 +static int table_deps(struct dm_ioctl *param, size_t param_size)
2623 +{
2624 +       int r;
2625 +       struct mapped_device *md;
2626 +       struct dm_table *table;
2627 +
2628 +       md = find_device(param);
2629 +       if (!md)
2630 +               return -ENXIO;
2631 +
2632 +       r = __dev_status(md, param);
2633 +       if (r)
2634 +               goto out;
2635 +
2636 +       table = dm_get_table(md);
2637 +       if (table) {
2638 +               retrieve_deps(table, param, param_size);
2639 +               dm_table_put(table);
2640 +       }
2641 +
2642 + out:
2643 +       dm_put(md);
2644 +       return r;
2645 +}
2646 +
2647 +/*
2648 + * Build up the status struct for each target
2649 + */
2650 +static void retrieve_status(struct dm_table *table, struct dm_ioctl *param,
2651 +                           size_t param_size)
2652 +{
2653 +       unsigned int i, num_targets;
2654 +       struct dm_target_spec *spec;
2655 +       char *outbuf, *outptr;
2656 +       status_type_t type;
2657 +       size_t remaining, len, used = 0;
2658 +
2659 +       outptr = outbuf = get_result_buffer(param, param_size, &len);
2660 +
2661 +       if (param->flags & DM_STATUS_TABLE_FLAG)
2662 +               type = STATUSTYPE_TABLE;
2663 +       else
2664 +               type = STATUSTYPE_INFO;
2665 +
2666 +       /* Get all the target info */
2667 +       num_targets = dm_table_get_num_targets(table);
2668 +       for (i = 0; i < num_targets; i++) {
2669 +               struct dm_target *ti = dm_table_get_target(table, i);
2670 +
2671 +               remaining = len - (outptr - outbuf);
2672 +               if (remaining < sizeof(struct dm_target_spec)) {
2673 +                       param->flags |= DM_BUFFER_FULL_FLAG;
2674 +                       break;
2675 +               }
2676 +
2677 +               spec = (struct dm_target_spec *) outptr;
2678 +
2679 +               spec->status = 0;
2680 +               spec->sector_start = ti->begin;
2681 +               spec->length = ti->len;
2682 +               strncpy(spec->target_type, ti->type->name,
2683 +                       sizeof(spec->target_type));
2684 +
2685 +               outptr += sizeof(struct dm_target_spec);
2686 +               remaining = len - (outptr - outbuf);
2687 +
2688 +               /* Get the status/table string from the target driver */
2689 +               if (ti->type->status) {
2690 +                       if (ti->type->status(ti, type, outptr, remaining)) {
2691 +                               param->flags |= DM_BUFFER_FULL_FLAG;
2692 +                               break;
2693 +                       }
2694 +               } else
2695 +                       outptr[0] = '\0';
2696 +
2697 +               outptr += strlen(outptr) + 1;
2698 +               used = param->data_start + (outptr - outbuf);
2699 +
2700 +               outptr = align_ptr(outptr);
2701 +               spec->next = outptr - outbuf;
2702 +       }
2703 +
2704 +       if (used)
2705 +               param->data_size = used;
2706 +
2707 +       param->target_count = num_targets;
2708 +}
2709 +
2710 +/*
2711 + * Return the status of a device as a text string for each
2712 + * target.
2713 + */
2714 +static int table_status(struct dm_ioctl *param, size_t param_size)
2715 +{
2716 +       int r;
2717 +       struct mapped_device *md;
2718 +       struct dm_table *table;
2719 +
2720 +       md = find_device(param);
2721 +       if (!md)
2722 +               return -ENXIO;
2723 +
2724 +       r = __dev_status(md, param);
2725 +       if (r)
2726 +               goto out;
2727 +
2728 +       table = dm_get_table(md);
2729 +       if (table) {
2730 +               retrieve_status(table, param, param_size);
2731 +               dm_table_put(table);
2732 +       }
2733 +
2734 + out:
2735 +       dm_put(md);
2736 +       return r;
2737 +}
2738 +
2739 +/*-----------------------------------------------------------------
2740 + * Implementation of open/close/ioctl on the special char
2741 + * device.
2742 + *---------------------------------------------------------------*/
2743 +static ioctl_fn lookup_ioctl(unsigned int cmd)
2744 +{
2745 +       static struct {
2746 +               int cmd;
2747 +               ioctl_fn fn;
2748 +       } _ioctls[] = {
2749 +               {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */
2750 +               {DM_REMOVE_ALL_CMD, remove_all},
2751 +               {DM_LIST_DEVICES_CMD, list_devices},
2752 +
2753 +               {DM_DEV_CREATE_CMD, dev_create},
2754 +               {DM_DEV_REMOVE_CMD, dev_remove},
2755 +               {DM_DEV_RENAME_CMD, dev_rename},
2756 +               {DM_DEV_SUSPEND_CMD, dev_suspend},
2757 +               {DM_DEV_STATUS_CMD, dev_status},
2758 +               {DM_DEV_WAIT_CMD, dev_wait},
2759 +
2760 +               {DM_TABLE_LOAD_CMD, table_load},
2761 +               {DM_TABLE_CLEAR_CMD, table_clear},
2762 +               {DM_TABLE_DEPS_CMD, table_deps},
2763 +               {DM_TABLE_STATUS_CMD, table_status}
2764 +       };
2765 +
2766 +       return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
2767 +}
2768 +
2769 +/*
2770 + * As well as checking the version compatibility this always
2771 + * copies the kernel interface version out.
2772 + */
2773 +static int check_version(unsigned int cmd, struct dm_ioctl *user)
2774 +{
2775 +       uint32_t version[3];
2776 +       int r = 0;
2777 +
2778 +       if (copy_from_user(version, user->version, sizeof(version)))
2779 +               return -EFAULT;
2780 +
2781 +       if ((DM_VERSION_MAJOR != version[0]) ||
2782 +           (DM_VERSION_MINOR < version[1])) {
2783 +               DMWARN("ioctl interface mismatch: "
2784 +                      "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
2785 +                      DM_VERSION_MAJOR, DM_VERSION_MINOR,
2786 +                      DM_VERSION_PATCHLEVEL,
2787 +                      version[0], version[1], version[2], cmd);
2788 +               r = -EINVAL;
2789 +       }
2790 +
2791 +       /*
2792 +        * Fill in the kernel version.
2793 +        */
2794 +       version[0] = DM_VERSION_MAJOR;
2795 +       version[1] = DM_VERSION_MINOR;
2796 +       version[2] = DM_VERSION_PATCHLEVEL;
2797 +       if (copy_to_user(user->version, version, sizeof(version)))
2798 +               return -EFAULT;
2799 +
2800 +       return r;
2801 +}
2802 +
2803 +static void free_params(struct dm_ioctl *param)
2804 +{
2805 +       vfree(param);
2806 +}
2807 +
2808 +static int copy_params(struct dm_ioctl *user, struct dm_ioctl **param)
2809 +{
2810 +       struct dm_ioctl tmp, *dmi;
2811 +
2812 +       if (copy_from_user(&tmp, user, sizeof(tmp)))
2813 +               return -EFAULT;
2814 +
2815 +       if (tmp.data_size < sizeof(tmp))
2816 +               return -EINVAL;
2817 +
2818 +       dmi = (struct dm_ioctl *) vmalloc(tmp.data_size);
2819 +       if (!dmi)
2820 +               return -ENOMEM;
2821 +
2822 +       if (copy_from_user(dmi, user, tmp.data_size)) {
2823 +               vfree(dmi);
2824 +               return -EFAULT;
2825 +       }
2826 +
2827 +       *param = dmi;
2828 +       return 0;
2829 +}
2830 +
2831 +static int validate_params(uint cmd, struct dm_ioctl *param)
2832 +{
2833 +       /* Always clear this flag */
2834 +       param->flags &= ~DM_BUFFER_FULL_FLAG;
2835 +
2836 +       /* Ignores parameters */
2837 +       if (cmd == DM_REMOVE_ALL_CMD || cmd == DM_LIST_DEVICES_CMD)
2838 +               return 0;
2839 +
2840 +       /* Unless creating, either name or uuid but not both */
2841 +       if (cmd != DM_DEV_CREATE_CMD) {
2842 +               if ((!*param->uuid && !*param->name) ||
2843 +                   (*param->uuid && *param->name)) {
2844 +                       DMWARN("exactly one of name or uuid must be supplied, cmd(%u)",
2845 +                              cmd);
2846 +                       return -EINVAL;
2847 +               }
2848 +       }
2849 +
2850 +       /* Ensure strings are terminated */
2851 +       param->name[DM_NAME_LEN - 1] = '\0';
2852 +       param->uuid[DM_UUID_LEN - 1] = '\0';
2853 +
2854 +       return 0;
2855 +}
2856 +
2857 +static int ctl_ioctl(struct inode *inode, struct file *file,
2858 +                    uint command, ulong u)
2859 +{
2860 +       int r = 0;
2861 +       unsigned int cmd;
2862 +       struct dm_ioctl *param;
2863 +       struct dm_ioctl *user = (struct dm_ioctl *) u;
2864 +       ioctl_fn fn = NULL;
2865 +       size_t param_size;
2866 +
2867 +       /* only root can play with this */
2868 +       if (!capable(CAP_SYS_ADMIN))
2869 +               return -EACCES;
2870 +
2871 +       if (_IOC_TYPE(command) != DM_IOCTL)
2872 +               return -ENOTTY;
2873 +
2874 +       cmd = _IOC_NR(command);
2875 +
2876 +       /*
2877 +        * Check the interface version passed in.  This also
2878 +        * writes out the kernel's interface version.
2879 +        */
2880 +       r = check_version(cmd, user);
2881 +       if (r)
2882 +               return r;
2883 +
2884 +       /*
2885 +        * Nothing more to do for the version command.
2886 +        */
2887 +       if (cmd == DM_VERSION_CMD)
2888 +               return 0;
2889 +
2890 +       fn = lookup_ioctl(cmd);
2891 +       if (!fn) {
2892 +               DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
2893 +               return -ENOTTY;
2894 +       }
2895 +
2896 +       /*
2897 +        * FIXME: I don't like this, we're trying to avoid low
2898 +        * memory issues when a device is suspended.
2899 +        */
2900 +       current->flags |= PF_MEMALLOC;
2901 +
2902 +       /*
2903 +        * Copy the parameters into kernel space.
2904 +        */
2905 +       r = copy_params(user, &param);
2906 +       if (r) {
2907 +               current->flags &= ~PF_MEMALLOC;
2908 +               return r;
2909 +       }
2910 +
2911 +       r = validate_params(cmd, param);
2912 +       if (r)
2913 +               goto out;
2914 +
2915 +       param_size = param->data_size;
2916 +       param->data_size = sizeof(*param);
2917 +       r = fn(param, param_size);
2918 +
2919 +       /*
2920 +        * Copy the results back to userland.
2921 +        */
2922 +       if (!r && copy_to_user(user, param, param->data_size))
2923 +               r = -EFAULT;
2924 +
2925 + out:
2926 +       free_params(param);
2927 +       current->flags &= ~PF_MEMALLOC;
2928 +       return r;
2929 +}
2930 +
2931 +static struct file_operations _ctl_fops = {
2932 +       .ioctl   = ctl_ioctl,
2933 +       .owner   = THIS_MODULE,
2934 +};
2935 +
2936 +static devfs_handle_t _ctl_handle;
2937 +
2938 +static struct miscdevice _dm_misc = {
2939 +       .minor = MISC_DYNAMIC_MINOR,
2940 +       .name  = DM_NAME,
2941 +       .fops  = &_ctl_fops
2942 +};
2943 +
2944 +/*
2945 + * Create misc character device and link to DM_DIR/control.
2946 + */
2947 +int __init dm_interface_init(void)
2948 +{
2949 +       int r;
2950 +       char rname[64];
2951 +
2952 +       r = dm_hash_init();
2953 +       if (r)
2954 +               return r;
2955 +
2956 +       r = misc_register(&_dm_misc);
2957 +       if (r) {
2958 +               DMERR("misc_register failed for control device");
2959 +               dm_hash_exit();
2960 +               return r;
2961 +       }
2962 +
2963 +       r = devfs_generate_path(_dm_misc.devfs_handle, rname + 3,
2964 +                               sizeof rname - 3);
2965 +       if (r == -ENOSYS)
2966 +               goto done;      /* devfs not present */
2967 +
2968 +       if (r < 0) {
2969 +               DMERR("devfs_generate_path failed for control device");
2970 +               goto failed;
2971 +       }
2972 +
2973 +       strncpy(rname + r, "../", 3);
2974 +       r = devfs_mk_symlink(NULL, DM_DIR "/control",
2975 +                            DEVFS_FL_DEFAULT, rname + r, &_ctl_handle, NULL);
2976 +       if (r) {
2977 +               DMERR("devfs_mk_symlink failed for control device");
2978 +               goto failed;
2979 +       }
2980 +       devfs_auto_unregister(_dm_misc.devfs_handle, _ctl_handle);
2981 +
2982 +      done:
2983 +       DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR,
2984 +              DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA,
2985 +              DM_DRIVER_EMAIL);
2986 +       return 0;
2987 +
2988 +      failed:
2989 +       misc_deregister(&_dm_misc);
2990 +       dm_hash_exit();
2991 +       return r;
2992 +}
2993 +
2994 +void dm_interface_exit(void)
2995 +{
2996 +       if (misc_deregister(&_dm_misc) < 0)
2997 +               DMERR("misc_deregister failed for control device");
2998 +
2999 +       dm_hash_exit();
3000 +}
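To illustrate the handshake implemented by check_version() and ctl_ioctl() above, a hypothetical userland probe of the interface version could look like this (the control-node path is an assumption for a non-devfs system, and the caller needs CAP_SYS_ADMIN):

	/* Userland sketch: ask the kernel which dm ioctl interface it speaks. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/dm-ioctl.h>

	int main(void)
	{
		struct dm_ioctl dmi;
		int fd = open("/dev/mapper/control", O_RDWR);	/* path is an assumption */

		if (fd < 0)
			return 1;

		memset(&dmi, 0, sizeof(dmi));
		dmi.version[0] = DM_VERSION_MAJOR;	/* major must match exactly */
		dmi.version[1] = DM_VERSION_MINOR;	/* must not be newer than the kernel's */
		dmi.version[2] = DM_VERSION_PATCHLEVEL;
		dmi.data_size = sizeof(dmi);		/* required by copy_params() for other commands */

		if (ioctl(fd, DM_VERSION, &dmi))
			return 1;

		/* check_version() has copied the kernel's version back out */
		printf("dm ioctl interface %u.%u.%u\n",
		       dmi.version[0], dmi.version[1], dmi.version[2]);
		return 0;
	}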
3001 --- diff/drivers/md/dm-linear.c 1970-01-01 01:00:00.000000000 +0100
3002 +++ source/drivers/md/dm-linear.c       2003-10-16 10:44:23.000000000 +0100
3003 @@ -0,0 +1,123 @@
3004 +/*
3005 + * Copyright (C) 2001 Sistina Software (UK) Limited.
3006 + *
3007 + * This file is released under the GPL.
3008 + */
3009 +
3010 +#include "dm.h"
3011 +
3012 +#include <linux/module.h>
3013 +#include <linux/init.h>
3014 +#include <linux/blkdev.h>
3015 +#include <linux/slab.h>
3016 +
3017 +/*
3018 + * Linear: maps a linear range of a device.
3019 + */
3020 +struct linear_c {
3021 +       struct dm_dev *dev;
3022 +       sector_t start;
3023 +};
3024 +
3025 +/*
3026 + * Construct a linear mapping: <dev_path> <offset>
3027 + */
3028 +static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
3029 +{
3030 +       struct linear_c *lc;
3031 +
3032 +       if (argc != 2) {
3033 +               ti->error = "dm-linear: Invalid argument count";
3034 +               return -EINVAL;
3035 +       }
3036 +
3037 +       lc = kmalloc(sizeof(*lc), GFP_KERNEL);
3038 +       if (lc == NULL) {
3039 +               ti->error = "dm-linear: Cannot allocate linear context";
3040 +               return -ENOMEM;
3041 +       }
3042 +
3043 +       if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) {
3044 +               ti->error = "dm-linear: Invalid device sector";
3045 +               goto bad;
3046 +       }
3047 +
3048 +       if (dm_get_device(ti, argv[0], lc->start, ti->len,
3049 +                         dm_table_get_mode(ti->table), &lc->dev)) {
3050 +               ti->error = "dm-linear: Device lookup failed";
3051 +               goto bad;
3052 +       }
3053 +
3054 +       ti->private = lc;
3055 +       return 0;
3056 +
3057 +      bad:
3058 +       kfree(lc);
3059 +       return -EINVAL;
3060 +}
3061 +
3062 +static void linear_dtr(struct dm_target *ti)
3063 +{
3064 +       struct linear_c *lc = (struct linear_c *) ti->private;
3065 +
3066 +       dm_put_device(ti, lc->dev);
3067 +       kfree(lc);
3068 +}
3069 +
3070 +static int linear_map(struct dm_target *ti, struct buffer_head *bh, int rw,
3071 +                     union map_info *map_context)
3072 +{
3073 +       struct linear_c *lc = (struct linear_c *) ti->private;
3074 +
3075 +       bh->b_rdev = lc->dev->dev;
3076 +       bh->b_rsector = lc->start + (bh->b_rsector - ti->begin);
3077 +
3078 +       return 1;
3079 +}
3080 +
3081 +static int linear_status(struct dm_target *ti, status_type_t type,
3082 +                        char *result, unsigned int maxlen)
3083 +{
3084 +       struct linear_c *lc = (struct linear_c *) ti->private;
3085 +       kdev_t kdev;
3086 +
3087 +       switch (type) {
3088 +       case STATUSTYPE_INFO:
3089 +               result[0] = '\0';
3090 +               break;
3091 +
3092 +       case STATUSTYPE_TABLE:
3093 +               kdev = to_kdev_t(lc->dev->bdev->bd_dev);
3094 +               snprintf(result, maxlen, "%s " SECTOR_FORMAT,
3095 +                        dm_kdevname(kdev), lc->start);
3096 +               break;
3097 +       }
3098 +       return 0;
3099 +}
3100 +
3101 +static struct target_type linear_target = {
3102 +       .name   = "linear",
3103 +       .module = THIS_MODULE,
3104 +       .ctr    = linear_ctr,
3105 +       .dtr    = linear_dtr,
3106 +       .map    = linear_map,
3107 +       .status = linear_status,
3108 +};
3109 +
3110 +int __init dm_linear_init(void)
3111 +{
3112 +       int r = dm_register_target(&linear_target);
3113 +
3114 +       if (r < 0)
3115 +               DMERR("linear: register failed %d", r);
3116 +
3117 +       return r;
3118 +}
3119 +
3120 +void dm_linear_exit(void)
3121 +{
3122 +       int r = dm_unregister_target(&linear_target);
3123 +
3124 +       if (r < 0)
3125 +               DMERR("linear: unregister failed %d", r);
3126 +}
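As a concrete (hypothetical) illustration of linear_ctr() and linear_map() above: a table line of the form "<start> <length> linear <dev_path> <offset>", say "0 500 linear /dev/sda1 4096", hands the constructor argv = { "/dev/sda1", "4096" }, and every request is then remapped by a constant shift:

	/*
	 * Sketch, assuming ti->begin = 0, ti->len = 500 and lc->start = 4096:
	 * a buffer_head arriving for sector 5 of the mapped device leaves
	 * linear_map() with
	 *
	 *	bh->b_rdev    = lc->dev->dev;			(/dev/sda1)
	 *	bh->b_rsector = 4096 + (5 - 0) = 4101;
	 *
	 * i.e. the whole target is a fixed offset into the underlying device.
	 */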
3127 --- diff/drivers/md/dm-log.c    1970-01-01 01:00:00.000000000 +0100
3128 +++ source/drivers/md/dm-log.c  2003-10-16 10:44:23.000000000 +0100
3129 @@ -0,0 +1,302 @@
3130 +/*
3131 + * Copyright (C) 2003 Sistina Software
3132 + *
3133 + * This file is released under the LGPL.
3134 + */
3135 +
3136 +#include <linux/init.h>
3137 +#include <linux/slab.h>
3138 +#include <linux/module.h>
3139 +#include <linux/vmalloc.h>
3140 +
3141 +#include "dm-log.h"
3142 +#include "dm-io.h"
3143 +
3144 +static LIST_HEAD(_log_types);
3145 +static spinlock_t _lock = SPIN_LOCK_UNLOCKED;
3146 +
3147 +int dm_register_dirty_log_type(struct dirty_log_type *type)
3148 +{
3149 +       spin_lock(&_lock);
3150 +       type->use_count = 0;
3151 +       if (type->module)
3152 +               __MOD_INC_USE_COUNT(type->module);
3153 +
3154 +       list_add(&type->list, &_log_types);
3155 +       spin_unlock(&_lock);
3156 +
3157 +       return 0;
3158 +}
3159 +
3160 +int dm_unregister_dirty_log_type(struct dirty_log_type *type)
3161 +{
3162 +       spin_lock(&_lock);
3163 +
3164 +       if (type->use_count)
3165 +               DMWARN("Attempt to unregister a log type that is still in use");
3166 +       else {
3167 +               list_del(&type->list);
3168 +               if (type->module)
3169 +                       __MOD_DEC_USE_COUNT(type->module);
3170 +       }
3171 +
3172 +       spin_unlock(&_lock);
3173 +
3174 +       return 0;
3175 +}
3176 +
3177 +static struct dirty_log_type *get_type(const char *type_name)
3178 +{
3179 +       struct dirty_log_type *type;
3180 +       struct list_head *tmp;
3181 +
3182 +       spin_lock(&_lock);
3183 +       list_for_each (tmp, &_log_types) {
3184 +               type = list_entry(tmp, struct dirty_log_type, list);
3185 +               if (!strcmp(type_name, type->name)) {
3186 +                       type->use_count++;
3187 +                       spin_unlock(&_lock);
3188 +                       return type;
3189 +               }
3190 +       }
3191 +
3192 +       spin_unlock(&_lock);
3193 +       return NULL;
3194 +}
3195 +
3196 +static void put_type(struct dirty_log_type *type)
3197 +{
3198 +       spin_lock(&_lock);
3199 +       type->use_count--;
3200 +       spin_unlock(&_lock);
3201 +}
3202 +
3203 +struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
3204 +                                     unsigned int argc, char **argv)
3205 +{
3206 +       struct dirty_log_type *type;
3207 +       struct dirty_log *log;
3208 +
3209 +       log = kmalloc(sizeof(*log), GFP_KERNEL);
3210 +       if (!log)
3211 +               return NULL;
3212 +
3213 +       type = get_type(type_name);
3214 +       if (!type) {
3215 +               kfree(log);
3216 +               return NULL;
3217 +       }
3218 +
3219 +       log->type = type;
3220 +       if (type->ctr(log, dev_size, argc, argv)) {
3221 +               kfree(log);
3222 +               put_type(type);
3223 +               return NULL;
3224 +       }
3225 +
3226 +       return log;
3227 +}
3228 +
3229 +void dm_destroy_dirty_log(struct dirty_log *log)
3230 +{
3231 +       log->type->dtr(log);
3232 +       put_type(log->type);
3233 +       kfree(log);
3234 +}
3235 +
3236 +
3237 +/*-----------------------------------------------------------------
3238 + * In core log, ie. trivial, non-persistent
3239 + *
3240 + * For now we'll keep this simple and just have 2 bitsets, one
3241 + * for clean/dirty, the other for sync/nosync.  The sync bitset
3242 + * will be freed when everything is in sync.
3243 + *
3244 + * FIXME: problems with a 64bit sector_t
3245 + *---------------------------------------------------------------*/
3246 +struct core_log {
3247 +       sector_t region_size;
3248 +       unsigned int region_count;
3249 +       unsigned long *clean_bits;
3250 +       unsigned long *sync_bits;
3251 +       unsigned long *recovering_bits; /* FIXME: this seems excessive */
3252 +
3253 +       int sync_search;
3254 +};
3255 +
3256 +static int core_ctr(struct dirty_log *log, sector_t dev_size,
3257 +                   unsigned int argc, char **argv)
3258 +{
3259 +       struct core_log *clog;
3260 +       sector_t region_size;
3261 +       unsigned int region_count;
3262 +       size_t bitset_size;
3263 +
3264 +       if (argc != 1) {
3265 +               DMWARN("wrong number of arguments to core_log");
3266 +               return -EINVAL;
3267 +       }
3268 +
3269 +       if (sscanf(argv[0], SECTOR_FORMAT, &region_size) != 1) {
3270 +               DMWARN("invalid region size string");
3271 +               return -EINVAL;
3272 +       }
3273 +
3274 +       region_count = dm_div_up(dev_size, region_size);
3275 +
3276 +       clog = kmalloc(sizeof(*clog), GFP_KERNEL);
3277 +       if (!clog) {
3278 +               DMWARN("couldn't allocate core log");
3279 +               return -ENOMEM;
3280 +       }
3281 +
3282 +       clog->region_size = region_size;
3283 +       clog->region_count = region_count;
3284 +
3285 +       bitset_size = dm_round_up(region_count >> 3, sizeof(*clog->clean_bits));
3286 +       clog->clean_bits = vmalloc(bitset_size);
3287 +       if (!clog->clean_bits) {
3288 +               DMWARN("couldn't allocate clean bitset");
3289 +               kfree(clog);
3290 +               return -ENOMEM;
3291 +       }
3292 +       memset(clog->clean_bits, -1, bitset_size);
3293 +
3294 +       clog->sync_bits = vmalloc(bitset_size);
3295 +       if (!clog->sync_bits) {
3296 +               DMWARN("couldn't allocate sync bitset");
3297 +               vfree(clog->clean_bits);
3298 +               kfree(clog);
3299 +               return -ENOMEM;
3300 +       }
3301 +       memset(clog->sync_bits, 0, bitset_size);
3302 +
3303 +       clog->recovering_bits = vmalloc(bitset_size);
3304 +       if (!clog->recovering_bits) {
3305 +               DMWARN("couldn't allocate recovering bitset");
3306 +               vfree(clog->sync_bits);
3307 +               vfree(clog->clean_bits);
3308 +               kfree(clog);
3309 +               return -ENOMEM;
3310 +       }
3311 +       memset(clog->recovering_bits, 0, bitset_size);
3312 +       clog->sync_search = 0;
3313 +       log->context = clog;
3314 +       return 0;
3315 +}
3316 +
3317 +static void core_dtr(struct dirty_log *log)
3318 +{
3319 +       struct core_log *clog = (struct core_log *) log->context;
3320 +       vfree(clog->clean_bits);
3321 +       vfree(clog->sync_bits);
3322 +       vfree(clog->recovering_bits);
3323 +       kfree(clog);
3324 +}
3325 +
3326 +static sector_t core_get_region_size(struct dirty_log *log)
3327 +{
3328 +       struct core_log *clog = (struct core_log *) log->context;
3329 +       return clog->region_size;
3330 +}
3331 +
3332 +static int core_is_clean(struct dirty_log *log, region_t region)
3333 +{
3334 +       struct core_log *clog = (struct core_log *) log->context;
3335 +       return test_bit(region, clog->clean_bits);
3336 +}
3337 +
3338 +static int core_in_sync(struct dirty_log *log, region_t region, int block)
3339 +{
3340 +       struct core_log *clog = (struct core_log *) log->context;
3341 +
3342 +       return test_bit(region, clog->sync_bits) ? 1 : 0;
3343 +}
3344 +
3345 +static int core_flush(struct dirty_log *log)
3346 +{
3347 +       /* no op */
3348 +       return 0;
3349 +}
3350 +
3351 +static void core_mark_region(struct dirty_log *log, region_t region)
3352 +{
3353 +       struct core_log *clog = (struct core_log *) log->context;
3354 +       clear_bit(region, clog->clean_bits);
3355 +}
3356 +
3357 +static void core_clear_region(struct dirty_log *log, region_t region)
3358 +{
3359 +       struct core_log *clog = (struct core_log *) log->context;
3360 +       set_bit(region, clog->clean_bits);
3361 +}
3362 +
3363 +static int core_get_resync_work(struct dirty_log *log, region_t *region)
3364 +{
3365 +       struct core_log *clog = (struct core_log *) log->context;
3366 +
3367 +       if (clog->sync_search >= clog->region_count)
3368 +               return 0;
3369 +
3370 +       do {
3371 +               *region = find_next_zero_bit(clog->sync_bits,
3372 +                                            clog->region_count,
3373 +                                            clog->sync_search);
3374 +               clog->sync_search = *region + 1;
3375 +
3376 +               if (*region == clog->region_count)
3377 +                       return 0;
3378 +
3379 +       } while (test_bit(*region, clog->recovering_bits));
3380 +
3381 +       set_bit(*region, clog->recovering_bits);
3382 +       return 1;
3383 +}
3384 +
3385 +static void core_complete_resync_work(struct dirty_log *log, region_t region,
3386 +                                     int success)
3387 +{
3388 +       struct core_log *clog = (struct core_log *) log->context;
3389 +
3390 +       clear_bit(region, clog->recovering_bits);
3391 +       if (success)
3392 +               set_bit(region, clog->sync_bits);
3393 +}
3394 +
3395 +static struct dirty_log_type _core_type = {
3396 +       .name = "core",
3397 +
3398 +       .ctr = core_ctr,
3399 +       .dtr = core_dtr,
3400 +       .get_region_size = core_get_region_size,
3401 +       .is_clean = core_is_clean,
3402 +       .in_sync = core_in_sync,
3403 +       .flush = core_flush,
3404 +       .mark_region = core_mark_region,
3405 +       .clear_region = core_clear_region,
3406 +       .get_resync_work = core_get_resync_work,
3407 +       .complete_resync_work = core_complete_resync_work
3408 +};
3409 +
3410 +__init int dm_dirty_log_init(void)
3411 +{
3412 +       int r;
3413 +
3414 +       r = dm_register_dirty_log_type(&_core_type);
3415 +       if (r)
3416 +               DMWARN("couldn't register core log");
3417 +
3418 +       return r;
3419 +}
3420 +
3421 +void dm_dirty_log_exit(void)
3422 +{
3423 +       dm_unregister_dirty_log_type(&_core_type);
3424 +}
3425 +
3426 +EXPORT_SYMBOL(dm_register_dirty_log_type);
3427 +EXPORT_SYMBOL(dm_unregister_dirty_log_type);
3428 +EXPORT_SYMBOL(dm_dirty_log_init);
3429 +EXPORT_SYMBOL(dm_dirty_log_exit);
3430 +EXPORT_SYMBOL(dm_create_dirty_log);
3431 +EXPORT_SYMBOL(dm_destroy_dirty_log);
3432 --- diff/drivers/md/dm-log.h    1970-01-01 01:00:00.000000000 +0100
3433 +++ source/drivers/md/dm-log.h  2003-10-16 10:44:23.000000000 +0100
3434 @@ -0,0 +1,112 @@
3435 +/*
3436 + * Copyright (C) 2003 Sistina Software
3437 + *
3438 + * This file is released under the LGPL.
3439 + */
3440 +
3441 +#ifndef DM_DIRTY_LOG
3442 +#define DM_DIRTY_LOG
3443 +
3444 +#include "dm.h"
3445 +
3446 +typedef sector_t region_t;
3447 +
3448 +struct dirty_log_type;
3449 +
3450 +struct dirty_log {
3451 +       struct dirty_log_type *type;
3452 +       void *context;
3453 +};
3454 +
3455 +struct dirty_log_type {
3456 +       struct list_head list;
3457 +       const char *name;
3458 +       struct module *module;
3459 +       unsigned int use_count;
3460 +
3461 +       int (*ctr)(struct dirty_log *log, sector_t dev_size,
3462 +                  unsigned int argc, char **argv);
3463 +       void (*dtr)(struct dirty_log *log);
3464 +
3465 +       /*
3466 +        * Retrieves the smallest size of region that the log can
3467 +        * deal with.
3468 +        */
3469 +       sector_t (*get_region_size)(struct dirty_log *log);
3470 +
3471 +       /*
3472 +        * A predicate to say whether a region is clean or not.
3473 +        * May block.
3474 +        */
3475 +       int (*is_clean)(struct dirty_log *log, region_t region);
3476 +
3477 +       /*
3478 +        *  Returns: 0, 1, -EWOULDBLOCK, < 0
3479 +        *
3480 +        * A predicate function to check whether the area given by
3481 +        * [sector, sector + len) is in sync.
3482 +        *
3483 +        * If -EWOULDBLOCK is returned the state of the region is
3484 +        * unknown; typically this will result in a read being
3485 +        * passed to a daemon to deal with, since a daemon is
3486 +        * allowed to block.
3487 +        */
3488 +       int (*in_sync)(struct dirty_log *log, region_t region, int can_block);
3489 +
3490 +       /*
3491 +        * Flush the current log state (eg, to disk).  This
3492 +        * function may block.
3493 +        */
3494 +       int (*flush)(struct dirty_log *log);
3495 +
3496 +       /*
3497 +        * Mark an area as clean or dirty.  These functions may
3498 +        * block, though for performance reasons blocking should
3499 +        * be extremely rare (eg, allocating another chunk of
3500 +        * memory for some reason).
3501 +        */
3502 +       void (*mark_region)(struct dirty_log *log, region_t region);
3503 +       void (*clear_region)(struct dirty_log *log, region_t region);
3504 +
3505 +       /*
3506 +        * Returns: <0 (error), 0 (no region), 1 (region)
3507 +        *
3508 +        * The mirrord will need to perform recovery on regions of
3509 +        * the mirror that are in the NOSYNC state.  This
3510 +        * function asks the log to tell the caller about the
3511 +        * next region that this machine should recover.
3512 +        *
3513 +        * Do not confuse this function with 'in_sync()': one
3514 +        * tells you if an area is synchronised; the other
3515 +        * assigns recovery work.
3516 +        */
3517 +       int (*get_resync_work)(struct dirty_log *log, region_t *region);
3518 +
3519 +       /*
3520 +        * This notifies the log that the resync of an area has
3521 +        * been completed.  The log should then mark this region
3522 +        * as CLEAN.
3523 +        */
3524 +       void (*complete_resync_work)(struct dirty_log *log,
3525 +                                    region_t region, int success);
3526 +};
3527 +
3528 +int dm_register_dirty_log_type(struct dirty_log_type *type);
3529 +int dm_unregister_dirty_log_type(struct dirty_log_type *type);
3530 +
3531 +
3532 +/*
3533 + * Make sure you use these two functions, rather than calling
3534 + * the log type's ctr/dtr methods directly.
3535 + */
3536 +struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
3537 +                                     unsigned int argc, char **argv);
3538 +void dm_destroy_dirty_log(struct dirty_log *log);
3539 +
3540 +/*
3541 + * init/exit functions.
3542 + */
3543 +int dm_dirty_log_init(void);
3544 +void dm_dirty_log_exit(void);
3545 +
3546 +#endif
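
The interface above is consumed through dm_create_dirty_log()/dm_destroy_dirty_log() rather than by calling a log type's ctr/dtr directly.  A minimal sketch of such a consumer, using only the declarations in this header (the "core" log name comes from the mirror target below; the single region-size parameter and its value are assumptions for illustration):

	/*
	 * Sketch only: create a "core" dirty log, mark and clean one
	 * region around a write, then tear the log down again.
	 */
	static int dirty_log_example(sector_t dev_size)
	{
		char *params[] = { "1024" };	/* region size in sectors (assumed) */
		struct dirty_log *log;
		region_t region = 0;

		log = dm_create_dirty_log("core", dev_size, 1, params);
		if (!log)
			return -EINVAL;

		log->type->mark_region(log, region);	/* about to write to region 0 */
		log->type->flush(log);

		/* ... io to the region completes ... */

		log->type->clear_region(log, region);	/* region is clean again */
		log->type->flush(log);

		dm_destroy_dirty_log(log);
		return 0;
	}
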
3547 --- diff/drivers/md/dm-raid1.c  1970-01-01 01:00:00.000000000 +0100
3548 +++ source/drivers/md/dm-raid1.c        2003-10-16 10:44:23.000000000 +0100
3549 @@ -0,0 +1,1297 @@
3550 +/*
3551 + * Copyright (C) 2003 Sistina Software Limited.
3552 + *
3553 + * This file is released under the GPL.
3554 + */
3555 +
3556 +#include "dm.h"
3557 +#include "dm-daemon.h"
3558 +#include "dm-io.h"
3559 +#include "dm-log.h"
3560 +#include "kcopyd.h"
3561 +
3562 +#include <linux/ctype.h>
3563 +#include <linux/init.h>
3564 +#include <linux/mempool.h>
3565 +#include <linux/module.h>
3566 +#include <linux/pagemap.h>
3567 +#include <linux/slab.h>
3568 +#include <linux/time.h>
3569 +#include <linux/vmalloc.h>
3570 +
3571 +static struct dm_daemon _kmirrord;
3572 +
3573 +/*-----------------------------------------------------------------
3574 + * buffer lists:
3575 + *
3576 + * We play with singly linked lists of buffers, but we want to be
3577 + * careful to add new buffers to the back of the list, to avoid
3578 + * buffers being starved of attention.
3579 + *---------------------------------------------------------------*/
3580 +struct buffer_list {
3581 +       struct buffer_head *head;
3582 +       struct buffer_head *tail;
3583 +};
3584 +
3585 +static inline void buffer_list_init(struct buffer_list *bl)
3586 +{
3587 +       bl->head = bl->tail = NULL;
3588 +}
3589 +
3590 +static inline void buffer_list_add(struct buffer_list *bl,
3591 +                                  struct buffer_head *bh)
3592 +{
3593 +       bh->b_reqnext = NULL;
3594 +
3595 +       if (bl->tail) {
3596 +               bl->tail->b_reqnext = bh;
3597 +               bl->tail = bh;
3598 +       } else
3599 +               bl->head = bl->tail = bh;
3600 +}
3601 +
3602 +static struct buffer_head *buffer_list_pop(struct buffer_list *bl)
3603 +{
3604 +       struct buffer_head *bh = bl->head;
3605 +
3606 +       if (bh) {
3607 +               bl->head = bl->head->b_reqnext;
3608 +               if (!bl->head)
3609 +                       bl->tail = NULL;
3610 +
3611 +               bh->b_reqnext = NULL;
3612 +       }
3613 +
3614 +       return bh;
3615 +}
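
Because buffer_list_add() appends at the tail and buffer_list_pop() removes from the head, the list behaves as a FIFO, which is what keeps old buffers from being starved.  A tiny illustrative sketch (not part of the patch) of moving buffers between two lists while preserving that order:

	/* Illustrative only: drain 'from' into 'to', oldest buffer first. */
	static void buffer_list_splice_example(struct buffer_list *from,
					       struct buffer_list *to)
	{
		struct buffer_head *bh;

		while ((bh = buffer_list_pop(from)))
			buffer_list_add(to, bh);
	}
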
3616 +
3617 +/*-----------------------------------------------------------------
3618 + * Region hash
3619 + *
3620 + * The mirror splits itself up into discrete regions.  Each
3621 + * region can be in one of three states: clean, dirty,
3622 + * nosync.  There is no need to put clean regions in the hash.
3623 + *
3624 + * In addition to being present in the hash table a region _may_
3625 + * be present on one of three lists.
3626 + *
3627 + *   clean_regions: Regions on this list have no io pending to
3628 + *   them, they are in sync, we are no longer interested in them,
3629 + *   they are dull.  rh_update_states() will remove them from the
3630 + *   hash table.
3631 + *
3632 + *   quiesced_regions: These regions have been spun down, ready
3633 + *   for recovery.  rh_recovery_start() will remove regions from
3634 + *   this list and hand them to kmirrord, which will schedule the
3635 + *   recovery io with kcopyd.
3636 + *
3637 + *   recovered_regions: Regions that kcopyd has successfully
3638 + *   recovered.  rh_update_states() will now schedule any delayed
3639 + *   io, up the recovery_count, and remove the region from the
3640 + *   hash.
3641 + *
3642 + * There are 2 locks:
3643 + *   A rw spin lock 'hash_lock' protects just the hash table,
3644 + *   this is never held in write mode from interrupt context,
3645 + *   which I believe means that we only have to disable irqs when
3646 + *   doing a write lock.
3647 + *
3648 + *   An ordinary spin lock 'region_lock' that protects the three
3649 + *   lists in the region_hash, along with the 'state', 'list' and
3650 + *   'delayed_bhs' fields of the regions.  This is used from irq
3651 + *   context, so all other uses will have to suspend local irqs.
3652 + *---------------------------------------------------------------*/
3653 +struct mirror_set;
3654 +struct region_hash {
3655 +       struct mirror_set *ms;
3656 +       sector_t region_size;
3657 +
3658 +       /* holds persistent region state */
3659 +       struct dirty_log *log;
3660 +
3661 +       /* hash table */
3662 +       rwlock_t hash_lock;
3663 +       mempool_t *region_pool;
3664 +       unsigned int mask;
3665 +       unsigned int nr_buckets;
3666 +       struct list_head *buckets;
3667 +
3668 +       spinlock_t region_lock;
3669 +       struct semaphore recovery_count;
3670 +       struct list_head clean_regions;
3671 +       struct list_head quiesced_regions;
3672 +       struct list_head recovered_regions;
3673 +};
3674 +
3675 +enum {
3676 +       RH_CLEAN,
3677 +       RH_DIRTY,
3678 +       RH_NOSYNC,
3679 +       RH_RECOVERING
3680 +};
3681 +
3682 +struct region {
3683 +       struct region_hash *rh; /* FIXME: can we get rid of this ? */
3684 +       region_t key;
3685 +       int state;
3686 +
3687 +       struct list_head hash_list;
3688 +       struct list_head list;
3689 +
3690 +       atomic_t pending;
3691 +       struct buffer_head *delayed_bhs;
3692 +};
3693 +
3694 +/*
3695 + * Conversion fns
3696 + */
3697 +static inline region_t bh_to_region(struct region_hash *rh,
3698 +                                   struct buffer_head *bh)
3699 +{
3700 +       return bh->b_rsector / rh->region_size;
3701 +}
3702 +
3703 +static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
3704 +{
3705 +       return region * rh->region_size;
3706 +}
3707 +
3708 +/* FIXME move this */
3709 +static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw);
3710 +
3711 +static void *region_alloc(int gfp_mask, void *pool_data)
3712 +{
3713 +       return kmalloc(sizeof(struct region), gfp_mask);
3714 +}
3715 +
3716 +static void region_free(void *element, void *pool_data)
3717 +{
3718 +       kfree(element);
3719 +}
3720 +
3721 +#define MIN_REGIONS 64
3722 +#define MAX_RECOVERY 1
3723 +static int rh_init(struct region_hash *rh, struct mirror_set *ms,
3724 +                  struct dirty_log *log, sector_t region_size,
3725 +                  region_t nr_regions)
3726 +{
3727 +       unsigned int nr_buckets, max_buckets;
3728 +       size_t i;
3729 +
3730 +       /*
3731 +        * Calculate a suitable number of buckets for our hash
3732 +        * table.
3733 +        */
3734 +       max_buckets = nr_regions >> 6;
3735 +       for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
3736 +               ;
3737 +       nr_buckets >>= 1;
3738 +
3739 +       rh->ms = ms;
3740 +       rh->log = log;
3741 +       rh->region_size = region_size;
3742 +       rwlock_init(&rh->hash_lock);
3743 +       rh->mask = nr_buckets - 1;
3744 +       rh->nr_buckets = nr_buckets;
3745 +
3746 +       rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
3747 +       if (!rh->buckets) {
3748 +               DMERR("unable to allocate region hash memory");
3749 +               return -ENOMEM;
3750 +       }
3751 +
3752 +       for (i = 0; i < nr_buckets; i++)
3753 +               INIT_LIST_HEAD(rh->buckets + i);
3754 +
3755 +       spin_lock_init(&rh->region_lock);
3756 +       sema_init(&rh->recovery_count, 0);
3757 +       INIT_LIST_HEAD(&rh->clean_regions);
3758 +       INIT_LIST_HEAD(&rh->quiesced_regions);
3759 +       INIT_LIST_HEAD(&rh->recovered_regions);
3760 +
3761 +       rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
3762 +                                        region_free, NULL);
3763 +       if (!rh->region_pool) {
3764 +               vfree(rh->buckets);
3765 +               rh->buckets = NULL;
3766 +               return -ENOMEM;
3767 +       }
3768 +
3769 +       return 0;
3770 +}
3771 +
3772 +static void rh_exit(struct region_hash *rh)
3773 +{
3774 +       unsigned int h;
3775 +       struct region *reg;
3776 +       struct list_head *tmp, *tmp2;
3777 +
3778 +       BUG_ON(!list_empty(&rh->quiesced_regions));
3779 +       for (h = 0; h < rh->nr_buckets; h++) {
3780 +               list_for_each_safe (tmp, tmp2, rh->buckets + h) {
3781 +                       reg = list_entry(tmp, struct region, hash_list);
3782 +                       BUG_ON(atomic_read(&reg->pending));
3783 +                       mempool_free(reg, rh->region_pool);
3784 +               }
3785 +       }
3786 +
3787 +       if (rh->log)
3788 +               dm_destroy_dirty_log(rh->log);
3789 +       if (rh->region_pool)
3790 +               mempool_destroy(rh->region_pool);
3791 +       vfree(rh->buckets);
3792 +}
3793 +
3794 +#define RH_HASH_MULT 2654435387U
3795 +
3796 +static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
3797 +{
3798 +       return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
3799 +}
3800 +
3801 +static struct region *__rh_lookup(struct region_hash *rh, region_t region)
3802 +{
3803 +       struct region *reg;
3804 +
3805 +       list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
3806 +               if (reg->key == region)
3807 +                       return reg;
3808 +
3809 +       return NULL;
3810 +}
3811 +
3812 +static void __rh_insert(struct region_hash *rh, struct region *reg)
3813 +{
3814 +       unsigned int h = rh_hash(rh, reg->key);
3815 +       list_add(&reg->hash_list, rh->buckets + h);
3816 +}
3817 +
3818 +static struct region *__rh_alloc(struct region_hash *rh, region_t region)
3819 +{
3820 +       struct region *reg, *nreg;
3821 +
3822 +       read_unlock(&rh->hash_lock);
3823 +       nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
3824 +       nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
3825 +               RH_CLEAN : RH_NOSYNC;
3826 +       nreg->rh = rh;
3827 +       nreg->key = region;
3828 +
3829 +       INIT_LIST_HEAD(&nreg->list);
3830 +
3831 +       atomic_set(&nreg->pending, 0);
3832 +       nreg->delayed_bhs = NULL;
3833 +       write_lock_irq(&rh->hash_lock);
3834 +
3835 +       reg = __rh_lookup(rh, region);
3836 +       if (reg)
3837 +               /* we lost the race */
3838 +               mempool_free(nreg, rh->region_pool);
3839 +
3840 +       else {
3841 +               __rh_insert(rh, nreg);
3842 +               if (nreg->state == RH_CLEAN) {
3843 +                       spin_lock_irq(&rh->region_lock);
3844 +                       list_add(&nreg->list, &rh->clean_regions);
3845 +                       spin_unlock_irq(&rh->region_lock);
3846 +               }
3847 +               reg = nreg;
3848 +       }
3849 +       write_unlock_irq(&rh->hash_lock);
3850 +       read_lock(&rh->hash_lock);
3851 +
3852 +       return reg;
3853 +}
3854 +
3855 +static inline struct region *__rh_find(struct region_hash *rh, region_t region)
3856 +{
3857 +       struct region *reg;
3858 +
3859 +       reg = __rh_lookup(rh, region);
3860 +       if (!reg)
3861 +               reg = __rh_alloc(rh, region);
3862 +
3863 +       return reg;
3864 +}
3865 +
3866 +static int rh_state(struct region_hash *rh, region_t region, int may_block)
3867 +{
3868 +       int r;
3869 +       struct region *reg;
3870 +
3871 +       read_lock(&rh->hash_lock);
3872 +       reg = __rh_lookup(rh, region);
3873 +       read_unlock(&rh->hash_lock);
3874 +
3875 +       if (reg)
3876 +               return reg->state;
3877 +
3878 +       /*
3879 +        * The region wasn't in the hash, so we fall back to the
3880 +        * dirty log.
3881 +        */
3882 +       r = rh->log->type->in_sync(rh->log, region, may_block);
3883 +
3884 +       /*
3885 +        * Any error from the dirty log (eg. -EWOULDBLOCK) gets
3886 +        * taken as RH_NOSYNC.
3887 +        */
3888 +       return r == 1 ? RH_CLEAN : RH_NOSYNC;
3889 +}
3890 +
3891 +static inline int rh_in_sync(struct region_hash *rh,
3892 +                            region_t region, int may_block)
3893 +{
3894 +       int state = rh_state(rh, region, may_block);
3895 +       return state == RH_CLEAN || state == RH_DIRTY;
3896 +}
3897 +
3898 +static void dispatch_buffers(struct mirror_set *ms, struct buffer_head *bh)
3899 +{
3900 +       struct buffer_head *nbh;
3901 +
3902 +       while (bh) {
3903 +               nbh = bh->b_reqnext;
3904 +               queue_bh(ms, bh, WRITE);
3905 +               bh = nbh;
3906 +       }
3907 +}
3908 +
3909 +static void rh_update_states(struct region_hash *rh)
3910 +{
3911 +       struct list_head *tmp, *tmp2;
3912 +       struct region *reg;
3913 +
3914 +       LIST_HEAD(clean);
3915 +       LIST_HEAD(recovered);
3916 +
3917 +       /*
3918 +        * Quickly grab the lists.
3919 +        */
3920 +       write_lock_irq(&rh->hash_lock);
3921 +       spin_lock(&rh->region_lock);
3922 +       if (!list_empty(&rh->clean_regions)) {
3923 +               list_splice(&rh->clean_regions, &clean);
3924 +               INIT_LIST_HEAD(&rh->clean_regions);
3925 +
3926 +               list_for_each_entry (reg, &clean, list) {
3927 +                       rh->log->type->clear_region(rh->log, reg->key);
3928 +                       list_del(&reg->hash_list);
3929 +               }
3930 +       }
3931 +
3932 +       if (!list_empty(&rh->recovered_regions)) {
3933 +               list_splice(&rh->recovered_regions, &recovered);
3934 +               INIT_LIST_HEAD(&rh->recovered_regions);
3935 +
3936 +               list_for_each_entry (reg, &recovered, list)
3937 +                       list_del(&reg->hash_list);
3938 +       }
3939 +       spin_unlock(&rh->region_lock);
3940 +       write_unlock_irq(&rh->hash_lock);
3941 +
3942 +       /*
3943 +        * All the regions on the recovered and clean lists have
3944 +        * now been pulled out of the system, so no need to do
3945 +        * any more locking.
3946 +        */
3947 +       list_for_each_safe (tmp, tmp2, &recovered) {
3948 +               reg = list_entry(tmp, struct region, list);
3949 +
3950 +               rh->log->type->complete_resync_work(rh->log, reg->key, 1);
3951 +               dispatch_buffers(rh->ms, reg->delayed_bhs);
3952 +               up(&rh->recovery_count);
3953 +               mempool_free(reg, rh->region_pool);
3954 +       }
3955 +
3956 +       list_for_each_safe (tmp, tmp2, &clean) {
3957 +               reg = list_entry(tmp, struct region, list);
3958 +               mempool_free(reg, rh->region_pool);
3959 +       }
3960 +}
3961 +
3962 +static void rh_inc(struct region_hash *rh, region_t region)
3963 +{
3964 +       struct region *reg;
3965 +
3966 +       read_lock(&rh->hash_lock);
3967 +       reg = __rh_find(rh, region);
3968 +       if (reg->state == RH_CLEAN) {
3969 +               rh->log->type->mark_region(rh->log, reg->key);
3970 +
3971 +               spin_lock_irq(&rh->region_lock);
3972 +               reg->state = RH_DIRTY;
3973 +               list_del_init(&reg->list);      /* take off the clean list */
3974 +               spin_unlock_irq(&rh->region_lock);
3975 +       }
3976 +
3977 +       atomic_inc(&reg->pending);
3978 +       read_unlock(&rh->hash_lock);
3979 +}
3980 +
3981 +static void rh_inc_pending(struct region_hash *rh, struct buffer_list *buffers)
3982 +{
3983 +       struct buffer_head *bh;
3984 +
3985 +       for (bh = buffers->head; bh; bh = bh->b_reqnext)
3986 +               rh_inc(rh, bh_to_region(rh, bh));
3987 +}
3988 +
3989 +static void rh_dec(struct region_hash *rh, region_t region)
3990 +{
3991 +       unsigned long flags;
3992 +       struct region *reg;
3993 +       int wake = 0;
3994 +
3995 +       read_lock(&rh->hash_lock);
3996 +       reg = __rh_lookup(rh, region);
3997 +       read_unlock(&rh->hash_lock);
3998 +
3999 +       if (atomic_dec_and_test(&reg->pending)) {
4000 +               spin_lock_irqsave(&rh->region_lock, flags);
4001 +               if (reg->state == RH_RECOVERING) {
4002 +                       list_add_tail(&reg->list, &rh->quiesced_regions);
4003 +               } else {
4004 +                       reg->state = RH_CLEAN;
4005 +                       list_add(&reg->list, &rh->clean_regions);
4006 +               }
4007 +               spin_unlock_irqrestore(&rh->region_lock, flags);
4008 +               wake = 1;
4009 +       }
4010 +
4011 +       if (wake)
4012 +               dm_daemon_wake(&_kmirrord);
4013 +}
4014 +
4015 +/*
4016 + * Starts quiescing a region in preparation for recovery.
4017 + */
4018 +static int __rh_recovery_prepare(struct region_hash *rh)
4019 +{
4020 +       int r;
4021 +       struct region *reg;
4022 +       region_t region;
4023 +
4024 +       /*
4025 +        * Ask the dirty log what's next.
4026 +        */
4027 +       r = rh->log->type->get_resync_work(rh->log, &region);
4028 +       if (r <= 0)
4029 +               return r;
4030 +
4031 +       /*
4032 +        * Get this region, and start it quiescing by setting the
4033 +        * recovering flag.
4034 +        */
4035 +       read_lock(&rh->hash_lock);
4036 +       reg = __rh_find(rh, region);
4037 +       read_unlock(&rh->hash_lock);
4038 +
4039 +       spin_lock_irq(&rh->region_lock);
4040 +       reg->state = RH_RECOVERING;
4041 +
4042 +       /* Already quiesced ? */
4043 +       if (atomic_read(&reg->pending))
4044 +               list_del_init(&reg->list);
4045 +
4046 +       else {
4047 +               list_del_init(&reg->list);
4048 +               list_add(&reg->list, &rh->quiesced_regions);
4049 +       }
4050 +       spin_unlock_irq(&rh->region_lock);
4051 +
4052 +       return 1;
4053 +}
4054 +
4055 +static void rh_recovery_prepare(struct region_hash *rh)
4056 +{
4057 +       while (!down_trylock(&rh->recovery_count))
4058 +               if (__rh_recovery_prepare(rh) <= 0) {
4059 +                       up(&rh->recovery_count);
4060 +                       break;
4061 +               }
4062 +}
4063 +
4064 +/*
4065 + * Returns a quiesced region, or NULL if none are waiting.
4066 + */
4067 +static struct region *rh_recovery_start(struct region_hash *rh)
4068 +{
4069 +       struct region *reg = NULL;
4070 +
4071 +       spin_lock_irq(&rh->region_lock);
4072 +       if (!list_empty(&rh->quiesced_regions)) {
4073 +               reg = list_entry(rh->quiesced_regions.next,
4074 +                                struct region, list);
4075 +               list_del_init(&reg->list);      /* remove from the quiesced list */
4076 +       }
4077 +       spin_unlock_irq(&rh->region_lock);
4078 +
4079 +       return reg;
4080 +}
4081 +
4082 +/* FIXME: success ignored for now */
4083 +static void rh_recovery_end(struct region *reg, int success)
4084 +{
4085 +       struct region_hash *rh = reg->rh;
4086 +
4087 +       spin_lock_irq(&rh->region_lock);
4088 +       list_add(&reg->list, &reg->rh->recovered_regions);
4089 +       spin_unlock_irq(&rh->region_lock);
4090 +
4091 +       dm_daemon_wake(&_kmirrord);
4092 +}
4093 +
4094 +static void rh_flush(struct region_hash *rh)
4095 +{
4096 +       rh->log->type->flush(rh->log);
4097 +}
4098 +
4099 +static void rh_delay(struct region_hash *rh, struct buffer_head *bh)
4100 +{
4101 +       struct region *reg;
4102 +
4103 +       read_lock(&rh->hash_lock);
4104 +       reg = __rh_find(rh, bh_to_region(rh, bh));
4105 +       bh->b_reqnext = reg->delayed_bhs;
4106 +       reg->delayed_bhs = bh;
4107 +       read_unlock(&rh->hash_lock);
4108 +}
4109 +
4110 +static void rh_stop_recovery(struct region_hash *rh)
4111 +{
4112 +       int i;
4113 +
4114 +       /* wait for any recovering regions */
4115 +       for (i = 0; i < MAX_RECOVERY; i++)
4116 +               down(&rh->recovery_count);
4117 +}
4118 +
4119 +static void rh_start_recovery(struct region_hash *rh)
4120 +{
4121 +       int i;
4122 +
4123 +       for (i = 0; i < MAX_RECOVERY; i++)
4124 +               up(&rh->recovery_count);
4125 +
4126 +       dm_daemon_wake(&_kmirrord);
4127 +}
4128 +
4129 +/*-----------------------------------------------------------------
4130 + * Mirror set structures.
4131 + *---------------------------------------------------------------*/
4132 +struct mirror {
4133 +       atomic_t error_count;
4134 +       struct dm_dev *dev;
4135 +       sector_t offset;
4136 +};
4137 +
4138 +struct mirror_set {
4139 +       struct dm_target *ti;
4140 +       struct list_head list;
4141 +       struct region_hash rh;
4142 +       struct kcopyd_client *kcopyd_client;
4143 +
4144 +       spinlock_t lock;        /* protects the next two lists */
4145 +       struct buffer_list reads;
4146 +       struct buffer_list writes;
4147 +
4148 +       /* recovery */
4149 +       region_t nr_regions;
4150 +       region_t sync_count;
4151 +
4152 +       unsigned int nr_mirrors;
4153 +       struct mirror mirror[0];
4154 +};
4155 +
4156 +/*
4157 + * Every mirror should look like this one.
4158 + */
4159 +#define DEFAULT_MIRROR 0
4160 +
4161 +/*
4162 + * This is yucky.  We squirrel the mirror_set struct away inside
4163 + * b_reqnext for write buffers.  This is safe since the bh
4164 + * doesn't get submitted to the lower levels of the block layer.
4165 + */
4166 +static struct mirror_set *bh_get_ms(struct buffer_head *bh)
4167 +{
4168 +       return (struct mirror_set *) bh->b_reqnext;
4169 +}
4170 +
4171 +static void bh_set_ms(struct buffer_head *bh, struct mirror_set *ms)
4172 +{
4173 +       bh->b_reqnext = (struct buffer_head *) ms;
4174 +}
4175 +
4176 +/*-----------------------------------------------------------------
4177 + * Recovery.
4178 + *
4179 + * When a mirror is first activated we may find that some regions
4180 + * are in the no-sync state.  We have to recover these by
4181 + * recopying from the default mirror to all the others.
4182 + *---------------------------------------------------------------*/
4183 +static void recovery_complete(int read_err, unsigned int write_err,
4184 +                             void *context)
4185 +{
4186 +       struct region *reg = (struct region *) context;
4187 +       struct mirror_set *ms = reg->rh->ms;
4188 +
4189 +       /* FIXME: better error handling */
4190 +       rh_recovery_end(reg, read_err || write_err);
4191 +       if (++ms->sync_count == ms->nr_regions)
4192 +               /* the sync is complete */
4193 +               dm_table_event(ms->ti->table);
4194 +}
4195 +
4196 +static int recover(struct mirror_set *ms, struct region *reg)
4197 +{
4198 +       int r;
4199 +       unsigned int i;
4200 +       struct io_region from, to[ms->nr_mirrors - 1], *dest;
4201 +       struct mirror *m;
4202 +       unsigned int flags = 0;
4203 +
4204 +       /* fill in the source */
4205 +       m = ms->mirror + DEFAULT_MIRROR;
4206 +       from.dev = m->dev->dev;
4207 +       from.sector = m->offset + region_to_sector(reg->rh, reg->key);
4208 +       if (reg->key == (ms->nr_regions - 1)) {
4209 +               /*
4210 +                * The final region may be smaller than
4211 +                * region_size.
4212 +                */
4213 +               from.count = ms->ti->len & (reg->rh->region_size - 1);
4214 +               if (!from.count)
4215 +                       from.count = reg->rh->region_size;
4216 +       } else
4217 +               from.count = reg->rh->region_size;
4218 +
4219 +       /* fill in the destinations */
4220 +       for (i = 1; i < ms->nr_mirrors; i++) {
4221 +               m = ms->mirror + i;
4222 +               dest = to + (i - 1);
4223 +
4224 +               dest->dev = m->dev->dev;
4225 +               dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
4226 +               dest->count = from.count;
4227 +       }
4228 +
4229 +       /* hand to kcopyd */
4230 +       set_bit(KCOPYD_IGNORE_ERROR, &flags);
4231 +       r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags,
4232 +                       recovery_complete, reg);
4233 +
4234 +       return r;
4235 +}
4236 +
4237 +static void do_recovery(struct mirror_set *ms)
4238 +{
4239 +       int r;
4240 +       struct region *reg;
4241 +
4242 +       /*
4243 +        * Start quiescing some regions.
4244 +        */
4245 +       rh_recovery_prepare(&ms->rh);
4246 +
4247 +       /*
4248 +        * Copy any already quiesced regions.
4249 +        */
4250 +       while ((reg = rh_recovery_start(&ms->rh))) {
4251 +               r = recover(ms, reg);
4252 +               if (r)
4253 +                       rh_recovery_end(reg, 0);
4254 +       }
4255 +}
4256 +
4257 +/*-----------------------------------------------------------------
4258 + * Reads
4259 + *---------------------------------------------------------------*/
4260 +static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
4261 +{
4262 +       /* FIXME: add read balancing */
4263 +       return ms->mirror + DEFAULT_MIRROR;
4264 +}
4265 +
4266 +/*
4267 + * remap a buffer to a particular mirror.
4268 + */
4269 +static void map_buffer(struct mirror_set *ms,
4270 +                      struct mirror *m, struct buffer_head *bh)
4271 +{
4272 +       sector_t bsize = bh->b_size >> 9;
4273 +       sector_t rsector = bh->b_blocknr * bsize;
4274 +
4275 +       bh->b_rdev = m->dev->dev;
4276 +       bh->b_rsector = m->offset + (rsector - ms->ti->begin);
4277 +}
4278 +
4279 +static void do_reads(struct mirror_set *ms, struct buffer_list *reads)
4280 +{
4281 +       region_t region;
4282 +       struct buffer_head *bh;
4283 +       struct mirror *m;
4284 +
4285 +       while ((bh = buffer_list_pop(reads))) {
4286 +               region = bh_to_region(&ms->rh, bh);
4287 +
4288 +               /*
4289 +                * We can only read balance if the region is in sync.
4290 +                */
4291 +               if (rh_in_sync(&ms->rh, region, 0))
4292 +                       m = choose_mirror(ms, bh->b_rsector);
4293 +               else
4294 +                       m = ms->mirror + DEFAULT_MIRROR;
4295 +
4296 +               map_buffer(ms, m, bh);
4297 +               generic_make_request(READ, bh);
4298 +       }
4299 +}
4300 +
4301 +/*-----------------------------------------------------------------
4302 + * Writes.
4303 + *
4304 + * We do different things with the write io depending on the
4305 + * state of the region that it's in:
4306 + *
4307 + * SYNC:       increment pending, use kcopyd to write to *all* mirrors
4308 + * RECOVERING: delay the io until recovery completes
4309 + * NOSYNC:     increment pending, just write to the default mirror
4310 + *---------------------------------------------------------------*/
4311 +static void write_callback(unsigned int error, void *context)
4312 +{
4313 +       unsigned int i;
4314 +       int uptodate = 1;
4315 +       struct buffer_head *bh = (struct buffer_head *) context;
4316 +       struct mirror_set *ms;
4317 +
4318 +       ms = bh_get_ms(bh);
4319 +       bh_set_ms(bh, NULL);
4320 +
4321 +       /*
4322 +        * NOTE: We don't decrement the pending count here;
4323 +        * instead it is done by the target's end_io function.
4324 +        * This way we handle both writes to SYNC and NOSYNC
4325 +        * regions with the same code.
4326 +        */
4327 +
4328 +       if (error) {
4329 +               /*
4330 +                * only error the io if all mirrors failed.
4331 +                * FIXME: bogus
4332 +                */
4333 +               uptodate = 0;
4334 +               for (i = 0; i < ms->nr_mirrors; i++)
4335 +                       if (!test_bit(i, &error)) {
4336 +                               uptodate = 1;
4337 +                               break;
4338 +                       }
4339 +       }
4340 +       bh->b_end_io(bh, uptodate);
4341 +}
4342 +
4343 +static void do_write(struct mirror_set *ms, struct buffer_head *bh)
4344 +{
4345 +       unsigned int i;
4346 +       struct io_region io[ms->nr_mirrors];
4347 +       struct mirror *m;
4348 +
4349 +       for (i = 0; i < ms->nr_mirrors; i++) {
4350 +               m = ms->mirror + i;
4351 +
4352 +               io[i].dev = m->dev->dev;
4353 +               io[i].sector = m->offset + (bh->b_rsector - ms->ti->begin);
4354 +               io[i].count = bh->b_size >> 9;
4355 +       }
4356 +
4357 +       bh_set_ms(bh, ms);
4358 +       dm_io_async(ms->nr_mirrors, io, WRITE, bh->b_page,
4359 +                   (unsigned int) bh->b_data & ~PAGE_MASK, write_callback, bh);
4360 +}
4361 +
4362 +static void do_writes(struct mirror_set *ms, struct buffer_list *writes)
4363 +{
4364 +       int state;
4365 +       struct buffer_head *bh;
4366 +       struct buffer_list sync, nosync, recover, *this_list = NULL;
4367 +
4368 +       if (!writes->head)
4369 +               return;
4370 +
4371 +       /*
4372 +        * Classify each write.
4373 +        */
4374 +       buffer_list_init(&sync);
4375 +       buffer_list_init(&nosync);
4376 +       buffer_list_init(&recover);
4377 +
4378 +       while ((bh = buffer_list_pop(writes))) {
4379 +               state = rh_state(&ms->rh, bh_to_region(&ms->rh, bh), 1);
4380 +               switch (state) {
4381 +               case RH_CLEAN:
4382 +               case RH_DIRTY:
4383 +                       this_list = &sync;
4384 +                       break;
4385 +
4386 +               case RH_NOSYNC:
4387 +                       this_list = &nosync;
4388 +                       break;
4389 +
4390 +               case RH_RECOVERING:
4391 +                       this_list = &recover;
4392 +                       break;
4393 +               }
4394 +
4395 +               buffer_list_add(this_list, bh);
4396 +       }
4397 +
4398 +       /*
4399 +        * Increment the pending counts for any regions that will
4400 +        * be written to (writes to recovering regions are going to
4401 +        * be delayed).
4402 +        */
4403 +       rh_inc_pending(&ms->rh, &sync);
4404 +       rh_inc_pending(&ms->rh, &nosync);
4405 +       rh_flush(&ms->rh);
4406 +
4407 +       /*
4408 +        * Dispatch io.
4409 +        */
4410 +       while ((bh = buffer_list_pop(&sync)))
4411 +               do_write(ms, bh);
4412 +
4413 +       while ((bh = buffer_list_pop(&recover)))
4414 +               rh_delay(&ms->rh, bh);
4415 +
4416 +       while ((bh = buffer_list_pop(&nosync))) {
4417 +               map_buffer(ms, ms->mirror + DEFAULT_MIRROR, bh);
4418 +               generic_make_request(WRITE, bh);
4419 +       }
4420 +}
4421 +
4422 +/*-----------------------------------------------------------------
4423 + * kmirrord
4424 + *---------------------------------------------------------------*/
4425 +static LIST_HEAD(_mirror_sets);
4426 +static DECLARE_RWSEM(_mirror_sets_lock);
4427 +
4428 +static void do_mirror(struct mirror_set *ms)
4429 +{
4430 +       struct buffer_list reads, writes;
4431 +
4432 +       spin_lock(&ms->lock);
4433 +       memcpy(&reads, &ms->reads, sizeof(reads));
4434 +       buffer_list_init(&ms->reads);
4435 +       memcpy(&writes, &ms->writes, sizeof(writes));
4436 +       buffer_list_init(&ms->writes);
4437 +       spin_unlock(&ms->lock);
4438 +
4439 +       rh_update_states(&ms->rh);
4440 +       do_recovery(ms);
4441 +       do_reads(ms, &reads);
4442 +       do_writes(ms, &writes);
4443 +       run_task_queue(&tq_disk);
4444 +}
4445 +
4446 +static void do_work(void)
4447 +{
4448 +       struct mirror_set *ms;
4449 +
4450 +       down_read(&_mirror_sets_lock);
4451 +       list_for_each_entry (ms, &_mirror_sets, list)
4452 +               do_mirror(ms);
4453 +       up_read(&_mirror_sets_lock);
4454 +}
4455 +
4456 +/*-----------------------------------------------------------------
4457 + * Target functions
4458 + *---------------------------------------------------------------*/
4459 +static struct mirror_set *alloc_context(unsigned int nr_mirrors,
4460 +                                       sector_t region_size,
4461 +                                       struct dm_target *ti,
4462 +                                       struct dirty_log *dl)
4463 +{
4464 +       size_t len;
4465 +       struct mirror_set *ms = NULL;
4466 +
4467 +       if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
4468 +               return NULL;
4469 +
4470 +       len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
4471 +
4472 +       ms = kmalloc(len, GFP_KERNEL);
4473 +       if (!ms) {
4474 +               ti->error = "dm-mirror: Cannot allocate mirror context";
4475 +               return NULL;
4476 +       }
4477 +
4478 +       memset(ms, 0, len);
4479 +       spin_lock_init(&ms->lock);
4480 +
4481 +       ms->ti = ti;
4482 +       ms->nr_mirrors = nr_mirrors;
4483 +       ms->nr_regions = dm_div_up(ti->len, region_size);
4484 +
4485 +       if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
4486 +               ti->error = "dm-mirror: Error creating dirty region hash";
4487 +               kfree(ms);
4488 +               return NULL;
4489 +       }
4490 +
4491 +       return ms;
4492 +}
4493 +
4494 +static void free_context(struct mirror_set *ms, struct dm_target *ti,
4495 +                        unsigned int m)
4496 +{
4497 +       while (m--)
4498 +               dm_put_device(ti, ms->mirror[m].dev);
4499 +
4500 +       rh_exit(&ms->rh);
4501 +       kfree(ms);
4502 +}
4503 +
4504 +static inline int _check_region_size(struct dm_target *ti, sector_t size)
4505 +{
4506 +       return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
4507 +                size > ti->len);
4508 +}
4509 +
4510 +static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
4511 +                     unsigned int mirror, char **argv)
4512 +{
4513 +       sector_t offset;
4514 +
4515 +       if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) {
4516 +               ti->error = "dm-mirror: Invalid offset";
4517 +               return -EINVAL;
4518 +       }
4519 +
4520 +       if (dm_get_device(ti, argv[0], offset, ti->len,
4521 +                         dm_table_get_mode(ti->table),
4522 +                         &ms->mirror[mirror].dev)) {
4523 +               ti->error = "dm-mirror: Device lookup failure";
4524 +               return -ENXIO;
4525 +       }
4526 +
4527 +       ms->mirror[mirror].offset = offset;
4528 +
4529 +       return 0;
4530 +}
4531 +
4532 +static int add_mirror_set(struct mirror_set *ms)
4533 +{
4534 +       down_write(&_mirror_sets_lock);
4535 +       list_add_tail(&ms->list, &_mirror_sets);
4536 +       up_write(&_mirror_sets_lock);
4537 +       dm_daemon_wake(&_kmirrord);
4538 +
4539 +       return 0;
4540 +}
4541 +
4542 +static void del_mirror_set(struct mirror_set *ms)
4543 +{
4544 +       down_write(&_mirror_sets_lock);
4545 +       list_del(&ms->list);
4546 +       up_write(&_mirror_sets_lock);
4547 +}
4548 +
4549 +/*
4550 + * Create dirty log: log_type #log_params <log_params>
4551 + */
4552 +static struct dirty_log *create_dirty_log(struct dm_target *ti,
4553 +                                         unsigned int argc, char **argv,
4554 +                                         unsigned int *args_used)
4555 +{
4556 +       unsigned int param_count;
4557 +       struct dirty_log *dl;
4558 +
4559 +       if (argc < 2) {
4560 +               ti->error = "dm-mirror: Insufficient mirror log arguments";
4561 +               return NULL;
4562 +       }
4563 +
4564 +       if (sscanf(argv[1], "%u", &param_count) != 1 || param_count != 1) {
4565 +               ti->error = "dm-mirror: Invalid mirror log argument count";
4566 +               return NULL;
4567 +       }
4568 +
4569 +       *args_used = 2 + param_count;
4570 +
4571 +       if (argc < *args_used) {
4572 +               ti->error = "dm-mirror: Insufficient mirror log arguments";
4573 +               return NULL;
4574 +       }
4575 +
4576 +       dl = dm_create_dirty_log(argv[0], ti->len, param_count, argv + 2);
4577 +       if (!dl) {
4578 +               ti->error = "dm-mirror: Error creating mirror dirty log";
4579 +               return NULL;
4580 +       }
4581 +
4582 +       if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
4583 +               ti->error = "dm-mirror: Invalid region size";
4584 +               dm_destroy_dirty_log(dl);
4585 +               return NULL;
4586 +       }
4587 +
4588 +       return dl;
4589 +}
4590 +
4591 +/*
4592 + * Construct a mirror mapping:
4593 + *
4594 + * log_type #log_params <log_params>
4595 + * #mirrors [mirror_path offset]{2,}
4596 + *
4597 + * For now, #log_params = 1, log_type = "core"
4598 + *
4599 + */
4600 +#define DM_IO_PAGES 64
4601 +static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
4602 +{
4603 +       int r;
4604 +       unsigned int nr_mirrors, m, args_used;
4605 +       struct mirror_set *ms;
4606 +       struct dirty_log *dl;
4607 +
4608 +       dl = create_dirty_log(ti, argc, argv, &args_used);
4609 +       if (!dl)
4610 +               return -EINVAL;
4611 +
4612 +       argv += args_used;
4613 +       argc -= args_used;
4614 +
4615 +       if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
4616 +           nr_mirrors < 2) {
4617 +               ti->error = "dm-mirror: Invalid number of mirrors";
4618 +               dm_destroy_dirty_log(dl);
4619 +               return -EINVAL;
4620 +       }
4621 +
4622 +       argv++, argc--;
4623 +
4624 +       if (argc != nr_mirrors * 2) {
4625 +               ti->error = "dm-mirror: Wrong number of mirror arguments";
4626 +               dm_destroy_dirty_log(dl);
4627 +               return -EINVAL;
4628 +       }
4629 +
4630 +       ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
4631 +       if (!ms) {
4632 +               dm_destroy_dirty_log(dl);
4633 +               return -ENOMEM;
4634 +       }
4635 +
4636 +       /* Get the mirror parameter sets */
4637 +       for (m = 0; m < nr_mirrors; m++) {
4638 +               r = get_mirror(ms, ti, m, argv);
4639 +               if (r) {
4640 +                       free_context(ms, ti, m);
4641 +                       return r;
4642 +               }
4643 +               argv += 2;
4644 +               argc -= 2;
4645 +       }
4646 +
4647 +       ti->private = ms;
4648 +
4649 +       r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
4650 +       if (r) {
4651 +               free_context(ms, ti, ms->nr_mirrors);
4652 +               return r;
4653 +       }
4654 +
4655 +       add_mirror_set(ms);
4656 +       return 0;
4657 +}
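
Putting the constructor arguments documented above mirror_ctr() together: a full device-mapper table line carries the usual start/length/target-name prefix, then the log description, then the mirror pairs.  A hypothetical example (device names, the 2 GiB length and the 1024-sector region size are invented, and the core log's single parameter is assumed to be its region size in sectors, consistent with what mirror_status() prints):

	0 4194304 mirror core 1 1024 2 /dev/sdb1 0 /dev/sdc1 0

Loaded with a tool such as LVM2's dmsetup, this line has create_dirty_log() hand "1024" to the core log, leaves the required 2 * 2 = 4 arguments for the mirror/offset pairs, and finishes with add_mirror_set() waking kmirrord so any needed recovery can begin.
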
4658 +
4659 +static void mirror_dtr(struct dm_target *ti)
4660 +{
4661 +       struct mirror_set *ms = (struct mirror_set *) ti->private;
4662 +
4663 +       del_mirror_set(ms);
4664 +       kcopyd_client_destroy(ms->kcopyd_client);
4665 +       free_context(ms, ti, ms->nr_mirrors);
4666 +}
4667 +
4668 +static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw)
4669 +{
4670 +       int wake = 0;
4671 +       struct buffer_list *bl;
4672 +
4673 +       bl = (rw == WRITE) ? &ms->writes : &ms->reads;
4674 +       spin_lock(&ms->lock);
4675 +       wake = !(bl->head);
4676 +       buffer_list_add(bl, bh);
4677 +       spin_unlock(&ms->lock);
4678 +
4679 +       if (wake)
4680 +               dm_daemon_wake(&_kmirrord);
4681 +}
4682 +
4683 +/*
4684 + * Mirror mapping function
4685 + */
4686 +static int mirror_map(struct dm_target *ti, struct buffer_head *bh,
4687 +                     int rw, union map_info *map_context)
4688 +{
4689 +       int r;
4690 +       struct mirror *m;
4691 +       struct mirror_set *ms = ti->private;
4692 +
4693 +       /* FIXME: nasty hack, 32 bit sector_t only */
4694 +       map_context->ll = bh->b_rsector / ms->rh.region_size;
4695 +
4696 +       if (rw == WRITE) {
4697 +               queue_bh(ms, bh, rw);
4698 +               return 0;
4699 +       }
4700 +
4701 +       r = ms->rh.log->type->in_sync(ms->rh.log, bh_to_region(&ms->rh, bh), 0);
4702 +       if (r < 0 && r != -EWOULDBLOCK)
4703 +               return r;
4704 +
4705 +       if (r == -EWOULDBLOCK)  /* FIXME: ugly */
4706 +               r = 0;
4707 +
4708 +       /*
4709 +        * We don't want to fast track a recovery just for a read
4710 +        * ahead.  So we just let it silently fail.
4711 +        * FIXME: get rid of this.
4712 +        */
4713 +       if (!r && rw == READA)
4714 +               return -EIO;
4715 +
4716 +       if (!r) {
4717 +               /* Pass this io over to the daemon */
4718 +               queue_bh(ms, bh, rw);
4719 +               return 0;
4720 +       }
4721 +
4722 +       m = choose_mirror(ms, bh->b_rsector);
4723 +       if (!m)
4724 +               return -EIO;
4725 +
4726 +       map_buffer(ms, m, bh);
4727 +       return 1;
4728 +}
4729 +
4730 +static int mirror_end_io(struct dm_target *ti, struct buffer_head *bh,
4731 +                        int rw, int error, union map_info *map_context)
4732 +{
4733 +       struct mirror_set *ms = (struct mirror_set *) ti->private;
4734 +       region_t region = map_context->ll;
4735 +
4736 +       /*
4737 +        * We need to dec pending if this was a write.
4738 +        */
4739 +       if (rw == WRITE)
4740 +               rh_dec(&ms->rh, region);
4741 +
4742 +       return 0;
4743 +}
4744 +
4745 +static void mirror_suspend(struct dm_target *ti)
4746 +{
4747 +       struct mirror_set *ms = (struct mirror_set *) ti->private;
4748 +       rh_stop_recovery(&ms->rh);
4749 +}
4750 +
4751 +static void mirror_resume(struct dm_target *ti)
4752 +{
4753 +       struct mirror_set *ms = (struct mirror_set *) ti->private;
4754 +       rh_start_recovery(&ms->rh);
4755 +}
4756 +
4757 +static int mirror_status(struct dm_target *ti, status_type_t type,
4758 +                        char *result, unsigned int maxlen)
4759 +{
4760 +       unsigned int m, sz = 0;
4761 +       struct mirror_set *ms = (struct mirror_set *) ti->private;
4762 +
4763 +       switch (type) {
4764 +       case STATUSTYPE_INFO:
4765 +               sz += snprintf(result + sz, maxlen - sz, "%d ", ms->nr_mirrors);
4766 +
4767 +               for (m = 0; m < ms->nr_mirrors; m++)
4768 +                       sz += snprintf(result + sz, maxlen - sz, "%s ",
4769 +                                      dm_kdevname(ms->mirror[m].dev->dev));
4770 +
4771 +               sz += snprintf(result + sz, maxlen - sz, "%lu/%lu",
4772 +                              ms->sync_count, ms->nr_regions);
4773 +               break;
4774 +
4775 +       case STATUSTYPE_TABLE:
4776 +               sz += snprintf(result + sz, maxlen - sz,
4777 +                              "%s 1 " SECTOR_FORMAT " %d ",
4778 +                              ms->rh.log->type->name, ms->rh.region_size,
4779 +                              ms->nr_mirrors);
4780 +
4781 +               for (m = 0; m < ms->nr_mirrors; m++)
4782 +                       sz += snprintf(result + sz, maxlen - sz, "%s %ld ",
4783 +                                      dm_kdevname(ms->mirror[m].dev->dev),
4784 +                                      ms->mirror[m].offset);
4785 +       }
4786 +
4787 +       return 0;
4788 +}
4789 +
4790 +static struct target_type mirror_target = {
4791 +       .name    = "mirror",
4792 +       .module  = THIS_MODULE,
4793 +       .ctr     = mirror_ctr,
4794 +       .dtr     = mirror_dtr,
4795 +       .map     = mirror_map,
4796 +       .end_io  = mirror_end_io,
4797 +       .suspend = mirror_suspend,
4798 +       .resume  = mirror_resume,
4799 +       .status  = mirror_status,
4800 +};
4801 +
4802 +static int __init dm_mirror_init(void)
4803 +{
4804 +       int r;
4805 +
4806 +       r = dm_dirty_log_init();
4807 +       if (r)
4808 +               return r;
4809 +
4810 +       r = dm_daemon_start(&_kmirrord, "kmirrord", do_work);
4811 +       if (r) {
4812 +               DMERR("couldn't start kmirrord");
4813 +               dm_dirty_log_exit();
4814 +               return r;
4815 +       }
4816 +
4817 +       r = dm_register_target(&mirror_target);
4818 +       if (r < 0) {
4819 +               DMERR("%s: Failed to register mirror target",
4820 +                     mirror_target.name);
4821 +               dm_dirty_log_exit();
4822 +               dm_daemon_stop(&_kmirrord);
4823 +       }
4824 +
4825 +       return r;
4826 +}
4827 +
4828 +static void __exit dm_mirror_exit(void)
4829 +{
4830 +       int r;
4831 +
4832 +       r = dm_unregister_target(&mirror_target);
4833 +       if (r < 0)
4834 +               DMERR("%s: unregister failed %d", mirror_target.name, r);
4835 +
4836 +       dm_daemon_stop(&_kmirrord);
4837 +       dm_dirty_log_exit();
4838 +}
4839 +
4840 +/* Module hooks */
4841 +module_init(dm_mirror_init);
4842 +module_exit(dm_mirror_exit);
4843 +
4844 +MODULE_DESCRIPTION(DM_NAME " mirror target");
4845 +MODULE_AUTHOR("Heinz Mauelshagen <mge@sistina.com>");
4846 +MODULE_LICENSE("GPL");
4847 --- diff/drivers/md/dm-snapshot.c       1970-01-01 01:00:00.000000000 +0100
4848 +++ source/drivers/md/dm-snapshot.c     2003-10-16 10:44:23.000000000 +0100
4849 @@ -0,0 +1,1235 @@
4850 +/*
4851 + * dm-snapshot.c
4852 + *
4853 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
4854 + *
4855 + * This file is released under the GPL.
4856 + */
4857 +
4858 +#include <linux/config.h>
4859 +#include <linux/ctype.h>
4860 +#include <linux/module.h>
4861 +#include <linux/init.h>
4862 +#include <linux/slab.h>
4863 +#include <linux/list.h>
4864 +#include <linux/fs.h>
4865 +#include <linux/blkdev.h>
4866 +#include <linux/mempool.h>
4867 +#include <linux/device-mapper.h>
4868 +#include <linux/vmalloc.h>
4869 +
4870 +#include "dm-snapshot.h"
4871 +#include "kcopyd.h"
4872 +
4873 +/*
4874 + * FIXME: Remove this before release.
4875 + */
4876 +#if 0
4877 +#define DMDEBUG(x...) DMWARN( ## x)
4878 +#else
4879 +#define DMDEBUG(x...)
4880 +#endif
4881 +
4882 +/*
4883 + * The percentage increment we will wake up users at
4884 + */
4885 +#define WAKE_UP_PERCENT 5
4886 +
4887 +/*
4888 + * kcopyd priority of snapshot operations
4889 + */
4890 +#define SNAPSHOT_COPY_PRIORITY 2
4891 +
4892 +/*
4893 + * Each snapshot reserves this many pages for io
4894 + * FIXME: calculate this
4895 + */
4896 +#define SNAPSHOT_PAGES 256
4897 +
4898 +struct pending_exception {
4899 +       struct exception e;
4900 +
4901 +       /*
4902 +        * Origin buffers waiting for this to complete are held
4903 +        * in a list (using b_reqnext).
4904 +        */
4905 +       struct buffer_head *origin_bhs;
4906 +       struct buffer_head *snapshot_bhs;
4907 +
4908 +       /*
4909 +        * Other pending_exceptions that are processing this
4910 +        * chunk.  When this list is empty, we know we can
4911 +        * complete the origins.
4912 +        */
4913 +       struct list_head siblings;
4914 +
4915 +       /* Pointer back to snapshot context */
4916 +       struct dm_snapshot *snap;
4917 +
4918 +       /*
4919 +        * 1 indicates the exception has already been sent to
4920 +        * kcopyd.
4921 +        */
4922 +       int started;
4923 +};
4924 +
4925 +/*
4926 + * Hash table mapping origin volumes to lists of snapshots and
4927 + * a lock to protect it
4928 + */
4929 +static kmem_cache_t *exception_cache;
4930 +static kmem_cache_t *pending_cache;
4931 +static mempool_t *pending_pool;
4932 +
4933 +/*
4934 + * One of these per registered origin, held in the _origins hash
4935 + */
4936 +struct origin {
4937 +       /* The origin device */
4938 +       kdev_t dev;
4939 +
4940 +       struct list_head hash_list;
4941 +
4942 +       /* List of snapshots for this origin */
4943 +       struct list_head snapshots;
4944 +};
4945 +
4946 +/*
4947 + * Size of the hash table for origin volumes. If we make this
4948 + * the size of the minors list then it should be nearly perfect
4949 + */
4950 +#define ORIGIN_HASH_SIZE 256
4951 +#define ORIGIN_MASK      0xFF
4952 +static struct list_head *_origins;
4953 +static struct rw_semaphore _origins_lock;
4954 +
4955 +static int init_origin_hash(void)
4956 +{
4957 +       int i;
4958 +
4959 +       _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
4960 +                          GFP_KERNEL);
4961 +       if (!_origins) {
4962 +               DMERR("Device mapper: Snapshot: unable to allocate memory");
4963 +               return -ENOMEM;
4964 +       }
4965 +
4966 +       for (i = 0; i < ORIGIN_HASH_SIZE; i++)
4967 +               INIT_LIST_HEAD(_origins + i);
4968 +       init_rwsem(&_origins_lock);
4969 +
4970 +       return 0;
4971 +}
4972 +
4973 +static void exit_origin_hash(void)
4974 +{
4975 +       kfree(_origins);
4976 +}
4977 +
4978 +static inline unsigned int origin_hash(kdev_t dev)
4979 +{
4980 +       return MINOR(dev) & ORIGIN_MASK;
4981 +}
4982 +
4983 +static struct origin *__lookup_origin(kdev_t origin)
4984 +{
4985 +       struct list_head *slist;
4986 +       struct list_head *ol;
4987 +       struct origin *o;
4988 +
4989 +       ol = &_origins[origin_hash(origin)];
4990 +       list_for_each(slist, ol) {
4991 +               o = list_entry(slist, struct origin, hash_list);
4992 +
4993 +               if (o->dev == origin)
4994 +                       return o;
4995 +       }
4996 +
4997 +       return NULL;
4998 +}
4999 +
5000 +static void __insert_origin(struct origin *o)
5001 +{
5002 +       struct list_head *sl = &_origins[origin_hash(o->dev)];
5003 +       list_add_tail(&o->hash_list, sl);
5004 +}
5005 +
5006 +/*
5007 + * Make a note of the snapshot and its origin so we can look it
5008 + * up when the origin has a write on it.
5009 + */
5010 +static int register_snapshot(struct dm_snapshot *snap)
5011 +{
5012 +       struct origin *o;
5013 +       kdev_t dev = snap->origin->dev;
5014 +
5015 +       down_write(&_origins_lock);
5016 +       o = __lookup_origin(dev);
5017 +
5018 +       if (!o) {
5019 +               /* New origin */
5020 +               o = kmalloc(sizeof(*o), GFP_KERNEL);
5021 +               if (!o) {
5022 +                       up_write(&_origins_lock);
5023 +                       return -ENOMEM;
5024 +               }
5025 +
5026 +               /* Initialise the struct */
5027 +               INIT_LIST_HEAD(&o->snapshots);
5028 +               o->dev = dev;
5029 +
5030 +               __insert_origin(o);
5031 +       }
5032 +
5033 +       list_add_tail(&snap->list, &o->snapshots);
5034 +
5035 +       up_write(&_origins_lock);
5036 +       return 0;
5037 +}
5038 +
5039 +static void unregister_snapshot(struct dm_snapshot *s)
5040 +{
5041 +       struct origin *o;
5042 +
5043 +       down_write(&_origins_lock);
5044 +       o = __lookup_origin(s->origin->dev);
5045 +
5046 +       list_del(&s->list);
5047 +       if (list_empty(&o->snapshots)) {
5048 +               list_del(&o->hash_list);
5049 +               kfree(o);
5050 +       }
5051 +
5052 +       up_write(&_origins_lock);
5053 +}
5054 +
5055 +/*
5056 + * Implementation of the exception hash tables.
5057 + */
5058 +static int init_exception_table(struct exception_table *et, uint32_t size)
5059 +{
5060 +       unsigned int i;
5061 +
5062 +       et->hash_mask = size - 1;
5063 +       et->table = vcalloc(size, sizeof(struct list_head));
5064 +       if (!et->table)
5065 +               return -ENOMEM;
5066 +
5067 +       for (i = 0; i < size; i++)
5068 +               INIT_LIST_HEAD(et->table + i);
5069 +
5070 +       return 0;
5071 +}
5072 +
5073 +static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
5074 +{
5075 +       struct list_head *slot, *entry, *temp;
5076 +       struct exception *ex;
5077 +       int i, size;
5078 +
5079 +       size = et->hash_mask + 1;
5080 +       for (i = 0; i < size; i++) {
5081 +               slot = et->table + i;
5082 +
5083 +               list_for_each_safe(entry, temp, slot) {
5084 +                       ex = list_entry(entry, struct exception, hash_list);
5085 +                       kmem_cache_free(mem, ex);
5086 +               }
5087 +       }
5088 +
5089 +       vfree(et->table);
5090 +}
5091 +
5092 +/*
5093 + * FIXME: check how this hash fn is performing.
5094 + */
5095 +static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
5096 +{
5097 +       return chunk & et->hash_mask;
5098 +}
5099 +
5100 +static void insert_exception(struct exception_table *eh, struct exception *e)
5101 +{
5102 +       struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
5103 +       list_add(&e->hash_list, l);
5104 +}
5105 +
5106 +static inline void remove_exception(struct exception *e)
5107 +{
5108 +       list_del(&e->hash_list);
5109 +}
5110 +
5111 +/*
5112 + * Return the exception data for a chunk, or NULL if not
5113 + * remapped.
5114 + */
5115 +static struct exception *lookup_exception(struct exception_table *et,
5116 +                                         chunk_t chunk)
5117 +{
5118 +       struct list_head *slot, *el;
5119 +       struct exception *e;
5120 +
5121 +       slot = &et->table[exception_hash(et, chunk)];
5122 +       list_for_each(el, slot) {
5123 +               e = list_entry(el, struct exception, hash_list);
5124 +               if (e->old_chunk == chunk)
5125 +                       return e;
5126 +       }
5127 +
5128 +       return NULL;
5129 +}
5130 +
5131 +static inline struct exception *alloc_exception(void)
5132 +{
5133 +       struct exception *e;
5134 +
5135 +       e = kmem_cache_alloc(exception_cache, GFP_NOIO);
5136 +       if (!e)
5137 +               e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
5138 +
5139 +       return e;
5140 +}
5141 +
5142 +static inline void free_exception(struct exception *e)
5143 +{
5144 +       kmem_cache_free(exception_cache, e);
5145 +}
5146 +
5147 +static inline struct pending_exception *alloc_pending_exception(void)
5148 +{
5149 +       return mempool_alloc(pending_pool, GFP_NOIO);
5150 +}
5151 +
5152 +static inline void free_pending_exception(struct pending_exception *pe)
5153 +{
5154 +       mempool_free(pe, pending_pool);
5155 +}
5156 +
5157 +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
5158 +{
5159 +       struct exception *e;
5160 +
5161 +       e = alloc_exception();
5162 +       if (!e)
5163 +               return -ENOMEM;
5164 +
5165 +       e->old_chunk = old;
5166 +       e->new_chunk = new;
5167 +       insert_exception(&s->complete, e);
5168 +       return 0;
5169 +}
5170 +
5171 +/*
5172 + * Hard coded magic.
5173 + */
5174 +static int calc_max_buckets(void)
5175 +{
5176 +       unsigned long mem;
5177 +
5178 +       mem = num_physpages << PAGE_SHIFT;
5179 +       mem /= 50;
5180 +       mem /= sizeof(struct list_head);
5181 +
5182 +       return mem;
5183 +}
5184 +
5185 +/*
5186 + * Rounds a number down to a power of 2.
5187 + */
5188 +static inline uint32_t round_down(uint32_t n)
5189 +{
5190 +       while (n & (n - 1))
5191 +               n &= (n - 1);
5192 +       return n;
5193 +}
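A minimal userspace sketch of the bit trick used above (illustrative only, not part of the patch): each n &= (n - 1) clears the lowest set bit, so the loop ends with only the highest set bit, i.e. the largest power of 2 not exceeding n.

#include <assert.h>
#include <stdint.h>

/* Same trick as round_down() above: strip low-order set bits until
 * only the highest one (the largest power of 2 <= n) remains. */
static uint32_t round_down_pow2(uint32_t n)
{
	while (n & (n - 1))
		n &= (n - 1);
	return n;
}

int main(void)
{
	assert(round_down_pow2(80) == 64);
	assert(round_down_pow2(4096) == 4096);	/* already a power of 2 */
	return 0;
}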
5194 +
5195 +/*
5196 + * Allocate room for a suitable hash table.
5197 + */
5198 +static int init_hash_tables(struct dm_snapshot *s)
5199 +{
5200 +       sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
5201 +
5202 +       /*
5203 +        * Calculate based on the size of the original volume or
5204 +        * the COW volume...
5205 +        */
5206 +       cow_dev_size = get_dev_size(s->cow->dev);
5207 +       origin_dev_size = get_dev_size(s->origin->dev);
5208 +       max_buckets = calc_max_buckets();
5209 +
5210 +       hash_size = min(origin_dev_size, cow_dev_size) / s->chunk_size;
5211 +       hash_size = min(hash_size, max_buckets);
5212 +
5213 +       /* Round it down to a power of 2 */
5214 +       hash_size = round_down(hash_size);
5215 +       if (init_exception_table(&s->complete, hash_size))
5216 +               return -ENOMEM;
5217 +
5218 +       /*
5219 +        * Allocate a hash table for in-flight exceptions.
5220 +        * Make this smaller than the real hash table.
5221 +        */
5222 +       hash_size >>= 3;
5223 +       if (!hash_size)
5224 +               hash_size = 64;
5225 +
5226 +       if (init_exception_table(&s->pending, hash_size)) {
5227 +               exit_exception_table(&s->complete, exception_cache);
5228 +               return -ENOMEM;
5229 +       }
5230 +
5231 +       return 0;
5232 +}
5233 +
5234 +/*
5235 + * Round a number up to the nearest 'size' boundary.  size must
5236 + * be a power of 2.
5237 + */
5238 +static inline ulong round_up(ulong n, ulong size)
5239 +{
5240 +       size--;
5241 +       return (n + size) & ~size;
5242 +}
5243 +
5244 +/*
5245 + * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
5246 + */
5247 +static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
5248 +{
5249 +       struct dm_snapshot *s;
5250 +       unsigned long chunk_size;
5251 +       int r = -EINVAL;
5252 +       char persistent;
5253 +       char *origin_path;
5254 +       char *cow_path;
5255 +       char *value;
5256 +       int blocksize;
5257 +
5258 +       if (argc < 4) {
5259 +               ti->error = "dm-snapshot: requires exactly 4 arguments";
5260 +               r = -EINVAL;
5261 +               goto bad1;
5262 +       }
5263 +
5264 +       origin_path = argv[0];
5265 +       cow_path = argv[1];
5266 +       persistent = toupper(*argv[2]);
5267 +
5268 +       if (persistent != 'P' && persistent != 'N') {
5269 +               ti->error = "Persistent flag is not P or N";
5270 +               r = -EINVAL;
5271 +               goto bad1;
5272 +       }
5273 +
5274 +       chunk_size = simple_strtoul(argv[3], &value, 10);
5275 +       if (chunk_size == 0 || value == NULL) {
5276 +               ti->error = "Invalid chunk size";
5277 +               r = -EINVAL;
5278 +               goto bad1;
5279 +       }
5280 +
5281 +       s = kmalloc(sizeof(*s), GFP_KERNEL);
5282 +       if (s == NULL) {
5283 +               ti->error = "Cannot allocate snapshot context private "
5284 +                   "structure";
5285 +               r = -ENOMEM;
5286 +               goto bad1;
5287 +       }
5288 +
5289 +       r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
5290 +       if (r) {
5291 +               ti->error = "Cannot get origin device";
5292 +               goto bad2;
5293 +       }
5294 +
5295 +       /* FIXME: get cow length */
5296 +       r = dm_get_device(ti, cow_path, 0, 0,
5297 +                         FMODE_READ | FMODE_WRITE, &s->cow);
5298 +       if (r) {
5299 +               dm_put_device(ti, s->origin);
5300 +               ti->error = "Cannot get COW device";
5301 +               goto bad2;
5302 +       }
5303 +
5304 +       /*
5305 +        * Chunk size must be a multiple of the page size.  Silently
5306 +        * round up if it's not.
5307 +        */
5308 +       chunk_size = round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE);
5309 +
5310 +       /* Validate the chunk size against the device block size */
5311 +       blocksize = get_hardsect_size(s->cow->dev);
5312 +       if (chunk_size % (blocksize / SECTOR_SIZE)) {
5313 +               ti->error = "Chunk size is not a multiple of device blocksize";
5314 +               r = -EINVAL;
5315 +               goto bad3;
5316 +       }
5317 +
5318 +       /* Check the sizes are small enough to fit in one kiovec */
5319 +       if (chunk_size > KIO_MAX_SECTORS) {
5320 +               ti->error = "Chunk size is too big";
5321 +               r = -EINVAL;
5322 +               goto bad3;
5323 +       }
5324 +
5325 +       /* Check chunk_size is a power of 2 */
5326 +       if (chunk_size & (chunk_size - 1)) {
5327 +               ti->error = "Chunk size is not a power of 2";
5328 +               r = -EINVAL;
5329 +               goto bad3;
5330 +       }
5331 +
5332 +       s->chunk_size = chunk_size;
5333 +       s->chunk_mask = chunk_size - 1;
5334 +       s->type = persistent;
5335 +       for (s->chunk_shift = 0; chunk_size;
5336 +            s->chunk_shift++, chunk_size >>= 1)
5337 +               ;
5338 +       s->chunk_shift--;
5339 +
5340 +       s->valid = 1;
5341 +       s->have_metadata = 0;
5342 +       s->last_percent = 0;
5343 +       init_rwsem(&s->lock);
5344 +       s->table = ti->table;
5345 +
5346 +       /* Allocate hash table for COW data */
5347 +       if (init_hash_tables(s)) {
5348 +               ti->error = "Unable to allocate hash table space";
5349 +               r = -ENOMEM;
5350 +               goto bad3;
5351 +       }
5352 +
5353 +       /*
5354 +        * Check the persistent flag - done here because we need the iobuf
5355 +        * to check the LV header
5356 +        */
5357 +       s->store.snap = s;
5358 +
5359 +       if (persistent == 'P')
5360 +               r = dm_create_persistent(&s->store, s->chunk_size);
5361 +       else
5362 +               r = dm_create_transient(&s->store, s, blocksize);
5363 +
5364 +       if (r) {
5365 +               ti->error = "Couldn't create exception store";
5366 +               r = -EINVAL;
5367 +               goto bad4;
5368 +       }
5369 +
5370 +       r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
5371 +       if (r) {
5372 +               ti->error = "Could not create kcopyd client";
5373 +               goto bad5;
5374 +       }
5375 +
5376 +       /* Flush IO to the origin device */
5377 +       fsync_dev(s->origin->dev);
5378 +
5379 +       /* Add snapshot to the list of snapshots for this origin */
5380 +       if (register_snapshot(s)) {
5381 +               r = -EINVAL;
5382 +               ti->error = "Cannot register snapshot origin";
5383 +               goto bad6;
5384 +       }
5385 +
5386 +       ti->private = s;
5387 +       return 0;
5388 +
5389 + bad6:
5390 +       kcopyd_client_destroy(s->kcopyd_client);
5391 +
5392 + bad5:
5393 +       s->store.destroy(&s->store);
5394 +
5395 + bad4:
5396 +       exit_exception_table(&s->pending, pending_cache);
5397 +       exit_exception_table(&s->complete, exception_cache);
5398 +
5399 + bad3:
5400 +       dm_put_device(ti, s->cow);
5401 +       dm_put_device(ti, s->origin);
5402 +
5403 + bad2:
5404 +       kfree(s);
5405 +
5406 + bad1:
5407 +       return r;
5408 +}
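For illustration (assuming a chunk_size of 16 sectors, i.e. 8 KiB): the shift loop in snapshot_ctr() above is just an integer log2, ending with chunk_shift = 4 and chunk_mask = 15. A standalone sketch:

#include <assert.h>

int main(void)
{
	unsigned long chunk_size = 16;		/* sectors; must be a power of 2 */
	unsigned long chunk_mask = chunk_size - 1;
	int chunk_shift;

	/* Same loop as in snapshot_ctr(): halve until zero, then back off one. */
	for (chunk_shift = 0; chunk_size; chunk_shift++, chunk_size >>= 1)
		;
	chunk_shift--;

	assert(chunk_shift == 4 && chunk_mask == 15);
	return 0;
}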
5409 +
5410 +static void snapshot_dtr(struct dm_target *ti)
5411 +{
5412 +       struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5413 +
5414 +       dm_table_event(ti->table);
5415 +
5416 +       unregister_snapshot(s);
5417 +
5418 +       exit_exception_table(&s->pending, pending_cache);
5419 +       exit_exception_table(&s->complete, exception_cache);
5420 +
5421 +       /* Deallocate memory used */
5422 +       s->store.destroy(&s->store);
5423 +
5424 +       dm_put_device(ti, s->origin);
5425 +       dm_put_device(ti, s->cow);
5426 +       kcopyd_client_destroy(s->kcopyd_client);
5427 +       kfree(s);
5428 +}
5429 +
5430 +/*
5431 + * We hold lists of buffer_heads, using the b_reqnext field.
5432 + */
5433 +static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh)
5434 +{
5435 +       bh->b_reqnext = *queue;
5436 +       *queue = bh;
5437 +}
5438 +
5439 +/*
5440 + * FIXME: inefficient.
5441 + */
5442 +static void queue_buffers(struct buffer_head **queue, struct buffer_head *bhs)
5443 +{
5444 +       while (*queue)
5445 +               queue = &((*queue)->b_reqnext);
5446 +
5447 +       *queue = bhs;
5448 +}
5449 +
5450 +/*
5451 + * Flush a list of buffers.
5452 + */
5453 +static void flush_buffers(struct buffer_head *bh)
5454 +{
5455 +       struct buffer_head *n;
5456 +
5457 +       DMDEBUG("begin flush");
5458 +       while (bh) {
5459 +               n = bh->b_reqnext;
5460 +               bh->b_reqnext = NULL;
5461 +               DMDEBUG("flushing %p", bh);
5462 +               generic_make_request(WRITE, bh);
5463 +               bh = n;
5464 +       }
5465 +
5466 +       run_task_queue(&tq_disk);
5467 +}
5468 +
5469 +/*
5470 + * Error a list of buffers.
5471 + */
5472 +static void error_buffers(struct buffer_head *bh)
5473 +{
5474 +       struct buffer_head *n;
5475 +
5476 +       while (bh) {
5477 +               n = bh->b_reqnext;
5478 +               bh->b_reqnext = NULL;
5479 +               buffer_IO_error(bh);
5480 +               bh = n;
5481 +       }
5482 +}
5483 +
5484 +static struct buffer_head *__flush_bhs(struct pending_exception *pe)
5485 +{
5486 +       struct pending_exception *sibling;
5487 +
5488 +       if (list_empty(&pe->siblings))
5489 +               return pe->origin_bhs;
5490 +
5491 +       sibling = list_entry(pe->siblings.next,
5492 +                            struct pending_exception, siblings);
5493 +
5494 +       list_del(&pe->siblings);
5495 +
5496 +       /* FIXME: I think there's a race on SMP machines here, add spin lock */
5497 +       queue_buffers(&sibling->origin_bhs, pe->origin_bhs);
5498 +
5499 +       return NULL;
5500 +}
5501 +
5502 +static void pending_complete(struct pending_exception *pe, int success)
5503 +{
5504 +       struct exception *e;
5505 +       struct dm_snapshot *s = pe->snap;
5506 +       struct buffer_head *flush = NULL;
5507 +
5508 +       if (success) {
5509 +               e = alloc_exception();
5510 +               if (!e) {
5511 +                       DMWARN("Unable to allocate exception.");
5512 +                       down_write(&s->lock);
5513 +                       s->store.drop_snapshot(&s->store);
5514 +                       s->valid = 0;
5515 +                       flush = __flush_bhs(pe);
5516 +                       up_write(&s->lock);
5517 +
5518 +                       error_buffers(pe->snapshot_bhs);
5519 +                       goto out;
5520 +               }
5521 +
5522 +               /*
5523 +                * Add a proper exception, and remove the
5524 +                * in-flight exception from the list.
5525 +                */
5526 +               down_write(&s->lock);
5527 +
5528 +               memcpy(e, &pe->e, sizeof(*e));
5529 +               insert_exception(&s->complete, e);
5530 +               remove_exception(&pe->e);
5531 +               flush = __flush_bhs(pe);
5532 +
5533 +               /* Submit any pending write BHs */
5534 +               up_write(&s->lock);
5535 +
5536 +               flush_buffers(pe->snapshot_bhs);
5537 +               DMDEBUG("Exception completed successfully.");
5538 +
5539 +               /* Notify any interested parties */
5540 +               if (s->store.fraction_full) {
5541 +                       sector_t numerator, denominator;
5542 +                       int pc;
5543 +
5544 +                       s->store.fraction_full(&s->store, &numerator,
5545 +                                              &denominator);
5546 +                       pc = numerator * 100 / denominator;
5547 +
5548 +                       if (pc >= s->last_percent + WAKE_UP_PERCENT) {
5549 +                               dm_table_event(s->table);
5550 +                               s->last_percent = pc - pc % WAKE_UP_PERCENT;
5551 +                       }
5552 +               }
5553 +
5554 +       } else {
5555 +               /* Read/write error - snapshot is unusable */
5556 +               down_write(&s->lock);
5557 +               if (s->valid)
5558 +                       DMERR("Error reading/writing snapshot");
5559 +               s->store.drop_snapshot(&s->store);
5560 +               s->valid = 0;
5561 +               remove_exception(&pe->e);
5562 +               flush = __flush_bhs(pe);
5563 +               up_write(&s->lock);
5564 +
5565 +               error_buffers(pe->snapshot_bhs);
5566 +
5567 +               dm_table_event(s->table);
5568 +               DMDEBUG("Exception failed.");
5569 +       }
5570 +
5571 + out:
5572 +       if (flush)
5573 +               flush_buffers(flush);
5574 +
5575 +       free_pending_exception(pe);
5576 +}
5577 +
5578 +static void commit_callback(void *context, int success)
5579 +{
5580 +       struct pending_exception *pe = (struct pending_exception *) context;
5581 +       pending_complete(pe, success);
5582 +}
5583 +
5584 +/*
5585 + * Called when the copy I/O has finished.  kcopyd actually runs
5586 + * this code so don't block.
5587 + */
5588 +static void copy_callback(int read_err, unsigned int write_err, void *context)
5589 +{
5590 +       struct pending_exception *pe = (struct pending_exception *) context;
5591 +       struct dm_snapshot *s = pe->snap;
5592 +
5593 +       if (read_err || write_err)
5594 +               pending_complete(pe, 0);
5595 +
5596 +       else
5597 +               /* Update the metadata if we are persistent */
5598 +               s->store.commit_exception(&s->store, &pe->e, commit_callback,
5599 +                                         pe);
5600 +}
5601 +
5602 +/*
5603 + * Dispatches the copy operation to kcopyd.
5604 + */
5605 +static inline void start_copy(struct pending_exception *pe)
5606 +{
5607 +       struct dm_snapshot *s = pe->snap;
5608 +       struct io_region src, dest;
5609 +       kdev_t dev = s->origin->dev;
5610 +       int *sizes = blk_size[major(dev)];
5611 +       sector_t dev_size = (sector_t) -1;
5612 +
5613 +       if (pe->started)
5614 +               return;
5615 +
5616 +       /* this is protected by snap->lock */
5617 +       pe->started = 1;
5618 +
5619 +       if (sizes && sizes[minor(dev)])
5620 +               dev_size = sizes[minor(dev)] << 1;
5621 +
5622 +       src.dev = dev;
5623 +       src.sector = chunk_to_sector(s, pe->e.old_chunk);
5624 +       src.count = min(s->chunk_size, dev_size - src.sector);
5625 +
5626 +       dest.dev = s->cow->dev;
5627 +       dest.sector = chunk_to_sector(s, pe->e.new_chunk);
5628 +       dest.count = src.count;
5629 +
5630 +       /* Hand over to kcopyd */
5631 +       kcopyd_copy(s->kcopyd_client,
5632 +                   &src, 1, &dest, 0, copy_callback, pe);
5633 +}
5634 +
5635 +/*
5636 + * Looks to see if this snapshot already has a pending exception
5637 + * for this chunk; if not, it allocates a new one and inserts
5638 + * it into the pending table.
5639 + */
5640 +static struct pending_exception *find_pending_exception(struct dm_snapshot *s,
5641 +                                                       struct buffer_head *bh)
5642 +{
5643 +       struct exception *e;
5644 +       struct pending_exception *pe;
5645 +       chunk_t chunk = sector_to_chunk(s, bh->b_rsector);
5646 +
5647 +       /*
5648 +        * Is there a pending exception for this already ?
5649 +        */
5650 +       e = lookup_exception(&s->pending, chunk);
5651 +       if (e) {
5652 +               /* cast the exception to a pending exception */
5653 +               pe = list_entry(e, struct pending_exception, e);
5654 +
5655 +       } else {
5656 +               /* Create a new pending exception */
5657 +               pe = alloc_pending_exception();
5658 +               pe->e.old_chunk = chunk;
5659 +               pe->origin_bhs = pe->snapshot_bhs = NULL;
5660 +               INIT_LIST_HEAD(&pe->siblings);
5661 +               pe->snap = s;
5662 +               pe->started = 0;
5663 +
5664 +               if (s->store.prepare_exception(&s->store, &pe->e)) {
5665 +                       free_pending_exception(pe);
5666 +                       s->valid = 0;
5667 +                       return NULL;
5668 +               }
5669 +
5670 +               insert_exception(&s->pending, &pe->e);
5671 +       }
5672 +
5673 +       return pe;
5674 +}
5675 +
5676 +static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
5677 +                                  struct buffer_head *bh)
5678 +{
5679 +       bh->b_rdev = s->cow->dev;
5680 +       bh->b_rsector = chunk_to_sector(s, e->new_chunk) +
5681 +           (bh->b_rsector & s->chunk_mask);
5682 +}
5683 +
5684 +static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5685 +                       union map_info *map_context)
5686 +{
5687 +       struct exception *e;
5688 +       struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5689 +       int r = 1;
5690 +       chunk_t chunk;
5691 +       struct pending_exception *pe;
5692 +
5693 +       chunk = sector_to_chunk(s, bh->b_rsector);
5694 +
5695 +       /* Full snapshots are not usable */
5696 +       if (!s->valid)
5697 +               return -1;
5698 +
5699 +       /*
5700 +        * Write to snapshot - higher level takes care of RW/RO
5701 +        * flags so we should only get this if we are
5702 +        * writeable.
5703 +        */
5704 +       if (rw == WRITE) {
5705 +
5706 +               down_write(&s->lock);
5707 +
5708 +               /* If the block is already remapped - use that, else remap it */
5709 +               e = lookup_exception(&s->complete, chunk);
5710 +               if (e)
5711 +                       remap_exception(s, e, bh);
5712 +
5713 +               else {
5714 +                       pe = find_pending_exception(s, bh);
5715 +
5716 +                       if (!pe) {
5717 +                               s->store.drop_snapshot(&s->store);
5718 +                               s->valid = 0;
5719 +                               r = -EIO;
5720 +                       } else {
5721 +                               remap_exception(s, &pe->e, bh);
5722 +                               queue_buffer(&pe->snapshot_bhs, bh);
5723 +                               start_copy(pe);
5724 +                               r = 0;
5725 +                       }
5726 +               }
5727 +
5728 +               up_write(&s->lock);
5729 +
5730 +       } else {
5731 +               /*
5732 +                * FIXME: this read path scares me because we
5733 +                * always use the origin when we have a pending
5734 +                * exception.  However I can't think of a
5735 +                * situation where this is wrong - ejt.
5736 +                */
5737 +
5738 +               /* Do reads */
5739 +               down_read(&s->lock);
5740 +
5741 +               /* See if it has been remapped */
5742 +               e = lookup_exception(&s->complete, chunk);
5743 +               if (e)
5744 +                       remap_exception(s, e, bh);
5745 +               else
5746 +                       bh->b_rdev = s->origin->dev;
5747 +
5748 +               up_read(&s->lock);
5749 +       }
5750 +
5751 +       return r;
5752 +}
5753 +
5754 +void snapshot_resume(struct dm_target *ti)
5755 +{
5756 +       struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5757 +
5758 +       if (s->have_metadata)
5759 +               return;
5760 +
5761 +       if (s->store.read_metadata(&s->store)) {
5762 +               down_write(&s->lock);
5763 +               s->valid = 0;
5764 +               up_write(&s->lock);
5765 +       }
5766 +
5767 +       s->have_metadata = 1;
5768 +}
5769 +
5770 +static int snapshot_status(struct dm_target *ti, status_type_t type,
5771 +                          char *result, unsigned int maxlen)
5772 +{
5773 +       struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;
5774 +       char cow[16];
5775 +       char org[16];
5776 +
5777 +       switch (type) {
5778 +       case STATUSTYPE_INFO:
5779 +               if (!snap->valid)
5780 +                       snprintf(result, maxlen, "Invalid");
5781 +               else {
5782 +                       if (snap->store.fraction_full) {
5783 +                               sector_t numerator, denominator;
5784 +                               snap->store.fraction_full(&snap->store,
5785 +                                                         &numerator,
5786 +                                                         &denominator);
5787 +                               snprintf(result, maxlen,
5788 +                                        SECTOR_FORMAT "/" SECTOR_FORMAT,
5789 +                                        numerator, denominator);
5790 +                       }
5791 +                       else
5792 +                               snprintf(result, maxlen, "Unknown");
5793 +               }
5794 +               break;
5795 +
5796 +       case STATUSTYPE_TABLE:
5797 +               /*
5798 +                * kdevname returns a static pointer so we need
5799 +                * to make private copies if the output is to
5800 +                * make sense.
5801 +                */
5802 +               strncpy(cow, dm_kdevname(snap->cow->dev), sizeof(cow));
5803 +               strncpy(org, dm_kdevname(snap->origin->dev), sizeof(org));
5804 +               snprintf(result, maxlen, "%s %s %c %ld", org, cow,
5805 +                        snap->type, snap->chunk_size);
5806 +               break;
5807 +       }
5808 +
5809 +       return 0;
5810 +}
5811 +
5812 +/*-----------------------------------------------------------------
5813 + * Origin methods
5814 + *---------------------------------------------------------------*/
5815 +static void list_merge(struct list_head *l1, struct list_head *l2)
5816 +{
5817 +       struct list_head *l1_n, *l2_p;
5818 +
5819 +       l1_n = l1->next;
5820 +       l2_p = l2->prev;
5821 +
5822 +       l1->next = l2;
5823 +       l2->prev = l1;
5824 +
5825 +       l2_p->next = l1_n;
5826 +       l1_n->prev = l2_p;
5827 +}
5828 +
5829 +static int __origin_write(struct list_head *snapshots, struct buffer_head *bh)
5830 +{
5831 +       int r = 1, first = 1;
5832 +       struct list_head *sl;
5833 +       struct dm_snapshot *snap;
5834 +       struct exception *e;
5835 +       struct pending_exception *pe, *last = NULL;
5836 +       chunk_t chunk;
5837 +
5838 +       /* Do all the snapshots on this origin */
5839 +       list_for_each(sl, snapshots) {
5840 +               snap = list_entry(sl, struct dm_snapshot, list);
5841 +
5842 +               /* Only deal with valid snapshots */
5843 +               if (!snap->valid)
5844 +                       continue;
5845 +
5846 +               down_write(&snap->lock);
5847 +
5848 +               /*
5849 +                * Remember, different snapshots can have
5850 +                * different chunk sizes.
5851 +                */
5852 +               chunk = sector_to_chunk(snap, bh->b_rsector);
5853 +
5854 +               /*
5855 +                * Check exception table to see if block
5856 +                * is already remapped in this snapshot
5857 +                * and trigger an exception if not.
5858 +                */
5859 +               e = lookup_exception(&snap->complete, chunk);
5860 +               if (!e) {
5861 +                       pe = find_pending_exception(snap, bh);
5862 +                       if (!pe) {
5863 +                               snap->store.drop_snapshot(&snap->store);
5864 +                               snap->valid = 0;
5865 +
5866 +                       } else {
5867 +                               if (last)
5868 +                                       list_merge(&pe->siblings,
5869 +                                                  &last->siblings);
5870 +
5871 +                               last = pe;
5872 +                               r = 0;
5873 +                       }
5874 +               }
5875 +
5876 +               up_write(&snap->lock);
5877 +       }
5878 +
5879 +       /*
5880 +        * Now that we have a complete pe list we can start the copying.
5881 +        */
5882 +       if (last) {
5883 +               pe = last;
5884 +               do {
5885 +                       down_write(&pe->snap->lock);
5886 +                       if (first)
5887 +                               queue_buffer(&pe->origin_bhs, bh);
5888 +                       start_copy(pe);
5889 +                       up_write(&pe->snap->lock);
5890 +                       first = 0;
5891 +                       pe = list_entry(pe->siblings.next,
5892 +                                       struct pending_exception, siblings);
5893 +
5894 +               } while (pe != last);
5895 +       }
5896 +
5897 +       return r;
5898 +}
5899 +
5900 +/*
5901 + * Called on a write from the origin driver.
5902 + */
5903 +int do_origin(struct dm_dev *origin, struct buffer_head *bh)
5904 +{
5905 +       struct origin *o;
5906 +       int r;
5907 +
5908 +       down_read(&_origins_lock);
5909 +       o = __lookup_origin(origin->dev);
5910 +       if (!o)
5911 +               BUG();
5912 +
5913 +       r = __origin_write(&o->snapshots, bh);
5914 +       up_read(&_origins_lock);
5915 +
5916 +       return r;
5917 +}
5918 +
5919 +/*
5920 + * Origin: maps a linear range of a device, with hooks for snapshotting.
5921 + */
5922 +
5923 +/*
5924 + * Construct an origin mapping: <dev_path>
5925 + * The context for an origin is merely a 'struct dm_dev *'
5926 + * pointing to the real device.
5927 + */
5928 +static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
5929 +{
5930 +       int r;
5931 +       struct dm_dev *dev;
5932 +
5933 +       if (argc != 1) {
5934 +               ti->error = "dm-origin: incorrect number of arguments";
5935 +               return -EINVAL;
5936 +       }
5937 +
5938 +       r = dm_get_device(ti, argv[0], 0, ti->len,
5939 +                         dm_table_get_mode(ti->table), &dev);
5940 +       if (r) {
5941 +               ti->error = "Cannot get target device";
5942 +               return r;
5943 +       }
5944 +
5945 +       ti->private = dev;
5946 +       return 0;
5947 +}
5948 +
5949 +static void origin_dtr(struct dm_target *ti)
5950 +{
5951 +       struct dm_dev *dev = (struct dm_dev *) ti->private;
5952 +       dm_put_device(ti, dev);
5953 +}
5954 +
5955 +static int origin_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5956 +                     union map_info *map_context)
5957 +{
5958 +       struct dm_dev *dev = (struct dm_dev *) ti->private;
5959 +       bh->b_rdev = dev->dev;
5960 +
5961 +       /* Only tell snapshots if this is a write */
5962 +       return (rw == WRITE) ? do_origin(dev, bh) : 1;
5963 +}
5964 +
5965 +static int origin_status(struct dm_target *ti, status_type_t type, char *result,
5966 +                        unsigned int maxlen)
5967 +{
5968 +       struct dm_dev *dev = (struct dm_dev *) ti->private;
5969 +
5970 +       switch (type) {
5971 +       case STATUSTYPE_INFO:
5972 +               result[0] = '\0';
5973 +               break;
5974 +
5975 +       case STATUSTYPE_TABLE:
5976 +               snprintf(result, maxlen, "%s", dm_kdevname(dev->dev));
5977 +               break;
5978 +       }
5979 +
5980 +       return 0;
5981 +}
5982 +
5983 +static struct target_type origin_target = {
5984 +       name:   "snapshot-origin",
5985 +       module: THIS_MODULE,
5986 +       ctr:    origin_ctr,
5987 +       dtr:    origin_dtr,
5988 +       map:    origin_map,
5989 +       status: origin_status,
5990 +};
5991 +
5992 +static struct target_type snapshot_target = {
5993 +       name:   "snapshot",
5994 +       module: THIS_MODULE,
5995 +       ctr:    snapshot_ctr,
5996 +       dtr:    snapshot_dtr,
5997 +       map:    snapshot_map,
5998 +       resume: snapshot_resume,
5999 +       status: snapshot_status,
6000 +};
6001 +
6002 +int __init dm_snapshot_init(void)
6003 +{
6004 +       int r;
6005 +
6006 +       r = dm_register_target(&snapshot_target);
6007 +       if (r) {
6008 +               DMERR("snapshot target register failed %d", r);
6009 +               return r;
6010 +       }
6011 +
6012 +       r = dm_register_target(&origin_target);
6013 +       if (r < 0) {
6014 +               DMERR("Device mapper: Origin: register failed %d\n", r);
6015 +               goto bad1;
6016 +       }
6017 +
6018 +       r = init_origin_hash();
6019 +       if (r) {
6020 +               DMERR("init_origin_hash failed.");
6021 +               goto bad2;
6022 +       }
6023 +
6024 +       exception_cache = kmem_cache_create("dm-snapshot-ex",
6025 +                                           sizeof(struct exception),
6026 +                                           __alignof__(struct exception),
6027 +                                           0, NULL, NULL);
6028 +       if (!exception_cache) {
6029 +               DMERR("Couldn't create exception cache.");
6030 +               r = -ENOMEM;
6031 +               goto bad3;
6032 +       }
6033 +
6034 +       pending_cache =
6035 +           kmem_cache_create("dm-snapshot-in",
6036 +                             sizeof(struct pending_exception),
6037 +                             __alignof__(struct pending_exception),
6038 +                             0, NULL, NULL);
6039 +       if (!pending_cache) {
6040 +               DMERR("Couldn't create pending cache.");
6041 +               r = -ENOMEM;
6042 +               goto bad4;
6043 +       }
6044 +
6045 +       pending_pool = mempool_create(128, mempool_alloc_slab,
6046 +                                     mempool_free_slab, pending_cache);
6047 +       if (!pending_pool) {
6048 +               DMERR("Couldn't create pending pool.");
6049 +               r = -ENOMEM;
6050 +               goto bad5;
6051 +       }
6052 +
6053 +       return 0;
6054 +
6055 +      bad5:
6056 +       kmem_cache_destroy(pending_cache);
6057 +      bad4:
6058 +       kmem_cache_destroy(exception_cache);
6059 +      bad3:
6060 +       exit_origin_hash();
6061 +      bad2:
6062 +       dm_unregister_target(&origin_target);
6063 +      bad1:
6064 +       dm_unregister_target(&snapshot_target);
6065 +       return r;
6066 +}
6067 +
6068 +void dm_snapshot_exit(void)
6069 +{
6070 +       int r;
6071 +
6072 +       r = dm_unregister_target(&snapshot_target);
6073 +       if (r)
6074 +               DMERR("snapshot unregister failed %d", r);
6075 +
6076 +       r = dm_unregister_target(&origin_target);
6077 +       if (r)
6078 +               DMERR("origin unregister failed %d", r);
6079 +
6080 +       exit_origin_hash();
6081 +       mempool_destroy(pending_pool);
6082 +       kmem_cache_destroy(pending_cache);
6083 +       kmem_cache_destroy(exception_cache);
6084 +}
6085 --- diff/drivers/md/dm-snapshot.h       1970-01-01 01:00:00.000000000 +0100
6086 +++ source/drivers/md/dm-snapshot.h     2003-10-16 10:44:23.000000000 +0100
6087 @@ -0,0 +1,158 @@
6088 +/*
6089 + * dm-snapshot.c
6090 + *
6091 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
6092 + *
6093 + * This file is released under the GPL.
6094 + */
6095 +
6096 +#ifndef DM_SNAPSHOT_H
6097 +#define DM_SNAPSHOT_H
6098 +
6099 +#include "dm.h"
6100 +#include <linux/blkdev.h>
6101 +
6102 +struct exception_table {
6103 +       uint32_t hash_mask;
6104 +       struct list_head *table;
6105 +};
6106 +
6107 +/*
6108 + * The snapshot code deals with largish chunks of the disk at a
6109 + * time. Typically 64k - 256k.
6110 + */
6111 +/* FIXME: can we get away with limiting these to a uint32_t ? */
6112 +typedef sector_t chunk_t;
6113 +
6114 +/*
6115 + * An exception is used where an old chunk of data has been
6116 + * replaced by a new one.
6117 + */
6118 +struct exception {
6119 +       struct list_head hash_list;
6120 +
6121 +       chunk_t old_chunk;
6122 +       chunk_t new_chunk;
6123 +};
6124 +
6125 +/*
6126 + * Abstraction to handle the meta/layout of exception stores (the
6127 + * COW device).
6128 + */
6129 +struct exception_store {
6130 +
6131 +       /*
6132 +        * Destroys this object when you've finished with it.
6133 +        */
6134 +       void (*destroy) (struct exception_store *store);
6135 +
6136 +       /*
6137 +        * The target shouldn't read the COW device until this is
6138 +        * called.
6139 +        */
6140 +       int (*read_metadata) (struct exception_store *store);
6141 +
6142 +       /*
6143 +        * Find somewhere to store the next exception.
6144 +        */
6145 +       int (*prepare_exception) (struct exception_store *store,
6146 +                                 struct exception *e);
6147 +
6148 +       /*
6149 +        * Update the metadata with this exception.
6150 +        */
6151 +       void (*commit_exception) (struct exception_store *store,
6152 +                                 struct exception *e,
6153 +                                 void (*callback) (void *, int success),
6154 +                                 void *callback_context);
6155 +
6156 +       /*
6157 +        * The snapshot is invalid, note this in the metadata.
6158 +        */
6159 +       void (*drop_snapshot) (struct exception_store *store);
6160 +
6161 +       /*
6162 +        * Return how full the snapshot is.
6163 +        */
6164 +       void (*fraction_full) (struct exception_store *store,
6165 +                              sector_t *numerator,
6166 +                              sector_t *denominator);
6167 +
6168 +       struct dm_snapshot *snap;
6169 +       void *context;
6170 +};
6171 +
6172 +struct dm_snapshot {
6173 +       struct rw_semaphore lock;
6174 +       struct dm_table *table;
6175 +
6176 +       struct dm_dev *origin;
6177 +       struct dm_dev *cow;
6178 +
6179 +       /* List of snapshots per Origin */
6180 +       struct list_head list;
6181 +
6182 +       /* Size of data blocks saved - must be a power of 2 */
6183 +       chunk_t chunk_size;
6184 +       chunk_t chunk_mask;
6185 +       chunk_t chunk_shift;
6186 +
6187 +       /* You can't use a snapshot if this is 0 (e.g. if full) */
6188 +       int valid;
6189 +       int have_metadata;
6190 +
6191 +       /* Used for display of table */
6192 +       char type;
6193 +
6194 +       /* The last percentage we notified */
6195 +       int last_percent;
6196 +
6197 +       struct exception_table pending;
6198 +       struct exception_table complete;
6199 +
6200 +       /* The on disk metadata handler */
6201 +       struct exception_store store;
6202 +
6203 +       struct kcopyd_client *kcopyd_client;
6204 +};
6205 +
6206 +/*
6207 + * Used by the exception stores to load exceptions when
6208 + * initialising.
6209 + */
6210 +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
6211 +
6212 +/*
6213 + * Constructor and destructor for the default persistent
6214 + * store.
6215 + */
6216 +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size);
6217 +
6218 +int dm_create_transient(struct exception_store *store,
6219 +                       struct dm_snapshot *s, int blocksize);
6220 +
6221 +/*
6222 + * Return the number of sectors in the device.
6223 + */
6224 +static inline sector_t get_dev_size(kdev_t dev)
6225 +{
6226 +       int *sizes;
6227 +
6228 +       sizes = blk_size[MAJOR(dev)];
6229 +       if (sizes)
6230 +               return sizes[MINOR(dev)] << 1;
6231 +
6232 +       return 0;
6233 +}
6234 +
6235 +static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
6236 +{
6237 +       return (sector & ~s->chunk_mask) >> s->chunk_shift;
6238 +}
6239 +
6240 +static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
6241 +{
6242 +       return chunk << s->chunk_shift;
6243 +}
6244 +
6245 +#endif
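A small userspace sketch of how the two helpers above translate between sectors and chunks, assuming a chunk_size of 16 sectors (so chunk_shift = 4 and chunk_mask = 15, as derived in snapshot_ctr()):

#include <assert.h>
#include <stdint.h>

typedef uint64_t sector_t;
typedef sector_t chunk_t;

#define CHUNK_SHIFT 4				/* 16-sector chunks */
#define CHUNK_MASK  ((sector_t) ((1 << CHUNK_SHIFT) - 1))

static chunk_t sector_to_chunk(sector_t sector)
{
	return (sector & ~CHUNK_MASK) >> CHUNK_SHIFT;
}

static sector_t chunk_to_sector(chunk_t chunk)
{
	return chunk << CHUNK_SHIFT;
}

int main(void)
{
	assert(sector_to_chunk(37) == 2);	/* sectors 32..47 form chunk 2 */
	assert(chunk_to_sector(2) == 32);	/* chunk 2 starts at sector 32 */
	return 0;
}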
6246 --- diff/drivers/md/dm-stripe.c 1970-01-01 01:00:00.000000000 +0100
6247 +++ source/drivers/md/dm-stripe.c       2003-10-16 10:44:23.000000000 +0100
6248 @@ -0,0 +1,258 @@
6249 +/*
6250 + * Copyright (C) 2001 Sistina Software (UK) Limited.
6251 + *
6252 + * This file is released under the GPL.
6253 + */
6254 +
6255 +#include "dm.h"
6256 +
6257 +#include <linux/module.h>
6258 +#include <linux/init.h>
6259 +#include <linux/blkdev.h>
6260 +#include <linux/slab.h>
6261 +
6262 +struct stripe {
6263 +       struct dm_dev *dev;
6264 +       sector_t physical_start;
6265 +};
6266 +
6267 +struct stripe_c {
6268 +       uint32_t stripes;
6269 +
6270 +       /* The size of this target / num. stripes */
6271 +       uint32_t stripe_width;
6272 +
6273 +       /* stripe chunk size */
6274 +       uint32_t chunk_shift;
6275 +       sector_t chunk_mask;
6276 +
6277 +       struct stripe stripe[0];
6278 +};
6279 +
6280 +static inline struct stripe_c *alloc_context(unsigned int stripes)
6281 +{
6282 +       size_t len;
6283 +
6284 +       if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
6285 +                         stripes))
6286 +               return NULL;
6287 +
6288 +       len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
6289 +
6290 +       return kmalloc(len, GFP_KERNEL);
6291 +}
6292 +
6293 +/*
6294 + * Parse a single <dev> <sector> pair
6295 + */
6296 +static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
6297 +                     unsigned int stripe, char **argv)
6298 +{
6299 +       sector_t start;
6300 +
6301 +       if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1)
6302 +               return -EINVAL;
6303 +
6304 +       if (dm_get_device(ti, argv[0], start, sc->stripe_width,
6305 +                         dm_table_get_mode(ti->table),
6306 +                         &sc->stripe[stripe].dev))
6307 +               return -ENXIO;
6308 +
6309 +       sc->stripe[stripe].physical_start = start;
6310 +       return 0;
6311 +}
6312 +
6313 +/*
6314 + * FIXME: Nasty function, only present because we can't link
6315 + * against __moddi3 and __divdi3.
6316 + *
6317 + * Returns non-zero if a is an exact multiple of b; *n is set to a / b.
6318 + */
6319 +static int multiple(sector_t a, sector_t b, sector_t *n)
6320 +{
6321 +       sector_t acc, prev, i;
6322 +
6323 +       *n = 0;
6324 +       while (a >= b) {
6325 +               for (acc = b, prev = 0, i = 1;
6326 +                    acc <= a;
6327 +                    prev = acc, acc <<= 1, i <<= 1)
6328 +                       ;
6329 +
6330 +               a -= prev;
6331 +               *n += i >> 1;
6332 +       }
6333 +
6334 +       return a == 0;
6335 +}
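To make the FIXME above less mysterious: multiple() is shift-and-subtract long division, so it both reports whether the division is exact and leaves the quotient in *n. A self-contained sketch of the same algorithm with a couple of worked values:

#include <assert.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Same algorithm as multiple() above: repeated doubling
 * (shift-and-subtract division).  Sets *n = a / b and returns
 * whether the division was exact. */
static int is_multiple(sector_t a, sector_t b, sector_t *n)
{
	sector_t acc, prev, i;

	*n = 0;
	while (a >= b) {
		for (acc = b, prev = 0, i = 1; acc <= a;
		     prev = acc, acc <<= 1, i <<= 1)
			;
		a -= prev;
		*n += i >> 1;
	}
	return a == 0;
}

int main(void)
{
	sector_t n;

	assert(is_multiple(632, 8, &n) && n == 79);	/* exact: 632 = 8 * 79 */
	assert(!is_multiple(600, 7, &n) && n == 85);	/* 600 = 7 * 85 + 5 */
	return 0;
}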
6336 +
6337 +/*
6338 + * Construct a striped mapping.
6339 + * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+
6340 + */
6341 +static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
6342 +{
6343 +       struct stripe_c *sc;
6344 +       sector_t width;
6345 +       uint32_t stripes;
6346 +       uint32_t chunk_size;
6347 +       char *end;
6348 +       int r;
6349 +       unsigned int i;
6350 +
6351 +       if (argc < 2) {
6352 +               ti->error = "dm-stripe: Not enough arguments";
6353 +               return -EINVAL;
6354 +       }
6355 +
6356 +       stripes = simple_strtoul(argv[0], &end, 10);
6357 +       if (*end) {
6358 +               ti->error = "dm-stripe: Invalid stripe count";
6359 +               return -EINVAL;
6360 +       }
6361 +
6362 +       chunk_size = simple_strtoul(argv[1], &end, 10);
6363 +       if (*end) {
6364 +               ti->error = "dm-stripe: Invalid chunk_size";
6365 +               return -EINVAL;
6366 +       }
6367 +
6368 +       /*
6369 +        * chunk_size is a power of two
6370 +        */
6371 +       if (!chunk_size || (chunk_size & (chunk_size - 1))) {
6372 +               ti->error = "dm-stripe: Invalid chunk size";
6373 +               return -EINVAL;
6374 +       }
6375 +
6376 +       if (!multiple(ti->len, stripes, &width)) {
6377 +               ti->error = "dm-stripe: Target length not divisible by "
6378 +                   "number of stripes";
6379 +               return -EINVAL;
6380 +       }
6381 +
6382 +       /*
6383 +        * Do we have enough arguments for that many stripes ?
6384 +        */
6385 +       if (argc != (2 + 2 * stripes)) {
6386 +               ti->error = "dm-stripe: Not enough destinations specified";
6387 +               return -EINVAL;
6388 +       }
6389 +
6390 +       sc = alloc_context(stripes);
6391 +       if (!sc) {
6392 +               ti->error = "dm-stripe: Memory allocation for striped context "
6393 +                   "failed";
6394 +               return -ENOMEM;
6395 +       }
6396 +
6397 +       sc->stripes = stripes;
6398 +       sc->stripe_width = width;
6399 +
6400 +       sc->chunk_mask = ((sector_t) chunk_size) - 1;
6401 +       for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
6402 +               chunk_size >>= 1;
6403 +       sc->chunk_shift--;
6404 +
6405 +       /*
6406 +        * Get the stripe destinations.
6407 +        */
6408 +       for (i = 0; i < stripes; i++) {
6409 +               argv += 2;
6410 +
6411 +               r = get_stripe(ti, sc, i, argv);
6412 +               if (r < 0) {
6413 +                       ti->error = "dm-stripe: Couldn't parse stripe "
6414 +                           "destination";
6415 +                       while (i--)
6416 +                               dm_put_device(ti, sc->stripe[i].dev);
6417 +                       kfree(sc);
6418 +                       return r;
6419 +               }
6420 +       }
6421 +
6422 +       ti->private = sc;
6423 +       return 0;
6424 +}
6425 +
6426 +static void stripe_dtr(struct dm_target *ti)
6427 +{
6428 +       unsigned int i;
6429 +       struct stripe_c *sc = (struct stripe_c *) ti->private;
6430 +
6431 +       for (i = 0; i < sc->stripes; i++)
6432 +               dm_put_device(ti, sc->stripe[i].dev);
6433 +
6434 +       kfree(sc);
6435 +}
6436 +
6437 +static int stripe_map(struct dm_target *ti, struct buffer_head *bh, int rw,
6438 +                     union map_info *context)
6439 +{
6440 +       struct stripe_c *sc = (struct stripe_c *) ti->private;
6441 +
6442 +       sector_t offset = bh->b_rsector - ti->begin;
6443 +       uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift);
6444 +       uint32_t stripe = chunk % sc->stripes;  /* 32bit modulus */
6445 +       chunk = chunk / sc->stripes;
6446 +
6447 +       bh->b_rdev = sc->stripe[stripe].dev->dev;
6448 +       bh->b_rsector = sc->stripe[stripe].physical_start +
6449 +           (chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
6450 +       return 1;
6451 +}
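A userspace sketch of the remapping arithmetic in stripe_map() above, assuming 3 stripes and 8-sector (4 KiB) chunks (chunk_shift = 3, chunk_mask = 7):

#include <assert.h>
#include <stdint.h>

typedef uint64_t sector_t;

#define STRIPES     3u
#define CHUNK_SHIFT 3
#define CHUNK_MASK  ((sector_t) 7)

struct remap {
	uint32_t stripe;
	sector_t offset_in_stripe;
};

/* Same arithmetic as stripe_map(): chunks are dealt out round-robin
 * across the stripes. */
static struct remap stripe_remap(sector_t offset)
{
	uint32_t chunk = (uint32_t) (offset >> CHUNK_SHIFT);
	struct remap r;

	r.stripe = chunk % STRIPES;
	chunk /= STRIPES;
	r.offset_in_stripe = ((sector_t) chunk << CHUNK_SHIFT) +
			     (offset & CHUNK_MASK);
	return r;
}

int main(void)
{
	/* Offset 50 lies in chunk 6; chunk 6 is the third chunk (index 2)
	 * on stripe 0, so it maps to sector 2*8 + 2 = 18 of stripe 0. */
	struct remap r = stripe_remap(50);

	assert(r.stripe == 0 && r.offset_in_stripe == 18);
	return 0;
}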
6452 +
6453 +static int stripe_status(struct dm_target *ti, status_type_t type,
6454 +                        char *result, unsigned int maxlen)
6455 +{
6456 +       struct stripe_c *sc = (struct stripe_c *) ti->private;
6457 +       int offset;
6458 +       unsigned int i;
6459 +
6460 +       switch (type) {
6461 +       case STATUSTYPE_INFO:
6462 +               result[0] = '\0';
6463 +               break;
6464 +
6465 +       case STATUSTYPE_TABLE:
6466 +               offset = snprintf(result, maxlen, "%d " SECTOR_FORMAT,
6467 +                                 sc->stripes, sc->chunk_mask + 1);
6468 +               for (i = 0; i < sc->stripes; i++) {
6469 +                       offset +=
6470 +                           snprintf(result + offset, maxlen - offset,
6471 +                                    " %s " SECTOR_FORMAT,
6472 +                      dm_kdevname(to_kdev_t(sc->stripe[i].dev->bdev->bd_dev)),
6473 +                                    sc->stripe[i].physical_start);
6474 +               }
6475 +               break;
6476 +       }
6477 +       return 0;
6478 +}
6479 +
6480 +static struct target_type stripe_target = {
6481 +       .name   = "striped",
6482 +       .module = THIS_MODULE,
6483 +       .ctr    = stripe_ctr,
6484 +       .dtr    = stripe_dtr,
6485 +       .map    = stripe_map,
6486 +       .status = stripe_status,
6487 +};
6488 +
6489 +int __init dm_stripe_init(void)
6490 +{
6491 +       int r;
6492 +
6493 +       r = dm_register_target(&stripe_target);
6494 +       if (r < 0)
6495 +               DMWARN("striped target registration failed");
6496 +
6497 +       return r;
6498 +}
6499 +
6500 +void dm_stripe_exit(void)
6501 +{
6502 +       if (dm_unregister_target(&stripe_target))
6503 +               DMWARN("striped target unregistration failed");
6504 +
6505 +       return;
6506 +}
6507 --- diff/drivers/md/dm-table.c  1970-01-01 01:00:00.000000000 +0100
6508 +++ source/drivers/md/dm-table.c        2003-10-16 10:44:23.000000000 +0100
6509 @@ -0,0 +1,687 @@
6510 +/*
6511 + * Copyright (C) 2001 Sistina Software (UK) Limited.
6512 + *
6513 + * This file is released under the GPL.
6514 + */
6515 +
6516 +#include "dm.h"
6517 +
6518 +#include <linux/module.h>
6519 +#include <linux/vmalloc.h>
6520 +#include <linux/blkdev.h>
6521 +#include <linux/ctype.h>
6522 +#include <linux/slab.h>
6523 +#include <asm/atomic.h>
6524 +
6525 +#define MAX_DEPTH 16
6526 +#define NODE_SIZE L1_CACHE_BYTES
6527 +#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
6528 +#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
6529 +
6530 +struct dm_table {
6531 +       atomic_t holders;
6532 +
6533 +       /* btree table */
6534 +       unsigned int depth;
6535 +       unsigned int counts[MAX_DEPTH]; /* in nodes */
6536 +       sector_t *index[MAX_DEPTH];
6537 +
6538 +       unsigned int num_targets;
6539 +       unsigned int num_allocated;
6540 +       sector_t *highs;
6541 +       struct dm_target *targets;
6542 +
6543 +       /*
6544 +        * Indicates the rw permissions for the new logical
6545 +        * device.  This should be a combination of FMODE_READ
6546 +        * and FMODE_WRITE.
6547 +        */
6548 +       int mode;
6549 +
6550 +       /* a list of devices used by this table */
6551 +       struct list_head devices;
6552 +
6553 +       /* events get handed up using this callback */
6554 +       void (*event_fn)(void *);
6555 +       void *event_context;
6556 +};
6557 +
6558 +/*
6559 + * Similar to ceiling(log(n) / log(base))
6560 + */
6561 +static unsigned int int_log(unsigned long n, unsigned long base)
6562 +{
6563 +       int result = 0;
6564 +
6565 +       while (n > 1) {
6566 +               n = dm_div_up(n, base);
6567 +               result++;
6568 +       }
6569 +
6570 +       return result;
6571 +}
6572 +
6573 +/*
6574 + * Calculate the index of the child node of the n'th node's k'th key.
6575 + */
6576 +static inline unsigned int get_child(unsigned int n, unsigned int k)
6577 +{
6578 +       return (n * CHILDREN_PER_NODE) + k;
6579 +}
6580 +
6581 +/*
6582 + * Return the n'th node of level l from table t.
6583 + */
6584 +static inline sector_t *get_node(struct dm_table *t, unsigned int l,
6585 +                                unsigned int n)
6586 +{
6587 +       return t->index[l] + (n * KEYS_PER_NODE);
6588 +}
6589 +
6590 +/*
6591 + * Return the highest key that you could look up from the n'th
6592 + * node on level l of the btree.
6593 + */
6594 +static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
6595 +{
6596 +       for (; l < t->depth - 1; l++)
6597 +               n = get_child(n, CHILDREN_PER_NODE - 1);
6598 +
6599 +       if (n >= t->counts[l])
6600 +               return (sector_t) - 1;
6601 +
6602 +       return get_node(t, l, n)[KEYS_PER_NODE - 1];
6603 +}
6604 +
6605 +/*
6606 + * Fills in a level of the btree based on the highs of the level
6607 + * below it.
6608 + */
6609 +static int setup_btree_index(unsigned int l, struct dm_table *t)
6610 +{
6611 +       unsigned int n, k;
6612 +       sector_t *node;
6613 +
6614 +       for (n = 0U; n < t->counts[l]; n++) {
6615 +               node = get_node(t, l, n);
6616 +
6617 +               for (k = 0U; k < KEYS_PER_NODE; k++)
6618 +                       node[k] = high(t, l + 1, get_child(n, k));
6619 +       }
6620 +
6621 +       return 0;
6622 +}
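An illustrative sketch of the index geometry above, assuming a 64-byte cache line and an 8-byte sector_t (so KEYS_PER_NODE = 8 and CHILDREN_PER_NODE = 9), and assuming dm_div_up() is ordinary round-up division:

#include <assert.h>

#define CHILDREN_PER_NODE 9	/* assumed geometry, see lead-in */

/* Mirrors int_log() above: ceil(log(n) / log(base)). */
static unsigned int int_log(unsigned long n, unsigned long base)
{
	unsigned int result = 0;

	while (n > 1) {
		n = (n + base - 1) / base;	/* assumed dm_div_up() */
		result++;
	}
	return result;
}

static unsigned int get_child(unsigned int n, unsigned int k)
{
	return (n * CHILDREN_PER_NODE) + k;
}

int main(void)
{
	assert(int_log(9, 9) == 1);	/* 9 leaf nodes fit under one root */
	assert(int_log(10, 9) == 2);	/* 10 leaf nodes need two index levels */

	/* Children of node 3 occupy slots 27..35 on the level below. */
	assert(get_child(3, 0) == 27 && get_child(3, 8) == 35);
	return 0;
}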
6623 +
6624 +/*
6625 + * highs and targets are managed as dynamic arrays during a
6626 + * table load.
6627 + */
6628 +static int alloc_targets(struct dm_table *t, unsigned int num)
6629 +{
6630 +       sector_t *n_highs;
6631 +       struct dm_target *n_targets;
6632 +       int n = t->num_targets;
6633 +
6634 +       /*
6635 +        * Allocate both the target array and offset array at once.
6636 +        */
6637 +       n_highs = (sector_t *) vcalloc(sizeof(struct dm_target) +
6638 +                                      sizeof(sector_t), num);
6639 +       if (!n_highs)
6640 +               return -ENOMEM;
6641 +
6642 +       n_targets = (struct dm_target *) (n_highs + num);
6643 +
6644 +       if (n) {
6645 +               memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
6646 +               memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
6647 +       }
6648 +
6649 +       memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
6650 +       vfree(t->highs);
6651 +
6652 +       t->num_allocated = num;
6653 +       t->highs = n_highs;
6654 +       t->targets = n_targets;
6655 +
6656 +       return 0;
6657 +}
6658 +
6659 +int dm_table_create(struct dm_table **result, int mode)
6660 +{
6661 +       struct dm_table *t = kmalloc(sizeof(*t), GFP_NOIO);
6662 +
6663 +       if (!t)
6664 +               return -ENOMEM;
6665 +
6666 +       memset(t, 0, sizeof(*t));
6667 +       INIT_LIST_HEAD(&t->devices);
6668 +       atomic_set(&t->holders, 1);
6669 +
6670 +       /* allocate a single node's worth of targets to begin with */
6671 +       if (alloc_targets(t, KEYS_PER_NODE)) {
6672 +               kfree(t);
6673 +               t = NULL;
6674 +               return -ENOMEM;
6675 +       }
6676 +
6677 +       t->mode = mode;
6678 +       *result = t;
6679 +       return 0;
6680 +}
6681 +
6682 +static void free_devices(struct list_head *devices)
6683 +{
6684 +       struct list_head *tmp, *next;
6685 +
6686 +       for (tmp = devices->next; tmp != devices; tmp = next) {
6687 +               struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
6688 +               next = tmp->next;
6689 +               kfree(dd);
6690 +       }
6691 +}
6692 +
6693 +void table_destroy(struct dm_table *t)
6694 +{
6695 +       unsigned int i;
6696 +
6697 +       /* free the indexes (see dm_table_complete) */
6698 +       if (t->depth >= 2)
6699 +               vfree(t->index[t->depth - 2]);
6700 +
6701 +       /* free the targets */
6702 +       for (i = 0; i < t->num_targets; i++) {
6703 +               struct dm_target *tgt = t->targets + i;
6704 +
6705 +               if (tgt->type->dtr)
6706 +                       tgt->type->dtr(tgt);
6707 +
6708 +               dm_put_target_type(tgt->type);
6709 +       }
6710 +
6711 +       vfree(t->highs);
6712 +
6713 +       /* free the device list */
6714 +       if (t->devices.next != &t->devices) {
6715 +               DMWARN("devices still present during destroy: "
6716 +                      "dm_table_remove_device calls missing");
6717 +
6718 +               free_devices(&t->devices);
6719 +       }
6720 +
6721 +       kfree(t);
6722 +}
6723 +
6724 +void dm_table_get(struct dm_table *t)
6725 +{
6726 +       atomic_inc(&t->holders);
6727 +}
6728 +
6729 +void dm_table_put(struct dm_table *t)
6730 +{
6731 +       if (atomic_dec_and_test(&t->holders))
6732 +               table_destroy(t);
6733 +}
6734 +
6735 +/*
6736 + * Checks to see if we need to extend highs or targets.
6737 + */
6738 +static inline int check_space(struct dm_table *t)
6739 +{
6740 +       if (t->num_targets >= t->num_allocated)
6741 +               return alloc_targets(t, t->num_allocated * 2);
6742 +
6743 +       return 0;
6744 +}
6745 +
6746 +/*
6747 + * Convert a device path to a kdev_t.
6748 + */
6749 +static int lookup_device(const char *path, kdev_t *dev)
6750 +{
6751 +       int r;
6752 +       struct nameidata nd;
6753 +       struct inode *inode;
6754 +
6755 +       if (!path_init(path, LOOKUP_FOLLOW, &nd))
6756 +               return 0;
6757 +
6758 +       if ((r = path_walk(path, &nd)))
6759 +               goto out;
6760 +
6761 +       inode = nd.dentry->d_inode;
6762 +       if (!inode) {
6763 +               r = -ENOENT;
6764 +               goto out;
6765 +       }
6766 +
6767 +       if (!S_ISBLK(inode->i_mode)) {
6768 +               r = -ENOTBLK;
6769 +               goto out;
6770 +       }
6771 +
6772 +       *dev = inode->i_rdev;
6773 +
6774 +      out:
6775 +       path_release(&nd);
6776 +       return r;
6777 +}
6778 +
6779 +/*
6780 + * See if we've already got a device in the list.
6781 + */
6782 +static struct dm_dev *find_device(struct list_head *l, kdev_t dev)
6783 +{
6784 +       struct list_head *tmp;
6785 +
6786 +       list_for_each(tmp, l) {
6787 +               struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
6788 +               if (kdev_same(dd->dev, dev))
6789 +                       return dd;
6790 +       }
6791 +
6792 +       return NULL;
6793 +}
6794 +
6795 +/*
6796 + * Open a device so we can use it as a map destination.
6797 + */
6798 +static int open_dev(struct dm_dev *dd)
6799 +{
6800 +       if (dd->bdev)
6801 +               BUG();
6802 +
6803 +       dd->bdev = bdget(kdev_t_to_nr(dd->dev));
6804 +       if (!dd->bdev)
6805 +               return -ENOMEM;
6806 +
6807 +       return blkdev_get(dd->bdev, dd->mode, 0, BDEV_RAW);
6808 +}
6809 +
6810 +/*
6811 + * Close a device that we've been using.
6812 + */
6813 +static void close_dev(struct dm_dev *dd)
6814 +{
6815 +       if (!dd->bdev)
6816 +               return;
6817 +
6818 +       blkdev_put(dd->bdev, BDEV_RAW);
6819 +       dd->bdev = NULL;
6820 +}
6821 +
6822 +/*
6823 + * If possible (i.e. blk_size[major] is set), this checks that an area
6824 + * of a destination device is valid.
6825 + */
6826 +static int check_device_area(kdev_t dev, sector_t start, sector_t len)
6827 +{
6828 +       int *sizes;
6829 +       sector_t dev_size;
6830 +
6831 +       if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)]))
6832 +               /* we don't know the device details,
6833 +                * so give the benefit of the doubt */
6834 +               return 1;
6835 +
6836 +       /* convert to 512-byte sectors */
6837 +       dev_size <<= 1;
6838 +
6839 +       return ((start < dev_size) && (len <= (dev_size - start)));
6840 +}
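A small sketch of the bounds test in check_device_area() above: the blk_size[][] entries are in 1 KiB blocks, so shifting left by one yields 512-byte sectors before the range check.

#include <assert.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Same test as check_device_area(), with the device size given
 * directly in 1 KiB blocks. */
static int area_fits(sector_t dev_kblocks, sector_t start, sector_t len)
{
	sector_t dev_size = dev_kblocks << 1;	/* KiB -> 512-byte sectors */

	return (start < dev_size) && (len <= (dev_size - start));
}

int main(void)
{
	/* A 1000 KiB device is 2000 sectors long. */
	assert(area_fits(1000, 0, 2000));	/* the whole device */
	assert(!area_fits(1000, 1500, 600));	/* runs 100 sectors past the end */
	return 0;
}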
6841 +
6842 +/*
6843 + * This upgrades the mode on an already open dm_dev, being
6844 + * careful to leave things as they were if we fail to reopen the
6845 + * device.
6846 + */
6847 +static int upgrade_mode(struct dm_dev *dd, int new_mode)
6848 +{
6849 +       int r;
6850 +       struct dm_dev dd_copy;
6851 +
6852 +       memcpy(&dd_copy, dd, sizeof(dd_copy));
6853 +
6854 +       dd->mode |= new_mode;
6855 +       dd->bdev = NULL;
6856 +       r = open_dev(dd);
6857 +       if (!r)
6858 +               close_dev(&dd_copy);
6859 +       else
6860 +               memcpy(dd, &dd_copy, sizeof(dd_copy));
6861 +
6862 +       return r;
6863 +}
6864 +
6865 +/*
6866 + * Add a device to the list, or just increment the usage count if
6867 + * it's already present.
6868 + */
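+/*
+ * The path may be given either as a device node such as "/dev/hda1"
+ * or as a bare "<major>:<minor>" pair such as "3:1".
+ */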
6869 +int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
6870 +                 sector_t len, int mode, struct dm_dev **result)
6871 +{
6872 +       int r;
6873 +       kdev_t dev;
6874 +       struct dm_dev *dd;
6875 +       unsigned major, minor;
6876 +       struct dm_table *t = ti->table;
6877 +
6878 +       if (!t)
6879 +               BUG();
6880 +
6881 +       if (sscanf(path, "%u:%u", &major, &minor) == 2) {
6882 +               /* Extract the major/minor numbers */
6883 +               dev = mk_kdev(major, minor);
6884 +       } else {
6885 +               /* convert the path to a device */
6886 +               if ((r = lookup_device(path, &dev)))
6887 +                       return r;
6888 +       }
6889 +
6890 +       dd = find_device(&t->devices, dev);
6891 +       if (!dd) {
6892 +               dd = kmalloc(sizeof(*dd), GFP_KERNEL);
6893 +               if (!dd)
6894 +                       return -ENOMEM;
6895 +
6896 +               dd->dev = dev;
6897 +               dd->mode = mode;
6898 +               dd->bdev = NULL;
6899 +
6900 +               if ((r = open_dev(dd))) {
6901 +                       kfree(dd);
6902 +                       return r;
6903 +               }
6904 +
6905 +               atomic_set(&dd->count, 0);
6906 +               list_add(&dd->list, &t->devices);
6907 +
6908 +       } else if (dd->mode != (mode | dd->mode)) {
6909 +               r = upgrade_mode(dd, mode);
6910 +               if (r)
6911 +                       return r;
6912 +       }
6913 +       atomic_inc(&dd->count);
6914 +
6915 +       if (!check_device_area(dd->dev, start, len)) {
6916 +               DMWARN("device %s too small for target", path);
6917 +               dm_put_device(ti, dd);
6918 +               return -EINVAL;
6919 +       }
6920 +
6921 +       *result = dd;
6922 +
6923 +       return 0;
6924 +}
6925 +
6926 +/*
6927 + * Decrement a device's use count and remove it if necessary.
6928 + */
6929 +void dm_put_device(struct dm_target *ti, struct dm_dev *dd)
6930 +{
6931 +       if (atomic_dec_and_test(&dd->count)) {
6932 +               close_dev(dd);
6933 +               list_del(&dd->list);
6934 +               kfree(dd);
6935 +       }
6936 +}
6937 +
6938 +/*
6939 + * Checks to see if the target joins onto the end of the table.
6940 + */
6941 +static int adjoin(struct dm_table *table, struct dm_target *ti)
6942 +{
6943 +       struct dm_target *prev;
6944 +
6945 +       if (!table->num_targets)
6946 +               return !ti->begin;
6947 +
6948 +       prev = &table->targets[table->num_targets - 1];
6949 +       return (ti->begin == (prev->begin + prev->len));
6950 +}
6951 +
6952 +/*
6953 + * Destructively splits up the argument list to pass to ctr.
6954 + */
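+/*
+ * For example, "a\ b c" is split into the two tokens "a b" and "c":
+ * a backslash quotes the character that follows it, so "\ " embeds a
+ * space in a token and "\\" yields a literal backslash.
+ */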
6955 +static int split_args(int max, int *argc, char **argv, char *input)
6956 +{
6957 +       char *start, *end = input, *out;
6958 +       *argc = 0;
6959 +
6960 +       while (1) {
6961 +               start = end;
6962 +
6963 +               /* Skip whitespace */
6964 +               while (*start && isspace(*start))
6965 +                       start++;
6966 +
6967 +               if (!*start)
6968 +                       break;  /* success, we hit the end */
6969 +
6970 +               /* 'out' is used to strip the quoting backslashes */
6971 +               end = out = start;
6972 +               while (*end) {
6973 +                       /* Everything apart from '\0' can be quoted */
6974 +                       if (*end == '\\' && *(end + 1)) {
6975 +                               *out++ = *(end + 1);
6976 +                               end += 2;
6977 +                               continue;
6978 +                       }
6979 +
6980 +                       if (isspace(*end))
6981 +                               break;  /* end of token */
6982 +
6983 +                       *out++ = *end++;
6984 +               }
6985 +
6986 +               /* have we already filled the array ? */
6987 +               if ((*argc + 1) > max)
6988 +                       return -EINVAL;
6989 +
6990 +               /* we know this is whitespace */
6991 +               if (*end)
6992 +                       end++;
6993 +
6994 +               /* terminate the string and put it in the array */
6995 +               *out = '\0';
6996 +               argv[*argc] = start;
6997 +               (*argc)++;
6998 +       }
6999 +
7000 +       return 0;
7001 +}
7002 +
7003 +int dm_table_add_target(struct dm_table *t, const char *type,
7004 +                       sector_t start, sector_t len, char *params)
7005 +{
7006 +       int r = -EINVAL, argc;
7007 +       char *argv[32];
7008 +       struct dm_target *tgt;
7009 +
7010 +       if ((r = check_space(t)))
7011 +               return r;
7012 +
7013 +       tgt = t->targets + t->num_targets;
7014 +       memset(tgt, 0, sizeof(*tgt));
7015 +
7016 +       tgt->type = dm_get_target_type(type);
7017 +       if (!tgt->type) {
7018 +               tgt->error = "unknown target type";
7019 +               return -EINVAL;
7020 +       }
7021 +
7022 +       tgt->table = t;
7023 +       tgt->begin = start;
7024 +       tgt->len = len;
7025 +       tgt->error = "Unknown error";
7026 +
7027 +       /*
7028 +        * Does this target adjoin the previous one ?
7029 +        */
7030 +       if (!adjoin(t, tgt)) {
7031 +               tgt->error = "Gap in table";
7032 +               r = -EINVAL;
7033 +               goto bad;
7034 +       }
7035 +
7036 +       r = split_args(ARRAY_SIZE(argv), &argc, argv, params);
7037 +       if (r) {
7038 +               tgt->error = "couldn't split parameters";
7039 +               goto bad;
7040 +       }
7041 +
7042 +       r = tgt->type->ctr(tgt, argc, argv);
7043 +       if (r)
7044 +               goto bad;
7045 +
7046 +       t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
7047 +       return 0;
7048 +
7049 +      bad:
7050 +       printk(KERN_ERR DM_NAME ": %s\n", tgt->error);
7051 +       dm_put_target_type(tgt->type);
7052 +       return r;
7053 +}
7054 +
7055 +static int setup_indexes(struct dm_table *t)
7056 +{
7057 +       int i;
7058 +       unsigned int total = 0;
7059 +       sector_t *indexes;
7060 +
7061 +       /* allocate the space for *all* the indexes */
7062 +       for (i = t->depth - 2; i >= 0; i--) {
7063 +               t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
7064 +               total += t->counts[i];
7065 +       }
7066 +
7067 +       indexes = (sector_t *) vcalloc(total, (unsigned long) NODE_SIZE);
7068 +       if (!indexes)
7069 +               return -ENOMEM;
7070 +
7071 +       /* set up internal nodes, bottom-up */
7072 +       for (i = t->depth - 2, total = 0; i >= 0; i--) {
7073 +               t->index[i] = indexes;
7074 +               indexes += (KEYS_PER_NODE * t->counts[i]);
7075 +               setup_btree_index(i, t);
7076 +       }
7077 +
7078 +       return 0;
7079 +}
7080 +
7081 +/*
7082 + * Builds the btree to index the map.
7083 + */
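+/*
+ * The leaf level of the index is t->highs itself: one key per target,
+ * holding the last sector that target covers.  Each internal node
+ * holds the highest key of each of its children, so a lookup in
+ * dm_table_find_target() visits one node per level.
+ */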
7084 +int dm_table_complete(struct dm_table *t)
7085 +{
7086 +       int r = 0;
7087 +       unsigned int leaf_nodes;
7088 +
7089 +       /* how many indexes will the btree have ? */
7090 +       leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
7091 +       t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
7092 +
7093 +       /* leaf layer has already been set up */
7094 +       t->counts[t->depth - 1] = leaf_nodes;
7095 +       t->index[t->depth - 1] = t->highs;
7096 +
7097 +       if (t->depth >= 2)
7098 +               r = setup_indexes(t);
7099 +
7100 +       return r;
7101 +}
7102 +
7103 +static spinlock_t _event_lock = SPIN_LOCK_UNLOCKED;
7104 +void dm_table_event_callback(struct dm_table *t,
7105 +                            void (*fn)(void *), void *context)
7106 +{
7107 +       spin_lock_irq(&_event_lock);
7108 +       t->event_fn = fn;
7109 +       t->event_context = context;
7110 +       spin_unlock_irq(&_event_lock);
7111 +}
7112 +
7113 +void dm_table_event(struct dm_table *t)
7114 +{
7115 +       spin_lock(&_event_lock);
7116 +       if (t->event_fn)
7117 +               t->event_fn(t->event_context);
7118 +       spin_unlock(&_event_lock);
7119 +}
7120 +
7121 +sector_t dm_table_get_size(struct dm_table *t)
7122 +{
7123 +       return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
7124 +}
7125 +
7126 +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
7127 +{
7128 +       if (index >= t->num_targets)
7129 +               return NULL;
7130 +
7131 +       return t->targets + index;
7132 +}
7133 +
7134 +/*
7135 + * Search the btree for the correct target.
7136 + */
7137 +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
7138 +{
7139 +       unsigned int l, n = 0, k = 0;
7140 +       sector_t *node;
7141 +
7142 +       for (l = 0; l < t->depth; l++) {
7143 +               n = get_child(n, k);
7144 +               node = get_node(t, l, n);
7145 +
7146 +               for (k = 0; k < KEYS_PER_NODE; k++)
7147 +                       if (node[k] >= sector)
7148 +                               break;
7149 +       }
7150 +
7151 +       return &t->targets[(KEYS_PER_NODE * n) + k];
7152 +}
7153 +
7154 +unsigned int dm_table_get_num_targets(struct dm_table *t)
7155 +{
7156 +       return t->num_targets;
7157 +}
7158 +
7159 +struct list_head *dm_table_get_devices(struct dm_table *t)
7160 +{
7161 +       return &t->devices;
7162 +}
7163 +
7164 +int dm_table_get_mode(struct dm_table *t)
7165 +{
7166 +       return t->mode;
7167 +}
7168 +
7169 +void dm_table_suspend_targets(struct dm_table *t)
7170 +{
7171 +       int i;
7172 +
7173 +       for (i = 0; i < t->num_targets; i++) {
7174 +               struct dm_target *ti = t->targets + i;
7175 +
7176 +               if (ti->type->suspend)
7177 +                       ti->type->suspend(ti);
7178 +       }
7179 +}
7180 +
7181 +void dm_table_resume_targets(struct dm_table *t)
7182 +{
7183 +       int i;
7184 +
7185 +       for (i = 0; i < t->num_targets; i++) {
7186 +               struct dm_target *ti = t->targets + i;
7187 +
7188 +               if (ti->type->resume)
7189 +                       ti->type->resume(ti);
7190 +       }
7191 +}
7192 +
7193 +EXPORT_SYMBOL(dm_get_device);
7194 +EXPORT_SYMBOL(dm_put_device);
7195 +EXPORT_SYMBOL(dm_table_event);
7196 +EXPORT_SYMBOL(dm_table_get_mode);
7197 --- diff/drivers/md/dm-target.c 1970-01-01 01:00:00.000000000 +0100
7198 +++ source/drivers/md/dm-target.c       2003-10-16 10:44:23.000000000 +0100
7199 @@ -0,0 +1,188 @@
7200 +/*
7201 + * Copyright (C) 2001 Sistina Software (UK) Limited
7202 + *
7203 + * This file is released under the GPL.
7204 + */
7205 +
7206 +#include "dm.h"
7207 +
7208 +#include <linux/module.h>
7209 +#include <linux/kmod.h>
7210 +#include <linux/slab.h>
7211 +
7212 +struct tt_internal {
7213 +       struct target_type tt;
7214 +
7215 +       struct list_head list;
7216 +       long use;
7217 +};
7218 +
7219 +static LIST_HEAD(_targets);
7220 +static DECLARE_RWSEM(_lock);
7221 +
7222 +#define DM_MOD_NAME_SIZE 32
7223 +
7224 +static inline struct tt_internal *__find_target_type(const char *name)
7225 +{
7226 +       struct list_head *tih;
7227 +       struct tt_internal *ti;
7228 +
7229 +       list_for_each(tih, &_targets) {
7230 +               ti = list_entry(tih, struct tt_internal, list);
7231 +
7232 +               if (!strcmp(name, ti->tt.name))
7233 +                       return ti;
7234 +       }
7235 +
7236 +       return NULL;
7237 +}
7238 +
7239 +static struct tt_internal *get_target_type(const char *name)
7240 +{
7241 +       struct tt_internal *ti;
7242 +
7243 +       down_read(&_lock);
7244 +       ti = __find_target_type(name);
7245 +
7246 +       if (ti) {
7247 +               if (ti->use == 0 && ti->tt.module)
7248 +                       __MOD_INC_USE_COUNT(ti->tt.module);
7249 +               ti->use++;
7250 +       }
7251 +       up_read(&_lock);
7252 +
7253 +       return ti;
7254 +}
7255 +
7256 +static void load_module(const char *name)
7257 +{
7258 +       char module_name[DM_MOD_NAME_SIZE] = "dm-";
7259 +
7260 +       /* Length check for strcat() below */
7261 +       if (strlen(name) > (DM_MOD_NAME_SIZE - 4))
7262 +               return;
7263 +
7264 +       strcat(module_name, name);
7265 +       request_module(module_name);
7266 +}
7267 +
7268 +struct target_type *dm_get_target_type(const char *name)
7269 +{
7270 +       struct tt_internal *ti = get_target_type(name);
7271 +
7272 +       if (!ti) {
7273 +               load_module(name);
7274 +               ti = get_target_type(name);
7275 +       }
7276 +
7277 +       return ti ? &ti->tt : NULL;
7278 +}
7279 +
7280 +void dm_put_target_type(struct target_type *t)
7281 +{
7282 +       struct tt_internal *ti = (struct tt_internal *) t;
7283 +
7284 +       down_read(&_lock);
7285 +       if (--ti->use == 0 && ti->tt.module)
7286 +               __MOD_DEC_USE_COUNT(ti->tt.module);
7287 +
7288 +       if (ti->use < 0)
7289 +               BUG();
7290 +       up_read(&_lock);
7291 +
7292 +       return;
7293 +}
7294 +
7295 +static struct tt_internal *alloc_target(struct target_type *t)
7296 +{
7297 +       struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
7298 +
7299 +       if (ti) {
7300 +               memset(ti, 0, sizeof(*ti));
7301 +               ti->tt = *t;
7302 +       }
7303 +
7304 +       return ti;
7305 +}
7306 +
7307 +int dm_register_target(struct target_type *t)
7308 +{
7309 +       int rv = 0;
7310 +       struct tt_internal *ti = alloc_target(t);
7311 +
7312 +       if (!ti)
7313 +               return -ENOMEM;
7314 +
7315 +       down_write(&_lock);
7316 +       if (__find_target_type(t->name)) {
7317 +               kfree(ti);
7318 +               rv = -EEXIST;
7319 +       } else
7320 +               list_add(&ti->list, &_targets);
7321 +
7322 +       up_write(&_lock);
7323 +       return rv;
7324 +}
7325 +
7326 +int dm_unregister_target(struct target_type *t)
7327 +{
7328 +       struct tt_internal *ti;
7329 +
7330 +       down_write(&_lock);
7331 +       if (!(ti = __find_target_type(t->name))) {
7332 +               up_write(&_lock);
7333 +               return -EINVAL;
7334 +       }
7335 +
7336 +       if (ti->use) {
7337 +               up_write(&_lock);
7338 +               return -ETXTBSY;
7339 +       }
7340 +
7341 +       list_del(&ti->list);
7342 +       kfree(ti);
7343 +
7344 +       up_write(&_lock);
7345 +       return 0;
7346 +}
7347 +
7348 +/*
7349 + * io-err: always fails an io, useful for bringing
7350 + * up LVs that have holes in them.
7351 + */
7352 +static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args)
7353 +{
7354 +       return 0;
7355 +}
7356 +
7357 +static void io_err_dtr(struct dm_target *ti)
7358 +{
7359 +       /* empty */
7360 +}
7361 +
7362 +static int io_err_map(struct dm_target *ti, struct buffer_head *bh, int rw,
7363 +                     union map_info *map_context)
7364 +{
7365 +       return -EIO;
7366 +}
7367 +
7368 +static struct target_type error_target = {
7369 +       .name = "error",
7370 +       .ctr  = io_err_ctr,
7371 +       .dtr  = io_err_dtr,
7372 +       .map  = io_err_map,
7373 +};
7374 +
7375 +int dm_target_init(void)
7376 +{
7377 +       return dm_register_target(&error_target);
7378 +}
7379 +
7380 +void dm_target_exit(void)
7381 +{
7382 +       if (dm_unregister_target(&error_target))
7383 +               DMWARN("error target unregistration failed");
7384 +}
7385 +
7386 +EXPORT_SYMBOL(dm_register_target);
7387 +EXPORT_SYMBOL(dm_unregister_target);
7388 --- diff/drivers/md/dm.c        1970-01-01 01:00:00.000000000 +0100
7389 +++ source/drivers/md/dm.c      2003-10-16 10:44:23.000000000 +0100
7390 @@ -0,0 +1,1115 @@
7391 +/*
7392 + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
7393 + *
7394 + * This file is released under the GPL.
7395 + */
7396 +
7397 +#include "dm.h"
7398 +#include "kcopyd.h"
7399 +
7400 +#include <linux/init.h>
7401 +#include <linux/module.h>
7402 +#include <linux/blk.h>
7403 +#include <linux/blkpg.h>
7404 +#include <linux/mempool.h>
7405 +#include <linux/slab.h>
7406 +#include <linux/major.h>
7407 +#include <linux/kdev_t.h>
7408 +#include <linux/lvm.h>
7409 +
7410 +#include <asm/uaccess.h>
7411 +
7412 +static const char *_name = DM_NAME;
7413 +#define DEFAULT_READ_AHEAD 64
7414 +
7415 +struct dm_io {
7416 +       struct mapped_device *md;
7417 +
7418 +       struct dm_target *ti;
7419 +       int rw;
7420 +       union map_info map_context;
7421 +       void (*end_io) (struct buffer_head * bh, int uptodate);
7422 +       void *context;
7423 +};
7424 +
7425 +struct deferred_io {
7426 +       int rw;
7427 +       struct buffer_head *bh;
7428 +       struct deferred_io *next;
7429 +};
7430 +
7431 +/*
7432 + * Bits for the md->flags field.
7433 + */
7434 +#define DMF_BLOCK_IO 0
7435 +#define DMF_SUSPENDED 1
7436 +
7437 +struct mapped_device {
7438 +       struct rw_semaphore lock;
7439 +       atomic_t holders;
7440 +
7441 +       kdev_t dev;
7442 +       unsigned long flags;
7443 +
7444 +       /*
7445 +        * A list of ios that arrived while we were suspended.
7446 +        */
7447 +       atomic_t pending;
7448 +       wait_queue_head_t wait;
7449 +       struct deferred_io *deferred;
7450 +
7451 +       /*
7452 +        * The current mapping.
7453 +        */
7454 +       struct dm_table *map;
7455 +
7456 +       /*
7457 +        * io objects are allocated from here.
7458 +        */
7459 +       mempool_t *io_pool;
7460 +
7461 +       /*
7462 +        * Event handling.
7463 +        */
7464 +       uint32_t event_nr;
7465 +       wait_queue_head_t eventq;
7466 +};
7467 +
7468 +#define MIN_IOS 256
7469 +static kmem_cache_t *_io_cache;
7470 +
7471 +static struct mapped_device *get_kdev(kdev_t dev);
7472 +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh);
7473 +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb);
7474 +
7475 +/*-----------------------------------------------------------------
7476 + * In order to avoid the 256 minor number limit we are going to
7477 + * register more major numbers as necessary.
7478 + *---------------------------------------------------------------*/
7479 +#define MAX_MINORS (1 << MINORBITS)
7480 +
7481 +struct major_details {
7482 +       unsigned int major;
7483 +
7484 +       int transient;
7485 +       struct list_head transient_list;
7486 +
7487 +       unsigned int first_free_minor;
7488 +       int nr_free_minors;
7489 +
7490 +       struct mapped_device *mds[MAX_MINORS];
7491 +       int blk_size[MAX_MINORS];
7492 +       int blksize_size[MAX_MINORS];
7493 +       int hardsect_size[MAX_MINORS];
7494 +};
7495 +
7496 +static struct rw_semaphore _dev_lock;
7497 +static struct major_details *_majors[MAX_BLKDEV];
7498 +
7499 +/*
7500 + * This holds a list of majors that non-specified device numbers
7501 + * may be allocated from.  Only majors with free minors appear on
7502 + * this list.
7503 + */
7504 +static LIST_HEAD(_transients_free);
7505 +
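+/*
+ * Note that passing major == 0 below asks register_blkdev() to pick
+ * a free major number; the value it returns becomes maj->major.
+ */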
7506 +static int __alloc_major(unsigned int major, struct major_details **result)
7507 +{
7508 +       int r;
7509 +       unsigned int transient = !major;
7510 +       struct major_details *maj;
7511 +
7512 +       /* Major already allocated? */
7513 +       if (major && _majors[major])
7514 +               return 0;
7515 +
7516 +       maj = kmalloc(sizeof(*maj), GFP_KERNEL);
7517 +       if (!maj)
7518 +               return -ENOMEM;
7519 +
7520 +       memset(maj, 0, sizeof(*maj));
7521 +       INIT_LIST_HEAD(&maj->transient_list);
7522 +
7523 +       maj->nr_free_minors = MAX_MINORS;
7524 +
7525 +       r = register_blkdev(major, _name, &dm_blk_dops);
7526 +       if (r < 0) {
7527 +               DMERR("register_blkdev failed for %d", major);
7528 +               kfree(maj);
7529 +               return r;
7530 +       }
7531 +       if (r > 0)
7532 +               major = r;
7533 +
7534 +       maj->major = major;
7535 +
7536 +       if (transient) {
7537 +               maj->transient = transient;
7538 +               list_add_tail(&maj->transient_list, &_transients_free);
7539 +       }
7540 +
7541 +       _majors[major] = maj;
7542 +
7543 +       blk_size[major] = maj->blk_size;
7544 +       blksize_size[major] = maj->blksize_size;
7545 +       hardsect_size[major] = maj->hardsect_size;
7546 +       read_ahead[major] = DEFAULT_READ_AHEAD;
7547 +
7548 +       blk_queue_make_request(BLK_DEFAULT_QUEUE(major), dm_request);
7549 +
7550 +       *result = maj;
7551 +       return 0;
7552 +}
7553 +
7554 +static void __free_major(struct major_details *maj)
7555 +{
7556 +       unsigned int major = maj->major;
7557 +
7558 +       list_del(&maj->transient_list);
7559 +
7560 +       read_ahead[major] = 0;
7561 +       blk_size[major] = NULL;
7562 +       blksize_size[major] = NULL;
7563 +       hardsect_size[major] = NULL;
7564 +
7565 +       _majors[major] = NULL;
7566 +       kfree(maj);
7567 +
7568 +       if (unregister_blkdev(major, _name) < 0)
7569 +               DMERR("unregister_blkdev failed");
7570 +}
7571 +
7572 +static void free_all_majors(void)
7573 +{
7574 +       unsigned int major = ARRAY_SIZE(_majors);
7575 +
7576 +       down_write(&_dev_lock);
7577 +
7578 +       while (major--)
7579 +               if (_majors[major])
7580 +                       __free_major(_majors[major]);
7581 +
7582 +       up_write(&_dev_lock);
7583 +}
7584 +
7585 +static void free_dev(kdev_t dev)
7586 +{
7587 +       unsigned int major = major(dev);
7588 +       unsigned int minor = minor(dev);
7589 +       struct major_details *maj;
7590 +
7591 +       down_write(&_dev_lock);
7592 +
7593 +       maj = _majors[major];
7594 +       if (!maj)
7595 +               goto out;
7596 +
7597 +       maj->mds[minor] = NULL;
7598 +       maj->nr_free_minors++;
7599 +
7600 +       if (maj->nr_free_minors == MAX_MINORS) {
7601 +               __free_major(maj);
7602 +               goto out;
7603 +       }
7604 +
7605 +       if (!maj->transient)
7606 +               goto out;
7607 +
7608 +       if (maj->nr_free_minors == 1)
7609 +               list_add_tail(&maj->transient_list, &_transients_free);
7610 +
7611 +       if (minor < maj->first_free_minor)
7612 +               maj->first_free_minor = minor;
7613 +
7614 +      out:
7615 +       up_write(&_dev_lock);
7616 +}
7617 +
7618 +static void __alloc_minor(struct major_details *maj, unsigned int minor,
7619 +                         struct mapped_device *md)
7620 +{
7621 +       maj->mds[minor] = md;
7622 +       md->dev = mk_kdev(maj->major, minor);
7623 +       maj->nr_free_minors--;
7624 +
7625 +       if (maj->transient && !maj->nr_free_minors)
7626 +               list_del_init(&maj->transient_list);
7627 +}
7628 +
7629 +/*
7630 + * See if requested kdev_t is available.
7631 + */
7632 +static int specific_dev(kdev_t dev, struct mapped_device *md)
7633 +{
7634 +       int r = 0;
7635 +       unsigned int major = major(dev);
7636 +       unsigned int minor = minor(dev);
7637 +       struct major_details *maj;
7638 +
7639 +       if (!major || (major > MAX_BLKDEV) || (minor >= MAX_MINORS)) {
7640 +               DMWARN("device number requested out of range (%d, %d)",
7641 +                      major, minor);
7642 +               return -EINVAL;
7643 +       }
7644 +
7645 +       down_write(&_dev_lock);
7646 +       maj = _majors[major];
7647 +
7648 +       /* Register requested major? */
7649 +       if (!maj) {
7650 +               r = __alloc_major(major, &maj);
7651 +               if (r)
7652 +                       goto out;
7653 +
7654 +               major = maj->major;
7655 +       }
7656 +
7657 +       if (maj->mds[minor]) {
7658 +               r = -EBUSY;
7659 +               goto out;
7660 +       }
7661 +
7662 +       __alloc_minor(maj, minor, md);
7663 +
7664 +      out:
7665 +       up_write(&_dev_lock);
7666 +
7667 +       return r;
7668 +}
7669 +
7670 +/*
7671 + * Find first unused device number, requesting a new major number if required.
7672 + */
7673 +static int first_free_dev(struct mapped_device *md)
7674 +{
7675 +       int r = 0;
7676 +       struct major_details *maj;
7677 +
7678 +       down_write(&_dev_lock);
7679 +
7680 +       if (list_empty(&_transients_free)) {
7681 +               r = __alloc_major(0, &maj);
7682 +               if (r)
7683 +                       goto out;
7684 +       } else
7685 +               maj = list_entry(_transients_free.next, struct major_details,
7686 +                                transient_list);
7687 +
7688 +       while (maj->mds[maj->first_free_minor++])
7689 +               ;
7690 +
7691 +       __alloc_minor(maj, maj->first_free_minor - 1, md);
7692 +
7693 +      out:
7694 +       up_write(&_dev_lock);
7695 +
7696 +       return r;
7697 +}
7698 +
7699 +static struct mapped_device *get_kdev(kdev_t dev)
7700 +{
7701 +       struct mapped_device *md;
7702 +       struct major_details *maj;
7703 +
7704 +       down_read(&_dev_lock);
7705 +       maj = _majors[major(dev)];
7706 +       if (!maj) {
7707 +               md = NULL;
7708 +               goto out;
7709 +       }
7710 +       md = maj->mds[minor(dev)];
7711 +       if (md)
7712 +               dm_get(md);
7713 +      out:
7714 +       up_read(&_dev_lock);
7715 +
7716 +       return md;
7717 +}
7718 +
7719 +/*-----------------------------------------------------------------
7720 + * init/exit code
7721 + *---------------------------------------------------------------*/
7722 +
7723 +static __init int local_init(void)
7724 +{
7725 +       init_rwsem(&_dev_lock);
7726 +
7727 +       /* allocate a slab for the dm_ios */
7728 +       _io_cache = kmem_cache_create("dm io",
7729 +                                     sizeof(struct dm_io), 0, 0, NULL, NULL);
7730 +
7731 +       if (!_io_cache)
7732 +               return -ENOMEM;
7733 +
7734 +       return 0;
7735 +}
7736 +
7737 +static void local_exit(void)
7738 +{
7739 +       kmem_cache_destroy(_io_cache);
7740 +       free_all_majors();
7741 +
7742 +       DMINFO("cleaned up");
7743 +}
7744 +
7745 +/*
7746 + * We have a lot of init/exit functions, so it seems easier to
7747 + * store them in an array.  The disposable macro 'xx'
7748 + * expands a prefix into a pair of function names.
7749 + */
7750 +static struct {
7751 +       int (*init) (void);
7752 +       void (*exit) (void);
7753 +
7754 +} _inits[] = {
7755 +#define xx(n) {n ## _init, n ## _exit},
7756 +       xx(local)
7757 +       xx(kcopyd)
7758 +       xx(dm_target)
7759 +       xx(dm_linear)
7760 +       xx(dm_stripe)
7761 +       xx(dm_snapshot)
7762 +       xx(dm_interface)
7763 +#undef xx
7764 +};
7765 +
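+/*
+ * dm_init() walks the _inits array forwards and unwinds it backwards
+ * on failure, so each entry may rely on the ones before it having
+ * been initialised.
+ */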
7766 +static int __init dm_init(void)
7767 +{
7768 +       const int count = ARRAY_SIZE(_inits);
7769 +
7770 +       int r, i;
7771 +
7772 +       for (i = 0; i < count; i++) {
7773 +               r = _inits[i].init();
7774 +               if (r)
7775 +                       goto bad;
7776 +       }
7777 +
7778 +       return 0;
7779 +
7780 +      bad:
7781 +       while (i--)
7782 +               _inits[i].exit();
7783 +
7784 +       return r;
7785 +}
7786 +
7787 +static void __exit dm_exit(void)
7788 +{
7789 +       int i = ARRAY_SIZE(_inits);
7790 +
7791 +       while (i--)
7792 +               _inits[i].exit();
7793 +}
7794 +
7795 +/*
7796 + * Block device functions
7797 + */
7798 +static int dm_blk_open(struct inode *inode, struct file *file)
7799 +{
7800 +       struct mapped_device *md;
7801 +
7802 +       md = get_kdev(inode->i_rdev);
7803 +       if (!md)
7804 +               return -ENXIO;
7805 +
7806 +       return 0;
7807 +}
7808 +
7809 +static int dm_blk_close(struct inode *inode, struct file *file)
7810 +{
7811 +       struct mapped_device *md;
7812 +
7813 +       md = get_kdev(inode->i_rdev);
7814 +       dm_put(md);             /* put the reference gained by dm_blk_open */
7815 +       dm_put(md);
7816 +       return 0;
7817 +}
7818 +
7819 +static inline struct dm_io *alloc_io(struct mapped_device *md)
7820 +{
7821 +       return mempool_alloc(md->io_pool, GFP_NOIO);
7822 +}
7823 +
7824 +static inline void free_io(struct mapped_device *md, struct dm_io *io)
7825 +{
7826 +       mempool_free(io, md->io_pool);
7827 +}
7828 +
7829 +static inline struct deferred_io *alloc_deferred(void)
7830 +{
7831 +       return kmalloc(sizeof(struct deferred_io), GFP_NOIO);
7832 +}
7833 +
7834 +static inline void free_deferred(struct deferred_io *di)
7835 +{
7836 +       kfree(di);
7837 +}
7838 +
7839 +static inline sector_t volume_size(kdev_t dev)
7840 +{
7841 +       return blk_size[major(dev)][minor(dev)] << 1;
7842 +}
7843 +
7844 +/* FIXME: check this */
7845 +static int dm_blk_ioctl(struct inode *inode, struct file *file,
7846 +                       unsigned int command, unsigned long a)
7847 +{
7848 +       kdev_t dev = inode->i_rdev;
7849 +       long size;
7850 +
7851 +       switch (command) {
7852 +       case BLKROSET:
7853 +       case BLKROGET:
7854 +       case BLKRASET:
7855 +       case BLKRAGET:
7856 +       case BLKFLSBUF:
7857 +       case BLKSSZGET:
7858 +               //case BLKRRPART: /* Re-read partition tables */
7859 +               //case BLKPG:
7860 +       case BLKELVGET:
7861 +       case BLKELVSET:
7862 +       case BLKBSZGET:
7863 +       case BLKBSZSET:
7864 +               return blk_ioctl(dev, command, a);
7865 +               break;
7866 +
7867 +       case BLKGETSIZE:
7868 +               size = volume_size(dev);
7869 +               if (copy_to_user((void *) a, &size, sizeof(long)))
7870 +                       return -EFAULT;
7871 +               break;
7872 +
7873 +       case BLKGETSIZE64:
7874 +               size = volume_size(dev);
7875 +               if (put_user((u64) ((u64) size) << 9, (u64 *) a))
7876 +                       return -EFAULT;
7877 +               break;
7878 +
7879 +       case BLKRRPART:
7880 +               return -ENOTTY;
7881 +
7882 +       case LV_BMAP:
7883 +               return dm_user_bmap(inode, (struct lv_bmap *) a);
7884 +
7885 +       default:
7886 +               DMWARN("unknown block ioctl 0x%x", command);
7887 +               return -ENOTTY;
7888 +       }
7889 +
7890 +       return 0;
7891 +}
7892 +
7893 +/*
7894 + * Add the buffer to the list of deferred io.
7895 + */
7896 +static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw)
7897 +{
7898 +       struct deferred_io *di;
7899 +
7900 +       di = alloc_deferred();
7901 +       if (!di)
7902 +               return -ENOMEM;
7903 +
7904 +       down_write(&md->lock);
7905 +
7906 +       if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
7907 +               up_write(&md->lock);
7908 +               free_deferred(di);
7909 +               return 1;
7910 +       }
7911 +
7912 +       di->bh = bh;
7913 +       di->rw = rw;
7914 +       di->next = md->deferred;
7915 +       md->deferred = di;
7916 +
7917 +       up_write(&md->lock);
7918 +       return 0;               /* deferred successfully */
7919 +}
7920 +
7921 +/*
7922 + * bh->b_end_io routine that decrements the pending count
7923 + * and then calls the original bh->b_end_io fn.
7924 + */
7925 +static void dec_pending(struct buffer_head *bh, int uptodate)
7926 +{
7927 +       int r;
7928 +       struct dm_io *io = bh->b_private;
7929 +       dm_endio_fn endio = io->ti->type->end_io;
7930 +
7931 +       if (endio) {
7932 +               r = endio(io->ti, bh, io->rw, uptodate ? 0 : -EIO,
7933 +                         &io->map_context);
7934 +               if (r < 0)
7935 +                       uptodate = 0;
7936 +
7937 +               else if (r > 0)
7938 +                       /* the target wants another shot at the io */
7939 +                       return;
7940 +       }
7941 +
7942 +       if (atomic_dec_and_test(&io->md->pending))
7943 +               /* nudge anyone waiting on suspend queue */
7944 +               wake_up(&io->md->wait);
7945 +
7946 +       bh->b_end_io = io->end_io;
7947 +       bh->b_private = io->context;
7948 +       free_io(io->md, io);
7949 +
7950 +       bh->b_end_io(bh, uptodate);
7951 +}
7952 +
7953 +/*
7954 + * Do the bh mapping for a given leaf
7955 + */
7956 +static inline int __map_buffer(struct mapped_device *md, int rw,
7957 +                              struct buffer_head *bh, struct dm_io *io)
7958 +{
7959 +       struct dm_target *ti;
7960 +
7961 +       if (!md->map)
7962 +               return -EINVAL;
7963 +
7964 +       ti = dm_table_find_target(md->map, bh->b_rsector);
7965 +       if (!ti->type)
7966 +               return -EINVAL;
7967 +
7968 +       /* hook the end io request fn */
7969 +       atomic_inc(&md->pending);
7970 +       io->md = md;
7971 +       io->ti = ti;
7972 +       io->rw = rw;
7973 +       io->end_io = bh->b_end_io;
7974 +       io->context = bh->b_private;
7975 +       bh->b_end_io = dec_pending;
7976 +       bh->b_private = io;
7977 +
7978 +       return ti->type->map(ti, bh, rw, &io->map_context);
7979 +}
7980 +
7981 +/*
7982 + * Checks to see if we should be deferring io, if so it queues it
7983 + * and returns 1.
7984 + */
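+/*
+ * Called with md->lock held for read; the lock is dropped and
+ * re-acquired around queue_io(), which is why the BLOCK_IO flag is
+ * re-tested on each pass round the loop.
+ */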
7985 +static inline int __deferring(struct mapped_device *md, int rw,
7986 +                             struct buffer_head *bh)
7987 +{
7988 +       int r;
7989 +
7990 +       /*
7991 +        * If we're suspended we have to queue this io for later.
7992 +        */
7993 +       while (test_bit(DMF_BLOCK_IO, &md->flags)) {
7994 +               up_read(&md->lock);
7995 +
7996 +               /*
7997 +                * There's no point deferring a read ahead
7998 +                * request, just drop it.
7999 +                */
8000 +               if (rw == READA) {
8001 +                       down_read(&md->lock);
8002 +                       return -EIO;
8003 +               }
8004 +
8005 +               r = queue_io(md, bh, rw);
8006 +               down_read(&md->lock);
8007 +
8008 +               if (r < 0)
8009 +                       return r;
8010 +
8011 +               if (r == 0)
8012 +                       return 1;       /* deferred successfully */
8013 +
8014 +       }
8015 +
8016 +       return 0;
8017 +}
8018 +
8019 +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh)
8020 +{
8021 +       int r;
8022 +       struct dm_io *io;
8023 +       struct mapped_device *md;
8024 +
8025 +       md = get_kdev(bh->b_rdev);
8026 +       if (!md) {
8027 +               buffer_IO_error(bh);
8028 +               return 0;
8029 +       }
8030 +
8031 +       io = alloc_io(md);
8032 +       down_read(&md->lock);
8033 +
8034 +       r = __deferring(md, rw, bh);
8035 +       if (r < 0) {
8036 +               free_io(md, io);
8037 +               goto bad;
8038 +       } else if (!r) {
8039 +               /* not deferring */
8040 +               r = __map_buffer(md, rw, bh, io);
8041 +               if (r < 0)
8042 +                       goto bad;
8043 +       } else {
8044 +               /* deferred, the io wrapper is not needed */
8045 +               free_io(md, io);
8046 +               r = 0;
8047 +       }
8045 +
8046 +       up_read(&md->lock);
8047 +       dm_put(md);
8048 +       return r;
8049 +
8050 +      bad:
8051 +       buffer_IO_error(bh);
8052 +       up_read(&md->lock);
8053 +       dm_put(md);
8054 +       return 0;
8055 +}
8056 +
8057 +static int check_dev_size(kdev_t dev, unsigned long block)
8058 +{
8059 +       unsigned int major = major(dev);
8060 +       unsigned int minor = minor(dev);
8061 +
8062 +       /* FIXME: check this */
8063 +       unsigned long max_sector = (blk_size[major][minor] << 1) + 1;
8064 +       unsigned long sector = (block + 1) * (blksize_size[major][minor] >> 9);
8065 +
8066 +       return (sector > max_sector) ? 0 : 1;
8067 +}
8068 +
8069 +/*
8070 + * Creates a dummy buffer head and maps it (for lilo).
8071 + */
8072 +static int __bmap(struct mapped_device *md, kdev_t dev, unsigned long block,
8073 +                 kdev_t *r_dev, unsigned long *r_block)
8074 +{
8075 +       struct buffer_head bh;
8076 +       struct dm_target *ti;
8077 +       union map_info map_context;
8078 +       int r;
8079 +
8080 +       if (test_bit(DMF_BLOCK_IO, &md->flags)) {
8081 +               return -EPERM;
8082 +       }
8083 +
8084 +       if (!check_dev_size(dev, block)) {
8085 +               return -EINVAL;
8086 +       }
8087 +
8088 +       if (!md->map)
8089 +               return -EINVAL;
8090 +
8091 +       /* setup dummy bh */
8092 +       memset(&bh, 0, sizeof(bh));
8093 +       bh.b_blocknr = block;
8094 +       bh.b_dev = bh.b_rdev = dev;
8095 +       bh.b_size = blksize_size[major(dev)][minor(dev)];
8096 +       bh.b_rsector = block * (bh.b_size >> 9);
8097 +
8098 +       /* find target */
8099 +       ti = dm_table_find_target(md->map, bh.b_rsector);
8100 +
8101 +       /* do the mapping */
8102 +       r = ti->type->map(ti, &bh, READ, &map_context);
8103 +       ti->type->end_io(ti, &bh, READ, 0, &map_context);
8104 +
8105 +       if (!r) {
8106 +               *r_dev = bh.b_rdev;
8107 +               *r_block = bh.b_rsector / (bh.b_size >> 9);
8108 +       }
8109 +
8110 +       return r;
8111 +}
8112 +
8113 +/*
8114 + * Marshals arguments and results between user and kernel space.
8115 + */
8116 +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb)
8117 +{
8118 +       struct mapped_device *md;
8119 +       unsigned long block, r_block;
8120 +       kdev_t r_dev;
8121 +       int r;
8122 +
8123 +       if (get_user(block, &lvb->lv_block))
8124 +               return -EFAULT;
8125 +
8126 +       md = get_kdev(inode->i_rdev);
8127 +       if (!md)
8128 +               return -ENXIO;
8129 +
8130 +       down_read(&md->lock);
8131 +       r = __bmap(md, inode->i_rdev, block, &r_dev, &r_block);
8132 +       up_read(&md->lock);
8133 +       dm_put(md);
8134 +
8135 +       if (!r && (put_user(kdev_t_to_nr(r_dev), &lvb->lv_dev) ||
8136 +                  put_user(r_block, &lvb->lv_block)))
8137 +               r = -EFAULT;
8138 +
8139 +       return r;
8140 +}
8141 +
8142 +static void free_md(struct mapped_device *md)
8143 +{
8144 +       free_dev(md->dev);
8145 +       mempool_destroy(md->io_pool);
8146 +       kfree(md);
8147 +}
8148 +
8149 +/*
8150 + * Allocate and initialise a blank device with a given dev number (0 => pick one).
8151 + */
8152 +static struct mapped_device *alloc_md(kdev_t dev)
8153 +{
8154 +       int r;
8155 +       struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
8156 +
8157 +       if (!md) {
8158 +               DMWARN("unable to allocate device, out of memory.");
8159 +               return NULL;
8160 +       }
8161 +
8162 +       memset(md, 0, sizeof(*md));
8163 +
8164 +       /* Allocate suitable device number */
8165 +       if (!dev)
8166 +               r = first_free_dev(md);
8167 +       else
8168 +               r = specific_dev(dev, md);
8169 +
8170 +       if (r) {
8171 +               kfree(md);
8172 +               return NULL;
8173 +       }
8174 +
8175 +       md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
8176 +                                    mempool_free_slab, _io_cache);
8177 +       if (!md->io_pool) {
8178 +               free_dev(md->dev);
8179 +               kfree(md);
8180 +               return NULL;
8181 +       }
8182 +
8183 +       init_rwsem(&md->lock);
8184 +       atomic_set(&md->holders, 1);
8185 +       atomic_set(&md->pending, 0);
8186 +       init_waitqueue_head(&md->wait);
8187 +       init_waitqueue_head(&md->eventq);
8188 +
8189 +       return md;
8190 +}
8191 +
8192 +/*
8193 + * The hardsect size for a mapped device is the largest hardsect size
8194 + * from the devices it maps onto.
8195 + */
8196 +static int __find_hardsect_size(struct list_head *devices)
8197 +{
8198 +       int result = 512, size;
8199 +       struct list_head *tmp;
8200 +
8201 +       list_for_each (tmp, devices) {
8202 +               struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
8203 +               size = get_hardsect_size(dd->dev);
8204 +               if (size > result)
8205 +                       result = size;
8206 +       }
8207 +
8208 +       return result;
8209 +}
8210 +
8211 +/*
8212 + * Bind a table to the device.
8213 + */
8214 +static void event_callback(void *context)
8215 +{
8216 +       struct mapped_device *md = (struct mapped_device *) context;
8217 +
8218 +       down_write(&md->lock);
8219 +       md->event_nr++;
8220 +       wake_up_interruptible(&md->eventq);
8221 +       up_write(&md->lock);
8222 +}
8223 +
8224 +static int __bind(struct mapped_device *md, struct dm_table *t)
8225 +{
8226 +       unsigned int minor = minor(md->dev);
8227 +       unsigned int major = major(md->dev);
8228 +       md->map = t;
8229 +
8230 +       /* in k */
8231 +       blk_size[major][minor] = dm_table_get_size(t) >> 1;
8232 +       blksize_size[major][minor] = BLOCK_SIZE;
8233 +       hardsect_size[major][minor] =
8234 +           __find_hardsect_size(dm_table_get_devices(t));
8235 +       register_disk(NULL, md->dev, 1, &dm_blk_dops, blk_size[major][minor]);
8236 +
8237 +       dm_table_event_callback(md->map, event_callback, md);
8238 +       dm_table_get(t);
8239 +       return 0;
8240 +}
8241 +
8242 +static void __unbind(struct mapped_device *md)
8243 +{
8244 +       unsigned int minor = minor(md->dev);
8245 +       unsigned int major = major(md->dev);
8246 +
8247 +       if (md->map) {
8248 +               dm_table_event_callback(md->map, NULL, NULL);
8249 +               dm_table_put(md->map);
8250 +               md->map = NULL;
8251 +
8252 +       }
8253 +
8254 +       blk_size[major][minor] = 0;
8255 +       blksize_size[major][minor] = 0;
8256 +       hardsect_size[major][minor] = 0;
8257 +}
8258 +
8259 +/*
8260 + * Constructor for a new device.
8261 + */
8262 +int dm_create(kdev_t dev, struct mapped_device **result)
8263 +{
8264 +       struct mapped_device *md;
8265 +
8266 +       md = alloc_md(dev);
8267 +       if (!md)
8268 +               return -ENXIO;
8269 +
8270 +       __unbind(md);   /* Ensure zero device size */
8271 +
8272 +       *result = md;
8273 +       return 0;
8274 +}
8275 +
8276 +void dm_get(struct mapped_device *md)
8277 +{
8278 +       atomic_inc(&md->holders);
8279 +}
8280 +
8281 +void dm_put(struct mapped_device *md)
8282 +{
8283 +       if (atomic_dec_and_test(&md->holders)) {
8284 +               if (md->map)
8285 +                       dm_table_suspend_targets(md->map);
8286 +               __unbind(md);
8287 +               free_md(md);
8288 +       }
8289 +}
8290 +
8291 +/*
8292 + * Requeue the deferred io by calling generic_make_request.
8293 + */
8294 +static void flush_deferred_io(struct deferred_io *c)
8295 +{
8296 +       struct deferred_io *n;
8297 +
8298 +       while (c) {
8299 +               n = c->next;
8300 +               generic_make_request(c->rw, c->bh);
8301 +               free_deferred(c);
8302 +               c = n;
8303 +       }
8304 +}
8305 +
8306 +/*
8307 + * Swap in a new table (destroying old one).
8308 + */
8309 +int dm_swap_table(struct mapped_device *md, struct dm_table *table)
8310 +{
8311 +       int r;
8312 +
8313 +       down_write(&md->lock);
8314 +
8315 +       /*
8316 +        * The device must be suspended, or have no table bound yet.
8317 +        */
8318 +       if (md->map && !test_bit(DMF_SUSPENDED, &md->flags)) {
8319 +               up_write(&md->lock);
8320 +               return -EPERM;
8321 +       }
8322 +
8323 +       __unbind(md);
8324 +       r = __bind(md, table);
8325 +
8326 +       up_write(&md->lock);
8327 +       return r;
8330 +}
8331 +
8332 +/*
8333 + * We need to be able to change a mapping table under a mounted
8334 + * filesystem.  For example we might want to move some data in
8335 + * the background.  Before the table can be swapped with
8336 + * dm_swap_table, dm_suspend must be called to flush any in
8337 + * flight io and ensure that any further io gets deferred.
8338 + */
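+/*
+ * The usual sequence for replacing a live mapping is therefore:
+ *
+ *     dm_suspend(md);
+ *     dm_swap_table(md, new_table);
+ *     dm_resume(md);
+ */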
8339 +int dm_suspend(struct mapped_device *md)
8340 +{
8341 +       int r = 0;
8342 +       DECLARE_WAITQUEUE(wait, current);
8343 +
8344 +       down_write(&md->lock);
8345 +
8346 +       /*
8347 +        * First we set the BLOCK_IO flag so no more ios will be
8348 +        * mapped.
8349 +        */
8350 +       if (test_bit(DMF_BLOCK_IO, &md->flags)) {
8351 +               up_write(&md->lock);
8352 +               return -EINVAL;
8353 +       }
8354 +
8355 +       set_bit(DMF_BLOCK_IO, &md->flags);
8356 +       add_wait_queue(&md->wait, &wait);
8357 +       up_write(&md->lock);
8358 +
8359 +       /*
8360 +        * Then we wait for the already mapped ios to
8361 +        * complete.
8362 +        */
8363 +       run_task_queue(&tq_disk);
8364 +       while (1) {
8365 +               set_current_state(TASK_INTERRUPTIBLE);
8366 +
8367 +               if (!atomic_read(&md->pending) || signal_pending(current))
8368 +                       break;
8369 +
8370 +               schedule();
8371 +       }
8372 +       set_current_state(TASK_RUNNING);
8373 +
8374 +       down_write(&md->lock);
8375 +       remove_wait_queue(&md->wait, &wait);
8376 +
8377 +       /* did we flush everything ? */
8378 +       if (atomic_read(&md->pending)) {
8379 +               clear_bit(DMF_BLOCK_IO, &md->flags);
8380 +               r = -EINTR;
8381 +       } else {
8382 +               set_bit(DMF_SUSPENDED, &md->flags);
8383 +               if (md->map)
8384 +                       dm_table_suspend_targets(md->map);
8385 +       }
8386 +       up_write(&md->lock);
8387 +
8388 +       return r;
8389 +}
8390 +
8391 +int dm_resume(struct mapped_device *md)
8392 +{
8393 +       struct deferred_io *def;
8394 +
8395 +       down_write(&md->lock);
8396 +       if (!test_bit(DMF_SUSPENDED, &md->flags)) {
8397 +               up_write(&md->lock);
8398 +               return -EINVAL;
8399 +       }
8400 +
8401 +       if (md->map)
8402 +               dm_table_resume_targets(md->map);
8403 +
8404 +       clear_bit(DMF_SUSPENDED, &md->flags);
8405 +       clear_bit(DMF_BLOCK_IO, &md->flags);
8406 +       def = md->deferred;
8407 +       md->deferred = NULL;
8408 +       up_write(&md->lock);
8409 +
8410 +       flush_deferred_io(def);
8411 +       run_task_queue(&tq_disk);
8412 +
8413 +       return 0;
8414 +}
8415 +
8416 +struct dm_table *dm_get_table(struct mapped_device *md)
8417 +{
8418 +       struct dm_table *t;
8419 +
8420 +       down_read(&md->lock);
8421 +       t = md->map;
8422 +       if (t)
8423 +               dm_table_get(t);
8424 +       up_read(&md->lock);
8425 +
8426 +       return t;
8427 +}
8428 +
8429 +/*-----------------------------------------------------------------
8430 + * Event notification.
8431 + *---------------------------------------------------------------*/
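+/*
+ * A caller typically samples dm_get_event_nr(), later passes that
+ * number to dm_add_wait_queue(), and only sleeps if the call returns
+ * 0, i.e. no event has occurred in the meantime.
+ */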
8432 +uint32_t dm_get_event_nr(struct mapped_device *md)
8433 +{
8434 +       uint32_t r;
8435 +
8436 +       down_read(&md->lock);
8437 +       r = md->event_nr;
8438 +       up_read(&md->lock);
8439 +
8440 +       return r;
8441 +}
8442 +
8443 +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
8444 +                     uint32_t event_nr)
8445 +{
8446 +       down_write(&md->lock);
8447 +       if (event_nr != md->event_nr) {
8448 +               up_write(&md->lock);
8449 +               return 1;
8450 +       }
8451 +
8452 +       add_wait_queue(&md->eventq, wq);
8453 +       up_write(&md->lock);
8454 +
8455 +       return 0;
8456 +}
8457 +
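+/*
+ * Note: this returns a single static buffer, so the result is only
+ * valid until the next call.
+ */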
8458 +const char *dm_kdevname(kdev_t dev)
8459 +{
8460 +       static char buffer[32];
8461 +       sprintf(buffer, "%03d:%03d", MAJOR(dev), MINOR(dev));
8462 +       return buffer;
8463 +}
8464 +
8465 +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq)
8466 +{
8467 +       down_write(&md->lock);
8468 +       remove_wait_queue(&md->eventq, wq);
8469 +       up_write(&md->lock);
8470 +}
8471 +
8472 +kdev_t dm_kdev(struct mapped_device *md)
8473 +{
8474 +       kdev_t dev;
8475 +
8476 +       down_read(&md->lock);
8477 +       dev = md->dev;
8478 +       up_read(&md->lock);
8479 +
8480 +       return dev;
8481 +}
8482 +
8483 +int dm_suspended(struct mapped_device *md)
8484 +{
8485 +       return test_bit(DMF_SUSPENDED, &md->flags);
8486 +}
8487 +
8488 +struct block_device_operations dm_blk_dops = {
8489 +       .open = dm_blk_open,
8490 +       .release = dm_blk_close,
8491 +       .ioctl = dm_blk_ioctl,
8492 +       .owner = THIS_MODULE
8493 +};
8494 +
8495 +/*
8496 + * module hooks
8497 + */
8498 +module_init(dm_init);
8499 +module_exit(dm_exit);
8500 +
8501 +MODULE_DESCRIPTION(DM_NAME " driver");
8502 +MODULE_AUTHOR("Joe Thornber <thornber@sistina.com>");
8503 +MODULE_LICENSE("GPL");
8504 +
8505 +EXPORT_SYMBOL(dm_kdevname);
8506 --- diff/drivers/md/dm.h        1970-01-01 01:00:00.000000000 +0100
8507 +++ source/drivers/md/dm.h      2003-10-16 10:44:23.000000000 +0100
8508 @@ -0,0 +1,175 @@
8509 +/*
8510 + * Internal header file for device mapper
8511 + *
8512 + * Copyright (C) 2001, 2002 Sistina Software
8513 + *
8514 + * This file is released under the LGPL.
8515 + */
8516 +
8517 +#ifndef DM_INTERNAL_H
8518 +#define DM_INTERNAL_H
8519 +
8520 +#include <linux/fs.h>
8521 +#include <linux/device-mapper.h>
8522 +#include <linux/list.h>
8523 +#include <linux/blkdev.h>
8524 +
8525 +#define DM_NAME "device-mapper"
8526 +#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
8527 +#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
8528 +#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
8529 +
8530 +/*
8531 + * FIXME: I think this should be with the definition of sector_t
8532 + * in types.h.
8533 + */
8534 +#ifdef CONFIG_LBD
8535 +#define SECTOR_FORMAT "%Lu"
8536 +#else
8537 +#define SECTOR_FORMAT "%lu"
8538 +#endif
8539 +
8540 +#define SECTOR_SHIFT 9
8541 +#define SECTOR_SIZE (1 << SECTOR_SHIFT)
8542 +
8543 +extern struct block_device_operations dm_blk_dops;
8544 +
8545 +/*
8546 + * List of devices that a metadevice uses and should open/close.
8547 + */
8548 +struct dm_dev {
8549 +       struct list_head list;
8550 +
8551 +       atomic_t count;
8552 +       int mode;
8553 +       kdev_t dev;
8554 +       struct block_device *bdev;
8555 +};
8556 +
8557 +struct dm_table;
8558 +struct mapped_device;
8559 +
8560 +/*-----------------------------------------------------------------
8561 + * Functions for manipulating a struct mapped_device.
8562 + * Drop the reference with dm_put when you finish with the object.
8563 + *---------------------------------------------------------------*/
8564 +int dm_create(kdev_t dev, struct mapped_device **md);
8565 +
8566 +/*
8567 + * Reference counting for md.
8568 + */
8569 +void dm_get(struct mapped_device *md);
8570 +void dm_put(struct mapped_device *md);
8571 +
8572 +/*
8573 + * A device can still be used while suspended, but I/O is deferred.
8574 + */
8575 +int dm_suspend(struct mapped_device *md);
8576 +int dm_resume(struct mapped_device *md);
8577 +
8578 +/*
8579 + * The device must be suspended before calling this method.
8580 + */
8581 +int dm_swap_table(struct mapped_device *md, struct dm_table *t);
8582 +
8583 +/*
8584 + * Drop a reference on the table when you've finished with the
8585 + * result.
8586 + */
8587 +struct dm_table *dm_get_table(struct mapped_device *md);
8588 +
8589 +/*
8590 + * Event functions.
8591 + */
8592 +uint32_t dm_get_event_nr(struct mapped_device *md);
8593 +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
8594 +                     uint32_t event_nr);
8595 +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq);
8596 +
8597 +/*
8598 + * Info functions.
8599 + */
8600 +kdev_t dm_kdev(struct mapped_device *md);
8601 +int dm_suspended(struct mapped_device *md);
8602 +
8603 +/*-----------------------------------------------------------------
8604 + * Functions for manipulating a table.  Tables are also reference
8605 + * counted.
8606 + *---------------------------------------------------------------*/
8607 +int dm_table_create(struct dm_table **result, int mode);
8608 +
8609 +void dm_table_get(struct dm_table *t);
8610 +void dm_table_put(struct dm_table *t);
8611 +
8612 +int dm_table_add_target(struct dm_table *t, const char *type,
8613 +                       sector_t start, sector_t len, char *params);
8614 +int dm_table_complete(struct dm_table *t);
8615 +void dm_table_event_callback(struct dm_table *t,
8616 +                            void (*fn)(void *), void *context);
8617 +void dm_table_event(struct dm_table *t);
8618 +sector_t dm_table_get_size(struct dm_table *t);
8619 +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
8620 +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
8621 +unsigned int dm_table_get_num_targets(struct dm_table *t);
8622 +struct list_head *dm_table_get_devices(struct dm_table *t);
8623 +int dm_table_get_mode(struct dm_table *t);
8624 +void dm_table_suspend_targets(struct dm_table *t);
8625 +void dm_table_resume_targets(struct dm_table *t);
8626 +
8627 +/*-----------------------------------------------------------------
8628 + * A registry of target types.
8629 + *---------------------------------------------------------------*/
8630 +int dm_target_init(void);
8631 +void dm_target_exit(void);
8632 +struct target_type *dm_get_target_type(const char *name);
8633 +void dm_put_target_type(struct target_type *t);
8634 +
8635 +
8636 +/*-----------------------------------------------------------------
8637 + * Useful inlines.
8638 + *---------------------------------------------------------------*/
8639 +static inline int array_too_big(unsigned long fixed, unsigned long obj,
8640 +                               unsigned long num)
8641 +{
8642 +       return (num > (ULONG_MAX - fixed) / obj);
8643 +}
8644 +
8645 +/*
8646 + * ceiling(n / size) * size
8647 + */
8648 +static inline unsigned long dm_round_up(unsigned long n, unsigned long size)
8649 +{
8650 +       unsigned long r = n % size;
8651 +       return n + (r ? (size - r) : 0);
8652 +}
8653 +
8654 +/*
8655 + * Ceiling(n / size)
8656 + */
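+/*
+ * For example, dm_div_up(37, 16) == 3 and dm_round_up(37, 16) == 48.
+ */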
8657 +static inline unsigned long dm_div_up(unsigned long n, unsigned long size)
8658 +{
8659 +       return dm_round_up(n, size) / size;
8660 +}
8661 +
8662 +const char *dm_kdevname(kdev_t dev);
8663 +
8664 +/*
8665 + * The device-mapper can be driven through one of two interfaces:
8666 + * ioctl or filesystem, depending on which patch you have applied.
8667 + */
8668 +int dm_interface_init(void);
8669 +void dm_interface_exit(void);
8670 +
8671 +/*
8672 + * Targets for linear and striped mappings
8673 + */
8674 +int dm_linear_init(void);
8675 +void dm_linear_exit(void);
8676 +
8677 +int dm_stripe_init(void);
8678 +void dm_stripe_exit(void);
8679 +
8680 +int dm_snapshot_init(void);
8681 +void dm_snapshot_exit(void);
8682 +
8683 +#endif
8684 --- diff/drivers/md/kcopyd.c    1970-01-01 01:00:00.000000000 +0100
8685 +++ source/drivers/md/kcopyd.c  2003-10-16 10:44:23.000000000 +0100
8686 @@ -0,0 +1,650 @@
8687 +/*
8688 + * Copyright (C) 2002 Sistina Software (UK) Limited.
8689 + *
8690 + * This file is released under the GPL.
8691 + */
8692 +
8693 +#include <asm/atomic.h>
8694 +
8695 +#include <linux/blkdev.h>
8696 +#include <linux/config.h>
8697 +#include <linux/device-mapper.h>
8698 +#include <linux/fs.h>
8699 +#include <linux/init.h>
8700 +#include <linux/list.h>
8701 +#include <linux/locks.h>
8702 +#include <linux/mempool.h>
8703 +#include <linux/module.h>
8704 +#include <linux/pagemap.h>
8705 +#include <linux/slab.h>
8706 +#include <linux/vmalloc.h>
8707 +
8708 +#include "kcopyd.h"
8709 +#include "dm-daemon.h"
8710 +
8711 +/* FIXME: this is only needed for the DMERR macros */
8712 +#include "dm.h"
8713 +
8714 +static struct dm_daemon _kcopyd;
8715 +
8716 +/*-----------------------------------------------------------------
8717 + * Each kcopyd client has its own little pool of preallocated
8718 + * pages for kcopyd io.
8719 + *---------------------------------------------------------------*/
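+/*
+ * Page reservation is all-or-nothing: kcopyd_get_pages() either hands
+ * back exactly 'nr' pages or fails with -ENOMEM without taking any.
+ */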
8720 +struct kcopyd_client {
8721 +       struct list_head list;
8722 +
8723 +       spinlock_t lock;
8724 +       struct list_head pages;
8725 +       unsigned int nr_pages;
8726 +       unsigned int nr_free_pages;
8727 +};
8728 +
8729 +static inline void __push_page(struct kcopyd_client *kc, struct page *p)
8730 +{
8731 +       list_add(&p->list, &kc->pages);
8732 +       kc->nr_free_pages++;
8733 +}
8734 +
8735 +static inline struct page *__pop_page(struct kcopyd_client *kc)
8736 +{
8737 +       struct page *p;
8738 +
8739 +       p = list_entry(kc->pages.next, struct page, list);
8740 +       list_del(&p->list);
8741 +       kc->nr_free_pages--;
8742 +
8743 +       return p;
8744 +}
8745 +
8746 +static int kcopyd_get_pages(struct kcopyd_client *kc,
8747 +                           unsigned int nr, struct list_head *pages)
8748 +{
8749 +       struct page *p;
8750 +       INIT_LIST_HEAD(pages);
8751 +
8752 +       spin_lock(&kc->lock);
8753 +       if (kc->nr_free_pages < nr) {
8754 +               spin_unlock(&kc->lock);
8755 +               return -ENOMEM;
8756 +       }
8757 +
8758 +       while (nr--) {
8759 +               p = __pop_page(kc);
8760 +               list_add(&p->list, pages);
8761 +       }
8762 +       spin_unlock(&kc->lock);
8763 +
8764 +       return 0;
8765 +}
8766 +
8767 +static void kcopyd_put_pages(struct kcopyd_client *kc, struct list_head *pages)
8768 +{
8769 +       struct list_head *tmp, *tmp2;
8770 +
8771 +       spin_lock(&kc->lock);
8772 +       list_for_each_safe (tmp, tmp2, pages)
8773 +               __push_page(kc, list_entry(tmp, struct page, list));
8774 +       spin_unlock(&kc->lock);
8775 +}
8776 +
8777 +/*
8778 + * These three functions resize the page pool.
8779 + */
8780 +static void release_pages(struct list_head *pages)
8781 +{
8782 +       struct page *p;
8783 +       struct list_head *tmp, *tmp2;
8784 +
8785 +       list_for_each_safe (tmp, tmp2, pages) {
8786 +               p = list_entry(tmp, struct page, list);
8787 +               UnlockPage(p);
8788 +               __free_page(p);
8789 +       }
8790 +}
8791 +
8792 +static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
8793 +{
8794 +       unsigned int i;
8795 +       struct page *p;
8796 +       LIST_HEAD(new);
8797 +
8798 +       for (i = 0; i < nr; i++) {
8799 +               p = alloc_page(GFP_KERNEL);
8800 +               if (!p) {
8801 +                       release_pages(&new);
8802 +                       return -ENOMEM;
8803 +               }
8804 +
8805 +               LockPage(p);
8806 +               list_add(&p->list, &new);
8807 +       }
8808 +
8809 +       kcopyd_put_pages(kc, &new);
8810 +       kc->nr_pages += nr;
8811 +       return 0;
8812 +}
8813 +
8814 +static void client_free_pages(struct kcopyd_client *kc)
8815 +{
8816 +       BUG_ON(kc->nr_free_pages != kc->nr_pages);
8817 +       release_pages(&kc->pages);
8818 +       kc->nr_free_pages = kc->nr_pages = 0;
8819 +}
8820 +
8821 +/*-----------------------------------------------------------------
8822 + * kcopyd_jobs need to be allocated by the *clients* of kcopyd;
8823 + * for this reason we use a mempool to prevent the client from
8824 + * ever having to do io (which could cause a deadlock).
8825 + *---------------------------------------------------------------*/
8826 +struct kcopyd_job {
8827 +       struct kcopyd_client *kc;
8828 +       struct list_head list;
8829 +       unsigned int flags;
8830 +
8831 +       /*
8832 +        * Error state of the job.
8833 +        */
8834 +       int read_err;
8835 +       unsigned int write_err;
8836 +
8837 +       /*
8838 +        * Either READ or WRITE
8839 +        */
8840 +       int rw;
8841 +       struct io_region source;
8842 +
8843 +       /*
8844 +        * The destinations for the transfer.
8845 +        */
8846 +       unsigned int num_dests;
8847 +       struct io_region dests[KCOPYD_MAX_REGIONS];
8848 +
8849 +       sector_t offset;
8850 +       unsigned int nr_pages;
8851 +       struct list_head pages;
8852 +
8853 +       /*
8854 +        * Set this to ensure you are notified when the job has
8855 +        * completed.  'context' is for callback to use.
8856 +        */
8857 +       kcopyd_notify_fn fn;
8858 +       void *context;
8859 +
8860 +       /*
8861 +        * These fields are only used if the job has been split
8862 +        * into more manageable parts.
8863 +        */
8864 +       struct semaphore lock;
8865 +       atomic_t sub_jobs;
8866 +       sector_t progress;
8867 +};
8868 +
8869 +/* FIXME: this should scale with the number of pages */
8870 +#define MIN_JOBS 512
8871 +
8872 +static kmem_cache_t *_job_cache = NULL;
8873 +static mempool_t *_job_pool = NULL;
8874 +
8875 +/*
8876 + * We maintain three lists of jobs:
8877 + *
8878 + * i)   jobs waiting for pages
8879 + * ii)  jobs that have pages, and are waiting for the io to be issued.
8880 + * iii) jobs that have completed.
8881 + *
8882 + * All three of these are protected by job_lock.
8883 + */
8884 +static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED;
8885 +
8886 +static LIST_HEAD(_complete_jobs);
8887 +static LIST_HEAD(_io_jobs);
8888 +static LIST_HEAD(_pages_jobs);
8889 +
8890 +static int jobs_init(void)
8891 +{
8892 +       INIT_LIST_HEAD(&_complete_jobs);
8893 +       INIT_LIST_HEAD(&_io_jobs);
8894 +       INIT_LIST_HEAD(&_pages_jobs);
8895 +
8896 +       _job_cache = kmem_cache_create("kcopyd-jobs",
8897 +                                      sizeof(struct kcopyd_job),
8898 +                                      __alignof__(struct kcopyd_job),
8899 +                                      0, NULL, NULL);
8900 +       if (!_job_cache)
8901 +               return -ENOMEM;
8902 +
8903 +       _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
8904 +                                  mempool_free_slab, _job_cache);
8905 +       if (!_job_pool) {
8906 +               kmem_cache_destroy(_job_cache);
8907 +               return -ENOMEM;
8908 +       }
8909 +
8910 +       return 0;
8911 +}
8912 +
8913 +static void jobs_exit(void)
8914 +{
8915 +       BUG_ON(!list_empty(&_complete_jobs));
8916 +       BUG_ON(!list_empty(&_io_jobs));
8917 +       BUG_ON(!list_empty(&_pages_jobs));
8918 +
8919 +       mempool_destroy(_job_pool);
8920 +       kmem_cache_destroy(_job_cache);
8921 +}
8922 +
8923 +/*
8924 + * Functions to push and pop a job onto the head of a given job
8925 + * list.
8926 + */
8927 +static inline struct kcopyd_job *pop(struct list_head *jobs)
8928 +{
8929 +       struct kcopyd_job *job = NULL;
8930 +       unsigned long flags;
8931 +
8932 +       spin_lock_irqsave(&_job_lock, flags);
8933 +
8934 +       if (!list_empty(jobs)) {
8935 +               job = list_entry(jobs->next, struct kcopyd_job, list);
8936 +               list_del(&job->list);
8937 +       }
8938 +       spin_unlock_irqrestore(&_job_lock, flags);
8939 +
8940 +       return job;
8941 +}
8942 +
8943 +static inline void push(struct list_head *jobs, struct kcopyd_job *job)
8944 +{
8945 +       unsigned long flags;
8946 +
8947 +       spin_lock_irqsave(&_job_lock, flags);
8948 +       list_add_tail(&job->list, jobs);
8949 +       spin_unlock_irqrestore(&_job_lock, flags);
8950 +}
8951 +
8952 +/*
8953 + * These three functions process 1 item from the corresponding
8954 + * job list.
8955 + *
8956 + * They return:
8957 + * < 0: error
8958 + *   0: success
8959 + * > 0: can't process yet.
8960 + */
8961 +static int run_complete_job(struct kcopyd_job *job)
8962 +{
8963 +       void *context = job->context;
8964 +       int read_err = job->read_err;
8965 +       unsigned int write_err = job->write_err;
8966 +       kcopyd_notify_fn fn = job->fn;
8967 +
8968 +       kcopyd_put_pages(job->kc, &job->pages);
8969 +       mempool_free(job, _job_pool);
8970 +       fn(read_err, write_err, context);
8971 +       return 0;
8972 +}
8973 +
8974 +static void complete_io(unsigned int error, void *context)
8975 +{
8976 +       struct kcopyd_job *job = (struct kcopyd_job *) context;
8977 +
8978 +       if (error) {
8979 +               if (job->rw == WRITE)
8980 +                       job->write_err |= error;
8981 +               else
8982 +                       job->read_err = 1;
8983 +
8984 +               if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
8985 +                       push(&_complete_jobs, job);
8986 +                       dm_daemon_wake(&_kcopyd);
8987 +                       return;
8988 +               }
8989 +       }
8990 +
8991 +       if (job->rw == WRITE)
8992 +               push(&_complete_jobs, job);
8993 +
8994 +       else {
8995 +               job->rw = WRITE;
8996 +               push(&_io_jobs, job);
8997 +       }
8998 +
8999 +       dm_daemon_wake(&_kcopyd);
9000 +}
9001 +
9002 +/*
9003 + * Request io on as many buffer heads as we can currently get for
9004 + * a particular job.
9005 + */
9006 +static int run_io_job(struct kcopyd_job *job)
9007 +{
9008 +       int r;
9009 +
9010 +       if (job->rw == READ)
9011 +               r = dm_io_async(1, &job->source, job->rw,
9012 +                               list_entry(job->pages.next, struct page, list),
9013 +                               job->offset, complete_io, job);
9014 +
9015 +       else
9016 +               r = dm_io_async(job->num_dests, job->dests, job->rw,
9017 +                               list_entry(job->pages.next, struct page, list),
9018 +                               job->offset, complete_io, job);
9019 +
9020 +       return r;
9021 +}
9022 +
9023 +#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE)
9024 +static int run_pages_job(struct kcopyd_job *job)
9025 +{
9026 +       int r;
9027 +
9028 +       job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
9029 +                                 SECTORS_PER_PAGE);
9030 +       r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
9031 +       if (!r) {
9032 +               /* this job is ready for io */
9033 +               push(&_io_jobs, job);
9034 +               return 0;
9035 +       }
9036 +
9037 +       if (r == -ENOMEM)
9038 +               /* can't complete now */
9039 +               return 1;
9040 +
9041 +       return r;
9042 +}
9043 +
9044 +/*
9045 + * Run through a list for as long as possible.  Returns the count
9046 + * of successful jobs.
9047 + */
9048 +static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
9049 +{
9050 +       struct kcopyd_job *job;
9051 +       int r, count = 0;
9052 +
9053 +       while ((job = pop(jobs))) {
9054 +
9055 +               r = fn(job);
9056 +
9057 +               if (r < 0) {
9058 +                       /* error this rogue job */
9059 +                       if (job->rw == WRITE)
9060 +                               job->write_err = (unsigned int) -1;
9061 +                       else
9062 +                               job->read_err = 1;
9063 +                       push(&_complete_jobs, job);
9064 +                       break;
9065 +               }
9066 +
9067 +               if (r > 0) {
9068 +                       /*
9069 +                        * We couldn't service this job ATM, so
9070 +                        * push this job back onto the list.
9071 +                        */
9072 +                       push(jobs, job);
9073 +                       break;
9074 +               }
9075 +
9076 +               count++;
9077 +       }
9078 +
9079 +       return count;
9080 +}
9081 +
9082 +/*
9083 + * kcopyd does this every time it's woken up.
9084 + */
9085 +static void do_work(void)
9086 +{
9087 +       /*
9088 +        * The order that these are called is *very* important.
9089 +        * complete jobs can free some pages for pages jobs.
9090 +        * Pages jobs when successful will jump onto the io jobs
9091 +        * list.  io jobs call wake when they complete and it all
9092 +        * starts again.
9093 +        */
9094 +       process_jobs(&_complete_jobs, run_complete_job);
9095 +       process_jobs(&_pages_jobs, run_pages_job);
9096 +       process_jobs(&_io_jobs, run_io_job);
9097 +       run_task_queue(&tq_disk);
9098 +}
9099 +
9100 +/*
9101 + * If we are copying a small region we just dispatch a single job
9102 + * to do the copy; otherwise the io has to be split up into many
9103 + * jobs.
9104 + */
9105 +static void dispatch_job(struct kcopyd_job *job)
9106 +{
9107 +       push(&_pages_jobs, job);
9108 +       dm_daemon_wake(&_kcopyd);
9109 +}
9110 +
9111 +#define SUB_JOB_SIZE 128
9112 +static void segment_complete(int read_err,
9113 +                            unsigned int write_err, void *context)
9114 +{
9115 +       /* FIXME: tidy this function */
9116 +       sector_t progress = 0;
9117 +       sector_t count = 0;
9118 +       struct kcopyd_job *job = (struct kcopyd_job *) context;
9119 +
9120 +       down(&job->lock);
9121 +
9122 +       /* update the error */
9123 +       if (read_err)
9124 +               job->read_err = 1;
9125 +
9126 +       if (write_err)
9127 +               job->write_err |= write_err;
9128 +
9129 +       /*
9130 +        * Only dispatch more work if there hasn't been an error.
9131 +        */
9132 +       if ((!job->read_err && !job->write_err) ||
9133 +           test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
9134 +               /* get the next chunk of work */
9135 +               progress = job->progress;
9136 +               count = job->source.count - progress;
9137 +               if (count) {
9138 +                       if (count > SUB_JOB_SIZE)
9139 +                               count = SUB_JOB_SIZE;
9140 +
9141 +                       job->progress += count;
9142 +               }
9143 +       }
9144 +       up(&job->lock);
9145 +
9146 +       if (count) {
9147 +               int i;
9148 +               struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO);
9149 +
9150 +               memcpy(sub_job, job, sizeof(*job));
9151 +               sub_job->source.sector += progress;
9152 +               sub_job->source.count = count;
9153 +
9154 +               for (i = 0; i < job->num_dests; i++) {
9155 +                       sub_job->dests[i].sector += progress;
9156 +                       sub_job->dests[i].count = count;
9157 +               }
9158 +
9159 +               sub_job->fn = segment_complete;
9160 +               sub_job->context = job;
9161 +               dispatch_job(sub_job);
9162 +
9163 +       } else if (atomic_dec_and_test(&job->sub_jobs)) {
9164 +
9165 +               /*
9166 +                * To avoid a race we must keep the job around
9167 +                * until after the notify function has completed.
9168 +                * Otherwise the client may try and stop the job
9169 +                * after we've completed.
9170 +                */
9171 +               job->fn(read_err, write_err, job->context);
9172 +               mempool_free(job, _job_pool);
9173 +       }
9174 +}
9175 +
9176 +/*
9177 + * Create some little jobs that will do the move between
9178 + * them.
9179 + */
9180 +#define SPLIT_COUNT 8
9181 +static void split_job(struct kcopyd_job *job)
9182 +{
9183 +       int i;
9184 +
9185 +       atomic_set(&job->sub_jobs, SPLIT_COUNT);
9186 +       for (i = 0; i < SPLIT_COUNT; i++)
9187 +               segment_complete(0, 0u, job);
9188 +}
9189 +
9190 +#define SUB_JOB_THRESHOLD (SPLIT_COUNT * SUB_JOB_SIZE)
9191 +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
9192 +               unsigned int num_dests, struct io_region *dests,
9193 +               unsigned int flags, kcopyd_notify_fn fn, void *context)
9194 +{
9195 +       struct kcopyd_job *job;
9196 +
9197 +       /*
9198 +        * Allocate a new job.
9199 +        */
9200 +       job = mempool_alloc(_job_pool, GFP_NOIO);
9201 +
9202 +       /*
9203 +        * set up for the read.
9204 +        */
9205 +       job->kc = kc;
9206 +       job->flags = flags;
9207 +       job->read_err = 0;
9208 +       job->write_err = 0;
9209 +       job->rw = READ;
9210 +
9211 +       memcpy(&job->source, from, sizeof(*from));
9212 +
9213 +       job->num_dests = num_dests;
9214 +       memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
9215 +
9216 +       job->offset = 0;
9217 +       job->nr_pages = 0;
9218 +       INIT_LIST_HEAD(&job->pages);
9219 +
9220 +       job->fn = fn;
9221 +       job->context = context;
9222 +
9223 +       if (job->source.count < SUB_JOB_THRESHOLD)
9224 +               dispatch_job(job);
9225 +
9226 +       else {
9227 +               init_MUTEX(&job->lock);
9228 +               job->progress = 0;
9229 +               split_job(job);
9230 +       }
9231 +
9232 +       return 0;
9233 +}
9234 +
9235 +/*
9236 + * Cancels a kcopyd job, eg. someone might be deactivating a
9237 + * mirror.
9238 + */
9239 +int kcopyd_cancel(struct kcopyd_job *job, int block)
9240 +{
9241 +       /* FIXME: finish */
9242 +       return -1;
9243 +}
9244 +
9245 +/*-----------------------------------------------------------------
9246 + * Unit setup
9247 + *---------------------------------------------------------------*/
9248 +static DECLARE_MUTEX(_client_lock);
9249 +static LIST_HEAD(_clients);
9250 +
9251 +static int client_add(struct kcopyd_client *kc)
9252 +{
9253 +       down(&_client_lock);
9254 +       list_add(&kc->list, &_clients);
9255 +       up(&_client_lock);
9256 +       return 0;
9257 +}
9258 +
9259 +static void client_del(struct kcopyd_client *kc)
9260 +{
9261 +       down(&_client_lock);
9262 +       list_del(&kc->list);
9263 +       up(&_client_lock);
9264 +}
9265 +
9266 +int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
9267 +{
9268 +       int r = 0;
9269 +       struct kcopyd_client *kc;
9270 +
9271 +       kc = kmalloc(sizeof(*kc), GFP_KERNEL);
9272 +       if (!kc)
9273 +               return -ENOMEM;
9274 +
9275 +       kc->lock = SPIN_LOCK_UNLOCKED;
9276 +       INIT_LIST_HEAD(&kc->pages);
9277 +       kc->nr_pages = kc->nr_free_pages = 0;
9278 +       r = client_alloc_pages(kc, nr_pages);
9279 +       if (r) {
9280 +               kfree(kc);
9281 +               return r;
9282 +       }
9283 +
9284 +       r = dm_io_get(nr_pages);
9285 +       if (r) {
9286 +               client_free_pages(kc);
9287 +               kfree(kc);
9288 +               return r;
9289 +       }
9290 +
9291 +       r = client_add(kc);
9292 +       if (r) {
9293 +               dm_io_put(nr_pages);
9294 +               client_free_pages(kc);
9295 +               kfree(kc);
9296 +               return r;
9297 +       }
9298 +
9299 +       *result = kc;
9300 +       return 0;
9301 +}
9302 +
9303 +void kcopyd_client_destroy(struct kcopyd_client *kc)
9304 +{
9305 +       dm_io_put(kc->nr_pages);
9306 +       client_free_pages(kc);
9307 +       client_del(kc);
9308 +       kfree(kc);
9309 +}
9310 +
9311 +
9312 +int __init kcopyd_init(void)
9313 +{
9314 +       int r;
9315 +
9316 +       r = jobs_init();
9317 +       if (r)
9318 +               return r;
9319 +
9320 +       r = dm_daemon_start(&_kcopyd, "kcopyd", do_work);
9321 +       if (r)
9322 +               jobs_exit();
9323 +
9324 +       return r;
9325 +}
9326 +
9327 +void kcopyd_exit(void)
9328 +{
9329 +       jobs_exit();
9330 +       dm_daemon_stop(&_kcopyd);
9331 +}
9332 +
9333 +EXPORT_SYMBOL(kcopyd_client_create);
9334 +EXPORT_SYMBOL(kcopyd_client_destroy);
9335 +EXPORT_SYMBOL(kcopyd_copy);
9336 +EXPORT_SYMBOL(kcopyd_cancel);
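The trickiest part of the file above is the way split_job()/segment_complete() carve a large copy into sub-jobs.  The standalone sketch below (illustrative only, not part of the patch) replays the chunk-claiming loop: a copy of at least SUB_JOB_THRESHOLD (8 * 128 = 1024) sectors is walked in SUB_JOB_SIZE pieces, and each of the SPLIT_COUNT in-flight sub-jobs claims the next piece when it completes.  The 10000-sector source length is arbitrary.

#include <stdio.h>

#define SUB_JOB_SIZE 128        /* sectors per sub-job, as in kcopyd.c */
#define SPLIT_COUNT  8          /* concurrent sub-jobs, as in kcopyd.c */

int main(void)
{
        unsigned long source_count = 10000;     /* arbitrary large copy */
        unsigned long progress = 0;
        unsigned int chunks = 0;

        while (progress < source_count) {
                unsigned long count = source_count - progress;

                if (count > SUB_JOB_SIZE)
                        count = SUB_JOB_SIZE;

                /* segment_complete() advances job->progress like this
                 * before cloning a sub-job for the claimed range. */
                progress += count;
                chunks++;
        }

        printf("%lu sectors -> %u chunks of at most %d sectors, "
               "%d in flight at once\n",
               source_count, chunks, SUB_JOB_SIZE, SPLIT_COUNT);
        return 0;
}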
9337 --- diff/drivers/md/kcopyd.h    1970-01-01 01:00:00.000000000 +0100
9338 +++ source/drivers/md/kcopyd.h  2003-10-16 10:44:23.000000000 +0100
9339 @@ -0,0 +1,47 @@
9340 +/*
9341 + * Copyright (C) 2001 Sistina Software
9342 + *
9343 + * This file is released under the GPL.
9344 + */
9345 +
9346 +#ifndef DM_KCOPYD_H
9347 +#define DM_KCOPYD_H
9348 +
9349 +/*
9350 + * Needed for the definition of offset_t.
9351 + */
9352 +#include <linux/device-mapper.h>
9353 +#include <linux/iobuf.h>
9354 +
9355 +#include "dm-io.h"
9356 +
9357 +int kcopyd_init(void);
9358 +void kcopyd_exit(void);
9359 +
9360 +/* FIXME: make this configurable */
9361 +#define KCOPYD_MAX_REGIONS 8
9362 +
9363 +#define KCOPYD_IGNORE_ERROR 1
9364 +
9365 +/*
9366 + * To use kcopyd you must first create a kcopyd client object.
9367 + */
9368 +struct kcopyd_client;
9369 +int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result);
9370 +void kcopyd_client_destroy(struct kcopyd_client *kc);
9371 +
9372 +/*
9373 + * Submit a copy job to kcopyd.  This is built on top of the
9374 + * client functions declared above.
9375 + *
9376 + * read_err is a boolean,
9377 + * write_err is a bitset, with 1 bit for each destination region
9378 + */
9379 +typedef void (*kcopyd_notify_fn)(int read_err,
9380 +                                unsigned int write_err, void *context);
9381 +
9382 +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
9383 +               unsigned int num_dests, struct io_region *dests,
9384 +               unsigned int flags, kcopyd_notify_fn fn, void *context);
9385 +
9386 +#endif
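A sketch of how a client might drive the interface above (illustrative only, not part of the patch).  It assumes struct io_region, defined in dm-io.h, carries the target device plus the .sector/.count fields that kcopyd.c manipulates; the page count, device arguments and sector numbers are invented.  As documented above, read_err arrives as a boolean and write_err as a bitset with one bit per destination.

#include <linux/completion.h>
#include <linux/kernel.h>

#include "kcopyd.h"

static void copy_done(int read_err, unsigned int write_err, void *context)
{
        struct completion *done = (struct completion *) context;

        if (read_err)
                printk(KERN_ERR "copy: error reading the source\n");
        if (write_err)
                printk(KERN_ERR "copy: write error bitset 0x%x\n", write_err);

        complete(done);
}

static int copy_first_mb(kdev_t src, kdev_t dst)
{
        struct kcopyd_client *kc;
        struct io_region from, to;
        DECLARE_COMPLETION(done);
        int r;

        r = kcopyd_client_create(64, &kc);      /* 64 preallocated pages */
        if (r)
                return r;

        from.dev = src;         /* assumed io_region layout, see dm-io.h */
        from.sector = 0;
        from.count = 2048;      /* 1MB in 512-byte sectors */

        to = from;
        to.dev = dst;

        r = kcopyd_copy(kc, &from, 1, &to, 0, copy_done, &done);
        if (!r)
                wait_for_completion(&done);

        kcopyd_client_destroy(kc);
        return r;
}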
9387 --- diff/include/linux/device-mapper.h  1970-01-01 01:00:00.000000000 +0100
9388 +++ source/include/linux/device-mapper.h        2003-10-16 10:44:23.000000000 +0100
9389 @@ -0,0 +1,104 @@
9390 +/*
9391 + * Copyright (C) 2001 Sistina Software (UK) Limited.
9392 + *
9393 + * This file is released under the LGPL.
9394 + */
9395 +
9396 +#ifndef _LINUX_DEVICE_MAPPER_H
9397 +#define _LINUX_DEVICE_MAPPER_H
9398 +
9399 +typedef unsigned long sector_t;
9400 +
9401 +struct dm_target;
9402 +struct dm_table;
9403 +struct dm_dev;
9404 +
9405 +typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
9406 +
9407 +union map_info {
9408 +       void *ptr;
9409 +       unsigned long long ll;
9410 +};
9411 +
9412 +/*
9413 + * In the constructor the target parameter will already have the
9414 + * table, type, begin and len fields filled in.
9415 + */
9416 +typedef int (*dm_ctr_fn) (struct dm_target * target, unsigned int argc,
9417 +                         char **argv);
9418 +
9419 +/*
9420 + * The destructor doesn't need to free the dm_target, just
9421 + * anything hidden in ti->private.
9422 + */
9423 +typedef void (*dm_dtr_fn) (struct dm_target * ti);
9424 +
9425 +/*
9426 + * The map function must return:
9427 + * < 0: error
9428 + * = 0: The target will handle the io by resubmitting it later
9429 + * > 0: simple remap complete
9430 + */
9431 +typedef int (*dm_map_fn) (struct dm_target * ti, struct buffer_head * bh,
9432 +                         int rw, union map_info *map_context);
9433 +
9434 +/*
9435 + * Returns:
9436 + * < 0 : error (currently ignored)
9437 + * 0   : ended successfully
9438 + * 1   : for some reason the io has still not completed (eg,
9439 + *       multipath target might want to requeue a failed io).
9440 + */
9441 +typedef int (*dm_endio_fn) (struct dm_target * ti,
9442 +                           struct buffer_head * bh, int rw, int error,
9443 +                           union map_info *map_context);
9444 +typedef void (*dm_suspend_fn) (struct dm_target *ti);
9445 +typedef void (*dm_resume_fn) (struct dm_target *ti);
9446 +typedef int (*dm_status_fn) (struct dm_target * ti, status_type_t status_type,
9447 +                            char *result, unsigned int maxlen);
9448 +
9449 +void dm_error(const char *message);
9450 +
9451 +/*
9452 + * Constructors should call these functions to ensure destination devices
9453 + * are opened/closed correctly.
9454 + * FIXME: too many arguments.
9455 + */
9456 +int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
9457 +                 sector_t len, int mode, struct dm_dev **result);
9458 +void dm_put_device(struct dm_target *ti, struct dm_dev *d);
9459 +
9460 +/*
9461 + * Information about a target type
9462 + */
9463 +struct target_type {
9464 +       const char *name;
9465 +       struct module *module;
9466 +       dm_ctr_fn ctr;
9467 +       dm_dtr_fn dtr;
9468 +       dm_map_fn map;
9469 +       dm_endio_fn end_io;
9470 +       dm_suspend_fn suspend;
9471 +       dm_resume_fn resume;
9472 +       dm_status_fn status;
9473 +};
9474 +
9475 +struct dm_target {
9476 +       struct dm_table *table;
9477 +       struct target_type *type;
9478 +
9479 +       /* target limits */
9480 +       sector_t begin;
9481 +       sector_t len;
9482 +
9483 +       /* target specific data */
9484 +       void *private;
9485 +
9486 +       /* Used to provide an error string from the ctr */
9487 +       char *error;
9488 +};
9489 +
9490 +int dm_register_target(struct target_type *t);
9491 +int dm_unregister_target(struct target_type *t);
9492 +
9493 +#endif                         /* _LINUX_DEVICE_MAPPER_H */
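To show what a target built on this interface looks like, here is a trivial pass-through target with a constructor, destructor and map function.  It is an illustrative sketch, not part of the patch: the "passthrough" name and argument layout are invented, the b_rdev/b_rsector remapping and the assumption that struct dm_dev exposes the underlying device as ->dev are modelled on the linear target in this patch set, and FMODE_READ | FMODE_WRITE is assumed to be an acceptable mode for dm_get_device().

#include <linux/device-mapper.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>

struct passthrough_c {
        struct dm_dev *dev;
        sector_t start;
};

/* Construct a passthrough mapping: <dev_path> <offset> */
static int passthrough_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        struct passthrough_c *pc;

        if (argc != 2) {
                ti->error = "passthrough: <dev_path> <offset> required";
                return -EINVAL;
        }

        pc = kmalloc(sizeof(*pc), GFP_KERNEL);
        if (!pc) {
                ti->error = "passthrough: cannot allocate context";
                return -ENOMEM;
        }
        pc->start = simple_strtoul(argv[1], NULL, 10);

        /* The mode flags are an assumption; real targets derive them
         * from the table. */
        if (dm_get_device(ti, argv[0], pc->start, ti->len,
                          FMODE_READ | FMODE_WRITE, &pc->dev)) {
                ti->error = "passthrough: device lookup failed";
                kfree(pc);
                return -ENXIO;
        }

        ti->private = pc;
        return 0;
}

static void passthrough_dtr(struct dm_target *ti)
{
        struct passthrough_c *pc = (struct passthrough_c *) ti->private;

        dm_put_device(ti, pc->dev);
        kfree(pc);
}

static int passthrough_map(struct dm_target *ti, struct buffer_head *bh,
                           int rw, union map_info *map_context)
{
        struct passthrough_c *pc = (struct passthrough_c *) ti->private;

        /* Assumption: struct dm_dev exposes the underlying kdev_t as ->dev. */
        bh->b_rdev = pc->dev->dev;
        bh->b_rsector = pc->start + (bh->b_rsector - ti->begin);
        return 1;               /* simple remap complete */
}

static struct target_type passthrough_target = {
        .name   = "passthrough",
        .module = THIS_MODULE,
        .ctr    = passthrough_ctr,
        .dtr    = passthrough_dtr,
        .map    = passthrough_map,
};

static int __init passthrough_init(void)
{
        return dm_register_target(&passthrough_target);
}

static void __exit passthrough_exit(void)
{
        dm_unregister_target(&passthrough_target);
}

module_init(passthrough_init);
module_exit(passthrough_exit);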
9494 --- diff/include/linux/dm-ioctl.h       1970-01-01 01:00:00.000000000 +0100
9495 +++ source/include/linux/dm-ioctl.h     2003-10-16 10:44:23.000000000 +0100
9496 @@ -0,0 +1,237 @@
9497 +/*
9498 + * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited.
9499 + *
9500 + * This file is released under the LGPL.
9501 + */
9502 +
9503 +#ifndef _LINUX_DM_IOCTL_H
9504 +#define _LINUX_DM_IOCTL_H
9505 +
9506 +#include <linux/types.h>
9507 +
9508 +#define DM_DIR "mapper"                /* Slashes not supported */
9509 +#define DM_MAX_TYPE_NAME 16
9510 +#define DM_NAME_LEN 128
9511 +#define DM_UUID_LEN 129
9512 +
9513 +/*
9514 + * A traditional ioctl interface for the device mapper.
9515 + *
9516 + * Each device can have two tables associated with it, an
9517 + * 'active' table which is the one currently used by io passing
9518 + * through the device, and an 'inactive' one which is a table
9519 + * that is being prepared as a replacement for the 'active' one.
9520 + *
9521 + * DM_VERSION:
9522 + * Just get the version information for the ioctl interface.
9523 + *
9524 + * DM_REMOVE_ALL:
9525 + * Remove all dm devices, destroy all tables.  Only really used
9526 + * for debug.
9527 + *
9528 + * DM_LIST_DEVICES:
9529 + * Get a list of all the dm device names.
9530 + *
9531 + * DM_DEV_CREATE:
9532 + * Create a new device; neither the 'active' nor the 'inactive' table
9533 + * slot will be filled.  The device will be in suspended state
9534 + * after creation, however any io to the device will get errored
9535 + * since it will be out-of-bounds.
9536 + *
9537 + * DM_DEV_REMOVE:
9538 + * Remove a device, destroy any tables.
9539 + *
9540 + * DM_DEV_RENAME:
9541 + * Rename a device.
9542 + *
9543 + * DM_DEV_SUSPEND:
9544 + * This performs both suspend and resume, depending which flag is
9545 + * passed in.
9546 + * Suspend: This command will not return until all pending io to
9547 + * the device has completed.  Further io will be deferred until
9548 + * the device is resumed.
9549 + * Resume: It is no longer an error to issue this command on an
9550 + * unsuspended device.  If a table is present in the 'inactive'
9551 + * slot, it will be moved to the active slot, then the old table
9552 + * from the active slot will be _destroyed_.  Finally the device
9553 + * is resumed.
9554 + *
9555 + * DM_DEV_STATUS:
9556 + * Retrieves the status for the table in the 'active' slot.
9557 + *
9558 + * DM_DEV_WAIT:
9559 + * Wait for a significant event to occur to the device.  This
9560 + * could either be caused by an event triggered by one of the
9561 + * targets of the table in the 'active' slot, or a table change.
9562 + *
9563 + * DM_TABLE_LOAD:
9564 + * Load a table into the 'inactive' slot for the device.  The
9565 + * device does _not_ need to be suspended prior to this command.
9566 + *
9567 + * DM_TABLE_CLEAR:
9568 + * Destroy any table in the 'inactive' slot (ie. abort).
9569 + *
9570 + * DM_TABLE_DEPS:
9571 + * Return a set of device dependencies for the 'active' table.
9572 + *
9573 + * DM_TABLE_STATUS:
9574 + * Return the targets status for the 'active' table.
9575 + */
9576 +
9577 +/*
9578 + * All ioctl arguments consist of a single chunk of memory, with
9579 + * this structure at the start.  If a uuid is specified any
9580 + * lookup (eg. for a DM_DEV_STATUS) will be done on that, *not* the
9581 + * name.
9582 + */
9583 +struct dm_ioctl {
9584 +       /*
9585 +        * The version number is made up of three parts:
9586 +        * major - no backward or forward compatibility,
9587 +        * minor - only backwards compatible,
9588 +        * patch - both backwards and forwards compatible.
9589 +        *
9590 +        * All clients of the ioctl interface should fill in the
9591 +        * version number of the interface that they were
9592 +        * compiled with.
9593 +        *
9594 +        * All recognised ioctl commands (ie. those that don't
9595 +        * return -ENOTTY) fill out this field, even if the
9596 +        * command failed.
9597 +        */
9598 +       uint32_t version[3];    /* in/out */
9599 +       uint32_t data_size;     /* total size of data passed in
9600 +                                * including this struct */
9601 +
9602 +       uint32_t data_start;    /* offset to start of data
9603 +                                * relative to start of this struct */
9604 +
9605 +       uint32_t target_count;  /* in/out */
9606 +       int32_t open_count;     /* out */
9607 +       uint32_t flags;         /* in/out */
9608 +       uint32_t event_nr;      /* in/out */
9609 +       uint32_t padding;
9610 +
9611 +       uint64_t dev;           /* in/out */
9612 +
9613 +       char name[DM_NAME_LEN]; /* device name */
9614 +       char uuid[DM_UUID_LEN]; /* unique identifier for
9615 +                                * the block device */
9616 +};
9617 +
9618 +/*
9619 + * Used to specify tables.  These structures appear after the
9620 + * dm_ioctl.
9621 + */
9622 +struct dm_target_spec {
9623 +       uint64_t sector_start;
9624 +       uint64_t length;
9625 +       int32_t status;         /* used when reading from kernel only */
9626 +
9627 +       /*
9628 +        * Offset in bytes (from the start of this struct) to
9629 +        * next target_spec.
9630 +        */
9631 +       uint32_t next;
9632 +
9633 +       char target_type[DM_MAX_TYPE_NAME];
9634 +
9635 +       /*
9636 +        * Parameter string starts immediately after this object.
9637 +        * Be careful to add padding after string to ensure correct
9638 +        * alignment of subsequent dm_target_spec.
9639 +        */
9640 +};
9641 +
9642 +/*
9643 + * Used to retrieve the target dependencies.
9644 + */
9645 +struct dm_target_deps {
9646 +       uint32_t count;         /* Array size */
9647 +       uint32_t padding;       /* unused */
9648 +       uint64_t dev[0];        /* out */
9649 +};
9650 +
9651 +/*
9652 + * Used to get a list of all dm devices.
9653 + */
9654 +struct dm_name_list {
9655 +       uint64_t dev;
9656 +       uint32_t next;          /* offset to the next record from
9657 +                                  the _start_ of this */
9658 +       char name[0];
9659 +};
9660 +
9661 +/*
9662 + * If you change this make sure you make the corresponding change
9663 + * to dm-ioctl.c:lookup_ioctl()
9664 + */
9665 +enum {
9666 +       /* Top level cmds */
9667 +       DM_VERSION_CMD = 0,
9668 +       DM_REMOVE_ALL_CMD,
9669 +       DM_LIST_DEVICES_CMD,
9670 +
9671 +       /* device level cmds */
9672 +       DM_DEV_CREATE_CMD,
9673 +       DM_DEV_REMOVE_CMD,
9674 +       DM_DEV_RENAME_CMD,
9675 +       DM_DEV_SUSPEND_CMD,
9676 +       DM_DEV_STATUS_CMD,
9677 +       DM_DEV_WAIT_CMD,
9678 +
9679 +       /* Table level cmds */
9680 +       DM_TABLE_LOAD_CMD,
9681 +       DM_TABLE_CLEAR_CMD,
9682 +       DM_TABLE_DEPS_CMD,
9683 +       DM_TABLE_STATUS_CMD,
9684 +};
9685 +
9686 +#define DM_IOCTL 0xfd
9687 +
9688 +#define DM_VERSION       _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl)
9689 +#define DM_REMOVE_ALL    _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl)
9690 +#define DM_LIST_DEVICES  _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl)
9691 +
9692 +#define DM_DEV_CREATE    _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl)
9693 +#define DM_DEV_REMOVE    _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl)
9694 +#define DM_DEV_RENAME    _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl)
9695 +#define DM_DEV_SUSPEND   _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl)
9696 +#define DM_DEV_STATUS    _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl)
9697 +#define DM_DEV_WAIT      _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl)
9698 +
9699 +#define DM_TABLE_LOAD    _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl)
9700 +#define DM_TABLE_CLEAR   _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl)
9701 +#define DM_TABLE_DEPS    _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, struct dm_ioctl)
9702 +#define DM_TABLE_STATUS  _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl)
9703 +
9704 +#define DM_VERSION_MAJOR       4
9705 +#define DM_VERSION_MINOR       0
9706 +#define DM_VERSION_PATCHLEVEL  1
9707 +#define DM_VERSION_EXTRA       "-ioctl (2003-07-12)"
9708 +
9709 +/* Status bits */
9710 +#define DM_READONLY_FLAG       (1 << 0) /* In/Out */
9711 +#define DM_SUSPEND_FLAG                (1 << 1) /* In/Out */
9712 +#define DM_PERSISTENT_DEV_FLAG (1 << 3) /* In */
9713 +
9714 +/*
9715 + * Flag passed into ioctl STATUS command to get table information
9716 + * rather than current status.
9717 + */
9718 +#define DM_STATUS_TABLE_FLAG   (1 << 4) /* In */
9719 +
9720 +/*
9721 + * Flags that indicate whether a table is present in either of
9722 + * the two table slots that a device has.
9723 + */
9724 +#define DM_ACTIVE_PRESENT_FLAG   (1 << 5) /* Out */
9725 +#define DM_INACTIVE_PRESENT_FLAG (1 << 6) /* Out */
9726 +
9727 +/*
9728 + * Indicates that the buffer passed in wasn't big enough for the
9729 + * results.
9730 + */
9731 +#define DM_BUFFER_FULL_FLAG    (1 << 8) /* Out */
9732 +
9733 +#endif                         /* _LINUX_DM_IOCTL_H */
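A userspace sketch of the calling convention described above (illustrative only, not part of the patch): query the interface version with DM_VERSION.  The caller fills in the version triplet it was compiled against and a data_size covering the whole argument, which here is just the header struct.  The /dev/mapper/control path is an assumption; the header only fixes DM_DIR, not the control node.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include <linux/dm-ioctl.h>

int main(void)
{
        struct dm_ioctl io;
        int fd = open("/dev/mapper/control", O_RDWR);   /* path assumed */

        if (fd < 0) {
                perror("open");
                return 1;
        }

        memset(&io, 0, sizeof(io));
        io.version[0] = DM_VERSION_MAJOR;       /* version we were built for */
        io.version[1] = DM_VERSION_MINOR;
        io.version[2] = DM_VERSION_PATCHLEVEL;
        io.data_size = sizeof(io);              /* whole argument */
        io.data_start = sizeof(io);             /* no payload after the header */

        if (ioctl(fd, DM_VERSION, &io) < 0) {
                perror("DM_VERSION");
                close(fd);
                return 1;
        }

        printf("device-mapper ioctl interface %u.%u.%u\n",
               io.version[0], io.version[1], io.version[2]);
        close(fd);
        return 0;
}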
9734 --- diff/include/linux/mempool.h        1970-01-01 01:00:00.000000000 +0100
9735 +++ source/include/linux/mempool.h      2003-10-16 10:44:23.000000000 +0100
9736 @@ -0,0 +1,31 @@
9737 +/*
9738 + * memory buffer pool support
9739 + */
9740 +#ifndef _LINUX_MEMPOOL_H
9741 +#define _LINUX_MEMPOOL_H
9742 +
9743 +#include <linux/list.h>
9744 +#include <linux/wait.h>
9745 +
9746 +struct mempool_s;
9747 +typedef struct mempool_s mempool_t;
9748 +
9749 +typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data);
9750 +typedef void (mempool_free_t)(void *element, void *pool_data);
9751 +
9752 +extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
9753 +                                mempool_free_t *free_fn, void *pool_data);
9754 +extern int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask);
9755 +extern void mempool_destroy(mempool_t *pool);
9756 +extern void * mempool_alloc(mempool_t *pool, int gfp_mask);
9757 +extern void mempool_free(void *element, mempool_t *pool);
9758 +
9759 +/*
9760 + * A mempool_alloc_t and mempool_free_t that get the memory from
9761 + * a slab that is passed in through pool_data.
9762 + */
9763 +void *mempool_alloc_slab(int gfp_mask, void *pool_data);
9764 +void mempool_free_slab(void *element, void *pool_data);
9765 +
9766 +
9767 +#endif /* _LINUX_MEMPOOL_H */
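A sketch of typical use (illustrative only, not part of the patch), mirroring how kcopyd.c above builds its job pool on top of a slab cache.  The my_io names and the minimum of 16 elements are invented.

#include <linux/mempool.h>
#include <linux/mm.h>
#include <linux/slab.h>

struct my_io {
        int result;
};

static kmem_cache_t *my_io_cache;
static mempool_t *my_io_pool;

static int my_pool_init(void)
{
        my_io_cache = kmem_cache_create("my-io", sizeof(struct my_io),
                                        __alignof__(struct my_io), 0,
                                        NULL, NULL);
        if (!my_io_cache)
                return -ENOMEM;

        /* 16 elements are preallocated, so allocation in the io path
         * never has to wait for the VM. */
        my_io_pool = mempool_create(16, mempool_alloc_slab,
                                    mempool_free_slab, my_io_cache);
        if (!my_io_pool) {
                kmem_cache_destroy(my_io_cache);
                return -ENOMEM;
        }

        return 0;
}

static void my_pool_exit(void)
{
        mempool_destroy(my_io_pool);
        kmem_cache_destroy(my_io_cache);
}

/* In the io path: never fails in process context (see mm/mempool.c). */
static struct my_io *my_io_get(void)
{
        return mempool_alloc(my_io_pool, GFP_NOIO);
}

static void my_io_put(struct my_io *io)
{
        mempool_free(io, my_io_pool);
}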
9768 --- diff/mm/mempool.c   1970-01-01 01:00:00.000000000 +0100
9769 +++ source/mm/mempool.c 2003-10-16 10:44:23.000000000 +0100
9770 @@ -0,0 +1,299 @@
9771 +/*
9772 + *  linux/mm/mempool.c
9773 + *
9774 + *  memory buffer pool support. Such pools are mostly used
9775 + *  for guaranteed, deadlock-free memory allocations during
9776 + *  extreme VM load.
9777 + *
9778 + *  started by Ingo Molnar, Copyright (C) 2001
9779 + */
9780 +
9781 +#include <linux/mm.h>
9782 +#include <linux/slab.h>
9783 +#include <linux/module.h>
9784 +#include <linux/mempool.h>
9785 +
9786 +struct mempool_s {
9787 +       spinlock_t lock;
9788 +       int min_nr;             /* nr of elements at *elements */
9789 +       int curr_nr;            /* Current nr of elements at *elements */
9790 +       void **elements;
9791 +
9792 +       void *pool_data;
9793 +       mempool_alloc_t *alloc;
9794 +       mempool_free_t *free;
9795 +       wait_queue_head_t wait;
9796 +};
9797 +
9798 +static void add_element(mempool_t *pool, void *element)
9799 +{
9800 +       BUG_ON(pool->curr_nr >= pool->min_nr);
9801 +       pool->elements[pool->curr_nr++] = element;
9802 +}
9803 +
9804 +static void *remove_element(mempool_t *pool)
9805 +{
9806 +       BUG_ON(pool->curr_nr <= 0);
9807 +       return pool->elements[--pool->curr_nr];
9808 +}
9809 +
9810 +static void free_pool(mempool_t *pool)
9811 +{
9812 +       while (pool->curr_nr) {
9813 +               void *element = remove_element(pool);
9814 +               pool->free(element, pool->pool_data);
9815 +       }
9816 +       kfree(pool->elements);
9817 +       kfree(pool);
9818 +}
9819 +
9820 +/**
9821 + * mempool_create - create a memory pool
9822 + * @min_nr:    the minimum number of elements guaranteed to be
9823 + *             allocated for this pool.
9824 + * @alloc_fn:  user-defined element-allocation function.
9825 + * @free_fn:   user-defined element-freeing function.
9826 + * @pool_data: optional private data available to the user-defined functions.
9827 + *
9828 + * this function creates and allocates a guaranteed size, preallocated
9829 + * memory pool. The pool can be used from the mempool_alloc and mempool_free
9830 + * functions. This function might sleep. Both the alloc_fn() and the free_fn()
9831 + * functions might sleep - as long as the mempool_alloc function is not called
9832 + * from IRQ contexts.
9833 + */
9834 +mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
9835 +                               mempool_free_t *free_fn, void *pool_data)
9836 +{
9837 +       mempool_t *pool;
9838 +
9839 +       pool = kmalloc(sizeof(*pool), GFP_KERNEL);
9840 +       if (!pool)
9841 +               return NULL;
9842 +       memset(pool, 0, sizeof(*pool));
9843 +       pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL);
9844 +       if (!pool->elements) {
9845 +               kfree(pool);
9846 +               return NULL;
9847 +       }
9848 +       spin_lock_init(&pool->lock);
9849 +       pool->min_nr = min_nr;
9850 +       pool->pool_data = pool_data;
9851 +       init_waitqueue_head(&pool->wait);
9852 +       pool->alloc = alloc_fn;
9853 +       pool->free = free_fn;
9854 +
9855 +       /*
9856 +        * First pre-allocate the guaranteed number of buffers.
9857 +        */
9858 +       while (pool->curr_nr < pool->min_nr) {
9859 +               void *element;
9860 +
9861 +               element = pool->alloc(GFP_KERNEL, pool->pool_data);
9862 +               if (unlikely(!element)) {
9863 +                       free_pool(pool);
9864 +                       return NULL;
9865 +               }
9866 +               add_element(pool, element);
9867 +       }
9868 +       return pool;
9869 +}
9870 +
9871 +/**
9872 + * mempool_resize - resize an existing memory pool
9873 + * @pool:       pointer to the memory pool which was allocated via
9874 + *              mempool_create().
9875 + * @new_min_nr: the new minimum number of elements guaranteed to be
9876 + *              allocated for this pool.
9877 + * @gfp_mask:   the usual allocation bitmask.
9878 + *
9879 + * This function shrinks/grows the pool. In the case of growing,
9880 + * it cannot be guaranteed that the pool will be grown to the new
9881 + * size immediately, but new mempool_free() calls will refill it.
9882 + *
9883 + * Note, the caller must guarantee that no mempool_destroy is called
9884 + * while this function is running. mempool_alloc() & mempool_free()
9885 + * might be called (eg. from IRQ contexts) while this function executes.
9886 + */
9887 +int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask)
9888 +{
9889 +       void *element;
9890 +       void **new_elements;
9891 +       unsigned long flags;
9892 +
9893 +       BUG_ON(new_min_nr <= 0);
9894 +
9895 +       spin_lock_irqsave(&pool->lock, flags);
9896 +       if (new_min_nr < pool->min_nr) {
9897 +               while (pool->curr_nr > new_min_nr) {
9898 +                       element = remove_element(pool);
9899 +                       spin_unlock_irqrestore(&pool->lock, flags);
9900 +                       pool->free(element, pool->pool_data);
9901 +                       spin_lock_irqsave(&pool->lock, flags);
9902 +               }
9903 +               pool->min_nr = new_min_nr;
9904 +               goto out_unlock;
9905 +       }
9906 +       spin_unlock_irqrestore(&pool->lock, flags);
9907 +
9908 +       /* Grow the pool */
9909 +       new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
9910 +       if (!new_elements)
9911 +               return -ENOMEM;
9912 +
9913 +       spin_lock_irqsave(&pool->lock, flags);
9914 +       memcpy(new_elements, pool->elements,
9915 +                       pool->curr_nr * sizeof(*new_elements));
9916 +       kfree(pool->elements);
9917 +       pool->elements = new_elements;
9918 +       pool->min_nr = new_min_nr;
9919 +
9920 +       while (pool->curr_nr < pool->min_nr) {
9921 +               spin_unlock_irqrestore(&pool->lock, flags);
9922 +               element = pool->alloc(gfp_mask, pool->pool_data);
9923 +               if (!element)
9924 +                       goto out;
9925 +               spin_lock_irqsave(&pool->lock, flags);
9926 +               if (pool->curr_nr < pool->min_nr)
9927 +                       add_element(pool, element);
9928 +               else
9929 +                       pool->free(element, pool->pool_data); /* Raced */
9930 +       }
9931 +out_unlock:
9932 +       spin_unlock_irqrestore(&pool->lock, flags);
9933 +out:
9934 +       return 0;
9935 +}
9936 +
9937 +/**
9938 + * mempool_destroy - deallocate a memory pool
9939 + * @pool:      pointer to the memory pool which was allocated via
9940 + *             mempool_create().
9941 + *
9942 + * this function only sleeps if the free_fn() function sleeps. The caller
9943 + * has to guarantee that all elements have been returned to the pool (ie:
9944 + * freed) prior to calling mempool_destroy().
9945 + */
9946 +void mempool_destroy(mempool_t *pool)
9947 +{
9948 +       if (pool->curr_nr != pool->min_nr)
9949 +               BUG();          /* There were outstanding elements */
9950 +       free_pool(pool);
9951 +}
9952 +
9953 +/**
9954 + * mempool_alloc - allocate an element from a specific memory pool
9955 + * @pool:      pointer to the memory pool which was allocated via
9956 + *             mempool_create().
9957 + * @gfp_mask:  the usual allocation bitmask.
9958 + *
9959 + * this function only sleeps if the alloc_fn function sleeps or
9960 + * returns NULL. Note that due to preallocation, this function
9961 + * *never* fails when called from process contexts. (it might
9962 + * fail if called from an IRQ context.)
9963 + */
9964 +void * mempool_alloc(mempool_t *pool, int gfp_mask)
9965 +{
9966 +       void *element;
9967 +       unsigned long flags;
9968 +       int curr_nr;
9969 +       DECLARE_WAITQUEUE(wait, current);
9970 +       int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
9971 +
9972 +repeat_alloc:
9973 +       element = pool->alloc(gfp_nowait, pool->pool_data);
9974 +       if (likely(element != NULL))
9975 +               return element;
9976 +
9977 +       /*
9978 +        * If the pool is less than 50% full then try harder
9979 +        * to allocate an element:
9980 +        */
9981 +       if ((gfp_mask != gfp_nowait) && (pool->curr_nr <= pool->min_nr/2)) {
9982 +               element = pool->alloc(gfp_mask, pool->pool_data);
9983 +               if (likely(element != NULL))
9984 +                       return element;
9985 +       }
9986 +
9987 +       /*
9988 +        * Kick the VM at this point.
9989 +        */
9990 +       wakeup_bdflush();
9991 +
9992 +       spin_lock_irqsave(&pool->lock, flags);
9993 +       if (likely(pool->curr_nr)) {
9994 +               element = remove_element(pool);
9995 +               spin_unlock_irqrestore(&pool->lock, flags);
9996 +               return element;
9997 +       }
9998 +       spin_unlock_irqrestore(&pool->lock, flags);
9999 +
10000 +       /* We must not sleep in the GFP_ATOMIC case */
10001 +       if (gfp_mask == gfp_nowait)
10002 +               return NULL;
10003 +
10004 +       run_task_queue(&tq_disk);
10005 +
10006 +       add_wait_queue_exclusive(&pool->wait, &wait);
10007 +       set_task_state(current, TASK_UNINTERRUPTIBLE);
10008 +
10009 +       spin_lock_irqsave(&pool->lock, flags);
10010 +       curr_nr = pool->curr_nr;
10011 +       spin_unlock_irqrestore(&pool->lock, flags);
10012 +
10013 +       if (!curr_nr)
10014 +               schedule();
10015 +
10016 +       current->state = TASK_RUNNING;
10017 +       remove_wait_queue(&pool->wait, &wait);
10018 +
10019 +       goto repeat_alloc;
10020 +}
10021 +
10022 +/**
10023 + * mempool_free - return an element to the pool.
10024 + * @element:   pool element pointer.
10025 + * @pool:      pointer to the memory pool which was allocated via
10026 + *             mempool_create().
10027 + *
10028 + * this function only sleeps if the free_fn() function sleeps.
10029 + */
10030 +void mempool_free(void *element, mempool_t *pool)
10031 +{
10032 +       unsigned long flags;
10033 +
10034 +       if (pool->curr_nr < pool->min_nr) {
10035 +               spin_lock_irqsave(&pool->lock, flags);
10036 +               if (pool->curr_nr < pool->min_nr) {
10037 +                       add_element(pool, element);
10038 +                       spin_unlock_irqrestore(&pool->lock, flags);
10039 +                       wake_up(&pool->wait);
10040 +                       return;
10041 +               }
10042 +               spin_unlock_irqrestore(&pool->lock, flags);
10043 +       }
10044 +       pool->free(element, pool->pool_data);
10045 +}
10046 +
10047 +/*
10048 + * A commonly used alloc and free fn.
10049 + */
10050 +void *mempool_alloc_slab(int gfp_mask, void *pool_data)
10051 +{
10052 +       kmem_cache_t *mem = (kmem_cache_t *) pool_data;
10053 +       return kmem_cache_alloc(mem, gfp_mask);
10054 +}
10055 +
10056 +void mempool_free_slab(void *element, void *pool_data)
10057 +{
10058 +       kmem_cache_t *mem = (kmem_cache_t *) pool_data;
10059 +       kmem_cache_free(mem, element);
10060 +}
10061 +
10062 +
10063 +EXPORT_SYMBOL(mempool_create);
10064 +EXPORT_SYMBOL(mempool_resize);
10065 +EXPORT_SYMBOL(mempool_destroy);
10066 +EXPORT_SYMBOL(mempool_alloc);
10067 +EXPORT_SYMBOL(mempool_free);
10068 +EXPORT_SYMBOL(mempool_alloc_slab);
10069 +EXPORT_SYMBOL(mempool_free_slab);
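The mempool_resize() documentation above is easiest to picture with a caller that grows the reserve as clients attach and shrinks it as they detach.  The sketch below is illustrative only and not part of the patch; the pool, the client counter and PAGES_PER_CLIENT are invented, and, as the comment above says, the caller must still prevent mempool_destroy() from running concurrently.

#include <linux/mempool.h>
#include <linux/mm.h>

#define PAGES_PER_CLIENT 32

static mempool_t *shared_pool;  /* created elsewhere with mempool_create() */
static int clients = 1;         /* the pool always keeps one client's worth */

static int client_attach(void)
{
        int r = mempool_resize(shared_pool,
                               (clients + 1) * PAGES_PER_CLIENT, GFP_KERNEL);
        if (!r)
                clients++;
        return r;
}

static void client_detach(void)
{
        if (clients > 1) {
                clients--;
                /* Shrinking frees the excess preallocated elements
                 * straight away and cannot fail. */
                mempool_resize(shared_pool, clients * PAGES_PER_CLIENT,
                               GFP_KERNEL);
        }
}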
10070 Only every other metadata area was being read when loading a snapshot!
10071 [Kevin Corry]
10072 --- diff/drivers/md/dm-exception-store.c        2003-10-16 10:44:23.000000000 +0100
10073 +++ source/drivers/md/dm-exception-store.c      2003-10-16 10:44:27.000000000 +0100
10074 @@ -369,8 +369,6 @@
10075                 r = insert_exceptions(ps, &full);
10076                 if (r)
10077                         return r;
10078 -
10079 -               area++;
10080         }
10081  
10082         return 0;
10083 Don't initialise static variables to zero/NULL.
10084 --- diff/drivers/md/kcopyd.c    2003-10-16 10:44:23.000000000 +0100
10085 +++ source/drivers/md/kcopyd.c  2003-10-16 10:44:31.000000000 +0100
10086 @@ -183,8 +183,8 @@
10087  /* FIXME: this should scale with the number of pages */
10088  #define MIN_JOBS 512
10089  
10090 -static kmem_cache_t *_job_cache = NULL;
10091 -static mempool_t *_job_pool = NULL;
10092 +static kmem_cache_t *_job_cache;
10093 +static mempool_t *_job_pool;
10094  
10095  /*
10096   * We maintain three lists of jobs:
10097 Change resume/suspend to do_resume/do_suspend to avoid name clash.
10098 --- diff/drivers/md/dm-ioctl.c  2003-10-16 10:44:23.000000000 +0100
10099 +++ source/drivers/md/dm-ioctl.c        2003-10-16 10:44:34.000000000 +0100
10100 @@ -593,7 +593,7 @@
10101         return dm_hash_rename(param->name, new_name);
10102  }
10103  
10104 -static int suspend(struct dm_ioctl *param)
10105 +static int do_suspend(struct dm_ioctl *param)
10106  {
10107         int r = 0;
10108         struct mapped_device *md;
10109 @@ -612,7 +612,7 @@
10110         return r;
10111  }
10112  
10113 -static int resume(struct dm_ioctl *param)
10114 +static int do_resume(struct dm_ioctl *param)
10115  {
10116         int r = 0;
10117         struct hash_cell *hc;
10118 @@ -675,9 +675,9 @@
10119  static int dev_suspend(struct dm_ioctl *param, size_t param_size)
10120  {
10121         if (param->flags & DM_SUSPEND_FLAG)
10122 -               return suspend(param);
10123 +               return do_suspend(param);
10124  
10125 -       return resume(param);
10126 +       return do_resume(param);
10127  }
10128  
10129  /*
10130 Hello all,
10131
10132 The current version of the VFS locking patch adds a new semaphore to
10133 fs/super.c.  This is used to make sure a filesystem does not get mounted
10134 on a logical volume while a snapshot is being taken.  It also results in
10135 all mounts on the system being serialized, and isn't in line with the
10136 VFS locking scheme in general.
10137
10138 I've been meaning to fix it forever; here's an updated version that adds
10139 a super block with s->s_dev set to the source volume if nothing is
10140 currently mounted on it.  This allows me to use the s_umount
10141 semaphore in the super block to keep things safe, which is cleaner
10142 overall.  
10143
10144 The other benefit over the existing patch is that this one has zero footprint
10145 outside the lockfs calls.  You're only running new code if you take a
10146 snapshot.
10147
10148 I've done some testing here, but wanted to let LVM people review it
10149 before going further.  Patch is below against 2.4.21-rc6.  
10150
10151 This provides zero new functionality over the existing VFS locking
10152 patch, and is experimental.  Do not apply this on production servers,
10153 and do not apply unless you want to help test.
10154
10155 -chris
10156
10157 ===== drivers/md/lvm.c 1.19 vs edited =====
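The calling convention the mail describes, and which the dm-snapshot and lvm hunks below adopt, is a simple bracket around snapshot creation.  The sketch below is illustrative only and not part of the patch; dev stands for the kdev_t of the origin volume.

#include <linux/fs.h>

static int quiesce_origin_example(kdev_t dev)
{
        int r;

        /* Flush dirty buffers and inodes and lock the filesystem; if
         * nothing is mounted on dev, a stub "lockfs" super is inserted
         * so a mount cannot slip in while the snapshot is being taken. */
        r = fsync_dev_lockfs(dev);

        /* ... create the snapshot while the origin is quiescent ... */

        /* Unlock the filesystem (and drop the stub super, if any). */
        unlockfs(dev);

        return r;
}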
10158 --- diff/drivers/md/dm-snapshot.c       2003-10-16 10:44:23.000000000 +0100
10159 +++ source/drivers/md/dm-snapshot.c     2003-10-16 10:44:38.000000000 +0100
10160 @@ -525,7 +525,7 @@
10161         }
10162  
10163         /* Flush IO to the origin device */
10164 -       fsync_dev(s->origin->dev);
10165 +       fsync_dev_lockfs(s->origin->dev);
10166  
10167         /* Add snapshot to the list of snapshots for this origin */
10168         if (register_snapshot(s)) {
10169 @@ -539,6 +539,7 @@
10170  
10171   bad6:
10172         kcopyd_client_destroy(s->kcopyd_client);
10173 +       unlockfs(s->origin->dev);
10174  
10175   bad5:
10176         s->store.destroy(&s->store);
10177 --- diff/drivers/md/lvm.c       2003-10-10 23:39:06.000000000 +0100
10178 +++ source/drivers/md/lvm.c     2003-10-16 10:44:38.000000000 +0100
10179 @@ -236,9 +236,6 @@
10180  #define DEVICE_OFF(device)
10181  #define LOCAL_END_REQUEST
10182  
10183 -/* lvm_do_lv_create calls fsync_dev_lockfs()/unlockfs() */
10184 -/* #define     LVM_VFS_ENHANCEMENT */
10185 -
10186  #include <linux/config.h>
10187  #include <linux/module.h>
10188  #include <linux/kernel.h>
10189 @@ -2250,12 +2247,8 @@
10190         if (lv_ptr->lv_access & LV_SNAPSHOT) {
10191                 lv_t *org = lv_ptr->lv_snapshot_org, *last;
10192  
10193 -               /* sync the original logical volume */
10194 -               fsync_dev(org->lv_dev);
10195 -#ifdef LVM_VFS_ENHANCEMENT
10196                 /* VFS function call to sync and lock the filesystem */
10197                 fsync_dev_lockfs(org->lv_dev);
10198 -#endif
10199  
10200                 down_write(&org->lv_lock);
10201                 org->lv_access |= LV_SNAPSHOT_ORG;
10202 @@ -2281,11 +2274,9 @@
10203         else
10204                 set_device_ro(lv_ptr->lv_dev, 1);
10205  
10206 -#ifdef LVM_VFS_ENHANCEMENT
10207  /* VFS function call to unlock the filesystem */
10208         if (lv_ptr->lv_access & LV_SNAPSHOT)
10209                 unlockfs(lv_ptr->lv_snapshot_org->lv_dev);
10210 -#endif
10211  
10212         lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].de =
10213             lvm_fs_create_lv(vg_ptr, lv_ptr);
10214 --- diff/fs/buffer.c    2003-10-16 10:44:23.000000000 +0100
10215 +++ source/fs/buffer.c  2003-10-16 10:44:38.000000000 +0100
10216 @@ -383,6 +383,34 @@
10217         fsync_dev(dev);
10218  }
10219  
10220 +int fsync_dev_lockfs(kdev_t dev)
10221 +{
10222 +       /* you are not allowed to try locking all the filesystems
10223 +       ** on the system, your chances of getting through without
10224 +       ** on the system; your chances of getting through without
10225 +       */
10226 +       if (!dev)
10227 +               return fsync_dev(dev) ;
10228 +
10229 +       sync_buffers(dev, 0);
10230 +
10231 +       lock_kernel();
10232 +       /* note, the FS might need to start transactions to 
10233 +       ** sync the inodes, or the quota, no locking until
10234 +       ** sync the inodes or the quota; no locking until
10235 +       */
10236 +       sync_inodes(dev);
10237 +       DQUOT_SYNC(dev);
10238 +       /* if inodes or quotas could be dirtied during the
10239 +       ** sync_supers_lockfs call, the FS is responsible for getting
10240 +       ** them on disk, without deadlocking against the lock
10241 +       */
10242 +       sync_supers_lockfs(dev) ;
10243 +       unlock_kernel();
10244 +
10245 +       return sync_buffers(dev, 1) ;
10246 +}
10247 +
10248  asmlinkage long sys_sync(void)
10249  {
10250         fsync_dev(0);
10251 --- diff/fs/reiserfs/super.c    2003-08-26 13:50:12.000000000 +0100
10252 +++ source/fs/reiserfs/super.c  2003-10-16 10:44:38.000000000 +0100
10253 @@ -73,7 +73,7 @@
10254      reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
10255      journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
10256      reiserfs_block_writes(&th) ;
10257 -    journal_end(&th, s, 1) ;
10258 +    journal_end_sync(&th, s, 1) ;
10259    }
10260    s->s_dirt = dirty;
10261    unlock_kernel() ;
10262 --- diff/fs/super.c     2003-08-26 13:50:12.000000000 +0100
10263 +++ source/fs/super.c   2003-10-16 10:44:38.000000000 +0100
10264 @@ -39,6 +39,12 @@
10265  spinlock_t sb_lock = SPIN_LOCK_UNLOCKED;
10266  
10267  /*
10268 + * stub of a filesystem used to make sure an FS isn't mounted
10269 + * in the middle of a lockfs call
10270 + */
10271 +static DECLARE_FSTYPE_DEV(lockfs_fs_type, "lockfs", NULL);
10272 +
10273 +/*
10274   * Handling of filesystem drivers list.
10275   * Rules:
10276   *     Inclusion to/removals from/scanning of list are protected by spinlock.
10277 @@ -436,6 +442,25 @@
10278         put_super(sb);
10279  }
10280  
10281 +static void write_super_lockfs(struct super_block *sb)
10282 +{
10283 +       lock_super(sb);
10284 +       if (sb->s_root && sb->s_op) {
10285 +               if (sb->s_dirt && sb->s_op->write_super)
10286 +                       sb->s_op->write_super(sb);
10287 +               if (sb->s_op->write_super_lockfs)
10288 +                       sb->s_op->write_super_lockfs(sb);
10289 +       }
10290 +       unlock_super(sb);
10291 +
10292 +       /* 
10293 +        * if no lockfs call is provided, use the sync_fs call instead.
10294 +        * this must be done without the super lock held
10295 +        */
10296 +       if (!sb->s_op->write_super_lockfs && sb->s_op->sync_fs)
10297 +               sb->s_op->sync_fs(sb);
10298 +}
10299 +
10300  static inline void write_super(struct super_block *sb)
10301  {
10302         lock_super(sb);
10303 @@ -483,6 +508,119 @@
10304         spin_unlock(&sb_lock);
10305  }
10306  
10307 +static struct super_block *find_super_for_lockfs(kdev_t dev)
10308 +{
10309 +       struct super_block *lockfs_sb = alloc_super();
10310 +       struct super_block * s;
10311 +
10312 +       if (!dev)
10313 +               return NULL;
10314 +restart:
10315 +       spin_lock(&sb_lock);
10316 +       s = find_super(dev);
10317 +       if (s) {
10318 +               spin_unlock(&sb_lock);
10319 +               down_read(&s->s_umount);
10320 +               if (s->s_root) {
10321 +                       destroy_super(lockfs_sb);
10322 +                       return s;
10323 +               }
10324 +               drop_super(s);
10325 +               goto restart;
10326 +       }
10327 +       /* if (s) we either return or goto, so we know s == NULL here.
10328 +        * At this point, there are no mounted filesystems on this device,
10329 +        * so we pretend to mount one.
10330 +        */
10331 +       if (!lockfs_sb) {
10332 +               spin_unlock(&sb_lock);
10333 +               return NULL;
10334 +       }
10335 +       s = lockfs_sb;
10336 +       s->s_dev = dev;
10337 +       if (lockfs_fs_type.fs_supers.prev == NULL)
10338 +               INIT_LIST_HEAD(&lockfs_fs_type.fs_supers);
10339 +       insert_super(s, &lockfs_fs_type);
10340 +       s->s_root = (struct dentry *)1;
10341 +       /* alloc_super gives us a write lock on s_umount; this
10342 +        * way we know there are no concurrent lockfs holders for this dev.  
10343 +        * It allows us to remove the temp super from the list of supers 
10344 +        * immediately when unlockfs is called
10345 +        */
10346 +       return s;
10347 +}
10348 +/*
10349 + * Note: don't check the dirty flag before waiting, we want the lock
10350 + * to happen every time this is called.  dev must be non-zero
10351 + */
10352 +void sync_supers_lockfs(kdev_t dev)
10353 +{
10354 +       struct super_block *sb;
10355 +       sb = find_super_for_lockfs(dev);
10356 +       if (sb) {
10357 +               write_super_lockfs(sb);
10358 +               /* the drop_super is done by unlockfs */
10359 +       }
10360 +}
10361 +
10362 +static void drop_super_lockfs(struct super_block *s)
10363 +{
10364 +       if (s->s_type == &lockfs_fs_type) {
10365 +               struct file_system_type *fs = s->s_type;
10366 +
10367 +               /* 
10368 +                * nobody else is allowed to grab_super() on our temp
10369 +                */
10370 +               if (!deactivate_super(s))
10371 +                       BUG();
10372 +
10373 +               spin_lock(&sb_lock);
10374 +               s->s_root = NULL;
10375 +               list_del(&s->s_list);
10376 +               list_del(&s->s_instances);
10377 +               spin_unlock(&sb_lock);
10378 +
10379 +               up_write(&s->s_umount);
10380 +               put_super(s);
10381 +               put_filesystem(fs);
10382 +       } else
10383 +               drop_super(s);
10384 +}
10385 +
10386 +void unlockfs(kdev_t dev)
10387 +{
10388 +       struct super_block *s;
10389 +       if (!dev)
10390 +               return;
10391 +
10392 +       spin_lock(&sb_lock);
10393 +       s = find_super(dev);
10394 +       if (s) {
10395 +               /* 
10396 +                * find_super and the original lockfs call both incremented
10397 +                * the reference count.  drop one of them
10398 +                */
10399 +               s->s_count--;
10400 +               spin_unlock(&sb_lock);
10401 +               if (s->s_root) {
10402 +                       if (s->s_op->unlockfs)
10403 +                               s->s_op->unlockfs(s);
10404 +                       drop_super_lockfs(s);
10405 +                       goto out;
10406 +               } else {
10407 +                       printk("unlockfs: no s_root, dev %s\n", kdevname(dev));
10408 +                       BUG();
10409 +               }
10410 +       } else {
10411 +               printk("unlockfs: no super found, dev %s\n", kdevname(dev));
10412 +               BUG();
10413 +       }
10414 +
10415 +       spin_unlock(&sb_lock);
10416 +out:
10417 +       return;
10418 +}
10419 +
10420  /**
10421   *     get_super       -       get the superblock of a device
10422   *     @dev: device to get the superblock for
10423 --- diff/include/linux/fs.h     2003-10-16 10:44:23.000000000 +0100
10424 +++ source/include/linux/fs.h   2003-10-16 10:44:38.000000000 +0100
10425 @@ -1273,6 +1273,7 @@
10426  extern int sync_buffers(kdev_t, int);
10427  extern void sync_dev(kdev_t);
10428  extern int fsync_dev(kdev_t);
10429 +extern int fsync_dev_lockfs(kdev_t);
10430  extern int fsync_super(struct super_block *);
10431  extern int fsync_no_super(kdev_t);
10432  extern void sync_inodes_sb(struct super_block *);
10433 @@ -1290,6 +1291,8 @@
10434  extern int filemap_fdatasync(struct address_space *);
10435  extern int filemap_fdatawait(struct address_space *);
10436  extern void sync_supers(kdev_t dev, int wait);
10437 +extern void sync_supers_lockfs(kdev_t);
10438 +extern void unlockfs(kdev_t);
10439  extern int bmap(struct inode *, int);
10440  extern int notify_change(struct dentry *, struct iattr *);
10441  extern int permission(struct inode *, int);
10442 --- diff/kernel/ksyms.c 2003-10-16 10:44:23.000000000 +0100
10443 +++ source/kernel/ksyms.c       2003-10-16 10:44:38.000000000 +0100
10444 @@ -189,6 +189,8 @@
10445  EXPORT_SYMBOL(invalidate_inode_pages);
10446  EXPORT_SYMBOL(truncate_inode_pages);
10447  EXPORT_SYMBOL(fsync_dev);
10448 +EXPORT_SYMBOL(fsync_dev_lockfs);
10449 +EXPORT_SYMBOL(unlockfs);
10450  EXPORT_SYMBOL(fsync_no_super);
10451  EXPORT_SYMBOL(permission);
10452  EXPORT_SYMBOL(vfs_permission);
10453 Missing parts of the previous vfs patch (merge).
10454 --- diff/drivers/md/dm-snapshot.c       2003-10-16 10:44:38.000000000 +0100
10455 +++ source/drivers/md/dm-snapshot.c     2003-10-16 10:44:41.000000000 +0100
10456 @@ -533,13 +533,14 @@
10457                 ti->error = "Cannot register snapshot origin";
10458                 goto bad6;
10459         }
10460 +       unlockfs(s->origin->dev);
10461  
10462         ti->private = s;
10463         return 0;
10464  
10465   bad6:
10466 -       kcopyd_client_destroy(s->kcopyd_client);
10467         unlockfs(s->origin->dev);
10468 +       kcopyd_client_destroy(s->kcopyd_client);
10469  
10470   bad5:
10471         s->store.destroy(&s->store);
10472 Lift vfs locking to dm_suspend/resume.
10473 --- diff/drivers/md/dm-snapshot.c       2003-10-16 10:44:41.000000000 +0100
10474 +++ source/drivers/md/dm-snapshot.c     2003-10-16 10:44:44.000000000 +0100
10475 @@ -524,22 +524,17 @@
10476                 goto bad5;
10477         }
10478  
10479 -       /* Flush IO to the origin device */
10480 -       fsync_dev_lockfs(s->origin->dev);
10481 -
10482         /* Add snapshot to the list of snapshots for this origin */
10483         if (register_snapshot(s)) {
10484                 r = -EINVAL;
10485                 ti->error = "Cannot register snapshot origin";
10486                 goto bad6;
10487         }
10488 -       unlockfs(s->origin->dev);
10489  
10490         ti->private = s;
10491         return 0;
10492  
10493   bad6:
10494 -       unlockfs(s->origin->dev);
10495         kcopyd_client_destroy(s->kcopyd_client);
10496  
10497   bad5:
10498 --- diff/drivers/md/dm.c        2003-10-16 10:44:23.000000000 +0100
10499 +++ source/drivers/md/dm.c      2003-10-16 10:44:44.000000000 +0100
10500 @@ -951,13 +951,23 @@
10501         int r = 0;
10502         DECLARE_WAITQUEUE(wait, current);
10503  
10504 -       down_write(&md->lock);
10505 +       /* Flush IO to the origin device */
10506 +       down_read(&md->lock);
10507 +       if (test_bit(DMF_BLOCK_IO, &md->flags)) {
10508 +               up_read(&md->lock);
10509 +               return -EINVAL;
10510 +       }
10511 +
10512 +       fsync_dev_lockfs(md->dev);
10513 +       up_read(&md->lock);
10514 +
10515  
10516         /*
10517 -        * First we set the BLOCK_IO flag so no more ios will be
10518 -        * mapped.
10519 +        * Set the BLOCK_IO flag so no more ios will be mapped.
10520          */
10521 +       down_write(&md->lock);
10522         if (test_bit(DMF_BLOCK_IO, &md->flags)) {
10523 +               unlockfs(md->dev);
10524                 up_write(&md->lock);
10525                 return -EINVAL;
10526         }
10527 @@ -986,6 +996,7 @@
10528  
10529         /* did we flush everything ? */
10530         if (atomic_read(&md->pending)) {
10531 +               unlockfs(md->dev);
10532                 clear_bit(DMF_BLOCK_IO, &md->flags);
10533                 r = -EINTR;
10534         } else {
10535 @@ -1017,6 +1028,7 @@
10536         md->deferred = NULL;
10537         up_write(&md->lock);
10538  
10539 +       unlockfs(md->dev);
10540         flush_deferred_io(def);
10541         run_task_queue(&tq_disk);
10542  
10543 Correct error message when starting a dm-daemon thread.
10544 --- diff/drivers/md/dm-daemon.c 2003-10-16 10:44:23.000000000 +0100
10545 +++ source/drivers/md/dm-daemon.c       2003-10-16 10:44:48.000000000 +0100
10546 @@ -81,7 +81,7 @@
10547         down(&dd->start_lock);
10548         pid = kernel_thread(daemon, dd, 0);
10549         if (pid <= 0) {
10550 -               DMERR("Failed to start kcopyd thread");
10551 +               DMERR("Failed to start %s thread", name);
10552                 return -EAGAIN;
10553         }
10554  
10555 When multiple load ioctls are issued, the reference count on older
10556 'new_tables' wasn't being dropped.  [Christophe Saout]
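A standalone userspace sketch of the pattern this fix restores (hypothetical names, not the dm API): when a reference-counted table is replaced in a slot such as hc->new_map, the slot's old reference must be put or it leaks.

#include <stdio.h>
#include <stdlib.h>

struct table { int holders; };

static struct table *table_create(void)
{
        struct table *t = calloc(1, sizeof(*t));
        if (t)
                t->holders = 1;         /* creator's reference */
        return t;
}

static void table_put(struct table *t)
{
        if (t && --t->holders == 0)
                free(t);
}

/* Slot holding the not-yet-active table, analogous to hash_cell->new_map. */
static struct table *new_map;

static void load_table(struct table *t)
{
        if (new_map)                    /* the check the patch adds:   */
                table_put(new_map);     /* drop the superseded table   */
        new_map = t;
}

int main(void)
{
        load_table(table_create());
        load_table(table_create());     /* without the put, the first table leaks */
        table_put(new_map);
        new_map = NULL;
        return 0;
}
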
10557 --- diff/drivers/md/dm-ioctl.c  2003-10-16 10:44:34.000000000 +0100
10558 +++ source/drivers/md/dm-ioctl.c        2003-10-16 10:44:51.000000000 +0100
10559 @@ -816,6 +816,8 @@
10560                 return -ENXIO;
10561         }
10562  
10563 +       if (hc->new_map)
10564 +               dm_table_put(hc->new_map);
10565         hc->new_map = t;
10566         param->flags |= DM_INACTIVE_PRESENT_FLAG;
10567  
10568 Stop labelling dm as 'experimental'.
10569 --- diff/drivers/md/Config.in   2003-10-16 10:44:23.000000000 +0100
10570 +++ source/drivers/md/Config.in 2003-10-16 10:44:54.000000000 +0100
10571 @@ -14,9 +14,7 @@
10572  dep_tristate '  Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
10573  
10574  dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
10575 -if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
10576 -   dep_tristate ' Device-mapper support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM $CONFIG_MD
10577 -   dep_tristate '  Mirror (RAID-1) support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM
10578 -fi
10579 +dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD
10580 +dep_tristate '  Mirror (RAID-1) support' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM
10581  
10582  endmenu
10583 Move retrieve_status up so dev_wait() can use it.
10584 --- diff/drivers/md/dm-ioctl.c  2003-10-16 10:44:51.000000000 +0100
10585 +++ source/drivers/md/dm-ioctl.c        2003-10-16 10:44:58.000000000 +0100
10586 @@ -699,6 +699,69 @@
10587  }
10588  
10589  /*
10590 + * Build up the status struct for each target
10591 + */
10592 +static void retrieve_status(struct dm_table *table, struct dm_ioctl *param,
10593 +                           size_t param_size)
10594 +{
10595 +       unsigned int i, num_targets;
10596 +       struct dm_target_spec *spec;
10597 +       char *outbuf, *outptr;
10598 +       status_type_t type;
10599 +       size_t remaining, len, used = 0;
10600 +
10601 +       outptr = outbuf = get_result_buffer(param, param_size, &len);
10602 +
10603 +       if (param->flags & DM_STATUS_TABLE_FLAG)
10604 +               type = STATUSTYPE_TABLE;
10605 +       else
10606 +               type = STATUSTYPE_INFO;
10607 +
10608 +       /* Get all the target info */
10609 +       num_targets = dm_table_get_num_targets(table);
10610 +       for (i = 0; i < num_targets; i++) {
10611 +               struct dm_target *ti = dm_table_get_target(table, i);
10612 +
10613 +               remaining = len - (outptr - outbuf);
10614 +               if (remaining < sizeof(struct dm_target_spec)) {
10615 +                       param->flags |= DM_BUFFER_FULL_FLAG;
10616 +                       break;
10617 +               }
10618 +
10619 +               spec = (struct dm_target_spec *) outptr;
10620 +
10621 +               spec->status = 0;
10622 +               spec->sector_start = ti->begin;
10623 +               spec->length = ti->len;
10624 +               strncpy(spec->target_type, ti->type->name,
10625 +                       sizeof(spec->target_type));
10626 +
10627 +               outptr += sizeof(struct dm_target_spec);
10628 +               remaining = len - (outptr - outbuf);
10629 +
10630 +               /* Get the status/table string from the target driver */
10631 +               if (ti->type->status) {
10632 +                       if (ti->type->status(ti, type, outptr, remaining)) {
10633 +                               param->flags |= DM_BUFFER_FULL_FLAG;
10634 +                               break;
10635 +                       }
10636 +               } else
10637 +                       outptr[0] = '\0';
10638 +
10639 +               outptr += strlen(outptr) + 1;
10640 +               used = param->data_start + (outptr - outbuf);
10641 +
10642 +               align_ptr(outptr);
10643 +               spec->next = outptr - outbuf;
10644 +       }
10645 +
10646 +       if (used)
10647 +               param->data_size = used;
10648 +
10649 +       param->target_count = num_targets;
10650 +}
10651 +
10652 +/*
10653   * Wait for a device to report an event
10654   */
10655  static int dev_wait(struct dm_ioctl *param, size_t param_size)
10656 @@ -919,69 +982,6 @@
10657  }
10658  
10659  /*
10660 - * Build up the status struct for each target
10661 - */
10662 -static void retrieve_status(struct dm_table *table, struct dm_ioctl *param,
10663 -                           size_t param_size)
10664 -{
10665 -       unsigned int i, num_targets;
10666 -       struct dm_target_spec *spec;
10667 -       char *outbuf, *outptr;
10668 -       status_type_t type;
10669 -       size_t remaining, len, used = 0;
10670 -
10671 -       outptr = outbuf = get_result_buffer(param, param_size, &len);
10672 -
10673 -       if (param->flags & DM_STATUS_TABLE_FLAG)
10674 -               type = STATUSTYPE_TABLE;
10675 -       else
10676 -               type = STATUSTYPE_INFO;
10677 -
10678 -       /* Get all the target info */
10679 -       num_targets = dm_table_get_num_targets(table);
10680 -       for (i = 0; i < num_targets; i++) {
10681 -               struct dm_target *ti = dm_table_get_target(table, i);
10682 -
10683 -               remaining = len - (outptr - outbuf);
10684 -               if (remaining < sizeof(struct dm_target_spec)) {
10685 -                       param->flags |= DM_BUFFER_FULL_FLAG;
10686 -                       break;
10687 -               }
10688 -
10689 -               spec = (struct dm_target_spec *) outptr;
10690 -
10691 -               spec->status = 0;
10692 -               spec->sector_start = ti->begin;
10693 -               spec->length = ti->len;
10694 -               strncpy(spec->target_type, ti->type->name,
10695 -                       sizeof(spec->target_type));
10696 -
10697 -               outptr += sizeof(struct dm_target_spec);
10698 -               remaining = len - (outptr - outbuf);
10699 -
10700 -               /* Get the status/table string from the target driver */
10701 -               if (ti->type->status) {
10702 -                       if (ti->type->status(ti, type, outptr, remaining)) {
10703 -                               param->flags |= DM_BUFFER_FULL_FLAG;
10704 -                               break;
10705 -                       }
10706 -               } else
10707 -                       outptr[0] = '\0';
10708 -
10709 -               outptr += strlen(outptr) + 1;
10710 -               used = param->data_start + (outptr - outbuf);
10711 -
10712 -               align_ptr(outptr);
10713 -               spec->next = outptr - outbuf;
10714 -       }
10715 -
10716 -       if (used)
10717 -               param->data_size = used;
10718 -
10719 -       param->target_count = num_targets;
10720 -}
10721 -
10722 -/*
10723   * Return the status of a device as a text string for each
10724   * target.
10725   */
10726 dev_wait was meant to return table status, not dev status.  [Alasdair Kergon]
10727 --- diff/drivers/md/dm-ioctl.c  2003-10-16 10:44:58.000000000 +0100
10728 +++ source/drivers/md/dm-ioctl.c        2003-10-16 10:45:01.000000000 +0100
10729 @@ -768,6 +768,7 @@
10730  {
10731         int r;
10732         struct mapped_device *md;
10733 +       struct dm_table *table;
10734         DECLARE_WAITQUEUE(wq, current);
10735  
10736         md = find_device(param);
10737 @@ -790,7 +791,16 @@
10738          * him and save an ioctl.
10739          */
10740         r = __dev_status(md, param);
10741 +       if (r)
10742 +               goto out;
10743 +
10744 +       table = dm_get_table(md);
10745 +       if (table) {
10746 +               retrieve_status(table, param, param_size);
10747 +               dm_table_put(table);
10748 +       }
10749  
10750 + out:
10751         dm_put(md);
10752         return r;
10753  }
10754 Fix error message when the linear target gets handed more than 2 arguments.
10755 [Alasdair Kergon]
10756 --- diff/drivers/md/dm-linear.c 2003-10-16 10:44:23.000000000 +0100
10757 +++ source/drivers/md/dm-linear.c       2003-10-16 10:45:04.000000000 +0100
10758 @@ -27,7 +27,7 @@
10759         struct linear_c *lc;
10760  
10761         if (argc != 2) {
10762 -               ti->error = "dm-linear: Not enough arguments";
10763 +               ti->error = "dm-linear: Invalid argument count";
10764                 return -EINVAL;
10765         }
10766  
10767 Support an arbitrary number of target parameters.  [Alasdair Kergon]
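A standalone userspace sketch of the growth strategy the new split_args() adopts: the argv array starts at a fixed size and doubles whenever it fills, so targets are no longer limited to a fixed 32 parameters. The splitting below uses strtok() for brevity; the kernel code splits the buffer in place and handles escapes.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char **realloc_argv(unsigned *array_size, char **old_argv)
{
        unsigned new_size = *array_size ? *array_size * 2 : 64;
        char **argv = malloc(new_size * sizeof(*argv));

        if (argv) {
                if (old_argv)
                        memcpy(argv, old_argv, *array_size * sizeof(*argv));
                *array_size = new_size;
        }
        free(old_argv);
        return argv;
}

int main(void)
{
        char input[] = "0 409600 linear /dev/sda 0";   /* example parameter string */
        unsigned array_size = 0;
        int argc = 0;
        char **argv = realloc_argv(&array_size, NULL);
        char *tok;

        if (!argv)
                return 1;

        for (tok = strtok(input, " "); tok; tok = strtok(NULL, " ")) {
                /* grow by doubling instead of failing at a fixed limit */
                if ((unsigned)(argc + 1) > array_size) {
                        argv = realloc_argv(&array_size, argv);
                        if (!argv)
                                return 1;
                }
                argv[argc++] = tok;
        }

        printf("argc=%d, first=%s\n", argc, argv[0]);
        free(argv);
        return 0;
}
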
10768 --- diff/drivers/md/dm-table.c  2003-10-16 10:44:23.000000000 +0100
10769 +++ source/drivers/md/dm-table.c        2003-10-16 10:45:07.000000000 +0100
10770 @@ -441,12 +441,36 @@
10771  }
10772  
10773  /*
10774 + * Used to dynamically allocate the arg array.
10775 + */
10776 +static char **realloc_argv(unsigned *array_size, char **old_argv)
10777 +{
10778 +       char **argv;
10779 +       unsigned new_size;
10780 +
10781 +       new_size = *array_size ? *array_size * 2 : 64;
10782 +       argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL);
10783 +       if (argv) {
10784 +               memcpy(argv, old_argv, *array_size * sizeof(*argv));
10785 +               *array_size = new_size;
10786 +       }
10787 +
10788 +       kfree(old_argv);
10789 +       return argv;
10790 +}
10791 +
10792 +/*
10793   * Destructively splits up the argument list to pass to ctr.
10794   */
10795 -static int split_args(int max, int *argc, char **argv, char *input)
10796 +static int split_args(int *argc, char ***argvp, char *input)
10797  {
10798 -       char *start, *end = input, *out;
10799 +       char *start, *end = input, *out, **argv = NULL;
10800 +       unsigned array_size = 0;
10801 +
10802         *argc = 0;
10803 +       argv = realloc_argv(&array_size, argv);
10804 +       if (!argv)
10805 +               return -ENOMEM;
10806  
10807         while (1) {
10808                 start = end;
10809 @@ -475,8 +499,11 @@
10810                 }
10811  
10812                 /* have we already filled the array ? */
10813 -               if ((*argc + 1) > max)
10814 -                       return -EINVAL;
10815 +               if ((*argc + 1) > array_size) {
10816 +                       argv = realloc_argv(&array_size, argv);
10817 +                       if (!argv)
10818 +                               return -ENOMEM;
10819 +               }
10820  
10821                 /* we know this is whitespace */
10822                 if (*end)
10823 @@ -488,6 +515,7 @@
10824                 (*argc)++;
10825         }
10826  
10827 +       *argvp = argv;
10828         return 0;
10829  }
10830  
10831 @@ -495,7 +523,7 @@
10832                         sector_t start, sector_t len, char *params)
10833  {
10834         int r = -EINVAL, argc;
10835 -       char *argv[32];
10836 +       char **argv;
10837         struct dm_target *tgt;
10838  
10839         if ((r = check_space(t)))
10840 @@ -524,13 +552,14 @@
10841                 goto bad;
10842         }
10843  
10844 -       r = split_args(ARRAY_SIZE(argv), &argc, argv, params);
10845 +       r = split_args(&argc, &argv, params);
10846         if (r) {
10847 -               tgt->error = "couldn't split parameters";
10848 +               tgt->error = "couldn't split parameters (insufficient memory)";
10849                 goto bad;
10850         }
10851  
10852         r = tgt->type->ctr(tgt, argc, argv);
10853 +       kfree(argv);
10854         if (r)
10855                 goto bad;
10856  
10857 Fix VFS patch.
10858 --- diff/fs/buffer.c    2003-10-16 10:44:38.000000000 +0100
10859 +++ source/fs/buffer.c  2003-10-16 10:45:11.000000000 +0100
10860 @@ -400,7 +400,7 @@
10861         ** after these are done
10862         */
10863         sync_inodes(dev);
10864 -       DQUOT_SYNC(dev);
10865 +       DQUOT_SYNC_DEV(dev);
10866         /* if inodes or quotas could be dirtied during the
10867         ** sync_supers_lockfs call, the FS is responsible for getting
10868         ** them on disk, without deadlocking against the lock
10869 The ioctl interface always knows how many targets are going to be in
10870 the table, so remove the dynamic array sizing code in dm-table.c.
10871 This fixes a problem with large tables where the dm_target pointer
10872 passed to the target ctr was becoming invalid.
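A userspace sketch of the hazard being removed (illustrative names, not kernel code): if the target array is grown by a reallocating helper while a caller still holds a pointer into the old allocation, that pointer dangles. Sizing the array once from param->target_count, as the hunks below do, means no reallocation can happen while such pointers are in use.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct target { long begin, len; };

/* Grow-by-copy, analogous to the removed alloc_targets(): the old
 * array is freed, so any previously returned pointer becomes stale. */
static struct target *grow(struct target *old, unsigned old_n, unsigned new_n)
{
        struct target *t = calloc(new_n, sizeof(*t));

        if (!t)
                return NULL;
        memcpy(t, old, old_n * sizeof(*t));
        free(old);                      /* invalidates pointers into 'old' */
        return t;
}

int main(void)
{
        unsigned allocated = 1, used = 0;
        struct target *targets = calloc(allocated, sizeof(*targets));
        struct target *ti;

        if (!targets)
                return 1;
        ti = &targets[used++];          /* pointer handed to a "ctr" */

        allocated *= 2;
        targets = grow(targets, used, allocated);

        /* 'ti' still points into the freed block - using it is undefined.
         * Allocating all slots up front avoids this reallocation entirely. */
        printf("stale pointer %p vs new array %p\n", (void *) ti, (void *) targets);
        return 0;
}
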
10873 --- diff/drivers/md/dm-ioctl.c  2003-10-16 10:45:01.000000000 +0100
10874 +++ source/drivers/md/dm-ioctl.c        2003-10-16 10:45:14.000000000 +0100
10875 @@ -871,7 +871,7 @@
10876         struct hash_cell *hc;
10877         struct dm_table *t;
10878  
10879 -       r = dm_table_create(&t, get_mode(param));
10880 +       r = dm_table_create(&t, get_mode(param), param->target_count);
10881         if (r)
10882                 return r;
10883  
10884 --- diff/drivers/md/dm-table.c  2003-10-16 10:45:07.000000000 +0100
10885 +++ source/drivers/md/dm-table.c        2003-10-16 10:45:14.000000000 +0100
10886 @@ -112,42 +112,7 @@
10887         return 0;
10888  }
10889  
10890 -/*
10891 - * highs, and targets are managed as dynamic arrays during a
10892 - * table load.
10893 - */
10894 -static int alloc_targets(struct dm_table *t, unsigned int num)
10895 -{
10896 -       sector_t *n_highs;
10897 -       struct dm_target *n_targets;
10898 -       int n = t->num_targets;
10899 -
10900 -       /*
10901 -        * Allocate both the target array and offset array at once.
10902 -        */
10903 -       n_highs = (sector_t *) vcalloc(sizeof(struct dm_target) +
10904 -                                      sizeof(sector_t), num);
10905 -       if (!n_highs)
10906 -               return -ENOMEM;
10907 -
10908 -       n_targets = (struct dm_target *) (n_highs + num);
10909 -
10910 -       if (n) {
10911 -               memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
10912 -               memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
10913 -       }
10914 -
10915 -       memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
10916 -       vfree(t->highs);
10917 -
10918 -       t->num_allocated = num;
10919 -       t->highs = n_highs;
10920 -       t->targets = n_targets;
10921 -
10922 -       return 0;
10923 -}
10924 -
10925 -int dm_table_create(struct dm_table **result, int mode)
10926 +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets)
10927  {
10928         struct dm_table *t = kmalloc(sizeof(*t), GFP_NOIO);
10929  
10930 @@ -158,13 +123,17 @@
10931         INIT_LIST_HEAD(&t->devices);
10932         atomic_set(&t->holders, 1);
10933  
10934 -       /* allocate a single nodes worth of targets to begin with */
10935 -       if (alloc_targets(t, KEYS_PER_NODE)) {
10936 +
10937 +       /* allocate both the target array and offset array at once */
10938 +       t->highs = (sector_t *) vcalloc(sizeof(struct dm_target) +
10939 +                                       sizeof(sector_t), num_targets);
10940 +       if (!t->highs) {
10941                 kfree(t);
10942 -               t = NULL;
10943                 return -ENOMEM;
10944         }
10945  
10946 +       t->targets = (struct dm_target *) (t->highs + num_targets);
10947 +       t->num_allocated = num_targets;
10948         t->mode = mode;
10949         *result = t;
10950         return 0;
10951 @@ -224,17 +193,6 @@
10952  }
10953  
10954  /*
10955 - * Checks to see if we need to extend highs or targets.
10956 - */
10957 -static inline int check_space(struct dm_table *t)
10958 -{
10959 -       if (t->num_targets >= t->num_allocated)
10960 -               return alloc_targets(t, t->num_allocated * 2);
10961 -
10962 -       return 0;
10963 -}
10964 -
10965 -/*
10966   * Convert a device path to a dev_t.
10967   */
10968  static int lookup_device(const char *path, kdev_t *dev)
10969 @@ -526,8 +484,8 @@
10970         char **argv;
10971         struct dm_target *tgt;
10972  
10973 -       if ((r = check_space(t)))
10974 -               return r;
10975 +       if (t->num_targets >= t->num_allocated)
10976 +               return -ENOMEM;
10977  
10978         tgt = t->targets + t->num_targets;
10979         memset(tgt, 0, sizeof(*tgt));
10980 --- diff/drivers/md/dm.h        2003-10-16 10:44:23.000000000 +0100
10981 +++ source/drivers/md/dm.h      2003-10-16 10:45:14.000000000 +0100
10982 @@ -96,7 +96,7 @@
10983   * Functions for manipulating a table.  Tables are also reference
10984   * counted.
10985   *---------------------------------------------------------------*/
10986 -int dm_table_create(struct dm_table **result, int mode);
10987 +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets);
10988  
10989  void dm_table_get(struct dm_table *t);
10990  void dm_table_put(struct dm_table *t);
10991 Correct calculation of the dirty log's bitset size.
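A userspace comparison of the old and new sizing, assuming 32-bit bitset words (sizeof(*clean_bits) == 4). The old formula shifted the region count down to bytes before rounding, so e.g. 33 regions produced a 4-byte (32-bit) bitset; rounding the region count up to whole words first, as the hunk below does, yields the required 8 bytes.

#include <stdio.h>
#include <stdint.h>

#define BYTE_SHIFT 3

static unsigned round_up(unsigned n, unsigned sz)       /* like dm_round_up() */
{
        return ((n + sz - 1) / sz) * sz;
}

int main(void)
{
        unsigned region_count = 33;
        unsigned word = sizeof(uint32_t);               /* assumed bitset word size */

        /* old: shift to bytes first, losing the partial word */
        unsigned old_size = round_up(region_count >> BYTE_SHIFT, word);

        /* new: round the bit count up to whole words, then convert to bytes */
        unsigned new_size = round_up(region_count, word << BYTE_SHIFT) >> BYTE_SHIFT;

        printf("%u regions: old=%u bytes (%u bits), new=%u bytes (%u bits)\n",
               region_count, old_size, old_size << BYTE_SHIFT,
               new_size, new_size << BYTE_SHIFT);
        return 0;
}
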
10992 --- diff/drivers/md/dm-log.c    2003-10-16 10:44:23.000000000 +0100
10993 +++ source/drivers/md/dm-log.c  2003-10-16 10:45:18.000000000 +0100
10994 @@ -124,6 +124,7 @@
10995         int sync_search;
10996  };
10997  
10998 +#define BYTE_SHIFT 3
10999  static int core_ctr(struct dirty_log *log, sector_t dev_size,
11000                     unsigned int argc, char **argv)
11001  {
11002 @@ -153,7 +154,13 @@
11003         clog->region_size = region_size;
11004         clog->region_count = region_count;
11005  
11006 -       bitset_size = dm_round_up(region_count >> 3, sizeof(*clog->clean_bits));
11007 +        /*
11008 +         * Work out how many words we need to hold the bitset.
11009 +         */
11010 +        bitset_size = dm_round_up(region_count,
11011 +                                  sizeof(*clog->clean_bits) << BYTE_SHIFT);
11012 +        bitset_size >>= BYTE_SHIFT;
11013 +
11014         clog->clean_bits = vmalloc(bitset_size);
11015         if (!clog->clean_bits) {
11016                 DMWARN("couldn't allocate clean bitset");
11017 Correct the sector calculation in map_buffer().
11018 --- diff/drivers/md/dm-raid1.c  2003-10-16 10:44:23.000000000 +0100
11019 +++ source/drivers/md/dm-raid1.c        2003-10-16 10:45:21.000000000 +0100
11020 @@ -720,11 +720,7 @@
11021  static void map_buffer(struct mirror_set *ms,
11022                        struct mirror *m, struct buffer_head *bh)
11023  {
11024 -       sector_t bsize = bh->b_size >> 9;
11025 -       sector_t rsector = bh->b_blocknr * bsize;
11026 -
11027 -       bh->b_rdev = m->dev->dev;
11028 -       bh->b_rsector = m->offset + (rsector - ms->ti->begin);
11029 +       bh->b_rsector = m->offset + (bh->b_rsector - ms->ti->begin);
11030  }
11031  
11032  static void do_reads(struct mirror_set *ms, struct buffer_list *reads)
11033 If a kcopyd client hadn't allocated enough pages and then submitted a
11034 large io that was being split into sub jobs, we could stall waiting for
11035 pages.  There is now a kcopyd_client->max_split field that gives an
11036 appropriate number of sub_jobs to split the io into, based on the
11037 number of allocated pages.
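A userspace sketch of the sizing arithmetic, using the constants from the patch (512-byte sectors, 128-sector sub-jobs, at most 8 sub-jobs) and assuming 4K pages: a client with a small page pool gets a correspondingly small max_split, so split_job() never runs more concurrent sub-jobs than the pool can feed.

#include <stdio.h>

#define SECTOR_SIZE        512u
#define PAGE_SIZE_BYTES    4096u                         /* assumed 4K pages */
#define SECTORS_PER_PAGE   (PAGE_SIZE_BYTES / SECTOR_SIZE)
#define SUB_JOB_SIZE       128u                          /* sectors per sub-job */
#define PAGES_PER_SUB_JOB  (SUB_JOB_SIZE / SECTORS_PER_PAGE)
#define SUB_JOB_COUNT      8u

static unsigned div_up(unsigned n, unsigned d) { return (n + d - 1) / d; }

int main(void)
{
        unsigned nr_pages = 32;          /* pages the client preallocated */
        unsigned io_sectors = 2048;      /* size of the copy being split */
        unsigned max_split, nr;

        /* as in client_add_pages(): cap by pool size, then by SUB_JOB_COUNT */
        max_split = nr_pages / PAGES_PER_SUB_JOB;
        if (max_split > SUB_JOB_COUNT)
                max_split = SUB_JOB_COUNT;

        /* as in split_job(): never start more sub-jobs than max_split */
        nr = div_up(io_sectors, SUB_JOB_SIZE);
        if (nr > max_split)
                nr = max_split;

        /* 32 pages -> max_split 2, so the 2048-sector copy proceeds two
         * sub-jobs at a time instead of stalling on 8 concurrent ones. */
        printf("max_split=%u, sub-jobs in flight=%u\n", max_split, nr);
        return 0;
}
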
11038 --- diff/drivers/md/kcopyd.c    2003-10-16 10:44:31.000000000 +0100
11039 +++ source/drivers/md/kcopyd.c  2003-10-16 10:45:24.000000000 +0100
11040 @@ -27,6 +27,11 @@
11041  
11042  static struct dm_daemon _kcopyd;
11043  
11044 +#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE)
11045 +#define SUB_JOB_SIZE 128
11046 +#define PAGES_PER_SUB_JOB (SUB_JOB_SIZE / SECTORS_PER_PAGE)
11047 +#define SUB_JOB_COUNT 8
11048 +
11049  /*-----------------------------------------------------------------
11050   * Each kcopyd client has its own little pool of preallocated
11051   * pages for kcopyd io.
11052 @@ -38,6 +43,7 @@
11053         struct list_head pages;
11054         unsigned int nr_pages;
11055         unsigned int nr_free_pages;
11056 +       unsigned int max_split;
11057  };
11058  
11059  static inline void __push_page(struct kcopyd_client *kc, struct page *p)
11060 @@ -122,6 +128,10 @@
11061  
11062         kcopyd_put_pages(kc, &new);
11063         kc->nr_pages += nr;
11064 +       kc->max_split = kc->nr_pages / PAGES_PER_SUB_JOB;
11065 +       if (kc->max_split > SUB_JOB_COUNT)
11066 +               kc->max_split = SUB_JOB_COUNT;
11067 +
11068         return 0;
11069  }
11070  
11071 @@ -334,7 +344,6 @@
11072         return r;
11073  }
11074  
11075 -#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE)
11076  static int run_pages_job(struct kcopyd_job *job)
11077  {
11078         int r;
11079 @@ -422,7 +431,6 @@
11080         dm_daemon_wake(&_kcopyd);
11081  }
11082  
11083 -#define SUB_JOB_SIZE 128
11084  static void segment_complete(int read_err,
11085                              unsigned int write_err, void *context)
11086  {
11087 @@ -491,17 +499,19 @@
11088   * Create some little jobs that will do the move between
11089   * them.
11090   */
11091 -#define SPLIT_COUNT 8
11092  static void split_job(struct kcopyd_job *job)
11093  {
11094 -       int i;
11095 +       int nr;
11096 +
11097 +       nr = dm_div_up(job->source.count, SUB_JOB_SIZE);
11098 +       if (nr > job->kc->max_split)
11099 +               nr = job->kc->max_split;
11100  
11101 -       atomic_set(&job->sub_jobs, SPLIT_COUNT);
11102 -       for (i = 0; i < SPLIT_COUNT; i++)
11103 +       atomic_set(&job->sub_jobs, nr);
11104 +       while (nr--)
11105                 segment_complete(0, 0u, job);
11106  }
11107  
11108 -#define SUB_JOB_THRESHOLD (SPLIT_COUNT * SUB_JOB_SIZE)
11109  int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
11110                 unsigned int num_dests, struct io_region *dests,
11111                 unsigned int flags, kcopyd_notify_fn fn, void *context)
11112 @@ -534,7 +544,7 @@
11113         job->fn = fn;
11114         job->context = context;
11115  
11116 -       if (job->source.count < SUB_JOB_THRESHOLD)
11117 +       if (job->source.count < SUB_JOB_SIZE)
11118                 dispatch_job(job);
11119  
11120         else {
11121 Fix bug in dm-io.c block size calculation.  [Alasdair Kergon]
11122
11123 --- diff/drivers/md/dm-io.c     2003-10-16 10:44:23.000000000 +0100
11124 +++ source/drivers/md/dm-io.c   2003-10-16 10:45:28.000000000 +0100
11125 @@ -204,7 +204,13 @@
11126         sector_t b = *block;
11127         sector_t blocks_per_page = PAGE_SIZE / block_size;
11128         unsigned int this_size; /* holds the size of the current io */
11129 -       unsigned int len;
11130 +       sector_t len;
11131 +
11132 +       if (!blocks_per_page) {
11133 +               DMERR("dm-io: PAGE_SIZE (%lu) < block_size (%u) unsupported",
11134 +                     PAGE_SIZE, block_size);
11135 +               return 0;
11136 +       }
11137  
11138         while ((offset < PAGE_SIZE) && (b != end_block)) {
11139                 bh = mempool_alloc(_buffer_pool, GFP_NOIO);
11140 @@ -215,10 +221,20 @@
11141                  * Block size must be a power of 2 and aligned
11142                  * correctly.
11143                  */
11144 -               len = end_block - b;
11145 -               this_size = min((sector_t) 1 << log2_floor(b), blocks_per_page);
11146 -               if (this_size > len)
11147 -                       this_size = 1 << log2_align(len);
11148 +
11149 +               len = min(end_block - b, blocks_per_page);
11150 +               len = min(len, blocks_per_page - offset / block_size);
11151 +
11152 +               if (!len) {
11153 +                       DMERR("dm-io: Invalid offset/block_size (%u/%u).",
11154 +                             offset, block_size);
11155 +                       return 0;
11156 +               }
11157 +
11158 +               this_size = 1 << log2_align(len);
11159 +               if (b)
11160 +                       this_size = min(this_size,
11161 +                                       (unsigned) 1 << log2_floor(b));
11162  
11163                 /*
11164                  * Add in the job offset.
11165 bh->b_rdev wasn't being set properly.  This fixes a bug introduced by the earlier map_buffer() patch.
11166 --- diff/drivers/md/dm-raid1.c  2003-10-16 10:45:21.000000000 +0100
11167 +++ source/drivers/md/dm-raid1.c        2003-10-16 10:45:31.000000000 +0100
11168 @@ -720,6 +720,7 @@
11169  static void map_buffer(struct mirror_set *ms,
11170                        struct mirror *m, struct buffer_head *bh)
11171  {
11172 +       bh->b_rdev = m->dev->dev;
11173         bh->b_rsector = m->offset + (bh->b_rsector - ms->ti->begin);
11174  }
11175  