1 diff -Naur linux-2.6.21.7-orig/drivers/md/dm-cache.c linux-2.6.21.7-dmcache/drivers/md/dm-cache.c
2 --- linux-2.6.21.7-orig/drivers/md/dm-cache.c 1969-12-31 19:00:00.000000000 -0500
3 +++ linux-2.6.21.7-dmcache/drivers/md/dm-cache.c 2007-08-23 14:10:58.000000000 -0400
5 +/****************************************************************************
7 + * Device mapper target for block-level disk caching
9 + * Copyright (C) International Business Machines Corp., 2006
10 + * Author: Ming Zhao (mingzhao@ufl.edu)
12 + * This program is free software; you can redistribute it and/or modify
13 + * it under the terms of the GNU General Public License as published by
14 + * the Free Software Foundation; under version 2 of the License.
16 + * This program is distributed in the hope that it will be useful,
17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 + * GNU General Public License for more details.
21 + * You should have received a copy of the GNU General Public License
22 + * along with this program; if not, write to the Free Software
23 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 + ****************************************************************************/
27 +#include <asm/atomic.h>
28 +#include <asm/checksum.h>
29 +#include <linux/module.h>
30 +#include <linux/init.h>
31 +#include <linux/list.h>
32 +#include <linux/blkdev.h>
33 +#include <linux/bio.h>
34 +#include <linux/slab.h>
35 +#include <linux/hash.h>
36 +#include <linux/spinlock.h>
37 +#include <linux/workqueue.h>
38 +#include <linux/pagemap.h>
42 +#include "dm-bio-list.h"
47 +#define DM_MSG_PREFIX "cache"
48 +#define DMC_PREFIX "dm-cache: "
51 +#define DPRINTK( s, arg... ) printk(DMC_PREFIX s "\n", ##arg)
53 +#define DPRINTK( s, arg... )
56 +/* Default cache parameters */
57 +#define DEFAULT_CACHE_SIZE 65536
58 +#define DEFAULT_CACHE_ASSOC 1024
59 +#define DEFAULT_BLOCK_SIZE 8
60 +#define CONSECUTIVE_BLOCKS 512
63 +#define WRITE_THROUGH 0
65 +#define DEFAULT_WRITE_POLICY WRITE_THROUGH
67 +/* Number of pages for I/O */
68 +#define DMCACHE_COPY_PAGES 1024
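For orientation, here is the geometry these defaults imply, computed the same way cache_ctr() later derives it (ffs()-based shifts). This is an editorial userspace sketch, not part of the patch:

#include <stdio.h>
#include <strings.h>

int main(void)
{
	unsigned int block_size = 8;      /* DEFAULT_BLOCK_SIZE, in sectors   */
	unsigned long long size = 65536;  /* DEFAULT_CACHE_SIZE, in blocks    */
	unsigned int assoc = 1024;        /* DEFAULT_CACHE_ASSOC              */
	unsigned int consecutive = assoc < 512 ? assoc : 512; /* CONSECUTIVE_BLOCKS */

	unsigned int block_shift = ffs(block_size) - 1;        /* 3  */
	unsigned int block_mask = block_size - 1;               /* 7  */
	unsigned int bits = ffs((int)size) - 1;                  /* 16 */
	unsigned int consecutive_shift = ffs(consecutive) - 1;   /* 9  */

	printf("sets: %llu, capacity: %llu MB, block: %u KB\n",
	       size / assoc,                      /* 64 sets of 1024 frames   */
	       size * block_size * 512 >> 20,     /* 256 MB of cached data    */
	       block_size * 512 / 1024);          /* 4 KB per cache block     */
	printf("block_shift=%u block_mask=0x%x bits=%u consecutive_shift=%u\n",
	       block_shift, block_mask, bits, consecutive_shift);
	return 0;
}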
70 +/* States of a cache block */
72 +#define VALID 1 /* Valid */
73 +#define RESERVED 2 /* Allocated but data not in place yet */
74 +#define DIRTY 4 /* Locally modified */
75 +#define WRITEBACK 8 /* In the process of write back */
77 +#define is_state(x, y) (x & y)
78 +#define set_state(x, y) (x |= y)
79 +#define clear_state(x, y) (x &= ~y)
85 + struct dm_dev *src_dev; /* Source device */
86 + struct dm_dev *cache_dev; /* Cache device */
87 + struct kcopyd_client *kcp_client; /* Kcopyd client for writing back data */
89 + struct cacheblock *cache; /* Hash table for cache blocks */
90 + sector_t size; /* Cache size */
91 + unsigned int bits; /* Cache size in bits */
92 + unsigned int assoc; /* Cache associativity */
93 + unsigned int block_size; /* Cache block size */
94 + unsigned int block_shift; /* Cache block size in bits */
95 + unsigned int block_mask; /* Cache block mask */
96 + unsigned int consecutive_shift; /* Consecutive blocks size in bits */
97 + unsigned long counter; /* Logical timestamp of last access */
98 + unsigned int write_policy; /* Cache write policy */
99 + sector_t dirty_blocks; /* Number of dirty blocks */
101 + spinlock_t lock; /* Lock to protect page allocation/deallocation */
102 + struct page_list *pages; /* Pages for I/O */
103 + unsigned int nr_pages; /* Number of pages */
104 + unsigned int nr_free_pages; /* Number of free pages */
105 + wait_queue_head_t destroyq; /* Wait queue for I/O completion */
106 + atomic_t nr_jobs; /* Number of I/O jobs */
108 + unsigned long reads; /* Number of reads */
109 + unsigned long writes; /* Number of writes */
110 + unsigned long cache_hits; /* Number of cache hits */
111 + unsigned long replace; /* Number of cache replacements */
112 + unsigned long writeback; /* Number of replaced dirty blocks */
113 + unsigned long dirty; /* Number of submitted dirty blocks */
116 +/* Cache block metadata structure */
118 + spinlock_t lock; /* Lock to protect operations on the bio list */
119 + sector_t block; /* Sector number of the cached block */
120 + unsigned short state; /* State of a block */
121 + unsigned long counter; /* Logical timestamp of the block's last access */
122 + struct bio_list bios; /* List of pending bios */
126 +/****************************************************************************
127 + * Functions and data structures for implementing a kcached to handle async
128 + * I/O. Code for page and queue handling is borrowed from kcopyd.c.
129 + ****************************************************************************/
132 + * Functions for handling pages used by async I/O.
133 + * The data requested by a bio may not be aligned with cache blocks, in
134 + * which case additional pages are required to pad the request that is
135 + * forwarded to the source device. A pool of pages is reserved for this purpose.
138 +static struct page_list *alloc_pl(void)
140 + struct page_list *pl;
142 + pl = kmalloc(sizeof(*pl), GFP_KERNEL);
146 + pl->page = alloc_page(GFP_KERNEL);
155 +static void free_pl(struct page_list *pl)
157 + __free_page(pl->page);
161 +static void drop_pages(struct page_list *pl)
163 + struct page_list *next;
172 +static int kcached_get_pages(struct cache_c *dmc, unsigned int nr,
173 + struct page_list **pages)
175 + struct page_list *pl;
177 + spin_lock(&dmc->lock);
178 + if (dmc->nr_free_pages < nr) {
179 + DPRINTK("kcached_get_pages: No free pages: %u<%u",
180 + dmc->nr_free_pages, nr);
181 + spin_unlock(&dmc->lock);
185 + dmc->nr_free_pages -= nr;
186 + for (*pages = pl = dmc->pages; --nr; pl = pl->next)
189 + dmc->pages = pl->next;
192 + spin_unlock(&dmc->lock);
197 +static void kcached_put_pages(struct cache_c *dmc, struct page_list *pl)
199 + struct page_list *cursor;
201 + spin_lock(&dmc->lock);
202 + for (cursor = pl; cursor->next; cursor = cursor->next)
203 + dmc->nr_free_pages++;
205 + dmc->nr_free_pages++;
206 + cursor->next = dmc->pages;
209 + spin_unlock(&dmc->lock);
212 +static int alloc_bio_pages(struct cache_c *dmc, unsigned int nr)
215 + struct page_list *pl = NULL, *next;
217 + for (i = 0; i < nr; i++) {
228 + kcached_put_pages(dmc, pl);
229 + dmc->nr_pages += nr;
234 +static void free_bio_pages(struct cache_c *dmc)
236 + BUG_ON(dmc->nr_free_pages != dmc->nr_pages);
237 + drop_pages(dmc->pages);
239 + dmc->nr_free_pages = dmc->nr_pages = 0;
242 +/* Structure for a kcached job */
243 +struct kcached_job {
244 + struct list_head list;
245 + struct cache_c *dmc;
246 + struct bio *bio; /* Original bio */
247 + struct io_region src;
248 + struct io_region dest;
249 + struct cacheblock *cacheblock;
252 + * When the original bio is not aligned with cache blocks,
253 + * we need extra bvecs and pages for padding.
255 + struct bio_vec *bvec;
256 + unsigned int nr_pages;
257 + struct page_list *pages;
260 +static struct workqueue_struct *_kcached_wq;
261 +static struct work_struct _kcached_work;
263 +static inline void wake(void)
265 + queue_work(_kcached_wq, &_kcached_work);
268 +#define MIN_JOBS 1024
270 +static struct kmem_cache *_job_cache;
271 +static mempool_t *_job_pool;
273 +static DEFINE_SPINLOCK(_job_lock);
275 +static LIST_HEAD(_complete_jobs);
276 +static LIST_HEAD(_io_jobs);
277 +static LIST_HEAD(_pages_jobs);
279 +static int jobs_init(void)
281 + _job_cache = kmem_cache_create("kcached-jobs",
282 + sizeof(struct kcached_job),
283 + __alignof__(struct kcached_job),
288 + _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
289 + mempool_free_slab, _job_cache);
291 + kmem_cache_destroy(_job_cache);
298 +static void jobs_exit(void)
300 + BUG_ON(!list_empty(&_complete_jobs));
301 + BUG_ON(!list_empty(&_io_jobs));
302 + BUG_ON(!list_empty(&_pages_jobs));
304 + mempool_destroy(_job_pool);
305 + kmem_cache_destroy(_job_cache);
311 + * Functions to push and pop a job onto the head of a given job list.
313 +static inline struct kcached_job *pop(struct list_head *jobs)
315 + struct kcached_job *job = NULL;
316 + unsigned long flags;
318 + spin_lock_irqsave(&_job_lock, flags);
320 + if (!list_empty(jobs)) {
321 + job = list_entry(jobs->next, struct kcached_job, list);
322 + list_del(&job->list);
324 + spin_unlock_irqrestore(&_job_lock, flags);
329 +static inline void push(struct list_head *jobs, struct kcached_job *job)
331 + unsigned long flags;
333 + spin_lock_irqsave(&_job_lock, flags);
334 + list_add_tail(&job->list, jobs);
335 + spin_unlock_irqrestore(&_job_lock, flags);
339 +/****************************************************************************
340 + * Functions for asynchronously fetching data from source device and storing
341 + * data in cache device. Because the requested data may not align with the
342 + * cache blocks, extra handling is required to pad a block request and extract
343 + * the requested data from the results.
344 + ****************************************************************************/
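A small worked example (editorial, not part of the patch) of the head/tail padding that do_fetch() and do_store() below compute for a request that is not aligned to a cache block; to_bytes() in the driver converts sectors to bytes (<< 9):

#include <stdio.h>

int main(void)
{
	unsigned int block_size = 8;            /* cache block, in sectors        */
	unsigned int block_mask = block_size - 1;
	unsigned long long bi_sector = 21;      /* request start sector (example) */
	unsigned int bi_size = 1024;            /* request length in bytes        */

	unsigned int offset = (unsigned int)(bi_sector & block_mask); /* 5 sectors  */
	unsigned int head = offset << 9;                               /* 2560 bytes */
	unsigned int tail = (block_size << 9) - bi_size - head;        /* 512 bytes  */

	/* Both head and tail are non-zero here, so extra pages must be
	 * allocated to pad the block request on both sides. */
	printf("offset=%u sectors, head=%u bytes, tail=%u bytes\n", offset, head, tail);
	return 0;
}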
346 +static void io_callback(unsigned long error, void *context)
348 + struct kcached_job *job = (struct kcached_job *) context;
352 + DMERR("io_callback: io error");
356 + if (job->rw == READ) {
358 + push(&_io_jobs, job);
360 + push(&_complete_jobs, job);
365 + * Fetch data from the source device asynchronously.
366 + * For a READ bio, if a cache block is larger than the requested data, then
367 + * additional data are prefetched. Larger cache block size enables more
368 + * aggressive read prefetching, which is useful for read-mostly usage.
369 + * For a WRITE bio, if a cache block is larger than the requested data, the
370 + * entire block needs to be fetched, and larger block size incurs more overhead.
371 + * In scenarios where writes are frequent, 4KB is a good cache block size.
373 +static int do_fetch(struct kcached_job *job)
376 + struct bio *bio = job->bio;
377 + struct cache_c *dmc = job->dmc;
378 + unsigned int offset, head, tail, remaining, nr_vecs, idx = 0;
379 + struct bio_vec *bvec;
380 + struct page_list *pl;
382 + offset = (unsigned int) (bio->bi_sector & dmc->block_mask);
383 + head = to_bytes(offset);
384 + tail = to_bytes(dmc->block_size) - bio->bi_size - head;
386 + DPRINTK("do_fetch: %llu(%llu->%llu,%llu), head:%u,tail:%u",
387 + bio->bi_sector, job->src.sector, job->dest.sector,
388 + job->src.count, head, tail);
390 + if (bio_data_dir(bio) == READ) { /* The original request is a READ */
391 + if (0 == job->nr_pages) { /* The request is aligned to cache block */
392 + r = dm_io_async_bvec(1, &job->src, READ,
393 + bio->bi_io_vec + bio->bi_idx,
398 + nr_vecs = bio->bi_vcnt - bio->bi_idx + job->nr_pages;
399 + bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_NOIO);
401 + DMERR("do_fetch: No memory");
408 + bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE);
409 + bvec[i].bv_offset = 0;
410 + bvec[i].bv_page = pl->page;
411 + head -= bvec[i].bv_len;
416 + remaining = bio->bi_size;
418 + while (remaining) {
419 + bvec[i] = bio->bi_io_vec[j];
420 + remaining -= bvec[i].bv_len;
425 + bvec[i].bv_len = min(tail, (unsigned int)PAGE_SIZE);
426 + bvec[i].bv_offset = 0;
427 + bvec[i].bv_page = pl->page;
428 + tail -= bvec[i].bv_len;
434 + r = dm_io_async_bvec(1, &job->src, READ, job->bvec, io_callback, job);
436 + } else { /* The original request is a WRITE */
439 + if (head && tail) { /* Special case */
440 + bvec = kmalloc(job->nr_pages * sizeof(*bvec), GFP_KERNEL);
442 + DMERR("do_fetch: No memory");
445 + for (i=0; i<job->nr_pages; i++) {
446 + bvec[i].bv_len = PAGE_SIZE;
447 + bvec[i].bv_offset = 0;
448 + bvec[i].bv_page = pl->page;
452 + r = dm_io_async_bvec(1, &job->src, READ, job->bvec,
457 + bvec = kmalloc((job->nr_pages + bio->bi_vcnt - bio->bi_idx)
458 + * sizeof(*bvec), GFP_KERNEL);
460 + DMERR("do_fetch: No memory");
466 + bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE);
467 + bvec[i].bv_offset = 0;
468 + bvec[i].bv_page = pl->page;
469 + head -= bvec[i].bv_len;
474 + remaining = bio->bi_size;
476 + while (remaining) {
477 + bvec[i] = bio->bi_io_vec[j];
478 + remaining -= bvec[i].bv_len;
484 + bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) &
486 + bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset;
487 + bvec[i].bv_page = pl->page;
488 + tail -= bvec[i].bv_len;
489 + pl = pl->next; i++;
491 + bvec[i].bv_len = PAGE_SIZE;
492 + bvec[i].bv_offset = 0;
493 + bvec[i].bv_page = pl->page;
494 + tail -= bvec[i].bv_len;
495 + pl = pl->next; i++;
500 + r = dm_io_async_bvec(1, &job->src, READ, job->bvec + idx,
508 + * Store data to the cache device asynchronously.
509 + * For a READ bio request, the data fetched from the source device are returned
510 + * to the kernel and stored in the cache at the same time.
511 + * For a WRITE bio request, the data are written to the cache and source device
512 + * at the same time.
514 +static int do_store(struct kcached_job *job)
517 + struct bio *bio = job->bio, *clone;
518 + struct cache_c *dmc = job->dmc;
519 + unsigned int offset, head, tail, remaining, nr_vecs;
520 + struct bio_vec *bvec;
522 + offset = (unsigned int) (bio->bi_sector & dmc->block_mask);
523 + head = to_bytes(offset);
524 + tail = to_bytes(dmc->block_size) - bio->bi_size - head;
526 + DPRINTK("do_store: %llu(%llu->%llu,%llu), head:%u,tail:%u",
527 + bio->bi_sector, job->src.sector, job->dest.sector,
528 + job->src.count, head, tail);
530 + /* A READ is acknowledged as soon as the requested data is fetched, and
531 + does not have to wait for it to be stored in the cache. The bio is
532 + cloned so that the original one can be ended here. But to avoid copying
533 + pages, we reuse the pages allocated for the original bio, and mark
534 + each of them to prevent the pages from being freed before the cache
535 + insertion is completed.
537 + if (bio_data_dir(bio) == READ) {
538 + clone = bio_clone(bio, GFP_NOIO);
539 + for (i=bio->bi_idx; i<bio->bi_vcnt; i++) {
540 + get_page(bio->bi_io_vec[i].bv_page);
542 + DPRINTK("bio ended for %llu:%u", bio->bi_sector, bio->bi_size);
543 + bio_endio(bio, bio->bi_size, 0);
548 + if (0 == job->nr_pages) /* Original request is aligned with cache blocks */
549 + r = dm_io_async_bvec(1, &job->dest, WRITE, bio->bi_io_vec + bio->bi_idx,
552 + if (bio_data_dir(bio) == WRITE && head > 0 && tail > 0) {
553 + DPRINTK("Special case: %lu %u %u", bio_data_dir(bio), head, tail);
554 + nr_vecs = job->nr_pages + bio->bi_vcnt - bio->bi_idx;
555 + if (offset && (offset + bio->bi_size < PAGE_SIZE)) nr_vecs++;
556 + DPRINTK("Create %u new vecs", nr_vecs);
557 + bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_KERNEL);
559 + DMERR("do_store: No memory");
565 + bvec[i].bv_len = min(head, job->bvec[i].bv_len);
566 + bvec[i].bv_offset = 0;
567 + bvec[i].bv_page = job->bvec[i].bv_page;
568 + head -= bvec[i].bv_len;
571 + remaining = bio->bi_size;
573 + while (remaining) {
574 + bvec[i] = bio->bi_io_vec[j];
575 + remaining -= bvec[i].bv_len;
578 + j = (to_bytes(offset) + bio->bi_size) / PAGE_SIZE;
579 + bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) -
581 + bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset;
582 + bvec[i].bv_page = job->bvec[j].bv_page;
583 + tail -= bvec[i].bv_len;
586 + bvec[i] = job->bvec[j];
587 + tail -= bvec[i].bv_len;
594 + r = dm_io_async_bvec(1, &job->dest, WRITE, job->bvec, io_callback, job);
600 +static int do_io(struct kcached_job *job)
604 + if (job->rw == READ) { /* Read from source device */
606 + } else { /* Write to cache device */
613 +static int do_pages(struct kcached_job *job)
617 + r = kcached_get_pages(job->dmc, job->nr_pages, &job->pages);
619 + if (r == -ENOMEM) /* can't complete now */
622 + /* this job is ready for io */
623 + push(&_io_jobs, job);
628 + * Flush the bios that are waiting for this cache insertion or write back.
630 +static void flush_bios(struct cacheblock *cacheblock)
635 + spin_lock(&cacheblock->lock);
636 + bio = bio_list_get(&cacheblock->bios);
637 + if (is_state(cacheblock->state, WRITEBACK)) { /* Write back finished */
638 + cacheblock->state = VALID;
639 + } else { /* Cache insertion finished */
640 + set_state(cacheblock->state, VALID);
641 + clear_state(cacheblock->state, RESERVED);
643 + spin_unlock(&cacheblock->lock);
647 + bio->bi_next = NULL;
648 + DPRINTK("Flush bio: %llu->%llu (%u bytes)",
649 + cacheblock->block, bio->bi_sector, bio->bi_size);
650 + generic_make_request(bio);
655 +static int do_complete(struct kcached_job *job)
658 + struct bio *bio = job->bio;
660 + DPRINTK("do_complete: %llu", bio->bi_sector);
662 + if (bio_data_dir(bio) == READ) {
663 + for (i=bio->bi_idx; i<bio->bi_vcnt; i++) {
664 + put_page(bio->bi_io_vec[i].bv_page);
668 + bio_endio(bio, bio->bi_size, 0);
670 + if (job->nr_pages > 0) {
672 + kcached_put_pages(job->dmc, job->pages);
675 + flush_bios(job->cacheblock);
676 + mempool_free(job, _job_pool);
678 + if (atomic_dec_and_test(&job->dmc->nr_jobs))
679 + wake_up(&job->dmc->destroyq);
685 + * Run through a list for as long as possible. Returns the count
686 + * of successful jobs.
688 +static int process_jobs(struct list_head *jobs,
689 + int (*fn) (struct kcached_job *))
691 + struct kcached_job *job;
694 + while ((job = pop(jobs))) {
698 + /* error this rogue job */
699 + DMERR("process_jobs: Job processing error");
704 + * We couldn't service this job ATM, so
705 + * push this job back onto the list.
717 +static void do_work(struct work_struct *ignored)
719 + process_jobs(&_complete_jobs, do_complete);
720 + process_jobs(&_pages_jobs, do_pages);
721 + process_jobs(&_io_jobs, do_io);
724 +static void queue_job(struct kcached_job *job)
726 + atomic_inc(&job->dmc->nr_jobs);
727 + if (job->nr_pages > 0) /* Request pages */
728 + push(&_pages_jobs, job);
729 + else /* Go ahead to do I/O */
730 + push(&_io_jobs, job);
734 +static int kcached_init(struct cache_c *dmc)
738 + spin_lock_init(&dmc->lock);
740 + dmc->nr_pages = dmc->nr_free_pages = 0;
741 + r = alloc_bio_pages(dmc, DMCACHE_COPY_PAGES);
743 + DMERR("kcached_init: Could not allocate bio pages");
747 + r = dm_io_get(DMCACHE_COPY_PAGES);
749 + DMERR("kcached_init: Could not resize dm io pool");
750 + free_bio_pages(dmc);
754 + init_waitqueue_head(&dmc->destroyq);
755 + atomic_set(&dmc->nr_jobs, 0);
760 +void kcached_client_destroy(struct cache_c *dmc)
762 + /* Wait for completion of all jobs submitted by this client. */
763 + wait_event(dmc->destroyq, !atomic_read(&dmc->nr_jobs));
765 + dm_io_put(dmc->nr_pages);
766 + free_bio_pages(dmc);
770 +/****************************************************************************
771 + * Functions for writing back dirty blocks.
772 + * We leverage kcopyd to write back dirty blocks because it is convenient to
773 + * use and it is not reasonable to reimplement the same function here. But we
774 + * need to reserve pages for both kcached and kcopyd. TODO: dynamically change
775 + * the number of reserved pages.
776 + ****************************************************************************/
778 +static void copy_callback(int read_err, unsigned int write_err, void *context)
780 + struct cacheblock *cacheblock = (struct cacheblock *) context;
782 + flush_bios(cacheblock);
785 +static void copy_block(struct cache_c *dmc, struct io_region src,
786 + struct io_region dest, struct cacheblock *cacheblock)
788 + DPRINTK("Copying: %llu:%llu->%llu:%llu",
789 + src.sector, src.count * 512, dest.sector, dest.count * 512);
790 + kcopyd_copy(dmc->kcp_client, &src, 1, &dest, 0, copy_callback, cacheblock);
793 +static void write_back(struct cache_c *dmc, sector_t index, unsigned int length)
795 + struct io_region src, dest;
796 + struct cacheblock *cacheblock = &dmc->cache[index];
799 + DPRINTK("Write back block %llu(%llu, %u)",
800 + index, cacheblock->block, length);
801 + src.bdev = dmc->cache_dev->bdev;
802 + src.sector = index << dmc->block_shift;
803 + src.count = dmc->block_size * length;
804 + dest.bdev = dmc->src_dev->bdev;
805 + dest.sector = cacheblock->block;
806 + dest.count = dmc->block_size * length;
808 + for (i=0; i<length; i++)
809 + set_state(dmc->cache[index+i].state, WRITEBACK);
810 + dmc->dirty_blocks -= length;
811 + copy_block(dmc, src, dest, cacheblock);
815 +/****************************************************************************
816 + * Functions for implementing the various cache operations.
817 + ****************************************************************************/
820 + * Map a block from the source device to a block in the cache device.
822 +static unsigned long hash_block(struct cache_c *dmc, sector_t block)
824 + unsigned long set_number, value;
826 + value = (unsigned long)(block >> (dmc->block_shift +
827 + dmc->consecutive_shift));
828 + set_number = hash_long(value, dmc->bits) / dmc->assoc;
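In effect this is a set-associative index: drop the in-block offset and the consecutive-block run, hash what remains down to dmc->bits bits, then divide by the associativity to get the set number. A minimal userspace sketch with the default parameters; hash_long() is approximated here and the multiplicative constant is an assumption borrowed from kernels of this era, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

/* Approximation of the kernel's 64-bit hash_long(); constant is assumed. */
static unsigned long hash_long_approx(uint64_t val, unsigned int bits)
{
	return (unsigned long)((val * 0x9e37fffffffc0001ULL) >> (64 - bits));
}

int main(void)
{
	unsigned int block_shift = 3, consecutive_shift = 9; /* defaults          */
	unsigned int bits = 16, assoc = 1024;                /* 65536 cache blocks */
	uint64_t request_block = 1234560;                    /* block-aligned sector */

	uint64_t value = request_block >> (block_shift + consecutive_shift);
	unsigned long set = hash_long_approx(value, bits) / assoc;

	/* 65536 blocks / 1024-way associativity = 64 sets; the frames of this
	 * set start at index set * assoc in dmc->cache[]. */
	printf("request block %llu -> set %lu (frames %lu..%lu)\n",
	       (unsigned long long)request_block, set,
	       set * assoc, set * assoc + assoc - 1);
	return 0;
}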
834 + * Reset the LRU counters (the cache's global counter and each cache block's
835 + * counter). This seems to be a naive implementation. However, considering the
836 + * rarity of this event, it might be more efficient than other, more complex
837 + * schemes. TODO: a more elegant solution.
839 +static void cache_reset_counter(struct cache_c *dmc)
842 + struct cacheblock *cache = dmc->cache;
844 + DPRINTK("Reset LRU counters");
845 + for (i=0; i<dmc->size; i++)
846 + cache[i].counter = 0;
852 + * Lookup a block in the cache.
855 + * 1: cache hit (cache_block stores the index of the matched block)
856 + * 0: cache miss but frame is allocated for insertion; cache_block stores the
858 + * If there are empty frames, then the first one encountered is used.
859 + * If there are clean frames, then the LRU clean block is replaced.
860 + * 2: cache miss and frame is not allocated; cache_block stores the LRU dirty
862 + * This happens when the entire set is dirty.
863 + * -1: cache miss and no room for insertion:
864 + * This happens when the entire set is in transition modes (RESERVED or
868 +static int cache_lookup(struct cache_c *dmc, sector_t block,
869 + sector_t *cache_block)
871 + unsigned long set_number = hash_block(dmc, block);
874 + unsigned int cache_assoc = dmc->assoc;
875 + struct cacheblock *cache = dmc->cache;
876 + int invalid = -1, oldest = -1, oldest_clean = -1;
877 + unsigned long counter = ULONG_MAX, clean_counter = ULONG_MAX;
879 + index=set_number * cache_assoc;
881 + for (i=0; i<cache_assoc; i++, index++) {
882 + if (is_state(cache[index].state, VALID) ||
883 + is_state(cache[index].state, RESERVED)) {
884 + if (cache[index].block == block) {
885 + *cache_block = index;
886 + /* Reset all counters if the largest one is going to overflow */
887 + if (dmc->counter == ULONG_MAX) cache_reset_counter(dmc);
888 + cache[index].counter = ++dmc->counter;
891 + /* Don't consider blocks that are in the middle of copying */
892 + if (!is_state(cache[index].state, RESERVED) &&
893 + !is_state(cache[index].state, WRITEBACK)) {
894 + if (!is_state(cache[index].state, DIRTY) &&
895 + cache[index].counter < clean_counter) {
896 + clean_counter = cache[index].counter;
899 + if (cache[index].counter < counter) {
900 + counter = cache[index].counter;
906 + if (-1 == invalid) invalid = i;
910 + res = i < cache_assoc ? 1 : 0;
911 + if (!res) { /* Cache miss */
912 + if (invalid != -1) /* Choose the first empty frame */
913 + *cache_block = set_number * cache_assoc + invalid;
914 + else if (oldest_clean != -1) /* Choose the LRU clean block to replace */
915 + *cache_block = set_number * cache_assoc + oldest_clean;
916 + else if (oldest != -1) { /* Choose the LRU dirty block to evict */
918 + *cache_block = set_number * cache_assoc + oldest;
925 + DPRINTK("Cache lookup: Block %llu(%lu):%s",
926 + block, set_number, "NO ROOM");
928 + DPRINTK("Cache lookup: Block %llu(%lu):%llu(%s)",
929 + block, set_number, *cache_block,
930 + 1 == res ? "HIT" : (0 == res ? "MISS" : "WB NEEDED"));
935 + * Insert a block into the cache (in the frame specified by cache_block).
937 +static int cache_insert(struct cache_c *dmc, sector_t block,
938 + sector_t cache_block)
940 + struct cacheblock *cache = dmc->cache;
942 + /* Mark the block as RESERVED because although it is allocated, the data are
943 + not in place until kcopyd finishes its job.
945 + cache[cache_block].block = block;
946 + cache[cache_block].state = RESERVED;
947 + if (dmc->counter == ULONG_MAX) cache_reset_counter(dmc);
948 + cache[cache_block].counter = ++dmc->counter;
954 + * Invalidate a block (specified by cache_block) in the cache.
956 +static void cache_invalidate(struct cache_c *dmc, sector_t cache_block)
958 + struct cacheblock *cache = dmc->cache;
960 + DPRINTK("Cache invalidate: Block %llu(%llu)",
961 + cache_block, cache[cache_block].block);
962 + clear_state(cache[cache_block].state, VALID);
966 + * Handle a cache hit:
967 + * For READ, serve the request from the cache if the block is ready; otherwise,
968 + * queue the request for later processing.
969 + * For WRITE, invalidate the cache block if write-through. If write-back,
970 + * serve the request from the cache if the block is ready, or queue the request
971 + * for later processing otherwise.
973 +static int cache_hit(struct cache_c *dmc, struct bio* bio, sector_t cache_block)
975 + unsigned int offset = (unsigned int)(bio->bi_sector & dmc->block_mask);
976 + struct cacheblock *cache = dmc->cache;
980 + if (bio_data_dir(bio) == READ) { /* READ hit */
981 + bio->bi_bdev = dmc->cache_dev->bdev;
982 + bio->bi_sector = (cache_block << dmc->block_shift) + offset;
984 + spin_lock(&cache[cache_block].lock);
986 + if (is_state(cache[cache_block].state, VALID)) { /* Valid cache block */
987 + spin_unlock(&cache[cache_block].lock);
991 + /* Cache block is not ready yet */
992 + DPRINTK("Add to bio list %s(%llu)",
993 + dmc->cache_dev->name, bio->bi_sector);
994 + bio_list_add(&cache[cache_block].bios, bio);
996 + spin_unlock(&cache[cache_block].lock);
998 + } else { /* WRITE hit */
999 + if (dmc->write_policy == WRITE_THROUGH) { /* Invalidate cached data */
1000 + cache_invalidate(dmc, cache_block);
1001 + bio->bi_bdev = dmc->src_dev->bdev;
1006 + if (!is_state(cache[cache_block].state, DIRTY)) {
1007 + set_state(cache[cache_block].state, DIRTY);
1008 + dmc->dirty_blocks++;
1011 + spin_lock(&cache[cache_block].lock);
1013 + /* In the middle of write back */
1014 + if (is_state(cache[cache_block].state, WRITEBACK)) {
1015 + /* Delay this write until the block is written back */
1016 + bio->bi_bdev = dmc->src_dev->bdev;
1017 + DPRINTK("Add to bio list %s(%llu)",
1018 + dmc->src_dev->name, bio->bi_sector);
1019 + bio_list_add(&cache[cache_block].bios, bio);
1020 + spin_unlock(&cache[cache_block].lock);
1024 + /* Cache block not ready yet */
1025 + if (is_state(cache[cache_block].state, RESERVED)) {
1026 + bio->bi_bdev = dmc->cache_dev->bdev;
1027 + bio->bi_sector = (cache_block << dmc->block_shift) + offset;
1028 + DPRINTK("Add to bio list %s(%llu)",
1029 + dmc->cache_dev->name, bio->bi_sector);
1030 + bio_list_add(&cache[cache_block].bios, bio);
1031 + spin_unlock(&cache[cache_block].lock);
1035 + /* Serve the request from cache */
1036 + bio->bi_bdev = dmc->cache_dev->bdev;
1037 + bio->bi_sector = (cache_block << dmc->block_shift) + offset;
1039 + spin_unlock(&cache[cache_block].lock);
1044 +static struct kcached_job *new_kcached_job(struct cache_c *dmc, struct bio* bio,
1045 + sector_t request_block,
1046 + sector_t cache_block)
1048 + struct io_region src, dest;
1049 + struct kcached_job *job;
1051 + src.bdev = dmc->src_dev->bdev;
1052 + src.sector = request_block;
1053 + src.count = dmc->block_size;
1054 + dest.bdev = dmc->cache_dev->bdev;
1055 + dest.sector = cache_block << dmc->block_shift;
1056 + dest.count = src.count;
1058 + job = mempool_alloc(_job_pool, GFP_NOIO);
1063 + job->cacheblock = &dmc->cache[cache_block];
1069 + * Handle a read cache miss:
1070 + * Update the metadata; fetch the necessary block from source device;
1071 + * store data to cache device.
1073 +static int cache_read_miss(struct cache_c *dmc, struct bio* bio,
1074 + sector_t cache_block) {
1075 + struct cacheblock *cache = dmc->cache;
1076 + unsigned int offset, head, tail;
1077 + struct kcached_job *job;
1078 + sector_t request_block, left;
1080 + offset = (unsigned int)(bio->bi_sector & dmc->block_mask);
1081 + request_block = bio->bi_sector - offset;
1083 + if (cache[cache_block].state & VALID) {
1084 + DPRINTK("Replacing %llu->%llu",
1085 + cache[cache_block].block, request_block);
1087 + } else DPRINTK("Insert block %llu at empty frame %llu",
1088 + request_block, cache_block);
1090 + cache_insert(dmc, request_block, cache_block); /* Update metadata first */
1092 + job = new_kcached_job(dmc, bio, request_block, cache_block);
1094 + head = to_bytes(offset);
1096 + left = (dmc->src_dev->bdev->bd_inode->i_size>>9) - request_block;
1097 + if (left < dmc->block_size) {
1098 + tail = to_bytes(left) - bio->bi_size - head;
1099 + job->src.count = left;
1100 + job->dest.count = left;
1102 + tail = to_bytes(dmc->block_size) - bio->bi_size - head;
1104 + /* Requested block is aligned with a cache block */
1105 + if (0 == head && 0 == tail)
1107 + else /* Need new pages to store extra data */
1108 + job->nr_pages = dm_div_up(head, PAGE_SIZE) + dm_div_up(tail, PAGE_SIZE);
1109 + job->rw = READ; /* Fetch data from the source device */
1111 + DPRINTK("Queue job for %llu (need %u pages)",
1112 + bio->bi_sector, job->nr_pages);
1119 + * Handle a write cache miss:
1120 + * If write-through, forward the request to source device.
1121 + * If write-back, update the metadata; fetch the necessary block from source
1122 + * device; write to cache device.
1124 +static int cache_write_miss(struct cache_c *dmc, struct bio* bio, sector_t cache_block) {
1125 + struct cacheblock *cache = dmc->cache;
1126 + unsigned int offset, head, tail;
1127 + struct kcached_job *job;
1128 + sector_t request_block, left;
1130 + if (dmc->write_policy == WRITE_THROUGH) { /* Forward request to source */
1131 + bio->bi_bdev = dmc->src_dev->bdev;
1135 + offset = (unsigned int)(bio->bi_sector & dmc->block_mask);
1136 + request_block = bio->bi_sector - offset;
1138 + if (cache[cache_block].state & VALID) {
1139 + DPRINTK("Replacing %llu->%llu",
1140 + cache[cache_block].block, request_block);
1142 + } else DPRINTK("Insert block %llu at empty frame %llu",
1143 + request_block, cache_block);
1146 + cache_insert(dmc, request_block, cache_block); /* Update metadata first */
1147 + set_state(cache[cache_block].state, DIRTY);
1148 + dmc->dirty_blocks++;
1150 + job = new_kcached_job(dmc, bio, request_block, cache_block);
1152 + head = to_bytes(offset);
1153 + left = (dmc->src_dev->bdev->bd_inode->i_size>>9) - request_block;
1154 + if (left < dmc->block_size) {
1155 + tail = to_bytes(left) - bio->bi_size - head;
1156 + job->src.count = left;
1157 + job->dest.count = left;
1159 + tail = to_bytes(dmc->block_size) - bio->bi_size - head;
1161 + if (0 == head && 0 == tail) { /* Request is aligned with a cache block */
1162 + job->nr_pages = 0;
1164 + } else if (head && tail){ /* Special case: need to pad both head and tail */
1165 + job->nr_pages = dm_div_up(to_bytes(job->src.count), PAGE_SIZE);
1168 + if (head) { /* Fetch only head */
1169 + job->src.count = to_sector(head);
1170 + job->nr_pages = dm_div_up(head, PAGE_SIZE);
1171 + } else { /* Fetch only tail */
1172 + job->src.sector = bio->bi_sector + to_sector(bio->bi_size);
1173 + job->src.count = to_sector(tail);
1174 + job->nr_pages = dm_div_up(tail, PAGE_SIZE);
1184 +/* Handle cache misses */
1185 +static int cache_miss(struct cache_c *dmc, struct bio* bio, sector_t cache_block) {
1186 + if (bio_data_dir(bio) == READ)
1187 + return cache_read_miss(dmc, bio, cache_block);
1189 + return cache_write_miss(dmc, bio, cache_block);
1193 +/****************************************************************************
1194 + * Functions for implementing the operations on a cache mapping.
1195 + ****************************************************************************/
1198 + * Decide the mapping and perform necessary cache operations for a bio request.
1200 +static int cache_map(struct dm_target *ti, struct bio *bio,
1201 + union map_info *map_context)
1203 + struct cache_c *dmc = (struct cache_c *) ti->private;
1204 + sector_t request_block, cache_block = 0, offset;
1207 + offset = bio->bi_sector & dmc->block_mask;
1208 + request_block = bio->bi_sector - offset;
1210 + DPRINTK("Got a %s for %llu ((%llu:%llu), %u bytes)",
1211 + bio_rw(bio) == WRITE ? "WRITE" : (bio_rw(bio) == READ ?
1212 + "READ":"READA"), bio->bi_sector, request_block, offset,
1215 + if (bio_data_dir(bio) == READ) dmc->reads++;
1216 + else dmc->writes++;
1218 + res = cache_lookup(dmc, request_block, &cache_block);
1219 + if (1 == res) /* Cache hit; serve request from cache */
1220 + return cache_hit(dmc, bio, cache_block);
1221 + else if (0 == res) /* Cache miss; replacement block is found */
1222 + return cache_miss(dmc, bio, cache_block);
1223 + else if (2 == res) { /* Entire cache set is dirty; initiate a write-back */
1224 + write_back(dmc, cache_block, 1);
1228 + /* Forward to source device */
1229 + bio->bi_bdev = dmc->src_dev->bdev;
1236 + unsigned int block_size;
1237 + unsigned int assoc;
1238 + unsigned int write_policy;
1239 + unsigned int chksum;
1242 +/* Load metadata stored by previous session from disk. */
1243 +static int load_metadata(struct cache_c *dmc) {
1244 + struct io_region where;
1245 + unsigned long bits;
1246 + sector_t dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9;
1247 + sector_t meta_size, *meta_data, i, j, index = 0, limit, order;
1248 + struct meta_dmc *meta_dmc;
1249 + unsigned int chksum = 0, chksum_sav, consecutive_blocks;
1251 + meta_dmc = (struct meta_dmc *)vmalloc(512);
1253 + DMERR("load_metadata: Unable to allocate memory");
1257 + where.bdev = dmc->cache_dev->bdev;
1258 + where.sector = dev_size - 1;
1260 + dm_io_sync_vm(1, &where, READ, meta_dmc, &bits);
1261 + DPRINTK("Loaded cache conf: block size(%u), cache size(%llu), " \
1262 + "associativity(%u), write policy(%u), chksum(%u)",
1263 + meta_dmc->block_size, meta_dmc->size,
1264 + meta_dmc->assoc, meta_dmc->write_policy,
1265 + meta_dmc->chksum);
1267 + dmc->block_size = meta_dmc->block_size;
1268 + dmc->block_shift = ffs(dmc->block_size) - 1;
1269 + dmc->block_mask = dmc->block_size - 1;
1271 + dmc->size = meta_dmc->size;
1272 + dmc->bits = ffs(dmc->size) - 1;
1274 + dmc->assoc = meta_dmc->assoc;
1275 + consecutive_blocks = dmc->assoc < CONSECUTIVE_BLOCKS ?
1276 + dmc->assoc : CONSECUTIVE_BLOCKS;
1277 + dmc->consecutive_shift = ffs(consecutive_blocks) - 1;
1279 + dmc->write_policy = meta_dmc->write_policy;
1280 + chksum_sav = meta_dmc->chksum;
1282 + vfree((void *)meta_dmc);
1285 + order = dmc->size * sizeof(struct cacheblock);
1286 + DMINFO("Allocate %lluKB (%luB per) mem for %llu-entry cache" \
1287 + "(capacity:%lluMB, associativity:%u, block size:%u " \
1288 + "sectors(%uKB), %s)",
1289 + (unsigned long long) order >> 10, (unsigned long) sizeof(struct cacheblock),
1290 + (unsigned long long) dmc->size,
1291 + (unsigned long long) dmc->size * dmc->block_size >> (20-SECTOR_SHIFT),
1292 + dmc->assoc, dmc->block_size,
1293 + dmc->block_size >> (10-SECTOR_SHIFT),
1294 + dmc->write_policy ? "write-back" : "write-through");
1295 + dmc->cache = (struct cacheblock *)vmalloc(order);
1296 + if (!dmc->cache) {
1297 + DMERR("load_metadata: Unable to allocate memory");
1301 + meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512);
1302 + /* When requesting a new bio, the number of requested bvecs has to be
1303 + less than BIO_MAX_PAGES. Otherwise, null is returned. In dm-io.c,
1304 + this return value is not checked and kernel Oops may happen. We set
1305 + the limit here to avoid such situations. (2 additional bvecs are
1306 + required by dm-io for bookkeeping.)
1308 + limit = (BIO_MAX_PAGES - 2) * (PAGE_SIZE >> SECTOR_SHIFT);
1309 + meta_data = (sector_t *)vmalloc(to_bytes(min(meta_size, limit)));
1311 + DMERR("load_metadata: Unable to allocate memory");
1312 + vfree((void *)dmc->cache);
1316 + while(index < meta_size) {
1317 + where.sector = dev_size - 1 - meta_size + index;
1318 + where.count = min(meta_size - index, limit);
1319 + dm_io_sync_vm(1, &where, READ, meta_data, &bits);
1321 + for (i=to_bytes(index)/sizeof(sector_t), j=0;
1322 + j<to_bytes(where.count)/sizeof(sector_t) && i<dmc->size;
1324 + if(meta_data[j]) {
1325 + dmc->cache[i].block = meta_data[j];
1326 + dmc->cache[i].state = 1;
1328 + dmc->cache[i].state = 0;
1330 + chksum = csum_partial((char *)meta_data, to_bytes(where.count), chksum);
1331 + index += where.count;
1334 + vfree((void *)meta_data);
1336 + if (chksum != chksum_sav) { /* Check the checksum of the metadata */
1337 + DPRINTK("Cache metadata loaded from disk is corrupted");
1338 + vfree((void *)dmc->cache);
1342 + DMINFO("Cache metadata loaded from disk (offset %llu)",
1343 + (unsigned long long) dev_size - 1 - (unsigned long long) meta_size);
1348 +/* Store metadata onto disk. */
1349 +static int dump_metadata(struct cache_c *dmc) {
1350 + struct io_region where;
1351 + unsigned long bits;
1352 + sector_t dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9;
1353 + sector_t meta_size, i, j, index = 0, limit, *meta_data;
1354 + struct meta_dmc *meta_dmc;
1355 + unsigned int chksum = 0;
1357 + meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512);
1358 + limit = (BIO_MAX_PAGES - 2) * (PAGE_SIZE >> SECTOR_SHIFT);
1359 + meta_data = (sector_t *)vmalloc(to_bytes(min(meta_size, limit)));
1361 + DMERR("dump_metadata: Unable to allocate memory");
1365 + where.bdev = dmc->cache_dev->bdev;
1366 + while(index < meta_size) {
1367 + where.sector = dev_size - 1 - meta_size + index;
1368 + where.count = min(meta_size - index, limit);
1370 + for (i=to_bytes(index)/sizeof(sector_t), j=0;
1371 + j<to_bytes(where.count)/sizeof(sector_t) && i<dmc->size;
1373 + /* Assume all invalid cache blocks store 0. We lose the block that
1374 + * is actually mapped to offset 0.
1376 + meta_data[j] = dmc->cache[i].state ? dmc->cache[i].block : 0;
1378 + chksum = csum_partial((char *)meta_data, to_bytes(where.count), chksum);
1380 + dm_io_sync_vm(1, &where, WRITE, meta_data, &bits);
1381 + index += where.count;
1384 + vfree((void *)meta_data);
1386 + meta_dmc = (struct meta_dmc *)vmalloc(512);
1388 + DMERR("dump_metadata: Unable to allocate memory");
1392 + meta_dmc->block_size = dmc->block_size;
1393 + meta_dmc->size = dmc->size;
1394 + meta_dmc->assoc = dmc->assoc;
1395 + meta_dmc->write_policy = dmc->write_policy;
1396 + meta_dmc->chksum = chksum;
1398 + DPRINTK("Store metadata to disk: block size(%u), cache size(%llu), " \
1399 + "associativity(%u), write policy(%u), checksum(%u)",
1400 + meta_dmc->block_size, (unsigned long long) meta_dmc->size,
1401 + meta_dmc->assoc, meta_dmc->write_policy,
1402 + meta_dmc->chksum);
1404 + where.sector = dev_size - 1;
1406 + dm_io_sync_vm(1, &where, WRITE, meta_dmc, &bits);
1408 + vfree((void *)meta_dmc);
1410 + DMINFO("Cache metadata saved to disk (offset %llu)",
1411 + (unsigned long long) dev_size - 1 - (unsigned long long) meta_size);
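To put numbers on this layout: the per-block map occupies the last meta_size sectors of the cache device just before the final sector, which holds struct meta_dmc. An editorial sketch of the footprint for the default cache size, assuming an 8-byte sector_t:

#include <stdio.h>

int main(void)
{
	unsigned long long size = 65536;       /* cache blocks (default)          */
	unsigned long long entry = 8;          /* sizeof(sector_t), assumed 8 B   */
	unsigned long long meta_size = (size * entry + 511) / 512; /* dm_div_up   */

	/* 1024 sectors (512 KB) of block map, plus one superblock sector,
	 * all at the tail of the cache device. */
	printf("block map: %llu sectors (%llu KB) + 1 superblock sector\n",
	       meta_size, meta_size * 512 / 1024);
	return 0;
}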
1417 + * Construct a cache mapping.
1418 + * arg[0]: path to source device
1419 + * arg[1]: path to cache device
1420 + * arg[2]: cache persistence (if set, cache conf is loaded from disk)
1421 + * Cache configuration parameters (if not set, default values are used):
1422 + * arg[3]: cache block size (in sectors)
1423 + * arg[4]: cache size (in blocks)
1424 + * arg[5]: cache associativity
1425 + * arg[6]: write caching policy
1427 +static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1429 + struct cache_c *dmc;
1430 + unsigned int consecutive_blocks, persistence = 0;
1431 + sector_t localsize, i, order;
1432 + sector_t data_size, meta_size, dev_size;
1433 + unsigned long long cache_size;
1437 + ti->error = "dm-cache: Need at least 2 arguments (src dev and cache dev)";
1441 + dmc = kmalloc(sizeof(*dmc), GFP_KERNEL);
1442 + if (dmc == NULL) {
1443 + ti->error = "dm-cache: Failed to allocate cache context";
1448 + r = dm_get_device(ti, argv[0], 0, ti->len,
1449 + dm_table_get_mode(ti->table), &dmc->src_dev);
1451 + ti->error = "dm-cache: Source device lookup failed";
1455 + r = dm_get_device(ti, argv[1], 0, 0,
1456 + dm_table_get_mode(ti->table), &dmc->cache_dev);
1458 + ti->error = "dm-cache: Cache device lookup failed";
1463 + r = kcopyd_client_create(DMCACHE_COPY_PAGES, &dmc->kcp_client);
1465 + ti->error = "Failed to initialize kcopyd client\n";
1469 + r = kcached_init(dmc);
1471 + ti->error = "Failed to initialize kcached";
1476 + if (sscanf(argv[2], "%u", &persistence) != 1) {
1477 + ti->error = "dm-cache: Invalid cache persistence";
1482 + if (1 == persistence) {
1483 + if (load_metadata(dmc)) {
1484 + ti->error = "dm-cache: Invalid cache configuration";
1488 + goto init; /* Skip reading cache parameters from command line */
1489 + } else if (persistence != 0) {
1490 + ti->error = "dm-cache: Invalid cache persistence";
1496 + if (sscanf(argv[3], "%u", &dmc->block_size) != 1) {
1497 + ti->error = "dm-cache: Invalid block size";
1501 + if (!dmc->block_size || (dmc->block_size & (dmc->block_size - 1))) {
1502 + ti->error = "dm-cache: Invalid block size";
1507 + dmc->block_size = DEFAULT_BLOCK_SIZE;
1508 + dmc->block_shift = ffs(dmc->block_size) - 1;
1509 + dmc->block_mask = dmc->block_size - 1;
1512 + if (sscanf(argv[4], "%llu", &cache_size) != 1) {
1513 + ti->error = "dm-cache: Invalid cache size";
1517 + dmc->size = (sector_t) cache_size;
1518 + if (!dmc->size || (dmc->size & (dmc->size - 1))) {
1519 + ti->error = "dm-cache: Invalid cache size";
1524 + dmc->size = DEFAULT_CACHE_SIZE;
1525 + localsize = dmc->size;
1526 + dmc->bits = ffs(dmc->size) - 1;
1529 + if (sscanf(argv[5], "%u", &dmc->assoc) != 1) {
1530 + ti->error = "dm-cache: Invalid cache associativity";
1534 + if (!dmc->assoc || (dmc->assoc & (dmc->assoc - 1)) ||
1535 + dmc->size < dmc->assoc) {
1536 + ti->error = "dm-cache: Invalid cache associativity";
1541 + dmc->assoc = DEFAULT_CACHE_ASSOC;
1543 + DMINFO("%lld", dmc->cache_dev->bdev->bd_inode->i_size);
1544 + dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9;
1545 + data_size = dmc->size * dmc->block_size;
1546 + meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512) + 1;
1547 + if ((data_size + meta_size) > dev_size) {
1548 + DMERR("Requested cache size exeeds the cache device's capacity" \
1549 + "(%llu+%llu>%llu)",
1550 + (unsigned long long) data_size, (unsigned long long) meta_size,
1551 + (unsigned long long) dev_size);
1552 + ti->error = "dm-cache: Invalid cache size";
1556 + consecutive_blocks = dmc->assoc < CONSECUTIVE_BLOCKS ?
1557 + dmc->assoc : CONSECUTIVE_BLOCKS;
1558 + dmc->consecutive_shift = ffs(consecutive_blocks) - 1;
1561 + if (sscanf(argv[6], "%u", &dmc->write_policy) != 1) {
1562 + ti->error = "dm-cache: Invalid cache write policy";
1566 + if (dmc->write_policy != 0 && dmc->write_policy != 1) {
1567 + ti->error = "dm-cache: Invalid cache write policy";
1572 + dmc->write_policy = DEFAULT_WRITE_POLICY;
1574 + order = dmc->size * sizeof(struct cacheblock);
1575 + localsize = data_size >> 11;
1576 + DMINFO("Allocate %lluKB (%luB per) mem for %llu-entry cache" \
1577 + "(capacity:%lluMB, associativity:%u, block size:%u " \
1578 + "sectors(%uKB), %s)",
1579 + (unsigned long long) order >> 10, (unsigned long) sizeof(struct cacheblock),
1580 + (unsigned long long) dmc->size,
1581 + (unsigned long long) data_size >> (20-SECTOR_SHIFT),
1582 + dmc->assoc, dmc->block_size,
1583 + dmc->block_size >> (10-SECTOR_SHIFT),
1584 + dmc->write_policy ? "write-back" : "write-through");
1586 + dmc->cache = (struct cacheblock *)vmalloc(order);
1587 + if (!dmc->cache) {
1588 + ti->error = "Unable to allocate memory";
1593 +init: /* Initialize the cache structs */
1594 + for (i=0; i<dmc->size; i++) {
1595 + bio_list_init(&dmc->cache[i].bios);
1596 + if(!persistence) dmc->cache[i].state = 0;
1597 + dmc->cache[i].counter = 0;
1598 + spin_lock_init(&dmc->cache[i].lock);
1602 + dmc->dirty_blocks = 0;
1605 + dmc->cache_hits = 0;
1607 + dmc->writeback = 0;
1610 + ti->split_io = dmc->block_size;
1611 + ti->private = dmc;
1615 + kcached_client_destroy(dmc);
1617 + kcopyd_client_destroy(dmc->kcp_client);
1619 + dm_put_device(ti, dmc->cache_dev);
1621 + dm_put_device(ti, dmc->src_dev);
1629 +static void cache_flush(struct cache_c *dmc)
1631 + struct cacheblock *cache = dmc->cache;
1635 + DMINFO("Flush dirty blocks (%llu) ...", (unsigned long long) dmc->dirty_blocks);
1636 + while (i< dmc->size) {
1638 + if (is_state(cache[i].state, DIRTY)) {
1639 + while ((i+j) < dmc->size && is_state(cache[i+j].state, DIRTY)
1640 + && (cache[i+j].block == cache[i].block + j *
1641 + dmc->block_size)) {
1645 + write_back(dmc, i, j);
1652 + * Destroy the cache mapping.
1654 +static void cache_dtr(struct dm_target *ti)
1656 + struct cache_c *dmc = (struct cache_c *) ti->private;
1658 + if (dmc->dirty_blocks > 0) cache_flush(dmc);
1660 + kcached_client_destroy(dmc);
1662 + kcopyd_client_destroy(dmc->kcp_client);
1664 + if (dmc->reads + dmc->writes > 0)
1665 + DMINFO("stats: reads(%lu), writes(%lu), cache hits(%lu, 0.%lu)," \
1666 + "replacement(%lu), replaced dirty blocks(%lu), " \
1667 + "flushed dirty blocks(%lu)",
1668 + dmc->reads, dmc->writes, dmc->cache_hits,
1669 + dmc->cache_hits * 100 / (dmc->reads + dmc->writes),
1670 + dmc->replace, dmc->writeback, dmc->dirty);
1672 + dump_metadata(dmc); /* Always dump metadata to disk before exit */
1673 + vfree((void *)dmc->cache);
1675 + dm_put_device(ti, dmc->src_dev);
1676 + dm_put_device(ti, dmc->cache_dev);
1681 + * Report cache status:
1682 + * Output cache stats upon request of device status;
1683 + * Output cache configuration upon request of table status.
1685 +static int cache_status(struct dm_target *ti, status_type_t type,
1686 + char *result, unsigned int maxlen)
1688 + struct cache_c *dmc = (struct cache_c *) ti->private;
1692 + case STATUSTYPE_INFO:
1693 + DMEMIT("stats: reads(%lu), writes(%lu), cache hits(%lu, 0.%lu)," \
1694 + "replacement(%lu), replaced dirty blocks(%lu)",
1695 + dmc->reads, dmc->writes, dmc->cache_hits,
1696 + dmc->cache_hits * 100 / (dmc->reads + dmc->writes),
1697 + dmc->replace, dmc->writeback);
1699 + case STATUSTYPE_TABLE:
1700 + DMEMIT("conf: capacity(%lluM), associativity(%u), block size(%uK), %s",
1701 + (unsigned long long) dmc->size * dmc->block_size >> 11,
1702 + dmc->assoc, dmc->block_size>>(10-SECTOR_SHIFT),
1703 + dmc->write_policy ? "write-back":"write-through");
1710 +/****************************************************************************
1711 + * Functions for manipulating a cache target.
1712 + ****************************************************************************/
1714 +static struct target_type cache_target = {
1716 + .version= {1, 0, 1},
1717 + .module = THIS_MODULE,
1721 + .status = cache_status,
1725 + * Initialize the cache target.
1727 +int __init dm_cache_init(void)
1735 + _kcached_wq = create_singlethread_workqueue("kcached");
1736 + if (!_kcached_wq) {
1737 + DMERR("failed to start kcached");
1740 + INIT_WORK(&_kcached_work, do_work);
1742 + r = dm_register_target(&cache_target);
1744 + DMERR("cache: register failed %d", r);
1745 + destroy_workqueue(_kcached_wq);
1752 + * Destroy a cache target.
1754 +void dm_cache_exit(void)
1756 + int r = dm_unregister_target(&cache_target);
1759 + DMERR("cache: unregister failed %d", r);
1762 + destroy_workqueue(_kcached_wq);
1765 +module_init(dm_cache_init);
1766 +module_exit(dm_cache_exit);
1768 +MODULE_DESCRIPTION(DM_NAME " cache target");
1769 +MODULE_AUTHOR("Ming Zhao <mingzhao99th@gmail.com>");
1770 +MODULE_LICENSE("GPL");
1771 diff -Naur linux-2.6.21.7-orig/drivers/md/Kconfig linux-2.6.21.7-dmcache/drivers/md/Kconfig
1772 --- linux-2.6.21.7-orig/drivers/md/Kconfig 2007-08-04 12:11:13.000000000 -0400
1773 +++ linux-2.6.21.7-dmcache/drivers/md/Kconfig 2007-08-23 14:16:07.000000000 -0400
1774 @@ -262,6 +262,12 @@
1776 Multipath support for EMC CX/AX series hardware.
1779 + tristate "Cache target support (EXPERIMENTAL)"
1780 + depends on BLK_DEV_DM && EXPERIMENTAL
1782 + Support for a generic cache target for device-mapper.
1787 diff -Naur linux-2.6.21.7-orig/drivers/md/Makefile linux-2.6.21.7-dmcache/drivers/md/Makefile
1788 --- linux-2.6.21.7-orig/drivers/md/Makefile 2007-08-04 12:11:13.000000000 -0400
1789 +++ linux-2.6.21.7-dmcache/drivers/md/Makefile 2007-08-23 14:16:25.000000000 -0400
1791 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
1792 obj-$(CONFIG_DM_MIRROR) += dm-mirror.o
1793 obj-$(CONFIG_DM_ZERO) += dm-zero.o
1794 +obj-$(CONFIG_DM_CACHE) += dm-cache.o
1796 quiet_cmd_unroll = UNROLL $@
1797 cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \