1 diff -Naur linux-2.6.21.7-orig/drivers/md/dm-cache.c linux-2.6.21.7-dmcache/drivers/md/dm-cache.c
2 --- linux-2.6.21.7-orig/drivers/md/dm-cache.c 1969-12-31 19:00:00.000000000 -0500
3 +++ linux-2.6.21.7-dmcache/drivers/md/dm-cache.c 2007-08-23 14:10:58.000000000 -0400
5 +/****************************************************************************
7 + * Device mapper target for block-level disk caching
9 + * Copyright (C) International Business Machines Corp., 2006
10 + * Author: Ming Zhao (mingzhao@ufl.edu)
12 + * This program is free software; you can redistribute it and/or modify
13 + * it under the terms of the GNU General Public License as published by
14 + * the Free Software Foundation; under version 2 of the License.
16 + * This program is distributed in the hope that it will be useful,
17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 + * GNU General Public License for more details.
21 + * You should have received a copy of the GNU General Public License
22 + * along with this program; if not, write to the Free Software
23 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 + ****************************************************************************/
27 +#include <asm/atomic.h>
28 +#include <asm/checksum.h>
29 +#include <linux/module.h>
30 +#include <linux/init.h>
31 +#include <linux/list.h>
32 +#include <linux/blkdev.h>
33 +#include <linux/bio.h>
34 +#include <linux/slab.h>
35 +#include <linux/hash.h>
36 +#include <linux/spinlock.h>
37 +#include <linux/workqueue.h>
38 +#include <linux/pagemap.h>
42 +#include "dm-bio-list.h"
47 +#define DM_MSG_PREFIX "cache"
48 +#define DMC_PREFIX "dm-cache: "
51 +#define DPRINTK( s, arg... ) printk(DMC_PREFIX s "\n", ##arg)
53 +#define DPRINTK( s, arg... )
56 +/* Default cache parameters */
57 +#define DEFAULT_CACHE_SIZE 65536
58 +#define DEFAULT_CACHE_ASSOC 1024
59 +#define DEFAULT_BLOCK_SIZE 8
60 +#define CONSECUTIVE_BLOCKS 512
63 +#define WRITE_THROUGH 0
65 +#define DEFAULT_WRITE_POLICY WRITE_THROUGH
67 +/* Number of pages for I/O */
68 +#define DMCACHE_COPY_PAGES 1024
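For orientation, here is the geometry these defaults imply, computed the same way cache_ctr() later derives it (ffs()-based shifts). This is an editorial userspace sketch, not part of the patch:

#include <stdio.h>
#include <strings.h>

int main(void)
{
	unsigned int block_size = 8;      /* DEFAULT_BLOCK_SIZE, in sectors   */
	unsigned long long size = 65536;  /* DEFAULT_CACHE_SIZE, in blocks    */
	unsigned int assoc = 1024;        /* DEFAULT_CACHE_ASSOC              */
	unsigned int consecutive = assoc < 512 ? assoc : 512; /* CONSECUTIVE_BLOCKS */

	unsigned int block_shift = ffs(block_size) - 1;        /* 3  */
	unsigned int block_mask = block_size - 1;               /* 7  */
	unsigned int bits = ffs((int)size) - 1;                  /* 16 */
	unsigned int consecutive_shift = ffs(consecutive) - 1;   /* 9  */

	printf("sets: %llu, capacity: %llu MB, block: %u KB\n",
	       size / assoc,                      /* 64 sets of 1024 frames   */
	       size * block_size * 512 >> 20,     /* 256 MB of cached data    */
	       block_size * 512 / 1024);          /* 4 KB per cache block     */
	printf("block_shift=%u block_mask=0x%x bits=%u consecutive_shift=%u\n",
	       block_shift, block_mask, bits, consecutive_shift);
	return 0;
}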
70 +/* States of a cache block */
72 +#define VALID 1 /* Valid */
73 +#define RESERVED 2 /* Allocated but data not in place yet */
74 +#define DIRTY 4 /* Locally modified */
75 +#define WRITEBACK 8 /* In the process of write back */
77 +#define is_state(x, y) (x & y)
78 +#define set_state(x, y) (x |= y)
79 +#define clear_state(x, y) (x &= ~y)
85 + struct dm_dev *src_dev; /* Source device */
86 + struct dm_dev *cache_dev; /* Cache device */
87 + struct kcopyd_client *kcp_client; /* Kcopyd client for writing back data */
89 + struct cacheblock *cache; /* Hash table for cache blocks */
90 + sector_t size; /* Cache size */
91 + unsigned int bits; /* Cache size in bits */
92 + unsigned int assoc; /* Cache associativity */
93 + unsigned int block_size; /* Cache block size */
94 + unsigned int block_shift; /* Cache block size in bits */
95 + unsigned int block_mask; /* Cache block mask */
96 + unsigned int consecutive_shift; /* Consecutive blocks size in bits */
97 + unsigned long counter; /* Logical timestamp of last access */
98 + unsigned int write_policy; /* Cache write policy */
99 + sector_t dirty_blocks; /* Number of dirty blocks */
101 + spinlock_t lock; /* Lock to protect page allocation/deallocation */
102 + struct page_list *pages; /* Pages for I/O */
103 + unsigned int nr_pages; /* Number of pages */
104 + unsigned int nr_free_pages; /* Number of free pages */
105 + wait_queue_head_t destroyq; /* Wait queue for I/O completion */
106 + atomic_t nr_jobs; /* Number of I/O jobs */
108 + unsigned long reads; /* Number of reads */
109 + unsigned long writes; /* Number of writes */
110 + unsigned long cache_hits; /* Number of cache hits */
111 + unsigned long replace; /* Number of cache replacements */
112 + unsigned long writeback; /* Number of replaced dirty blocks */
113 + unsigned long dirty; /* Number of submitted dirty blocks */
116 +/* Cache block metadata structure */
118 + spinlock_t lock; /* Lock to protect operations on the bio list */
119 + sector_t block; /* Sector number of the cached block */
120 + unsigned short state; /* State of a block */
121 + unsigned long counter; /* Logical timestamp of the block's last access */
122 + struct bio_list bios; /* List of pending bios */
126 +/****************************************************************************
127 + * Functions and data structures for implementing a kcached to handle async
128 + * I/O. Code for page and queue handling is borrowed from kcopyd.c.
129 + ****************************************************************************/
132 + * Functions for handling pages used by async I/O.
133 + * The data requested by a bio may not be aligned with cache blocks, in
134 + * which case additional pages are required to pad the request that is
135 + * forwarded to the source device. A pool of pages is reserved for this purpose.
138 +static struct page_list *alloc_pl(void)
140 + struct page_list *pl;
142 + pl = kmalloc(sizeof(*pl), GFP_KERNEL);
146 + pl->page = alloc_page(GFP_KERNEL);
155 +static void free_pl(struct page_list *pl)
157 + __free_page(pl->page);
161 +static void drop_pages(struct page_list *pl)
163 + struct page_list *next;
172 +static int kcached_get_pages(struct cache_c *dmc, unsigned int nr,
173 + struct page_list **pages)
175 + struct page_list *pl;
177 + spin_lock(&dmc->lock);
178 + if (dmc->nr_free_pages < nr) {
179 + DPRINTK("kcached_get_pages: No free pages: %u<%u",
180 + dmc->nr_free_pages, nr);
181 + spin_unlock(&dmc->lock);
185 + dmc->nr_free_pages -= nr;
186 + for (*pages = pl = dmc->pages; --nr; pl = pl->next)
189 + dmc->pages = pl->next;
192 + spin_unlock(&dmc->lock);
197 +static void kcached_put_pages(struct cache_c *dmc, struct page_list *pl)
199 + struct page_list *cursor;
201 + spin_lock(&dmc->lock);
202 + for (cursor = pl; cursor->next; cursor = cursor->next)
203 + dmc->nr_free_pages++;
205 + dmc->nr_free_pages++;
206 + cursor->next = dmc->pages;
209 + spin_unlock(&dmc->lock);
212 +static int alloc_bio_pages(struct cache_c *dmc, unsigned int nr)
215 + struct page_list *pl = NULL, *next;
217 + for (i = 0; i < nr; i++) {
228 + kcached_put_pages(dmc, pl);
229 + dmc->nr_pages += nr;
234 +static void free_bio_pages(struct cache_c *dmc)
236 + BUG_ON(dmc->nr_free_pages != dmc->nr_pages);
237 + drop_pages(dmc->pages);
239 + dmc->nr_free_pages = dmc->nr_pages = 0;
242 +/* Structure for a kcached job */
243 +struct kcached_job {
244 + struct list_head list;
245 + struct cache_c *dmc;
246 + struct bio *bio; /* Original bio */
247 + struct io_region src;
248 + struct io_region dest;
249 + struct cacheblock *cacheblock;
252 + * When the original bio is not aligned with cache blocks,
253 + * we need extra bvecs and pages for padding.
255 + struct bio_vec *bvec;
256 + unsigned int nr_pages;
257 + struct page_list *pages;
260 +static struct workqueue_struct *_kcached_wq;
261 +static struct work_struct _kcached_work;
263 +static inline void wake(void)
265 + queue_work(_kcached_wq, &_kcached_work);
268 +#define MIN_JOBS 1024
270 +static struct kmem_cache *_job_cache;
271 +static mempool_t *_job_pool;
273 +static DEFINE_SPINLOCK(_job_lock);
275 +static LIST_HEAD(_complete_jobs);
276 +static LIST_HEAD(_io_jobs);
277 +static LIST_HEAD(_pages_jobs);
279 +static int jobs_init(void)
281 + _job_cache = kmem_cache_create("kcached-jobs",
282 + sizeof(struct kcached_job),
283 + __alignof__(struct kcached_job),
288 + _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
289 + mempool_free_slab, _job_cache);
291 + kmem_cache_destroy(_job_cache);
298 +static void jobs_exit(void)
300 + BUG_ON(!list_empty(&_complete_jobs));
301 + BUG_ON(!list_empty(&_io_jobs));
302 + BUG_ON(!list_empty(&_pages_jobs));
304 + mempool_destroy(_job_pool);
305 + kmem_cache_destroy(_job_cache);
311 + * Functions to push and pop a job onto the head of a given job list.
313 +static inline struct kcached_job *pop(struct list_head *jobs)
315 + struct kcached_job *job = NULL;
316 + unsigned long flags;
318 + spin_lock_irqsave(&_job_lock, flags);
320 + if (!list_empty(jobs)) {
321 + job = list_entry(jobs->next, struct kcached_job, list);
322 + list_del(&job->list);
324 + spin_unlock_irqrestore(&_job_lock, flags);
329 +static inline void push(struct list_head *jobs, struct kcached_job *job)
331 + unsigned long flags;
333 + spin_lock_irqsave(&_job_lock, flags);
334 + list_add_tail(&job->list, jobs);
335 + spin_unlock_irqrestore(&_job_lock, flags);
339 +/****************************************************************************
340 + * Functions for asynchronously fetching data from source device and storing
341 + * data in cache device. Because the requested data may not align with the
342 + * cache blocks, extra handling is required to pad a block request and extract
343 + * the requested data from the results.
344 + ****************************************************************************/
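A small worked example (editorial, not part of the patch) of the head/tail padding that do_fetch() and do_store() below compute for a request that is not aligned to a cache block; to_bytes() in the driver converts sectors to bytes (<< 9):

#include <stdio.h>

int main(void)
{
	unsigned int block_size = 8;            /* cache block, in sectors        */
	unsigned int block_mask = block_size - 1;
	unsigned long long bi_sector = 21;      /* request start sector (example) */
	unsigned int bi_size = 1024;            /* request length in bytes        */

	unsigned int offset = (unsigned int)(bi_sector & block_mask); /* 5 sectors  */
	unsigned int head = offset << 9;                               /* 2560 bytes */
	unsigned int tail = (block_size << 9) - bi_size - head;        /* 512 bytes  */

	/* Both head and tail are non-zero here, so extra pages must be
	 * allocated to pad the block request on both sides. */
	printf("offset=%u sectors, head=%u bytes, tail=%u bytes\n", offset, head, tail);
	return 0;
}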
346 +static void io_callback(unsigned long error, void *context)
348 + struct kcached_job *job = (struct kcached_job *) context;
352 + DMERR("io_callback: io error");
356 + if (job->rw == READ) {
358 + push(&_io_jobs, job);
360 + push(&_complete_jobs, job);
365 + * Fetch data from the source device asynchronously.
366 + * For a READ bio, if a cache block is larger than the requested data, then
367 + * additional data are prefetched. Larger cache block size enables more
368 + * aggressive read prefetching, which is useful for read-mostly usage.
369 + * For a WRITE bio, if a cache block is larger than the requested data, the
370 + * entire block needs to be fetched, and larger block size incurs more overhead.
371 + * In scenarios where writes are frequent, 4KB is a good cache block size.
373 +static int do_fetch(struct kcached_job *job)
376 + struct bio *bio = job->bio;
377 + struct cache_c *dmc = job->dmc;
378 + unsigned int offset, head, tail, remaining, nr_vecs, idx = 0;
379 + struct bio_vec *bvec;
380 + struct page_list *pl;
382 + offset = (unsigned int) (bio->bi_sector & dmc->block_mask);
383 + head = to_bytes(offset);
384 + tail = to_bytes(dmc->block_size) - bio->bi_size - head;
386 + DPRINTK("do_fetch: %llu(%llu->%llu,%llu), head:%u,tail:%u",
387 + bio->bi_sector, job->src.sector, job->dest.sector,
388 + job->src.count, head, tail);
390 + if (bio_data_dir(bio) == READ) { /* The original request is a READ */
391 + if (0 == job->nr_pages) { /* The request is aligned to cache block */
392 + r = dm_io_async_bvec(1, &job->src, READ,
393 + bio->bi_io_vec + bio->bi_idx,
398 + nr_vecs = bio->bi_vcnt - bio->bi_idx + job->nr_pages;
399 + bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_NOIO);
401 + DMERR("do_fetch: No memory");
408 + bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE);
409 + bvec[i].bv_offset = 0;
410 + bvec[i].bv_page = pl->page;
411 + head -= bvec[i].bv_len;
416 + remaining = bio->bi_size;
418 + while (remaining) {
419 + bvec[i] = bio->bi_io_vec[j];
420 + remaining -= bvec[i].bv_len;
425 + bvec[i].bv_len = min(tail, (unsigned int)PAGE_SIZE);
426 + bvec[i].bv_offset = 0;
427 + bvec[i].bv_page = pl->page;
428 + tail -= bvec[i].bv_len;
434 + r = dm_io_async_bvec(1, &job->src, READ, job->bvec, io_callback, job);
436 + } else { /* The original request is a WRITE */
439 + if (head && tail) { /* Special case */
440 + bvec = kmalloc(job->nr_pages * sizeof(*bvec), GFP_KERNEL);
442 + DMERR("do_fetch: No memory");
445 + for (i=0; i<job->nr_pages; i++) {
446 + bvec[i].bv_len = PAGE_SIZE;
447 + bvec[i].bv_offset = 0;
448 + bvec[i].bv_page = pl->page;
452 + r = dm_io_async_bvec(1, &job->src, READ, job->bvec,
457 + bvec = kmalloc((job->nr_pages + bio->bi_vcnt - bio->bi_idx)
458 + * sizeof(*bvec), GFP_KERNEL);
460 + DMERR("do_fetch: No memory");
466 + bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE);
467 + bvec[i].bv_offset = 0;
468 + bvec[i].bv_page = pl->page;
469 + head -= bvec[i].bv_len;
474 + remaining = bio->bi_size;
476 + while (remaining) {
477 + bvec[i] = bio->bi_io_vec[j];
478 + remaining -= bvec[i].bv_len;
484 + bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) &
486 + bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset;
487 + bvec[i].bv_page = pl->page;
488 + tail -= bvec[i].bv_len;
489 + pl = pl->next; i++;
491 + bvec[i].bv_len = PAGE_SIZE;
492 + bvec[i].bv_offset = 0;
493 + bvec[i].bv_page = pl->page;
494 + tail -= bvec[i].bv_len;
495 + pl = pl->next; i++;
500 + r = dm_io_async_bvec(1, &job->src, READ, job->bvec + idx,
508 + * Store data to the cache device asynchronously.
509 + * For a READ bio request, the data fetched from the source device are returned
510 + * to the kernel and stored in the cache at the same time.
511 + * For a WRITE bio request, the data are written to the cache and source device
512 + * at the same time.
514 +static int do_store(struct kcached_job *job)
517 + struct bio *bio = job->bio, *clone;
518 + struct cache_c *dmc = job->dmc;
519 + unsigned int offset, head, tail, remaining, nr_vecs;
520 + struct bio_vec *bvec;
522 + offset = (unsigned int) (bio->bi_sector & dmc->block_mask);
523 + head = to_bytes(offset);
524 + tail = to_bytes(dmc->block_size) - bio->bi_size - head;
526 + DPRINTK("do_store: %llu(%llu->%llu,%llu), head:%u,tail:%u",
527 + bio->bi_sector, job->src.sector, job->dest.sector,
528 + job->src.count, head, tail);
530 + /* A READ is acknowledged as soon as the requested data is fetched, and
531 + does not have to wait for it to be stored in the cache. The bio is
532 + cloned so that the original one can be ended here. But to avoid copying
533 + pages, we reuse the pages allocated for the original bio, and mark
534 + each of them to prevent the pages from being freed before the cache
535 + insertion is completed.
537 + if (bio_data_dir(bio) == READ) {
538 + clone = bio_clone(bio, GFP_NOIO);
539 + for (i=bio->bi_idx; i<bio->bi_vcnt; i++) {
540 + get_page(bio->bi_io_vec[i].bv_page);
542 + DPRINTK("bio ended for %llu:%u", bio->bi_sector, bio->bi_size);
543 + bio_endio(bio, bio->bi_size, 0);
548 + if (0 == job->nr_pages) /* Original request is aligned with cache blocks */
549 + r = dm_io_async_bvec(1, &job->dest, WRITE, bio->bi_io_vec + bio->bi_idx,
552 + if (bio_data_dir(bio) == WRITE && head > 0 && tail > 0) {
553 + DPRINTK("Special case: %lu %u %u", bio_data_dir(bio), head, tail);
554 + nr_vecs = job->nr_pages + bio->bi_vcnt - bio->bi_idx;
555 + if (offset && (offset + bio->bi_size < PAGE_SIZE)) nr_vecs++;
556 + DPRINTK("Create %u new vecs", nr_vecs);
557 + bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_KERNEL);
559 + DMERR("do_store: No memory");
565 + bvec[i].bv_len = min(head, job->bvec[i].bv_len);
566 + bvec[i].bv_offset = 0;
567 + bvec[i].bv_page = job->bvec[i].bv_page;
568 + head -= bvec[i].bv_len;
571 + remaining = bio->bi_size;
573 + while (remaining) {
574 + bvec[i] = bio->bi_io_vec[j];
575 + remaining -= bvec[i].bv_len;
578 + j = (to_bytes(offset) + bio->bi_size) / PAGE_SIZE;
579 + bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) -
581 + bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset;
582 + bvec[i].bv_page = job->bvec[j].bv_page;
583 + tail -= bvec[i].bv_len;
586 + bvec[i] = job->bvec[j];
587 + tail -= bvec[i].bv_len;
594 + r = dm_io_async_bvec(1, &job->dest, WRITE, job->bvec, io_callback, job);
600 +static int do_io(struct kcached_job *job)
604 + if (job->rw == READ) { /* Read from source device */
606 + } else { /* Write to cache device */
613 +static int do_pages(struct kcached_job *job)
617 + r = kcached_get_pages(job->dmc, job->nr_pages, &job->pages);
619 + if (r == -ENOMEM) /* can't complete now */
622 + /* this job is ready for io */
623 + push(&_io_jobs, job);
628 + * Flush the bios that are waiting for this cache insertion or write back.
630 +static void flush_bios(struct cacheblock *cacheblock)
635 + spin_lock(&cacheblock->lock);
636 + bio = bio_list_get(&cacheblock->bios);
637 + if (is_state(cacheblock->state, WRITEBACK)) { /* Write back finished */
638 + cacheblock->state = VALID;
639 + } else { /* Cache insertion finished */
640 + set_state(cacheblock->state, VALID);
641 + clear_state(cacheblock->state, RESERVED);
643 + spin_unlock(&cacheblock->lock);
647 + bio->bi_next = NULL;
648 + DPRINTK("Flush bio: %llu->%llu (%u bytes)",
649 + cacheblock->block, bio->bi_sector, bio->bi_size);
650 + generic_make_request(bio);
655 +static int do_complete(struct kcached_job *job)
658 + struct bio *bio = job->bio;
660 + DPRINTK("do_complete: %llu", bio->bi_sector);
662 + if (bio_data_dir(bio) == READ) {
663 + for (i=bio->bi_idx; i<bio->bi_vcnt; i++) {
664 + put_page(bio->bi_io_vec[i].bv_page);
668 + bio_endio(bio, bio->bi_size, 0);
670 + if (job->nr_pages > 0) {
672 + kcached_put_pages(job->dmc, job->pages);
675 + flush_bios(job->cacheblock);
676 + mempool_free(job, _job_pool);
678 + if (atomic_dec_and_test(&job->dmc->nr_jobs))
679 + wake_up(&job->dmc->destroyq);
685 + * Run through a list for as long as possible. Returns the count
686 + * of successful jobs.
688 +static int process_jobs(struct list_head *jobs,
689 + int (*fn) (struct kcached_job *))
691 + struct kcached_job *job;
694 + while ((job = pop(jobs))) {
698 + /* error this rogue job */
699 + DMERR("process_jobs: Job processing error");
704 + * We couldn't service this job ATM, so
705 + * push this job back onto the list.
717 +static void do_work(struct work_struct *ignored)
719 + process_jobs(&_complete_jobs, do_complete);
720 + process_jobs(&_pages_jobs, do_pages);
721 + process_jobs(&_io_jobs, do_io);
724 +static void queue_job(struct kcached_job *job)
726 + atomic_inc(&job->dmc->nr_jobs);
727 + if (job->nr_pages > 0) /* Request pages */
728 + push(&_pages_jobs, job);
729 + else /* Go ahead to do I/O */
730 + push(&_io_jobs, job);
734 +static int kcached_init(struct cache_c *dmc)
738 + spin_lock_init(&dmc->lock);
740 + dmc->nr_pages = dmc->nr_free_pages = 0;
741 + r = alloc_bio_pages(dmc, DMCACHE_COPY_PAGES);
743 + DMERR("kcached_init: Could not allocate bio pages");
747 + r = dm_io_get(DMCACHE_COPY_PAGES);
749 + DMERR("kcached_init: Could not resize dm io pool");
750 + free_bio_pages(dmc);
754 + init_waitqueue_head(&dmc->destroyq);
755 + atomic_set(&dmc->nr_jobs, 0);
760 +void kcached_client_destroy(struct cache_c *dmc)
762 + /* Wait for completion of all jobs submitted by this client. */
763 + wait_event(dmc->destroyq, !atomic_read(&dmc->nr_jobs));
765 + dm_io_put(dmc->nr_pages);
766 + free_bio_pages(dmc);
770 +/****************************************************************************
771 + * Functions for writing back dirty blocks.
772 + * We leverage kcopyd to write back dirty blocks because it is convenient to
773 + * use and it is not reasonable to reimplement the same function here. But we
774 + * need to reserve pages for both kcached and kcopyd. TODO: dynamically change
775 + * the number of reserved pages.
776 + ****************************************************************************/
778 +static void copy_callback(int read_err, unsigned int write_err, void *context)
780 + struct cacheblock *cacheblock = (struct cacheblock *) context;
782 + flush_bios(cacheblock);
785 +static void copy_block(struct cache_c *dmc, struct io_region src,
786 + struct io_region dest, struct cacheblock *cacheblock)
788 + DPRINTK("Copying: %llu:%llu->%llu:%llu",
789 + src.sector, src.count * 512, dest.sector, dest.count * 512);
790 + kcopyd_copy(dmc->kcp_client, &src, 1, &dest, 0, copy_callback, cacheblock);
793 +static void write_back(struct cache_c *dmc, sector_t index, unsigned int length)
795 + struct io_region src, dest;
796 + struct cacheblock *cacheblock = &dmc->cache[index];
799 + DPRINTK("Write back block %llu(%llu, %u)",
800 + index, cacheblock->block, length);
801 + src.bdev = dmc->cache_dev->bdev;
802 + src.sector = index << dmc->block_shift;
803 + src.count = dmc->block_size * length;
804 + dest.bdev = dmc->src_dev->bdev;
805 + dest.sector = cacheblock->block;
806 + dest.count = dmc->block_size * length;
808 + for (i=0; i<length; i++)
809 + set_state(dmc->cache[index+i].state, WRITEBACK);
810 + dmc->dirty_blocks -= length;
811 + copy_block(dmc, src, dest, cacheblock);
815 +/****************************************************************************
816 + * Functions for implementing the various cache operations.
817 + ****************************************************************************/
820 + * Map a block from the source device to a block in the cache device.
822 +static unsigned long hash_block(struct cache_c *dmc, sector_t block)
824 + unsigned long set_number, value;
826 + value = (unsigned long)(block >> (dmc->block_shift +
827 + dmc->consecutive_shift));
828 + set_number = hash_long(value, dmc->bits) / dmc->assoc;
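In effect this is a set-associative index: drop the in-block offset and the consecutive-block run, hash what remains down to dmc->bits bits, then divide by the associativity to get the set number. A minimal userspace sketch with the default parameters; hash_long() is approximated here and the multiplicative constant is an assumption borrowed from kernels of this era, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

/* Approximation of the kernel's 64-bit hash_long(); constant is assumed. */
static unsigned long hash_long_approx(uint64_t val, unsigned int bits)
{
	return (unsigned long)((val * 0x9e37fffffffc0001ULL) >> (64 - bits));
}

int main(void)
{
	unsigned int block_shift = 3, consecutive_shift = 9; /* defaults          */
	unsigned int bits = 16, assoc = 1024;                /* 65536 cache blocks */
	uint64_t request_block = 1234560;                    /* block-aligned sector */

	uint64_t value = request_block >> (block_shift + consecutive_shift);
	unsigned long set = hash_long_approx(value, bits) / assoc;

	/* 65536 blocks / 1024-way associativity = 64 sets; the frames of this
	 * set start at index set * assoc in dmc->cache[]. */
	printf("request block %llu -> set %lu (frames %lu..%lu)\n",
	       (unsigned long long)request_block, set,
	       set * assoc, set * assoc + assoc - 1);
	return 0;
}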
834 + * Reset the LRU counters (the cache's global counter and each cache block's
835 + * counter). This seems to be a naive implementation. However, considering the
836 + * rarity of this event, it might be more efficient than other, more complex
837 + * schemes. TODO: a more elegant solution.
839 +static void cache_reset_counter(struct cache_c *dmc)
842 + struct cacheblock *cache = dmc->cache;
844 + DPRINTK("Reset LRU counters");
845 + for (i=0; i<dmc->size; i++)
846 + cache[i].counter = 0;
852 + * Lookup a block in the cache.
855 + * 1: cache hit (cache_block stores the index of the matched block)
856 + * 0: cache miss but frame is allocated for insertion; cache_block stores the
858 + * If there are empty frames, then the first one encountered is used.
859 + * If there are clean frames, then the LRU clean block is replaced.
860 + * 2: cache miss and frame is not allocated; cache_block stores the LRU dirty
862 + * This happens when the entire set is dirty.
863 + * -1: cache miss and no room for insertion:
864 + * This happens when the entire set is in transition modes (RESERVED or
868 +static int cache_lookup(struct cache_c *dmc, sector_t block,
869 + sector_t *cache_block)
871 + unsigned long set_number = hash_block(dmc, block);
874 + unsigned int cache_assoc = dmc->assoc;
875 + struct cacheblock *cache = dmc->cache;
876 + int invalid = -1, oldest = -1, oldest_clean = -1;
877 + unsigned long counter = ULONG_MAX, clean_counter = ULONG_MAX;
879 + index=set_number * cache_assoc;
881 + for (i=0; i<cache_assoc; i++, index++) {
882 + if (is_state(cache[index].state, VALID) ||
883 + is_state(cache[index].state, RESERVED)) {
884 + if (cache[index].block == block) {
885 + *cache_block = index;
886 + /* Reset all counters if the largest one is going to overflow */
887 + if (dmc->counter == ULONG_MAX) cache_reset_counter(dmc);
888 + cache[index].counter = ++dmc->counter;
891 + /* Don't consider blocks that are in the middle of copying */
892 + if (!is_state(cache[index].state, RESERVED) &&
893 + !is_state(cache[index].state, WRITEBACK)) {
894 + if (!is_state(cache[index].state, DIRTY) &&
895 + cache[index].counter < clean_counter) {
896 + clean_counter = cache[index].counter;
899 + if (cache[index].counter < counter) {
900 + counter = cache[index].counter;
906 + if (-1 == invalid) invalid = i;
910 + res = i < cache_assoc ? 1 : 0;
911 + if (!res) { /* Cache miss */
912 + if (invalid != -1) /* Choose the first empty frame */
913 + *cache_block = set_number * cache_assoc + invalid;
914 + else if (oldest_clean != -1) /* Choose the LRU clean block to replace */
915 + *cache_block = set_number * cache_assoc + oldest_clean;
916 + else if (oldest != -1) { /* Choose the LRU dirty block to evict */
918 + *cache_block = set_number * cache_assoc + oldest;
925 + DPRINTK("Cache lookup: Block %llu(%lu):%s",
926 + block, set_number, "NO ROOM");
928 + DPRINTK("Cache lookup: Block %llu(%lu):%llu(%s)",
929 + block, set_number, *cache_block,
930 + 1 == res ? "HIT" : (0 == res ? "MISS" : "WB NEEDED"));
935 + * Insert a block into the cache (in the frame specified by cache_block).
937 +static int cache_insert(struct cache_c *dmc, sector_t block,
938 + sector_t cache_block)
940 + struct cacheblock *cache = dmc->cache;
942 + /* Mark the block as RESERVED because although it is allocated, the data are
943 + not in place until kcopyd finishes its job.
945 + cache[cache_block].block = block;
946 + cache[cache_block].state = RESERVED;
947 + if (dmc->counter == ULONG_MAX) cache_reset_counter(dmc);
948 + cache[cache_block].counter = ++dmc->counter;
954 + * Invalidate a block (specified by cache_block) in the cache.
956 +static void cache_invalidate(struct cache_c *dmc, sector_t cache_block)
958 + struct cacheblock *cache = dmc->cache;
960 + DPRINTK("Cache invalidate: Block %llu(%llu)",
961 + cache_block, cache[cache_block].block);
962 + clear_state(cache[cache_block].state, VALID);
966 + * Handle a cache hit:
967 + * For READ, serve the request from the cache if the block is ready; otherwise,
968 + * queue the request for later processing.
969 + * For WRITE, invalidate the cache block if write-through. If write-back,
970 + * serve the request from the cache if the block is ready, or queue the request
971 + * for later processing otherwise.
973 +static int cache_hit(struct cache_c *dmc, struct bio* bio, sector_t cache_block)
975 + unsigned int offset = (unsigned int)(bio->bi_sector & dmc->block_mask);
976 + struct cacheblock *cache = dmc->cache;
980 + if (bio_data_dir(bio) == READ) { /* READ hit */
981 + bio->bi_bdev = dmc->cache_dev->bdev;
982 + bio->bi_sector = (cache_block << dmc->block_shift) + offset;
984 + spin_lock(&cache[cache_block].lock);
986 + if (is_state(cache[cache_block].state, VALID)) { /* Valid cache block */
987 + spin_unlock(&cache[cache_block].lock);
991 + /* Cache block is not ready yet */
992 + DPRINTK("Add to bio list %s(%llu)",
993 + dmc->cache_dev->name, bio->bi_sector);
994 + bio_list_add(&cache[cache_block].bios, bio);
996 + spin_unlock(&cache[cache_block].lock);
998 + } else { /* WRITE hit */
999 + if (dmc->write_policy == WRITE_THROUGH) { /* Invalidate cached data */
1000 + cache_invalidate(dmc, cache_block);
1001 + bio->bi_bdev = dmc->src_dev->bdev;
1006 + if (!is_state(cache[cache_block].state, DIRTY)) {
1007 + set_state(cache[cache_block].state, DIRTY);
1008 + dmc->dirty_blocks++;
1011 + spin_lock(&cache[cache_block].lock);
1013 + /* In the middle of write back */
1014 + if (is_state(cache[cache_block].state, WRITEBACK)) {
1015 + /* Delay this write until the block is written back */
1016 + bio->bi_bdev = dmc->src_dev->bdev;
1017 + DPRINTK("Add to bio list %s(%llu)",
1018 + dmc->src_dev->name, bio->bi_sector);
1019 + bio_list_add(&cache[cache_block].bios, bio);
1020 + spin_unlock(&cache[cache_block].lock);
1024 + /* Cache block not ready yet */
1025 + if (is_state(cache[cache_block].state, RESERVED)) {
1026 + bio->bi_bdev = dmc->cache_dev->bdev;
1027 + bio->bi_sector = (cache_block << dmc->block_shift) + offset;
1028 + DPRINTK("Add to bio list %s(%llu)",
1029 + dmc->cache_dev->name, bio->bi_sector);
1030 + bio_list_add(&cache[cache_block].bios, bio);
1031 + spin_unlock(&cache[cache_block].lock);
1035 + /* Serve the request from cache */
1036 + bio->bi_bdev = dmc->cache_dev->bdev;
1037 + bio->bi_sector = (cache_block << dmc->block_shift) + offset;
1039 + spin_unlock(&cache[cache_block].lock);
1044 +static struct kcached_job *new_kcached_job(struct cache_c *dmc, struct bio* bio,
1045 + sector_t request_block,
1046 + sector_t cache_block)
1048 + struct io_region src, dest;
1049 + struct kcached_job *job;
1051 + src.bdev = dmc->src_dev->bdev;
1052 + src.sector = request_block;
1053 + src.count = dmc->block_size;
1054 + dest.bdev = dmc->cache_dev->bdev;
1055 + dest.sector = cache_block << dmc->block_shift;
1056 + dest.count = src.count;
1058 + job = mempool_alloc(_job_pool, GFP_NOIO);
1063 + job->cacheblock = &dmc->cache[cache_block];
1069 + * Handle a read cache miss:
1070 + * Update the metadata; fetch the necessary block from source device;
1071 + * store data to cache device.
1073 +static int cache_read_miss(struct cache_c *dmc, struct bio* bio,
1074 + sector_t cache_block) {
1075 + struct cacheblock *cache = dmc->cache;
1076 + unsigned int offset, head, tail;
1077 + struct kcached_job *job;
1078 + sector_t request_block, left;
1080 + offset = (unsigned int)(bio->bi_sector & dmc->block_mask);
1081 + request_block = bio->bi_sector - offset;
1083 + if (cache[cache_block].state & VALID) {
1084 + DPRINTK("Replacing %llu->%llu",
1085 + cache[cache_block].block, request_block);
1087 + } else DPRINTK("Insert block %llu at empty frame %llu",
1088 + request_block, cache_block);
1090 + cache_insert(dmc, request_block, cache_block); /* Update metadata first */
1092 + job = new_kcached_job(dmc, bio, request_block, cache_block);
1094 + head = to_bytes(offset);
1096 + left = (dmc->src_dev->bdev->bd_inode->i_size>>9) - request_block;
1097 + if (left < dmc->block_size) {
1098 + tail = to_bytes(left) - bio->bi_size - head;
1099 + job->src.count = left;
1100 + job->dest.count = left;
1102 + tail = to_bytes(dmc->block_size) - bio->bi_size - head;
1104 + /* Requested block is aligned with a cache block */
1105 + if (0 == head && 0 == tail)
1107 + else /* Need new pages to store extra data */
1108 + job->nr_pages = dm_div_up(head, PAGE_SIZE) + dm_div_up(tail, PAGE_SIZE);
1109 + job->rw = READ; /* Fetch data from the source device */
1111 + DPRINTK("Queue job for %llu (need %u pages)",
1112 + bio->bi_sector, job->nr_pages);
1119 + * Handle a write cache miss:
1120 + * If write-through, forward the request to source device.
1121 + * If write-back, update the metadata; fetch the necessary block from source
1122 + * device; write to cache device.
1124 +static int cache_write_miss(struct cache_c *dmc, struct bio* bio, sector_t cache_block) {
1125 + struct cacheblock *cache = dmc->cache;
1126 + unsigned int offset, head, tail;
1127 + struct kcached_job *job;
1128 + sector_t request_block, left;
1130 + if (dmc->write_policy == WRITE_THROUGH) { /* Forward request to source */
1131 + bio->bi_bdev = dmc->src_dev->bdev;
1135 + offset = (unsigned int)(bio->bi_sector & dmc->block_mask);
1136 + request_block = bio->bi_sector - offset;
1138 + if (cache[cache_block].state & VALID) {
1139 + DPRINTK("Replacing %llu->%llu",
1140 + cache[cache_block].block, request_block);
1142 + } else DPRINTK("Insert block %llu at empty frame %llu",
1143 + request_block, cache_block);
1146 + cache_insert(dmc, request_block, cache_block); /* Update metadata first */
1147 + set_state(cache[cache_block].state, DIRTY);
1148 + dmc->dirty_blocks++;
1150 + job = new_kcached_job(dmc, bio, request_block, cache_block);
1152 + head = to_bytes(offset);
1153 + left = (dmc->src_dev->bdev->bd_inode->i_size>>9) - request_block;
1154 + if (left < dmc->block_size) {
1155 + tail = to_bytes(left) - bio->bi_size - head;
1156 + job->src.count = left;
1157 + job->dest.count = left;
1159 + tail = to_bytes(dmc->block_size) - bio->bi_size - head;
1161 + if (0 == head && 0 == tail) { /* Request is aligned with a cache block */
1162 + job->nr_pages = 0;
1164 + } else if (head && tail){ /* Special case: need to pad both head and tail */
1165 + job->nr_pages = dm_div_up(to_bytes(job->src.count), PAGE_SIZE);
1168 + if (head) { /* Fetch only head */
1169 + job->src.count = to_sector(head);
1170 + job->nr_pages = dm_div_up(head, PAGE_SIZE);
1171 + } else { /* Fetch only tail */
1172 + job->src.sector = bio->bi_sector + to_sector(bio->bi_size);
1173 + job->src.count = to_sector(tail);
1174 + job->nr_pages = dm_div_up(tail, PAGE_SIZE);
1184 +/* Handle cache misses */
1185 +static int cache_miss(struct cache_c *dmc, struct bio* bio, sector_t cache_block) {
1186 + if (bio_data_dir(bio) == READ)
1187 + return cache_read_miss(dmc, bio, cache_block);
1189 + return cache_write_miss(dmc, bio, cache_block);
1193 +/****************************************************************************
1194 + * Functions for implementing the operations on a cache mapping.
1195 + ****************************************************************************/
1198 + * Decide the mapping and perform necessary cache operations for a bio request.
1200 +static int cache_map(struct dm_target *ti, struct bio *bio,
1201 + union map_info *map_context)
1203 + struct cache_c *dmc = (struct cache_c *) ti->private;
1204 + sector_t request_block, cache_block = 0, offset;
1207 + offset = bio->bi_sector & dmc->block_mask;
1208 + request_block = bio->bi_sector - offset;
1210 + DPRINTK("Got a %s for %llu ((%llu:%llu), %u bytes)",
1211 + bio_rw(bio) == WRITE ? "WRITE" : (bio_rw(bio) == READ ?
1212 + "READ":"READA"), bio->bi_sector, request_block, offset,
1215 + if (bio_data_dir(bio) == READ) dmc->reads++;
1216 + else dmc->writes++;
1218 + res = cache_lookup(dmc, request_block, &cache_block);
1219 + if (1 == res) /* Cache hit; serve request from cache */
1220 + return cache_hit(dmc, bio, cache_block);
1221 + else if (0 == res) /* Cache miss; replacement block is found */
1222 + return cache_miss(dmc, bio, cache_block);
1223 + else if (2 == res) { /* Entire cache set is dirty; initiate a write-back */
1224 + write_back(dmc, cache_block, 1);
1228 + /* Forward to source device */
1229 + bio->bi_bdev = dmc->src_dev->bdev;
1236 + unsigned int block_size;
1237 + unsigned int assoc;
1238 + unsigned int write_policy;
1239 + unsigned int chksum;
1242 +/* Load metadata stored by previous session from disk. */
1243 +static int load_metadata(struct cache_c *dmc) {
1244 + struct io_region where;
1245 + unsigned long bits;
1246 + sector_t dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9;
1247 + sector_t meta_size, *meta_data, i, j, index = 0, limit, order;
1248 + struct meta_dmc *meta_dmc;
1249 + unsigned int chksum = 0, chksum_sav, consecutive_blocks;
1251 + meta_dmc = (struct meta_dmc *)vmalloc(512);
1253 + DMERR("load_metadata: Unable to allocate memory");
1257 + where.bdev = dmc->cache_dev->bdev;
1258 + where.sector = dev_size - 1;
1260 + dm_io_sync_vm(1, &where, READ, meta_dmc, &bits);
1261 + DPRINTK("Loaded cache conf: block size(%u), cache size(%llu), " \
1262 + "associativity(%u), write policy(%u), chksum(%u)",
1263 + meta_dmc->block_size, meta_dmc->size,
1264 + meta_dmc->assoc, meta_dmc->write_policy,
1265 + meta_dmc->chksum);
1267 + dmc->block_size = meta_dmc->block_size;
1268 + dmc->block_shift = ffs(dmc->block_size) - 1;
1269 + dmc->block_mask = dmc->block_size - 1;
1271 + dmc->size = meta_dmc->size;
1272 + dmc->bits = ffs(dmc->size) - 1;
1274 + dmc->assoc = meta_dmc->assoc;
1275 + consecutive_blocks = dmc->assoc < CONSECUTIVE_BLOCKS ?
1276 + dmc->assoc : CONSECUTIVE_BLOCKS;
1277 + dmc->consecutive_shift = ffs(consecutive_blocks) - 1;
1279 + dmc->write_policy = meta_dmc->write_policy;
1280 + chksum_sav = meta_dmc->chksum;
1282 + vfree((void *)meta_dmc);
1285 + order = dmc->size * sizeof(struct cacheblock);
1286 + DMINFO("Allocate %lluKB (%luB per) mem for %llu-entry cache" \
1287 + "(capacity:%lluMB, associativity:%u, block size:%u " \
1288 + "sectors(%uKB), %s)",
1289 + (unsigned long long) order >> 10, (unsigned long) sizeof(struct cacheblock),
1290 + (unsigned long long) dmc->size,
1291 + (unsigned long long) dmc->size * dmc->block_size >> (20-SECTOR_SHIFT),
1292 + dmc->assoc, dmc->block_size,
1293 + dmc->block_size >> (10-SECTOR_SHIFT),
1294 + dmc->write_policy ? "write-back" : "write-through");
1295 + dmc->cache = (struct cacheblock *)vmalloc(order);
1296 + if (!dmc->cache) {
1297 + DMERR("load_metadata: Unable to allocate memory");
1301 + meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512);
1302 + /* When requesting a new bio, the number of requested bvecs has to be
1303 + less than BIO_MAX_PAGES. Otherwise, null is returned. In dm-io.c,
1304 + this return value is not checked and kernel Oops may happen. We set
1305 + the limit here to avoid such situations. (2 additional bvecs are
1306 + required by dm-io for bookkeeping.)
1308 + limit = (BIO_MAX_PAGES - 2) * (PAGE_SIZE >> SECTOR_SHIFT);
1309 + meta_data = (sector_t *)vmalloc(to_bytes(min(meta_size, limit)));
1311 + DMERR("load_metadata: Unable to allocate memory");
1312 + vfree((void *)dmc->cache);
1316 + while(index < meta_size) {
1317 + where.sector = dev_size - 1 - meta_size + index;
1318 + where.count = min(meta_size - index, limit);
1319 + dm_io_sync_vm(1, &where, READ, meta_data, &bits);
1321 + for (i=to_bytes(index)/sizeof(sector_t), j=0;
1322 + j<to_bytes(where.count)/sizeof(sector_t) && i<dmc->size;
1324 + if(meta_data[j]) {
1325 + dmc->cache[i].block = meta_data[j];
1326 + dmc->cache[i].state = 1;
1328 + dmc->cache[i].state = 0;
1330 + chksum = csum_partial((char *)meta_data, to_bytes(where.count), chksum);
1331 + index += where.count;
1334 + vfree((void *)meta_data);
1336 + if (chksum != chksum_sav) { /* Check the checksum of the metadata */
1337 + DPRINTK("Cache metadata loaded from disk is corrupted");
1338 + vfree((void *)dmc->cache);
1342 + DMINFO("Cache metadata loaded from disk (offset %llu)",
1343 + (unsigned long long) dev_size - 1 - (unsigned long long) meta_size);
1348 +/* Store metadata onto disk. */
1349 +static int dump_metadata(struct cache_c *dmc) {
1350 + struct io_region where;
1351 + unsigned long bits;
1352 + sector_t dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9;
1353 + sector_t meta_size, i, j, index = 0, limit, *meta_data;
1354 + struct meta_dmc *meta_dmc;
1355 + unsigned int chksum = 0;
1357 + meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512);
1358 + limit = (BIO_MAX_PAGES - 2) * (PAGE_SIZE >> SECTOR_SHIFT);
1359 + meta_data = (sector_t *)vmalloc(to_bytes(min(meta_size, limit)));
1361 + DMERR("dump_metadata: Unable to allocate memory");
1365 + where.bdev = dmc->cache_dev->bdev;
1366 + while(index < meta_size) {
1367 + where.sector = dev_size - 1 - meta_size + index;
1368 + where.count = min(meta_size - index, limit);
1370 + for (i=to_bytes(index)/sizeof(sector_t), j=0;
1371 + j<to_bytes(where.count)/sizeof(sector_t) && i<dmc->size;
1373 + /* Assume all invalid cache blocks store 0. We lose the block that
1374 + * is actually mapped to offset 0.
1376 + meta_data[j] = dmc->cache[i].state ? dmc->cache[i].block : 0;
1378 + chksum = csum_partial((char *)meta_data, to_bytes(where.count), chksum);
1380 + dm_io_sync_vm(1, &where, WRITE, meta_data, &bits);
1381 + index += where.count;
1384 + vfree((void *)meta_data);
1386 + meta_dmc = (struct meta_dmc *)vmalloc(512);
1388 + DMERR("dump_metadata: Unable to allocate memory");
1392 + meta_dmc->block_size = dmc->block_size;
1393 + meta_dmc->size = dmc->size;
1394 + meta_dmc->assoc = dmc->assoc;
1395 + meta_dmc->write_policy = dmc->write_policy;
1396 + meta_dmc->chksum = chksum;
1398 + DPRINTK("Store metadata to disk: block size(%u), cache size(%llu), " \
1399 + "associativity(%u), write policy(%u), checksum(%u)",
1400 + meta_dmc->block_size, (unsigned long long) meta_dmc->size,
1401 + meta_dmc->assoc, meta_dmc->write_policy,
1402 + meta_dmc->chksum);
1404 + where.sector = dev_size - 1;
1406 + dm_io_sync_vm(1, &where, WRITE, meta_dmc, &bits);
1408 + vfree((void *)meta_dmc);
1410 + DMINFO("Cache metadata saved to disk (offset %llu)",
1411 + (unsigned long long) dev_size - 1 - (unsigned long long) meta_size);
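To put numbers on this layout: the per-block map occupies the last meta_size sectors of the cache device just before the final sector, which holds struct meta_dmc. An editorial sketch of the footprint for the default cache size, assuming an 8-byte sector_t:

#include <stdio.h>

int main(void)
{
	unsigned long long size = 65536;       /* cache blocks (default)          */
	unsigned long long entry = 8;          /* sizeof(sector_t), assumed 8 B   */
	unsigned long long meta_size = (size * entry + 511) / 512; /* dm_div_up   */

	/* 1024 sectors (512 KB) of block map, plus one superblock sector,
	 * all at the tail of the cache device. */
	printf("block map: %llu sectors (%llu KB) + 1 superblock sector\n",
	       meta_size, meta_size * 512 / 1024);
	return 0;
}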
1417 + * Construct a cache mapping.
1418 + * arg[0]: path to source device
1419 + * arg[1]: path to cache device
1420 + * arg[2]: cache persistence (if set, cache conf is loaded from disk)
1421 + * Cache configuration parameters (if not set, default values are used):
1422 + * arg[3]: cache block size (in sectors)
1423 + * arg[4]: cache size (in blocks)
1424 + * arg[5]: cache associativity
1425 + * arg[6]: write caching policy
1427 +static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1429 + struct cache_c *dmc;
1430 + unsigned int consecutive_blocks, persistence = 0;
1431 + sector_t localsize, i, order;
1432 + sector_t data_size, meta_size, dev_size;
1433 + unsigned long long cache_size;
1437 + ti->error = "dm-cache: Need at least 2 arguments (src dev and cache dev)";
1441 + dmc = kmalloc(sizeof(*dmc), GFP_KERNEL);
1442 + if (dmc == NULL) {
1443 + ti->error = "dm-cache: Failed to allocate cache context";
1448 + r = dm_get_device(ti, argv[0], 0, ti->len,
1449 + dm_table_get_mode(ti->table), &dmc->src_dev);
1451 + ti->error = "dm-cache: Source device lookup failed";
1455 + r = dm_get_device(ti, argv[1], 0, 0,
1456 + dm_table_get_mode(ti->table), &dmc->cache_dev);
1458 + ti->error = "dm-cache: Cache device lookup failed";
1463 + r = kcopyd_client_create(DMCACHE_COPY_PAGES, &dmc->kcp_client);
1465 + ti->error = "Failed to initialize kcopyd client\n";
1469 + r = kcached_init(dmc);
1471 + ti->error = "Failed to initialize kcached";
1476 + if (sscanf(argv[2], "%u", &persistence) != 1) {
1477 + ti->error = "dm-cache: Invalid cache persistence";
1482 + if (1 == persistence) {
1483 + if (load_metadata(dmc)) {
1484 + ti->error = "dm-cache: Invalid cache configuration";
1488 + goto init; /* Skip reading cache parameters from command line */
1489 + } else if (persistence != 0) {
1490 + ti->error = "dm-cache: Invalid cache persistence";
1496 + if (sscanf(argv[3], "%u", &dmc->block_size) != 1) {
1497 + ti->error = "dm-cache: Invalid block size";
1501 + if (!dmc->block_size || (dmc->block_size & (dmc->block_size - 1))) {
1502 + ti->error = "dm-cache: Invalid block size";
1507 + dmc->block_size = DEFAULT_BLOCK_SIZE;
1508 + dmc->block_shift = ffs(dmc->block_size) - 1;
1509 + dmc->block_mask = dmc->block_size - 1;
1512 + if (sscanf(argv[4], "%llu", &cache_size) != 1) {
1513 + ti->error = "dm-cache: Invalid cache size";
1517 + dmc->size = (sector_t) cache_size;
1518 + if (!dmc->size || (dmc->size & (dmc->size - 1))) {
1519 + ti->error = "dm-cache: Invalid cache size";
1524 + dmc->size = DEFAULT_CACHE_SIZE;
1525 + localsize = dmc->size;
1526 + dmc->bits = ffs(dmc->size) - 1;
1529 + if (sscanf(argv[5], "%u", &dmc->assoc) != 1) {
1530 + ti->error = "dm-cache: Invalid cache associativity";
1534 + if (!dmc->assoc || (dmc->assoc & (dmc->assoc - 1)) ||
1535 + dmc->size < dmc->assoc) {
1536 + ti->error = "dm-cache: Invalid cache associativity";
1541 + dmc->assoc = DEFAULT_CACHE_ASSOC;
1543 + DMINFO("%lld", dmc->cache_dev->bdev->bd_inode->i_size);
1544 + dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9;
1545 + data_size = dmc->size * dmc->block_size;
1546 + meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512) + 1;
1547 + if ((data_size + meta_size) > dev_size) {
1548 + DMERR("Requested cache size exeeds the cache device's capacity" \
1549 + "(%llu+%llu>%llu)",
1550 + (unsigned long long) data_size, (unsigned long long) meta_size,
1551 + (unsigned long long) dev_size);
1552 + ti->error = "dm-cache: Invalid cache size";
1556 + consecutive_blocks = dmc->assoc < CONSECUTIVE_BLOCKS ?
1557 + dmc->assoc : CONSECUTIVE_BLOCKS;
1558 + dmc->consecutive_shift = ffs(consecutive_blocks) - 1;
1561 + if (sscanf(argv[6], "%u", &dmc->write_policy) != 1) {
1562 + ti->error = "dm-cache: Invalid cache write policy";
1566 + if (dmc->write_policy != 0 && dmc->write_policy != 1) {
1567 + ti->error = "dm-cache: Invalid cache write policy";
1572 + dmc->write_policy = DEFAULT_WRITE_POLICY;
1574 + order = dmc->size * sizeof(struct cacheblock);
1575 + localsize = data_size >> 11;
1576 + DMINFO("Allocate %lluKB (%luB per) mem for %llu-entry cache" \
1577 + "(capacity:%lluMB, associativity:%u, block size:%u " \
1578 + "sectors(%uKB), %s)",
1579 + (unsigned long long) order >> 10, (unsigned long) sizeof(struct cacheblock),
1580 + (unsigned long long) dmc->size,
1581 + (unsigned long long) data_size >> (20-SECTOR_SHIFT),
1582 + dmc->assoc, dmc->block_size,
1583 + dmc->block_size >> (10-SECTOR_SHIFT),
1584 + dmc->write_policy ? "write-back" : "write-through");
1586 + dmc->cache = (struct cacheblock *)vmalloc(order);
1587 + if (!dmc->cache) {
1588 + ti->error = "Unable to allocate memory";
1593 +init: /* Initialize the cache structs */
1594 + for (i=0; i<dmc->size; i++) {
1595 + bio_list_init(&dmc->cache[i].bios);
1596 + if(!persistence) dmc->cache[i].state = 0;
1597 + dmc->cache[i].counter = 0;
1598 + spin_lock_init(&dmc->cache[i].lock);
1602 + dmc->dirty_blocks = 0;
1605 + dmc->cache_hits = 0;
1607 + dmc->writeback = 0;
1610 + ti->split_io = dmc->block_size;
1611 + ti->private = dmc;
1615 + kcached_client_destroy(dmc);
1617 + kcopyd_client_destroy(dmc->kcp_client);
1619 + dm_put_device(ti, dmc->cache_dev);
1621 + dm_put_device(ti, dmc->src_dev);
1629 +static void cache_flush(struct cache_c *dmc)
1631 + struct cacheblock *cache = dmc->cache;
1635 + DMINFO("Flush dirty blocks (%llu) ...", (unsigned long long) dmc->dirty_blocks);
1636 + while (i< dmc->size) {
1638 + if (is_state(cache[i].state, DIRTY)) {
1639 + while ((i+j) < dmc->size && is_state(cache[i+j].state, DIRTY)
1640 + && (cache[i+j].block == cache[i].block + j *
1641 + dmc->block_size)) {
1645 + write_back(dmc, i, j);
1652 + * Destroy the cache mapping.
1654 +static void cache_dtr(struct dm_target *ti)
1656 + struct cache_c *dmc = (struct cache_c *) ti->private;
1658 + if (dmc->dirty_blocks > 0) cache_flush(dmc);
1660 + kcached_client_destroy(dmc);
1662 + kcopyd_client_destroy(dmc->kcp_client);
1664 + if (dmc->reads + dmc->writes > 0)
1665 + DMINFO("stats: reads(%lu), writes(%lu), cache hits(%lu, 0.%lu)," \
1666 + "replacement(%lu), replaced dirty blocks(%lu), " \
1667 + "flushed dirty blocks(%lu)",
1668 + dmc->reads, dmc->writes, dmc->cache_hits,
1669 + dmc->cache_hits * 100 / (dmc->reads + dmc->writes),
1670 + dmc->replace, dmc->writeback, dmc->dirty);
1672 + dump_metadata(dmc); /* Always dump metadata to disk before exit */
1673 + vfree((void *)dmc->cache);
1675 + dm_put_device(ti, dmc->src_dev);
1676 + dm_put_device(ti, dmc->cache_dev);
1681 + * Report cache status:
1682 + * Output cache stats upon request of device status;
1683 + * Output cache configuration upon request of table status.
1685 +static int cache_status(struct dm_target *ti, status_type_t type,
1686 + char *result, unsigned int maxlen)
1688 + struct cache_c *dmc = (struct cache_c *) ti->private;
1692 + case STATUSTYPE_INFO:
1693 + DMEMIT("stats: reads(%lu), writes(%lu), cache hits(%lu, 0.%lu)," \
1694 + "replacement(%lu), replaced dirty blocks(%lu)",
1695 + dmc->reads, dmc->writes, dmc->cache_hits,
1696 + dmc->cache_hits * 100 / (dmc->reads + dmc->writes),
1697 + dmc->replace, dmc->writeback);
1699 + case STATUSTYPE_TABLE:
1700 + DMEMIT("conf: capacity(%lluM), associativity(%u), block size(%uK), %s",
1701 + (unsigned long long) dmc->size * dmc->block_size >> 11,
1702 + dmc->assoc, dmc->block_size>>(10-SECTOR_SHIFT),
1703 + dmc->write_policy ? "write-back":"write-through");
1710 +/****************************************************************************
1711 + * Functions for manipulating a cache target.
1712 + ****************************************************************************/
1714 +static struct target_type cache_target = {
1716 + .version= {1, 0, 1},
1717 + .module = THIS_MODULE,
1721 + .status = cache_status,
1725 + * Initialize the cache target.
1727 +int __init dm_cache_init(void)
1735 + _kcached_wq = create_singlethread_workqueue("kcached");
1736 + if (!_kcached_wq) {
1737 + DMERR("failed to start kcached");
1740 + INIT_WORK(&_kcached_work, do_work);
1742 + r = dm_register_target(&cache_target);
1744 + DMERR("cache: register failed %d", r);
1745 + destroy_workqueue(_kcached_wq);
1752 + * Destroy a cache target.
1754 +void dm_cache_exit(void)
1756 + int r = dm_unregister_target(&cache_target);
1759 + DMERR("cache: unregister failed %d", r);
1762 + destroy_workqueue(_kcached_wq);
1765 +module_init(dm_cache_init);
1766 +module_exit(dm_cache_exit);
1768 +MODULE_DESCRIPTION(DM_NAME " cache target");
1769 +MODULE_AUTHOR("Ming Zhao <mingzhao99th@gmail.com>");
1770 +MODULE_LICENSE("GPL");
1771 diff -Naur linux-2.6.21.7-orig/drivers/md/Kconfig linux-2.6.21.7-dmcache/drivers/md/Kconfig
1772 --- linux-2.6.21.7-orig/drivers/md/Kconfig 2007-08-04 12:11:13.000000000 -0400
1773 +++ linux-2.6.21.7-dmcache/drivers/md/Kconfig 2007-08-23 14:16:07.000000000 -0400
1774 @@ -262,6 +262,12 @@
1776 Multipath support for EMC CX/AX series hardware.
1779 + tristate "Cache target support (EXPERIMENTAL)"
1780 + depends on BLK_DEV_DM && EXPERIMENTAL
1782 + Support for a generic cache target for device-mapper.
1787 diff -Naur linux-2.6.21.7-orig/drivers/md/Makefile linux-2.6.21.7-dmcache/drivers/md/Makefile
1788 --- linux-2.6.21.7-orig/drivers/md/Makefile 2007-08-04 12:11:13.000000000 -0400
1789 +++ linux-2.6.21.7-dmcache/drivers/md/Makefile 2007-08-23 14:16:25.000000000 -0400
1791 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
1792 obj-$(CONFIG_DM_MIRROR) += dm-mirror.o
1793 obj-$(CONFIG_DM_ZERO) += dm-zero.o
1794 +obj-$(CONFIG_DM_CACHE) += dm-cache.o
1796 quiet_cmd_unroll = UNROLL $@
1797 cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \