linux-dmcache.patch
1diff -Naur linux-2.6.21.7-orig/drivers/md/dm-cache.c linux-2.6.21.7-dmcache/drivers/md/dm-cache.c
2--- linux-2.6.21.7-orig/drivers/md/dm-cache.c 1969-12-31 19:00:00.000000000 -0500
3+++ linux-2.6.21.7-dmcache/drivers/md/dm-cache.c 2007-08-23 14:10:58.000000000 -0400
4@@ -0,0 +1,1766 @@
5+/****************************************************************************
6+ * dm-cache.c
7+ * Device mapper target for block-level disk caching
8+ *
9+ * Copyright (C) International Business Machines Corp., 2006
10+ * Author: Ming Zhao (mingzhao@ufl.edu)
11+ *
12+ * This program is free software; you can redistribute it and/or modify
13+ * it under the terms of the GNU General Public License as published by
14+ * the Free Software Foundation; under version 2 of the License.
15+ *
16+ * This program is distributed in the hope that it will be useful,
17+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19+ * GNU General Public License for more details.
20+ *
21+ * You should have received a copy of the GNU General Public License
22+ * along with this program; if not, write to the Free Software
23+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24+ *
25+ ****************************************************************************/
26+
27+#include <asm/atomic.h>
28+#include <asm/checksum.h>
29+#include <linux/module.h>
30+#include <linux/init.h>
31+#include <linux/list.h>
32+#include <linux/blkdev.h>
33+#include <linux/bio.h>
34+#include <linux/slab.h>
35+#include <linux/hash.h>
36+#include <linux/spinlock.h>
37+#include <linux/workqueue.h>
38+#include <linux/pagemap.h>
39+
40+#include "dm.h"
41+#include "dm-io.h"
42+#include "dm-bio-list.h"
43+#include "kcopyd.h"
44+
45+#define DMC_DEBUG 0
46+
47+#define DM_MSG_PREFIX "cache"
48+#define DMC_PREFIX "dm-cache: "
49+
50+#if DMC_DEBUG
51+#define DPRINTK( s, arg... ) printk(DMC_PREFIX s "\n", ##arg)
52+#else
53+#define DPRINTK( s, arg... )
54+#endif
55+
56+/* Default cache parameters */
57+#define DEFAULT_CACHE_SIZE 65536
58+#define DEFAULT_CACHE_ASSOC 1024
59+#define DEFAULT_BLOCK_SIZE 8
60+#define CONSECUTIVE_BLOCKS 512
61+
62+/* Write policy */
63+#define WRITE_THROUGH 0
64+#define WRITE_BACK 1
65+#define DEFAULT_WRITE_POLICY WRITE_THROUGH
66+
67+/* Number of pages for I/O */
68+#define DMCACHE_COPY_PAGES 1024
69+
70+/* States of a cache block */
71+#define INVALID 0
72+#define VALID 1 /* Valid */
73+#define RESERVED 2 /* Allocated but data not in place yet */
74+#define DIRTY 4 /* Locally modified */
75+#define WRITEBACK 8 /* In the process of write back */
76+
77+#define is_state(x, y) (x & y)
78+#define set_state(x, y) (x |= y)
79+#define clear_state(x, y) (x &= ~y)
80+
81+/*
82+ * Cache context
83+ */
84+struct cache_c {
85+ struct dm_dev *src_dev; /* Source device */
86+ struct dm_dev *cache_dev; /* Cache device */
87+ struct kcopyd_client *kcp_client; /* Kcopyd client for writing back data */
88+
89+ struct cacheblock *cache; /* Hash table for cache blocks */
90+ sector_t size; /* Cache size */
91+ unsigned int bits; /* Cache size in bits */
92+ unsigned int assoc; /* Cache associativity */
93+ unsigned int block_size; /* Cache block size */
94+ unsigned int block_shift; /* Cache block size in bits */
95+ unsigned int block_mask; /* Cache block mask */
96+ unsigned int consecutive_shift; /* Consecutive blocks size in bits */
97+ unsigned long counter; /* Logical timestamp of last access */
98+ unsigned int write_policy; /* Cache write policy */
99+ sector_t dirty_blocks; /* Number of dirty blocks */
100+
101+ spinlock_t lock; /* Lock to protect page allocation/deallocation */
102+ struct page_list *pages; /* Pages for I/O */
103+ unsigned int nr_pages; /* Number of pages */
104+ unsigned int nr_free_pages; /* Number of free pages */
105+ wait_queue_head_t destroyq; /* Wait queue for I/O completion */
106+ atomic_t nr_jobs; /* Number of I/O jobs */
107+ /* Stats */
108+ unsigned long reads; /* Number of reads */
109+ unsigned long writes; /* Number of writes */
110+ unsigned long cache_hits; /* Number of cache hits */
111+ unsigned long replace; /* Number of cache replacements */
112+ unsigned long writeback; /* Number of replaced dirty blocks */
113+ unsigned long dirty; /* Number of submitted dirty blocks */
114+};
115+
116+/* Cache block metadata structure */
117+struct cacheblock {
118+ spinlock_t lock; /* Lock to protect operations on the bio list */
119+ sector_t block; /* Sector number of the cached block */
120+ unsigned short state; /* State of a block */
121+ unsigned long counter; /* Logical timestamp of the block's last access */
122+ struct bio_list bios; /* List of pending bios */
123+};
124+
125+
126+/****************************************************************************
127+ * Functions and data structures for implementing a kcached to handle async
128+ * I/O. Code for page and queue handling is borrowed from kcopyd.c.
129+ ****************************************************************************/
130+
131+/*
132+ * Functions for handling pages used by async I/O.
133+ * The data requested by a bio may not be aligned with cache blocks, in
134+ * which case additional pages are required for the request that is forwarded
135+ * to the source device. A pool of pages is reserved for this purpose.
136+ */
137+
138+static struct page_list *alloc_pl(void)
139+{
140+ struct page_list *pl;
141+
142+ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
143+ if (!pl)
144+ return NULL;
145+
146+ pl->page = alloc_page(GFP_KERNEL);
147+ if (!pl->page) {
148+ kfree(pl);
149+ return NULL;
150+ }
151+
152+ return pl;
153+}
154+
155+static void free_pl(struct page_list *pl)
156+{
157+ __free_page(pl->page);
158+ kfree(pl);
159+}
160+
161+static void drop_pages(struct page_list *pl)
162+{
163+ struct page_list *next;
164+
165+ while (pl) {
166+ next = pl->next;
167+ free_pl(pl);
168+ pl = next;
169+ }
170+}
171+
172+static int kcached_get_pages(struct cache_c *dmc, unsigned int nr,
173+ struct page_list **pages)
174+{
175+ struct page_list *pl;
176+
177+ spin_lock(&dmc->lock);
178+ if (dmc->nr_free_pages < nr) {
179+ DPRINTK("kcached_get_pages: No free pages: %u<%u",
180+ dmc->nr_free_pages, nr);
181+ spin_unlock(&dmc->lock);
182+ return -ENOMEM;
183+ }
184+
185+ dmc->nr_free_pages -= nr;
186+ for (*pages = pl = dmc->pages; --nr; pl = pl->next)
187+ ;
188+
189+ dmc->pages = pl->next;
190+ pl->next = NULL;
191+
192+ spin_unlock(&dmc->lock);
193+
194+ return 0;
195+}
196+
197+static void kcached_put_pages(struct cache_c *dmc, struct page_list *pl)
198+{
199+ struct page_list *cursor;
200+
201+ spin_lock(&dmc->lock);
202+ for (cursor = pl; cursor->next; cursor = cursor->next)
203+ dmc->nr_free_pages++;
204+
205+ dmc->nr_free_pages++;
206+ cursor->next = dmc->pages;
207+ dmc->pages = pl;
208+
209+ spin_unlock(&dmc->lock);
210+}
211+
212+static int alloc_bio_pages(struct cache_c *dmc, unsigned int nr)
213+{
214+ unsigned int i;
215+ struct page_list *pl = NULL, *next;
216+
217+ for (i = 0; i < nr; i++) {
218+ next = alloc_pl();
219+ if (!next) {
220+ if (pl)
221+ drop_pages(pl);
222+ return -ENOMEM;
223+ }
224+ next->next = pl;
225+ pl = next;
226+ }
227+
228+ kcached_put_pages(dmc, pl);
229+ dmc->nr_pages += nr;
230+
231+ return 0;
232+}
233+
234+static void free_bio_pages(struct cache_c *dmc)
235+{
236+ BUG_ON(dmc->nr_free_pages != dmc->nr_pages);
237+ drop_pages(dmc->pages);
238+ dmc->pages = NULL;
239+ dmc->nr_free_pages = dmc->nr_pages = 0;
240+}
241+
242+/* Structure for a kcached job */
243+struct kcached_job {
244+ struct list_head list;
245+ struct cache_c *dmc;
246+ struct bio *bio; /* Original bio */
247+ struct io_region src;
248+ struct io_region dest;
249+ struct cacheblock *cacheblock;
250+ int rw;
251+ /*
252+ * When the original bio is not aligned with cache blocks,
253+ * we need extra bvecs and pages for padding.
254+ */
255+ struct bio_vec *bvec;
256+ unsigned int nr_pages;
257+ struct page_list *pages;
258+};
259+
260+static struct workqueue_struct *_kcached_wq;
261+static struct work_struct _kcached_work;
262+
263+static inline void wake(void)
264+{
265+ queue_work(_kcached_wq, &_kcached_work);
266+}
267+
268+#define MIN_JOBS 1024
269+
270+static struct kmem_cache *_job_cache;
271+static mempool_t *_job_pool;
272+
273+static DEFINE_SPINLOCK(_job_lock);
274+
275+static LIST_HEAD(_complete_jobs);
276+static LIST_HEAD(_io_jobs);
277+static LIST_HEAD(_pages_jobs);
278+
279+static int jobs_init(void)
280+{
281+ _job_cache = kmem_cache_create("kcached-jobs",
282+ sizeof(struct kcached_job),
283+ __alignof__(struct kcached_job),
284+ 0, NULL, NULL);
285+ if (!_job_cache)
286+ return -ENOMEM;
287+
288+ _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
289+ mempool_free_slab, _job_cache);
290+ if (!_job_pool) {
291+ kmem_cache_destroy(_job_cache);
292+ return -ENOMEM;
293+ }
294+
295+ return 0;
296+}
297+
298+static void jobs_exit(void)
299+{
300+ BUG_ON(!list_empty(&_complete_jobs));
301+ BUG_ON(!list_empty(&_io_jobs));
302+ BUG_ON(!list_empty(&_pages_jobs));
303+
304+ mempool_destroy(_job_pool);
305+ kmem_cache_destroy(_job_cache);
306+ _job_pool = NULL;
307+ _job_cache = NULL;
308+}
309+
310+/*
311+ * Functions to push and pop a job onto the head of a given job list.
312+ */
313+static inline struct kcached_job *pop(struct list_head *jobs)
314+{
315+ struct kcached_job *job = NULL;
316+ unsigned long flags;
317+
318+ spin_lock_irqsave(&_job_lock, flags);
319+
320+ if (!list_empty(jobs)) {
321+ job = list_entry(jobs->next, struct kcached_job, list);
322+ list_del(&job->list);
323+ }
324+ spin_unlock_irqrestore(&_job_lock, flags);
325+
326+ return job;
327+}
328+
329+static inline void push(struct list_head *jobs, struct kcached_job *job)
330+{
331+ unsigned long flags;
332+
333+ spin_lock_irqsave(&_job_lock, flags);
334+ list_add_tail(&job->list, jobs);
335+ spin_unlock_irqrestore(&_job_lock, flags);
336+}
337+
338+
339+/****************************************************************************
340+ * Functions for asynchronously fetching data from source device and storing
341+ * data in cache device. Because the requested data may not align with the
342+ * cache blocks, extra handling is required to pad a block request and extract
343+ * the requested data from the results.
344+ ****************************************************************************/
345+
346+static void io_callback(unsigned long error, void *context)
347+{
348+ struct kcached_job *job = (struct kcached_job *) context;
349+
350+ if (error) {
351+ /* TODO */
352+ DMERR("io_callback: io error");
353+ return;
354+ }
355+
356+ if (job->rw == READ) {
357+ job->rw = WRITE;
358+ push(&_io_jobs, job);
359+ } else
360+ push(&_complete_jobs, job);
361+ wake();
362+}
363+
364+/*
365+ * Fetch data from the source device asynchronously.
366+ * For a READ bio, if a cache block is larger than the requested data, then
367+ * additional data are prefetched. Larger cache block size enables more
368+ * aggressive read prefetching, which is useful for read-mostly usage.
369+ * For a WRITE bio, if a cache block is larger than the requested data, the
370+ * entire block needs to be fetched, and larger block size incurs more overhead.
371+ * In scenarios where writes are frequent, 4KB is a good cache block size.
372+ */
373+static int do_fetch(struct kcached_job *job)
374+{
375+ int r = 0, i, j;
376+ struct bio *bio = job->bio;
377+ struct cache_c *dmc = job->dmc;
378+ unsigned int offset, head, tail, remaining, nr_vecs, idx = 0;
379+ struct bio_vec *bvec;
380+ struct page_list *pl;
381+
382+ offset = (unsigned int) (bio->bi_sector & dmc->block_mask);
383+ head = to_bytes(offset);
384+ tail = to_bytes(dmc->block_size) - bio->bi_size - head;
385+
386+ DPRINTK("do_fetch: %llu(%llu->%llu,%llu), head:%u,tail:%u",
387+ bio->bi_sector, job->src.sector, job->dest.sector,
388+ job->src.count, head, tail);
389+
390+ if (bio_data_dir(bio) == READ) { /* The original request is a READ */
391+ if (0 == job->nr_pages) { /* The request is aligned to cache block */
392+ r = dm_io_async_bvec(1, &job->src, READ,
393+ bio->bi_io_vec + bio->bi_idx,
394+ io_callback, job);
395+ return r;
396+ }
397+
398+ nr_vecs = bio->bi_vcnt - bio->bi_idx + job->nr_pages;
399+ bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_NOIO);
400+ if (!bvec) {
401+ DMERR("do_fetch: No memory");
402+ return 1;
403+ }
404+
405+ pl = job->pages;
406+ i = 0;
407+ while (head) {
408+ bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE);
409+ bvec[i].bv_offset = 0;
410+ bvec[i].bv_page = pl->page;
411+ head -= bvec[i].bv_len;
412+ pl = pl->next;
413+ i++;
414+ }
415+
416+ remaining = bio->bi_size;
417+ j = bio->bi_idx;
418+ while (remaining) {
419+ bvec[i] = bio->bi_io_vec[j];
420+ remaining -= bvec[i].bv_len;
421+ i++; j++;
422+ }
423+
424+ while (tail) {
425+ bvec[i].bv_len = min(tail, (unsigned int)PAGE_SIZE);
426+ bvec[i].bv_offset = 0;
427+ bvec[i].bv_page = pl->page;
428+ tail -= bvec[i].bv_len;
429+ pl = pl->next;
430+ i++;
431+ }
432+
433+ job->bvec = bvec;
434+ r = dm_io_async_bvec(1, &job->src, READ, job->bvec, io_callback, job);
435+ return r;
436+ } else { /* The original request is a WRITE */
437+ pl = job->pages;
438+
439+ if (head && tail) { /* Special case */
440+ bvec = kmalloc(job->nr_pages * sizeof(*bvec), GFP_KERNEL);
441+ if (!bvec) {
442+ DMERR("do_fetch: No memory");
443+ return 1;
444+ }
445+ for (i=0; i<job->nr_pages; i++) {
446+ bvec[i].bv_len = PAGE_SIZE;
447+ bvec[i].bv_offset = 0;
448+ bvec[i].bv_page = pl->page;
449+ pl = pl->next;
450+ }
451+ job->bvec = bvec;
452+ r = dm_io_async_bvec(1, &job->src, READ, job->bvec,
453+ io_callback, job);
454+ return r;
455+ }
456+
457+ bvec = kmalloc((job->nr_pages + bio->bi_vcnt - bio->bi_idx)
458+ * sizeof(*bvec), GFP_KERNEL);
459+ if (!bvec) {
460+ DMERR("do_fetch: No memory");
461+ return 1;
462+ }
463+
464+ i = 0;
465+ while (head) {
466+ bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE);
467+ bvec[i].bv_offset = 0;
468+ bvec[i].bv_page = pl->page;
469+ head -= bvec[i].bv_len;
470+ pl = pl->next;
471+ i++;
472+ }
473+
474+ remaining = bio->bi_size;
475+ j = bio->bi_idx;
476+ while (remaining) {
477+ bvec[i] = bio->bi_io_vec[j];
478+ remaining -= bvec[i].bv_len;
479+ i++; j++;
480+ }
481+
482+ if (tail) {
483+ idx = i;
484+ bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) &
485+ (PAGE_SIZE - 1);
486+ bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset;
487+ bvec[i].bv_page = pl->page;
488+ tail -= bvec[i].bv_len;
489+ pl = pl->next; i++;
490+ while (tail) {
491+ bvec[i].bv_len = PAGE_SIZE;
492+ bvec[i].bv_offset = 0;
493+ bvec[i].bv_page = pl->page;
494+ tail -= bvec[i].bv_len;
495+ pl = pl->next; i++;
496+ }
497+ }
498+
499+ job->bvec = bvec;
500+ r = dm_io_async_bvec(1, &job->src, READ, job->bvec + idx,
501+ io_callback, job);
502+
503+ return r;
504+ }
505+}
506+
507+/*
508+ * Store data to the cache device asynchronously.
509+ * For a READ bio request, the data fetched from the source device are returned
510+ * to kernel and stored in cache at the same time.
511+ * For a WRITE bio request, the data are written to the cache and source device
512+ * at the same time.
513+ */
514+static int do_store(struct kcached_job *job)
515+{
516+ int i, j, r = 0;
517+ struct bio *bio = job->bio, *clone;
518+ struct cache_c *dmc = job->dmc;
519+ unsigned int offset, head, tail, remaining, nr_vecs;
520+ struct bio_vec *bvec;
521+
522+ offset = (unsigned int) (bio->bi_sector & dmc->block_mask);
523+ head = to_bytes(offset);
524+ tail = to_bytes(dmc->block_size) - bio->bi_size - head;
525+
526+ DPRINTK("do_store: %llu(%llu->%llu,%llu), head:%u,tail:%u",
527+ bio->bi_sector, job->src.sector, job->dest.sector,
528+ job->src.count, head, tail);
529+
530+ /* A READ is acknowledged as soon as the requested data is fetched, and
531+ does not have to wait for it to be stored in the cache. The bio is cloned
532+ so that the original one can be ended here. But to avoid copying
533+ pages, we reuse the pages allocated for the original bio, and mark
534+ each of them to prevent the pages being freed before the cache
535+ each of them to prevent the pages from being freed before the cache
536+ */
537+ if (bio_data_dir(bio) == READ) {
538+ clone = bio_clone(bio, GFP_NOIO);
539+ for (i=bio->bi_idx; i<bio->bi_vcnt; i++) {
540+ get_page(bio->bi_io_vec[i].bv_page);
541+ }
542+ DPRINTK("bio ended for %llu:%u", bio->bi_sector, bio->bi_size);
543+ bio_endio(bio, bio->bi_size, 0);
544+ bio = clone;
545+ job->bio = clone;
546+ }
547+
548+ if (0 == job->nr_pages) /* Original request is aligned with cache blocks */
549+ r = dm_io_async_bvec(1, &job->dest, WRITE, bio->bi_io_vec + bio->bi_idx,
550+ io_callback, job);
551+ else {
552+ if (bio_data_dir(bio) == WRITE && head > 0 && tail > 0) {
553+ DPRINTK("Special case: %lu %u %u", bio_data_dir(bio), head, tail);
554+ nr_vecs = job->nr_pages + bio->bi_vcnt - bio->bi_idx;
555+ if (offset && (offset + bio->bi_size < PAGE_SIZE)) nr_vecs++;
556+ DPRINTK("Create %u new vecs", nr_vecs);
557+ bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_KERNEL);
558+ if (!bvec) {
559+ DMERR("do_store: No memory");
560+ return 1;
561+ }
562+
563+ i = 0;
564+ while (head) {
565+ bvec[i].bv_len = min(head, job->bvec[i].bv_len);
566+ bvec[i].bv_offset = 0;
567+ bvec[i].bv_page = job->bvec[i].bv_page;
568+ head -= bvec[i].bv_len;
569+ i++;
570+ }
571+ remaining = bio->bi_size;
572+ j = bio->bi_idx;
573+ while (remaining) {
574+ bvec[i] = bio->bi_io_vec[j];
575+ remaining -= bvec[i].bv_len;
576+ i++; j++;
577+ }
578+ j = (to_bytes(offset) + bio->bi_size) / PAGE_SIZE;
579+ bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) -
580+ j * PAGE_SIZE;
581+ bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset;
582+ bvec[i].bv_page = job->bvec[j].bv_page;
583+ tail -= bvec[i].bv_len;
584+ i++; j++;
585+ while (tail) {
586+ bvec[i] = job->bvec[j];
587+ tail -= bvec[i].bv_len;
588+ i++; j++;
589+ }
590+ kfree(job->bvec);
591+ job->bvec = bvec;
592+ }
593+
594+ r = dm_io_async_bvec(1, &job->dest, WRITE, job->bvec, io_callback, job);
595+ }
596+
597+ return r;
598+}
599+
600+static int do_io(struct kcached_job *job)
601+{
602+ int r = 0;
603+
604+ if (job->rw == READ) { /* Read from source device */
605+ r = do_fetch(job);
606+ } else { /* Write to cache device */
607+ r = do_store(job);
608+ }
609+
610+ return r;
611+}
612+
613+static int do_pages(struct kcached_job *job)
614+{
615+ int r = 0;
616+
617+ r = kcached_get_pages(job->dmc, job->nr_pages, &job->pages);
618+
619+ if (r == -ENOMEM) /* can't complete now */
620+ return 1;
621+
622+ /* this job is ready for io */
623+ push(&_io_jobs, job);
624+ return 0;
625+}
626+
627+/*
628+ * Flush the bios that are waiting for this cache insertion or write back.
629+ */
630+static void flush_bios(struct cacheblock *cacheblock)
631+{
632+ struct bio *bio;
633+ struct bio *n;
634+
635+ spin_lock(&cacheblock->lock);
636+ bio = bio_list_get(&cacheblock->bios);
637+ if (is_state(cacheblock->state, WRITEBACK)) { /* Write back finished */
638+ cacheblock->state = VALID;
639+ } else { /* Cache insertion finished */
640+ set_state(cacheblock->state, VALID);
641+ clear_state(cacheblock->state, RESERVED);
642+ }
643+ spin_unlock(&cacheblock->lock);
644+
645+ while (bio) {
646+ n = bio->bi_next;
647+ bio->bi_next = NULL;
648+ DPRINTK("Flush bio: %llu->%llu (%u bytes)",
649+ cacheblock->block, bio->bi_sector, bio->bi_size);
650+ generic_make_request(bio);
651+ bio = n;
652+ }
653+}
654+
655+static int do_complete(struct kcached_job *job)
656+{
657+ int i, r = 0;
658+ struct bio *bio = job->bio;
659+
660+ DPRINTK("do_complete: %llu", bio->bi_sector);
661+
662+ if (bio_data_dir(bio) == READ) {
663+ for (i=bio->bi_idx; i<bio->bi_vcnt; i++) {
664+ put_page(bio->bi_io_vec[i].bv_page);
665+ }
666+ bio_put(bio);
667+ } else
668+ bio_endio(bio, bio->bi_size, 0);
669+
670+ if (job->nr_pages > 0) {
671+ kfree(job->bvec);
672+ kcached_put_pages(job->dmc, job->pages);
673+ }
674+
675+ flush_bios(job->cacheblock);
676+ mempool_free(job, _job_pool);
677+
678+ if (atomic_dec_and_test(&job->dmc->nr_jobs))
679+ wake_up(&job->dmc->destroyq);
680+
681+ return r;
682+}
683+
684+/*
685+ * Run through a list for as long as possible. Returns the count
686+ * of successful jobs.
687+ */
688+static int process_jobs(struct list_head *jobs,
689+ int (*fn) (struct kcached_job *))
690+{
691+ struct kcached_job *job;
692+ int r, count = 0;
693+
694+ while ((job = pop(jobs))) {
695+ r = fn(job);
696+
697+ if (r < 0) {
698+ /* error this rogue job */
699+ DMERR("process_jobs: Job processing error");
700+ }
701+
702+ if (r > 0) {
703+ /*
704+ * We couldn't service this job ATM, so
705+ * push this job back onto the list.
706+ */
707+ push(jobs, job);
708+ break;
709+ }
710+
711+ count++;
712+ }
713+
714+ return count;
715+}
716+
717+static void do_work(struct work_struct *ignored)
718+{
719+ process_jobs(&_complete_jobs, do_complete);
720+ process_jobs(&_pages_jobs, do_pages);
721+ process_jobs(&_io_jobs, do_io);
722+}
723+
724+static void queue_job(struct kcached_job *job)
725+{
726+ atomic_inc(&job->dmc->nr_jobs);
727+ if (job->nr_pages > 0) /* Request pages */
728+ push(&_pages_jobs, job);
729+ else /* Go ahead to do I/O */
730+ push(&_io_jobs, job);
731+ wake();
732+}
733+
734+static int kcached_init(struct cache_c *dmc)
735+{
736+ int r;
737+
738+ spin_lock_init(&dmc->lock);
739+ dmc->pages = NULL;
740+ dmc->nr_pages = dmc->nr_free_pages = 0;
741+ r = alloc_bio_pages(dmc, DMCACHE_COPY_PAGES);
742+ if (r) {
743+ DMERR("kcached_init: Could not allocate bio pages");
744+ return r;
745+ }
746+
747+ r = dm_io_get(DMCACHE_COPY_PAGES);
748+ if (r) {
749+ DMERR("kcached_init: Could not resize dm io pool");
750+ free_bio_pages(dmc);
751+ return r;
752+ }
753+
754+ init_waitqueue_head(&dmc->destroyq);
755+ atomic_set(&dmc->nr_jobs, 0);
756+
757+ return 0;
758+}
759+
760+void kcached_client_destroy(struct cache_c *dmc)
761+{
762+ /* Wait for completion of all jobs submitted by this client. */
763+ wait_event(dmc->destroyq, !atomic_read(&dmc->nr_jobs));
764+
765+ dm_io_put(dmc->nr_pages);
766+ free_bio_pages(dmc);
767+}
768+
769+
770+/****************************************************************************
771+ * Functions for writing back dirty blocks.
772+ * We leverage kcopyd to write back dirty blocks because it is convenient to
773+ * use and it is not reasonable to reimplement the same function here. But we
774+ * need to reserve pages for both kcached and kcopyd. TODO: dynamically change
775+ * the number of reserved pages.
776+ ****************************************************************************/
777+
778+static void copy_callback(int read_err, unsigned int write_err, void *context)
779+{
780+ struct cacheblock *cacheblock = (struct cacheblock *) context;
781+
782+ flush_bios(cacheblock);
783+}
784+
785+static void copy_block(struct cache_c *dmc, struct io_region src,
786+ struct io_region dest, struct cacheblock *cacheblock)
787+{
788+ DPRINTK("Copying: %llu:%llu->%llu:%llu",
789+ src.sector, src.count * 512, dest.sector, dest.count * 512);
790+ kcopyd_copy(dmc->kcp_client, &src, 1, &dest, 0, copy_callback, cacheblock);
791+}
792+
793+static void write_back(struct cache_c *dmc, sector_t index, unsigned int length)
794+{
795+ struct io_region src, dest;
796+ struct cacheblock *cacheblock = &dmc->cache[index];
797+ unsigned int i;
798+
799+ DPRINTK("Write back block %llu(%llu, %u)",
800+ index, cacheblock->block, length);
801+ src.bdev = dmc->cache_dev->bdev;
802+ src.sector = index << dmc->block_shift;
803+ src.count = dmc->block_size * length;
804+ dest.bdev = dmc->src_dev->bdev;
805+ dest.sector = cacheblock->block;
806+ dest.count = dmc->block_size * length;
807+
808+ for (i=0; i<length; i++)
809+ set_state(dmc->cache[index+i].state, WRITEBACK);
810+ dmc->dirty_blocks -= length;
811+ copy_block(dmc, src, dest, cacheblock);
812+}
813+
814+
815+/****************************************************************************
816+ * Functions for implementing the various cache operations.
817+ ****************************************************************************/
818+
819+/*
820+ * Map a block from the source device to a block in the cache device.
821+ */
822+static unsigned long hash_block(struct cache_c *dmc, sector_t block)
823+{
824+ unsigned long set_number, value;
825+
826+ value = (unsigned long)(block >> (dmc->block_shift +
827+ dmc->consecutive_shift));
828+ set_number = hash_long(value, dmc->bits) / dmc->assoc;
829+
830+ return set_number;
831+}
832+
833+/*
834+ * Reset the LRU counters (the cache's global counter and each cache block's
835+ * counter). This seems to be a naive implementation. However, considering the
836+ * rarity of this event, it might be more efficient than other more complex
837+ * schemes. TODO: a more elegant solution.
838+ */
839+static void cache_reset_counter(struct cache_c *dmc)
840+{
841+ sector_t i;
842+ struct cacheblock *cache = dmc->cache;
843+
844+ DPRINTK("Reset LRU counters");
845+ for (i=0; i<dmc->size; i++)
846+ cache[i].counter = 0;
847+
848+ dmc->counter = 0;
849+}
850+
851+/*
852+ * Lookup a block in the cache.
853+ *
854+ * Return value:
855+ * 1: cache hit (cache_block stores the index of the matched block)
856+ * 0: cache miss but frame is allocated for insertion; cache_block stores the
857+ * frame's index:
858+ * If there are empty frames, then the first one encountered is used.
859+ * If there are clean frames, then the LRU clean block is replaced.
860+ * 2: cache miss and frame is not allocated; cache_block stores the LRU dirty
861+ * block's index:
862+ * This happens when the entire set is dirty.
863+ * -1: cache miss and no room for insertion:
864+ * This happens when the entire set is in a transition state (RESERVED or
865+ * WRITEBACK).
866+ *
867+ */
868+static int cache_lookup(struct cache_c *dmc, sector_t block,
869+ sector_t *cache_block)
870+{
871+ unsigned long set_number = hash_block(dmc, block);
872+ sector_t index;
873+ int i, res;
874+ unsigned int cache_assoc = dmc->assoc;
875+ struct cacheblock *cache = dmc->cache;
876+ int invalid = -1, oldest = -1, oldest_clean = -1;
877+ unsigned long counter = ULONG_MAX, clean_counter = ULONG_MAX;
878+
879+ index=set_number * cache_assoc;
880+
881+ for (i=0; i<cache_assoc; i++, index++) {
882+ if (is_state(cache[index].state, VALID) ||
883+ is_state(cache[index].state, RESERVED)) {
884+ if (cache[index].block == block) {
885+ *cache_block = index;
886+ /* Reset all counters if the largest one is going to overflow */
887+ if (dmc->counter == ULONG_MAX) cache_reset_counter(dmc);
888+ cache[index].counter = ++dmc->counter;
889+ break;
890+ } else {
891+ /* Don't consider blocks that are in the middle of copying */
892+ if (!is_state(cache[index].state, RESERVED) &&
893+ !is_state(cache[index].state, WRITEBACK)) {
894+ if (!is_state(cache[index].state, DIRTY) &&
895+ cache[index].counter < clean_counter) {
896+ clean_counter = cache[index].counter;
897+ oldest_clean = i;
898+ }
899+ if (cache[index].counter < counter) {
900+ counter = cache[index].counter;
901+ oldest = i;
902+ }
903+ }
904+ }
905+ } else {
906+ if (-1 == invalid) invalid = i;
907+ }
908+ }
909+
910+ res = i < cache_assoc ? 1 : 0;
911+ if (!res) { /* Cache miss */
912+ if (invalid != -1) /* Choose the first empty frame */
913+ *cache_block = set_number * cache_assoc + invalid;
914+ else if (oldest_clean != -1) /* Choose the LRU clean block to replace */
915+ *cache_block = set_number * cache_assoc + oldest_clean;
916+ else if (oldest != -1) { /* Choose the LRU dirty block to evict */
917+ res = 2;
918+ *cache_block = set_number * cache_assoc + oldest;
919+ } else {
920+ res = -1;
921+ }
922+ }
923+
924+ if (-1 == res)
925+ DPRINTK("Cache lookup: Block %llu(%lu):%s",
926+ block, set_number, "NO ROOM");
927+ else
928+ DPRINTK("Cache lookup: Block %llu(%lu):%llu(%s)",
929+ block, set_number, *cache_block,
930+ 1 == res ? "HIT" : (0 == res ? "MISS" : "WB NEEDED"));
931+ return res;
932+}
933+
934+/*
935+ * Insert a block into the cache (in the frame specified by cache_block).
936+ */
937+static int cache_insert(struct cache_c *dmc, sector_t block,
938+ sector_t cache_block)
939+{
940+ struct cacheblock *cache = dmc->cache;
941+
942+ /* Mark the block as RESERVED because although it is allocated, the data are
943+ not in place until kcopyd finishes its job.
944+ */
945+ cache[cache_block].block = block;
946+ cache[cache_block].state = RESERVED;
947+ if (dmc->counter == ULONG_MAX) cache_reset_counter(dmc);
948+ cache[cache_block].counter = ++dmc->counter;
949+
950+ return 1;
951+}
952+
953+/*
954+ * Invalidate a block (specified by cache_block) in the cache.
955+ */
956+static void cache_invalidate(struct cache_c *dmc, sector_t cache_block)
957+{
958+ struct cacheblock *cache = dmc->cache;
959+
960+ DPRINTK("Cache invalidate: Block %llu(%llu)",
961+ cache_block, cache[cache_block].block);
962+ clear_state(cache[cache_block].state, VALID);
963+}
964+
965+/*
966+ * Handle a cache hit:
967+ * For READ, serve the request from the cache if the block is ready; otherwise,
968+ * queue the request for later processing.
969+ * For WRITE, invalidate the cache block if write-through. If write-back,
970+ * serve the request from the cache if the block is ready, or queue the request
971+ * for later processing otherwise.
972+ */
973+static int cache_hit(struct cache_c *dmc, struct bio* bio, sector_t cache_block)
974+{
975+ unsigned int offset = (unsigned int)(bio->bi_sector & dmc->block_mask);
976+ struct cacheblock *cache = dmc->cache;
977+
978+ dmc->cache_hits++;
979+
980+ if (bio_data_dir(bio) == READ) { /* READ hit */
981+ bio->bi_bdev = dmc->cache_dev->bdev;
982+ bio->bi_sector = (cache_block << dmc->block_shift) + offset;
983+
984+ spin_lock(&cache[cache_block].lock);
985+
986+ if (is_state(cache[cache_block].state, VALID)) { /* Valid cache block */
987+ spin_unlock(&cache[cache_block].lock);
988+ return 1;
989+ }
990+
991+ /* Cache block is not ready yet */
992+ DPRINTK("Add to bio list %s(%llu)",
993+ dmc->cache_dev->name, bio->bi_sector);
994+ bio_list_add(&cache[cache_block].bios, bio);
995+
996+ spin_unlock(&cache[cache_block].lock);
997+ return 0;
998+ } else { /* WRITE hit */
999+ if (dmc->write_policy == WRITE_THROUGH) { /* Invalidate cached data */
1000+ cache_invalidate(dmc, cache_block);
1001+ bio->bi_bdev = dmc->src_dev->bdev;
1002+ return 1;
1003+ }
1004+
1005+ /* Write delay */
1006+ if (!is_state(cache[cache_block].state, DIRTY)) {
1007+ set_state(cache[cache_block].state, DIRTY);
1008+ dmc->dirty_blocks++;
1009+ }
1010+
1011+ spin_lock(&cache[cache_block].lock);
1012+
1013+ /* In the middle of write back */
1014+ if (is_state(cache[cache_block].state, WRITEBACK)) {
1015+ /* Delay this write until the block is written back */
1016+ bio->bi_bdev = dmc->src_dev->bdev;
1017+ DPRINTK("Add to bio list %s(%llu)",
1018+ dmc->src_dev->name, bio->bi_sector);
1019+ bio_list_add(&cache[cache_block].bios, bio);
1020+ spin_unlock(&cache[cache_block].lock);
1021+ return 0;
1022+ }
1023+
1024+ /* Cache block not ready yet */
1025+ if (is_state(cache[cache_block].state, RESERVED)) {
1026+ bio->bi_bdev = dmc->cache_dev->bdev;
1027+ bio->bi_sector = (cache_block << dmc->block_shift) + offset;
1028+ DPRINTK("Add to bio list %s(%llu)",
1029+ dmc->cache_dev->name, bio->bi_sector);
1030+ bio_list_add(&cache[cache_block].bios, bio);
1031+ spin_unlock(&cache[cache_block].lock);
1032+ return 0;
1033+ }
1034+
1035+ /* Serve the request from cache */
1036+ bio->bi_bdev = dmc->cache_dev->bdev;
1037+ bio->bi_sector = (cache_block << dmc->block_shift) + offset;
1038+
1039+ spin_unlock(&cache[cache_block].lock);
1040+ return 1;
1041+ }
1042+}
1043+
1044+static struct kcached_job *new_kcached_job(struct cache_c *dmc, struct bio* bio,
1045+ sector_t request_block,
1046+ sector_t cache_block)
1047+{
1048+ struct io_region src, dest;
1049+ struct kcached_job *job;
1050+
1051+ src.bdev = dmc->src_dev->bdev;
1052+ src.sector = request_block;
1053+ src.count = dmc->block_size;
1054+ dest.bdev = dmc->cache_dev->bdev;
1055+ dest.sector = cache_block << dmc->block_shift;
1056+ dest.count = src.count;
1057+
1058+ job = mempool_alloc(_job_pool, GFP_NOIO);
1059+ job->dmc = dmc;
1060+ job->bio = bio;
1061+ job->src = src;
1062+ job->dest = dest;
1063+ job->cacheblock = &dmc->cache[cache_block];
1064+
1065+ return job;
1066+}
1067+
1068+/*
1069+ * Handle a read cache miss:
1070+ * Update the metadata; fetch the necessary block from source device;
1071+ * store data to cache device.
1072+ */
1073+static int cache_read_miss(struct cache_c *dmc, struct bio* bio,
1074+ sector_t cache_block) {
1075+ struct cacheblock *cache = dmc->cache;
1076+ unsigned int offset, head, tail;
1077+ struct kcached_job *job;
1078+ sector_t request_block, left;
1079+
1080+ offset = (unsigned int)(bio->bi_sector & dmc->block_mask);
1081+ request_block = bio->bi_sector - offset;
1082+
1083+ if (cache[cache_block].state & VALID) {
1084+ DPRINTK("Replacing %llu->%llu",
1085+ cache[cache_block].block, request_block);
1086+ dmc->replace++;
1087+ } else DPRINTK("Insert block %llu at empty frame %llu",
1088+ request_block, cache_block);
1089+
1090+ cache_insert(dmc, request_block, cache_block); /* Update metadata first */
1091+
1092+ job = new_kcached_job(dmc, bio, request_block, cache_block);
1093+
1094+ head = to_bytes(offset);
1095+
1096+ left = (dmc->src_dev->bdev->bd_inode->i_size>>9) - request_block;
1097+ if (left < dmc->block_size) {
1098+ tail = to_bytes(left) - bio->bi_size - head;
1099+ job->src.count = left;
1100+ job->dest.count = left;
1101+ } else
1102+ tail = to_bytes(dmc->block_size) - bio->bi_size - head;
1103+
1104+ /* Requested block is aligned with a cache block */
1105+ if (0 == head && 0 == tail)
1106+ job->nr_pages= 0;
1107+ else /* Need new pages to store extra data */
1108+ job->nr_pages = dm_div_up(head, PAGE_SIZE) + dm_div_up(tail, PAGE_SIZE);
1109+ job->rw = READ; /* Fetch data from the source device */
1110+
1111+ DPRINTK("Queue job for %llu (need %u pages)",
1112+ bio->bi_sector, job->nr_pages);
1113+ queue_job(job);
1114+
1115+ return 0;
1116+}
1117+
1118+/*
1119+ * Handle a write cache miss:
1120+ * If write-through, forward the request to source device.
1121+ * If write-back, update the metadata; fetch the necessary block from source
1122+ * device; write to cache device.
1123+ */
1124+static int cache_write_miss(struct cache_c *dmc, struct bio* bio, sector_t cache_block) {
1125+ struct cacheblock *cache = dmc->cache;
1126+ unsigned int offset, head, tail;
1127+ struct kcached_job *job;
1128+ sector_t request_block, left;
1129+
1130+ if (dmc->write_policy == WRITE_THROUGH) { /* Forward request to source */
1131+ bio->bi_bdev = dmc->src_dev->bdev;
1132+ return 1;
1133+ }
1134+
1135+ offset = (unsigned int)(bio->bi_sector & dmc->block_mask);
1136+ request_block = bio->bi_sector - offset;
1137+
1138+ if (cache[cache_block].state & VALID) {
1139+ DPRINTK("Replacing %llu->%llu",
1140+ cache[cache_block].block, request_block);
1141+ dmc->replace++;
1142+ } else DPRINTK("Insert block %llu at empty frame %llu",
1143+ request_block, cache_block);
1144+
1145+ /* Write delay */
1146+ cache_insert(dmc, request_block, cache_block); /* Update metadata first */
1147+ set_state(cache[cache_block].state, DIRTY);
1148+ dmc->dirty_blocks++;
1149+
1150+ job = new_kcached_job(dmc, bio, request_block, cache_block);
1151+
1152+ head = to_bytes(offset);
1153+ left = (dmc->src_dev->bdev->bd_inode->i_size>>9) - request_block;
1154+ if (left < dmc->block_size) {
1155+ tail = to_bytes(left) - bio->bi_size - head;
1156+ job->src.count = left;
1157+ job->dest.count = left;
1158+ } else
1159+ tail = to_bytes(dmc->block_size) - bio->bi_size - head;
1160+
1161+ if (0 == head && 0 == tail) { /* Request is aligned with a cache block */
1162+ job->nr_pages = 0;
1163+ job->rw = WRITE;
1164+ } else if (head && tail){ /* Special case: need to pad both head and tail */
1165+ job->nr_pages = dm_div_up(to_bytes(job->src.count), PAGE_SIZE);
1166+ job->rw = READ;
1167+ } else {
1168+ if (head) { /* Fetch only head */
1169+ job->src.count = to_sector(head);
1170+ job->nr_pages = dm_div_up(head, PAGE_SIZE);
1171+ } else { /* Fetch only tail */
1172+ job->src.sector = bio->bi_sector + to_sector(bio->bi_size);
1173+ job->src.count = to_sector(tail);
1174+ job->nr_pages = dm_div_up(tail, PAGE_SIZE);
1175+ }
1176+ job->rw = READ;
1177+ }
1178+
1179+ queue_job(job);
1180+
1181+ return 0;
1182+}
1183+
1184+/* Handle cache misses */
1185+static int cache_miss(struct cache_c *dmc, struct bio* bio, sector_t cache_block) {
1186+ if (bio_data_dir(bio) == READ)
1187+ return cache_read_miss(dmc, bio, cache_block);
1188+ else
1189+ return cache_write_miss(dmc, bio, cache_block);
1190+}
1191+
1192+
1193+/****************************************************************************
1194+ * Functions for implementing the operations on a cache mapping.
1195+ ****************************************************************************/
1196+
1197+/*
1198+ * Decide the mapping and perform necessary cache operations for a bio request.
1199+ */
1200+static int cache_map(struct dm_target *ti, struct bio *bio,
1201+ union map_info *map_context)
1202+{
1203+ struct cache_c *dmc = (struct cache_c *) ti->private;
1204+ sector_t request_block, cache_block = 0, offset;
1205+ int res;
1206+
1207+ offset = bio->bi_sector & dmc->block_mask;
1208+ request_block = bio->bi_sector - offset;
1209+
1210+ DPRINTK("Got a %s for %llu ((%llu:%llu), %u bytes)",
1211+ bio_rw(bio) == WRITE ? "WRITE" : (bio_rw(bio) == READ ?
1212+ "READ":"READA"), bio->bi_sector, request_block, offset,
1213+ bio->bi_size);
1214+
1215+ if (bio_data_dir(bio) == READ) dmc->reads++;
1216+ else dmc->writes++;
1217+
1218+ res = cache_lookup(dmc, request_block, &cache_block);
1219+ if (1 == res) /* Cache hit; serve request from cache */
1220+ return cache_hit(dmc, bio, cache_block);
1221+ else if (0 == res) /* Cache miss; replacement block is found */
1222+ return cache_miss(dmc, bio, cache_block);
1223+ else if (2 == res) { /* Entire cache set is dirty; initiate a write-back */
1224+ write_back(dmc, cache_block, 1);
1225+ dmc->writeback++;
1226+ }
1227+
1228+ /* Forward to source device */
1229+ bio->bi_bdev = dmc->src_dev->bdev;
1230+
1231+ return 1;
1232+}
1233+
1234+struct meta_dmc {
1235+ sector_t size;
1236+ unsigned int block_size;
1237+ unsigned int assoc;
1238+ unsigned int write_policy;
1239+ unsigned int chksum;
1240+};
1241+
1242+/* Load metadata stored by previous session from disk. */
1243+static int load_metadata(struct cache_c *dmc) {
1244+ struct io_region where;
1245+ unsigned long bits;
1246+ sector_t dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9;
1247+ sector_t meta_size, *meta_data, i, j, index = 0, limit, order;
1248+ struct meta_dmc *meta_dmc;
1249+ unsigned int chksum = 0, chksum_sav, consecutive_blocks;
1250+
1251+ meta_dmc = (struct meta_dmc *)vmalloc(512);
1252+ if (!meta_dmc) {
1253+ DMERR("load_metadata: Unable to allocate memory");
1254+ return 1;
1255+ }
1256+
1257+ where.bdev = dmc->cache_dev->bdev;
1258+ where.sector = dev_size - 1;
1259+ where.count = 1;
1260+ dm_io_sync_vm(1, &where, READ, meta_dmc, &bits);
1261+ DPRINTK("Loaded cache conf: block size(%u), cache size(%llu), " \
1262+ "associativity(%u), write policy(%u), chksum(%u)",
1263+ meta_dmc->block_size, meta_dmc->size,
1264+ meta_dmc->assoc, meta_dmc->write_policy,
1265+ meta_dmc->chksum);
1266+
1267+ dmc->block_size = meta_dmc->block_size;
1268+ dmc->block_shift = ffs(dmc->block_size) - 1;
1269+ dmc->block_mask = dmc->block_size - 1;
1270+
1271+ dmc->size = meta_dmc->size;
1272+ dmc->bits = ffs(dmc->size) - 1;
1273+
1274+ dmc->assoc = meta_dmc->assoc;
1275+ consecutive_blocks = dmc->assoc < CONSECUTIVE_BLOCKS ?
1276+ dmc->assoc : CONSECUTIVE_BLOCKS;
1277+ dmc->consecutive_shift = ffs(consecutive_blocks) - 1;
1278+
1279+ dmc->write_policy = meta_dmc->write_policy;
1280+ chksum_sav = meta_dmc->chksum;
1281+
1282+ vfree((void *)meta_dmc);
1283+
1284+
1285+ order = dmc->size * sizeof(struct cacheblock);
1286+ DMINFO("Allocate %lluKB (%luB per) mem for %llu-entry cache " \
1287+ "(capacity:%lluMB, associativity:%u, block size:%u " \
1288+ "sectors(%uKB), %s)",
1289+ (unsigned long long) order >> 10, (unsigned long) sizeof(struct cacheblock),
1290+ (unsigned long long) dmc->size,
1291+ (unsigned long long) dmc->size * dmc->block_size >> (20-SECTOR_SHIFT),
1292+ dmc->assoc, dmc->block_size,
1293+ dmc->block_size >> (10-SECTOR_SHIFT),
1294+ dmc->write_policy ? "write-back" : "write-through");
1295+ dmc->cache = (struct cacheblock *)vmalloc(order);
1296+ if (!dmc->cache) {
1297+ DMERR("load_metadata: Unable to allocate memory");
1298+ return 1;
1299+ }
1300+
1301+ meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512);
1302+ /* When requesting a new bio, the number of requested bvecs has to be
1303+ less than BIO_MAX_PAGES. Otherwise, null is returned. In dm-io.c,
1304+ this return value is not checked and kernel Oops may happen. We set
1305+ the limit here to avoid such situations. (2 additional bvecs are
1306+ required by dm-io for bookkeeping.)
1307+ */
1308+ limit = (BIO_MAX_PAGES - 2) * (PAGE_SIZE >> SECTOR_SHIFT);
1309+ meta_data = (sector_t *)vmalloc(to_bytes(min(meta_size, limit)));
1310+ if (!meta_data) {
1311+ DMERR("load_metadata: Unable to allocate memory");
1312+ vfree((void *)dmc->cache);
1313+ return 1;
1314+ }
1315+
1316+ while(index < meta_size) {
1317+ where.sector = dev_size - 1 - meta_size + index;
1318+ where.count = min(meta_size - index, limit);
1319+ dm_io_sync_vm(1, &where, READ, meta_data, &bits);
1320+
1321+ for (i=to_bytes(index)/sizeof(sector_t), j=0;
1322+ j<to_bytes(where.count)/sizeof(sector_t) && i<dmc->size;
1323+ i++, j++) {
1324+ if(meta_data[j]) {
1325+ dmc->cache[i].block = meta_data[j];
1326+ dmc->cache[i].state = 1;
1327+ } else
1328+ dmc->cache[i].state = 0;
1329+ }
1330+ chksum = csum_partial((char *)meta_data, to_bytes(where.count), chksum);
1331+ index += where.count;
1332+ }
1333+
1334+ vfree((void *)meta_data);
1335+
1336+ if (chksum != chksum_sav) { /* Check the checksum of the metadata */
1337+ DPRINTK("Cache metadata loaded from disk is corrupted");
1338+ vfree((void *)dmc->cache);
1339+ return 1;
1340+ }
1341+
1342+ DMINFO("Cache metadata loaded from disk (offset %llu)",
1343+ (unsigned long long) dev_size - 1 - (unsigned long long) meta_size);
1344+
1345+ return 0;
1346+}
1347+
1348+/* Store metadata onto disk. */
1349+static int dump_metadata(struct cache_c *dmc) {
1350+ struct io_region where;
1351+ unsigned long bits;
1352+ sector_t dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9;
1353+ sector_t meta_size, i, j, index = 0, limit, *meta_data;
1354+ struct meta_dmc *meta_dmc;
1355+ unsigned int chksum = 0;
1356+
1357+ meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512);
1358+ limit = (BIO_MAX_PAGES - 2) * (PAGE_SIZE >> SECTOR_SHIFT);
1359+ meta_data = (sector_t *)vmalloc(to_bytes(min(meta_size, limit)));
1360+ if (!meta_data) {
1361+ DMERR("dump_metadata: Unable to allocate memory");
1362+ return 1;
1363+ }
1364+
1365+ where.bdev = dmc->cache_dev->bdev;
1366+ while(index < meta_size) {
1367+ where.sector = dev_size - 1 - meta_size + index;
1368+ where.count = min(meta_size - index, limit);
1369+
1370+ for (i=to_bytes(index)/sizeof(sector_t), j=0;
1371+ j<to_bytes(where.count)/sizeof(sector_t) && i<dmc->size;
1372+ i++, j++) {
1373+ /* Assume all invalid cache blocks store 0. We lose the block that
1374+ * is actually mapped to offset 0.
1375+ */
1376+ meta_data[j] = dmc->cache[i].state ? dmc->cache[i].block : 0;
1377+ }
1378+ chksum = csum_partial((char *)meta_data, to_bytes(where.count), chksum);
1379+
1380+ dm_io_sync_vm(1, &where, WRITE, meta_data, &bits);
1381+ index += where.count;
1382+ }
1383+
1384+ vfree((void *)meta_data);
1385+
1386+ meta_dmc = (struct meta_dmc *)vmalloc(512);
1387+ if (!meta_dmc) {
1388+ DMERR("dump_metadata: Unable to allocate memory");
1389+ return 1;
1390+ }
1391+
1392+ meta_dmc->block_size = dmc->block_size;
1393+ meta_dmc->size = dmc->size;
1394+ meta_dmc->assoc = dmc->assoc;
1395+ meta_dmc->write_policy = dmc->write_policy;
1396+ meta_dmc->chksum = chksum;
1397+
1398+ DPRINTK("Store metadata to disk: block size(%u), cache size(%llu), " \
1399+ "associativity(%u), write policy(%u), checksum(%u)",
1400+ meta_dmc->block_size, (unsigned long long) meta_dmc->size,
1401+ meta_dmc->assoc, meta_dmc->write_policy,
1402+ meta_dmc->chksum);
1403+
1404+ where.sector = dev_size - 1;
1405+ where.count = 1;
1406+ dm_io_sync_vm(1, &where, WRITE, meta_dmc, &bits);
1407+
1408+ vfree((void *)meta_dmc);
1409+
1410+ DMINFO("Cache metadata saved to disk (offset %llu)",
1411+ (unsigned long long) dev_size - 1 - (unsigned long long) meta_size);
1412+
1413+ return 0;
1414+}
1415+
1416+/*
1417+ * Construct a cache mapping.
1418+ * arg[0]: path to source device
1419+ * arg[1]: path to cache device
1420+ * arg[2]: cache persistence (if set, cache conf is loaded from disk)
1421+ * Cache configuration parameters (if not set, default values are used):
1422+ * arg[3]: cache block size (in sectors)
1423+ * arg[4]: cache size (in blocks)
1424+ * arg[5]: cache associativity
1425+ * arg[6]: write caching policy
1426+ */
1427+static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1428+{
1429+ struct cache_c *dmc;
1430+ unsigned int consecutive_blocks, persistence = 0;
1431+ sector_t localsize, i, order;
1432+ sector_t data_size, meta_size, dev_size;
1433+ unsigned long long cache_size;
1434+ int r = -EINVAL;
1435+
1436+ if (argc < 2) {
1437+ ti->error = "dm-cache: Need at least 2 arguments (src dev and cache dev)";
1438+ goto bad;
1439+ }
1440+
1441+ dmc = kmalloc(sizeof(*dmc), GFP_KERNEL);
1442+ if (dmc == NULL) {
1443+ ti->error = "dm-cache: Failed to allocate cache context";
1444+ r = -ENOMEM;
1445+ goto bad;
1446+ }
1447+
1448+ r = dm_get_device(ti, argv[0], 0, ti->len,
1449+ dm_table_get_mode(ti->table), &dmc->src_dev);
1450+ if (r) {
1451+ ti->error = "dm-cache: Source device lookup failed";
1452+ goto bad1;
1453+ }
1454+
1455+ r = dm_get_device(ti, argv[1], 0, 0,
1456+ dm_table_get_mode(ti->table), &dmc->cache_dev);
1457+ if (r) {
1458+ ti->error = "dm-cache: Cache device lookup failed";
1459+ goto bad2;
1460+ }
1461+
1462+
1463+ r = kcopyd_client_create(DMCACHE_COPY_PAGES, &dmc->kcp_client);
1464+ if (r) {
1465+ ti->error = "Failed to initialize kcopyd client\n";
1466+ goto bad3;
1467+ }
1468+
1469+ r = kcached_init(dmc);
1470+ if (r) {
1471+ ti->error = "Failed to initialize kcached";
1472+ goto bad4;
1473+ }
1474+
1475+ if (argc >= 3) {
1476+ if (sscanf(argv[2], "%u", &persistence) != 1) {
1477+ ti->error = "dm-cache: Invalid cache persistence";
1478+ r = -EINVAL;
1479+ goto bad5;
1480+ }
1481+ }
1482+ if (1 == persistence) {
1483+ if (load_metadata(dmc)) {
1484+ ti->error = "dm-cache: Invalid cache configuration";
1485+ r = -EINVAL;
1486+ goto bad5;
1487+ }
1488+ goto init; /* Skip reading cache parameters from command line */
1489+ } else if (persistence != 0) {
1490+ ti->error = "dm-cache: Invalid cache persistence";
1491+ r = -EINVAL;
1492+ goto bad5;
1493+ }
1494+
1495+ if (argc >= 4) {
1496+ if (sscanf(argv[3], "%u", &dmc->block_size) != 1) {
1497+ ti->error = "dm-cache: Invalid block size";
1498+ r = -EINVAL;
1499+ goto bad5;
1500+ }
1501+ if (!dmc->block_size || (dmc->block_size & (dmc->block_size - 1))) {
1502+ ti->error = "dm-cache: Invalid block size";
1503+ r = -EINVAL;
1504+ goto bad5;
1505+ }
1506+ } else
1507+ dmc->block_size = DEFAULT_BLOCK_SIZE;
1508+ dmc->block_shift = ffs(dmc->block_size) - 1;
1509+ dmc->block_mask = dmc->block_size - 1;
1510+
1511+ if (argc >= 5) {
1512+ if (sscanf(argv[4], "%llu", &cache_size) != 1) {
1513+ ti->error = "dm-cache: Invalid cache size";
1514+ r = -EINVAL;
1515+ goto bad5;
1516+ }
1517+ dmc->size = (sector_t) cache_size;
1518+ if (!dmc->size || (dmc->size & (dmc->size - 1))) {
1519+ ti->error = "dm-cache: Invalid cache size";
1520+ r = -EINVAL;
1521+ goto bad5;
1522+ }
1523+ } else
1524+ dmc->size = DEFAULT_CACHE_SIZE;
1525+ localsize = dmc->size;
1526+ dmc->bits = ffs(dmc->size) - 1;
1527+
1528+ if (argc >= 6) {
1529+ if (sscanf(argv[5], "%u", &dmc->assoc) != 1) {
1530+ ti->error = "dm-cache: Invalid cache associativity";
1531+ r = -EINVAL;
1532+ goto bad5;
1533+ }
1534+ if (!dmc->assoc || (dmc->assoc & (dmc->assoc - 1)) ||
1535+ dmc->size < dmc->assoc) {
1536+ ti->error = "dm-cache: Invalid cache associativity";
1537+ r = -EINVAL;
1538+ goto bad5;
1539+ }
1540+ } else
1541+ dmc->assoc = DEFAULT_CACHE_ASSOC;
1542+
1543+ DMINFO("%lld", dmc->cache_dev->bdev->bd_inode->i_size);
1544+ dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9;
1545+ data_size = dmc->size * dmc->block_size;
1546+ meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512) + 1;
1547+ if ((data_size + meta_size) > dev_size) {
1548+ DMERR("Requested cache size exceeds the cache device's capacity " \
1549+ "(%llu+%llu>%llu)",
1550+ (unsigned long long) data_size, (unsigned long long) meta_size,
1551+ (unsigned long long) dev_size);
1552+ ti->error = "dm-cache: Invalid cache size";
1553+ r = -EINVAL;
1554+ goto bad5;
1555+ }
1556+ consecutive_blocks = dmc->assoc < CONSECUTIVE_BLOCKS ?
1557+ dmc->assoc : CONSECUTIVE_BLOCKS;
1558+ dmc->consecutive_shift = ffs(consecutive_blocks) - 1;
1559+
1560+ if (argc >= 7) {
1561+ if (sscanf(argv[6], "%u", &dmc->write_policy) != 1) {
1562+ ti->error = "dm-cache: Invalid cache write policy";
1563+ r = -EINVAL;
1564+ goto bad5;
1565+ }
1566+ if (dmc->write_policy != 0 && dmc->write_policy != 1) {
1567+ ti->error = "dm-cache: Invalid cache write policy";
1568+ r = -EINVAL;
1569+ goto bad5;
1570+ }
1571+ } else
1572+ dmc->write_policy = DEFAULT_WRITE_POLICY;
1573+
1574+ order = dmc->size * sizeof(struct cacheblock);
1575+ localsize = data_size >> 11;
1576+ DMINFO("Allocate %lluKB (%luB per) mem for %llu-entry cache " \
1577+ "(capacity:%lluMB, associativity:%u, block size:%u " \
1578+ "sectors(%uKB), %s)",
1579+ (unsigned long long) order >> 10, (unsigned long) sizeof(struct cacheblock),
1580+ (unsigned long long) dmc->size,
1581+ (unsigned long long) data_size >> (20-SECTOR_SHIFT),
1582+ dmc->assoc, dmc->block_size,
1583+ dmc->block_size >> (10-SECTOR_SHIFT),
1584+ dmc->write_policy ? "write-back" : "write-through");
1585+
1586+ dmc->cache = (struct cacheblock *)vmalloc(order);
1587+ if (!dmc->cache) {
1588+ ti->error = "Unable to allocate memory";
1589+ r = -ENOMEM;
1590+ goto bad5;
1591+ }
1592+
1593+init: /* Initialize the cache structs */
1594+ for (i=0; i<dmc->size; i++) {
1595+ bio_list_init(&dmc->cache[i].bios);
1596+ if(!persistence) dmc->cache[i].state = 0;
1597+ dmc->cache[i].counter = 0;
1598+ spin_lock_init(&dmc->cache[i].lock);
1599+ }
1600+
1601+ dmc->counter = 0;
1602+ dmc->dirty_blocks = 0;
1603+ dmc->reads = 0;
1604+ dmc->writes = 0;
1605+ dmc->cache_hits = 0;
1606+ dmc->replace = 0;
1607+ dmc->writeback = 0;
1608+ dmc->dirty = 0;
1609+
1610+ ti->split_io = dmc->block_size;
1611+ ti->private = dmc;
1612+ return 0;
1613+
1614+bad5:
1615+ kcached_client_destroy(dmc);
1616+bad4:
1617+ kcopyd_client_destroy(dmc->kcp_client);
1618+bad3:
1619+ dm_put_device(ti, dmc->cache_dev);
1620+bad2:
1621+ dm_put_device(ti, dmc->src_dev);
1622+bad1:
1623+ kfree(dmc);
1624+bad:
1625+ return r;
1626+}
1627+
1628+
1629+static void cache_flush(struct cache_c *dmc)
1630+{
1631+ struct cacheblock *cache = dmc->cache;
1632+ sector_t i = 0;
1633+ unsigned int j;
1634+
1635+ DMINFO("Flush dirty blocks (%llu) ...", (unsigned long long) dmc->dirty_blocks);
1636+ while (i< dmc->size) {
1637+ j = 1;
1638+ if (is_state(cache[i].state, DIRTY)) {
1639+ while ((i+j) < dmc->size && is_state(cache[i+j].state, DIRTY)
1640+ && (cache[i+j].block == cache[i].block + j *
1641+ dmc->block_size)) {
1642+ j++;
1643+ }
1644+ dmc->dirty += j;
1645+ write_back(dmc, i, j);
1646+ }
1647+ i += j;
1648+ }
1649+}
1650+
1651+/*
1652+ * Destroy the cache mapping.
1653+ */
1654+static void cache_dtr(struct dm_target *ti)
1655+{
1656+ struct cache_c *dmc = (struct cache_c *) ti->private;
1657+
1658+ if (dmc->dirty_blocks > 0) cache_flush(dmc);
1659+
1660+ kcached_client_destroy(dmc);
1661+
1662+ kcopyd_client_destroy(dmc->kcp_client);
1663+
1664+ if (dmc->reads + dmc->writes > 0)
1665+ DMINFO("stats: reads(%lu), writes(%lu), cache hits(%lu, 0.%lu), " \
1666+ "replacement(%lu), replaced dirty blocks(%lu), " \
1667+ "flushed dirty blocks(%lu)",
1668+ dmc->reads, dmc->writes, dmc->cache_hits,
1669+ dmc->cache_hits * 100 / (dmc->reads + dmc->writes),
1670+ dmc->replace, dmc->writeback, dmc->dirty);
1671+
1672+ dump_metadata(dmc); /* Always dump metadata to disk before exit */
1673+ vfree((void *)dmc->cache);
1674+
1675+ dm_put_device(ti, dmc->src_dev);
1676+ dm_put_device(ti, dmc->cache_dev);
1677+ kfree(dmc);
1678+}
1679+
1680+/*
1681+ * Report cache status:
1682+ * Output cache stats upon request of device status;
1683+ * Output cache configuration upon request of table status.
1684+ */
1685+static int cache_status(struct dm_target *ti, status_type_t type,
1686+ char *result, unsigned int maxlen)
1687+{
1688+ struct cache_c *dmc = (struct cache_c *) ti->private;
1689+ int sz = 0;
1690+
1691+ switch (type) {
1692+ case STATUSTYPE_INFO:
1693+ DMEMIT("stats: reads(%lu), writes(%lu), cache hits(%lu, 0.%lu), " \
1694+ "replacement(%lu), replaced dirty blocks(%lu)",
1695+ dmc->reads, dmc->writes, dmc->cache_hits,
1696+ dmc->cache_hits * 100 / (dmc->reads + dmc->writes),
1697+ dmc->replace, dmc->writeback);
1698+ break;
1699+ case STATUSTYPE_TABLE:
1700+ DMEMIT("conf: capacity(%lluM), associativity(%u), block size(%uK), %s",
1701+ (unsigned long long) dmc->size * dmc->block_size >> 11,
1702+ dmc->assoc, dmc->block_size>>(10-SECTOR_SHIFT),
1703+ dmc->write_policy ? "write-back":"write-through");
1704+ break;
1705+ }
1706+ return 0;
1707+}
1708+
1709+
1710+/****************************************************************************
1711+ * Functions for manipulating a cache target.
1712+ ****************************************************************************/
1713+
1714+static struct target_type cache_target = {
1715+ .name = "cache",
1716+ .version= {1, 0, 1},
1717+ .module = THIS_MODULE,
1718+ .ctr = cache_ctr,
1719+ .dtr = cache_dtr,
1720+ .map = cache_map,
1721+ .status = cache_status,
1722+};
1723+
1724+/*
1725+ * Initialize a cache target.
1726+ */
1727+int __init dm_cache_init(void)
1728+{
1729+ int r;
1730+
1731+ r = jobs_init();
1732+ if (r)
1733+ return r;
1734+
1735+ _kcached_wq = create_singlethread_workqueue("kcached");
1736+ if (!_kcached_wq) {
1737+ DMERR("failed to start kcached");
1738+ return -ENOMEM;
1739+ }
1740+ INIT_WORK(&_kcached_work, do_work);
1741+
1742+ r = dm_register_target(&cache_target);
1743+ if (r < 0) {
1744+ DMERR("cache: register failed %d", r);
1745+ destroy_workqueue(_kcached_wq);
1746+ }
1747+
1748+ return r;
1749+}
1750+
1751+/*
1752+ * Destroy a cache target.
1753+ */
1754+void dm_cache_exit(void)
1755+{
1756+ int r = dm_unregister_target(&cache_target);
1757+
1758+ if (r < 0)
1759+ DMERR("cache: unregister failed %d", r);
1760+
1761+ jobs_exit();
1762+ destroy_workqueue(_kcached_wq);
1763+}
1764+
1765+module_init(dm_cache_init);
1766+module_exit(dm_cache_exit);
1767+
1768+MODULE_DESCRIPTION(DM_NAME " cache target");
1769+MODULE_AUTHOR("Ming Zhao <mingzhao99th@gmail.com>");
1770+MODULE_LICENSE("GPL");
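
As a rough illustration of the set-associative placement implemented by hash_block() and cache_map() in the dm-cache.c hunk above, here is a minimal stand-alone user-space sketch (not part of the patch). It reproduces the block-alignment and set-selection arithmetic for the default geometry defined at the top of the file; hash_long_approx() is only a stand-in for the kernel's hash_long(), and the example sector is arbitrary.

/* set_mapping.c: sketch of dm-cache's block-to-set mapping (illustrative only) */
#include <stdio.h>
#include <stdint.h>

#define BLOCK_SIZE   8ULL      /* sectors per cache block (DEFAULT_BLOCK_SIZE) */
#define CACHE_SIZE   65536ULL  /* cache blocks (DEFAULT_CACHE_SIZE) */
#define CACHE_ASSOC  1024ULL   /* blocks per set (DEFAULT_CACHE_ASSOC) */
#define CONSECUTIVE  512ULL    /* CONSECUTIVE_BLOCKS */

/* ffs(v) - 1 for a power of two, i.e. log2(v) */
static unsigned int log2_u64(uint64_t v)
{
	unsigned int r = 0;
	while (v >>= 1)
		r++;
	return r;
}

/* Stand-in for the kernel's hash_long(): golden-ratio multiply, keep top bits */
static uint64_t hash_long_approx(uint64_t val, unsigned int bits)
{
	return (val * 0x9e37fffffffc0001ULL) >> (64 - bits);
}

int main(void)
{
	unsigned int block_shift = log2_u64(BLOCK_SIZE);   /* 3 */
	uint64_t block_mask = BLOCK_SIZE - 1;
	unsigned int bits = log2_u64(CACHE_SIZE);          /* 16 */
	unsigned int consecutive_shift =
		log2_u64(CACHE_ASSOC < CONSECUTIVE ? CACHE_ASSOC : CONSECUTIVE);
	uint64_t sector = 123456789ULL;                    /* arbitrary request sector */

	/* Align the request to a cache block, as cache_map() does */
	uint64_t offset = sector & block_mask;
	uint64_t request_block = sector - offset;

	/* Map the aligned block to a cache set, as hash_block() does */
	uint64_t value = request_block >> (block_shift + consecutive_shift);
	uint64_t set = hash_long_approx(value, bits) / CACHE_ASSOC;

	printf("sector %llu -> block %llu (offset %llu sectors), set %llu,"
	       " frames [%llu..%llu)\n",
	       (unsigned long long)sector, (unsigned long long)request_block,
	       (unsigned long long)offset, (unsigned long long)set,
	       (unsigned long long)(set * CACHE_ASSOC),
	       (unsigned long long)((set + 1) * CACHE_ASSOC));
	return 0;
}

Because the hash input is shifted right by block_shift + consecutive_shift, runs of up to CONSECUTIVE_BLOCKS neighbouring blocks tend to land in the same set, which is what cache_flush() relies on when it writes back consecutive dirty blocks in one kcopyd call.
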
1771diff -Naur linux-2.6.21.7-orig/drivers/md/Kconfig linux-2.6.21.7-dmcache/drivers/md/Kconfig
1772--- linux-2.6.21.7-orig/drivers/md/Kconfig 2007-08-04 12:11:13.000000000 -0400
1773+++ linux-2.6.21.7-dmcache/drivers/md/Kconfig 2007-08-23 14:16:07.000000000 -0400
1774@@ -262,6 +262,12 @@
1775 ---help---
1776 Multipath support for EMC CX/AX series hardware.
1777
1778+config DM_CACHE
1779+ tristate "Cache target support (EXPERIMENTAL)"
1780+ depends on BLK_DEV_DM && EXPERIMENTAL
1781+ ---help---
1782+ Support for generic cache target for device-mapper.
1783+
1784 endmenu
1785
1786 endif
1787diff -Naur linux-2.6.21.7-orig/drivers/md/Makefile linux-2.6.21.7-dmcache/drivers/md/Makefile
1788--- linux-2.6.21.7-orig/drivers/md/Makefile 2007-08-04 12:11:13.000000000 -0400
1789+++ linux-2.6.21.7-dmcache/drivers/md/Makefile 2007-08-23 14:16:25.000000000 -0400
1790@@ -36,6 +36,7 @@
1791 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
1792 obj-$(CONFIG_DM_MIRROR) += dm-mirror.o
1793 obj-$(CONFIG_DM_ZERO) += dm-zero.o
1794+obj-$(CONFIG_DM_CACHE) += dm-cache.o
1795
1796 quiet_cmd_unroll = UNROLL $@
1797 cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
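
load_metadata() and dump_metadata() in the dm-cache.c hunk above keep the persistent state at the tail of the cache device: a table of source sector numbers (one sector_t per cache block) followed by a one-sector superblock (struct meta_dmc) in the device's last sector. The sketch below (not part of the patch) works out that layout for the default geometry; the 8-byte sector_t and the 512MB example cache device are assumptions for illustration.

/* meta_layout.c: sketch of dm-cache's on-disk metadata layout (illustrative only) */
#include <stdio.h>
#include <stdint.h>

#define SECTOR_SIZE   512ULL
#define CACHE_SIZE    65536ULL  /* cache blocks (DEFAULT_CACHE_SIZE) */
#define BLOCK_SIZE    8ULL      /* sectors per cache block (DEFAULT_BLOCK_SIZE) */
#define SECTOR_T_SIZE 8ULL      /* assumed sizeof(sector_t) on a 64-bit build */

static uint64_t div_up(uint64_t n, uint64_t d)  /* like dm_div_up() */
{
	return (n + d - 1) / d;
}

int main(void)
{
	/* example: a 512MB cache device, expressed in sectors */
	uint64_t dev_size = 512ULL * 1024 * 1024 / SECTOR_SIZE;

	uint64_t data_size = CACHE_SIZE * BLOCK_SIZE;   /* cached data, in sectors */
	uint64_t meta_size = div_up(CACHE_SIZE * SECTOR_T_SIZE, SECTOR_SIZE);

	printf("cached data:    sectors [0..%llu)\n",
	       (unsigned long long)data_size);
	printf("mapping table:  %llu sectors at [%llu..%llu)\n",
	       (unsigned long long)meta_size,
	       (unsigned long long)(dev_size - 1 - meta_size),
	       (unsigned long long)(dev_size - 1));
	printf("superblock:     sector %llu (last sector of the device)\n",
	       (unsigned long long)(dev_size - 1));
	printf("minimum device: %llu sectors (data + table + superblock)\n",
	       (unsigned long long)(data_size + meta_size + 1));
	return 0;
}

This is the same arithmetic cache_ctr() uses for its capacity check (data_size + meta_size must fit within the cache device) before accepting the requested cache size.
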