1 | diff -Naur linux-2.6.21.7-orig/drivers/md/dm-cache.c linux-2.6.21.7-dmcache/drivers/md/dm-cache.c |
2 | --- linux-2.6.21.7-orig/drivers/md/dm-cache.c 1969-12-31 19:00:00.000000000 -0500 | |
3 | +++ linux-2.6.21.7-dmcache/drivers/md/dm-cache.c 2007-08-23 14:10:58.000000000 -0400 | |
4 | @@ -0,0 +1,1766 @@ | |
5 | +/**************************************************************************** | |
6 | + * dm-cache.c | |
7 | + * Device mapper target for block-level disk caching | |
8 | + * | |
9 | + * Copyright (C) International Business Machines Corp., 2006 | |
10 | + * Author: Ming Zhao (mingzhao@ufl.edu) | |
11 | + * | |
12 | + * This program is free software; you can redistribute it and/or modify | |
13 | + * it under the terms of the GNU General Public License as published by | |
14 | + * the Free Software Foundation; under version 2 of the License. | |
15 | + * | |
16 | + * This program is distributed in the hope that it will be useful, | |
17 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 | + * GNU General Public License for more details. | |
20 | + * | |
21 | + * You should have received a copy of the GNU General Public License | |
22 | + * along with this program; if not, write to the Free Software | |
23 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
24 | + * | |
25 | + ****************************************************************************/ | |
26 | + | |
27 | +#include <asm/atomic.h> | |
28 | +#include <asm/checksum.h> | |
29 | +#include <linux/module.h> | |
30 | +#include <linux/init.h> | |
31 | +#include <linux/list.h> | |
32 | +#include <linux/blkdev.h> | |
33 | +#include <linux/bio.h> | |
34 | +#include <linux/slab.h> | |
35 | +#include <linux/hash.h> | |
36 | +#include <linux/spinlock.h> | |
37 | +#include <linux/workqueue.h> | |
38 | +#include <linux/pagemap.h> | |
39 | + | |
40 | +#include "dm.h" | |
41 | +#include "dm-io.h" | |
42 | +#include "dm-bio-list.h" | |
43 | +#include "kcopyd.h" | |
44 | + | |
45 | +#define DMC_DEBUG 0 | |
46 | + | |
47 | +#define DM_MSG_PREFIX "cache" | |
48 | +#define DMC_PREFIX "dm-cache: " | |
49 | + | |
50 | +#if DMC_DEBUG | |
51 | +#define DPRINTK( s, arg... ) printk(DMC_PREFIX s "\n", ##arg) | |
52 | +#else | |
53 | +#define DPRINTK( s, arg... ) | |
54 | +#endif | |
55 | + | |
56 | +/* Default cache parameters */ | |
57 | +#define DEFAULT_CACHE_SIZE 65536 | |
58 | +#define DEFAULT_CACHE_ASSOC 1024 | |
59 | +#define DEFAULT_BLOCK_SIZE 8 | |
60 | +#define CONSECUTIVE_BLOCKS 512 | |
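+/*
+ * For illustration, with the defaults above: a cache block is
+ * DEFAULT_BLOCK_SIZE = 8 sectors = 4KB, so DEFAULT_CACHE_SIZE = 65536
+ * blocks hold 256MB of data and, with DEFAULT_CACHE_ASSOC = 1024, the
+ * cache is divided into 65536 / 1024 = 64 sets.
+ */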
61 | + | |
62 | +/* Write policy */ | |
63 | +#define WRITE_THROUGH 0 | |
64 | +#define WRITE_BACK 1 | |
65 | +#define DEFAULT_WRITE_POLICY WRITE_THROUGH | |
66 | + | |
67 | +/* Number of pages for I/O */ | |
68 | +#define DMCACHE_COPY_PAGES 1024 | |
69 | + | |
70 | +/* States of a cache block */ | |
71 | +#define INVALID 0 | |
72 | +#define VALID 1 /* Valid */ | |
73 | +#define RESERVED 2 /* Allocated but data not in place yet */ | |
74 | +#define DIRTY 4 /* Locally modified */ | |
75 | +#define WRITEBACK 8 /* In the process of write back */ | |
76 | + | |
77 | +#define is_state(x, y) (x & y) | |
78 | +#define set_state(x, y) (x |= y) | |
79 | +#define clear_state(x, y) (x &= ~y) | |
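+/*
+ * Example of how the flags combine (cb being any struct cacheblock): a block
+ * holding locally modified data is both VALID and DIRTY, so after
+ * set_state(cb->state, DIRTY) both is_state(cb->state, VALID) and
+ * is_state(cb->state, DIRTY) are true, and clear_state(cb->state, DIRTY)
+ * leaves the block valid but clean.
+ */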
80 | + | |
81 | +/* | |
82 | + * Cache context | |
83 | + */ | |
84 | +struct cache_c { | |
85 | + struct dm_dev *src_dev; /* Source device */ | |
86 | + struct dm_dev *cache_dev; /* Cache device */ | |
87 | + struct kcopyd_client *kcp_client; /* Kcopyd client for writing back data */ | |
88 | + | |
89 | + struct cacheblock *cache; /* Hash table for cache blocks */ | |
90 | + sector_t size; /* Cache size */ | |
91 | + unsigned int bits; /* Cache size in bits */ | |
92 | + unsigned int assoc; /* Cache associativity */ | |
93 | + unsigned int block_size; /* Cache block size */ | |
94 | + unsigned int block_shift; /* Cache block size in bits */ | |
95 | + unsigned int block_mask; /* Cache block mask */ | |
96 | + unsigned int consecutive_shift; /* Consecutive blocks size in bits */ | |
97 | + unsigned long counter; /* Logical timestamp of last access */ | |
98 | + unsigned int write_policy; /* Cache write policy */ | |
99 | + sector_t dirty_blocks; /* Number of dirty blocks */ | |
100 | + | |
101 | + spinlock_t lock; /* Lock to protect page allocation/deallocation */ | |
102 | + struct page_list *pages; /* Pages for I/O */ | |
103 | + unsigned int nr_pages; /* Number of pages */ | |
104 | + unsigned int nr_free_pages; /* Number of free pages */ | |
105 | + wait_queue_head_t destroyq; /* Wait queue for I/O completion */ | |
106 | + atomic_t nr_jobs; /* Number of I/O jobs */ | |
107 | + /* Stats */ | |
108 | + unsigned long reads; /* Number of reads */ | |
109 | + unsigned long writes; /* Number of writes */ | |
110 | + unsigned long cache_hits; /* Number of cache hits */ | |
111 | + unsigned long replace; /* Number of cache replacements */ | |
112 | + unsigned long writeback; /* Number of replaced dirty blocks */ | |
113 | + unsigned long dirty; /* Number of submitted dirty blocks */ | |
114 | +}; | |
115 | + | |
116 | +/* Cache block metadata structure */ | |
117 | +struct cacheblock { | |
118 | + spinlock_t lock; /* Lock to protect operations on the bio list */ | |
119 | + sector_t block; /* Sector number of the cached block */ | |
120 | + unsigned short state; /* State of a block */ | |
121 | + unsigned long counter; /* Logical timestamp of the block's last access */ | |
122 | + struct bio_list bios; /* List of pending bios */ | |
123 | +}; | |
124 | + | |
125 | + | |
126 | +/**************************************************************************** | |
127 | + * Functions and data structures for implementing kcached, which handles async | |
128 | + * I/O. Code for page and queue handling is borrowed from kcopyd.c. | |
129 | + ****************************************************************************/ | |
130 | + | |
131 | +/* | |
132 | + * Functions for handling pages used by async I/O. | |
133 | + * The data requested by a bio may not be aligned with cache blocks, in | |
134 | + * which case additional pages are required for the request that is forwarded | |
135 | + * to the source device. A pool of pages is reserved for this purpose. | |
136 | + */ | |
137 | + | |
138 | +static struct page_list *alloc_pl(void) | |
139 | +{ | |
140 | + struct page_list *pl; | |
141 | + | |
142 | + pl = kmalloc(sizeof(*pl), GFP_KERNEL); | |
143 | + if (!pl) | |
144 | + return NULL; | |
145 | + | |
146 | + pl->page = alloc_page(GFP_KERNEL); | |
147 | + if (!pl->page) { | |
148 | + kfree(pl); | |
149 | + return NULL; | |
150 | + } | |
151 | + | |
152 | + return pl; | |
153 | +} | |
154 | + | |
155 | +static void free_pl(struct page_list *pl) | |
156 | +{ | |
157 | + __free_page(pl->page); | |
158 | + kfree(pl); | |
159 | +} | |
160 | + | |
161 | +static void drop_pages(struct page_list *pl) | |
162 | +{ | |
163 | + struct page_list *next; | |
164 | + | |
165 | + while (pl) { | |
166 | + next = pl->next; | |
167 | + free_pl(pl); | |
168 | + pl = next; | |
169 | + } | |
170 | +} | |
171 | + | |
172 | +static int kcached_get_pages(struct cache_c *dmc, unsigned int nr, | |
173 | + struct page_list **pages) | |
174 | +{ | |
175 | + struct page_list *pl; | |
176 | + | |
177 | + spin_lock(&dmc->lock); | |
178 | + if (dmc->nr_free_pages < nr) { | |
179 | + DPRINTK("kcached_get_pages: No free pages: %u<%u", | |
180 | + dmc->nr_free_pages, nr); | |
181 | + spin_unlock(&dmc->lock); | |
182 | + return -ENOMEM; | |
183 | + } | |
184 | + | |
185 | + dmc->nr_free_pages -= nr; | |
186 | + for (*pages = pl = dmc->pages; --nr; pl = pl->next) | |
187 | + ; | |
188 | + | |
189 | + dmc->pages = pl->next; | |
190 | + pl->next = NULL; | |
191 | + | |
192 | + spin_unlock(&dmc->lock); | |
193 | + | |
194 | + return 0; | |
195 | +} | |
196 | + | |
197 | +static void kcached_put_pages(struct cache_c *dmc, struct page_list *pl) | |
198 | +{ | |
199 | + struct page_list *cursor; | |
200 | + | |
201 | + spin_lock(&dmc->lock); | |
202 | + for (cursor = pl; cursor->next; cursor = cursor->next) | |
203 | + dmc->nr_free_pages++; | |
204 | + | |
205 | + dmc->nr_free_pages++; | |
206 | + cursor->next = dmc->pages; | |
207 | + dmc->pages = pl; | |
208 | + | |
209 | + spin_unlock(&dmc->lock); | |
210 | +} | |
211 | + | |
212 | +static int alloc_bio_pages(struct cache_c *dmc, unsigned int nr) | |
213 | +{ | |
214 | + unsigned int i; | |
215 | + struct page_list *pl = NULL, *next; | |
216 | + | |
217 | + for (i = 0; i < nr; i++) { | |
218 | + next = alloc_pl(); | |
219 | + if (!next) { | |
220 | + if (pl) | |
221 | + drop_pages(pl); | |
222 | + return -ENOMEM; | |
223 | + } | |
224 | + next->next = pl; | |
225 | + pl = next; | |
226 | + } | |
227 | + | |
228 | + kcached_put_pages(dmc, pl); | |
229 | + dmc->nr_pages += nr; | |
230 | + | |
231 | + return 0; | |
232 | +} | |
233 | + | |
234 | +static void free_bio_pages(struct cache_c *dmc) | |
235 | +{ | |
236 | + BUG_ON(dmc->nr_free_pages != dmc->nr_pages); | |
237 | + drop_pages(dmc->pages); | |
238 | + dmc->pages = NULL; | |
239 | + dmc->nr_free_pages = dmc->nr_pages = 0; | |
240 | +} | |
241 | + | |
242 | +/* Structure for a kcached job */ | |
243 | +struct kcached_job { | |
244 | + struct list_head list; | |
245 | + struct cache_c *dmc; | |
246 | + struct bio *bio; /* Original bio */ | |
247 | + struct io_region src; | |
248 | + struct io_region dest; | |
249 | + struct cacheblock *cacheblock; | |
250 | + int rw; | |
251 | + /* | |
252 | + * When the original bio is not aligned with cache blocks, | |
253 | + * we need extra bvecs and pages for padding. | |
254 | + */ | |
255 | + struct bio_vec *bvec; | |
256 | + unsigned int nr_pages; | |
257 | + struct page_list *pages; | |
258 | +}; | |
259 | + | |
260 | +static struct workqueue_struct *_kcached_wq; | |
261 | +static struct work_struct _kcached_work; | |
262 | + | |
263 | +static inline void wake(void) | |
264 | +{ | |
265 | + queue_work(_kcached_wq, &_kcached_work); | |
266 | +} | |
267 | + | |
268 | +#define MIN_JOBS 1024 | |
269 | + | |
270 | +static struct kmem_cache *_job_cache; | |
271 | +static mempool_t *_job_pool; | |
272 | + | |
273 | +static DEFINE_SPINLOCK(_job_lock); | |
274 | + | |
275 | +static LIST_HEAD(_complete_jobs); | |
276 | +static LIST_HEAD(_io_jobs); | |
277 | +static LIST_HEAD(_pages_jobs); | |
278 | + | |
279 | +static int jobs_init(void) | |
280 | +{ | |
281 | + _job_cache = kmem_cache_create("kcached-jobs", | |
282 | + sizeof(struct kcached_job), | |
283 | + __alignof__(struct kcached_job), | |
284 | + 0, NULL, NULL); | |
285 | + if (!_job_cache) | |
286 | + return -ENOMEM; | |
287 | + | |
288 | + _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab, | |
289 | + mempool_free_slab, _job_cache); | |
290 | + if (!_job_pool) { | |
291 | + kmem_cache_destroy(_job_cache); | |
292 | + return -ENOMEM; | |
293 | + } | |
294 | + | |
295 | + return 0; | |
296 | +} | |
297 | + | |
298 | +static void jobs_exit(void) | |
299 | +{ | |
300 | + BUG_ON(!list_empty(&_complete_jobs)); | |
301 | + BUG_ON(!list_empty(&_io_jobs)); | |
302 | + BUG_ON(!list_empty(&_pages_jobs)); | |
303 | + | |
304 | + mempool_destroy(_job_pool); | |
305 | + kmem_cache_destroy(_job_cache); | |
306 | + _job_pool = NULL; | |
307 | + _job_cache = NULL; | |
308 | +} | |
309 | + | |
310 | +/* | |
311 | + * Functions to push and pop a job onto the head of a given job list. | |
312 | + */ | |
313 | +static inline struct kcached_job *pop(struct list_head *jobs) | |
314 | +{ | |
315 | + struct kcached_job *job = NULL; | |
316 | + unsigned long flags; | |
317 | + | |
318 | + spin_lock_irqsave(&_job_lock, flags); | |
319 | + | |
320 | + if (!list_empty(jobs)) { | |
321 | + job = list_entry(jobs->next, struct kcached_job, list); | |
322 | + list_del(&job->list); | |
323 | + } | |
324 | + spin_unlock_irqrestore(&_job_lock, flags); | |
325 | + | |
326 | + return job; | |
327 | +} | |
328 | + | |
329 | +static inline void push(struct list_head *jobs, struct kcached_job *job) | |
330 | +{ | |
331 | + unsigned long flags; | |
332 | + | |
333 | + spin_lock_irqsave(&_job_lock, flags); | |
334 | + list_add_tail(&job->list, jobs); | |
335 | + spin_unlock_irqrestore(&_job_lock, flags); | |
336 | +} | |
337 | + | |
338 | + | |
339 | +/**************************************************************************** | |
340 | + * Functions for asynchronously fetching data from source device and storing | |
341 | + * data in cache device. Because the requested data may not align with the | |
342 | + * cache blocks, extra handling is required to pad a block request and extract | |
343 | + * the requested data from the results. | |
344 | + ****************************************************************************/ | |
345 | + | |
346 | +static void io_callback(unsigned long error, void *context) | |
347 | +{ | |
348 | + struct kcached_job *job = (struct kcached_job *) context; | |
349 | + | |
350 | + if (error) { | |
351 | + /* TODO */ | |
352 | + DMERR("io_callback: io error"); | |
353 | + return; | |
354 | + } | |
355 | + | |
356 | + if (job->rw == READ) { | |
357 | + job->rw = WRITE; | |
358 | + push(&_io_jobs, job); | |
359 | + } else | |
360 | + push(&_complete_jobs, job); | |
361 | + wake(); | |
362 | +} | |
363 | + | |
364 | +/* | |
365 | + * Fetch data from the source device asynchronously. | |
366 | + * For a READ bio, if a cache block is larger than the requested data, then | |
367 | + * additional data are prefetched. Larger cache block size enables more | |
368 | + * aggressive read prefetching, which is useful for read-mostly usage. | |
369 | + * For a WRITE bio, if a cache block is larger than the requested data, the | |
370 | + * entire block needs to be fetched, and larger block size incurs more overhead. | |
371 | + * In scenarios where writes are frequent, 4KB is a good cache block size. | |
372 | + */ | |
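+/*
+ * A worked example of the head/tail padding computed below (illustrative
+ * values): with 16-sector (8KB) cache blocks, a 4KB bio that starts at
+ * sector offset 2 within its block has head = to_bytes(2) = 1KB before it
+ * and tail = 8KB - 4KB - 1KB = 3KB after it; both paddings are read into
+ * reserved pages so that a complete, aligned block reaches the cache.
+ */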
373 | +static int do_fetch(struct kcached_job *job) | |
374 | +{ | |
375 | + int r = 0, i, j; | |
376 | + struct bio *bio = job->bio; | |
377 | + struct cache_c *dmc = job->dmc; | |
378 | + unsigned int offset, head, tail, remaining, nr_vecs, idx = 0; | |
379 | + struct bio_vec *bvec; | |
380 | + struct page_list *pl; | |
381 | + | |
382 | + offset = (unsigned int) (bio->bi_sector & dmc->block_mask); | |
383 | + head = to_bytes(offset); | |
384 | + tail = to_bytes(dmc->block_size) - bio->bi_size - head; | |
385 | + | |
386 | + DPRINTK("do_fetch: %llu(%llu->%llu,%llu), head:%u,tail:%u", | |
387 | + bio->bi_sector, job->src.sector, job->dest.sector, | |
388 | + job->src.count, head, tail); | |
389 | + | |
390 | + if (bio_data_dir(bio) == READ) { /* The original request is a READ */ | |
391 | + if (0 == job->nr_pages) { /* The request is aligned to cache block */ | |
392 | + r = dm_io_async_bvec(1, &job->src, READ, | |
393 | + bio->bi_io_vec + bio->bi_idx, | |
394 | + io_callback, job); | |
395 | + return r; | |
396 | + } | |
397 | + | |
398 | + nr_vecs = bio->bi_vcnt - bio->bi_idx + job->nr_pages; | |
399 | + bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_NOIO); | |
400 | + if (!bvec) { | |
401 | + DMERR("do_fetch: No memory"); | |
402 | + return 1; | |
403 | + } | |
404 | + | |
405 | + pl = job->pages; | |
406 | + i = 0; | |
407 | + while (head) { | |
408 | + bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE); | |
409 | + bvec[i].bv_offset = 0; | |
410 | + bvec[i].bv_page = pl->page; | |
411 | + head -= bvec[i].bv_len; | |
412 | + pl = pl->next; | |
413 | + i++; | |
414 | + } | |
415 | + | |
416 | + remaining = bio->bi_size; | |
417 | + j = bio->bi_idx; | |
418 | + while (remaining) { | |
419 | + bvec[i] = bio->bi_io_vec[j]; | |
420 | + remaining -= bvec[i].bv_len; | |
421 | + i++; j++; | |
422 | + } | |
423 | + | |
424 | + while (tail) { | |
425 | + bvec[i].bv_len = min(tail, (unsigned int)PAGE_SIZE); | |
426 | + bvec[i].bv_offset = 0; | |
427 | + bvec[i].bv_page = pl->page; | |
428 | + tail -= bvec[i].bv_len; | |
429 | + pl = pl->next; | |
430 | + i++; | |
431 | + } | |
432 | + | |
433 | + job->bvec = bvec; | |
434 | + r = dm_io_async_bvec(1, &job->src, READ, job->bvec, io_callback, job); | |
435 | + return r; | |
436 | + } else { /* The original request is a WRITE */ | |
437 | + pl = job->pages; | |
438 | + | |
439 | + if (head && tail) { /* Special case */ | |
440 | + bvec = kmalloc(job->nr_pages * sizeof(*bvec), GFP_KERNEL); | |
441 | + if (!bvec) { | |
442 | + DMERR("do_fetch: No memory"); | |
443 | + return 1; | |
444 | + } | |
445 | + for (i=0; i<job->nr_pages; i++) { | |
446 | + bvec[i].bv_len = PAGE_SIZE; | |
447 | + bvec[i].bv_offset = 0; | |
448 | + bvec[i].bv_page = pl->page; | |
449 | + pl = pl->next; | |
450 | + } | |
451 | + job->bvec = bvec; | |
452 | + r = dm_io_async_bvec(1, &job->src, READ, job->bvec, | |
453 | + io_callback, job); | |
454 | + return r; | |
455 | + } | |
456 | + | |
457 | + bvec = kmalloc((job->nr_pages + bio->bi_vcnt - bio->bi_idx) | |
458 | + * sizeof(*bvec), GFP_KERNEL); | |
459 | + if (!bvec) { | |
460 | + DMERR("do_fetch: No memory"); | |
461 | + return 1; | |
462 | + } | |
463 | + | |
464 | + i = 0; | |
465 | + while (head) { | |
466 | + bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE); | |
467 | + bvec[i].bv_offset = 0; | |
468 | + bvec[i].bv_page = pl->page; | |
469 | + head -= bvec[i].bv_len; | |
470 | + pl = pl->next; | |
471 | + i++; | |
472 | + } | |
473 | + | |
474 | + remaining = bio->bi_size; | |
475 | + j = bio->bi_idx; | |
476 | + while (remaining) { | |
477 | + bvec[i] = bio->bi_io_vec[j]; | |
478 | + remaining -= bvec[i].bv_len; | |
479 | + i++; j++; | |
480 | + } | |
481 | + | |
482 | + if (tail) { | |
483 | + idx = i; | |
484 | + bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) & | |
485 | + (PAGE_SIZE - 1); | |
486 | + bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset; | |
487 | + bvec[i].bv_page = pl->page; | |
488 | + tail -= bvec[i].bv_len; | |
489 | + pl = pl->next; i++; | |
490 | + while (tail) { | |
491 | + bvec[i].bv_len = PAGE_SIZE; | |
492 | + bvec[i].bv_offset = 0; | |
493 | + bvec[i].bv_page = pl->page; | |
494 | + tail -= bvec[i].bv_len; | |
495 | + pl = pl->next; i++; | |
496 | + } | |
497 | + } | |
498 | + | |
499 | + job->bvec = bvec; | |
500 | + r = dm_io_async_bvec(1, &job->src, READ, job->bvec + idx, | |
501 | + io_callback, job); | |
502 | + | |
503 | + return r; | |
504 | + } | |
505 | +} | |
506 | + | |
507 | +/* | |
508 | + * Store data to the cache device asynchronously. | |
509 | + * For a READ bio request, the data fetched from the source device is returned | |
510 | + * to the kernel and stored in the cache at the same time. | |
511 | + * For a WRITE bio request, the data are written to the cache and source device | |
512 | + * at the same time. | |
513 | + */ | |
514 | +static int do_store(struct kcached_job *job) | |
515 | +{ | |
516 | + int i, j, r = 0; | |
517 | + struct bio *bio = job->bio, *clone; | |
518 | + struct cache_c *dmc = job->dmc; | |
519 | + unsigned int offset, head, tail, remaining, nr_vecs; | |
520 | + struct bio_vec *bvec; | |
521 | + | |
522 | + offset = (unsigned int) (bio->bi_sector & dmc->block_mask); | |
523 | + head = to_bytes(offset); | |
524 | + tail = to_bytes(dmc->block_size) - bio->bi_size - head; | |
525 | + | |
526 | + DPRINTK("do_store: %llu(%llu->%llu,%llu), head:%u,tail:%u", | |
527 | + bio->bi_sector, job->src.sector, job->dest.sector, | |
528 | + job->src.count, head, tail); | |
529 | + | |
530 | + /* A READ is acknowledged as soon as the requested data is fetched, and | |
531 | + does not have to wait for it to be stored in the cache. The bio is cloned | |
532 | + so that the original one can be ended here. But to avoid copying | |
533 | + pages, we reuse the pages allocated for the original bio, and take a | |
534 | + reference on each of them so the pages are not freed before the cache | |
535 | + insertion is completed. | |
536 | + */ | |
537 | + if (bio_data_dir(bio) == READ) { | |
538 | + clone = bio_clone(bio, GFP_NOIO); | |
539 | + for (i=bio->bi_idx; i<bio->bi_vcnt; i++) { | |
540 | + get_page(bio->bi_io_vec[i].bv_page); | |
541 | + } | |
542 | + DPRINTK("bio ended for %llu:%u", bio->bi_sector, bio->bi_size); | |
543 | + bio_endio(bio, bio->bi_size, 0); | |
544 | + bio = clone; | |
545 | + job->bio = clone; | |
546 | + } | |
547 | + | |
548 | + if (0 == job->nr_pages) /* Original request is aligned with cache blocks */ | |
549 | + r = dm_io_async_bvec(1, &job->dest, WRITE, bio->bi_io_vec + bio->bi_idx, | |
550 | + io_callback, job); | |
551 | + else { | |
552 | + if (bio_data_dir(bio) == WRITE && head > 0 && tail > 0) { | |
553 | + DPRINTK("Special case: %lu %u %u", bio_data_dir(bio), head, tail); | |
554 | + nr_vecs = job->nr_pages + bio->bi_vcnt - bio->bi_idx; | |
555 | + if (offset && (offset + bio->bi_size < PAGE_SIZE)) nr_vecs++; | |
556 | + DPRINTK("Create %u new vecs", nr_vecs); | |
557 | + bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_KERNEL); | |
558 | + if (!bvec) { | |
559 | + DMERR("do_store: No memory"); | |
560 | + return 1; | |
561 | + } | |
562 | + | |
563 | + i = 0; | |
564 | + while (head) { | |
565 | + bvec[i].bv_len = min(head, job->bvec[i].bv_len); | |
566 | + bvec[i].bv_offset = 0; | |
567 | + bvec[i].bv_page = job->bvec[i].bv_page; | |
568 | + head -= bvec[i].bv_len; | |
569 | + i++; | |
570 | + } | |
571 | + remaining = bio->bi_size; | |
572 | + j = bio->bi_idx; | |
573 | + while (remaining) { | |
574 | + bvec[i] = bio->bi_io_vec[j]; | |
575 | + remaining -= bvec[i].bv_len; | |
576 | + i++; j++; | |
577 | + } | |
578 | + j = (to_bytes(offset) + bio->bi_size) / PAGE_SIZE; | |
579 | + bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) - | |
580 | + j * PAGE_SIZE; | |
581 | + bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset; | |
582 | + bvec[i].bv_page = job->bvec[j].bv_page; | |
583 | + tail -= bvec[i].bv_len; | |
584 | + i++; j++; | |
585 | + while (tail) { | |
586 | + bvec[i] = job->bvec[j]; | |
587 | + tail -= bvec[i].bv_len; | |
588 | + i++; j++; | |
589 | + } | |
590 | + kfree(job->bvec); | |
591 | + job->bvec = bvec; | |
592 | + } | |
593 | + | |
594 | + r = dm_io_async_bvec(1, &job->dest, WRITE, job->bvec, io_callback, job); | |
595 | + } | |
596 | + | |
597 | + return r; | |
598 | +} | |
599 | + | |
600 | +static int do_io(struct kcached_job *job) | |
601 | +{ | |
602 | + int r = 0; | |
603 | + | |
604 | + if (job->rw == READ) { /* Read from source device */ | |
605 | + r = do_fetch(job); | |
606 | + } else { /* Write to cache device */ | |
607 | + r = do_store(job); | |
608 | + } | |
609 | + | |
610 | + return r; | |
611 | +} | |
612 | + | |
613 | +static int do_pages(struct kcached_job *job) | |
614 | +{ | |
615 | + int r = 0; | |
616 | + | |
617 | + r = kcached_get_pages(job->dmc, job->nr_pages, &job->pages); | |
618 | + | |
619 | + if (r == -ENOMEM) /* can't complete now */ | |
620 | + return 1; | |
621 | + | |
622 | + /* this job is ready for io */ | |
623 | + push(&_io_jobs, job); | |
624 | + return 0; | |
625 | +} | |
626 | + | |
627 | +/* | |
628 | + * Flush the bios that are waiting for this cache insertion or write back. | |
629 | + */ | |
630 | +static void flush_bios(struct cacheblock *cacheblock) | |
631 | +{ | |
632 | + struct bio *bio; | |
633 | + struct bio *n; | |
634 | + | |
635 | + spin_lock(&cacheblock->lock); | |
636 | + bio = bio_list_get(&cacheblock->bios); | |
637 | + if (is_state(cacheblock->state, WRITEBACK)) { /* Write back finished */ | |
638 | + cacheblock->state = VALID; | |
639 | + } else { /* Cache insertion finished */ | |
640 | + set_state(cacheblock->state, VALID); | |
641 | + clear_state(cacheblock->state, RESERVED); | |
642 | + } | |
643 | + spin_unlock(&cacheblock->lock); | |
644 | + | |
645 | + while (bio) { | |
646 | + n = bio->bi_next; | |
647 | + bio->bi_next = NULL; | |
648 | + DPRINTK("Flush bio: %llu->%llu (%u bytes)", | |
649 | + cacheblock->block, bio->bi_sector, bio->bi_size); | |
650 | + generic_make_request(bio); | |
651 | + bio = n; | |
652 | + } | |
653 | +} | |
654 | + | |
655 | +static int do_complete(struct kcached_job *job) | |
656 | +{ | |
657 | + int i, r = 0; | |
658 | + struct bio *bio = job->bio; | |
659 | + | |
660 | + DPRINTK("do_complete: %llu", bio->bi_sector); | |
661 | + | |
662 | + if (bio_data_dir(bio) == READ) { | |
663 | + for (i=bio->bi_idx; i<bio->bi_vcnt; i++) { | |
664 | + put_page(bio->bi_io_vec[i].bv_page); | |
665 | + } | |
666 | + bio_put(bio); | |
667 | + } else | |
668 | + bio_endio(bio, bio->bi_size, 0); | |
669 | + | |
670 | + if (job->nr_pages > 0) { | |
671 | + kfree(job->bvec); | |
672 | + kcached_put_pages(job->dmc, job->pages); | |
673 | + } | |
674 | + | |
675 | + flush_bios(job->cacheblock); | |
676 | + if (atomic_dec_and_test(&job->dmc->nr_jobs)) | |
677 | + wake_up(&job->dmc->destroyq); | |
678 | + | |
679 | + mempool_free(job, _job_pool); /* free only after job->dmc has been used */ | |
680 | + | |
681 | + return r; | |
682 | +} | |
683 | + | |
684 | +/* | |
685 | + * Run through a list for as long as possible. Returns the count | |
686 | + * of successful jobs. | |
687 | + */ | |
688 | +static int process_jobs(struct list_head *jobs, | |
689 | + int (*fn) (struct kcached_job *)) | |
690 | +{ | |
691 | + struct kcached_job *job; | |
692 | + int r, count = 0; | |
693 | + | |
694 | + while ((job = pop(jobs))) { | |
695 | + r = fn(job); | |
696 | + | |
697 | + if (r < 0) { | |
698 | + /* error this rogue job */ | |
699 | + DMERR("process_jobs: Job processing error"); | |
700 | + } | |
701 | + | |
702 | + if (r > 0) { | |
703 | + /* | |
704 | + * We couldn't service this job ATM, so | |
705 | + * push this job back onto the list. | |
706 | + */ | |
707 | + push(jobs, job); | |
708 | + break; | |
709 | + } | |
710 | + | |
711 | + count++; | |
712 | + } | |
713 | + | |
714 | + return count; | |
715 | +} | |
716 | + | |
717 | +static void do_work(struct work_struct *ignored) | |
718 | +{ | |
719 | + process_jobs(&_complete_jobs, do_complete); | |
720 | + process_jobs(&_pages_jobs, do_pages); | |
721 | + process_jobs(&_io_jobs, do_io); | |
722 | +} | |
723 | + | |
724 | +static void queue_job(struct kcached_job *job) | |
725 | +{ | |
726 | + atomic_inc(&job->dmc->nr_jobs); | |
727 | + if (job->nr_pages > 0) /* Request pages */ | |
728 | + push(&_pages_jobs, job); | |
729 | + else /* Go ahead to do I/O */ | |
730 | + push(&_io_jobs, job); | |
731 | + wake(); | |
732 | +} | |
733 | + | |
734 | +static int kcached_init(struct cache_c *dmc) | |
735 | +{ | |
736 | + int r; | |
737 | + | |
738 | + spin_lock_init(&dmc->lock); | |
739 | + dmc->pages = NULL; | |
740 | + dmc->nr_pages = dmc->nr_free_pages = 0; | |
741 | + r = alloc_bio_pages(dmc, DMCACHE_COPY_PAGES); | |
742 | + if (r) { | |
743 | + DMERR("kcached_init: Could not allocate bio pages"); | |
744 | + return r; | |
745 | + } | |
746 | + | |
747 | + r = dm_io_get(DMCACHE_COPY_PAGES); | |
748 | + if (r) { | |
749 | + DMERR("kcached_init: Could not resize dm io pool"); | |
750 | + free_bio_pages(dmc); | |
751 | + return r; | |
752 | + } | |
753 | + | |
754 | + init_waitqueue_head(&dmc->destroyq); | |
755 | + atomic_set(&dmc->nr_jobs, 0); | |
756 | + | |
757 | + return 0; | |
758 | +} | |
759 | + | |
760 | +void kcached_client_destroy(struct cache_c *dmc) | |
761 | +{ | |
762 | + /* Wait for completion of all jobs submitted by this client. */ | |
763 | + wait_event(dmc->destroyq, !atomic_read(&dmc->nr_jobs)); | |
764 | + | |
765 | + dm_io_put(dmc->nr_pages); | |
766 | + free_bio_pages(dmc); | |
767 | +} | |
768 | + | |
769 | + | |
770 | +/**************************************************************************** | |
771 | + * Functions for writing back dirty blocks. | |
772 | + * We leverage kcopyd to write back dirty blocks because it is convenient to | |
773 | + * use and it is not reasonable to reimplement the same functionality here. But we | |
774 | + * need to reserve pages for both kcached and kcopyd. TODO: dynamically change | |
775 | + * the number of reserved pages. | |
776 | + ****************************************************************************/ | |
777 | + | |
778 | +static void copy_callback(int read_err, unsigned int write_err, void *context) | |
779 | +{ | |
780 | + struct cacheblock *cacheblock = (struct cacheblock *) context; | |
781 | + | |
782 | + flush_bios(cacheblock); | |
783 | +} | |
784 | + | |
785 | +static void copy_block(struct cache_c *dmc, struct io_region src, | |
786 | + struct io_region dest, struct cacheblock *cacheblock) | |
787 | +{ | |
788 | + DPRINTK("Copying: %llu:%llu->%llu:%llu", | |
789 | + src.sector, src.count * 512, dest.sector, dest.count * 512); | |
790 | + kcopyd_copy(dmc->kcp_client, &src, 1, &dest, 0, copy_callback, cacheblock); | |
791 | +} | |
792 | + | |
793 | +static void write_back(struct cache_c *dmc, sector_t index, unsigned int length) | |
794 | +{ | |
795 | + struct io_region src, dest; | |
796 | + struct cacheblock *cacheblock = &dmc->cache[index]; | |
797 | + unsigned int i; | |
798 | + | |
799 | + DPRINTK("Write back block %llu(%llu, %u)", | |
800 | + index, cacheblock->block, length); | |
801 | + src.bdev = dmc->cache_dev->bdev; | |
802 | + src.sector = index << dmc->block_shift; | |
803 | + src.count = dmc->block_size * length; | |
804 | + dest.bdev = dmc->src_dev->bdev; | |
805 | + dest.sector = cacheblock->block; | |
806 | + dest.count = dmc->block_size * length; | |
807 | + | |
808 | + for (i=0; i<length; i++) | |
809 | + set_state(dmc->cache[index+i].state, WRITEBACK); | |
810 | + dmc->dirty_blocks -= length; | |
811 | + copy_block(dmc, src, dest, cacheblock); | |
812 | +} | |
813 | + | |
814 | + | |
815 | +/**************************************************************************** | |
816 | + * Functions for implementing the various cache operations. | |
817 | + ****************************************************************************/ | |
818 | + | |
819 | +/* | |
820 | + * Map a block on the source device to a set in the cache (returns the set number). | |
821 | + */ | |
822 | +static unsigned long hash_block(struct cache_c *dmc, sector_t block) | |
823 | +{ | |
824 | + unsigned long set_number, value; | |
825 | + | |
826 | + value = (unsigned long)(block >> (dmc->block_shift + | |
827 | + dmc->consecutive_shift)); | |
828 | + set_number = hash_long(value, dmc->bits) / dmc->assoc; | |
829 | + | |
830 | + return set_number; | |
831 | +} | |
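+/*
+ * With the default parameters (8-sector blocks, CONSECUTIVE_BLOCKS = 512),
+ * the shift above drops block_shift + consecutive_shift = 3 + 9 = 12 bits,
+ * so runs of 512 consecutive cache blocks (2MB of the source device) share
+ * one hash value and map to the same set, preserving spatial locality.
+ */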
832 | + | |
833 | +/* | |
834 | + * Reset the LRU counters (the cache's global counter and each cache block's | |
835 | + * counter). This is a naive implementation. However, considering how rarely | |
836 | + * this event occurs, it may be more efficient than other, more complex | |
837 | + * schemes. TODO: a more elegant solution. | |
838 | + */ | |
839 | +static void cache_reset_counter(struct cache_c *dmc) | |
840 | +{ | |
841 | + sector_t i; | |
842 | + struct cacheblock *cache = dmc->cache; | |
843 | + | |
844 | + DPRINTK("Reset LRU counters"); | |
845 | + for (i=0; i<dmc->size; i++) | |
846 | + cache[i].counter = 0; | |
847 | + | |
848 | + dmc->counter = 0; | |
849 | +} | |
850 | + | |
851 | +/* | |
852 | + * Lookup a block in the cache. | |
853 | + * | |
854 | + * Return value: | |
855 | + * 1: cache hit (cache_block stores the index of the matched block) | |
856 | + * 0: cache miss but frame is allocated for insertion; cache_block stores the | |
857 | + * frame's index: | |
858 | + * If there are empty frames, then the first one encountered is used. | |
859 | + * If there are clean frames, then the LRU clean block is replaced. | |
860 | + * 2: cache miss and frame is not allocated; cache_block stores the LRU dirty | |
861 | + * block's index: | |
862 | + * This happens when the entire set is dirty. | |
863 | + * -1: cache miss and no room for insertion: | |
864 | + * This happens when the entire set is in a transition state (RESERVED or | |
865 | + * WRITEBACK). | |
866 | + * | |
867 | + */ | |
868 | +static int cache_lookup(struct cache_c *dmc, sector_t block, | |
869 | + sector_t *cache_block) | |
870 | +{ | |
871 | + unsigned long set_number = hash_block(dmc, block); | |
872 | + sector_t index; | |
873 | + int i, res; | |
874 | + unsigned int cache_assoc = dmc->assoc; | |
875 | + struct cacheblock *cache = dmc->cache; | |
876 | + int invalid = -1, oldest = -1, oldest_clean = -1; | |
877 | + unsigned long counter = ULONG_MAX, clean_counter = ULONG_MAX; | |
878 | + | |
879 | + index=set_number * cache_assoc; | |
880 | + | |
881 | + for (i=0; i<cache_assoc; i++, index++) { | |
882 | + if (is_state(cache[index].state, VALID) || | |
883 | + is_state(cache[index].state, RESERVED)) { | |
884 | + if (cache[index].block == block) { | |
885 | + *cache_block = index; | |
886 | + /* Reset all counters if the largest one is going to overflow */ | |
887 | + if (dmc->counter == ULONG_MAX) cache_reset_counter(dmc); | |
888 | + cache[index].counter = ++dmc->counter; | |
889 | + break; | |
890 | + } else { | |
891 | + /* Don't consider blocks that are in the middle of copying */ | |
892 | + if (!is_state(cache[index].state, RESERVED) && | |
893 | + !is_state(cache[index].state, WRITEBACK)) { | |
894 | + if (!is_state(cache[index].state, DIRTY) && | |
895 | + cache[index].counter < clean_counter) { | |
896 | + clean_counter = cache[index].counter; | |
897 | + oldest_clean = i; | |
898 | + } | |
899 | + if (cache[index].counter < counter) { | |
900 | + counter = cache[index].counter; | |
901 | + oldest = i; | |
902 | + } | |
903 | + } | |
904 | + } | |
905 | + } else { | |
906 | + if (-1 == invalid) invalid = i; | |
907 | + } | |
908 | + } | |
909 | + | |
910 | + res = i < cache_assoc ? 1 : 0; | |
911 | + if (!res) { /* Cache miss */ | |
912 | + if (invalid != -1) /* Choose the first empty frame */ | |
913 | + *cache_block = set_number * cache_assoc + invalid; | |
914 | + else if (oldest_clean != -1) /* Choose the LRU clean block to replace */ | |
915 | + *cache_block = set_number * cache_assoc + oldest_clean; | |
916 | + else if (oldest != -1) { /* Choose the LRU dirty block to evict */ | |
917 | + res = 2; | |
918 | + *cache_block = set_number * cache_assoc + oldest; | |
919 | + } else { | |
920 | + res = -1; | |
921 | + } | |
922 | + } | |
923 | + | |
924 | + if (-1 == res) | |
925 | + DPRINTK("Cache lookup: Block %llu(%lu):%s", | |
926 | + block, set_number, "NO ROOM"); | |
927 | + else | |
928 | + DPRINTK("Cache lookup: Block %llu(%lu):%llu(%s)", | |
929 | + block, set_number, *cache_block, | |
930 | + 1 == res ? "HIT" : (0 == res ? "MISS" : "WB NEEDED")); | |
931 | + return res; | |
932 | +} | |
933 | + | |
934 | +/* | |
935 | + * Insert a block into the cache (in the frame specified by cache_block). | |
936 | + */ | |
937 | +static int cache_insert(struct cache_c *dmc, sector_t block, | |
938 | + sector_t cache_block) | |
939 | +{ | |
940 | + struct cacheblock *cache = dmc->cache; | |
941 | + | |
942 | + /* Mark the block as RESERVED because although it is allocated, the data are | |
943 | + not in place until the asynchronous I/O (kcached) job finishes. | |
944 | + */ | |
945 | + cache[cache_block].block = block; | |
946 | + cache[cache_block].state = RESERVED; | |
947 | + if (dmc->counter == ULONG_MAX) cache_reset_counter(dmc); | |
948 | + cache[cache_block].counter = ++dmc->counter; | |
949 | + | |
950 | + return 1; | |
951 | +} | |
952 | + | |
953 | +/* | |
954 | + * Invalidate a block (specified by cache_block) in the cache. | |
955 | + */ | |
956 | +static void cache_invalidate(struct cache_c *dmc, sector_t cache_block) | |
957 | +{ | |
958 | + struct cacheblock *cache = dmc->cache; | |
959 | + | |
960 | + DPRINTK("Cache invalidate: Block %llu(%llu)", | |
961 | + cache_block, cache[cache_block].block); | |
962 | + clear_state(cache[cache_block].state, VALID); | |
963 | +} | |
964 | + | |
965 | +/* | |
966 | + * Handle a cache hit: | |
967 | + * For READ, serve the request from the cache if the block is ready; otherwise, | |
968 | + * queue the request for later processing. | |
969 | + * For WRITE, invalidate the cache block if write-through. If write-back, | |
970 | + * serve the request from the cache if the block is ready, or queue the request | |
971 | + * for later processing otherwise. | |
972 | + */ | |
973 | +static int cache_hit(struct cache_c *dmc, struct bio* bio, sector_t cache_block) | |
974 | +{ | |
975 | + unsigned int offset = (unsigned int)(bio->bi_sector & dmc->block_mask); | |
976 | + struct cacheblock *cache = dmc->cache; | |
977 | + | |
978 | + dmc->cache_hits++; | |
979 | + | |
980 | + if (bio_data_dir(bio) == READ) { /* READ hit */ | |
981 | + bio->bi_bdev = dmc->cache_dev->bdev; | |
982 | + bio->bi_sector = (cache_block << dmc->block_shift) + offset; | |
983 | + | |
984 | + spin_lock(&cache[cache_block].lock); | |
985 | + | |
986 | + if (is_state(cache[cache_block].state, VALID)) { /* Valid cache block */ | |
987 | + spin_unlock(&cache[cache_block].lock); | |
988 | + return 1; | |
989 | + } | |
990 | + | |
991 | + /* Cache block is not ready yet */ | |
992 | + DPRINTK("Add to bio list %s(%llu)", | |
993 | + dmc->cache_dev->name, bio->bi_sector); | |
994 | + bio_list_add(&cache[cache_block].bios, bio); | |
995 | + | |
996 | + spin_unlock(&cache[cache_block].lock); | |
997 | + return 0; | |
998 | + } else { /* WRITE hit */ | |
999 | + if (dmc->write_policy == WRITE_THROUGH) { /* Invalidate cached data */ | |
1000 | + cache_invalidate(dmc, cache_block); | |
1001 | + bio->bi_bdev = dmc->src_dev->bdev; | |
1002 | + return 1; | |
1003 | + } | |
1004 | + | |
1005 | + /* Write delay */ | |
1006 | + if (!is_state(cache[cache_block].state, DIRTY)) { | |
1007 | + set_state(cache[cache_block].state, DIRTY); | |
1008 | + dmc->dirty_blocks++; | |
1009 | + } | |
1010 | + | |
1011 | + spin_lock(&cache[cache_block].lock); | |
1012 | + | |
1013 | + /* In the middle of write back */ | |
1014 | + if (is_state(cache[cache_block].state, WRITEBACK)) { | |
1015 | + /* Delay this write until the block is written back */ | |
1016 | + bio->bi_bdev = dmc->src_dev->bdev; | |
1017 | + DPRINTK("Add to bio list %s(%llu)", | |
1018 | + dmc->src_dev->name, bio->bi_sector); | |
1019 | + bio_list_add(&cache[cache_block].bios, bio); | |
1020 | + spin_unlock(&cache[cache_block].lock); | |
1021 | + return 0; | |
1022 | + } | |
1023 | + | |
1024 | + /* Cache block not ready yet */ | |
1025 | + if (is_state(cache[cache_block].state, RESERVED)) { | |
1026 | + bio->bi_bdev = dmc->cache_dev->bdev; | |
1027 | + bio->bi_sector = (cache_block << dmc->block_shift) + offset; | |
1028 | + DPRINTK("Add to bio list %s(%llu)", | |
1029 | + dmc->cache_dev->name, bio->bi_sector); | |
1030 | + bio_list_add(&cache[cache_block].bios, bio); | |
1031 | + spin_unlock(&cache[cache_block].lock); | |
1032 | + return 0; | |
1033 | + } | |
1034 | + | |
1035 | + /* Serve the request from cache */ | |
1036 | + bio->bi_bdev = dmc->cache_dev->bdev; | |
1037 | + bio->bi_sector = (cache_block << dmc->block_shift) + offset; | |
1038 | + | |
1039 | + spin_unlock(&cache[cache_block].lock); | |
1040 | + return 1; | |
1041 | + } | |
1042 | +} | |
1043 | + | |
1044 | +static struct kcached_job *new_kcached_job(struct cache_c *dmc, struct bio* bio, | |
1045 | + sector_t request_block, | |
1046 | + sector_t cache_block) | |
1047 | +{ | |
1048 | + struct io_region src, dest; | |
1049 | + struct kcached_job *job; | |
1050 | + | |
1051 | + src.bdev = dmc->src_dev->bdev; | |
1052 | + src.sector = request_block; | |
1053 | + src.count = dmc->block_size; | |
1054 | + dest.bdev = dmc->cache_dev->bdev; | |
1055 | + dest.sector = cache_block << dmc->block_shift; | |
1056 | + dest.count = src.count; | |
1057 | + | |
1058 | + job = mempool_alloc(_job_pool, GFP_NOIO); | |
1059 | + job->dmc = dmc; | |
1060 | + job->bio = bio; | |
1061 | + job->src = src; | |
1062 | + job->dest = dest; | |
1063 | + job->cacheblock = &dmc->cache[cache_block]; | |
1064 | + | |
1065 | + return job; | |
1066 | +} | |
1067 | + | |
1068 | +/* | |
1069 | + * Handle a read cache miss: | |
1070 | + * Update the metadata; fetch the necessary block from source device; | |
1071 | + * store data to cache device. | |
1072 | + */ | |
1073 | +static int cache_read_miss(struct cache_c *dmc, struct bio* bio, | |
1074 | + sector_t cache_block) { | |
1075 | + struct cacheblock *cache = dmc->cache; | |
1076 | + unsigned int offset, head, tail; | |
1077 | + struct kcached_job *job; | |
1078 | + sector_t request_block, left; | |
1079 | + | |
1080 | + offset = (unsigned int)(bio->bi_sector & dmc->block_mask); | |
1081 | + request_block = bio->bi_sector - offset; | |
1082 | + | |
1083 | + if (cache[cache_block].state & VALID) { | |
1084 | + DPRINTK("Replacing %llu->%llu", | |
1085 | + cache[cache_block].block, request_block); | |
1086 | + dmc->replace++; | |
1087 | + } else DPRINTK("Insert block %llu at empty frame %llu", | |
1088 | + request_block, cache_block); | |
1089 | + | |
1090 | + cache_insert(dmc, request_block, cache_block); /* Update metadata first */ | |
1091 | + | |
1092 | + job = new_kcached_job(dmc, bio, request_block, cache_block); | |
1093 | + | |
1094 | + head = to_bytes(offset); | |
1095 | + | |
1096 | + left = (dmc->src_dev->bdev->bd_inode->i_size>>9) - request_block; | |
1097 | + if (left < dmc->block_size) { | |
1098 | + tail = to_bytes(left) - bio->bi_size - head; | |
1099 | + job->src.count = left; | |
1100 | + job->dest.count = left; | |
1101 | + } else | |
1102 | + tail = to_bytes(dmc->block_size) - bio->bi_size - head; | |
1103 | + | |
1104 | + /* Requested block is aligned with a cache block */ | |
1105 | + if (0 == head && 0 == tail) | |
1106 | + job->nr_pages= 0; | |
1107 | + else /* Need new pages to store extra data */ | |
1108 | + job->nr_pages = dm_div_up(head, PAGE_SIZE) + dm_div_up(tail, PAGE_SIZE); | |
1109 | + job->rw = READ; /* Fetch data from the source device */ | |
1110 | + | |
1111 | + DPRINTK("Queue job for %llu (need %u pages)", | |
1112 | + bio->bi_sector, job->nr_pages); | |
1113 | + queue_job(job); | |
1114 | + | |
1115 | + return 0; | |
1116 | +} | |
1117 | + | |
1118 | +/* | |
1119 | + * Handle a write cache miss: | |
1120 | + * If write-through, forward the request to source device. | |
1121 | + * If write-back, update the metadata; fetch the necessary block from source | |
1122 | + * device; write to cache device. | |
1123 | + */ | |
1124 | +static int cache_write_miss(struct cache_c *dmc, struct bio* bio, sector_t cache_block) { | |
1125 | + struct cacheblock *cache = dmc->cache; | |
1126 | + unsigned int offset, head, tail; | |
1127 | + struct kcached_job *job; | |
1128 | + sector_t request_block, left; | |
1129 | + | |
1130 | + if (dmc->write_policy == WRITE_THROUGH) { /* Forward request to source */ | |
1131 | + bio->bi_bdev = dmc->src_dev->bdev; | |
1132 | + return 1; | |
1133 | + } | |
1134 | + | |
1135 | + offset = (unsigned int)(bio->bi_sector & dmc->block_mask); | |
1136 | + request_block = bio->bi_sector - offset; | |
1137 | + | |
1138 | + if (cache[cache_block].state & VALID) { | |
1139 | + DPRINTK("Replacing %llu->%llu", | |
1140 | + cache[cache_block].block, request_block); | |
1141 | + dmc->replace++; | |
1142 | + } else DPRINTK("Insert block %llu at empty frame %llu", | |
1143 | + request_block, cache_block); | |
1144 | + | |
1145 | + /* Write delay */ | |
1146 | + cache_insert(dmc, request_block, cache_block); /* Update metadata first */ | |
1147 | + set_state(cache[cache_block].state, DIRTY); | |
1148 | + dmc->dirty_blocks++; | |
1149 | + | |
1150 | + job = new_kcached_job(dmc, bio, request_block, cache_block); | |
1151 | + | |
1152 | + head = to_bytes(offset); | |
1153 | + left = (dmc->src_dev->bdev->bd_inode->i_size>>9) - request_block; | |
1154 | + if (left < dmc->block_size) { | |
1155 | + tail = to_bytes(left) - bio->bi_size - head; | |
1156 | + job->src.count = left; | |
1157 | + job->dest.count = left; | |
1158 | + } else | |
1159 | + tail = to_bytes(dmc->block_size) - bio->bi_size - head; | |
1160 | + | |
1161 | + if (0 == head && 0 == tail) { /* Request is aligned with a cache block */ | |
1162 | + job->nr_pages = 0; | |
1163 | + job->rw = WRITE; | |
1164 | + } else if (head && tail){ /* Special case: need to pad both head and tail */ | |
1165 | + job->nr_pages = dm_div_up(to_bytes(job->src.count), PAGE_SIZE); | |
1166 | + job->rw = READ; | |
1167 | + } else { | |
1168 | + if (head) { /* Fetch only head */ | |
1169 | + job->src.count = to_sector(head); | |
1170 | + job->nr_pages = dm_div_up(head, PAGE_SIZE); | |
1171 | + } else { /* Fetch only tail */ | |
1172 | + job->src.sector = bio->bi_sector + to_sector(bio->bi_size); | |
1173 | + job->src.count = to_sector(tail); | |
1174 | + job->nr_pages = dm_div_up(tail, PAGE_SIZE); | |
1175 | + } | |
1176 | + job->rw = READ; | |
1177 | + } | |
1178 | + | |
1179 | + queue_job(job); | |
1180 | + | |
1181 | + return 0; | |
1182 | +} | |
1183 | + | |
1184 | +/* Handle cache misses */ | |
1185 | +static int cache_miss(struct cache_c *dmc, struct bio* bio, sector_t cache_block) { | |
1186 | + if (bio_data_dir(bio) == READ) | |
1187 | + return cache_read_miss(dmc, bio, cache_block); | |
1188 | + else | |
1189 | + return cache_write_miss(dmc, bio, cache_block); | |
1190 | +} | |
1191 | + | |
1192 | + | |
1193 | +/**************************************************************************** | |
1194 | + * Functions for implementing the operations on a cache mapping. | |
1195 | + ****************************************************************************/ | |
1196 | + | |
1197 | +/* | |
1198 | + * Decide the mapping and perform necessary cache operations for a bio request. | |
1199 | + */ | |
1200 | +static int cache_map(struct dm_target *ti, struct bio *bio, | |
1201 | + union map_info *map_context) | |
1202 | +{ | |
1203 | + struct cache_c *dmc = (struct cache_c *) ti->private; | |
1204 | + sector_t request_block, cache_block = 0, offset; | |
1205 | + int res; | |
1206 | + | |
1207 | + offset = bio->bi_sector & dmc->block_mask; | |
1208 | + request_block = bio->bi_sector - offset; | |
1209 | + | |
1210 | + DPRINTK("Got a %s for %llu ((%llu:%llu), %u bytes)", | |
1211 | + bio_rw(bio) == WRITE ? "WRITE" : (bio_rw(bio) == READ ? | |
1212 | + "READ":"READA"), bio->bi_sector, request_block, offset, | |
1213 | + bio->bi_size); | |
1214 | + | |
1215 | + if (bio_data_dir(bio) == READ) dmc->reads++; | |
1216 | + else dmc->writes++; | |
1217 | + | |
1218 | + res = cache_lookup(dmc, request_block, &cache_block); | |
1219 | + if (1 == res) /* Cache hit; serve request from cache */ | |
1220 | + return cache_hit(dmc, bio, cache_block); | |
1221 | + else if (0 == res) /* Cache miss; replacement block is found */ | |
1222 | + return cache_miss(dmc, bio, cache_block); | |
1223 | + else if (2 == res) { /* Entire cache set is dirty; initiate a write-back */ | |
1224 | + write_back(dmc, cache_block, 1); | |
1225 | + dmc->writeback++; | |
1226 | + } | |
1227 | + | |
1228 | + /* Forward to source device */ | |
1229 | + bio->bi_bdev = dmc->src_dev->bdev; | |
1230 | + | |
1231 | + return 1; | |
1232 | +} | |
1233 | + | |
1234 | +struct meta_dmc { | |
1235 | + sector_t size; | |
1236 | + unsigned int block_size; | |
1237 | + unsigned int assoc; | |
1238 | + unsigned int write_policy; | |
1239 | + unsigned int chksum; | |
1240 | +}; | |
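+/*
+ * On-disk layout implied by load_metadata()/dump_metadata() below: the last
+ * sector of the cache device holds a struct meta_dmc, and the meta_size
+ * sectors immediately before it hold one sector_t per cache block (the
+ * source sector a valid block maps to, or 0 for an invalid block).
+ */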
1241 | + | |
1242 | +/* Load metadata stored by previous session from disk. */ | |
1243 | +static int load_metadata(struct cache_c *dmc) { | |
1244 | + struct io_region where; | |
1245 | + unsigned long bits; | |
1246 | + sector_t dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9; | |
1247 | + sector_t meta_size, *meta_data, i, j, index = 0, limit, order; | |
1248 | + struct meta_dmc *meta_dmc; | |
1249 | + unsigned int chksum = 0, chksum_sav, consecutive_blocks; | |
1250 | + | |
1251 | + meta_dmc = (struct meta_dmc *)vmalloc(512); | |
1252 | + if (!meta_dmc) { | |
1253 | + DMERR("load_metadata: Unable to allocate memory"); | |
1254 | + return 1; | |
1255 | + } | |
1256 | + | |
1257 | + where.bdev = dmc->cache_dev->bdev; | |
1258 | + where.sector = dev_size - 1; | |
1259 | + where.count = 1; | |
1260 | + dm_io_sync_vm(1, &where, READ, meta_dmc, &bits); | |
1261 | + DPRINTK("Loaded cache conf: block size(%u), cache size(%llu), " \ | |
1262 | + "associativity(%u), write policy(%u), chksum(%u)", | |
1263 | + meta_dmc->block_size, meta_dmc->size, | |
1264 | + meta_dmc->assoc, meta_dmc->write_policy, | |
1265 | + meta_dmc->chksum); | |
1266 | + | |
1267 | + dmc->block_size = meta_dmc->block_size; | |
1268 | + dmc->block_shift = ffs(dmc->block_size) - 1; | |
1269 | + dmc->block_mask = dmc->block_size - 1; | |
1270 | + | |
1271 | + dmc->size = meta_dmc->size; | |
1272 | + dmc->bits = ffs(dmc->size) - 1; | |
1273 | + | |
1274 | + dmc->assoc = meta_dmc->assoc; | |
1275 | + consecutive_blocks = dmc->assoc < CONSECUTIVE_BLOCKS ? | |
1276 | + dmc->assoc : CONSECUTIVE_BLOCKS; | |
1277 | + dmc->consecutive_shift = ffs(consecutive_blocks) - 1; | |
1278 | + | |
1279 | + dmc->write_policy = meta_dmc->write_policy; | |
1280 | + chksum_sav = meta_dmc->chksum; | |
1281 | + | |
1282 | + vfree((void *)meta_dmc); | |
1283 | + | |
1284 | + | |
1285 | + order = dmc->size * sizeof(struct cacheblock); | |
1286 | + DMINFO("Allocate %lluKB (%luB per) mem for %llu-entry cache" \ | |
1287 | + "(capacity:%lluMB, associativity:%u, block size:%u " \ | |
1288 | + "sectors(%uKB), %s)", | |
1289 | + (unsigned long long) order >> 10, (unsigned long) sizeof(struct cacheblock), | |
1290 | + (unsigned long long) dmc->size, | |
1291 | + (unsigned long long) dmc->size * dmc->block_size >> (20-SECTOR_SHIFT), | |
1292 | + dmc->assoc, dmc->block_size, | |
1293 | + dmc->block_size >> (10-SECTOR_SHIFT), | |
1294 | + dmc->write_policy ? "write-back" : "write-through"); | |
1295 | + dmc->cache = (struct cacheblock *)vmalloc(order); | |
1296 | + if (!dmc->cache) { | |
1297 | + DMERR("load_metadata: Unable to allocate memory"); | |
1298 | + return 1; | |
1299 | + } | |
1300 | + | |
1301 | + meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512); | |
1302 | + /* When requesting a new bio, the number of requested bvecs has to be | |
1303 | + less than BIO_MAX_PAGES. Otherwise, NULL is returned. In dm-io.c, | |
1304 | + this return value is not checked and a kernel Oops may happen. We set | |
1305 | + the limit here to avoid such situations. (2 additional bvecs are | |
1306 | + required by dm-io for bookkeeping.) | |
1307 | + */ | |
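+ /* For example, assuming 4KB pages and BIO_MAX_PAGES == 256, this limit
+ works out to (256 - 2) * 8 = 2032 sectors (roughly 1MB) per I/O. */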
1308 | + limit = (BIO_MAX_PAGES - 2) * (PAGE_SIZE >> SECTOR_SHIFT); | |
1309 | + meta_data = (sector_t *)vmalloc(to_bytes(min(meta_size, limit))); | |
1310 | + if (!meta_data) { | |
1311 | + DMERR("load_metadata: Unable to allocate memory"); | |
1312 | + vfree((void *)dmc->cache); | |
1313 | + return 1; | |
1314 | + } | |
1315 | + | |
1316 | + while(index < meta_size) { | |
1317 | + where.sector = dev_size - 1 - meta_size + index; | |
1318 | + where.count = min(meta_size - index, limit); | |
1319 | + dm_io_sync_vm(1, &where, READ, meta_data, &bits); | |
1320 | + | |
1321 | + for (i=to_bytes(index)/sizeof(sector_t), j=0; | |
1322 | + j<to_bytes(where.count)/sizeof(sector_t) && i<dmc->size; | |
1323 | + i++, j++) { | |
1324 | + if(meta_data[j]) { | |
1325 | + dmc->cache[i].block = meta_data[j]; | |
1326 | + dmc->cache[i].state = 1; | |
1327 | + } else | |
1328 | + dmc->cache[i].state = 0; | |
1329 | + } | |
1330 | + chksum = csum_partial((char *)meta_data, to_bytes(where.count), chksum); | |
1331 | + index += where.count; | |
1332 | + } | |
1333 | + | |
1334 | + vfree((void *)meta_data); | |
1335 | + | |
1336 | + if (chksum != chksum_sav) { /* Check the checksum of the metadata */ | |
1337 | + DPRINTK("Cache metadata loaded from disk is corrupted"); | |
1338 | + vfree((void *)dmc->cache); | |
1339 | + return 1; | |
1340 | + } | |
1341 | + | |
1342 | + DMINFO("Cache metadata loaded from disk (offset %llu)", | |
1343 | + (unsigned long long) dev_size - 1 - (unsigned long long) meta_size); | |
1344 | + | |
1345 | + return 0; | |
1346 | +} | |
1347 | + | |
1348 | +/* Store metadata onto disk. */ | |
1349 | +static int dump_metadata(struct cache_c *dmc) { | |
1350 | + struct io_region where; | |
1351 | + unsigned long bits; | |
1352 | + sector_t dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9; | |
1353 | + sector_t meta_size, i, j, index = 0, limit, *meta_data; | |
1354 | + struct meta_dmc *meta_dmc; | |
1355 | + unsigned int chksum = 0; | |
1356 | + | |
1357 | + meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512); | |
1358 | + limit = (BIO_MAX_PAGES - 2) * (PAGE_SIZE >> SECTOR_SHIFT); | |
1359 | + meta_data = (sector_t *)vmalloc(to_bytes(min(meta_size, limit))); | |
1360 | + if (!meta_data) { | |
1361 | + DMERR("dump_metadata: Unable to allocate memory"); | |
1362 | + return 1; | |
1363 | + } | |
1364 | + | |
1365 | + where.bdev = dmc->cache_dev->bdev; | |
1366 | + while(index < meta_size) { | |
1367 | + where.sector = dev_size - 1 - meta_size + index; | |
1368 | + where.count = min(meta_size - index, limit); | |
1369 | + | |
1370 | + for (i=to_bytes(index)/sizeof(sector_t), j=0; | |
1371 | + j<to_bytes(where.count)/sizeof(sector_t) && i<dmc->size; | |
1372 | + i++, j++) { | |
1373 | + /* Assume all invalid cache blocks store 0. We lose the block that | |
1374 | + * is actually mapped to offset 0. | |
1375 | + */ | |
1376 | + meta_data[j] = dmc->cache[i].state ? dmc->cache[i].block : 0; | |
1377 | + } | |
1378 | + chksum = csum_partial((char *)meta_data, to_bytes(where.count), chksum); | |
1379 | + | |
1380 | + dm_io_sync_vm(1, &where, WRITE, meta_data, &bits); | |
1381 | + index += where.count; | |
1382 | + } | |
1383 | + | |
1384 | + vfree((void *)meta_data); | |
1385 | + | |
1386 | + meta_dmc = (struct meta_dmc *)vmalloc(512); | |
1387 | + if (!meta_dmc) { | |
1388 | + DMERR("dump_metadata: Unable to allocate memory"); | |
1389 | + return 1; | |
1390 | + } | |
1391 | + | |
1392 | + meta_dmc->block_size = dmc->block_size; | |
1393 | + meta_dmc->size = dmc->size; | |
1394 | + meta_dmc->assoc = dmc->assoc; | |
1395 | + meta_dmc->write_policy = dmc->write_policy; | |
1396 | + meta_dmc->chksum = chksum; | |
1397 | + | |
1398 | + DPRINTK("Store metadata to disk: block size(%u), cache size(%llu), " \ | |
1399 | + "associativity(%u), write policy(%u), checksum(%u)", | |
1400 | + meta_dmc->block_size, (unsigned long long) meta_dmc->size, | |
1401 | + meta_dmc->assoc, meta_dmc->write_policy, | |
1402 | + meta_dmc->chksum); | |
1403 | + | |
1404 | + where.sector = dev_size - 1; | |
1405 | + where.count = 1; | |
1406 | + dm_io_sync_vm(1, &where, WRITE, meta_dmc, &bits); | |
1407 | + | |
1408 | + vfree((void *)meta_dmc); | |
1409 | + | |
1410 | + DMINFO("Cache metadata saved to disk (offset %llu)", | |
1411 | + (unsigned long long) dev_size - 1 - (unsigned long long) meta_size); | |
1412 | + | |
1413 | + return 0; | |
1414 | +} | |
1415 | + | |
1416 | +/* | |
1417 | + * Construct a cache mapping. | |
1418 | + * arg[0]: path to source device | |
1419 | + * arg[1]: path to cache device | |
1420 | + * arg[2]: cache persistence (if set, cache conf is loaded from disk) | |
1421 | + * Cache configuration parameters (if not set, default values are used): | |
1422 | + * arg[3]: cache block size (in sectors) | |
1423 | + * arg[4]: cache size (in blocks) | |
1424 | + * arg[5]: cache associativity | |
1425 | + * arg[6]: write caching policy | |
1426 | + */ | |
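+/*
+ * A hypothetical example of building such a mapping with dmsetup (device
+ * paths are placeholders and the target is assumed to be registered as
+ * "cache"): cache all of /dev/sdb1 on /dev/sdc1 with 8-sector blocks,
+ * 65536 cache blocks, 1024-way associativity and write-back caching:
+ *
+ *   echo 0 `blockdev --getsize /dev/sdb1` cache /dev/sdb1 /dev/sdc1 \
+ *       0 8 65536 1024 1 | dmsetup create cached-disk
+ */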
1427 | +static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |
1428 | +{ | |
1429 | + struct cache_c *dmc; | |
1430 | + unsigned int consecutive_blocks, persistence = 0; | |
1431 | + sector_t localsize, i, order; | |
1432 | + sector_t data_size, meta_size, dev_size; | |
1433 | + unsigned long long cache_size; | |
1434 | + int r = -EINVAL; | |
1435 | + | |
1436 | + if (argc < 2) { | |
1437 | + ti->error = "dm-cache: Need at least 2 arguments (src dev and cache dev)"; | |
1438 | + goto bad; | |
1439 | + } | |
1440 | + | |
1441 | + dmc = kmalloc(sizeof(*dmc), GFP_KERNEL); | |
1442 | + if (dmc == NULL) { | |
1443 | + ti->error = "dm-cache: Failed to allocate cache context"; | |
1444 | + r = -ENOMEM; | |
1445 | + goto bad; | |
1446 | + } | |
1447 | + | |
1448 | + r = dm_get_device(ti, argv[0], 0, ti->len, | |
1449 | + dm_table_get_mode(ti->table), &dmc->src_dev); | |
1450 | + if (r) { | |
1451 | + ti->error = "dm-cache: Source device lookup failed"; | |
1452 | + goto bad1; | |
1453 | + } | |
1454 | + | |
1455 | + r = dm_get_device(ti, argv[1], 0, 0, | |
1456 | + dm_table_get_mode(ti->table), &dmc->cache_dev); | |
1457 | + if (r) { | |
1458 | + ti->error = "dm-cache: Cache device lookup failed"; | |
1459 | + goto bad2; | |
1460 | + } | |
1461 | + | |
1462 | + | |
1463 | + r = kcopyd_client_create(DMCACHE_COPY_PAGES, &dmc->kcp_client); | |
1464 | + if (r) { | |
1465 | + ti->error = "Failed to initialize kcopyd client\n"; | |
1466 | + goto bad3; | |
1467 | + } | |
1468 | + | |
1469 | + r = kcached_init(dmc); | |
1470 | + if (r) { | |
1471 | + ti->error = "Failed to initialize kcached"; | |
1472 | + goto bad4; | |
1473 | + } | |
1474 | + | |
1475 | + if (argc >= 3) { | |
1476 | + if (sscanf(argv[2], "%u", &persistence) != 1) { | |
1477 | + ti->error = "dm-cache: Invalid cache persistence"; | |
1478 | + r = -EINVAL; | |
1479 | + goto bad5; | |
1480 | + } | |
1481 | + } | |
1482 | + if (1 == persistence) { | |
1483 | + if (load_metadata(dmc)) { | |
1484 | + ti->error = "dm-cache: Invalid cache configuration"; | |
1485 | + r = -EINVAL; | |
1486 | + goto bad5; | |
1487 | + } | |
1488 | + goto init; /* Skip reading cache parameters from command line */ | |
1489 | + } else if (persistence != 0) { | |
1490 | + ti->error = "dm-cache: Invalid cache persistence"; | |
1491 | + r = -EINVAL; | |
1492 | + goto bad5; | |
1493 | + } | |
1494 | + | |
1495 | + if (argc >= 4) { | |
1496 | + if (sscanf(argv[3], "%u", &dmc->block_size) != 1) { | |
1497 | + ti->error = "dm-cache: Invalid block size"; | |
1498 | + r = -EINVAL; | |
1499 | + goto bad5; | |
1500 | + } | |
1501 | + if (!dmc->block_size || (dmc->block_size & (dmc->block_size - 1))) { | |
1502 | + ti->error = "dm-cache: Invalid block size"; | |
1503 | + r = -EINVAL; | |
1504 | + goto bad5; | |
1505 | + } | |
1506 | + } else | |
1507 | + dmc->block_size = DEFAULT_BLOCK_SIZE; | |
1508 | + dmc->block_shift = ffs(dmc->block_size) - 1; | |
1509 | + dmc->block_mask = dmc->block_size - 1; | |
1510 | + | |
1511 | + if (argc >= 5) { | |
1512 | + if (sscanf(argv[4], "%llu", &cache_size) != 1) { | |
1513 | + ti->error = "dm-cache: Invalid cache size"; | |
1514 | + r = -EINVAL; | |
1515 | + goto bad5; | |
1516 | + } | |
1517 | + dmc->size = (sector_t) cache_size; | |
1518 | + if (!dmc->size || (dmc->size & (dmc->size - 1))) { | |
1519 | + ti->error = "dm-cache: Invalid cache size"; | |
1520 | + r = -EINVAL; | |
1521 | + goto bad5; | |
1522 | + } | |
1523 | + } else | |
1524 | + dmc->size = DEFAULT_CACHE_SIZE; | |
1525 | + localsize = dmc->size; | |
1526 | + dmc->bits = ffs(dmc->size) - 1; | |
1527 | + | |
1528 | + if (argc >= 6) { | |
1529 | + if (sscanf(argv[5], "%u", &dmc->assoc) != 1) { | |
1530 | + ti->error = "dm-cache: Invalid cache associativity"; | |
1531 | + r = -EINVAL; | |
1532 | + goto bad5; | |
1533 | + } | |
1534 | + if (!dmc->assoc || (dmc->assoc & (dmc->assoc - 1)) || | |
1535 | + dmc->size < dmc->assoc) { | |
1536 | + ti->error = "dm-cache: Invalid cache associativity"; | |
1537 | + r = -EINVAL; | |
1538 | + goto bad5; | |
1539 | + } | |
1540 | + } else | |
1541 | + dmc->assoc = DEFAULT_CACHE_ASSOC; | |
1542 | + | |
1543 | + DMINFO("Cache device size: %lld bytes", dmc->cache_dev->bdev->bd_inode->i_size); | |
1544 | + dev_size = dmc->cache_dev->bdev->bd_inode->i_size >> 9; | |
1545 | + data_size = dmc->size * dmc->block_size; | |
1546 | + meta_size = dm_div_up(dmc->size * sizeof(sector_t), 512) + 1; | |
1547 | + if ((data_size + meta_size) > dev_size) { | |
1548 | + DMERR("Requested cache size exceeds the cache device's capacity " \ | |
1549 | + "(%llu+%llu>%llu)", | |
1550 | + (unsigned long long) data_size, (unsigned long long) meta_size, | |
1551 | + (unsigned long long) dev_size); | |
1552 | + ti->error = "dm-cache: Invalid cache size"; | |
1553 | + r = -EINVAL; | |
1554 | + goto bad5; | |
1555 | + } | |
1556 | + consecutive_blocks = dmc->assoc < CONSECUTIVE_BLOCKS ? | |
1557 | + dmc->assoc : CONSECUTIVE_BLOCKS; | |
1558 | + dmc->consecutive_shift = ffs(consecutive_blocks) - 1; | |
1559 | + | |
1560 | + if (argc >= 7) { | |
1561 | + if (sscanf(argv[6], "%u", &dmc->write_policy) != 1) { | |
1562 | + ti->error = "dm-cache: Invalid cache write policy"; | |
1563 | + r = -EINVAL; | |
1564 | + goto bad5; | |
1565 | + } | |
1566 | + if (dmc->write_policy != 0 && dmc->write_policy != 1) { | |
1567 | + ti->error = "dm-cache: Invalid cache write policy"; | |
1568 | + r = -EINVAL; | |
1569 | + goto bad5; | |
1570 | + } | |
1571 | + } else | |
1572 | + dmc->write_policy = DEFAULT_WRITE_POLICY; | |
1573 | + | |
1574 | + order = dmc->size * sizeof(struct cacheblock); | |
1575 | + localsize = data_size >> 11; | |
1576 | + DMINFO("Allocate %lluKB (%luB per entry) memory for %llu-entry cache " \ | |
1577 | + "(capacity:%lluMB, associativity:%u, block size:%u " \ | |
1578 | + "sectors(%uKB), %s)", | |
1579 | + (unsigned long long) order >> 10, (unsigned long) sizeof(struct cacheblock), | |
1580 | + (unsigned long long) dmc->size, | |
1581 | + (unsigned long long) data_size >> (20-SECTOR_SHIFT), | |
1582 | + dmc->assoc, dmc->block_size, | |
1583 | + dmc->block_size >> (10-SECTOR_SHIFT), | |
1584 | + dmc->write_policy ? "write-back" : "write-through"); | |
1585 | + | |
1586 | + dmc->cache = (struct cacheblock *)vmalloc(order); | |
1587 | + if (!dmc->cache) { | |
1588 | + ti->error = "Unable to allocate memory"; | |
1589 | + r = -ENOMEM; | |
1590 | + goto bad5; | |
1591 | + } | |
1592 | + | |
1593 | +init: /* Initialize the cache structs */ | |
1594 | + for (i = 0; i < dmc->size; i++) { | |
1595 | + bio_list_init(&dmc->cache[i].bios); | |
1596 | + if (!persistence) dmc->cache[i].state = 0; | |
1597 | + dmc->cache[i].counter = 0; | |
1598 | + spin_lock_init(&dmc->cache[i].lock); | |
1599 | + } | |
1600 | + | |
1601 | + dmc->counter = 0; | |
1602 | + dmc->dirty_blocks = 0; | |
1603 | + dmc->reads = 0; | |
1604 | + dmc->writes = 0; | |
1605 | + dmc->cache_hits = 0; | |
1606 | + dmc->replace = 0; | |
1607 | + dmc->writeback = 0; | |
1608 | + dmc->dirty = 0; | |
1609 | + | |
1610 | + ti->split_io = dmc->block_size; | |
1611 | + ti->private = dmc; | |
1612 | + return 0; | |
1613 | + | |
1614 | +bad5: | |
1615 | + kcached_client_destroy(dmc); | |
1616 | +bad4: | |
1617 | + kcopyd_client_destroy(dmc->kcp_client); | |
1618 | +bad3: | |
1619 | + dm_put_device(ti, dmc->cache_dev); | |
1620 | +bad2: | |
1621 | + dm_put_device(ti, dmc->src_dev); | |
1622 | +bad1: | |
1623 | + kfree(dmc); | |
1624 | +bad: | |
1625 | + return r; | |
1626 | +} | |
1627 | + | |
1628 | + | |
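+/*
+ * Write every dirty block back to the source device (called from cache_dtr()
+ * before the cache is torn down).  Runs of dirty cache blocks that map
+ * consecutive source blocks are coalesced into a single write_back() call.
+ */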
1629 | +static void cache_flush(struct cache_c *dmc) | |
1630 | +{ | |
1631 | + struct cacheblock *cache = dmc->cache; | |
1632 | + sector_t i = 0; | |
1633 | + unsigned int j; | |
1634 | + | |
1635 | + DMINFO("Flush dirty blocks (%llu) ...", (unsigned long long) dmc->dirty_blocks); | |
1636 | + while (i < dmc->size) { | |
1637 | + j = 1; | |
1638 | + if (is_state(cache[i].state, DIRTY)) { | |
1639 | + while ((i+j) < dmc->size && is_state(cache[i+j].state, DIRTY) | |
1640 | + && (cache[i+j].block == cache[i].block + j * | |
1641 | + dmc->block_size)) { | |
1642 | + j++; | |
1643 | + } | |
1644 | + dmc->dirty += j; | |
1645 | + write_back(dmc, i, j); | |
1646 | + } | |
1647 | + i += j; | |
1648 | + } | |
1649 | +} | |
1650 | + | |
1651 | +/* | |
1652 | + * Destroy the cache mapping. | |
1653 | + */ | |
1654 | +static void cache_dtr(struct dm_target *ti) | |
1655 | +{ | |
1656 | + struct cache_c *dmc = (struct cache_c *) ti->private; | |
1657 | + | |
1658 | + if (dmc->dirty_blocks > 0) cache_flush(dmc); | |
1659 | + | |
1660 | + kcached_client_destroy(dmc); | |
1661 | + | |
1662 | + kcopyd_client_destroy(dmc->kcp_client); | |
1663 | + | |
1664 | + if (dmc->reads + dmc->writes > 0) | |
1665 | + DMINFO("stats: reads(%lu), writes(%lu), cache hits(%lu, 0.%lu), " \ | |
1666 | + "replacement(%lu), replaced dirty blocks(%lu), " \ | |
1667 | + "flushed dirty blocks(%lu)", | |
1668 | + dmc->reads, dmc->writes, dmc->cache_hits, | |
1669 | + dmc->cache_hits * 100 / (dmc->reads + dmc->writes), | |
1670 | + dmc->replace, dmc->writeback, dmc->dirty); | |
1671 | + | |
1672 | + dump_metadata(dmc); /* Always dump metadata to disk before exit */ | |
1673 | + vfree((void *)dmc->cache); | |
1674 | + | |
1675 | + dm_put_device(ti, dmc->src_dev); | |
1676 | + dm_put_device(ti, dmc->cache_dev); | |
1677 | + kfree(dmc); | |
1678 | +} | |
1679 | + | |
1680 | +/* | |
1681 | + * Report cache status: | |
1682 | + * Output cache stats upon request of device status; | |
1683 | + * Output cache configuration upon request of table status. | |
1684 | + */ | |
1685 | +static int cache_status(struct dm_target *ti, status_type_t type, | |
1686 | + char *result, unsigned int maxlen) | |
1687 | +{ | |
1688 | + struct cache_c *dmc = (struct cache_c *) ti->private; | |
1689 | + int sz = 0; | |
1690 | + | |
1691 | + switch (type) { | |
1692 | + case STATUSTYPE_INFO: | |
1693 | + DMEMIT("stats: reads(%lu), writes(%lu), cache hits(%lu, 0.%lu), " \ | |
1694 | + "replacement(%lu), replaced dirty blocks(%lu)", | |
1695 | + dmc->reads, dmc->writes, dmc->cache_hits, | |
1696 | + dmc->reads + dmc->writes > 0 ? dmc->cache_hits * 100 / (dmc->reads + dmc->writes) : 0, | |
1697 | + dmc->replace, dmc->writeback); | |
1698 | + break; | |
1699 | + case STATUSTYPE_TABLE: | |
1700 | + DMEMIT("conf: capacity(%lluM), associativity(%u), block size(%uK), %s", | |
1701 | + (unsigned long long) dmc->size * dmc->block_size >> 11, | |
1702 | + dmc->assoc, dmc->block_size>>(10-SECTOR_SHIFT), | |
1703 | + dmc->write_policy ? "write-back":"write-through"); | |
1704 | + break; | |
1705 | + } | |
1706 | + return 0; | |
1707 | +} | |
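+
+/*
+ * Illustrative output (editorial example with made-up numbers, assuming the
+ * mapping from the constructor example above; real values will differ):
+ *
+ *   # dmsetup status cached-disk
+ *   0 2097152 cache stats: reads(1000), writes(200), cache hits(450, 0.37),
+ *       replacement(40), replaced dirty blocks(12)
+ *
+ *   # dmsetup table cached-disk
+ *   0 2097152 cache conf: capacity(256M), associativity(1024),
+ *       block size(4K), write-back
+ *
+ * (Each result is printed on a single line; it is wrapped here for width.)
+ */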
1708 | + | |
1709 | + | |
1710 | +/**************************************************************************** | |
1711 | + * Functions for manipulating a cache target. | |
1712 | + ****************************************************************************/ | |
1713 | + | |
1714 | +static struct target_type cache_target = { | |
1715 | + .name = "cache", | |
1716 | + .version = {1, 0, 1}, | |
1717 | + .module = THIS_MODULE, | |
1718 | + .ctr = cache_ctr, | |
1719 | + .dtr = cache_dtr, | |
1720 | + .map = cache_map, | |
1721 | + .status = cache_status, | |
1722 | +}; | |
1723 | + | |
1724 | +/* | |
1725 | + * Initialize the cache target. | |
1726 | + */ | |
1727 | +int __init dm_cache_init(void) | |
1728 | +{ | |
1729 | + int r; | |
1730 | + | |
1731 | + r = jobs_init(); | |
1732 | + if (r) | |
1733 | + return r; | |
1734 | + | |
1735 | + _kcached_wq = create_singlethread_workqueue("kcached"); | |
1736 | + if (!_kcached_wq) { | |
1737 | + DMERR("failed to start kcached"); | |
+ jobs_exit();
1738 | + return -ENOMEM; | |
1739 | + } | |
1740 | + INIT_WORK(&_kcached_work, do_work); | |
1741 | + | |
1742 | + r = dm_register_target(&cache_target); | |
1743 | + if (r < 0) { | |
1744 | + DMERR("cache: register failed %d", r); | |
1745 | + destroy_workqueue(_kcached_wq); | |
+ jobs_exit();
1746 | + } | |
1747 | + | |
1748 | + return r; | |
1749 | +} | |
1750 | + | |
1751 | +/* | |
1752 | + * Destroy a cache target. | |
1753 | + */ | |
1754 | +void dm_cache_exit(void) | |
1755 | +{ | |
1756 | + int r = dm_unregister_target(&cache_target); | |
1757 | + | |
1758 | + if (r < 0) | |
1759 | + DMERR("cache: unregister failed %d", r); | |
1760 | + | |
1761 | + jobs_exit(); | |
1762 | + destroy_workqueue(_kcached_wq); | |
1763 | +} | |
1764 | + | |
1765 | +module_init(dm_cache_init); | |
1766 | +module_exit(dm_cache_exit); | |
1767 | + | |
1768 | +MODULE_DESCRIPTION(DM_NAME " cache target"); | |
1769 | +MODULE_AUTHOR("Ming Zhao <mingzhao99th@gmail.com>"); | |
1770 | +MODULE_LICENSE("GPL"); | |
1771 | diff -Naur linux-2.6.21.7-orig/drivers/md/Kconfig linux-2.6.21.7-dmcache/drivers/md/Kconfig | |
1772 | --- linux-2.6.21.7-orig/drivers/md/Kconfig 2007-08-04 12:11:13.000000000 -0400 | |
1773 | +++ linux-2.6.21.7-dmcache/drivers/md/Kconfig 2007-08-23 14:16:07.000000000 -0400 | |
1774 | @@ -262,6 +262,12 @@ | |
1775 | ---help--- | |
1776 | Multipath support for EMC CX/AX series hardware. | |
1777 | ||
1778 | +config DM_CACHE | |
1779 | + tristate "Cache target support (EXPERIMENTAL)" | |
1780 | + depends on BLK_DEV_DM && EXPERIMENTAL | |
1781 | + ---help--- | |
1782 | + Support for a generic cache target for device-mapper. | |
1783 | + | |
1784 | endmenu | |
1785 | ||
1786 | endif | |
1787 | diff -Naur linux-2.6.21.7-orig/drivers/md/Makefile linux-2.6.21.7-dmcache/drivers/md/Makefile | |
1788 | --- linux-2.6.21.7-orig/drivers/md/Makefile 2007-08-04 12:11:13.000000000 -0400 | |
1789 | +++ linux-2.6.21.7-dmcache/drivers/md/Makefile 2007-08-23 14:16:25.000000000 -0400 | |
1790 | @@ -36,6 +36,7 @@ | |
1791 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | |
1792 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o | |
1793 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | |
1794 | +obj-$(CONFIG_DM_CACHE) += dm-cache.o | |
1795 | ||
1796 | quiet_cmd_unroll = UNROLL $@ | |
1797 | cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \ |