]>
Commit | Line | Data |
---|---|---|
4dd5eeca JR |
1 | diff -urN linux-2.4.22/drivers/md/Config.in linux-2.4.22-evms/drivers/md/Config.in |
2 | --- linux-2.4.22/drivers/md/Config.in 2003-09-15 17:07:45.000000000 +0200 | |
3 | +++ linux-2.4.22-evms/drivers/md/Config.in 2003-09-15 17:09:48.000000000 +0200 | |
4 | @@ -16,5 +16,9 @@ | |
5 | dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD | |
6 | dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD | |
7 | dep_tristate ' Mirror (RAID-1) support' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM | |
8 | +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then | |
9 | + dep_tristate ' Bad Block Relocation Device Target' CONFIG_BLK_DEV_DM_BBR $CONFIG_BLK_DEV_DM | |
10 | + dep_tristate ' Sparse Device Target' CONFIG_BLK_DEV_DM_SPARSE $CONFIG_BLK_DEV_DM | |
11 | +fi | |
12 | ||
13 | endmenu | |
14 | diff -urN linux-2.4.22/drivers/md/Makefile linux-2.4.22-evms/drivers/md/Makefile | |
15 | --- linux-2.4.22/drivers/md/Makefile 2003-09-15 17:07:45.000000000 +0200 | |
16 | +++ linux-2.4.22-evms/drivers/md/Makefile 2003-09-15 17:09:48.000000000 +0200 | |
17 | @@ -30,6 +30,8 @@ | |
18 | ||
19 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | |
20 | obj-$(CONFIG_BLK_DEV_DM_MIRROR) += dm-mirror.o | |
21 | +obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o | |
22 | +obj-$(CONFIG_BLK_DEV_DM_SPARSE) += dm-sparse.o | |
23 | ||
24 | include $(TOPDIR)/Rules.make | |
25 | ||
26 | diff -urN linux-2.4.22/drivers/md/dm-bbr.c linux-2.4.22-evms/drivers/md/dm-bbr.c | |
27 | --- linux-2.4.22/drivers/md/dm-bbr.c 1970-01-01 01:00:00.000000000 +0100 | |
28 | +++ linux-2.4.22-evms/drivers/md/dm-bbr.c 2003-09-15 17:08:42.000000000 +0200 | |
29 | @@ -0,0 +1,1228 @@ | |
30 | +/* | |
31 | + * Copyright (c) International Business Machines Corp., 2002-2003 | |
32 | + * | |
33 | + * This program is free software; you can redistribute it and/or modify | |
34 | + * it under the terms of the GNU General Public License as published by | |
35 | + * the Free Software Foundation; either version 2 of the License, or | |
36 | + * (at your option) any later version. | |
37 | + * | |
38 | + * This program is distributed in the hope that it will be useful, | |
39 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
40 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | |
41 | + * the GNU General Public License for more details. | |
42 | + * | |
43 | + * You should have received a copy of the GNU General Public License | |
44 | + * along with this program; if not, write to the Free Software | |
45 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
46 | + * | |
47 | + * linux/drivers/md/dm-bbr.c | |
48 | + * | |
49 | + * Bad-block-relocation (BBR) target for device-mapper. | |
50 | + * | |
51 | + * The BBR target is designed to remap I/O write failures to another safe | |
52 | + * location on disk. Note that most disk drives have BBR built into them; | |
53 | + * this means that our software BBR will be only activated when all hardware | |
54 | + * BBR replacement sectors have been used. | |
55 | + */ | |
56 | + | |
57 | +#include <linux/kernel.h> | |
58 | +#include <linux/module.h> | |
59 | +#include <linux/init.h> | |
60 | +#include <linux/blkdev.h> | |
61 | +#include <linux/spinlock.h> | |
62 | +#include <linux/smp_lock.h> | |
63 | +#include <linux/slab.h> | |
64 | +#include <linux/mempool.h> | |
65 | +#include "dm.h" | |
66 | +#include "dm-bbr.h" | |
67 | +#include "dm-daemon.h" | |
68 | +#include "dm-io.h" | |
69 | + | |
70 | +/* Number of active BBR devices. */ | |
71 | +static int bbr_instances = 0; | |
72 | +static DECLARE_MUTEX(bbr_instances_lock); | |
73 | + | |
74 | +/* Data pertaining to the I/O thread. */ | |
75 | +static struct dm_daemon * bbr_io_thread = NULL; | |
76 | +static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED; | |
77 | +static LIST_HEAD(bbr_io_list); | |
78 | +static void bbr_io_handler(void); | |
79 | + | |
80 | +/* Global pools for bbr_io_buf's and bbr_remap's. */ | |
81 | +static kmem_cache_t * bbr_io_buf_cache; | |
82 | +static mempool_t * bbr_io_buf_pool; | |
83 | +static kmem_cache_t * bbr_remap_cache; | |
84 | +static mempool_t * bbr_remap_pool; | |
85 | + | |
86 | +static void bbr_free_remap(struct bbr_private * bbr_id); | |
87 | + | |
88 | +/** | |
89 | + * destroy_pools | |
90 | + * | |
91 | + * Delete the pools for the remap list and I/O anchors. | |
92 | + **/ | |
93 | +static void destroy_pools(void) | |
94 | +{ | |
95 | + if (bbr_io_buf_pool) { | |
96 | + mempool_destroy(bbr_io_buf_pool); | |
97 | + bbr_io_buf_pool = NULL; | |
98 | + } | |
99 | + if (bbr_io_buf_cache) { | |
100 | + kmem_cache_destroy(bbr_io_buf_cache); | |
101 | + bbr_io_buf_cache = NULL; | |
102 | + } | |
103 | + if (bbr_remap_pool) { | |
104 | + mempool_destroy(bbr_remap_pool); | |
105 | + bbr_remap_pool = NULL; | |
106 | + } | |
107 | + if (bbr_remap_cache) { | |
108 | + kmem_cache_destroy(bbr_remap_cache); | |
109 | + bbr_remap_cache = NULL; | |
110 | + } | |
111 | +} | |
112 | + | |
113 | +/** | |
114 | + * create_pools | |
115 | + * | |
116 | + * Create mempools for the remap list and I/O anchors. | |
117 | + **/ | |
118 | +static int create_pools(void) | |
119 | +{ | |
120 | + if (!bbr_remap_cache) { | |
121 | + bbr_remap_cache = kmem_cache_create("BBR_Remap_Cache", | |
122 | + sizeof(struct bbr_runtime_remap), | |
123 | + 0, SLAB_HWCACHE_ALIGN, | |
124 | + NULL, NULL); | |
125 | + if (!bbr_remap_cache) { | |
126 | + DMERR("Unable to create BBR remap cache."); | |
127 | + goto out; | |
128 | + } | |
129 | + } | |
130 | + if (!bbr_remap_pool) { | |
131 | + bbr_remap_pool = mempool_create(64, mempool_alloc_slab, | |
132 | + mempool_free_slab, | |
133 | + bbr_remap_cache); | |
134 | + if (!bbr_remap_pool) { | |
135 | + DMERR("Unable to create BBR remap mempool."); | |
136 | + goto out; | |
137 | + } | |
138 | + } | |
139 | + | |
140 | + if (!bbr_io_buf_cache) { | |
141 | + bbr_io_buf_cache = kmem_cache_create("BBR_IO_Buf_Cache", | |
142 | + sizeof(struct bbr_io_buffer), | |
143 | + 0, SLAB_HWCACHE_ALIGN, | |
144 | + NULL, NULL); | |
145 | + if (!bbr_io_buf_cache) { | |
146 | + DMERR("Unable to create BBR I/O buffer cache."); | |
147 | + goto out; | |
148 | + } | |
149 | + } | |
150 | + if (!bbr_io_buf_pool) { | |
151 | + bbr_io_buf_pool = mempool_create(256, mempool_alloc_slab, | |
152 | + mempool_free_slab, | |
153 | + bbr_io_buf_cache); | |
154 | + if (!bbr_io_buf_pool) { | |
155 | + DMERR("Unable to create BBR I/O buffer mempool."); | |
156 | + goto out; | |
157 | + } | |
158 | + } | |
159 | + | |
160 | +out: | |
161 | + if (!bbr_remap_cache || !bbr_remap_pool || | |
162 | + !bbr_io_buf_cache || !bbr_io_buf_pool ) { | |
163 | + destroy_pools(); | |
164 | + return -ENOMEM; | |
165 | + } | |
166 | + | |
167 | + return 0; | |
168 | +} | |
169 | + | |
170 | +/** | |
171 | + * stop_io_thread | |
172 | + * | |
173 | + * Use the dm-daemon services to stop the BBR I/O thread. | |
174 | + **/ | |
175 | +static void stop_io_thread(void) | |
176 | +{ | |
177 | + if (bbr_io_thread) { | |
178 | + dm_daemon_stop(bbr_io_thread); | |
179 | + kfree(bbr_io_thread); | |
180 | + bbr_io_thread = NULL; | |
181 | + } | |
182 | +} | |
183 | + | |
184 | +/** | |
185 | + * start_io_thread | |
186 | + * | |
187 | + * Use the dm-daemon services to start the BBR I/O thread. | |
188 | + **/ | |
189 | +static int start_io_thread(void) | |
190 | +{ | |
191 | + int rc; | |
192 | + | |
193 | + if (!bbr_io_thread) { | |
194 | + bbr_io_thread = kmalloc(sizeof(*bbr_io_thread), GFP_KERNEL); | |
195 | + if (!bbr_io_thread) { | |
196 | + return -ENOMEM; | |
197 | + } | |
198 | + | |
199 | + rc = dm_daemon_start(bbr_io_thread, "bbr_io", bbr_io_handler); | |
200 | + if (rc) { | |
201 | + kfree(bbr_io_thread); | |
202 | + return rc; | |
203 | + } | |
204 | + } | |
205 | + | |
206 | + return 0; | |
207 | +} | |
208 | + | |
209 | +/** | |
210 | + * bbr_global_init | |
211 | + * | |
212 | + * Set up the mempools, I/O thread, and sync-I/O service. This should | |
213 | + * be called only when the first bbr device is created. | |
214 | + **/ | |
215 | +static int bbr_global_init(void) | |
216 | +{ | |
217 | + int rc; | |
218 | + | |
219 | + rc = create_pools(); | |
220 | + if (rc) { | |
221 | + goto out; | |
222 | + } | |
223 | + | |
224 | + rc = start_io_thread(); | |
225 | + if (rc) { | |
226 | + destroy_pools(); | |
227 | + goto out; | |
228 | + } | |
229 | + | |
230 | + rc = dm_io_get(1); | |
231 | + if (rc) { | |
232 | + destroy_pools(); | |
233 | + stop_io_thread(); | |
234 | + goto out; | |
235 | + } | |
236 | + | |
237 | +out: | |
238 | + return rc; | |
239 | +} | |
240 | + | |
241 | +/** | |
242 | + * bbr_global_cleanup | |
243 | + * | |
244 | + * Cleanup the mempools, I/O thread and sync-I/O service. This should | |
245 | + * be called only when the last bbr device is removed. | |
246 | + **/ | |
247 | +static void bbr_global_cleanup(void) | |
248 | +{ | |
249 | + destroy_pools(); | |
250 | + stop_io_thread(); | |
251 | + dm_io_put(1); | |
252 | +} | |
253 | + | |
254 | +static struct bbr_private * bbr_alloc_private(void) | |
255 | +{ | |
256 | + struct bbr_private * bbr_id; | |
257 | + | |
258 | + bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL); | |
259 | + if (bbr_id) { | |
260 | + memset(bbr_id, 0, sizeof(*bbr_id)); | |
261 | + bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0); | |
262 | + bbr_id->bbr_id_lock = SPIN_LOCK_UNLOCKED; | |
263 | + } | |
264 | + | |
265 | + return bbr_id; | |
266 | +} | |
267 | + | |
268 | +static void bbr_free_private(struct bbr_private * bbr_id) | |
269 | +{ | |
270 | + if (bbr_id->bbr_table) { | |
271 | + kfree(bbr_id->bbr_table); | |
272 | + } | |
273 | + bbr_free_remap(bbr_id); | |
274 | + kfree(bbr_id); | |
275 | +} | |
276 | + | |
277 | +static u32 crc_table[256]; | |
278 | +static u32 crc_table_built = 0; | |
279 | + | |
280 | +static void build_crc_table(void) | |
281 | +{ | |
282 | + u32 i, j, crc; | |
283 | + | |
284 | + for (i = 0; i <= 255; i++) { | |
285 | + crc = i; | |
286 | + for (j = 8; j > 0; j--) { | |
287 | + if (crc & 1) | |
288 | + crc = (crc >> 1) ^ CRC_POLYNOMIAL; | |
289 | + else | |
290 | + crc >>= 1; | |
291 | + } | |
292 | + crc_table[i] = crc; | |
293 | + } | |
294 | + crc_table_built = 1; | |
295 | +} | |
296 | + | |
297 | +static u32 calculate_crc(u32 crc, void * buffer, u32 buffersize) | |
298 | +{ | |
299 | + unsigned char * current_byte; | |
300 | + u32 temp1, temp2, i; | |
301 | + | |
302 | + current_byte = (unsigned char *) buffer; | |
303 | + /* Make sure the crc table is available */ | |
304 | + if (!crc_table_built) | |
305 | + build_crc_table(); | |
306 | + /* Process each byte in the buffer. */ | |
307 | + for (i = 0; i < buffersize; i++) { | |
308 | + temp1 = (crc >> 8) & 0x00FFFFFF; | |
309 | + temp2 = crc_table[(crc ^ (u32) * current_byte) & | |
310 | + (u32) 0xff]; | |
311 | + current_byte++; | |
312 | + crc = temp1 ^ temp2; | |
313 | + } | |
314 | + return crc; | |
315 | +} | |
316 | + | |
317 | +/** | |
318 | + * le_bbr_table_sector_to_cpu | |
319 | + * | |
320 | + * Convert bbr meta data from on-disk (LE) format | |
321 | + * to the native cpu endian format. | |
322 | + **/ | |
323 | +static void le_bbr_table_sector_to_cpu(struct bbr_table * p) | |
324 | +{ | |
325 | + int i; | |
326 | + p->signature = le32_to_cpup(&p->signature); | |
327 | + p->crc = le32_to_cpup(&p->crc); | |
328 | + p->sequence_number = le32_to_cpup(&p->sequence_number); | |
329 | + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt); | |
330 | + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) { | |
331 | + p->entries[i].bad_sect = | |
332 | + le64_to_cpup(&p->entries[i].bad_sect); | |
333 | + p->entries[i].replacement_sect = | |
334 | + le64_to_cpup(&p->entries[i].replacement_sect); | |
335 | + } | |
336 | +} | |
337 | + | |
338 | +/** | |
339 | + * cpu_bbr_table_sector_to_le | |
340 | + * | |
341 | + * Convert bbr meta data from cpu endian format to on-disk (LE) format | |
342 | + **/ | |
343 | +static void cpu_bbr_table_sector_to_le(struct bbr_table * p, | |
344 | + struct bbr_table * le) | |
345 | +{ | |
346 | + int i; | |
347 | + le->signature = cpu_to_le32p(&p->signature); | |
348 | + le->crc = cpu_to_le32p(&p->crc); | |
349 | + le->sequence_number = cpu_to_le32p(&p->sequence_number); | |
350 | + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt); | |
351 | + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) { | |
352 | + le->entries[i].bad_sect = | |
353 | + cpu_to_le64p(&p->entries[i].bad_sect); | |
354 | + le->entries[i].replacement_sect = | |
355 | + cpu_to_le64p(&p->entries[i].replacement_sect); | |
356 | + } | |
357 | +} | |
358 | + | |
359 | +/** | |
360 | + * validate_bbr_table_sector | |
361 | + * | |
362 | + * Check the specified BBR table sector for a valid signature and CRC. If it's | |
363 | + * valid, endian-convert the table sector. | |
364 | + **/ | |
365 | +static int validate_bbr_table_sector(struct bbr_table * p) | |
366 | +{ | |
367 | + int rc = 0; | |
368 | + int org_crc, final_crc; | |
369 | + | |
370 | + if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) { | |
371 | + DMERR("BBR table signature doesn't match!"); | |
372 | + DMERR("Found 0x%x. Expecting 0x%x", | |
373 | + le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE); | |
374 | + rc = -EINVAL; | |
375 | + goto out; | |
376 | + } | |
377 | + | |
378 | + if (!p->crc) { | |
379 | + DMERR("BBR table sector has no CRC!"); | |
380 | + rc = -EINVAL; | |
381 | + goto out; | |
382 | + } | |
383 | + | |
384 | + org_crc = le32_to_cpup(&p->crc); | |
385 | + p->crc = 0; | |
386 | + final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p)); | |
387 | + if (final_crc != org_crc) { | |
388 | + DMERR("CRC failed!"); | |
389 | + DMERR("Found 0x%x. Expecting 0x%x", | |
390 | + org_crc, final_crc); | |
391 | + rc = -EINVAL; | |
392 | + goto out; | |
393 | + } | |
394 | + | |
395 | + p->crc = cpu_to_le32p(&org_crc); | |
396 | + le_bbr_table_sector_to_cpu(p); | |
397 | + | |
398 | +out: | |
399 | + return rc; | |
400 | +} | |
401 | + | |
402 | +/** | |
403 | + * bbr_binary_tree_insert | |
404 | + * | |
405 | + * Insert a node into the binary tree. | |
406 | + **/ | |
407 | +static void bbr_binary_tree_insert(struct bbr_runtime_remap ** root, | |
408 | + struct bbr_runtime_remap * newnode) | |
409 | +{ | |
410 | + struct bbr_runtime_remap ** node = root; | |
411 | + while (node && *node) { | |
412 | + if (newnode->remap.bad_sect > (*node)->remap.bad_sect) { | |
413 | + node = &((*node)->right); | |
414 | + } else { | |
415 | + node = &((*node)->left); | |
416 | + } | |
417 | + } | |
418 | + | |
419 | + newnode->left = newnode->right = NULL; | |
420 | + *node = newnode; | |
421 | +} | |
422 | + | |
423 | +/** | |
424 | + * bbr_binary_search | |
425 | + * | |
426 | + * Search for a node that contains bad_sect == lsn. | |
427 | + **/ | |
428 | +static struct bbr_runtime_remap * bbr_binary_search( | |
429 | + struct bbr_runtime_remap * root, | |
430 | + u64 lsn) | |
431 | +{ | |
432 | + struct bbr_runtime_remap * node = root; | |
433 | + while (node) { | |
434 | + if (node->remap.bad_sect == lsn) { | |
435 | + break; | |
436 | + } | |
437 | + if (lsn > node->remap.bad_sect) { | |
438 | + node = node->right; | |
439 | + } else { | |
440 | + node = node->left; | |
441 | + } | |
442 | + } | |
443 | + return node; | |
444 | +} | |
445 | + | |
446 | +/** | |
447 | + * bbr_binary_tree_destroy | |
448 | + * | |
449 | + * Destroy the binary tree. | |
450 | + **/ | |
451 | +static void bbr_binary_tree_destroy(struct bbr_runtime_remap * root, | |
452 | + struct bbr_private * bbr_id) | |
453 | +{ | |
454 | + struct bbr_runtime_remap ** link = NULL; | |
455 | + struct bbr_runtime_remap * node = root; | |
456 | + | |
457 | + while (node) { | |
458 | + if (node->left) { | |
459 | + link = &(node->left); | |
460 | + node = node->left; | |
461 | + continue; | |
462 | + } | |
463 | + if (node->right) { | |
464 | + link = &(node->right); | |
465 | + node = node->right; | |
466 | + continue; | |
467 | + } | |
468 | + | |
469 | + mempool_free(node, bbr_remap_pool); | |
470 | + if (node == root) { | |
471 | + /* If root is deleted, we're done. */ | |
472 | + break; | |
473 | + } | |
474 | + | |
475 | + /* Back to root. */ | |
476 | + node = root; | |
477 | + *link = NULL; | |
478 | + } | |
479 | +} | |
480 | + | |
481 | +static void bbr_free_remap(struct bbr_private * bbr_id) | |
482 | +{ | |
483 | + spin_lock_irq(&bbr_id->bbr_id_lock); | |
484 | + bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id); | |
485 | + bbr_id->remap_root = NULL; | |
486 | + spin_unlock_irq(&bbr_id->bbr_id_lock); | |
487 | +} | |
488 | + | |
489 | +/** | |
490 | + * bbr_insert_remap_entry | |
491 | + * | |
492 | + * Create a new remap entry and add it to the binary tree for this node. | |
493 | + **/ | |
494 | +static int bbr_insert_remap_entry(struct bbr_private * bbr_id, | |
495 | + struct bbr_table_entry * new_bbr_entry) | |
496 | +{ | |
497 | + struct bbr_runtime_remap * newnode; | |
498 | + | |
499 | + newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO); | |
500 | + if (!newnode) { | |
501 | + DMERR("Could not allocate from remap mempool!"); | |
502 | + return -ENOMEM; | |
503 | + } | |
504 | + newnode->remap.bad_sect = new_bbr_entry->bad_sect; | |
505 | + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect; | |
506 | + spin_lock_irq(&bbr_id->bbr_id_lock); | |
507 | + bbr_binary_tree_insert(&bbr_id->remap_root, newnode); | |
508 | + spin_unlock_irq(&bbr_id->bbr_id_lock); | |
509 | + return 0; | |
510 | +} | |
511 | + | |
512 | +/** | |
513 | + * bbr_table_to_remap_list | |
514 | + * | |
515 | + * The on-disk bbr table is sorted by the replacement sector LBA. In order to | |
516 | + * improve run time performance, the in memory remap list must be sorted by | |
517 | + * the bad sector LBA. This function is called at discovery time to initialize | |
518 | + * the remap list. This function assumes that at least one copy of meta data | |
519 | + * is valid. | |
520 | + **/ | |
521 | +static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id) | |
522 | +{ | |
523 | + u32 in_use_blks = 0; | |
524 | + int i, j; | |
525 | + struct bbr_table * p; | |
526 | + | |
527 | + | |
528 | + for (i = 0, p = bbr_id->bbr_table; | |
529 | + i < bbr_id->nr_sects_bbr_table; | |
530 | + i++, p++ ) { | |
531 | + if (!p->in_use_cnt) { | |
532 | + break; | |
533 | + } | |
534 | + in_use_blks += p->in_use_cnt; | |
535 | + for (j = 0; j < p->in_use_cnt; j++) { | |
536 | + bbr_insert_remap_entry(bbr_id, &p->entries[j]); | |
537 | + } | |
538 | + } | |
539 | + if (in_use_blks) | |
540 | + DMWARN("There are %u BBR entries for device %u:%u", | |
541 | + in_use_blks, MAJOR(bbr_id->dev->dev), | |
542 | + MINOR(bbr_id->dev->dev)); | |
543 | + | |
544 | + return in_use_blks; | |
545 | +} | |
546 | + | |
547 | +/** | |
548 | + * bbr_search_remap_entry | |
549 | + * | |
550 | + * Search remap entry for the specified sector. If found, return a pointer to | |
551 | + * the table entry. Otherwise, return NULL. | |
552 | + **/ | |
553 | +static struct bbr_table_entry * bbr_search_remap_entry( | |
554 | + struct bbr_private * bbr_id, | |
555 | + u64 lsn) | |
556 | +{ | |
557 | + struct bbr_runtime_remap * p; | |
558 | + | |
559 | + spin_lock_irq(&bbr_id->bbr_id_lock); | |
560 | + p = bbr_binary_search(bbr_id->remap_root, lsn); | |
561 | + spin_unlock_irq(&bbr_id->bbr_id_lock); | |
562 | + if (p) { | |
563 | + return (&p->remap); | |
564 | + } else { | |
565 | + return NULL; | |
566 | + } | |
567 | +} | |
568 | + | |
569 | +/** | |
570 | + * bbr_remap | |
571 | + * | |
572 | + * If *lsn is in the remap table, return TRUE and modify *lsn, | |
573 | + * else, return FALSE. | |
574 | + **/ | |
575 | +static inline int bbr_remap(struct bbr_private * bbr_id, | |
576 | + u64 * lsn) | |
577 | +{ | |
578 | + struct bbr_table_entry * e; | |
579 | + | |
580 | + if (atomic_read(&bbr_id->in_use_replacement_blks)) { | |
581 | + e = bbr_search_remap_entry(bbr_id, *lsn); | |
582 | + if (e) { | |
583 | + *lsn = e->replacement_sect; | |
584 | + return 1; | |
585 | + } | |
586 | + } | |
587 | + return 0; | |
588 | +} | |
589 | + | |
590 | +/** | |
591 | + * bbr_remap_probe | |
592 | + * | |
593 | + * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap | |
594 | + * table, return TRUE; else, return FALSE. | |
595 | + **/ | |
596 | +static inline int bbr_remap_probe(struct bbr_private * bbr_id, | |
597 | + u64 lsn, u64 nr_sects) | |
598 | +{ | |
599 | + u64 tmp, cnt; | |
600 | + | |
601 | + if (atomic_read(&bbr_id->in_use_replacement_blks)) { | |
602 | + for (cnt = 0, tmp = lsn; | |
603 | + cnt < nr_sects; | |
604 | + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) { | |
605 | + if (bbr_remap(bbr_id,&tmp)) { | |
606 | + return 1; | |
607 | + } | |
608 | + } | |
609 | + } | |
610 | + return 0; | |
611 | +} | |
612 | + | |
613 | +/** | |
614 | + * bbr_setup | |
615 | + * | |
616 | + * Read the remap tables from disk and set up the initial remap tree. | |
617 | + **/ | |
618 | +static int bbr_setup(struct bbr_private * bbr_id) | |
619 | +{ | |
620 | + struct bbr_table * table = bbr_id->bbr_table; | |
621 | + struct page * page; | |
622 | + struct io_region job; | |
623 | + unsigned int error, offset; | |
624 | + int i, rc = 0; | |
625 | + | |
626 | + job.dev = bbr_id->dev->dev; | |
627 | + job.count = 1; | |
628 | + | |
629 | + /* Read and verify each BBR table sector individually. */ | |
630 | + for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) { | |
631 | + job.sector = bbr_id->lba_table1 + i; | |
632 | + page = virt_to_page(table); | |
633 | + offset = (unsigned long)table & ~PAGE_MASK; | |
634 | + rc = dm_io_sync(1, &job, READ, page, offset, &error); | |
635 | + if (rc && bbr_id->lba_table2) { | |
636 | + job.sector = bbr_id->lba_table2 + i; | |
637 | + rc = dm_io_sync(1, &job, READ, page, offset, &error); | |
638 | + } | |
639 | + if (rc) { | |
640 | + goto out; | |
641 | + } | |
642 | + | |
643 | + rc = validate_bbr_table_sector(table); | |
644 | + if (rc) { | |
645 | + goto out; | |
646 | + } | |
647 | + } | |
648 | + atomic_set(&bbr_id->in_use_replacement_blks, | |
649 | + bbr_table_to_remap_list(bbr_id)); | |
650 | + | |
651 | +out: | |
652 | + if (rc) { | |
653 | + DMERR("dm-bbr: error during device setup: %d", rc); | |
654 | + } | |
655 | + return rc; | |
656 | +} | |
657 | + | |
658 | +static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id, | |
659 | + struct buffer_head * bh, | |
660 | + int rw) | |
661 | +{ | |
662 | + struct bbr_io_buffer * bbr_io_buf; | |
663 | + | |
664 | + bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO); | |
665 | + if (bbr_io_buf) { | |
666 | + memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer)); | |
667 | + INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list); | |
668 | + bbr_io_buf->bbr_id = bbr_id; | |
669 | + bbr_io_buf->sector = bh->b_rsector; | |
670 | + bbr_io_buf->bh = bh; | |
671 | + bbr_io_buf->rw = rw; | |
672 | + } else { | |
673 | + DMWARN("Could not allocate from BBR I/O buffer pool!"); | |
674 | + } | |
675 | + return bbr_io_buf; | |
676 | +} | |
677 | + | |
678 | +static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf) | |
679 | +{ | |
680 | + mempool_free(bbr_io_buf, bbr_io_buf_pool); | |
681 | +} | |
682 | + | |
683 | +/** | |
684 | + * bbr_io_remap_error | |
685 | + * @bbr_id: Private data for the BBR node. | |
686 | + * @rw: READ or WRITE. | |
687 | + * @starting_lsn: Starting sector of request to remap. | |
688 | + * @count: Number of sectors in the request. | |
689 | + * @buffer: Data buffer for the request. | |
690 | + * | |
691 | + * For the requested range, try to write each sector individually. For each | |
692 | + * sector that fails, find the next available remap location and write the | |
693 | + * data to that new location. Then update the table and write both copies | |
694 | + * of the table to disk. Finally, update the in-memory mapping and do any | |
695 | + * other necessary bookkeeping. | |
696 | + **/ | |
697 | +static int bbr_io_remap_error(struct bbr_private * bbr_id, | |
698 | + int rw, | |
699 | + u64 starting_lsn, | |
700 | + u64 count, | |
701 | + char * buffer) | |
702 | +{ | |
703 | + struct bbr_table * bbr_table; | |
704 | + struct io_region job; | |
705 | + struct page * page; | |
706 | + unsigned long table_sector_index; | |
707 | + unsigned long table_sector_offset; | |
708 | + unsigned long index; | |
709 | + unsigned int offset_in_page, error; | |
710 | + u64 lsn, new_lsn; | |
711 | + int rc; | |
712 | + | |
713 | + if (rw == READ) { | |
714 | + /* Nothing can be done about read errors. */ | |
715 | + return -EIO; | |
716 | + } | |
717 | + | |
718 | + job.dev = bbr_id->dev->dev; | |
719 | + | |
720 | + /* For each sector in the request. */ | |
721 | + for (lsn = 0; lsn < count; lsn++, buffer += SECTOR_SIZE) { | |
722 | + job.sector = starting_lsn + lsn; | |
723 | + job.count = 1; | |
724 | + page = virt_to_page(buffer); | |
725 | + offset_in_page = (unsigned long)buffer & ~PAGE_MASK; | |
726 | + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error); | |
727 | + while (rc) { | |
728 | + /* Find the next available relocation sector. */ | |
729 | + new_lsn = atomic_read(&bbr_id->in_use_replacement_blks); | |
730 | + if (new_lsn >= bbr_id->nr_replacement_blks) { | |
731 | + /* No more replacement sectors available. */ | |
732 | + return -EIO; | |
733 | + } | |
734 | + new_lsn += bbr_id->start_replacement_sect; | |
735 | + | |
736 | + /* Write the data to its new location. */ | |
737 | + DMWARN("dm-bbr: device %u:%u: Trying to remap bad sector "PFU64" to sector "PFU64, | |
738 | + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev), | |
739 | + starting_lsn + lsn, new_lsn); | |
740 | + job.sector = new_lsn; | |
741 | + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error); | |
742 | + if (rc) { | |
743 | + /* This replacement sector is bad. | |
744 | + * Try the next one. | |
745 | + */ | |
746 | + DMERR("dm-bbr: device %u:%u: replacement sector "PFU64" is bad. Skipping.", | |
747 | + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev), new_lsn); | |
748 | + atomic_inc(&bbr_id->in_use_replacement_blks); | |
749 | + continue; | |
750 | + } | |
751 | + | |
752 | + /* Add this new entry to the on-disk table. */ | |
753 | + table_sector_index = new_lsn - | |
754 | + bbr_id->start_replacement_sect; | |
755 | + table_sector_offset = table_sector_index / | |
756 | + BBR_ENTRIES_PER_SECT; | |
757 | + index = table_sector_index % BBR_ENTRIES_PER_SECT; | |
758 | + | |
759 | + bbr_table = &bbr_id->bbr_table[table_sector_offset]; | |
760 | + bbr_table->entries[index].bad_sect = starting_lsn + lsn; | |
761 | + bbr_table->entries[index].replacement_sect = new_lsn; | |
762 | + bbr_table->in_use_cnt++; | |
763 | + bbr_table->sequence_number++; | |
764 | + bbr_table->crc = 0; | |
765 | + bbr_table->crc = calculate_crc(INITIAL_CRC, | |
766 | + bbr_table, | |
767 | + sizeof(struct bbr_table)); | |
768 | + | |
769 | + /* Write the table to disk. */ | |
770 | + cpu_bbr_table_sector_to_le(bbr_table, bbr_table); | |
771 | + page = virt_to_page(bbr_table); | |
772 | + offset_in_page = (unsigned long)bbr_table & ~PAGE_MASK; | |
773 | + if (bbr_id->lba_table1) { | |
774 | + job.sector = bbr_id->lba_table1 + table_sector_offset; | |
775 | + job.count = 1; | |
776 | + rc = dm_io_sync(1, &job, WRITE, page, offset_in_page, &error); | |
777 | + } | |
778 | + if (bbr_id->lba_table2) { | |
779 | + job.sector = bbr_id->lba_table2 + table_sector_offset; | |
780 | + rc |= dm_io_sync(1, &job, WRITE, page, offset_in_page, &error); | |
781 | + } | |
782 | + le_bbr_table_sector_to_cpu(bbr_table); | |
783 | + | |
784 | + if (rc) { | |
785 | + /* Error writing one of the tables to disk. */ | |
786 | + DMERR("dm-bbr: device %u:%u: error updating BBR tables on disk.", | |
787 | + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev)); | |
788 | + return rc; | |
789 | + } | |
790 | + | |
791 | + /* Insert a new entry in the remapping binary-tree. */ | |
792 | + rc = bbr_insert_remap_entry(bbr_id, | |
793 | + &bbr_table->entries[index]); | |
794 | + if (rc) { | |
795 | + DMERR("dm-bbr: device %u:%u: error adding new entry to remap tree.", | |
796 | + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev)); | |
797 | + return rc; | |
798 | + } | |
799 | + | |
800 | + atomic_inc(&bbr_id->in_use_replacement_blks); | |
801 | + } | |
802 | + } | |
803 | + | |
804 | + return 0; | |
805 | +} | |
806 | + | |
807 | +/** | |
808 | + * bbr_io_process_request | |
809 | + * | |
810 | + * For each sector in this request, check if the sector has already | |
811 | + * been remapped. If so, process all previous sectors in the request, | |
812 | + * followed by the remapped sector. Then reset the starting lsn and | |
813 | + * count, and keep going with the rest of the request as if it were | |
814 | + * a whole new request. If any of the sync_io's return an error, | |
815 | + * call the remapper to relocate the bad sector(s). | |
816 | + **/ | |
817 | +static int bbr_io_process_request(struct bbr_io_buffer * bbr_io_buf) | |
818 | +{ | |
819 | + struct bbr_private * bbr_id = bbr_io_buf->bbr_id; | |
820 | + struct io_region job; | |
821 | + u64 starting_lsn = bbr_io_buf->sector; | |
822 | + u64 count = bbr_io_buf->bh->b_size >> SECTOR_SHIFT; | |
823 | + u64 lsn, remapped_lsn; | |
824 | + char * buffer = bbr_io_buf->bh->b_data; | |
825 | + struct page * page = virt_to_page(buffer); | |
826 | + unsigned int offset_in_page = (unsigned long)buffer & ~PAGE_MASK; | |
827 | + unsigned int error; | |
828 | + int rw = bbr_io_buf->rw; | |
829 | + int rc = 0; | |
830 | + | |
831 | + job.dev = bbr_id->dev->dev; | |
832 | + | |
833 | + /* For each sector in this request, check if this sector has already | |
834 | + * been remapped. If so, process all previous sectors in this request, | |
835 | + * followed by the remapped sector. Then reset the starting lsn and | |
836 | + * count and keep going with the rest of the request as if it were | |
837 | + * a whole new request. | |
838 | + */ | |
839 | + for (lsn = 0; lsn < count; lsn++) { | |
840 | + remapped_lsn = starting_lsn + lsn; | |
841 | + rc = bbr_remap(bbr_id, &remapped_lsn); | |
842 | + if (!rc) { | |
843 | + /* This sector is fine. */ | |
844 | + continue; | |
845 | + } | |
846 | + | |
847 | + /* Process all sectors in the request up to this one. */ | |
848 | + if (lsn > 0) { | |
849 | + job.sector = starting_lsn; | |
850 | + job.count = lsn; | |
851 | + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error); | |
852 | + if (rc) { | |
853 | + /* If this I/O failed, then one of the sectors | |
854 | + * in this request needs to be relocated. | |
855 | + */ | |
856 | + rc = bbr_io_remap_error(bbr_id, bbr_io_buf->rw, starting_lsn, | |
857 | + lsn, buffer); | |
858 | + if (rc) { | |
859 | + return rc; | |
860 | + } | |
861 | + } | |
862 | + buffer += (lsn << SECTOR_SHIFT); | |
863 | + page = virt_to_page(buffer); | |
864 | + offset_in_page = (unsigned long)buffer & ~PAGE_MASK; | |
865 | + } | |
866 | + | |
867 | + /* Process the remapped sector. */ | |
868 | + job.sector = remapped_lsn; | |
869 | + job.count = 1; | |
870 | + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error); | |
871 | + if (rc) { | |
872 | + /* BUGBUG - Need more processing if this caused | |
873 | + * an error. If this I/O failed, then the existing | |
874 | + * remap is now bad, and we need to find a new remap. | |
875 | + * Can't use bbr_io_remap_error(), because the existing | |
876 | + * map entry needs to be changed, not added again, and | |
877 | + * the original table entry also needs to be changed. | |
878 | + */ | |
879 | + return rc; | |
880 | + } | |
881 | + | |
882 | + buffer += SECTOR_SIZE; | |
883 | + starting_lsn += (lsn + 1); | |
884 | + count -= (lsn + 1); | |
885 | + lsn = -1; | |
886 | + page = virt_to_page(buffer); | |
887 | + offset_in_page = (unsigned long)buffer & ~PAGE_MASK; | |
888 | + } | |
889 | + | |
890 | + /* Check for any remaining sectors after the last split. This could | |
891 | + * potentially be the whole request, but that should be a rare case | |
892 | + * because requests should only be processed by the thread if we know | |
893 | + * an error occurred or they contained one or more remapped sectors. | |
894 | + */ | |
895 | + if (count) { | |
896 | + job.sector = starting_lsn; | |
897 | + job.count = count; | |
898 | + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error); | |
899 | + if (rc) { | |
900 | + /* If this I/O failed, then one of the sectors in this | |
901 | + * request needs to be relocated. | |
902 | + */ | |
903 | + rc = bbr_io_remap_error(bbr_id, bbr_io_buf->rw, starting_lsn, | |
904 | + count, buffer); | |
905 | + if (rc) { | |
906 | + return rc; | |
907 | + } | |
908 | + } | |
909 | + } | |
910 | + | |
911 | + return 0; | |
912 | +} | |
913 | + | |
914 | +/** | |
915 | + * bbr_io_handler | |
916 | + * | |
917 | + * This is the handler for the bbr_io_thread. It continuously loops, | |
918 | + * taking I/O requests off its list and processing them. If nothing | |
919 | + * is on the list, the thread goes back to sleep until specifically | |
920 | + * woken up. | |
921 | + * | |
922 | + * I/O requests should only be sent to this thread if we know that: | |
923 | + * a) the request contains at least one remapped sector. | |
924 | + * or | |
925 | + * b) the request caused an error on the normal I/O path. | |
926 | + * This function uses synchronous I/O, so sending a request to this | |
927 | + * thread that doesn't need special processing will cause severe | |
928 | + * performance degredation. | |
929 | + **/ | |
930 | +static void bbr_io_handler(void) | |
931 | +{ | |
932 | + struct bbr_io_buffer * bbr_io_buf; | |
933 | + struct buffer_head * bh; | |
934 | + unsigned long flags; | |
935 | + int rc; | |
936 | + | |
937 | + while (1) { | |
938 | + /* Process bbr_io_list, one entry at a time. */ | |
939 | + spin_lock_irqsave(&bbr_io_list_lock, flags); | |
940 | + if (list_empty(&bbr_io_list)) { | |
941 | + /* No more items on the list. */ | |
942 | + spin_unlock_irqrestore(&bbr_io_list_lock, flags); | |
943 | + break; | |
944 | + } | |
945 | + bbr_io_buf = list_entry(bbr_io_list.next, | |
946 | + struct bbr_io_buffer, bbr_io_list); | |
947 | + list_del_init(&bbr_io_buf->bbr_io_list); | |
948 | + spin_unlock_irqrestore(&bbr_io_list_lock, flags); | |
949 | + | |
950 | + rc = bbr_io_process_request(bbr_io_buf); | |
951 | + | |
952 | + /* Clean up and complete the original I/O. */ | |
953 | + bbr_io_buf->flags |= BBR_IO_HANDLED; | |
954 | + bh = bbr_io_buf->bh; | |
955 | + if (bh->b_end_io) { | |
956 | + /* If this was the bbr_io_buf for an error on the | |
957 | + * normal WRITE, don't free it here. It will be | |
958 | + * freed later in bbr_callback() | |
959 | + */ | |
960 | + if (!(bbr_io_buf->flags & BBR_IO_RELOCATE)) | |
961 | + free_bbr_io_buf(bbr_io_buf); | |
962 | + bh->b_end_io(bh, rc ? 0 : 1); | |
963 | + } | |
964 | + } | |
965 | +} | |
966 | + | |
967 | +/** | |
968 | + * bbr_schedule_io | |
969 | + * | |
970 | + * Place the specified bbr_io_buf on the thread's processing list. | |
971 | + **/ | |
972 | +static void bbr_schedule_io(struct bbr_io_buffer * bbr_io_buf) | |
973 | +{ | |
974 | + unsigned long flags; | |
975 | + spin_lock_irqsave(&bbr_io_list_lock, flags); | |
976 | + list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list); | |
977 | + spin_unlock_irqrestore(&bbr_io_list_lock, flags); | |
978 | + dm_daemon_wake(bbr_io_thread); | |
979 | +} | |
980 | + | |
981 | +/** | |
982 | + * bbr_read | |
983 | + * | |
984 | + * If there are any remapped sectors on this object, send this request over | |
985 | + * to the thread for processing. Otherwise send it down the stack normally. | |
986 | + **/ | |
987 | +static int bbr_read(struct bbr_private * bbr_id, | |
988 | + struct buffer_head * bh) | |
989 | +{ | |
990 | + struct bbr_io_buffer * bbr_io_buf; | |
991 | + | |
992 | + | |
993 | + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 || | |
994 | + !bbr_remap_probe(bbr_id, bh->b_rsector, | |
995 | + bh->b_size >> SECTOR_SHIFT)) { | |
996 | + /* No existing remaps or this request doesn't | |
997 | + * contain any remapped sectors. | |
998 | + */ | |
999 | + bh->b_rdev = bbr_id->dev->dev; | |
1000 | + return 1; | |
1001 | + } | |
1002 | + | |
1003 | + /* This request has at least one remapped sector. */ | |
1004 | + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ); | |
1005 | + if (!bbr_io_buf) { | |
1006 | + /* Can't get memory to track the I/O. */ | |
1007 | + bh->b_end_io(bh, 0); | |
1008 | + return -ENOMEM; | |
1009 | + } | |
1010 | + | |
1011 | + bbr_schedule_io(bbr_io_buf); | |
1012 | + return 0; | |
1013 | +} | |
1014 | + | |
1015 | +/** | |
1016 | + * bbr_callback | |
1017 | + * | |
1018 | + * This is the callback for normal write requests. Check for an error | |
1019 | + * during the I/O, and send to the thread for processing if necessary. | |
1020 | + **/ | |
1021 | +static int bbr_callback(struct dm_target * ti, | |
1022 | + struct buffer_head * bh, | |
1023 | + int rw, | |
1024 | + int error, | |
1025 | + union map_info * map_context) | |
1026 | +{ | |
1027 | + struct bbr_io_buffer * bbr_io_buf = (struct bbr_io_buffer *) map_context->ptr; | |
1028 | + | |
1029 | + if (!bbr_io_buf) | |
1030 | + return error; | |
1031 | + | |
1032 | + /* Will try to relocate the WRITE if: | |
1033 | + * - It is an error, and | |
1034 | + * - It is not an error of BBR relocation, and | |
1035 | + */ | |
1036 | + if (error && !(bbr_io_buf->flags & BBR_IO_HANDLED)) { | |
1037 | + DMERR("dm-bbr: device %u:%u: Write failure on sector %lu. Scheduling for retry.", | |
1038 | + MAJOR(bh->b_rdev), MINOR(bh->b_rdev), | |
1039 | + (unsigned long)bbr_io_buf->sector); | |
1040 | + /* Indicate this bbr_io_buf is for an error on normal WRITE */ | |
1041 | + bbr_io_buf->flags |= BBR_IO_RELOCATE; | |
1042 | + bbr_schedule_io(bbr_io_buf); | |
1043 | + /* Returns >0 so that DM will let us retry the I/O */ | |
1044 | + return 1; | |
1045 | + } | |
1046 | + | |
1047 | + free_bbr_io_buf(bbr_io_buf); | |
1048 | + return error; | |
1049 | +} | |
1050 | + | |
1051 | +/** | |
1052 | + * bbr_write | |
1053 | + * | |
1054 | + * If there are any remapped sectors on this object, send the request over | |
1055 | + * to the thread for processing. Otherwise, register for callback | |
1056 | + * notification, and send the request down normally. | |
1057 | + **/ | |
1058 | +static int bbr_write(struct bbr_private * bbr_id, | |
1059 | + struct buffer_head * bh, | |
1060 | + union map_info * map_context) | |
1061 | +{ | |
1062 | + struct bbr_io_buffer * bbr_io_buf; | |
1063 | + | |
1064 | + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE); | |
1065 | + if (!bbr_io_buf) { | |
1066 | + /* Can't get memory to track the I/O. */ | |
1067 | + bh->b_end_io(bh, 0); | |
1068 | + return -ENOMEM; | |
1069 | + } | |
1070 | + | |
1071 | + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 || | |
1072 | + !bbr_remap_probe(bbr_id, bh->b_rsector, | |
1073 | + bh->b_size >> SECTOR_SHIFT)) { | |
1074 | + /* No existing remaps or this request | |
1075 | + * contains no remapped sectors. | |
1076 | + */ | |
1077 | + bh->b_rdev = bbr_id->dev->dev; | |
1078 | + map_context->ptr = bbr_io_buf; | |
1079 | + return 1; | |
1080 | + } else { | |
1081 | + /* This request contains at least one remapped sector. */ | |
1082 | + map_context->ptr = NULL; | |
1083 | + bbr_schedule_io(bbr_io_buf); | |
1084 | + } | |
1085 | + return 0; | |
1086 | +} | |
1087 | + | |
1088 | +/** | |
1089 | + * Construct a bbr mapping | |
1090 | + **/ | |
1091 | +static int bbr_ctr(struct dm_target * ti, unsigned int argc, char ** argv) | |
1092 | +{ | |
1093 | + struct bbr_private * bbr_id; | |
1094 | + u32 block_size; | |
1095 | + char * end; | |
1096 | + int rc = -EINVAL; | |
1097 | + | |
1098 | + if (argc != 8) { | |
1099 | + ti->error = "dm-bbr requires exactly 8 arguments: " | |
1100 | + "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size"; | |
1101 | + goto out1; | |
1102 | + } | |
1103 | + | |
1104 | + bbr_id = bbr_alloc_private(); | |
1105 | + if (!bbr_id) { | |
1106 | + ti->error = "dm-bbr: Error allocating bbr private data."; | |
1107 | + goto out1; | |
1108 | + } | |
1109 | + | |
1110 | + bbr_id->offset = simple_strtoull(argv[1], &end, 10); | |
1111 | + bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10); | |
1112 | + bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10); | |
1113 | + bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10); | |
1114 | + bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10); | |
1115 | + bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10); | |
1116 | + block_size = simple_strtoul(argv[7], &end, 10); | |
1117 | + bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT); | |
1118 | + | |
1119 | + bbr_id->bbr_table = kmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT, | |
1120 | + GFP_KERNEL); | |
1121 | + if (!bbr_id->bbr_table) { | |
1122 | + ti->error = "dm-bbr: Error allocating bbr table."; | |
1123 | + goto out2; | |
1124 | + } | |
1125 | + | |
1126 | + if (dm_get_device(ti, argv[0], 0, ti->len, | |
1127 | + dm_table_get_mode(ti->table), &bbr_id->dev)) { | |
1128 | + ti->error = "dm-bbr: Device lookup failed"; | |
1129 | + goto out2; | |
1130 | + } | |
1131 | + | |
1132 | + /* Using a semaphore here is probably overkill, | |
1133 | + * but at least it will be correct. | |
1134 | + */ | |
1135 | + down(&bbr_instances_lock); | |
1136 | + if (bbr_instances == 0) { | |
1137 | + rc = bbr_global_init(); | |
1138 | + if (rc) { | |
1139 | + up(&bbr_instances_lock); | |
1140 | + goto out3; | |
1141 | + } | |
1142 | + } | |
1143 | + bbr_instances++; | |
1144 | + up(&bbr_instances_lock); | |
1145 | + | |
1146 | + rc = bbr_setup(bbr_id); | |
1147 | + if (rc) { | |
1148 | + ti->error = "dm-bbr: Device setup failed"; | |
1149 | + goto out4; | |
1150 | + } | |
1151 | + | |
1152 | + ti->private = bbr_id; | |
1153 | + return 0; | |
1154 | + | |
1155 | +out4: | |
1156 | + down(&bbr_instances_lock); | |
1157 | + bbr_instances--; | |
1158 | + if (bbr_instances == 0) { | |
1159 | + bbr_global_cleanup(); | |
1160 | + } | |
1161 | + up(&bbr_instances_lock); | |
1162 | + | |
1163 | +out3: | |
1164 | + dm_put_device(ti, bbr_id->dev); | |
1165 | +out2: | |
1166 | + bbr_free_private(bbr_id); | |
1167 | +out1: | |
1168 | + return rc; | |
1169 | +} | |
1170 | + | |
1171 | +static void bbr_dtr(struct dm_target * ti) | |
1172 | +{ | |
1173 | + struct bbr_private * bbr_id = (struct bbr_private *) ti->private; | |
1174 | + | |
1175 | + dm_put_device(ti, bbr_id->dev); | |
1176 | + bbr_free_private(bbr_id); | |
1177 | + | |
1178 | + down(&bbr_instances_lock); | |
1179 | + bbr_instances--; | |
1180 | + if (bbr_instances == 0) { | |
1181 | + bbr_global_cleanup(); | |
1182 | + } | |
1183 | + up(&bbr_instances_lock); | |
1184 | +} | |
1185 | + | |
1186 | +static int bbr_map(struct dm_target * ti, struct buffer_head * bh, int rw, | |
1187 | + union map_info * map_context) | |
1188 | +{ | |
1189 | + struct bbr_private * bbr_id = (struct bbr_private *) ti->private; | |
1190 | + | |
1191 | + bh->b_rsector += bbr_id->offset; | |
1192 | + switch (rw) { | |
1193 | + case READ: | |
1194 | + case READA: | |
1195 | + map_context->ptr = NULL; | |
1196 | + return bbr_read(bbr_id, bh); | |
1197 | + case WRITE: | |
1198 | + return bbr_write(bbr_id, bh, map_context); | |
1199 | + default: | |
1200 | + return -EIO; | |
1201 | + } | |
1202 | +} | |
1203 | + | |
1204 | +static int bbr_status(struct dm_target * ti, status_type_t type, | |
1205 | + char * result, unsigned int maxlen) | |
1206 | +{ | |
1207 | + struct bbr_private * bbr_id = (struct bbr_private *) ti->private; | |
1208 | + | |
1209 | + switch (type) { | |
1210 | + case STATUSTYPE_INFO: | |
1211 | + result[0] = '\0'; | |
1212 | + break; | |
1213 | + | |
1214 | + case STATUSTYPE_TABLE: | |
1215 | + snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u", | |
1216 | + dm_kdevname(bbr_id->dev->dev), bbr_id->offset, | |
1217 | + bbr_id->lba_table1, bbr_id->lba_table2, | |
1218 | + bbr_id->nr_sects_bbr_table, | |
1219 | + bbr_id->start_replacement_sect, | |
1220 | + bbr_id->nr_replacement_blks, | |
1221 | + bbr_id->blksize_in_sects << SECTOR_SHIFT); | |
1222 | + break; | |
1223 | + } | |
1224 | + return 0; | |
1225 | +} | |
1226 | + | |
1227 | +static struct target_type bbr_target = { | |
1228 | + name: "bbr", | |
1229 | + module: THIS_MODULE, | |
1230 | + ctr: bbr_ctr, | |
1231 | + dtr: bbr_dtr, | |
1232 | + map: bbr_map, | |
1233 | + end_io: bbr_callback, | |
1234 | + status: bbr_status, | |
1235 | +}; | |
1236 | + | |
1237 | +int __init dm_bbr_init(void) | |
1238 | +{ | |
1239 | + int r = dm_register_target(&bbr_target); | |
1240 | + | |
1241 | + if (r < 0) | |
1242 | + DMERR("dm-bbr: register failed %d", r); | |
1243 | + | |
1244 | + return r; | |
1245 | +} | |
1246 | + | |
1247 | +void __exit dm_bbr_exit(void) | |
1248 | +{ | |
1249 | + int r = dm_unregister_target(&bbr_target); | |
1250 | + | |
1251 | + if (r < 0) | |
1252 | + DMERR("dm-bbr: unregister failed %d", r); | |
1253 | +} | |
1254 | + | |
1255 | +module_init(dm_bbr_init); | |
1256 | +module_exit(dm_bbr_exit); | |
1257 | +MODULE_LICENSE("GPL"); | |
1258 | diff -urN linux-2.4.22/drivers/md/dm-bbr.h linux-2.4.22-evms/drivers/md/dm-bbr.h | |
1259 | --- linux-2.4.22/drivers/md/dm-bbr.h 1970-01-01 01:00:00.000000000 +0100 | |
1260 | +++ linux-2.4.22-evms/drivers/md/dm-bbr.h 2003-09-15 17:08:42.000000000 +0200 | |
1261 | @@ -0,0 +1,148 @@ | |
1262 | +/* | |
1263 | + * Copyright (c) International Business Machines Corp., 2002-2003 | |
1264 | + * | |
1265 | + * This program is free software; you can redistribute it and/or modify | |
1266 | + * it under the terms of the GNU General Public License as published by | |
1267 | + * the Free Software Foundation; either version 2 of the License, or | |
1268 | + * (at your option) any later version. | |
1269 | + * | |
1270 | + * This program is distributed in the hope that it will be useful, | |
1271 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
1272 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | |
1273 | + * the GNU General Public License for more details. | |
1274 | + * | |
1275 | + * You should have received a copy of the GNU General Public License | |
1276 | + * along with this program; if not, write to the Free Software | |
1277 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
1278 | + * | |
1279 | + * linux/drivers/md/dm-bbr.h | |
1280 | + * | |
1281 | + * Bad-block-relocation (BBR) target for device-mapper. | |
1282 | + * | |
1283 | + * The BBR target is designed to remap I/O write failures to another safe | |
1284 | + * location on disk. Note that most disk drives have BBR built into them, | |
1285 | + * this means that our software BBR will be only activated when all hardware | |
1286 | + * BBR replacement sectors have been used. | |
1287 | + */ | |
1288 | + | |
1289 | +#ifndef _DM_BBR_H_ | |
1290 | +#define _DM_BBR_H_ | |
1291 | + | |
1292 | +#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */ | |
1293 | +#define BBR_ENTRIES_PER_SECT 31 | |
1294 | +#define BBR_NR_BUFS 128 | |
1295 | +#define INITIAL_CRC 0xFFFFFFFF | |
1296 | +#define CRC_POLYNOMIAL 0xEDB88320L | |
1297 | + | |
1298 | +/** | |
1299 | + * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines. | |
1300 | + * Use these in place of %Ld, %Lu, and %Lx. | |
1301 | + **/ | |
1302 | +#if BITS_PER_LONG > 32 | |
1303 | +#define PFU64 "%lu" | |
1304 | +#else | |
1305 | +#define PFU64 "%Lu" | |
1306 | +#endif | |
1307 | + | |
1308 | +/** | |
1309 | + * struct bbr_table_entry | |
1310 | + * @bad_sect: LBA of bad location. | |
1311 | + * @replacement_sect: LBA of new location. | |
1312 | + * | |
1313 | + * Structure to describe one BBR remap. | |
1314 | + **/ | |
1315 | +struct bbr_table_entry { | |
1316 | + u64 bad_sect; | |
1317 | + u64 replacement_sect; | |
1318 | +}; | |
1319 | + | |
1320 | +/** | |
1321 | + * struct bbr_table | |
1322 | + * @signature: Signature on each BBR table sector. | |
1323 | + * @crc: CRC for this table sector. | |
1324 | + * @sequence_number: Used to resolve conflicts when primary and secondary | |
1325 | + * tables do not match. | |
1326 | + * @in_use_cnt: Number of in-use table entries. | |
1327 | + * @entries: Actual table of remaps. | |
1328 | + * | |
1329 | + * Structure to describe each sector of the metadata table. Each sector in this | |
1330 | + * table can describe 31 remapped sectors. | |
1331 | + **/ | |
1332 | +struct bbr_table { | |
1333 | + u32 signature; | |
1334 | + u32 crc; | |
1335 | + u32 sequence_number; | |
1336 | + u32 in_use_cnt; | |
1337 | + struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT]; | |
1338 | +}; | |
1339 | + | |
1340 | +/** | |
1341 | + * struct bbr_runtime_remap | |
1342 | + * | |
1343 | + * Node in the binary tree used to keep track of remaps. | |
1344 | + **/ | |
1345 | +struct bbr_runtime_remap { | |
1346 | + struct bbr_table_entry remap; | |
1347 | + struct bbr_runtime_remap *left; | |
1348 | + struct bbr_runtime_remap *right; | |
1349 | +}; | |
1350 | + | |
1351 | +/** | |
1352 | + * struct bbr_private | |
1353 | + * @dev: Info about underlying device. | |
1354 | + * @bbr_table: Copy of metadata table. | |
1355 | + * @offset: LBA of data area. | |
1356 | + * @lba_table1: LBA of primary BBR table. | |
1357 | + * @lba_table2: LBA of secondary BBR table. | |
1358 | + * @nr_sects_bbr_table: Size of each BBR table. | |
1359 | + * @nr_replacement_blks: Number of replacement blocks. | |
1360 | + * @start_replacement_sect: LBA of start of replacement blocks. | |
1361 | + * @blksize_in_sects: Size of each block. | |
1362 | + * @in_use_replacement_blks: Current number of remapped blocks. | |
1363 | + * @remap_root: Binary tree containing all remaps. | |
1364 | + * @bbr_id_lock: Lock for the binary tree. | |
1365 | + * | |
1366 | + * Private data for each BBR target. | |
1367 | + **/ | |
1368 | +struct bbr_private { | |
1369 | + struct dm_dev * dev; | |
1370 | + struct bbr_table * bbr_table; | |
1371 | + struct bbr_runtime_remap * remap_root; | |
1372 | + u64 offset; | |
1373 | + u64 lba_table1; | |
1374 | + u64 lba_table2; | |
1375 | + u64 nr_sects_bbr_table; | |
1376 | + u64 start_replacement_sect; | |
1377 | + u64 nr_replacement_blks; | |
1378 | + u32 blksize_in_sects; | |
1379 | + atomic_t in_use_replacement_blks; | |
1380 | + spinlock_t bbr_id_lock; | |
1381 | +}; | |
1382 | + | |
1383 | +#define BBR_IO_HANDLED (1<<0) | |
1384 | +#define BBR_IO_RELOCATE (1<<1) | |
1385 | + | |
1386 | +/** | |
1387 | + * struct bbr_io_buffer | |
1388 | + * @bbr_io_list: Thread's list of bbr_io_buf's. | |
1389 | + * @bbr_id: Object for this request. | |
1390 | + * @bh: Original buffer_head. | |
1391 | + * @sector: Original sector | |
1392 | + * @flags: Operation flag (BBR_IO_*) | |
1393 | + * @rw: READ or WRITE. | |
1394 | + * @rc: Return code from bbr_io_handler. | |
1395 | + * | |
1396 | + * Structure used to track each write request. | |
1397 | + **/ | |
1398 | +struct bbr_io_buffer { | |
1399 | + struct list_head bbr_io_list; | |
1400 | + struct bbr_private *bbr_id; | |
1401 | + struct buffer_head *bh; | |
1402 | + u64 sector; | |
1403 | + u32 flags; | |
1404 | + s32 rw; | |
1405 | + s32 rc; | |
1406 | +}; | |
1407 | + | |
1408 | +#endif | |
1409 | + | |
1410 | diff -urN linux-2.4.22/drivers/md/dm-snapshot.c linux-2.4.22-evms/drivers/md/dm-snapshot.c | |
1411 | --- linux-2.4.22/drivers/md/dm-snapshot.c 2003-09-15 17:07:45.000000000 +0200 | |
1412 | +++ linux-2.4.22-evms/drivers/md/dm-snapshot.c 2003-09-15 17:08:35.000000000 +0200 | |
1413 | @@ -92,6 +92,9 @@ | |
1414 | ||
1415 | /* List of snapshots for this origin */ | |
1416 | struct list_head snapshots; | |
1417 | + | |
1418 | + /* Count of snapshots and origins referrencing this structure. */ | |
1419 | + unsigned int count; | |
1420 | }; | |
1421 | ||
1422 | /* | |
1423 | @@ -155,6 +158,35 @@ | |
1424 | } | |
1425 | ||
1426 | /* | |
1427 | + * Allocate and initialize an origin structure. | |
1428 | + */ | |
1429 | +static struct origin * __alloc_origin(kdev_t dev) | |
1430 | +{ | |
1431 | + struct origin *o = kmalloc(sizeof(*o), GFP_KERNEL); | |
1432 | + if (o) { | |
1433 | + o->dev = dev; | |
1434 | + INIT_LIST_HEAD(&o->hash_list); | |
1435 | + INIT_LIST_HEAD(&o->snapshots); | |
1436 | + __insert_origin(o); | |
1437 | + } | |
1438 | + return o; | |
1439 | +} | |
1440 | + | |
1441 | +static void __get_origin(struct origin *o) | |
1442 | +{ | |
1443 | + o->count++; | |
1444 | +} | |
1445 | + | |
1446 | +static void __put_origin(struct origin *o) | |
1447 | +{ | |
1448 | + o->count--; | |
1449 | + if (o->count == 0) { | |
1450 | + list_del(&o->hash_list); | |
1451 | + kfree(o); | |
1452 | + } | |
1453 | +} | |
1454 | + | |
1455 | +/* | |
1456 | * Make a note of the snapshot and its origin so we can look it | |
1457 | * up when the origin has a write on it. | |
1458 | */ | |
1459 | @@ -168,20 +200,37 @@ | |
1460 | ||
1461 | if (!o) { | |
1462 | /* New origin */ | |
1463 | - o = kmalloc(sizeof(*o), GFP_KERNEL); | |
1464 | + o = __alloc_origin(dev); | |
1465 | if (!o) { | |
1466 | up_write(&_origins_lock); | |
1467 | return -ENOMEM; | |
1468 | } | |
1469 | + } | |
1470 | ||
1471 | - /* Initialise the struct */ | |
1472 | - INIT_LIST_HEAD(&o->snapshots); | |
1473 | - o->dev = dev; | |
1474 | + __get_origin(o); | |
1475 | + list_add_tail(&snap->list, &o->snapshots); | |
1476 | ||
1477 | - __insert_origin(o); | |
1478 | + up_write(&_origins_lock); | |
1479 | + return 0; | |
1480 | +} | |
1481 | + | |
1482 | +static int register_origin(kdev_t dev) | |
1483 | +{ | |
1484 | + struct origin *o; | |
1485 | + | |
1486 | + down_write(&_origins_lock); | |
1487 | + o = __lookup_origin(dev); | |
1488 | + | |
1489 | + if (!o) { | |
1490 | + /* New origin */ | |
1491 | + o = __alloc_origin(dev); | |
1492 | + if (!o) { | |
1493 | + up_write(&_origins_lock); | |
1494 | + return -ENOMEM; | |
1495 | + } | |
1496 | } | |
1497 | ||
1498 | - list_add_tail(&snap->list, &o->snapshots); | |
1499 | + __get_origin(o); | |
1500 | ||
1501 | up_write(&_origins_lock); | |
1502 | return 0; | |
1503 | @@ -195,11 +244,18 @@ | |
1504 | o = __lookup_origin(s->origin->dev); | |
1505 | ||
1506 | list_del(&s->list); | |
1507 | - if (list_empty(&o->snapshots)) { | |
1508 | - list_del(&o->hash_list); | |
1509 | - kfree(o); | |
1510 | - } | |
1511 | + __put_origin(o); | |
1512 | + | |
1513 | + up_write(&_origins_lock); | |
1514 | +} | |
1515 | + | |
1516 | +static void unregister_origin(kdev_t dev) | |
1517 | +{ | |
1518 | + struct origin *o; | |
1519 | ||
1520 | + down_write(&_origins_lock); | |
1521 | + o = __lookup_origin(dev); | |
1522 | + __put_origin(o); | |
1523 | up_write(&_origins_lock); | |
1524 | } | |
1525 | ||
1526 | @@ -1090,6 +1146,13 @@ | |
1527 | return r; | |
1528 | } | |
1529 | ||
1530 | + r = register_origin(dev->dev); | |
1531 | + if (r) { | |
1532 | + ti->error = "Cannot register origin"; | |
1533 | + dm_put_device(ti, dev); | |
1534 | + return r; | |
1535 | + } | |
1536 | + | |
1537 | ti->private = dev; | |
1538 | return 0; | |
1539 | } | |
1540 | @@ -1097,6 +1160,7 @@ | |
1541 | static void origin_dtr(struct dm_target *ti) | |
1542 | { | |
1543 | struct dm_dev *dev = (struct dm_dev *) ti->private; | |
1544 | + unregister_origin(dev->dev); | |
1545 | dm_put_device(ti, dev); | |
1546 | } | |
1547 | ||
1548 | diff -urN linux-2.4.22/drivers/md/dm-sparse.c linux-2.4.22-evms/drivers/md/dm-sparse.c | |
1549 | --- linux-2.4.22/drivers/md/dm-sparse.c 1970-01-01 01:00:00.000000000 +0100 | |
1550 | +++ linux-2.4.22-evms/drivers/md/dm-sparse.c 2003-09-15 17:09:48.000000000 +0200 | |
1551 | @@ -0,0 +1,713 @@ | |
1552 | +/* -*- linux-c -*- */ | |
1553 | + | |
1554 | +/* | |
1555 | + * Copyright (c) International Business Machines Corp., 2002 | |
1556 | + * | |
1557 | + * This program is free software; you can redistribute it and/or modify | |
1558 | + * it under the terms of the GNU General Public License as published by | |
1559 | + * the Free Software Foundation; either version 2 of the License, or | |
1560 | + * (at your option) any later version. | |
1561 | + * | |
1562 | + * This program is distributed in the hope that it will be useful, | |
1563 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
1564 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | |
1565 | + * the GNU General Public License for more details. | |
1566 | + * | |
1567 | + * You should have received a copy of the GNU General Public License | |
1568 | + * along with this program; if not, write to the Free Software | |
1569 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
1570 | + * | |
1571 | + * linux/drivers/md/dm-sparse.c | |
1572 | + * | |
1573 | + * Sparse target for device-mapper. | |
1574 | + * | |
1575 | + * This target provides the ability to create a sparse device. This | |
1576 | + * allows a device to pretend to be larger than it really is. | |
1577 | + */ | |
1578 | + | |
1579 | +#include <linux/module.h> | |
1580 | +#include <linux/init.h> | |
1581 | +#include <linux/blkdev.h> | |
1582 | +#include <linux/slab.h> | |
1583 | +#include <linux/mempool.h> | |
1584 | +#include <linux/vmalloc.h> | |
1585 | + | |
1586 | +#include "dm.h" | |
1587 | +#include "dm-io.h" | |
1588 | + | |
1589 | +#define MAX_HASH_CHAIN_ENTRIES 10 | |
1590 | +#define NAME_SIZE 127 | |
1591 | + | |
1592 | +/* Sparse Ioctl | |
1593 | + device | |
1594 | + start | |
1595 | + chunk_size | |
1596 | + chunks | |
1597 | + */ | |
1598 | + | |
1599 | +// Entries in the sparse remapping structure | |
1600 | +struct sparse_hash_entry { | |
1601 | + u64 org_chunk; // Chunk number, not LBA. | |
1602 | + u64 sparse_chunk; // Chunk number, not LBA. | |
1603 | + struct sparse_hash_entry * next; | |
1604 | + struct sparse_hash_entry * prev; | |
1605 | +}; | |
1606 | + | |
1607 | +//Private data structure | |
1608 | +struct sparse_volume { | |
1609 | + struct dm_dev *dev; | |
1610 | + struct rw_semaphore sparse_semaphore; | |
1611 | + struct sparse_hash_entry ** sparse_map; // Hash table of remappings | |
1612 | + struct sparse_hash_entry * free_hash_list; | |
1613 | + kmem_cache_t * hash_slab; | |
1614 | + mempool_t * hash_pool; | |
1615 | + u32 dm_io_flag; | |
1616 | + u32 chunk_size; // Sectors. | |
1617 | + u32 chunk_shift; // Shift value for chunk size. | |
1618 | + u32 num_chunks; // In this volume. | |
1619 | + u32 next_cow_entry; // Index into current COW table. | |
1620 | + u64 current_cow_sector; // LOGICAL sector of current COW table. | |
1621 | + u32 next_free_chunk; // Index of next free chunk (not LBA!). | |
1622 | + u32 hash_table_size; // Size of the hash table for the remap. | |
1623 | + u64 start; | |
1624 | + u64 cow_table[64]; // One sector's worth of COW tables. | |
1625 | +}; | |
1626 | + | |
1627 | +/*************************** OLD SERVICES ****************************/ | |
1628 | + | |
1629 | +/* computes log base 2 of value */ | |
1630 | +inline int log2(u32 value) //ok to change to u32? | |
1631 | +{ | |
1632 | + int result = -1; | |
1633 | + long tmp; //ok to change to long? | |
1634 | + | |
1635 | + if (value) { | |
1636 | + tmp = value; | |
1637 | + result++; | |
1638 | + while (!(tmp & 1)) { | |
1639 | + result++; | |
1640 | + tmp >>= 1; | |
1641 | + } | |
1642 | + if (tmp != 1) { | |
1643 | + result = -2; | |
1644 | + } | |
1645 | + } | |
1646 | + return result; | |
1647 | +} | |
1648 | + | |
1649 | +/********************************* Functions *********************************/ | |
1650 | + | |
1651 | +/***************************** Hash Functions *****************************/ | |
1652 | + | |
1653 | +/* Take and initialize from the free hash list */ | |
1654 | +static struct sparse_hash_entry * | |
1655 | +allocate_sparse_hash_entry( struct sparse_volume * volume, | |
1656 | + u64 org_chunk, | |
1657 | + u64 sparse_chunk ) | |
1658 | +{ | |
1659 | + struct sparse_hash_entry * hash_entry; | |
1660 | + | |
1661 | + hash_entry = volume->free_hash_list; | |
1662 | + if ( hash_entry ) { //should always be the case b/c preallocate these | |
1663 | + volume->free_hash_list = hash_entry->next; | |
1664 | + hash_entry->org_chunk = org_chunk; | |
1665 | + hash_entry->sparse_chunk = sparse_chunk; | |
1666 | + hash_entry->next = NULL; | |
1667 | + hash_entry->prev = NULL; | |
1668 | + } | |
1669 | + | |
1670 | + return hash_entry; | |
1671 | +} | |
1672 | + | |
1673 | +/* | |
1674 | + * This function inserts a new entry into a sparse hash chain, immediately | |
1675 | + * following the specified entry. This function should not be used to add | |
1676 | + * an entry into an empty list, or as the first entry in an existing list. | |
1677 | + * For that case, use insert_sparse_map_entry_at_head(). | |
1678 | + */ | |
1679 | +static int insert_sparse_hash_entry( struct sparse_hash_entry * entry, | |
1680 | + struct sparse_hash_entry * base ) | |
1681 | +{ | |
1682 | + entry->next = base->next; | |
1683 | + entry->prev = base; | |
1684 | + base->next = entry; | |
1685 | + if ( entry->next ) { | |
1686 | + entry->next->prev = entry; | |
1687 | + } | |
1688 | + return 0; | |
1689 | +} | |
1690 | + | |
1691 | +/* | |
1692 | + * This function inserts a new entry into a sparse chain as the first | |
1693 | + * entry in the chain. | |
1694 | + */ | |
1695 | +static int insert_sparse_hash_entry_at_head( struct sparse_hash_entry * entry, | |
1696 | + struct sparse_hash_entry ** head ) | |
1697 | +{ | |
1698 | + entry->next = *head; | |
1699 | + entry->prev = NULL; | |
1700 | + *head = entry; | |
1701 | + if ( entry->next ) { | |
1702 | + entry->next->prev = entry; | |
1703 | + } | |
1704 | + return 0; | |
1705 | +} | |
1706 | + | |
1707 | +/* | |
1708 | + * Delete all items in a single chain in the hash table. | |
1709 | + */ | |
1710 | +static int delete_sparse_hash_chain( struct sparse_volume * vol, | |
1711 | + struct sparse_hash_entry * head ) | |
1712 | +{ | |
1713 | + struct sparse_hash_entry * next; | |
1714 | + | |
1715 | + while ( head ) { | |
1716 | + next = head->next; | |
1717 | + mempool_free( head, vol->hash_pool ); | |
1718 | + head = next; | |
1719 | + } | |
1720 | + return 0; | |
1721 | +} | |
1722 | + | |
1723 | +/* | |
1724 | + * This function will search the hash chain that is anchored at the | |
1725 | + * specified head pointer. If the chunk number is found, a pointer to that | |
1726 | + * entry in the chain is set, and a 1 is returned. If the chunk is not | |
1727 | + * found, a pointer to the previous entry is set and 0 is returned. If the | |
1728 | + * return pointer is NULL, this means either the list is empty, or the | |
1729 | + * specified sector should become the first list item. | |
1730 | + */ | |
1731 | +static int search_sparse_hash_chain( u64 chunk, | |
1732 | + struct sparse_hash_entry * head, | |
1733 | + struct sparse_hash_entry ** result ) | |
1734 | +{ | |
1735 | + struct sparse_hash_entry * curr = head; | |
1736 | + struct sparse_hash_entry * prev = head; | |
1737 | + while ( curr && curr->org_chunk < chunk ) { | |
1738 | + prev = curr; | |
1739 | + curr = curr->next; | |
1740 | + } | |
1741 | + if (!curr) { // Either an empty chain or went off the end of the chain. | |
1742 | + *result = prev; | |
1743 | + return 0; | |
1744 | + } | |
1745 | + else if ( curr->org_chunk != chunk ) { | |
1746 | + *result = curr->prev; | |
1747 | + return 0; | |
1748 | + } | |
1749 | + else { | |
1750 | + *result = curr; | |
1751 | + return 1; | |
1752 | + } | |
1753 | +} | |
1754 | + | |
1755 | +/* | |
1756 | + * This function takes a cow table entry (from the on-disk data), and | |
1757 | + * converts it into an appropriate entry for the sparse map, and | |
1758 | + * inserts it into the appropriate map for the specified volume. | |
1759 | + */ | |
1760 | +static int add_cow_entry_to_sparse_map( u64 org_chunk, | |
1761 | + u64 sparse_chunk, | |
1762 | + struct sparse_volume * volume ) | |
1763 | +{ | |
1764 | + struct sparse_hash_entry * new_entry; | |
1765 | + struct sparse_hash_entry * target_entry; | |
1766 | + u32 hash_value; | |
1767 | + int rc = -EINVAL; | |
1768 | + | |
1769 | + new_entry = allocate_sparse_hash_entry(volume, org_chunk, sparse_chunk); | |
1770 | + if (!new_entry) { | |
1771 | + return -ENOMEM; | |
1772 | + } | |
1773 | + | |
1774 | + hash_value = (long)org_chunk % volume->hash_table_size; | |
1775 | + | |
1776 | + if (! search_sparse_hash_chain( org_chunk, | |
1777 | + volume->sparse_map[hash_value], | |
1778 | + &target_entry ) ) { | |
1779 | + //should always take this path | |
1780 | + | |
1781 | + if ( target_entry ) { | |
1782 | + insert_sparse_hash_entry( new_entry, target_entry ); | |
1783 | + } | |
1784 | + else { | |
1785 | + insert_sparse_hash_entry_at_head | |
1786 | + ( new_entry, &(volume->sparse_map[hash_value]) ); | |
1787 | + } | |
1788 | + rc = 0; | |
1789 | + } | |
1790 | + return rc; | |
1791 | +} | |
1792 | + | |
1793 | +/* | |
1794 | + * Construct the initial hash table state based on | |
1795 | + * existing COW tables on the disk. | |
1796 | + */ | |
1797 | +static int build_sparse_maps(struct sparse_volume * volume) | |
1798 | +{ | |
1799 | + int rc = 0, done = 0; | |
1800 | + struct io_region job; | |
1801 | + struct page * page; | |
1802 | + unsigned int error, offset; | |
1803 | + | |
1804 | + while (!done) { | |
1805 | + | |
1806 | + // Read in one sector's worth of COW tables. | |
1807 | + job.dev = volume->dev->dev; | |
1808 | + job.sector = volume->current_cow_sector; | |
1809 | + job.count = 1; | |
1810 | + page = virt_to_page(volume->cow_table); | |
1811 | + offset = (unsigned long)volume->cow_table & ~PAGE_MASK; | |
1812 | + rc = dm_io_sync(1, &job, READ, page, offset, &error); | |
1813 | + if (rc) { | |
1814 | + return rc; | |
1815 | + } | |
1816 | + | |
1817 | + // Translate every valid COW table entry into | |
1818 | + // a sparse map entry. | |
1819 | + for ( volume->next_cow_entry = 0; | |
1820 | + | |
1821 | + volume->next_cow_entry < (SECTOR_SIZE/sizeof(u64)) && | |
1822 | + volume->cow_table[volume->next_cow_entry] != | |
1823 | + 0xffffffffffffffff; | |
1824 | + | |
1825 | + volume->next_cow_entry++, volume->next_free_chunk++ ) { | |
1826 | + | |
1827 | + if ( (rc = add_cow_entry_to_sparse_map | |
1828 | + ( le64_to_cpu( volume->cow_table[volume->next_cow_entry] ), | |
1829 | + volume->next_free_chunk, volume ))) { | |
1830 | + return( rc ); | |
1831 | + } | |
1832 | + } | |
1833 | + // Move on to the next sector if necessary. | |
1834 | + if ( volume->next_cow_entry == (SECTOR_SIZE/sizeof(u64)) ) { | |
1835 | + volume->current_cow_sector++; | |
1836 | + } | |
1837 | + else { | |
1838 | + done = 1; | |
1839 | + } | |
1840 | + } | |
1841 | + return 0; | |
1842 | +} | |
1843 | + | |
1844 | +/************************* Other Functions ************************/ | |
1845 | + | |
1846 | +/* | |
1847 | + * Function: sparse_remap_chunk | |
1848 | + * | |
1849 | + * This function performs a sector remap on a sparse volume. This should | |
1850 | + * be called from the I/O path, It first determines the base sector | |
1851 | + * of the chunk containing the specified sector, and saves the remainder. | |
1852 | + * Then it performs a search through the sparse map for the specified | |
1853 | + * volume. If a match is found, the sector number is changed to the new | |
1854 | + * value. If no match is found, the value is left the same, meaning the | |
1855 | + * chunk has not been remapped. | |
1856 | + */ | |
1857 | +static int sparse_remap_chunk( struct sparse_volume * sparse_volume, | |
1858 | + u64 * sector ) | |
1859 | +{ | |
1860 | + struct sparse_hash_entry * result; | |
1861 | + u64 chunk; | |
1862 | + u32 hash_value; | |
1863 | + u32 remainder; | |
1864 | + int rc = 1; | |
1865 | + | |
1866 | + down_read(&sparse_volume->sparse_semaphore); | |
1867 | + | |
1868 | + remainder = *sector & (u64)(sparse_volume->chunk_size - 1); | |
1869 | + chunk = *sector >> sparse_volume->chunk_shift; | |
1870 | + hash_value = ((u32)chunk) % sparse_volume->hash_table_size; | |
1871 | + | |
1872 | + if ( search_sparse_hash_chain( chunk, | |
1873 | + sparse_volume->sparse_map[hash_value], | |
1874 | + &result) ) { | |
1875 | + *sector = ( result->sparse_chunk << sparse_volume->chunk_shift ) | |
1876 | + + remainder; | |
1877 | + rc = 0; | |
1878 | + } | |
1879 | + up_read(&sparse_volume->sparse_semaphore); | |
1880 | + return rc; | |
1881 | +} | |
1882 | + | |
1883 | +/* Function: sparse_cow_write | |
1884 | + * | |
1885 | + * Check this sparse node to see if the given sector/chunk has been | |
1886 | + * remapped yet. If it hasn't, create a new hash table entry, update the | |
1887 | + * in-memory COW table, write the COW table to disk. | |
1888 | + */ | |
1889 | + | |
1890 | +static int sparse_cow_write( struct sparse_volume * sparse_volume, | |
1891 | + u64 * sector ) | |
1892 | +{ | |
1893 | + struct sparse_hash_entry * target_entry, * new_map_entry; | |
1894 | + struct io_region job; | |
1895 | + struct page * page; | |
1896 | + char * cow = NULL; | |
1897 | + unsigned int error, offset; | |
1898 | + u64 chunk; | |
1899 | + u32 hash_value = 0; | |
1900 | + u32 remainder; | |
1901 | + int rc; | |
1902 | + | |
1903 | + down_write(&sparse_volume->sparse_semaphore); | |
1904 | + | |
1905 | + remainder = *sector & (u64)(sparse_volume->chunk_size - 1); | |
1906 | + chunk = *sector >> sparse_volume->chunk_shift; | |
1907 | + hash_value = ((u32)chunk) % sparse_volume->hash_table_size; | |
1908 | + | |
1909 | + if ( search_sparse_hash_chain( chunk, | |
1910 | + sparse_volume->sparse_map[hash_value], | |
1911 | + &target_entry) ) { | |
1912 | + *sector = | |
1913 | + ( target_entry->sparse_chunk << sparse_volume->chunk_shift ) | |
1914 | + + remainder; | |
1915 | + rc = 0; | |
1916 | + goto out; | |
1917 | + } | |
1918 | + | |
1919 | + // Is there enough room left on this sparse to remap this chunk? | |
1920 | + if ( sparse_volume->next_free_chunk >= sparse_volume->num_chunks ) { | |
1921 | + DMERR("dm-sparse: full no new remaps allowed\n"); | |
1922 | + rc = -ENOSPC; | |
1923 | + goto out; | |
1924 | + } | |
1925 | + | |
1926 | + // Create and initialize a new hash table entry for the new remap. | |
1927 | + new_map_entry = allocate_sparse_hash_entry | |
1928 | + (sparse_volume, chunk, sparse_volume->next_free_chunk); | |
1929 | + if ( ! new_map_entry ) { | |
1930 | + // Can't get memory for map entry. Disable this sparse. | |
1931 | + DMERR("dm-sparse: memory error allocating hash entry\n"); | |
1932 | + rc = -ENOMEM; | |
1933 | + goto out; | |
1934 | + } | |
1935 | + | |
1936 | + //Always write cow table so its safe | |
1937 | + cow = kmalloc( SECTOR_SIZE, GFP_KERNEL ); | |
1938 | + if (! cow ) { | |
1939 | + // Can't get I/O buffer. Disable this sparse. | |
1940 | + DMERR("dm-sparse: memory error allocating COW table buffer"); | |
1941 | + rc = -ENOMEM; | |
1942 | + goto out; | |
1943 | + } | |
1944 | + | |
1945 | + // Add the entry to the hash table. | |
1946 | + if ( target_entry ) { | |
1947 | + insert_sparse_hash_entry( new_map_entry, target_entry ); | |
1948 | + } | |
1949 | + else { | |
1950 | + insert_sparse_hash_entry_at_head | |
1951 | + ( new_map_entry, | |
1952 | + &(sparse_volume->sparse_map[hash_value]) ); | |
1953 | + } | |
1954 | + | |
1955 | + sparse_volume->next_free_chunk++; | |
1956 | + | |
1957 | + // Update the appropriate entry in the COW table. | |
1958 | + sparse_volume->cow_table[sparse_volume->next_cow_entry] = | |
1959 | + cpu_to_le64(chunk); | |
1960 | + sparse_volume->next_cow_entry++; | |
1961 | + | |
1962 | + memcpy(cow, sparse_volume->cow_table, SECTOR_SIZE); | |
1963 | + | |
1964 | + //because of ordering issues needs to be synchronous | |
1965 | + job.dev = sparse_volume->dev->dev; | |
1966 | + job.sector = sparse_volume->current_cow_sector; | |
1967 | + job.count = 1; | |
1968 | + page = virt_to_page(cow); | |
1969 | + offset = (unsigned long)cow & ~PAGE_MASK; | |
1970 | + dm_io_sync(1, &job, WRITE, page, offset, &error); | |
1971 | + | |
1972 | + // Update the in-memory COW table values. | |
1973 | + if ( sparse_volume->next_cow_entry >= (SECTOR_SIZE/sizeof(u64)) ) | |
1974 | + { | |
1975 | + sparse_volume->next_cow_entry = 0; | |
1976 | + sparse_volume->current_cow_sector++; | |
1977 | + memset(sparse_volume->cow_table, 0xff, SECTOR_SIZE); | |
1978 | + } | |
1979 | + | |
1980 | + *sector = ( new_map_entry->sparse_chunk << sparse_volume->chunk_shift ) | |
1981 | + + remainder; | |
1982 | + | |
1983 | + rc = 0; | |
1984 | + | |
1985 | + out: | |
1986 | + up_write(&sparse_volume->sparse_semaphore); | |
1987 | + if ( cow ) { | |
1988 | + kfree( cow ); | |
1989 | + } | |
1990 | + | |
1991 | + return rc; | |
1992 | +} | |
1993 | + | |
1994 | +/************************ EXPORT FUNCTIONS ************************/ | |
1995 | + | |
1996 | +/* | |
1997 | + * Function: sparse_dtr | |
1998 | + */ | |
1999 | +static void sparse_dtr( struct dm_target *ti ) | |
2000 | +{ | |
2001 | + struct sparse_volume * vol = (struct sparse_volume *)ti->private; | |
2002 | + int i; | |
2003 | + | |
2004 | + if (vol) { | |
2005 | + | |
2006 | + if (vol->sparse_map) { | |
2007 | + for ( i = 0; i < vol->hash_table_size; i++ ) { | |
2008 | + delete_sparse_hash_chain( vol, vol->sparse_map[i] ); | |
2009 | + } | |
2010 | + delete_sparse_hash_chain( vol, vol->free_hash_list ); | |
2011 | + vfree(vol->sparse_map); | |
2012 | + } | |
2013 | + | |
2014 | + if (vol->hash_pool) | |
2015 | + mempool_destroy(vol->hash_pool); | |
2016 | + | |
2017 | + if (vol->hash_slab) | |
2018 | + kmem_cache_destroy(vol->hash_slab); | |
2019 | + | |
2020 | + dm_put_device(ti, vol->dev); | |
2021 | + | |
2022 | + if (vol->dm_io_flag) { | |
2023 | + dm_io_put(1); | |
2024 | + } | |
2025 | + | |
2026 | + kfree( vol ); | |
2027 | + } | |
2028 | +} | |
2029 | + | |
2030 | +/* | |
2031 | + * Function: sparse_ctr | |
2032 | + */ | |
2033 | +static int sparse_ctr( struct dm_target *ti, unsigned int argc, char** argv ) | |
2034 | +{ | |
2035 | + int i, rc = -EINVAL; | |
2036 | + struct sparse_hash_entry *new_entry; | |
2037 | + struct sparse_volume *vol; | |
2038 | + struct dm_dev *dev; | |
2039 | + u32 chunk_size, chunks; | |
2040 | + u64 start; | |
2041 | + char* end, slab_name[NAME_SIZE+1]; | |
2042 | + | |
2043 | + if ( argc != 4 ) { | |
2044 | + ti->error="dm-sparse: wrong number of arguments"; | |
2045 | + return rc; | |
2046 | + } | |
2047 | + | |
2048 | + start = simple_strtoull(argv[1], &end, 10); | |
2049 | + if (*end) { | |
2050 | + ti->error="dm-sparse: Invalid first chunk lba"; | |
2051 | + return rc; | |
2052 | + } | |
2053 | + | |
2054 | + chunk_size = simple_strtoul(argv[2], &end, 10); | |
2055 | + if (*end) { | |
2056 | + ti->error="dm-sparse: Invalid chunk_size"; | |
2057 | + return rc; | |
2058 | + } | |
2059 | + | |
2060 | + chunks = simple_strtoul(argv[3], &end, 10); | |
2061 | + if (*end) { | |
2062 | + ti->error="dm-sparse: Invalid number of chunks"; | |
2063 | + return rc; | |
2064 | + } | |
2065 | + | |
2066 | + if ( dm_get_device( ti, argv[0], ti->begin, start + chunks * chunk_size, | |
2067 | + dm_table_get_mode(ti->table), &dev ) ) { | |
2068 | + ti->error = "dm-sparse: Device lookup failed"; | |
2069 | + return rc; | |
2070 | + } | |
2071 | + | |
2072 | + vol = kmalloc(sizeof(struct sparse_volume), GFP_KERNEL); | |
2073 | + if ( !vol ) { | |
2074 | + ti->error = "dm-sparse: Memory allocation for private-data failed"; | |
2075 | + rc = -ENOMEM; | |
2076 | + goto out; | |
2077 | + } | |
2078 | + | |
2079 | + memset( vol, 0, sizeof(struct sparse_volume) ); | |
2080 | + | |
2081 | + rc = dm_io_get(1); | |
2082 | + if (rc) { | |
2083 | + ti->error = "dm-sparse: failed to initialize dm-io."; | |
2084 | + sparse_dtr(ti); | |
2085 | + return rc; | |
2086 | + } | |
2087 | + | |
2088 | + // Initialize | |
2089 | + vol->dm_io_flag = 1; | |
2090 | + vol->chunk_size = chunk_size; | |
2091 | + vol->chunk_shift = log2(chunk_size); | |
2092 | + vol->num_chunks = chunks; | |
2093 | + vol->current_cow_sector = 1; | |
2094 | + vol->hash_table_size = chunks / MAX_HASH_CHAIN_ENTRIES + 1; | |
2095 | + vol->start = start; | |
2096 | + vol->dev = dev; | |
2097 | + init_rwsem(&vol->sparse_semaphore); | |
2098 | + | |
2099 | + snprintf(slab_name, NAME_SIZE, "sparse-%p", vol); | |
2100 | + vol->hash_slab = kmem_cache_create(slab_name, | |
2101 | + sizeof(struct sparse_hash_entry), | |
2102 | + 0, SLAB_HWCACHE_ALIGN, | |
2103 | + NULL, NULL); | |
2104 | + if ( ! vol->hash_slab ) { | |
2105 | + ti->error = "dm-sparse: memory allocation error in hash slab create"; | |
2106 | + sparse_dtr(ti); | |
2107 | + return -ENOMEM; | |
2108 | + } | |
2109 | + vol->hash_pool = mempool_create(1, mempool_alloc_slab, | |
2110 | + mempool_free_slab, | |
2111 | + vol->hash_slab); | |
2112 | + if ( ! vol->hash_pool ) { | |
2113 | + ti->error = "dm-sparse: memory allocation error in hash pool create"; | |
2114 | + sparse_dtr(ti); | |
2115 | + return -ENOMEM; | |
2116 | + } | |
2117 | + | |
2118 | + // Sparse hash table | |
2119 | + vol->sparse_map = vmalloc( vol->hash_table_size * | |
2120 | + sizeof( struct sparse_hash_entry * ) ); | |
2121 | + if ( ! vol->sparse_map ) { | |
2122 | + ti->error = "dm-sparse: Memory allocation error in sparse_map create"; | |
2123 | + sparse_dtr(ti); | |
2124 | + return -ENOMEM; | |
2125 | + } | |
2126 | + | |
2127 | + memset( vol->sparse_map, 0, vol->hash_table_size * | |
2128 | + sizeof( struct sparse_hash_entry * ) ); | |
2129 | + | |
2130 | + for ( i = 0; i < chunks; i++ ) { | |
2131 | + | |
2132 | + new_entry = mempool_alloc(vol->hash_pool, GFP_KERNEL ); | |
2133 | + if ( ! new_entry ) { | |
2134 | + ti->error="dm-sparse: memory allocation error in hash table setup"; | |
2135 | + sparse_dtr(ti); | |
2136 | + return -ENOMEM; | |
2137 | + } | |
2138 | + | |
2139 | + new_entry->next = vol->free_hash_list; | |
2140 | + vol->free_hash_list = new_entry; | |
2141 | + } | |
2142 | + | |
2143 | + rc = build_sparse_maps(vol); | |
2144 | + if (rc) { | |
2145 | + ti->error = "dm-sparse: error building hash tables"; | |
2146 | + sparse_dtr(ti); | |
2147 | + return rc; | |
2148 | + } | |
2149 | + | |
2150 | + ti->private = vol; | |
2151 | + return rc; | |
2152 | + | |
2153 | + out: | |
2154 | + dm_put_device(ti, dev); | |
2155 | + return rc; | |
2156 | +} | |
2157 | + | |
2158 | +/* | |
2159 | + * Function: sparse_map | |
2160 | + */ | |
2161 | +static int sparse_map( struct dm_target * ti, struct buffer_head * bh, int rw, | |
2162 | + union map_info *map_context ) | |
2163 | +{ | |
2164 | + struct sparse_volume * volume = (struct sparse_volume*)ti->private; | |
2165 | + u64 sector = bh->b_rsector; | |
2166 | + int rc; | |
2167 | + | |
2168 | + | |
2169 | + | |
2170 | + // Check if this sector has been remapped | |
2171 | + rc = sparse_remap_chunk( volume, §or ); | |
2172 | + | |
2173 | + if ( rc < 0 ) { //Error | |
2174 | + bh->b_end_io(bh, 0); | |
2175 | + return rc; | |
2176 | + } | |
2177 | + | |
2178 | + if ( rc == 0 ) { // Remapped I/O : read or write same logic | |
2179 | + bh->b_rsector = volume->start + sector; | |
2180 | + bh->b_rdev = volume->dev->dev; | |
2181 | + return 1; | |
2182 | + } | |
2183 | + | |
2184 | + // ( Previously )Un-mapped: read / write different logic | |
2185 | + | |
2186 | + if ( rw ) { //write : | |
2187 | + rc = sparse_cow_write( volume, §or ); | |
2188 | + | |
2189 | + if ( rc < 0 ) { //Error | |
2190 | + bh->b_end_io(bh, 0); | |
2191 | + return rc; | |
2192 | + } | |
2193 | + //Send write on | |
2194 | + bh->b_rsector = volume->start + sector; | |
2195 | + bh->b_rdev = volume->dev->dev; | |
2196 | + return 1; | |
2197 | + } | |
2198 | + | |
2199 | + //Reading something that was never written | |
2200 | + //return zeros and indicate complete | |
2201 | + memset(bh->b_data, 0x0, bh->b_size); | |
2202 | + bh->b_end_io(bh, 1); | |
2203 | + return 0; | |
2204 | +} | |
2205 | + | |
2206 | +static int sparse_status( struct dm_target *ti, status_type_t type, | |
2207 | + char *result, unsigned int maxlen ) | |
2208 | +{ | |
2209 | + struct sparse_volume * vol = (struct sparse_volume * )ti->private; | |
2210 | + | |
2211 | + switch(type) { | |
2212 | + | |
2213 | + case STATUSTYPE_INFO: | |
2214 | + snprintf( result, maxlen, "%d%%", | |
2215 | + ( vol->next_free_chunk * 100 ) / vol->num_chunks ); | |
2216 | + break; | |
2217 | + | |
2218 | + case STATUSTYPE_TABLE: | |
2219 | + snprintf( result, maxlen, "%s %Lu %u %u", | |
2220 | + dm_kdevname(vol->dev->dev), vol->start, | |
2221 | + vol->chunk_size, vol->num_chunks ); | |
2222 | + break; | |
2223 | + | |
2224 | + default: | |
2225 | + break; | |
2226 | + } | |
2227 | + | |
2228 | + return 0; | |
2229 | +} | |
2230 | + | |
2231 | +/****************** FUNCTION TABLE **********************/ | |
2232 | + | |
2233 | +static struct target_type sparse_target = { | |
2234 | + .name = "sparse", | |
2235 | + .module = THIS_MODULE, | |
2236 | + .ctr = sparse_ctr, | |
2237 | + .dtr = sparse_dtr, | |
2238 | + .map = sparse_map, | |
2239 | + .status = sparse_status, | |
2240 | +}; | |
2241 | + | |
2242 | +/********************* REGISTRATION *****************/ | |
2243 | + | |
2244 | +int __init sparse_init(void) | |
2245 | +{ | |
2246 | + int rc = dm_register_target(&sparse_target); | |
2247 | + | |
2248 | + if ( rc < 0 ) | |
2249 | + DMWARN("sparse target registration failed"); | |
2250 | + | |
2251 | + return rc; | |
2252 | +} | |
2253 | + | |
2254 | +void __exit sparse_exit(void) | |
2255 | +{ | |
2256 | + if (dm_unregister_target(&sparse_target) ) | |
2257 | + DMWARN("sparse target unregistration failed"); | |
2258 | + | |
2259 | + return; | |
2260 | +} | |
2261 | + | |
2262 | +module_init(sparse_init); | |
2263 | +module_exit(sparse_exit); | |
2264 | +MODULE_LICENSE("GPL"); | |
2265 | diff -urN linux-2.4.22/drivers/md/multipath.c linux-2.4.22-evms/drivers/md/multipath.c | |
2266 | --- linux-2.4.22/drivers/md/multipath.c 2003-06-13 16:51:34.000000000 +0200 | |
2267 | +++ linux-2.4.22-evms/drivers/md/multipath.c 2003-09-15 17:09:36.000000000 +0200 | |
2268 | @@ -139,15 +139,16 @@ | |
2269 | static int multipath_map (mddev_t *mddev, kdev_t *rdev) | |
2270 | { | |
2271 | multipath_conf_t *conf = mddev_to_conf(mddev); | |
2272 | - int i, disks = MD_SB_DISKS; | |
2273 | + int i; | |
2274 | ||
2275 | /* | |
2276 | * Later we do read balancing on the read side | |
2277 | * now we use the first available disk. | |
2278 | */ | |
2279 | ||
2280 | - for (i = 0; i < disks; i++) { | |
2281 | + for (i = 0; i < conf->nr_disks; i++) { | |
2282 | if (conf->multipaths[i].operational) { | |
2283 | + /* first operational is winner! */ | |
2284 | *rdev = conf->multipaths[i].dev; | |
2285 | return (0); | |
2286 | } | |
2287 | @@ -191,6 +192,8 @@ | |
2288 | { | |
2289 | struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private); | |
2290 | ||
2291 | + atomic_dec(&mp_bh->multipath->nr_pending); | |
2292 | + | |
2293 | /* | |
2294 | * this branch is our 'one multipath IO has finished' event handler: | |
2295 | */ | |
2296 | @@ -223,19 +226,39 @@ | |
2297 | } | |
2298 | ||
2299 | /* | |
2300 | - * This routine returns the disk from which the requested read should | |
2301 | - * be done. | |
2302 | + * Multipath read balance ... | |
2303 | + * | |
2304 | + * Returns: | |
2305 | + * | |
2306 | + * If no active paths | |
2307 | + * | |
2308 | + * - Error ( -1 ) | |
2309 | + * | |
2310 | + * If active paths == 1 | |
2311 | + * | |
2312 | + * - 1st active path encountered | |
2313 | + * | |
2314 | + * If active paths > 1 | |
2315 | + * | |
2316 | + * - 1st idle active path encountered | |
2317 | + * - else ... the active path doing the least amount of work. | |
2318 | */ | |
2319 | - | |
2320 | static int multipath_read_balance (multipath_conf_t *conf) | |
2321 | { | |
2322 | - int disk; | |
2323 | - | |
2324 | - for (disk = 0; disk < conf->raid_disks; disk++) | |
2325 | - if (conf->multipaths[disk].operational) | |
2326 | - return disk; | |
2327 | - BUG(); | |
2328 | - return 0; | |
2329 | + int i, disk=-1, nr_pending, least_pending=0; | |
2330 | + | |
2331 | + for (i=0; i<conf->nr_disks; i++) { | |
2332 | + if (conf->multipaths[i].operational) { | |
2333 | + nr_pending = atomic_read(&conf->multipaths[i].nr_pending); | |
2334 | + if (nr_pending==0 || conf->working_disks==1) | |
2335 | + return i; | |
2336 | + if (least_pending==0 || nr_pending<least_pending) { | |
2337 | + disk = i; | |
2338 | + least_pending = nr_pending; | |
2339 | + } | |
2340 | + } | |
2341 | + } | |
2342 | + return disk; | |
2343 | } | |
2344 | ||
2345 | static int multipath_make_request (mddev_t *mddev, int rw, | |
2346 | @@ -245,6 +268,7 @@ | |
2347 | struct buffer_head *bh_req; | |
2348 | struct multipath_bh * mp_bh; | |
2349 | struct multipath_info *multipath; | |
2350 | + int disk; | |
2351 | ||
2352 | if (!buffer_locked(bh)) | |
2353 | BUG(); | |
2354 | @@ -267,7 +291,16 @@ | |
2355 | /* | |
2356 | * read balancing logic: | |
2357 | */ | |
2358 | - multipath = conf->multipaths + multipath_read_balance(conf); | |
2359 | + disk = multipath_read_balance(conf); | |
2360 | + if (disk==-1) { | |
2361 | + printk (KERN_ERR "multipath_make_request: no more operational IO paths.\n"); | |
2362 | + buffer_IO_error(bh); | |
2363 | + return 0; | |
2364 | + } | |
2365 | + | |
2366 | + multipath = conf->multipaths + disk; | |
2367 | + mp_bh->multipath = multipath; | |
2368 | + atomic_inc(&multipath->nr_pending); | |
2369 | ||
2370 | bh_req = &mp_bh->bh_req; | |
2371 | memcpy(bh_req, bh, sizeof(*bh)); | |
2372 | @@ -331,13 +364,14 @@ | |
2373 | { | |
2374 | multipath_conf_t *conf = mddev_to_conf(mddev); | |
2375 | struct multipath_info * multipaths = conf->multipaths; | |
2376 | - int disks = MD_SB_DISKS; | |
2377 | int other_paths = 1; | |
2378 | - int i; | |
2379 | + int i, first = 1; | |
2380 | + mdk_rdev_t *rdev; | |
2381 | + struct md_list_head *tmp; | |
2382 | ||
2383 | if (conf->working_disks == 1) { | |
2384 | other_paths = 0; | |
2385 | - for (i = 0; i < disks; i++) { | |
2386 | + for (i = 0; i < MD_SB_DISKS; i++) { | |
2387 | if (multipaths[i].spare) { | |
2388 | other_paths = 1; | |
2389 | break; | |
2390 | @@ -351,16 +385,17 @@ | |
2391 | * first check if this is a queued request for a device | |
2392 | * which has just failed. | |
2393 | */ | |
2394 | - for (i = 0; i < disks; i++) { | |
2395 | + for (i = 0; i < MD_SB_DISKS; i++) { | |
2396 | if (multipaths[i].dev==dev && !multipaths[i].operational) | |
2397 | return 0; | |
2398 | } | |
2399 | printk (LAST_DISK); | |
2400 | } else { | |
2401 | + mdp_super_t *sb = mddev->sb; | |
2402 | /* | |
2403 | * Mark disk as unusable | |
2404 | */ | |
2405 | - for (i = 0; i < disks; i++) { | |
2406 | + for (i = 0; i < MD_SB_DISKS; i++) { | |
2407 | if (multipaths[i].dev==dev && multipaths[i].operational) { | |
2408 | mark_disk_bad(mddev, i); | |
2409 | break; | |
2410 | @@ -369,7 +404,6 @@ | |
2411 | if (!conf->working_disks) { | |
2412 | int err = 1; | |
2413 | mdp_disk_t *spare; | |
2414 | - mdp_super_t *sb = mddev->sb; | |
2415 | ||
2416 | spare = get_spare(mddev); | |
2417 | if (spare) { | |
2418 | @@ -384,6 +418,21 @@ | |
2419 | sb->spare_disks--; | |
2420 | } | |
2421 | } | |
2422 | + /* prevent unnecessary work in md_do_recovery() */ | |
2423 | + if (conf->working_disks) { | |
2424 | + conf->raid_disks = conf->working_disks | |
2425 | + = sb->raid_disks = sb->active_disks; | |
2426 | + } | |
2427 | + /* update alias disk info to insure we can do sb commit. */ | |
2428 | + ITERATE_RDEV(mddev,rdev,tmp) { | |
2429 | + if (first && disk_active(&sb->disks[rdev->desc_nr])) { | |
2430 | + rdev->alias_device = 0; | |
2431 | + first = 0; | |
2432 | + } else { | |
2433 | + if (!disk_faulty(&sb->disks[rdev->desc_nr])) | |
2434 | + rdev->alias_device = 1; | |
2435 | + } | |
2436 | + } | |
2437 | } | |
2438 | return 0; | |
2439 | } | |
2440 | @@ -677,9 +726,8 @@ | |
2441 | /* | |
2442 | * This is a kernel thread which: | |
2443 | * | |
2444 | - * 1. Retries failed read operations on working multipaths. | |
2445 | + * 1. Retries failed operations on working multipaths. | |
2446 | * 2. Updates the raid superblock when problems encounter. | |
2447 | - * 3. Performs writes following reads for array syncronising. | |
2448 | */ | |
2449 | ||
2450 | static void multipathd (void *data) | |
2451 | @@ -833,6 +881,7 @@ | |
2452 | mdk_rdev_t *rdev, *def_rdev = NULL; | |
2453 | struct md_list_head *tmp; | |
2454 | int num_rdevs = 0; | |
2455 | + int active_disks = 0, spare_disks = 0, faulty_disks = 0; | |
2456 | ||
2457 | MOD_INC_USE_COUNT; | |
2458 | ||
2459 | @@ -881,9 +930,7 @@ | |
2460 | printk(NOT_IN_SYNC, partition_name(rdev->dev)); | |
2461 | ||
2462 | /* | |
2463 | - * Mark all disks as spare to start with, then pick our | |
2464 | - * active disk. If we have a disk that is marked active | |
2465 | - * in the sb, then use it, else use the first rdev. | |
2466 | + * Mark all disks as spare to start with. | |
2467 | */ | |
2468 | disk->number = desc->number; | |
2469 | disk->raid_disk = desc->raid_disk; | |
2470 | @@ -894,20 +941,21 @@ | |
2471 | mark_disk_sync(desc); | |
2472 | ||
2473 | if (disk_active(desc)) { | |
2474 | - if(!conf->working_disks) { | |
2475 | - printk(OPERATIONAL, partition_name(rdev->dev), | |
2476 | - desc->raid_disk); | |
2477 | - disk->operational = 1; | |
2478 | - disk->spare = 0; | |
2479 | - conf->working_disks++; | |
2480 | - def_rdev = rdev; | |
2481 | - } else { | |
2482 | - mark_disk_spare(desc); | |
2483 | - } | |
2484 | - } else | |
2485 | - mark_disk_spare(desc); | |
2486 | + printk(OPERATIONAL, partition_name(rdev->dev), | |
2487 | + desc->raid_disk); | |
2488 | + disk->operational = 1; | |
2489 | + disk->spare = 0; | |
2490 | + conf->working_disks++; | |
2491 | + def_rdev = rdev; | |
2492 | + active_disks++; | |
2493 | + } else if (disk_faulty(desc)) { | |
2494 | + disk->spare = 0; | |
2495 | + faulty_disks++; | |
2496 | + } else { | |
2497 | + spare_disks++; | |
2498 | + } | |
2499 | ||
2500 | - if(!num_rdevs++) def_rdev = rdev; | |
2501 | + num_rdevs++; | |
2502 | } | |
2503 | if(!conf->working_disks && num_rdevs) { | |
2504 | desc = &sb->disks[def_rdev->desc_nr]; | |
2505 | @@ -918,11 +966,12 @@ | |
2506 | disk->spare = 0; | |
2507 | conf->working_disks++; | |
2508 | mark_disk_active(desc); | |
2509 | + active_disks++; | |
2510 | } | |
2511 | /* | |
2512 | - * Make sure our active path is in desc spot 0 | |
2513 | + * If there is only 1 active path ... make sure it is in desc spot 0 | |
2514 | */ | |
2515 | - if(def_rdev->desc_nr != 0) { | |
2516 | + if (active_disks == 1 && def_rdev->desc_nr != 0) { | |
2517 | rdev = find_rdev_nr(mddev, 0); | |
2518 | desc = &sb->disks[def_rdev->desc_nr]; | |
2519 | desc2 = sb->disks; | |
2520 | @@ -940,10 +989,10 @@ | |
2521 | def_rdev->desc_nr = 0; | |
2522 | } | |
2523 | } | |
2524 | - conf->raid_disks = sb->raid_disks = sb->active_disks = 1; | |
2525 | + conf->raid_disks = sb->raid_disks = sb->active_disks = active_disks; | |
2526 | conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs; | |
2527 | - sb->failed_disks = 0; | |
2528 | - sb->spare_disks = num_rdevs - 1; | |
2529 | + sb->failed_disks = faulty_disks; | |
2530 | + sb->spare_disks = spare_disks; | |
2531 | mddev->sb_dirty = 1; | |
2532 | conf->mddev = mddev; | |
2533 | conf->device_lock = MD_SPIN_LOCK_UNLOCKED; | |
4dd5eeca JR |
2534 | diff -urN linux-2.4.22/include/linux/raid/multipath.h linux-2.4.22-evms/include/linux/raid/multipath.h |
2535 | --- linux-2.4.22/include/linux/raid/multipath.h 2001-11-12 18:51:56.000000000 +0100 | |
2536 | +++ linux-2.4.22-evms/include/linux/raid/multipath.h 2003-09-15 17:09:36.000000000 +0200 | |
2537 | @@ -15,6 +15,7 @@ | |
2538 | int spare; | |
2539 | ||
2540 | int used_slot; | |
2541 | + atomic_t nr_pending; /* number of pending requests */ | |
2542 | }; | |
2543 | ||
2544 | struct multipath_private_data { | |
2545 | @@ -63,6 +64,7 @@ | |
2546 | struct buffer_head *master_bh; | |
2547 | struct buffer_head bh_req; | |
2548 | struct multipath_bh *next_mp; /* next for retry or in free list */ | |
2549 | + struct multipath_info *multipath; /* allows end_request to easilly dec pending buffer count*/ | |
2550 | }; | |
2551 | /* bits for multipath_bh.state */ | |
2552 | #define MPBH_Uptodate 1 |