]>
Commit | Line | Data |
---|---|---|
cdeda7f0 AM |
1 | diff -urN linux-2.4.24.org/drivers/md/Config.in linux-2.4.24/drivers/md/Config.in |
2 | --- linux-2.4.24.org/drivers/md/Config.in 2004-01-18 15:09:18.503177509 +0100 | |
3 | +++ linux-2.4.24/drivers/md/Config.in 2004-01-18 16:05:08.202479073 +0100 | |
4 | @@ -12,6 +12,10 @@ | |
5 | dep_tristate ' RAID-1 (mirroring) mode' CONFIG_MD_RAID1 $CONFIG_BLK_DEV_MD | |
6 | dep_tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 $CONFIG_BLK_DEV_MD | |
7 | dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD | |
8 | +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then | |
9 | + dep_tristate ' Bad Block Relocation Device Target (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_BBR $CONFIG_BLK_DEV_DM | |
10 | + dep_tristate ' Sparse Device Target (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_SPARSE $CONFIG_BLK_DEV_DM | |
11 | +fi | |
12 | ||
13 | dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD | |
14 | dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD | |
15 | diff -urN linux-2.4.24.org/drivers/md/dm-bbr.c linux-2.4.24/drivers/md/dm-bbr.c | |
16 | --- linux-2.4.24.org/drivers/md/dm-bbr.c 1970-01-01 01:00:00.000000000 +0100 | |
17 | +++ linux-2.4.24/drivers/md/dm-bbr.c 2004-01-18 16:03:13.099546349 +0100 | |
18 | @@ -0,0 +1,1227 @@ | |
19 | +/* | |
20 | + * (C) Copyright IBM Corp. 2002, 2003 | |
21 | + * | |
22 | + * This program is free software; you can redistribute it and/or modify | |
23 | + * it under the terms of the GNU General Public License as published by | |
24 | + * the Free Software Foundation; either version 2 of the License, or | |
25 | + * (at your option) any later version. | |
26 | + * | |
27 | + * This program is distributed in the hope that it will be useful, | |
28 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
29 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | |
30 | + * the GNU General Public License for more details. | |
31 | + * | |
32 | + * You should have received a copy of the GNU General Public License | |
33 | + * along with this program; if not, write to the Free Software | |
34 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
35 | + * | |
36 | + * linux/drivers/md/dm-bbr.c | |
37 | + * | |
38 | + * Bad-block-relocation (BBR) target for device-mapper. | |
39 | + * | |
40 | + * The BBR target is designed to remap I/O write failures to another safe | |
41 | + * location on disk. Note that most disk drives have BBR built into them, | |
42 | + * this means that our software BBR will only be activated when all hardware | |
43 | + * BBR replacement sectors have been used. | |
44 | + */ | |
45 | + | |
46 | +#include <linux/kernel.h> | |
47 | +#include <linux/module.h> | |
48 | +#include <linux/init.h> | |
49 | +#include <linux/blkdev.h> | |
50 | +#include <linux/spinlock.h> | |
51 | +#include <linux/smp_lock.h> | |
52 | +#include <linux/slab.h> | |
53 | +#include <linux/mempool.h> | |
54 | +#include "dm.h" | |
55 | +#include "dm-bbr.h" | |
56 | +#include "dm-daemon.h" | |
57 | +#include "dm-io.h" | |
58 | + | |
59 | +/* Number of active BBR devices. */ | |
60 | +static int bbr_instances = 0; | |
61 | +static DECLARE_MUTEX(bbr_instances_lock); | |
62 | + | |
63 | +/* Data pertaining to the I/O thread. */ | |
64 | +static struct dm_daemon * bbr_io_thread = NULL; | |
65 | +static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED; | |
66 | +static LIST_HEAD(bbr_io_list); | |
67 | +static void bbr_io_handler(void); | |
68 | + | |
69 | +/* Global pools for bbr_io_buf's and bbr_remap's. */ | |
70 | +static kmem_cache_t * bbr_io_buf_cache; | |
71 | +static mempool_t * bbr_io_buf_pool; | |
72 | +static kmem_cache_t * bbr_remap_cache; | |
73 | +static mempool_t * bbr_remap_pool; | |
74 | + | |
75 | +static void bbr_free_remap(struct bbr_private * bbr_id); | |
76 | + | |
77 | +/** | |
78 | + * destroy_pools | |
79 | + * | |
80 | + * Delete the pools for the remap list and I/O anchors. | |
81 | + **/ | |
82 | +static void destroy_pools(void) | |
83 | +{ | |
84 | + if (bbr_io_buf_pool) { | |
85 | + mempool_destroy(bbr_io_buf_pool); | |
86 | + bbr_io_buf_pool = NULL; | |
87 | + } | |
88 | + if (bbr_io_buf_cache) { | |
89 | + kmem_cache_destroy(bbr_io_buf_cache); | |
90 | + bbr_io_buf_cache = NULL; | |
91 | + } | |
92 | + if (bbr_remap_pool) { | |
93 | + mempool_destroy(bbr_remap_pool); | |
94 | + bbr_remap_pool = NULL; | |
95 | + } | |
96 | + if (bbr_remap_cache) { | |
97 | + kmem_cache_destroy(bbr_remap_cache); | |
98 | + bbr_remap_cache = NULL; | |
99 | + } | |
100 | +} | |
101 | + | |
102 | +/** | |
103 | + * create_pools | |
104 | + * | |
105 | + * Create mempools for the remap list and I/O anchors. | |
106 | + **/ | |
107 | +static int create_pools(void) | |
108 | +{ | |
109 | + if (!bbr_remap_cache) { | |
110 | + bbr_remap_cache = kmem_cache_create("BBR_Remap_Cache", | |
111 | + sizeof(struct bbr_runtime_remap), | |
112 | + 0, SLAB_HWCACHE_ALIGN, | |
113 | + NULL, NULL); | |
114 | + if (!bbr_remap_cache) { | |
115 | + DMERR("Unable to create BBR remap cache."); | |
116 | + goto out; | |
117 | + } | |
118 | + } | |
119 | + if (!bbr_remap_pool) { | |
120 | + bbr_remap_pool = mempool_create(64, mempool_alloc_slab, | |
121 | + mempool_free_slab, | |
122 | + bbr_remap_cache); | |
123 | + if (!bbr_remap_pool) { | |
124 | + DMERR("Unable to create BBR remap mempool."); | |
125 | + goto out; | |
126 | + } | |
127 | + } | |
128 | + | |
129 | + if (!bbr_io_buf_cache) { | |
130 | + bbr_io_buf_cache = kmem_cache_create("BBR_IO_Buf_Cache", | |
131 | + sizeof(struct bbr_io_buffer), | |
132 | + 0, SLAB_HWCACHE_ALIGN, | |
133 | + NULL, NULL); | |
134 | + if (!bbr_io_buf_cache) { | |
135 | + DMERR("Unable to create BBR I/O buffer cache."); | |
136 | + goto out; | |
137 | + } | |
138 | + } | |
139 | + if (!bbr_io_buf_pool) { | |
140 | + bbr_io_buf_pool = mempool_create(256, mempool_alloc_slab, | |
141 | + mempool_free_slab, | |
142 | + bbr_io_buf_cache); | |
143 | + if (!bbr_io_buf_pool) { | |
144 | + DMERR("Unable to create BBR I/O buffer mempool."); | |
145 | + goto out; | |
146 | + } | |
147 | + } | |
148 | + | |
149 | +out: | |
150 | + if (!bbr_remap_cache || !bbr_remap_pool || | |
151 | + !bbr_io_buf_cache || !bbr_io_buf_pool ) { | |
152 | + destroy_pools(); | |
153 | + return -ENOMEM; | |
154 | + } | |
155 | + | |
156 | + return 0; | |
157 | +} | |
158 | + | |
159 | +/** | |
160 | + * stop_io_thread | |
161 | + * | |
162 | + * Use the dm-daemon services to stop the BBR I/O thread. | |
163 | + **/ | |
164 | +static void stop_io_thread(void) | |
165 | +{ | |
166 | + if (bbr_io_thread) { | |
167 | + dm_daemon_stop(bbr_io_thread); | |
168 | + kfree(bbr_io_thread); | |
169 | + bbr_io_thread = NULL; | |
170 | + } | |
171 | +} | |
172 | + | |
173 | +/** | |
174 | + * start_io_thread | |
175 | + * | |
176 | + * Use the dm-daemon services to start the BBR I/O thread. | |
177 | + **/ | |
178 | +static int start_io_thread(void) | |
179 | +{ | |
180 | + int rc; | |
181 | + | |
182 | + if (!bbr_io_thread) { | |
183 | + bbr_io_thread = kmalloc(sizeof(*bbr_io_thread), GFP_KERNEL); | |
184 | + if (!bbr_io_thread) { | |
185 | + return -ENOMEM; | |
186 | + } | |
187 | + | |
188 | + rc = dm_daemon_start(bbr_io_thread, "bbr_io", bbr_io_handler); | |
189 | + if (rc) { | |
190 | + kfree(bbr_io_thread); | |
191 | + return rc; | |
192 | + } | |
193 | + } | |
194 | + | |
195 | + return 0; | |
196 | +} | |
197 | + | |
198 | +/** | |
199 | + * bbr_global_init | |
200 | + * | |
201 | + * Set up the mempools, I/O thread, and sync-I/O service. This should | |
202 | + * be called only when the first bbr device is created. | |
203 | + **/ | |
204 | +static int bbr_global_init(void) | |
205 | +{ | |
206 | + int rc; | |
207 | + | |
208 | + rc = create_pools(); | |
209 | + if (rc) { | |
210 | + goto out; | |
211 | + } | |
212 | + | |
213 | + rc = start_io_thread(); | |
214 | + if (rc) { | |
215 | + destroy_pools(); | |
216 | + goto out; | |
217 | + } | |
218 | + | |
219 | + rc = dm_io_get(1); | |
220 | + if (rc) { | |
221 | + destroy_pools(); | |
222 | + stop_io_thread(); | |
223 | + goto out; | |
224 | + } | |
225 | + | |
226 | +out: | |
227 | + return rc; | |
228 | +} | |
229 | + | |
230 | +/** | |
231 | + * bbr_global_cleanup | |
232 | + * | |
233 | + * Cleanup the mempools, I/O thread and sync-I/O service. This should | |
234 | + * be called only when the last bbr device is removed. | |
235 | + **/ | |
236 | +static void bbr_global_cleanup(void) | |
237 | +{ | |
238 | + destroy_pools(); | |
239 | + stop_io_thread(); | |
240 | + dm_io_put(1); | |
241 | +} | |
242 | + | |
243 | +static struct bbr_private * bbr_alloc_private(void) | |
244 | +{ | |
245 | + struct bbr_private *bbr_id; | |
246 | + | |
247 | + bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL); | |
248 | + if (bbr_id) { | |
249 | + memset(bbr_id, 0, sizeof(*bbr_id)); | |
250 | + bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0); | |
251 | + bbr_id->bbr_id_lock = SPIN_LOCK_UNLOCKED; | |
252 | + } | |
253 | + | |
254 | + return bbr_id; | |
255 | +} | |
256 | + | |
257 | +static void bbr_free_private(struct bbr_private *bbr_id) | |
258 | +{ | |
259 | + if (bbr_id->bbr_table) { | |
260 | + kfree(bbr_id->bbr_table); | |
261 | + } | |
262 | + bbr_free_remap(bbr_id); | |
263 | + kfree(bbr_id); | |
264 | +} | |
265 | + | |
266 | +static u32 crc_table[256]; | |
267 | +static u32 crc_table_built = 0; | |
268 | + | |
269 | +static void build_crc_table(void) | |
270 | +{ | |
271 | + u32 i, j, crc; | |
272 | + | |
273 | + for (i = 0; i <= 255; i++) { | |
274 | + crc = i; | |
275 | + for (j = 8; j > 0; j--) { | |
276 | + if (crc & 1) | |
277 | + crc = (crc >> 1) ^ CRC_POLYNOMIAL; | |
278 | + else | |
279 | + crc >>= 1; | |
280 | + } | |
281 | + crc_table[i] = crc; | |
282 | + } | |
283 | + crc_table_built = 1; | |
284 | +} | |
285 | + | |
286 | +static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize) | |
287 | +{ | |
288 | + unsigned char *current_byte; | |
289 | + u32 temp1, temp2, i; | |
290 | + | |
291 | + current_byte = (unsigned char *) buffer; | |
292 | + /* Make sure the crc table is available */ | |
293 | + if (!crc_table_built) | |
294 | + build_crc_table(); | |
295 | + /* Process each byte in the buffer. */ | |
296 | + for (i = 0; i < buffersize; i++) { | |
297 | + temp1 = (crc >> 8) & 0x00FFFFFF; | |
298 | + temp2 = crc_table[(crc ^ (u32) * current_byte) & | |
299 | + (u32) 0xff]; | |
300 | + current_byte++; | |
301 | + crc = temp1 ^ temp2; | |
302 | + } | |
303 | + return crc; | |
304 | +} | |
305 | + | |
306 | +/** | |
307 | + * le_bbr_table_sector_to_cpu | |
308 | + * | |
309 | + * Convert bbr meta data from on-disk (LE) format | |
310 | + * to the native cpu endian format. | |
311 | + **/ | |
312 | +static void le_bbr_table_sector_to_cpu(struct bbr_table *p) | |
313 | +{ | |
314 | + int i; | |
315 | + p->signature = le32_to_cpup(&p->signature); | |
316 | + p->crc = le32_to_cpup(&p->crc); | |
317 | + p->sequence_number = le32_to_cpup(&p->sequence_number); | |
318 | + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt); | |
319 | + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) { | |
320 | + p->entries[i].bad_sect = | |
321 | + le64_to_cpup(&p->entries[i].bad_sect); | |
322 | + p->entries[i].replacement_sect = | |
323 | + le64_to_cpup(&p->entries[i].replacement_sect); | |
324 | + } | |
325 | +} | |
326 | + | |
327 | +/** | |
328 | + * cpu_bbr_table_sector_to_le | |
329 | + * | |
330 | + * Convert bbr meta data from cpu endian format to on-disk (LE) format | |
331 | + **/ | |
332 | +static void cpu_bbr_table_sector_to_le(struct bbr_table * p, | |
333 | + struct bbr_table * le) | |
334 | +{ | |
335 | + int i; | |
336 | + le->signature = cpu_to_le32p(&p->signature); | |
337 | + le->crc = cpu_to_le32p(&p->crc); | |
338 | + le->sequence_number = cpu_to_le32p(&p->sequence_number); | |
339 | + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt); | |
340 | + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) { | |
341 | + le->entries[i].bad_sect = | |
342 | + cpu_to_le64p(&p->entries[i].bad_sect); | |
343 | + le->entries[i].replacement_sect = | |
344 | + cpu_to_le64p(&p->entries[i].replacement_sect); | |
345 | + } | |
346 | +} | |
347 | + | |
348 | +/** | |
349 | + * validate_bbr_table_sector | |
350 | + * | |
351 | + * Check the specified BBR table sector for a valid signature and CRC. If it's | |
352 | + * valid, endian-convert the table sector. | |
353 | + **/ | |
354 | +static int validate_bbr_table_sector(struct bbr_table * p) | |
355 | +{ | |
356 | + int rc = 0; | |
357 | + int org_crc, final_crc; | |
358 | + | |
359 | + if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) { | |
360 | + DMERR("BBR table signature doesn't match!"); | |
361 | + DMERR("Found 0x%x. Expecting 0x%x", | |
362 | + le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE); | |
363 | + rc = -EINVAL; | |
364 | + goto out; | |
365 | + } | |
366 | + | |
367 | + if (!p->crc) { | |
368 | + DMERR("BBR table sector has no CRC!"); | |
369 | + rc = -EINVAL; | |
370 | + goto out; | |
371 | + } | |
372 | + | |
373 | + org_crc = le32_to_cpup(&p->crc); | |
374 | + p->crc = 0; | |
375 | + final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p)); | |
376 | + if (final_crc != org_crc) { | |
377 | + DMERR("CRC failed!"); | |
378 | + DMERR("Found 0x%x. Expecting 0x%x", | |
379 | + org_crc, final_crc); | |
380 | + rc = -EINVAL; | |
381 | + goto out; | |
382 | + } | |
383 | + | |
384 | + p->crc = cpu_to_le32p(&org_crc); | |
385 | + le_bbr_table_sector_to_cpu(p); | |
386 | + | |
387 | +out: | |
388 | + return rc; | |
389 | +} | |
390 | + | |
391 | +/** | |
392 | + * bbr_binary_tree_insert | |
393 | + * | |
394 | + * Insert a node into the binary tree. | |
395 | + **/ | |
396 | +static void bbr_binary_tree_insert(struct bbr_runtime_remap **root, | |
397 | + struct bbr_runtime_remap *newnode) | |
398 | +{ | |
399 | + struct bbr_runtime_remap **node = root; | |
400 | + while (node && *node) { | |
401 | + if (newnode->remap.bad_sect > (*node)->remap.bad_sect) { | |
402 | + node = &((*node)->right); | |
403 | + } else { | |
404 | + node = &((*node)->left); | |
405 | + } | |
406 | + } | |
407 | + | |
408 | + newnode->left = newnode->right = NULL; | |
409 | + *node = newnode; | |
410 | +} | |
411 | + | |
412 | +/** | |
413 | + * bbr_binary_search | |
414 | + * | |
415 | + * Search for a node that contains bad_sect == lsn. | |
416 | + **/ | |
417 | +static struct bbr_runtime_remap * bbr_binary_search( | |
418 | + struct bbr_runtime_remap *root, | |
419 | + u64 lsn) | |
420 | +{ | |
421 | + struct bbr_runtime_remap *node = root; | |
422 | + while (node) { | |
423 | + if (node->remap.bad_sect == lsn) { | |
424 | + break; | |
425 | + } | |
426 | + if (lsn > node->remap.bad_sect) { | |
427 | + node = node->right; | |
428 | + } else { | |
429 | + node = node->left; | |
430 | + } | |
431 | + } | |
432 | + return node; | |
433 | +} | |
434 | + | |
435 | +/** | |
436 | + * bbr_binary_tree_destroy | |
437 | + * | |
438 | + * Destroy the binary tree. | |
439 | + **/ | |
440 | +static void bbr_binary_tree_destroy(struct bbr_runtime_remap * root, | |
441 | + struct bbr_private * bbr_id) | |
442 | +{ | |
443 | + struct bbr_runtime_remap **link = NULL; | |
444 | + struct bbr_runtime_remap *node = root; | |
445 | + | |
446 | + while (node) { | |
447 | + if (node->left) { | |
448 | + link = &(node->left); | |
449 | + node = node->left; | |
450 | + continue; | |
451 | + } | |
452 | + if (node->right) { | |
453 | + link = &(node->right); | |
454 | + node = node->right; | |
455 | + continue; | |
456 | + } | |
457 | + | |
458 | + mempool_free(node, bbr_remap_pool); | |
459 | + if (node == root) { | |
460 | + /* If root is deleted, we're done. */ | |
461 | + break; | |
462 | + } | |
463 | + | |
464 | + /* Back to root. */ | |
465 | + node = root; | |
466 | + *link = NULL; | |
467 | + } | |
468 | +} | |
469 | + | |
470 | +static void bbr_free_remap(struct bbr_private * bbr_id) | |
471 | +{ | |
472 | + spin_lock_irq(&bbr_id->bbr_id_lock); | |
473 | + bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id); | |
474 | + bbr_id->remap_root = NULL; | |
475 | + spin_unlock_irq(&bbr_id->bbr_id_lock); | |
476 | +} | |
477 | + | |
478 | +/** | |
479 | + * bbr_insert_remap_entry | |
480 | + * | |
481 | + * Create a new remap entry and add it to the binary tree for this node. | |
482 | + **/ | |
483 | +static int bbr_insert_remap_entry(struct bbr_private *bbr_id, | |
484 | + struct bbr_table_entry *new_bbr_entry) | |
485 | +{ | |
486 | + struct bbr_runtime_remap *newnode; | |
487 | + | |
488 | + newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO); | |
489 | + if (!newnode) { | |
490 | + DMERR("Could not allocate from remap mempool!"); | |
491 | + return -ENOMEM; | |
492 | + } | |
493 | + newnode->remap.bad_sect = new_bbr_entry->bad_sect; | |
494 | + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect; | |
495 | + spin_lock_irq(&bbr_id->bbr_id_lock); | |
496 | + bbr_binary_tree_insert(&bbr_id->remap_root, newnode); | |
497 | + spin_unlock_irq(&bbr_id->bbr_id_lock); | |
498 | + return 0; | |
499 | +} | |
500 | + | |
501 | +/** | |
502 | + * bbr_table_to_remap_list | |
503 | + * | |
504 | + * The on-disk bbr table is sorted by the replacement sector LBA. In order to | |
505 | + * improve run time performance, the in memory remap list must be sorted by | |
506 | + * the bad sector LBA. This function is called at discovery time to initialize | |
507 | + * the remap list. This function assumes that at least one copy of meta data | |
508 | + * is valid. | |
509 | + **/ | |
510 | +static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id) | |
511 | +{ | |
512 | + u32 in_use_blks = 0; | |
513 | + int i, j; | |
514 | + struct bbr_table *p; | |
515 | + | |
516 | + for (i = 0, p = bbr_id->bbr_table; | |
517 | + i < bbr_id->nr_sects_bbr_table; | |
518 | + i++, p++) { | |
519 | + if (!p->in_use_cnt) { | |
520 | + break; | |
521 | + } | |
522 | + in_use_blks += p->in_use_cnt; | |
523 | + for (j = 0; j < p->in_use_cnt; j++) { | |
524 | + bbr_insert_remap_entry(bbr_id, &p->entries[j]); | |
525 | + } | |
526 | + } | |
527 | + if (in_use_blks) { | |
528 | + DMWARN("There are %u BBR entries for device %s", | |
529 | + in_use_blks, dm_kdevname(bbr_id->dev->dev)); | |
530 | + } | |
531 | + | |
532 | + return in_use_blks; | |
533 | +} | |
534 | + | |
535 | +/** | |
536 | + * bbr_search_remap_entry | |
537 | + * | |
538 | + * Search remap entry for the specified sector. If found, return a pointer to | |
539 | + * the table entry. Otherwise, return NULL. | |
540 | + **/ | |
541 | +static struct bbr_table_entry * bbr_search_remap_entry( | |
542 | + struct bbr_private *bbr_id, | |
543 | + u64 lsn) | |
544 | +{ | |
545 | + struct bbr_runtime_remap *p; | |
546 | + | |
547 | + spin_lock_irq(&bbr_id->bbr_id_lock); | |
548 | + p = bbr_binary_search(bbr_id->remap_root, lsn); | |
549 | + spin_unlock_irq(&bbr_id->bbr_id_lock); | |
550 | + if (p) { | |
551 | + return (&p->remap); | |
552 | + } else { | |
553 | + return NULL; | |
554 | + } | |
555 | +} | |
556 | + | |
557 | +/** | |
558 | + * bbr_remap | |
559 | + * | |
560 | + * If *lsn is in the remap table, return TRUE and modify *lsn, | |
561 | + * else, return FALSE. | |
562 | + **/ | |
563 | +static inline int bbr_remap(struct bbr_private *bbr_id, | |
564 | + u64 *lsn) | |
565 | +{ | |
566 | + struct bbr_table_entry *e; | |
567 | + | |
568 | + if (atomic_read(&bbr_id->in_use_replacement_blks)) { | |
569 | + e = bbr_search_remap_entry(bbr_id, *lsn); | |
570 | + if (e) { | |
571 | + *lsn = e->replacement_sect; | |
572 | + return 1; | |
573 | + } | |
574 | + } | |
575 | + return 0; | |
576 | +} | |
577 | + | |
578 | +/** | |
579 | + * bbr_remap_probe | |
580 | + * | |
581 | + * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap | |
582 | + * table, return TRUE. Else, return FALSE. | |
583 | + **/ | |
584 | +static inline int bbr_remap_probe(struct bbr_private * bbr_id, | |
585 | + u64 lsn, u64 nr_sects) | |
586 | +{ | |
587 | + u64 tmp, cnt; | |
588 | + | |
589 | + if (atomic_read(&bbr_id->in_use_replacement_blks)) { | |
590 | + for (cnt = 0, tmp = lsn; | |
591 | + cnt < nr_sects; | |
592 | + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) { | |
593 | + if (bbr_remap(bbr_id,&tmp)) { | |
594 | + return 1; | |
595 | + } | |
596 | + } | |
597 | + } | |
598 | + return 0; | |
599 | +} | |
600 | + | |
601 | +/** | |
602 | + * bbr_setup | |
603 | + * | |
604 | + * Read the remap tables from disk and set up the initial remap tree. | |
605 | + **/ | |
606 | +static int bbr_setup(struct bbr_private *bbr_id) | |
607 | +{ | |
608 | + struct bbr_table *table = bbr_id->bbr_table; | |
609 | + struct page *page; | |
610 | + struct io_region job; | |
611 | + unsigned int error, offset; | |
612 | + int i, rc = 0; | |
613 | + | |
614 | + job.dev = bbr_id->dev->dev; | |
615 | + job.count = 1; | |
616 | + | |
617 | + /* Read and verify each BBR table sector individually. */ | |
618 | + for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) { | |
619 | + job.sector = bbr_id->lba_table1 + i; | |
620 | + page = virt_to_page(table); | |
621 | + offset = (unsigned long)table & ~PAGE_MASK; | |
622 | + rc = dm_io_sync(1, &job, READ, page, offset, &error); | |
623 | + if (rc && bbr_id->lba_table2) { | |
624 | + job.sector = bbr_id->lba_table2 + i; | |
625 | + rc = dm_io_sync(1, &job, READ, page, offset, &error); | |
626 | + } | |
627 | + if (rc) { | |
628 | + goto out; | |
629 | + } | |
630 | + | |
631 | + rc = validate_bbr_table_sector(table); | |
632 | + if (rc) { | |
633 | + goto out; | |
634 | + } | |
635 | + } | |
636 | + atomic_set(&bbr_id->in_use_replacement_blks, | |
637 | + bbr_table_to_remap_list(bbr_id)); | |
638 | + | |
639 | +out: | |
640 | + if (rc) { | |
641 | + DMERR("dm-bbr: error during device setup: %d", rc); | |
642 | + } | |
643 | + return rc; | |
644 | +} | |
645 | + | |
646 | +static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id, | |
647 | + struct buffer_head * bh, | |
648 | + int rw) | |
649 | +{ | |
650 | + struct bbr_io_buffer * bbr_io_buf; | |
651 | + | |
652 | + bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO); | |
653 | + if (bbr_io_buf) { | |
654 | + memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer)); | |
655 | + INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list); | |
656 | + bbr_io_buf->bbr_id = bbr_id; | |
657 | + bbr_io_buf->sector = bh->b_rsector; | |
658 | + bbr_io_buf->bh = bh; | |
659 | + bbr_io_buf->rw = rw; | |
660 | + } else { | |
661 | + DMWARN("Could not allocate from BBR I/O buffer pool!"); | |
662 | + } | |
663 | + return bbr_io_buf; | |
664 | +} | |
665 | + | |
666 | +static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf) | |
667 | +{ | |
668 | + mempool_free(bbr_io_buf, bbr_io_buf_pool); | |
669 | +} | |
670 | + | |
671 | +/** | |
672 | + * bbr_io_remap_error | |
673 | + * @bbr_id: Private data for the BBR node. | |
674 | + * @rw: READ or WRITE. | |
675 | + * @starting_lsn: Starting sector of request to remap. | |
676 | + * @count: Number of sectors in the request. | |
677 | + * @buffer: Data buffer for the request. | |
678 | + * | |
679 | + * For the requested range, try to write each sector individually. For each | |
680 | + * sector that fails, find the next available remap location and write the | |
681 | + * data to that new location. Then update the table and write both copies | |
682 | + * of the table to disk. Finally, update the in-memory mapping and do any | |
683 | + * other necessary bookkeeping. | |
684 | + **/ | |
685 | +static int bbr_io_remap_error(struct bbr_private *bbr_id, | |
686 | + int rw, | |
687 | + u64 starting_lsn, | |
688 | + u64 count, | |
689 | + char *buffer) | |
690 | +{ | |
691 | + struct bbr_table *bbr_table; | |
692 | + struct io_region job; | |
693 | + struct page *page; | |
694 | + unsigned long table_sector_index; | |
695 | + unsigned long table_sector_offset; | |
696 | + unsigned long index; | |
697 | + unsigned int offset_in_page, error; | |
698 | + u64 lsn, new_lsn; | |
699 | + int rc; | |
700 | + | |
701 | + if (rw == READ) { | |
702 | + /* Nothing can be done about read errors. */ | |
703 | + return -EIO; | |
704 | + } | |
705 | + | |
706 | + job.dev = bbr_id->dev->dev; | |
707 | + job.count = 1; | |
708 | + | |
709 | + /* For each sector in the request. */ | |
710 | + for (lsn = 0; lsn < count; lsn++, buffer += SECTOR_SIZE) { | |
711 | + job.sector = starting_lsn + lsn; | |
712 | + page = virt_to_page(buffer); | |
713 | + offset_in_page = (unsigned long)buffer & ~PAGE_MASK; | |
714 | + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error); | |
715 | + while (rc) { | |
716 | + /* Find the next available relocation sector. */ | |
717 | + new_lsn = atomic_read(&bbr_id->in_use_replacement_blks); | |
718 | + if (new_lsn >= bbr_id->nr_replacement_blks) { | |
719 | + /* No more replacement sectors available. */ | |
720 | + return -EIO; | |
721 | + } | |
722 | + new_lsn += bbr_id->start_replacement_sect; | |
723 | + | |
724 | + /* Write the data to its new location. */ | |
725 | + DMWARN("dm-bbr: device %s: Trying to remap bad sector "PFU64" to sector "PFU64, | |
726 | + dm_kdevname(bbr_id->dev->dev), | |
727 | + starting_lsn + lsn, new_lsn); | |
728 | + job.sector = new_lsn; | |
729 | + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error); | |
730 | + if (rc) { | |
731 | + /* This replacement sector is bad. | |
732 | + * Try the next one. | |
733 | + */ | |
734 | + DMERR("dm-bbr: device %s: replacement sector "PFU64" is bad. Skipping.", | |
735 | + dm_kdevname(bbr_id->dev->dev), new_lsn); | |
736 | + atomic_inc(&bbr_id->in_use_replacement_blks); | |
737 | + continue; | |
738 | + } | |
739 | + | |
740 | + /* Add this new entry to the on-disk table. */ | |
741 | + table_sector_index = new_lsn - | |
742 | + bbr_id->start_replacement_sect; | |
743 | + table_sector_offset = table_sector_index / | |
744 | + BBR_ENTRIES_PER_SECT; | |
745 | + index = table_sector_index % BBR_ENTRIES_PER_SECT; | |
746 | + | |
747 | + bbr_table = &bbr_id->bbr_table[table_sector_offset]; | |
748 | + bbr_table->entries[index].bad_sect = starting_lsn + lsn; | |
749 | + bbr_table->entries[index].replacement_sect = new_lsn; | |
750 | + bbr_table->in_use_cnt++; | |
751 | + bbr_table->sequence_number++; | |
752 | + bbr_table->crc = 0; | |
753 | + bbr_table->crc = calculate_crc(INITIAL_CRC, | |
754 | + bbr_table, | |
755 | + sizeof(struct bbr_table)); | |
756 | + | |
757 | + /* Write the table to disk. */ | |
758 | + cpu_bbr_table_sector_to_le(bbr_table, bbr_table); | |
759 | + page = virt_to_page(bbr_table); | |
760 | + offset_in_page = (unsigned long)bbr_table & ~PAGE_MASK; | |
761 | + if (bbr_id->lba_table1) { | |
762 | + job.sector = bbr_id->lba_table1 + table_sector_offset; | |
763 | + rc = dm_io_sync(1, &job, WRITE, page, offset_in_page, &error); | |
764 | + } | |
765 | + if (bbr_id->lba_table2) { | |
766 | + job.sector = bbr_id->lba_table2 + table_sector_offset; | |
767 | + rc |= dm_io_sync(1, &job, WRITE, page, offset_in_page, &error); | |
768 | + } | |
769 | + le_bbr_table_sector_to_cpu(bbr_table); | |
770 | + | |
771 | + if (rc) { | |
772 | + /* Error writing one of the tables to disk. */ | |
773 | + DMERR("dm-bbr: device %s: error updating BBR tables on disk.", | |
774 | + dm_kdevname(bbr_id->dev->dev)); | |
775 | + return rc; | |
776 | + } | |
777 | + | |
778 | + /* Insert a new entry in the remapping binary-tree. */ | |
779 | + rc = bbr_insert_remap_entry(bbr_id, | |
780 | + &bbr_table->entries[index]); | |
781 | + if (rc) { | |
782 | + DMERR("dm-bbr: device %s: error adding new entry to remap tree.", | |
783 | + dm_kdevname(bbr_id->dev->dev)); | |
784 | + return rc; | |
785 | + } | |
786 | + | |
787 | + atomic_inc(&bbr_id->in_use_replacement_blks); | |
788 | + } | |
789 | + } | |
790 | + | |
791 | + return 0; | |
792 | +} | |
793 | + | |
794 | +/** | |
795 | + * bbr_io_process_request | |
796 | + * | |
797 | + * For each sector in this request, check if the sector has already | |
798 | + * been remapped. If so, process all previous sectors in the request, | |
799 | + * followed by the remapped sector. Then reset the starting lsn and | |
800 | + * count, and keep going with the rest of the request as if it were | |
801 | + * a whole new request. If any of the sync_io's return an error, | |
802 | + * call the remapper to relocate the bad sector(s). | |
803 | + **/ | |
804 | +static int bbr_io_process_request(struct bbr_io_buffer *bbr_io_buf) | |
805 | +{ | |
806 | + struct bbr_private *bbr_id = bbr_io_buf->bbr_id; | |
807 | + struct io_region job; | |
808 | + u64 starting_lsn = bbr_io_buf->sector; | |
809 | + u64 count = bbr_io_buf->bh->b_size >> SECTOR_SHIFT; | |
810 | + u64 lsn, remapped_lsn; | |
811 | + char *buffer = bbr_io_buf->bh->b_data; | |
812 | + struct page *page = virt_to_page(buffer); | |
813 | + unsigned int offset_in_page = (unsigned long)buffer & ~PAGE_MASK; | |
814 | + unsigned int error; | |
815 | + int rw = bbr_io_buf->rw; | |
816 | + int rc = 0; | |
817 | + | |
818 | + job.dev = bbr_id->dev->dev; | |
819 | + | |
820 | + /* For each sector in this request, check if this sector has | |
821 | + * already been remapped. If so, process all previous sectors | |
822 | + * in this request, followed by the remapped sector. Then reset | |
823 | + * the starting lsn and count and keep going with the rest of | |
824 | + * the request as if it were a whole new request. | |
825 | + */ | |
826 | + for (lsn = 0; lsn < count; lsn++) { | |
827 | + remapped_lsn = starting_lsn + lsn; | |
828 | + rc = bbr_remap(bbr_id, &remapped_lsn); | |
829 | + if (!rc) { | |
830 | + /* This sector is fine. */ | |
831 | + continue; | |
832 | + } | |
833 | + | |
834 | + /* Process all sectors in the request up to this one. */ | |
835 | + if (lsn > 0) { | |
836 | + job.sector = starting_lsn; | |
837 | + job.count = lsn; | |
838 | + rc = dm_io_sync(1, &job, rw, page, | |
839 | + offset_in_page, &error); | |
840 | + if (rc) { | |
841 | + /* If this I/O failed, then one of the | |
842 | + * sectors in this request needs to be | |
843 | + * relocated. | |
844 | + */ | |
845 | + rc = bbr_io_remap_error(bbr_id, rw, | |
846 | + starting_lsn, | |
847 | + lsn, buffer); | |
848 | + if (rc) { | |
849 | + return rc; | |
850 | + } | |
851 | + } | |
852 | + buffer += (lsn << SECTOR_SHIFT); | |
853 | + page = virt_to_page(buffer); | |
854 | + offset_in_page = (unsigned long)buffer & ~PAGE_MASK; | |
855 | + } | |
856 | + | |
857 | + /* Process the remapped sector. */ | |
858 | + job.sector = remapped_lsn; | |
859 | + job.count = 1; | |
860 | + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error); | |
861 | + if (rc) { | |
862 | + /* BUGBUG - Need more processing if this caused | |
863 | + * an error. If this I/O failed, then the | |
864 | + * existing remap is now bad, and we need to | |
865 | + * find a new remap. Can't use | |
866 | + * bbr_io_remap_error(), because the existing | |
867 | + * map entry needs to be changed, not added | |
868 | + * again, and the original table entry also | |
869 | + * needs to be changed. | |
870 | + */ | |
871 | + return rc; | |
872 | + } | |
873 | + | |
874 | + buffer += SECTOR_SIZE; | |
875 | + starting_lsn += (lsn + 1); | |
876 | + count -= (lsn + 1); | |
877 | + lsn = -1; | |
878 | + page = virt_to_page(buffer); | |
879 | + offset_in_page = (unsigned long)buffer & ~PAGE_MASK; | |
880 | + } | |
881 | + | |
882 | + /* Check for any remaining sectors after the last split. This | |
883 | + * could potentially be the whole request, but that should be a | |
884 | + * rare case because requests should only be processed by the | |
885 | + * thread if we know an error occurred or they contained one or | |
886 | + * more remapped sectors. | |
887 | + */ | |
888 | + if (count) { | |
889 | + job.sector = starting_lsn; | |
890 | + job.count = count; | |
891 | + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error); | |
892 | + if (rc) { | |
893 | + /* If this I/O failed, then one of the sectors | |
894 | + * in this request needs to be relocated. | |
895 | + */ | |
896 | + rc = bbr_io_remap_error(bbr_id, rw, starting_lsn, | |
897 | + count, buffer); | |
898 | + if (rc) { | |
899 | + return rc; | |
900 | + } | |
901 | + } | |
902 | + } | |
903 | + | |
904 | + return 0; | |
905 | +} | |
906 | + | |
907 | +/** | |
908 | + * bbr_io_handler | |
909 | + * | |
910 | + * This is the handler for the bbr_io_thread. It continuously loops, | |
911 | + * taking I/O requests off its list and processing them. If nothing | |
912 | + * is on the list, the thread goes back to sleep until specifically | |
913 | + * woken up. | |
914 | + * | |
915 | + * I/O requests should only be sent to this thread if we know that: | |
916 | + * a) the request contains at least one remapped sector. | |
917 | + * or | |
918 | + * b) the request caused an error on the normal I/O path. | |
919 | + * This function uses synchronous I/O, so sending a request to this | |
920 | + * thread that doesn't need special processing will cause severe | |
921 | + * performance degradation. | |
922 | + **/ | |
923 | +static void bbr_io_handler(void) | |
924 | +{ | |
925 | + struct bbr_io_buffer *bbr_io_buf; | |
926 | + struct buffer_head *bh; | |
927 | + unsigned long flags; | |
928 | + int rc; | |
929 | + | |
930 | + while (1) { | |
931 | + /* Process bbr_io_list, one entry at a time. */ | |
932 | + spin_lock_irqsave(&bbr_io_list_lock, flags); | |
933 | + if (list_empty(&bbr_io_list)) { | |
934 | + /* No more items on the list. */ | |
935 | + spin_unlock_irqrestore(&bbr_io_list_lock, flags); | |
936 | + break; | |
937 | + } | |
938 | + bbr_io_buf = list_entry(bbr_io_list.next, | |
939 | + struct bbr_io_buffer, bbr_io_list); | |
940 | + list_del_init(&bbr_io_buf->bbr_io_list); | |
941 | + spin_unlock_irqrestore(&bbr_io_list_lock, flags); | |
942 | + | |
943 | + rc = bbr_io_process_request(bbr_io_buf); | |
944 | + | |
945 | + /* Clean up and complete the original I/O. */ | |
946 | + bbr_io_buf->flags |= BBR_IO_HANDLED; | |
947 | + bh = bbr_io_buf->bh; | |
948 | + if (bh->b_end_io) { | |
949 | + /* If this was the bbr_io_buf for an error on the | |
950 | + * normal WRITE, don't free it here. It will be | |
951 | + * freed later in bbr_callback() | |
952 | + */ | |
953 | + if (!(bbr_io_buf->flags & BBR_IO_RELOCATE)) | |
954 | + free_bbr_io_buf(bbr_io_buf); | |
955 | + bh->b_end_io(bh, rc ? 0 : 1); | |
956 | + } | |
957 | + } | |
958 | +} | |
959 | + | |
960 | +/** | |
961 | + * bbr_schedule_io | |
962 | + * | |
963 | + * Place the specified bbr_io_buf on the thread's processing list. | |
964 | + **/ | |
965 | +static void bbr_schedule_io(struct bbr_io_buffer *bbr_io_buf) | |
966 | +{ | |
967 | + unsigned long flags; | |
968 | + spin_lock_irqsave(&bbr_io_list_lock, flags); | |
969 | + list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list); | |
970 | + spin_unlock_irqrestore(&bbr_io_list_lock, flags); | |
971 | + dm_daemon_wake(bbr_io_thread); | |
972 | +} | |
973 | + | |
974 | +/** | |
975 | + * bbr_read | |
976 | + * | |
977 | + * If there are any remapped sectors on this object, send this request over | |
978 | + * to the thread for processing. Otherwise send it down the stack normally. | |
979 | + **/ | |
980 | +static int bbr_read(struct bbr_private *bbr_id, | |
981 | + struct buffer_head *bh) | |
982 | +{ | |
983 | + struct bbr_io_buffer *bbr_io_buf; | |
984 | + | |
985 | + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 || | |
986 | + !bbr_remap_probe(bbr_id, bh->b_rsector, | |
987 | + bh->b_size >> SECTOR_SHIFT)) { | |
988 | + /* No existing remaps or this request doesn't | |
989 | + * contain any remapped sectors. | |
990 | + */ | |
991 | + bh->b_rdev = bbr_id->dev->dev; | |
992 | + return 1; | |
993 | + } | |
994 | + | |
995 | + /* This request has at least one remapped sector. */ | |
996 | + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ); | |
997 | + if (!bbr_io_buf) { | |
998 | + /* Can't get memory to track the I/O. */ | |
999 | + return -ENOMEM; | |
1000 | + } | |
1001 | + | |
1002 | + bbr_schedule_io(bbr_io_buf); | |
1003 | + return 0; | |
1004 | +} | |
1005 | + | |
1006 | +/** | |
1007 | + * bbr_callback | |
1008 | + * | |
1009 | + * This is the callback for normal write requests. Check for an error | |
1010 | + * during the I/O, and send to the thread for processing if necessary. | |
1011 | + **/ | |
1012 | +static int bbr_callback(struct dm_target *ti, struct buffer_head *bh, int rw, | |
1013 | + int error, union map_info *map_context) | |
1014 | +{ | |
1015 | + struct bbr_io_buffer *bbr_io_buf = map_context->ptr; | |
1016 | + | |
1017 | + if (!bbr_io_buf) | |
1018 | + return error; | |
1019 | + | |
1020 | + /* Will try to relocate the WRITE if: | |
1021 | + * - It is an error, and | |
1022 | + * - It is not an error of BBR relocation, and | |
1023 | + */ | |
1024 | + if (error && !(bbr_io_buf->flags & BBR_IO_HANDLED)) { | |
1025 | + DMERR("dm-bbr: device %s: Write failure on sector %lu. Scheduling for retry.", | |
1026 | + dm_kdevname(bh->b_rdev), | |
1027 | + (unsigned long)bbr_io_buf->sector); | |
1028 | + /* Indicate this bbr_io_buf is for an error on normal WRITE */ | |
1029 | + bbr_io_buf->flags |= BBR_IO_RELOCATE; | |
1030 | + bbr_schedule_io(bbr_io_buf); | |
1031 | + /* Returns >0 so that DM will let us retry the I/O */ | |
1032 | + return 1; | |
1033 | + } | |
1034 | + | |
1035 | + free_bbr_io_buf(bbr_io_buf); | |
1036 | + return error; | |
1037 | +} | |
1038 | + | |
1039 | +/** | |
1040 | + * bbr_write | |
1041 | + * | |
1042 | + * If there are any remapped sectors on this object, send the request over | |
1043 | + * to the thread for processing. Otherwise, register for callback | |
1044 | + * notification, and send the request down normally. | |
1045 | + **/ | |
1046 | +static int bbr_write(struct bbr_private *bbr_id, | |
1047 | + struct buffer_head *bh, | |
1048 | + union map_info *map_context) | |
1049 | +{ | |
1050 | + struct bbr_io_buffer *bbr_io_buf; | |
1051 | + int rc = 1; | |
1052 | + | |
1053 | + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE); | |
1054 | + if (!bbr_io_buf) { | |
1055 | + /* Can't get memory to track the I/O. */ | |
1056 | + return -ENOMEM; | |
1057 | + } | |
1058 | + | |
1059 | + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 || | |
1060 | + !bbr_remap_probe(bbr_id, bh->b_rsector, | |
1061 | + bh->b_size >> SECTOR_SHIFT)) { | |
1062 | + /* No existing remaps or this request | |
1063 | + * contains no remapped sectors. | |
1064 | + */ | |
1065 | + bh->b_rdev = bbr_id->dev->dev; | |
1066 | + map_context->ptr = bbr_io_buf; | |
1067 | + } else { | |
1068 | + /* This request contains at least one remapped sector. */ | |
1069 | + bbr_schedule_io(bbr_io_buf); | |
1070 | + rc = 0; | |
1071 | + } | |
1072 | + | |
1073 | + return rc; | |
1074 | +} | |
1075 | + | |
1076 | +/** | |
1077 | + * Construct a bbr mapping | |
1078 | + **/ | |
1079 | +static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |
1080 | +{ | |
1081 | + struct bbr_private *bbr_id; | |
1082 | + unsigned long block_size; | |
1083 | + char *end; | |
1084 | + int rc = -EINVAL; | |
1085 | + | |
1086 | + if (argc != 8) { | |
1087 | + ti->error = "dm-bbr requires exactly 8 arguments: " | |
1088 | + "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size"; | |
1089 | + goto out1; | |
1090 | + } | |
1091 | + | |
1092 | + bbr_id = bbr_alloc_private(); | |
1093 | + if (!bbr_id) { | |
1094 | + ti->error = "dm-bbr: Error allocating bbr private data."; | |
1095 | + goto out1; | |
1096 | + } | |
1097 | + | |
1098 | + bbr_id->offset = simple_strtoull(argv[1], &end, 10); | |
1099 | + bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10); | |
1100 | + bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10); | |
1101 | + bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10); | |
1102 | + bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10); | |
1103 | + bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10); | |
1104 | + block_size = simple_strtoul(argv[7], &end, 10); | |
1105 | + bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT); | |
1106 | + | |
1107 | + bbr_id->bbr_table = kmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT, | |
1108 | + GFP_KERNEL); | |
1109 | + if (!bbr_id->bbr_table) { | |
1110 | + ti->error = "dm-bbr: Error allocating bbr table."; | |
1111 | + goto out2; | |
1112 | + } | |
1113 | + | |
1114 | + if (dm_get_device(ti, argv[0], 0, ti->len, | |
1115 | + dm_table_get_mode(ti->table), &bbr_id->dev)) { | |
1116 | + ti->error = "dm-bbr: Device lookup failed"; | |
1117 | + goto out2; | |
1118 | + } | |
1119 | + | |
1120 | + /* Using a semaphore here is probably overkill, | |
1121 | + * but at least it will be correct. | |
1122 | + */ | |
1123 | + down(&bbr_instances_lock); | |
1124 | + if (bbr_instances == 0) { | |
1125 | + rc = bbr_global_init(); | |
1126 | + if (rc) { | |
1127 | + up(&bbr_instances_lock); | |
1128 | + goto out3; | |
1129 | + } | |
1130 | + } | |
1131 | + bbr_instances++; | |
1132 | + up(&bbr_instances_lock); | |
1133 | + | |
1134 | + rc = bbr_setup(bbr_id); | |
1135 | + if (rc) { | |
1136 | + ti->error = "dm-bbr: Device setup failed"; | |
1137 | + goto out4; | |
1138 | + } | |
1139 | + | |
1140 | + ti->private = bbr_id; | |
1141 | + return 0; | |
1142 | + | |
1143 | +out4: | |
1144 | + down(&bbr_instances_lock); | |
1145 | + bbr_instances--; | |
1146 | + if (bbr_instances == 0) { | |
1147 | + bbr_global_cleanup(); | |
1148 | + } | |
1149 | + up(&bbr_instances_lock); | |
1150 | + | |
1151 | +out3: | |
1152 | + dm_put_device(ti, bbr_id->dev); | |
1153 | +out2: | |
1154 | + bbr_free_private(bbr_id); | |
1155 | +out1: | |
1156 | + return rc; | |
1157 | +} | |
1158 | + | |
1159 | +static void bbr_dtr(struct dm_target *ti) | |
1160 | +{ | |
1161 | + struct bbr_private *bbr_id = ti->private; | |
1162 | + | |
1163 | + dm_put_device(ti, bbr_id->dev); | |
1164 | + bbr_free_private(bbr_id); | |
1165 | + | |
1166 | + down(&bbr_instances_lock); | |
1167 | + bbr_instances--; | |
1168 | + if (bbr_instances == 0) { | |
1169 | + bbr_global_cleanup(); | |
1170 | + } | |
1171 | + up(&bbr_instances_lock); | |
1172 | +} | |
1173 | + | |
1174 | +static int bbr_map(struct dm_target *ti, struct buffer_head *bh, int rw, | |
1175 | + union map_info *map_context) | |
1176 | +{ | |
1177 | + struct bbr_private *bbr_id = ti->private; | |
1178 | + | |
1179 | + bh->b_rsector += bbr_id->offset; | |
1180 | + map_context->ptr = NULL; | |
1181 | + switch (rw) { | |
1182 | + case READ: | |
1183 | + case READA: | |
1184 | + return bbr_read(bbr_id, bh); | |
1185 | + case WRITE: | |
1186 | + return bbr_write(bbr_id, bh, map_context); | |
1187 | + default: | |
1188 | + return -EIO; | |
1189 | + } | |
1190 | +} | |
1191 | + | |
1192 | +static int bbr_status(struct dm_target *ti, status_type_t type, | |
1193 | + char *result, unsigned int maxlen) | |
1194 | +{ | |
1195 | + struct bbr_private *bbr_id = ti->private; | |
1196 | + | |
1197 | + switch (type) { | |
1198 | + case STATUSTYPE_INFO: | |
1199 | + result[0] = '\0'; | |
1200 | + break; | |
1201 | + | |
1202 | + case STATUSTYPE_TABLE: | |
1203 | + snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u", | |
1204 | + dm_kdevname(bbr_id->dev->dev), | |
1205 | + bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2, | |
1206 | + bbr_id->nr_sects_bbr_table, | |
1207 | + bbr_id->start_replacement_sect, | |
1208 | + bbr_id->nr_replacement_blks, | |
1209 | + bbr_id->blksize_in_sects << SECTOR_SHIFT); | |
1210 | + break; | |
1211 | + } | |
1212 | + return 0; | |
1213 | +} | |
1214 | + | |
1215 | +static struct target_type bbr_target = { | |
1216 | + name: "bbr", | |
1217 | + module: THIS_MODULE, | |
1218 | + ctr: bbr_ctr, | |
1219 | + dtr: bbr_dtr, | |
1220 | + map: bbr_map, | |
1221 | + end_io: bbr_callback, | |
1222 | + status: bbr_status, | |
1223 | +}; | |
1224 | + | |
1225 | +int __init dm_bbr_init(void) | |
1226 | +{ | |
1227 | + int r = dm_register_target(&bbr_target); | |
1228 | + | |
1229 | + if (r < 0) | |
1230 | + DMERR("dm-bbr: register failed %d", r); | |
1231 | + | |
1232 | + return r; | |
1233 | +} | |
1234 | + | |
1235 | +void __exit dm_bbr_exit(void) | |
1236 | +{ | |
1237 | + int r = dm_unregister_target(&bbr_target); | |
1238 | + | |
1239 | + if (r < 0) | |
1240 | + DMERR("dm-bbr: unregister failed %d", r); | |
1241 | +} | |
1242 | + | |
1243 | +module_init(dm_bbr_init); | |
1244 | +module_exit(dm_bbr_exit); | |
1245 | +MODULE_LICENSE("GPL"); | |
1246 | diff -urN linux-2.4.24.org/drivers/md/dm-bbr.h linux-2.4.24/drivers/md/dm-bbr.h | |
1247 | --- linux-2.4.24.org/drivers/md/dm-bbr.h 1970-01-01 01:00:00.000000000 +0100 | |
1248 | +++ linux-2.4.24/drivers/md/dm-bbr.h 2004-01-18 16:03:13.101545929 +0100 | |
1249 | @@ -0,0 +1,143 @@ | |
1250 | +/* | |
1251 | + * (C) Copyright IBM Corp. 2002, 2003 | |
1252 | + * | |
1253 | + * This program is free software; you can redistribute it and/or modify | |
1254 | + * it under the terms of the GNU General Public License as published by | |
1255 | + * the Free Software Foundation; either version 2 of the License, or | |
1256 | + * (at your option) any later version. | |
1257 | + * | |
1258 | + * This program is distributed in the hope that it will be useful, | |
1259 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
1260 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | |
1261 | + * the GNU General Public License for more details. | |
1262 | + * | |
1263 | + * You should have received a copy of the GNU General Public License | |
1264 | + * along with this program; if not, write to the Free Software | |
1265 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
1266 | + * | |
1267 | + * linux/drivers/md/dm-bbr.h | |
1268 | + * | |
1269 | + * Bad-block-relocation (BBR) target for device-mapper. | |
1270 | + * | |
1271 | + * The BBR target is designed to remap I/O write failures to another safe | |
1272 | + * location on disk. Note that most disk drives have BBR built into them, | |
1273 | + * this means that our software BBR will be only activated when all hardware | |
1274 | + * BBR replacement sectors have been used. | |
1275 | + */ | |
1276 | + | |
1277 | +#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */ | |
1278 | +#define BBR_ENTRIES_PER_SECT 31 | |
1279 | +#define BBR_NR_BUFS 128 | |
1280 | +#define INITIAL_CRC 0xFFFFFFFF | |
1281 | +#define CRC_POLYNOMIAL 0xEDB88320L | |
1282 | + | |
1283 | +/** | |
1284 | + * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines. | |
1285 | + * Use these in place of %Ld, %Lu, and %Lx. | |
1286 | + **/ | |
1287 | +#if BITS_PER_LONG > 32 | |
1288 | +#define PFU64 "%lu" | |
1289 | +#else | |
1290 | +#define PFU64 "%Lu" | |
1291 | +#endif | |
1292 | + | |
1293 | +/** | |
1294 | + * struct bbr_table_entry | |
1295 | + * @bad_sect: LBA of bad location. | |
1296 | + * @replacement_sect: LBA of new location. | |
1297 | + * | |
1298 | + * Structure to describe one BBR remap. | |
1299 | + **/ | |
1300 | +struct bbr_table_entry { | |
1301 | + u64 bad_sect; | |
1302 | + u64 replacement_sect; | |
1303 | +}; | |
1304 | + | |
1305 | +/** | |
1306 | + * struct bbr_table | |
1307 | + * @signature: Signature on each BBR table sector. | |
1308 | + * @crc: CRC for this table sector. | |
1309 | + * @sequence_number: Used to resolve conflicts when primary and secondary | |
1310 | + * tables do not match. | |
1311 | + * @in_use_cnt: Number of in-use table entries. | |
1312 | + * @entries: Actual table of remaps. | |
1313 | + * | |
1314 | + * Structure to describe each sector of the metadata table. Each sector in this | |
1315 | + * table can describe 31 remapped sectors. | |
1316 | + **/ | |
1317 | +struct bbr_table { | |
1318 | + u32 signature; | |
1319 | + u32 crc; | |
1320 | + u32 sequence_number; | |
1321 | + u32 in_use_cnt; | |
1322 | + struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT]; | |
1323 | +}; | |
1324 | + | |
1325 | +/** | |
1326 | + * struct bbr_runtime_remap | |
1327 | + * | |
1328 | + * Node in the binary tree used to keep track of remaps. | |
1329 | + **/ | |
1330 | +struct bbr_runtime_remap { | |
1331 | + struct bbr_table_entry remap; | |
1332 | + struct bbr_runtime_remap *left; | |
1333 | + struct bbr_runtime_remap *right; | |
1334 | +}; | |
1335 | + | |
1336 | +/** | |
1337 | + * struct bbr_private | |
1338 | + * @dev: Info about underlying device. | |
1339 | + * @bbr_table: Copy of metadata table. | |
1340 | + * @remap_root: Binary tree containing all remaps. | |
1341 | + * @offset: LBA of data area. | |
1342 | + * @lba_table1: LBA of primary BBR table. | |
1343 | + * @lba_table2: LBA of secondary BBR table. | |
1344 | + * @nr_sects_bbr_table: Size of each BBR table. | |
1345 | + * @nr_replacement_blks: Number of replacement blocks. | |
1346 | + * @start_replacement_sect: LBA of start of replacement blocks. | |
1347 | + * @blksize_in_sects: Size of each block. | |
1348 | + * @in_use_replacement_blks: Current number of remapped blocks. | |
1349 | + * @bbr_id_lock: Lock for the binary tree. | |
1350 | + * | |
1351 | + * Private data for each BBR target. | |
1352 | + **/ | |
1353 | +struct bbr_private { | |
1354 | + struct dm_dev *dev; | |
1355 | + struct bbr_table *bbr_table; | |
1356 | + struct bbr_runtime_remap *remap_root; | |
1357 | + u64 offset; | |
1358 | + u64 lba_table1; | |
1359 | + u64 lba_table2; | |
1360 | + u64 nr_sects_bbr_table; | |
1361 | + u64 start_replacement_sect; | |
1362 | + u64 nr_replacement_blks; | |
1363 | + u32 blksize_in_sects; | |
1364 | + atomic_t in_use_replacement_blks; | |
1365 | + spinlock_t bbr_id_lock; | |
1366 | +}; | |
1367 | + | |
1368 | +#define BBR_IO_HANDLED (1<<0) | |
1369 | +#define BBR_IO_RELOCATE (1<<1) | |
1370 | + | |
1371 | +/** | |
1372 | + * struct bbr_io_buffer | |
1373 | + * @bbr_io_list: Thread's list of bbr_io_buf's. | |
1374 | + * @bbr_id: Object for this request. | |
1375 | + * @bh: Original buffer_head. | |
1376 | + * @sector: Original sector | |
1377 | + * @flags: Operation flag (BBR_IO_*) | |
1378 | + * @rw: READ or WRITE. | |
1379 | + * @rc: Return code from bbr_io_handler. | |
1380 | + * | |
1381 | + * Structure used to track each write request. | |
1382 | + **/ | |
1383 | +struct bbr_io_buffer { | |
1384 | + struct list_head bbr_io_list; | |
1385 | + struct bbr_private *bbr_id; | |
1386 | + struct buffer_head *bh; | |
1387 | + u64 sector; | |
1388 | + u32 flags; | |
1389 | + s32 rw; | |
1390 | + s32 rc; | |
1391 | +}; | |
1392 | + | |
1393 | diff -urN linux-2.4.24.org/drivers/md/dm.c linux-2.4.24/drivers/md/dm.c | |
1394 | --- linux-2.4.24.org/drivers/md/dm.c 2004-01-18 15:09:18.533171353 +0100 | |
1395 | +++ linux-2.4.24/drivers/md/dm.c 2004-01-18 15:59:40.046635861 +0100 | |
1396 | @@ -951,13 +951,23 @@ | |
1397 | int r = 0; | |
1398 | DECLARE_WAITQUEUE(wait, current); | |
1399 | ||
1400 | - down_write(&md->lock); | |
1401 | + /* Flush IO to the origin device */ | |
1402 | + down_read(&md->lock); | |
1403 | + if (test_bit(DMF_BLOCK_IO, &md->flags)) { | |
1404 | + up_read(&md->lock); | |
1405 | + return -EINVAL; | |
1406 | + } | |
1407 | + | |
1408 | + fsync_dev_lockfs(md->dev); | |
1409 | + up_read(&md->lock); | |
1410 | + | |
1411 | ||
1412 | /* | |
1413 | - * First we set the BLOCK_IO flag so no more ios will be | |
1414 | - * mapped. | |
1415 | + * Set the BLOCK_IO flag so no more ios will be mapped. | |
1416 | */ | |
1417 | + down_write(&md->lock); | |
1418 | if (test_bit(DMF_BLOCK_IO, &md->flags)) { | |
1419 | + unlockfs(md->dev); | |
1420 | up_write(&md->lock); | |
1421 | return -EINVAL; | |
1422 | } | |
1423 | @@ -986,6 +996,7 @@ | |
1424 | ||
1425 | /* did we flush everything ? */ | |
1426 | if (atomic_read(&md->pending)) { | |
1427 | + unlockfs(md->dev); | |
1428 | clear_bit(DMF_BLOCK_IO, &md->flags); | |
1429 | r = -EINTR; | |
1430 | } else { | |
1431 | @@ -1017,6 +1028,7 @@ | |
1432 | md->deferred = NULL; | |
1433 | up_write(&md->lock); | |
1434 | ||
1435 | + unlockfs(md->dev); | |
1436 | flush_deferred_io(def); | |
1437 | run_task_queue(&tq_disk); | |
1438 | ||
1439 | diff -urN linux-2.4.24.org/drivers/md/dm-snapshot.c linux-2.4.24/drivers/md/dm-snapshot.c | |
1440 | --- linux-2.4.24.org/drivers/md/dm-snapshot.c 2004-01-18 15:09:18.569163966 +0100 | |
1441 | +++ linux-2.4.24/drivers/md/dm-snapshot.c 2004-01-18 16:02:40.858328124 +0100 | |
1442 | @@ -92,6 +92,9 @@ | |
1443 | ||
1444 | /* List of snapshots for this origin */ | |
1445 | struct list_head snapshots; | |
1446 | + | |
1447 | + /* Count of snapshots and origins referencing this structure. */ | |
1448 | + unsigned int count; | |
1449 | }; | |
1450 | ||
1451 | /* | |
1452 | @@ -155,6 +158,35 @@ | |
1453 | } | |
1454 | ||
1455 | /* | |
1456 | + * Allocate and initialize an origin structure. | |
1457 | + */ | |
1458 | +static struct origin * __alloc_origin(kdev_t dev) | |
1459 | +{ | |
1460 | + struct origin *o = kmalloc(sizeof(*o), GFP_KERNEL); | |
1461 | + if (o) { | |
1462 | + o->dev = dev; | |
1463 | + INIT_LIST_HEAD(&o->hash_list); | |
1464 | + INIT_LIST_HEAD(&o->snapshots); | |
1465 | + __insert_origin(o); | |
1466 | + } | |
1467 | + return o; | |
1468 | +} | |
1469 | + | |
1470 | +static void __get_origin(struct origin *o) | |
1471 | +{ | |
1472 | + o->count++; | |
1473 | +} | |
1474 | + | |
1475 | +static void __put_origin(struct origin *o) | |
1476 | +{ | |
1477 | + o->count--; | |
1478 | + if (o->count == 0) { | |
1479 | + list_del(&o->hash_list); | |
1480 | + kfree(o); | |
1481 | + } | |
1482 | +} | |
1483 | + | |
1484 | +/* | |
1485 | * Make a note of the snapshot and its origin so we can look it | |
1486 | * up when the origin has a write on it. | |
1487 | */ | |
1488 | @@ -168,20 +200,37 @@ | |
1489 | ||
1490 | if (!o) { | |
1491 | /* New origin */ | |
1492 | - o = kmalloc(sizeof(*o), GFP_KERNEL); | |
1493 | + o = __alloc_origin(dev); | |
1494 | if (!o) { | |
1495 | up_write(&_origins_lock); | |
1496 | return -ENOMEM; | |
1497 | } | |
1498 | + } | |
1499 | ||
1500 | - /* Initialise the struct */ | |
1501 | - INIT_LIST_HEAD(&o->snapshots); | |
1502 | - o->dev = dev; | |
1503 | + __get_origin(o); | |
1504 | + list_add_tail(&snap->list, &o->snapshots); | |
1505 | ||
1506 | - __insert_origin(o); | |
1507 | + up_write(&_origins_lock); | |
1508 | + return 0; | |
1509 | +} | |
1510 | + | |
1511 | +static int register_origin(kdev_t dev) | |
1512 | +{ | |
1513 | + struct origin *o; | |
1514 | + | |
1515 | + down_write(&_origins_lock); | |
1516 | + o = __lookup_origin(dev); | |
1517 | + | |
1518 | + if (!o) { | |
1519 | + /* New origin */ | |
1520 | + o = __alloc_origin(dev); | |
1521 | + if (!o) { | |
1522 | + up_write(&_origins_lock); | |
1523 | + return -ENOMEM; | |
1524 | + } | |
1525 | } | |
1526 | ||
1527 | - list_add_tail(&snap->list, &o->snapshots); | |
1528 | + __get_origin(o); | |
1529 | ||
1530 | up_write(&_origins_lock); | |
1531 | return 0; | |
1532 | @@ -195,11 +244,18 @@ | |
1533 | o = __lookup_origin(s->origin->dev); | |
1534 | ||
1535 | list_del(&s->list); | |
1536 | - if (list_empty(&o->snapshots)) { | |
1537 | - list_del(&o->hash_list); | |
1538 | - kfree(o); | |
1539 | - } | |
1540 | + __put_origin(o); | |
1541 | + | |
1542 | + up_write(&_origins_lock); | |
1543 | +} | |
1544 | + | |
1545 | +static void unregister_origin(kdev_t dev) | |
1546 | +{ | |
1547 | + struct origin *o; | |
1548 | ||
1549 | + down_write(&_origins_lock); | |
1550 | + o = __lookup_origin(dev); | |
1551 | + __put_origin(o); | |
1552 | up_write(&_origins_lock); | |
1553 | } | |
1554 | ||
1555 | @@ -524,9 +580,6 @@ | |
1556 | goto bad5; | |
1557 | } | |
1558 | ||
1559 | - /* Flush IO to the origin device */ | |
1560 | - fsync_dev(s->origin->dev); | |
1561 | - | |
1562 | /* Add snapshot to the list of snapshots for this origin */ | |
1563 | if (register_snapshot(s)) { | |
1564 | r = -EINVAL; | |
1565 | @@ -1093,6 +1146,13 @@ | |
1566 | return r; | |
1567 | } | |
1568 | ||
1569 | + r = register_origin(dev->dev); | |
1570 | + if (r) { | |
1571 | + ti->error = "Cannot register origin"; | |
1572 | + dm_put_device(ti, dev); | |
1573 | + return r; | |
1574 | + } | |
1575 | + | |
1576 | ti->private = dev; | |
1577 | return 0; | |
1578 | } | |
1579 | @@ -1100,6 +1160,7 @@ | |
1580 | static void origin_dtr(struct dm_target *ti) | |
1581 | { | |
1582 | struct dm_dev *dev = (struct dm_dev *) ti->private; | |
1583 | + unregister_origin(dev->dev); | |
1584 | dm_put_device(ti, dev); | |
1585 | } | |
1586 | ||
1587 | diff -urN linux-2.4.24.org/drivers/md/dm-sparse.c linux-2.4.24/drivers/md/dm-sparse.c | |
1588 | --- linux-2.4.24.org/drivers/md/dm-sparse.c 1970-01-01 01:00:00.000000000 +0100 | |
1589 | +++ linux-2.4.24/drivers/md/dm-sparse.c 2004-01-18 16:04:48.284615142 +0100 | |
1590 | @@ -0,0 +1,709 @@ | |
1591 | +/* -*- linux-c -*- */ | |
1592 | + | |
1593 | +/* | |
1594 | + * Copyright (c) International Business Machines Corp., 2002 | |
1595 | + * | |
1596 | + * This program is free software; you can redistribute it and/or modify | |
1597 | + * it under the terms of the GNU General Public License as published by | |
1598 | + * the Free Software Foundation; either version 2 of the License, or | |
1599 | + * (at your option) any later version. | |
1600 | + * | |
1601 | + * This program is distributed in the hope that it will be useful, | |
1602 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
1603 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | |
1604 | + * the GNU General Public License for more details. | |
1605 | + * | |
1606 | + * You should have received a copy of the GNU General Public License | |
1607 | + * along with this program; if not, write to the Free Software | |
1608 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
1609 | + * | |
1610 | + * linux/drivers/md/dm-sparse.c | |
1611 | + * | |
1612 | + * Sparse target for device-mapper. | |
1613 | + * | |
1614 | + * This target provides the ability to create a sparse device. This | |
1615 | + * allows a device to pretend to be larger than it really is. | |
1616 | + */ | |
1617 | + | |
1618 | +#include <linux/module.h> | |
1619 | +#include <linux/init.h> | |
1620 | +#include <linux/blkdev.h> | |
1621 | +#include <linux/slab.h> | |
1622 | +#include <linux/mempool.h> | |
1623 | +#include <linux/vmalloc.h> | |
1624 | + | |
1625 | +#include "dm.h" | |
1626 | +#include "dm-io.h" | |
1627 | + | |
1628 | +#define MAX_HASH_CHAIN_ENTRIES 10 | |
1629 | +#define NAME_SIZE 127 | |
1630 | + | |
1631 | +/* Sparse Ioctl | |
1632 | + device | |
1633 | + start | |
1634 | + chunk_size | |
1635 | + chunks | |
1636 | + */ | |
1637 | + | |
1638 | +// Entries in the sparse remapping structure | |
1639 | +struct sparse_hash_entry { | |
1640 | + u64 org_chunk; // Chunk number, not LBA. | |
1641 | + u64 sparse_chunk; // Chunk number, not LBA. | |
1642 | + struct sparse_hash_entry * next; | |
1643 | + struct sparse_hash_entry * prev; | |
1644 | +}; | |
1645 | + | |
1646 | +//Private data structure | |
1647 | +struct sparse_volume { | |
1648 | + struct dm_dev *dev; | |
1649 | + struct rw_semaphore sparse_semaphore; | |
1650 | + struct sparse_hash_entry ** sparse_map; // Hash table of remappings | |
1651 | + struct sparse_hash_entry * free_hash_list; | |
1652 | + kmem_cache_t * hash_slab; | |
1653 | + mempool_t * hash_pool; | |
1654 | + u32 dm_io_flag; | |
1655 | + u32 chunk_size; // Sectors. | |
1656 | + u32 chunk_shift; // Shift value for chunk size. | |
1657 | + u32 num_chunks; // In this volume. | |
1658 | + u32 next_cow_entry; // Index into current COW table. | |
1659 | + u64 current_cow_sector; // LOGICAL sector of current COW table. | |
1660 | + u32 next_free_chunk; // Index of next free chunk (not LBA!). | |
1661 | + u32 hash_table_size; // Size of the hash table for the remap. | |
1662 | + u64 start; | |
1663 | + u64 cow_table[64]; // One sector's worth of COW tables. | |
1664 | +}; | |
1665 | + | |
1666 | +/*************************** OLD SERVICES ****************************/ | |
1667 | + | |
1668 | +/* computes log base 2 of value */ | |
1669 | +inline int log2(u32 value) //ok to change to u32? | |
1670 | +{ | |
1671 | + int result = -1; | |
1672 | + long tmp; //ok to change to long? | |
1673 | + | |
1674 | + if (value) { | |
1675 | + tmp = value; | |
1676 | + result++; | |
1677 | + while (!(tmp & 1)) { | |
1678 | + result++; | |
1679 | + tmp >>= 1; | |
1680 | + } | |
1681 | + if (tmp != 1) { | |
1682 | + result = -2; | |
1683 | + } | |
1684 | + } | |
1685 | + return result; | |
1686 | +} | |
1687 | + | |
1688 | +/********************************* Functions *********************************/ | |
1689 | + | |
1690 | +/***************************** Hash Functions *****************************/ | |
1691 | + | |
1692 | +/* Take and initialize from the free hash list */ | |
1693 | +static struct sparse_hash_entry * | |
1694 | +allocate_sparse_hash_entry( struct sparse_volume * volume, | |
1695 | + u64 org_chunk, | |
1696 | + u64 sparse_chunk ) | |
1697 | +{ | |
1698 | + struct sparse_hash_entry * hash_entry; | |
1699 | + | |
1700 | + hash_entry = volume->free_hash_list; | |
1701 | + if ( hash_entry ) { //should always be the case b/c preallocate these | |
1702 | + volume->free_hash_list = hash_entry->next; | |
1703 | + hash_entry->org_chunk = org_chunk; | |
1704 | + hash_entry->sparse_chunk = sparse_chunk; | |
1705 | + hash_entry->next = NULL; | |
1706 | + hash_entry->prev = NULL; | |
1707 | + } | |
1708 | + | |
1709 | + return hash_entry; | |
1710 | +} | |
1711 | + | |
1712 | +/* | |
1713 | + * This function inserts a new entry into a sparse hash chain, immediately | |
1714 | + * following the specified entry. This function should not be used to add | |
1715 | + * an entry into an empty list, or as the first entry in an existing list. | |
1716 | + * For that case, use insert_sparse_map_entry_at_head(). | |
1717 | + */ | |
1718 | +static int insert_sparse_hash_entry( struct sparse_hash_entry * entry, | |
1719 | + struct sparse_hash_entry * base ) | |
1720 | +{ | |
1721 | + entry->next = base->next; | |
1722 | + entry->prev = base; | |
1723 | + base->next = entry; | |
1724 | + if ( entry->next ) { | |
1725 | + entry->next->prev = entry; | |
1726 | + } | |
1727 | + return 0; | |
1728 | +} | |
1729 | + | |
1730 | +/* | |
1731 | + * This function inserts a new entry into a sparse chain as the first | |
1732 | + * entry in the chain. | |
1733 | + */ | |
1734 | +static int insert_sparse_hash_entry_at_head( struct sparse_hash_entry * entry, | |
1735 | + struct sparse_hash_entry ** head ) | |
1736 | +{ | |
1737 | + entry->next = *head; | |
1738 | + entry->prev = NULL; | |
1739 | + *head = entry; | |
1740 | + if ( entry->next ) { | |
1741 | + entry->next->prev = entry; | |
1742 | + } | |
1743 | + return 0; | |
1744 | +} | |
1745 | + | |
1746 | +/* | |
1747 | + * Delete all items in a single chain in the hash table. | |
1748 | + */ | |
1749 | +static int delete_sparse_hash_chain( struct sparse_volume * vol, | |
1750 | + struct sparse_hash_entry * head ) | |
1751 | +{ | |
1752 | + struct sparse_hash_entry * next; | |
1753 | + | |
1754 | + while ( head ) { | |
1755 | + next = head->next; | |
1756 | + mempool_free( head, vol->hash_pool ); | |
1757 | + head = next; | |
1758 | + } | |
1759 | + return 0; | |
1760 | +} | |
1761 | + | |
1762 | +/* | |
1763 | + * This function will search the hash chain that is anchored at the | |
1764 | + * specified head pointer. If the chunk number is found, a pointer to that | |
1765 | + * entry in the chain is set, and a 1 is returned. If the chunk is not | |
1766 | + * found, a pointer to the previous entry is set and 0 is returned. If the | |
1767 | + * return pointer is NULL, this means either the list is empty, or the | |
1768 | + * specified sector should become the first list item. | |
1769 | + */ | |
1770 | +static int search_sparse_hash_chain( u64 chunk, | |
1771 | + struct sparse_hash_entry * head, | |
1772 | + struct sparse_hash_entry ** result ) | |
1773 | +{ | |
1774 | + struct sparse_hash_entry * curr = head; | |
1775 | + struct sparse_hash_entry * prev = head; | |
1776 | + while ( curr && curr->org_chunk < chunk ) { | |
1777 | + prev = curr; | |
1778 | + curr = curr->next; | |
1779 | + } | |
1780 | + if (!curr) { // Either an empty chain or went off the end of the chain. | |
1781 | + *result = prev; | |
1782 | + return 0; | |
1783 | + } | |
1784 | + else if ( curr->org_chunk != chunk ) { | |
1785 | + *result = curr->prev; | |
1786 | + return 0; | |
1787 | + } | |
1788 | + else { | |
1789 | + *result = curr; | |
1790 | + return 1; | |
1791 | + } | |
1792 | +} | |
1793 | + | |
1794 | +/* | |
1795 | + * This function takes a cow table entry (from the on-disk data), and | |
1796 | + * converts it into an appropriate entry for the sparse map, and | |
1797 | + * inserts it into the appropriate map for the specified volume. | |
1798 | + */ | |
1799 | +static int add_cow_entry_to_sparse_map( u64 org_chunk, | |
1800 | + u64 sparse_chunk, | |
1801 | + struct sparse_volume * volume ) | |
1802 | +{ | |
1803 | + struct sparse_hash_entry * new_entry; | |
1804 | + struct sparse_hash_entry * target_entry; | |
1805 | + u32 hash_value; | |
1806 | + int rc = -EINVAL; | |
1807 | + | |
1808 | + new_entry = allocate_sparse_hash_entry(volume, org_chunk, sparse_chunk); | |
1809 | + if (!new_entry) { | |
1810 | + return -ENOMEM; | |
1811 | + } | |
1812 | + | |
1813 | + hash_value = (long)org_chunk % volume->hash_table_size; | |
1814 | + | |
1815 | + if (! search_sparse_hash_chain( org_chunk, | |
1816 | + volume->sparse_map[hash_value], | |
1817 | + &target_entry ) ) { | |
1818 | + //should always take this path | |
1819 | + | |
1820 | + if ( target_entry ) { | |
1821 | + insert_sparse_hash_entry( new_entry, target_entry ); | |
1822 | + } | |
1823 | + else { | |
1824 | + insert_sparse_hash_entry_at_head | |
1825 | + ( new_entry, &(volume->sparse_map[hash_value]) ); | |
1826 | + } | |
1827 | + rc = 0; | |
1828 | + } | |
1829 | + return rc; | |
1830 | +} | |
1831 | + | |
1832 | +/* | |
1833 | + * Construct the initial hash table state based on | |
1834 | + * existing COW tables on the disk. | |
1835 | + */ | |
1836 | +static int build_sparse_maps(struct sparse_volume * volume) | |
1837 | +{ | |
1838 | + int rc = 0, done = 0; | |
1839 | + struct io_region job; | |
1840 | + struct page * page; | |
1841 | + unsigned int error, offset; | |
1842 | + | |
1843 | + while (!done) { | |
1844 | + | |
1845 | + // Read in one sector's worth of COW tables. | |
1846 | + job.dev = volume->dev->dev; | |
1847 | + job.sector = volume->current_cow_sector; | |
1848 | + job.count = 1; | |
1849 | + page = virt_to_page(volume->cow_table); | |
1850 | + offset = (unsigned long)volume->cow_table & ~PAGE_MASK; | |
1851 | + rc = dm_io_sync(1, &job, READ, page, offset, &error); | |
1852 | + if (rc) { | |
1853 | + return rc; | |
1854 | + } | |
1855 | + | |
1856 | + // Translate every valid COW table entry into | |
1857 | + // a sparse map entry. | |
1858 | + for ( volume->next_cow_entry = 0; | |
1859 | + | |
1860 | + volume->next_cow_entry < (SECTOR_SIZE/sizeof(u64)) && | |
1861 | + volume->cow_table[volume->next_cow_entry] != | |
1862 | + 0xffffffffffffffff; | |
1863 | + | |
1864 | + volume->next_cow_entry++, volume->next_free_chunk++ ) { | |
1865 | + | |
1866 | + if ( (rc = add_cow_entry_to_sparse_map | |
1867 | + ( le64_to_cpu( volume->cow_table[volume->next_cow_entry] ), | |
1868 | + volume->next_free_chunk, volume ))) { | |
1869 | + return( rc ); | |
1870 | + } | |
1871 | + } | |
1872 | + // Move on to the next sector if necessary. | |
1873 | + if ( volume->next_cow_entry == (SECTOR_SIZE/sizeof(u64)) ) { | |
1874 | + volume->current_cow_sector++; | |
1875 | + } | |
1876 | + else { | |
1877 | + done = 1; | |
1878 | + } | |
1879 | + } | |
1880 | + return 0; | |
1881 | +} | |
1882 | + | |
1883 | +/************************* Other Functions ************************/ | |
1884 | + | |
1885 | +/* | |
1886 | + * Function: sparse_remap_chunk | |
1887 | + * | |
1888 | + * This function performs a sector remap on a sparse volume. This should | |
1889 | + * be called from the I/O path, It first determines the base sector | |
1890 | + * of the chunk containing the specified sector, and saves the remainder. | |
1891 | + * Then it performs a search through the sparse map for the specified | |
1892 | + * volume. If a match is found, the sector number is changed to the new | |
1893 | + * value. If no match is found, the value is left the same, meaning the | |
1894 | + * chunk has not been remapped. | |
1895 | + */ | |
1896 | +static int sparse_remap_chunk( struct sparse_volume * sparse_volume, | |
1897 | + u64 * sector ) | |
1898 | +{ | |
1899 | + struct sparse_hash_entry * result; | |
1900 | + u64 chunk; | |
1901 | + u32 hash_value; | |
1902 | + u32 remainder; | |
1903 | + int rc = 1; | |
1904 | + | |
1905 | + down_read(&sparse_volume->sparse_semaphore); | |
1906 | + | |
1907 | + remainder = *sector & (u64)(sparse_volume->chunk_size - 1); | |
1908 | + chunk = *sector >> sparse_volume->chunk_shift; | |
1909 | + hash_value = ((u32)chunk) % sparse_volume->hash_table_size; | |
1910 | + | |
1911 | + if ( search_sparse_hash_chain( chunk, | |
1912 | + sparse_volume->sparse_map[hash_value], | |
1913 | + &result) ) { | |
1914 | + *sector = ( result->sparse_chunk << sparse_volume->chunk_shift ) | |
1915 | + + remainder; | |
1916 | + rc = 0; | |
1917 | + } | |
1918 | + up_read(&sparse_volume->sparse_semaphore); | |
1919 | + return rc; | |
1920 | +} | |
1921 | + | |
1922 | +/* Function: sparse_cow_write | |
1923 | + * | |
1924 | + * Check this sparse node to see if the given sector/chunk has been | |
1925 | + * remapped yet. If it hasn't, create a new hash table entry, update the | |
1926 | + * in-memory COW table, write the COW table to disk. | |
1927 | + */ | |
1928 | + | |
1929 | +static int sparse_cow_write( struct sparse_volume * sparse_volume, | |
1930 | + u64 * sector ) | |
1931 | +{ | |
1932 | + struct sparse_hash_entry * target_entry, * new_map_entry; | |
1933 | + struct io_region job; | |
1934 | + struct page * page; | |
1935 | + char * cow = NULL; | |
1936 | + unsigned int error, offset; | |
1937 | + u64 chunk; | |
1938 | + u32 hash_value = 0; | |
1939 | + u32 remainder; | |
1940 | + int rc; | |
1941 | + | |
1942 | + down_write(&sparse_volume->sparse_semaphore); | |
1943 | + | |
1944 | + remainder = *sector & (u64)(sparse_volume->chunk_size - 1); | |
1945 | + chunk = *sector >> sparse_volume->chunk_shift; | |
1946 | + hash_value = ((u32)chunk) % sparse_volume->hash_table_size; | |
1947 | + | |
1948 | + if ( search_sparse_hash_chain( chunk, | |
1949 | + sparse_volume->sparse_map[hash_value], | |
1950 | + &target_entry) ) { | |
1951 | + *sector = | |
1952 | + ( target_entry->sparse_chunk << sparse_volume->chunk_shift ) | |
1953 | + + remainder; | |
1954 | + rc = 0; | |
1955 | + goto out; | |
1956 | + } | |
1957 | + | |
1958 | + // Is there enough room left on this sparse to remap this chunk? | |
1959 | + if ( sparse_volume->next_free_chunk >= sparse_volume->num_chunks ) { | |
1960 | + DMERR("dm-sparse: full no new remaps allowed\n"); | |
1961 | + rc = -ENOSPC; | |
1962 | + goto out; | |
1963 | + } | |
1964 | + | |
1965 | + // Create and initialize a new hash table entry for the new remap. | |
1966 | + new_map_entry = allocate_sparse_hash_entry | |
1967 | + (sparse_volume, chunk, sparse_volume->next_free_chunk); | |
1968 | + if ( ! new_map_entry ) { | |
1969 | + // Can't get memory for map entry. Disable this sparse. | |
1970 | + DMERR("dm-sparse: memory error allocating hash entry\n"); | |
1971 | + rc = -ENOMEM; | |
1972 | + goto out; | |
1973 | + } | |
1974 | + | |
1975 | + //Always write cow table so its safe | |
1976 | + cow = kmalloc( SECTOR_SIZE, GFP_KERNEL ); | |
1977 | + if (! cow ) { | |
1978 | + // Can't get I/O buffer. Disable this sparse. | |
1979 | + DMERR("dm-sparse: memory error allocating COW table buffer"); | |
1980 | + rc = -ENOMEM; | |
1981 | + goto out; | |
1982 | + } | |
1983 | + | |
1984 | + // Add the entry to the hash table. | |
1985 | + if ( target_entry ) { | |
1986 | + insert_sparse_hash_entry( new_map_entry, target_entry ); | |
1987 | + } | |
1988 | + else { | |
1989 | + insert_sparse_hash_entry_at_head | |
1990 | + ( new_map_entry, | |
1991 | + &(sparse_volume->sparse_map[hash_value]) ); | |
1992 | + } | |
1993 | + | |
1994 | + sparse_volume->next_free_chunk++; | |
1995 | + | |
1996 | + // Update the appropriate entry in the COW table. | |
1997 | + sparse_volume->cow_table[sparse_volume->next_cow_entry] = | |
1998 | + cpu_to_le64(chunk); | |
1999 | + sparse_volume->next_cow_entry++; | |
2000 | + | |
2001 | + memcpy(cow, sparse_volume->cow_table, SECTOR_SIZE); | |
2002 | + | |
2003 | + //because of ordering issues needs to be synchronous | |
2004 | + job.dev = sparse_volume->dev->dev; | |
2005 | + job.sector = sparse_volume->current_cow_sector; | |
2006 | + job.count = 1; | |
2007 | + page = virt_to_page(cow); | |
2008 | + offset = (unsigned long)cow & ~PAGE_MASK; | |
2009 | + dm_io_sync(1, &job, WRITE, page, offset, &error); | |
2010 | + | |
2011 | + // Update the in-memory COW table values. | |
2012 | + if ( sparse_volume->next_cow_entry >= (SECTOR_SIZE/sizeof(u64)) ) | |
2013 | + { | |
2014 | + sparse_volume->next_cow_entry = 0; | |
2015 | + sparse_volume->current_cow_sector++; | |
2016 | + memset(sparse_volume->cow_table, 0xff, SECTOR_SIZE); | |
2017 | + } | |
2018 | + | |
2019 | + *sector = ( new_map_entry->sparse_chunk << sparse_volume->chunk_shift ) | |
2020 | + + remainder; | |
2021 | + | |
2022 | + rc = 0; | |
2023 | + | |
2024 | + out: | |
2025 | + up_write(&sparse_volume->sparse_semaphore); | |
2026 | + if ( cow ) { | |
2027 | + kfree( cow ); | |
2028 | + } | |
2029 | + | |
2030 | + return rc; | |
2031 | +} | |
2032 | + | |
2033 | +/************************ EXPORT FUNCTIONS ************************/ | |
2034 | + | |
2035 | +/* | |
2036 | + * Function: sparse_dtr | |
2037 | + */ | |
2038 | +static void sparse_dtr( struct dm_target *ti ) | |
2039 | +{ | |
2040 | + struct sparse_volume * vol = (struct sparse_volume *)ti->private; | |
2041 | + int i; | |
2042 | + | |
2043 | + if (vol) { | |
2044 | + | |
2045 | + if (vol->sparse_map) { | |
2046 | + for ( i = 0; i < vol->hash_table_size; i++ ) { | |
2047 | + delete_sparse_hash_chain( vol, vol->sparse_map[i] ); | |
2048 | + } | |
2049 | + delete_sparse_hash_chain( vol, vol->free_hash_list ); | |
2050 | + vfree(vol->sparse_map); | |
2051 | + } | |
2052 | + | |
2053 | + if (vol->hash_pool) | |
2054 | + mempool_destroy(vol->hash_pool); | |
2055 | + | |
2056 | + if (vol->hash_slab) | |
2057 | + kmem_cache_destroy(vol->hash_slab); | |
2058 | + | |
2059 | + dm_put_device(ti, vol->dev); | |
2060 | + | |
2061 | + if (vol->dm_io_flag) { | |
2062 | + dm_io_put(1); | |
2063 | + } | |
2064 | + | |
2065 | + kfree( vol ); | |
2066 | + } | |
2067 | +} | |
2068 | + | |
2069 | +/* | |
2070 | + * Function: sparse_ctr | |
2071 | + */ | |
2072 | +static int sparse_ctr( struct dm_target *ti, unsigned int argc, char** argv ) | |
2073 | +{ | |
2074 | + int i, rc = -EINVAL; | |
2075 | + struct sparse_hash_entry *new_entry; | |
2076 | + struct sparse_volume *vol; | |
2077 | + struct dm_dev *dev; | |
2078 | + u32 chunk_size, chunks; | |
2079 | + u64 start; | |
2080 | + char* end, slab_name[NAME_SIZE+1]; | |
2081 | + | |
2082 | + if ( argc != 4 ) { | |
2083 | + ti->error="dm-sparse: wrong number of arguments"; | |
2084 | + return rc; | |
2085 | + } | |
2086 | + | |
2087 | + start = simple_strtoull(argv[1], &end, 10); | |
2088 | + if (*end) { | |
2089 | + ti->error="dm-sparse: Invalid first chunk lba"; | |
2090 | + return rc; | |
2091 | + } | |
2092 | + | |
2093 | + chunk_size = simple_strtoul(argv[2], &end, 10); | |
2094 | + if (*end) { | |
2095 | + ti->error="dm-sparse: Invalid chunk_size"; | |
2096 | + return rc; | |
2097 | + } | |
2098 | + | |
2099 | + chunks = simple_strtoul(argv[3], &end, 10); | |
2100 | + if (*end) { | |
2101 | + ti->error="dm-sparse: Invalid number of chunks"; | |
2102 | + return rc; | |
2103 | + } | |
2104 | + | |
2105 | + if ( dm_get_device( ti, argv[0], ti->begin, start + chunks * chunk_size, | |
2106 | + dm_table_get_mode(ti->table), &dev ) ) { | |
2107 | + ti->error = "dm-sparse: Device lookup failed"; | |
2108 | + return rc; | |
2109 | + } | |
2110 | + | |
2111 | + vol = kmalloc(sizeof(struct sparse_volume), GFP_KERNEL); | |
2112 | + if ( !vol ) { | |
2113 | + ti->error = "dm-sparse: Memory allocation for private-data failed"; | |
2114 | + rc = -ENOMEM; | |
2115 | + goto out; | |
2116 | + } | |
2117 | + | |
2118 | + memset( vol, 0, sizeof(struct sparse_volume) ); | |
2119 | + | |
2120 | + rc = dm_io_get(1); | |
2121 | + if (rc) { | |
2122 | + ti->error = "dm-sparse: failed to initialize dm-io."; | |
2123 | + sparse_dtr(ti); | |
2124 | + return rc; | |
2125 | + } | |
2126 | + | |
2127 | + // Initialize | |
2128 | + vol->dm_io_flag = 1; | |
2129 | + vol->chunk_size = chunk_size; | |
2130 | + vol->chunk_shift = log2(chunk_size); | |
2131 | + vol->num_chunks = chunks; | |
2132 | + vol->current_cow_sector = 1; | |
2133 | + vol->hash_table_size = chunks / MAX_HASH_CHAIN_ENTRIES + 1; | |
2134 | + vol->start = start; | |
2135 | + vol->dev = dev; | |
2136 | + init_rwsem(&vol->sparse_semaphore); | |
2137 | + | |
2138 | + snprintf(slab_name, NAME_SIZE, "sparse-%p", vol); | |
2139 | + vol->hash_slab = kmem_cache_create(slab_name, | |
2140 | + sizeof(struct sparse_hash_entry), | |
2141 | + 0, SLAB_HWCACHE_ALIGN, | |
2142 | + NULL, NULL); | |
2143 | + if ( ! vol->hash_slab ) { | |
2144 | + ti->error = "dm-sparse: memory allocation error in hash slab create"; | |
2145 | + sparse_dtr(ti); | |
2146 | + return -ENOMEM; | |
2147 | + } | |
2148 | + vol->hash_pool = mempool_create(1, mempool_alloc_slab, | |
2149 | + mempool_free_slab, | |
2150 | + vol->hash_slab); | |
2151 | + if ( ! vol->hash_pool ) { | |
2152 | + ti->error = "dm-sparse: memory allocation error in hash pool create"; | |
2153 | + sparse_dtr(ti); | |
2154 | + return -ENOMEM; | |
2155 | + } | |
2156 | + | |
2157 | + // Sparse hash table | |
2158 | + vol->sparse_map = vmalloc( vol->hash_table_size * | |
2159 | + sizeof( struct sparse_hash_entry * ) ); | |
2160 | + if ( ! vol->sparse_map ) { | |
2161 | + ti->error = "dm-sparse: Memory allocation error in sparse_map create"; | |
2162 | + sparse_dtr(ti); | |
2163 | + return -ENOMEM; | |
2164 | + } | |
2165 | + | |
2166 | + memset( vol->sparse_map, 0, vol->hash_table_size * | |
2167 | + sizeof( struct sparse_hash_entry * ) ); | |
2168 | + | |
2169 | + for ( i = 0; i < chunks; i++ ) { | |
2170 | + | |
2171 | + new_entry = mempool_alloc(vol->hash_pool, GFP_KERNEL ); | |
2172 | + if ( ! new_entry ) { | |
2173 | + ti->error="dm-sparse: memory allocation error in hash table setup"; | |
2174 | + sparse_dtr(ti); | |
2175 | + return -ENOMEM; | |
2176 | + } | |
2177 | + | |
2178 | + new_entry->next = vol->free_hash_list; | |
2179 | + vol->free_hash_list = new_entry; | |
2180 | + } | |
2181 | + | |
2182 | + rc = build_sparse_maps(vol); | |
2183 | + if (rc) { | |
2184 | + ti->error = "dm-sparse: error building hash tables"; | |
2185 | + sparse_dtr(ti); | |
2186 | + return rc; | |
2187 | + } | |
2188 | + | |
2189 | + ti->private = vol; | |
2190 | + return rc; | |
2191 | + | |
2192 | + out: | |
2193 | + dm_put_device(ti, dev); | |
2194 | + return rc; | |
2195 | +} | |
2196 | + | |
2197 | +/* | |
2198 | + * Function: sparse_map | |
2199 | + */ | |
2200 | +static int sparse_map( struct dm_target * ti, struct buffer_head * bh, int rw, | |
2201 | + union map_info *map_context ) | |
2202 | +{ | |
2203 | + struct sparse_volume * volume = (struct sparse_volume*)ti->private; | |
2204 | + u64 sector = bh->b_rsector; | |
2205 | + int rc; | |
2206 | + | |
2207 | + // Check if this sector has been remapped | |
2208 | + rc = sparse_remap_chunk( volume, §or ); | |
2209 | + | |
2210 | + if ( rc < 0 ) { //Error | |
2211 | + return rc; | |
2212 | + } | |
2213 | + | |
2214 | + if ( rc == 0 ) { // Remapped I/O : read or write same logic | |
2215 | + bh->b_rsector = volume->start + sector; | |
2216 | + bh->b_rdev = volume->dev->dev; | |
2217 | + return 1; | |
2218 | + } | |
2219 | + | |
2220 | + // ( Previously )Un-mapped: read / write different logic | |
2221 | + | |
2222 | + if ( rw ) { //write : | |
2223 | + rc = sparse_cow_write( volume, §or ); | |
2224 | + | |
2225 | + if ( rc < 0 ) { //Error | |
2226 | + return rc; | |
2227 | + } | |
2228 | + //Send write on | |
2229 | + bh->b_rsector = volume->start + sector; | |
2230 | + bh->b_rdev = volume->dev->dev; | |
2231 | + return 1; | |
2232 | + } | |
2233 | + | |
2234 | + //Reading something that was never written | |
2235 | + //return zeros and indicate complete | |
2236 | + memset(bh->b_data, 0x0, bh->b_size); | |
2237 | + bh->b_end_io(bh, 1); | |
2238 | + return 0; | |
2239 | +} | |
2240 | + | |
2241 | +static int sparse_status( struct dm_target *ti, status_type_t type, | |
2242 | + char *result, unsigned int maxlen ) | |
2243 | +{ | |
2244 | + struct sparse_volume * vol = (struct sparse_volume * )ti->private; | |
2245 | + | |
2246 | + switch(type) { | |
2247 | + | |
2248 | + case STATUSTYPE_INFO: | |
2249 | + snprintf( result, maxlen, "%d%%", | |
2250 | + ( vol->next_free_chunk * 100 ) / vol->num_chunks ); | |
2251 | + break; | |
2252 | + | |
2253 | + case STATUSTYPE_TABLE: | |
2254 | + snprintf( result, maxlen, "%s %Lu %u %u", | |
2255 | + dm_kdevname(vol->dev->dev), vol->start, | |
2256 | + vol->chunk_size, vol->num_chunks ); | |
2257 | + break; | |
2258 | + | |
2259 | + default: | |
2260 | + break; | |
2261 | + } | |
2262 | + | |
2263 | + return 0; | |
2264 | +} | |
2265 | + | |
2266 | +/****************** FUNCTION TABLE **********************/ | |
2267 | + | |
2268 | +static struct target_type sparse_target = { | |
2269 | + .name = "sparse", | |
2270 | + .module = THIS_MODULE, | |
2271 | + .ctr = sparse_ctr, | |
2272 | + .dtr = sparse_dtr, | |
2273 | + .map = sparse_map, | |
2274 | + .status = sparse_status, | |
2275 | +}; | |
2276 | + | |
2277 | +/********************* REGISTRATION *****************/ | |
2278 | + | |
2279 | +int __init sparse_init(void) | |
2280 | +{ | |
2281 | + int rc = dm_register_target(&sparse_target); | |
2282 | + | |
2283 | + if ( rc < 0 ) | |
2284 | + DMWARN("sparse target registration failed"); | |
2285 | + | |
2286 | + return rc; | |
2287 | +} | |
2288 | + | |
2289 | +void __exit sparse_exit(void) | |
2290 | +{ | |
2291 | + if (dm_unregister_target(&sparse_target) ) | |
2292 | + DMWARN("sparse target unregistration failed"); | |
2293 | + | |
2294 | + return; | |
2295 | +} | |
2296 | + | |
2297 | +module_init(sparse_init); | |
2298 | +module_exit(sparse_exit); | |
2299 | +MODULE_LICENSE("GPL"); | |
2300 | diff -urN linux-2.4.24.org/drivers/md/lvm.c linux-2.4.24/drivers/md/lvm.c | |
2301 | --- linux-2.4.24.org/drivers/md/lvm.c 2004-01-18 14:58:09.106704262 +0100 | |
2302 | +++ linux-2.4.24/drivers/md/lvm.c 2004-01-18 15:57:55.568033496 +0100 | |
2303 | @@ -236,9 +236,6 @@ | |
2304 | #define DEVICE_OFF(device) | |
2305 | #define LOCAL_END_REQUEST | |
2306 | ||
2307 | -/* lvm_do_lv_create calls fsync_dev_lockfs()/unlockfs() */ | |
2308 | -/* #define LVM_VFS_ENHANCEMENT */ | |
2309 | - | |
2310 | #include <linux/config.h> | |
2311 | #include <linux/module.h> | |
2312 | #include <linux/kernel.h> | |
2313 | @@ -2250,12 +2247,8 @@ | |
2314 | if (lv_ptr->lv_access & LV_SNAPSHOT) { | |
2315 | lv_t *org = lv_ptr->lv_snapshot_org, *last; | |
2316 | ||
2317 | - /* sync the original logical volume */ | |
2318 | - fsync_dev(org->lv_dev); | |
2319 | -#ifdef LVM_VFS_ENHANCEMENT | |
2320 | /* VFS function call to sync and lock the filesystem */ | |
2321 | fsync_dev_lockfs(org->lv_dev); | |
2322 | -#endif | |
2323 | ||
2324 | down_write(&org->lv_lock); | |
2325 | org->lv_access |= LV_SNAPSHOT_ORG; | |
2326 | @@ -2281,11 +2274,9 @@ | |
2327 | else | |
2328 | set_device_ro(lv_ptr->lv_dev, 1); | |
2329 | ||
2330 | -#ifdef LVM_VFS_ENHANCEMENT | |
2331 | /* VFS function call to unlock the filesystem */ | |
2332 | if (lv_ptr->lv_access & LV_SNAPSHOT) | |
2333 | unlockfs(lv_ptr->lv_snapshot_org->lv_dev); | |
2334 | -#endif | |
2335 | ||
2336 | lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].de = | |
2337 | lvm_fs_create_lv(vg_ptr, lv_ptr); | |
2338 | diff -urN linux-2.4.24.org/drivers/md/Makefile linux-2.4.24/drivers/md/Makefile | |
2339 | --- linux-2.4.24.org/drivers/md/Makefile 2004-01-18 15:09:18.620153502 +0100 | |
2340 | +++ linux-2.4.24/drivers/md/Makefile 2004-01-18 16:04:48.278616388 +0100 | |
2341 | @@ -28,6 +28,8 @@ | |
2342 | obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o | |
2343 | ||
2344 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | |
2345 | +obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o | |
2346 | +obj-$(CONFIG_BLK_DEV_DM_SPARSE) += dm-sparse.o | |
2347 | ||
2348 | include $(TOPDIR)/Rules.make | |
2349 | ||
2350 | diff -urN linux-2.4.24.org/drivers/md/md.c linux-2.4.24/drivers/md/md.c | |
2351 | --- linux-2.4.24.org/drivers/md/md.c 2004-01-18 14:58:09.227678566 +0100 | |
2352 | +++ linux-2.4.24/drivers/md/md.c 2004-01-18 16:04:27.702900923 +0100 | |
2353 | @@ -2146,6 +2146,8 @@ | |
2354 | ||
2355 | SET_FROM_SB(utime); | |
2356 | SET_FROM_SB(state); | |
2357 | + if (mddev->curr_resync) | |
2358 | + info.state |= (1 << MD_ARRAY_RECOVERY_RUNNING); | |
2359 | SET_FROM_SB(active_disks); | |
2360 | SET_FROM_SB(working_disks); | |
2361 | SET_FROM_SB(failed_disks); | |
2362 | diff -urN linux-2.4.24.org/drivers/md/multipath.c linux-2.4.24/drivers/md/multipath.c | |
2363 | --- linux-2.4.24.org/drivers/md/multipath.c 2004-01-18 14:58:09.254672832 +0100 | |
2364 | +++ linux-2.4.24/drivers/md/multipath.c 2004-01-18 16:04:38.291691263 +0100 | |
2365 | @@ -139,15 +139,16 @@ | |
2366 | static int multipath_map (mddev_t *mddev, kdev_t *rdev) | |
2367 | { | |
2368 | multipath_conf_t *conf = mddev_to_conf(mddev); | |
2369 | - int i, disks = MD_SB_DISKS; | |
2370 | + int i; | |
2371 | ||
2372 | /* | |
2373 | * Later we do read balancing on the read side | |
2374 | * now we use the first available disk. | |
2375 | */ | |
2376 | ||
2377 | - for (i = 0; i < disks; i++) { | |
2378 | + for (i = 0; i < conf->nr_disks; i++) { | |
2379 | if (conf->multipaths[i].operational) { | |
2380 | + /* first operational is winner! */ | |
2381 | *rdev = conf->multipaths[i].dev; | |
2382 | return (0); | |
2383 | } | |
2384 | @@ -191,6 +192,8 @@ | |
2385 | { | |
2386 | struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private); | |
2387 | ||
2388 | + atomic_dec(&mp_bh->multipath->nr_pending); | |
2389 | + | |
2390 | /* | |
2391 | * this branch is our 'one multipath IO has finished' event handler: | |
2392 | */ | |
2393 | @@ -223,19 +226,39 @@ | |
2394 | } | |
2395 | ||
2396 | /* | |
2397 | - * This routine returns the disk from which the requested read should | |
2398 | - * be done. | |
2399 | + * Multipath read balance ... | |
2400 | + * | |
2401 | + * Returns: | |
2402 | + * | |
2403 | + * If no active paths | |
2404 | + * | |
2405 | + * - Error ( -1 ) | |
2406 | + * | |
2407 | + * If active paths == 1 | |
2408 | + * | |
2409 | + * - 1st active path encountered | |
2410 | + * | |
2411 | + * If active paths > 1 | |
2412 | + * | |
2413 | + * - 1st idle active path encountered | |
2414 | + * - else ... the active path doing the least amount of work. | |
2415 | */ | |
2416 | - | |
2417 | static int multipath_read_balance (multipath_conf_t *conf) | |
2418 | { | |
2419 | - int disk; | |
2420 | - | |
2421 | - for (disk = 0; disk < conf->raid_disks; disk++) | |
2422 | - if (conf->multipaths[disk].operational) | |
2423 | - return disk; | |
2424 | - BUG(); | |
2425 | - return 0; | |
2426 | + int i, disk=-1, nr_pending, least_pending=0; | |
2427 | + | |
2428 | + for (i=0; i<conf->nr_disks; i++) { | |
2429 | + if (conf->multipaths[i].operational) { | |
2430 | + nr_pending = atomic_read(&conf->multipaths[i].nr_pending); | |
2431 | + if (nr_pending==0 || conf->working_disks==1) | |
2432 | + return i; | |
2433 | + if (least_pending==0 || nr_pending<least_pending) { | |
2434 | + disk = i; | |
2435 | + least_pending = nr_pending; | |
2436 | + } | |
2437 | + } | |
2438 | + } | |
2439 | + return disk; | |
2440 | } | |
2441 | ||
2442 | static int multipath_make_request (mddev_t *mddev, int rw, | |
2443 | @@ -245,6 +268,7 @@ | |
2444 | struct buffer_head *bh_req; | |
2445 | struct multipath_bh * mp_bh; | |
2446 | struct multipath_info *multipath; | |
2447 | + int disk; | |
2448 | ||
2449 | if (!buffer_locked(bh)) | |
2450 | BUG(); | |
2451 | @@ -267,7 +291,16 @@ | |
2452 | /* | |
2453 | * read balancing logic: | |
2454 | */ | |
2455 | - multipath = conf->multipaths + multipath_read_balance(conf); | |
2456 | + disk = multipath_read_balance(conf); | |
2457 | + if (disk==-1) { | |
2458 | + printk (KERN_ERR "multipath_make_request: no more operational IO paths.\n"); | |
2459 | + buffer_IO_error(bh); | |
2460 | + return 0; | |
2461 | + } | |
2462 | + | |
2463 | + multipath = conf->multipaths + disk; | |
2464 | + mp_bh->multipath = multipath; | |
2465 | + atomic_inc(&multipath->nr_pending); | |
2466 | ||
2467 | bh_req = &mp_bh->bh_req; | |
2468 | memcpy(bh_req, bh, sizeof(*bh)); | |
2469 | @@ -331,13 +364,14 @@ | |
2470 | { | |
2471 | multipath_conf_t *conf = mddev_to_conf(mddev); | |
2472 | struct multipath_info * multipaths = conf->multipaths; | |
2473 | - int disks = MD_SB_DISKS; | |
2474 | int other_paths = 1; | |
2475 | - int i; | |
2476 | + int i, first = 1; | |
2477 | + mdk_rdev_t *rdev; | |
2478 | + struct md_list_head *tmp; | |
2479 | ||
2480 | if (conf->working_disks == 1) { | |
2481 | other_paths = 0; | |
2482 | - for (i = 0; i < disks; i++) { | |
2483 | + for (i = 0; i < MD_SB_DISKS; i++) { | |
2484 | if (multipaths[i].spare) { | |
2485 | other_paths = 1; | |
2486 | break; | |
2487 | @@ -351,16 +385,17 @@ | |
2488 | * first check if this is a queued request for a device | |
2489 | * which has just failed. | |
2490 | */ | |
2491 | - for (i = 0; i < disks; i++) { | |
2492 | + for (i = 0; i < MD_SB_DISKS; i++) { | |
2493 | if (multipaths[i].dev==dev && !multipaths[i].operational) | |
2494 | return 0; | |
2495 | } | |
2496 | printk (LAST_DISK); | |
2497 | } else { | |
2498 | + mdp_super_t *sb = mddev->sb; | |
2499 | /* | |
2500 | * Mark disk as unusable | |
2501 | */ | |
2502 | - for (i = 0; i < disks; i++) { | |
2503 | + for (i = 0; i < MD_SB_DISKS; i++) { | |
2504 | if (multipaths[i].dev==dev && multipaths[i].operational) { | |
2505 | mark_disk_bad(mddev, i); | |
2506 | break; | |
2507 | @@ -369,7 +404,6 @@ | |
2508 | if (!conf->working_disks) { | |
2509 | int err = 1; | |
2510 | mdp_disk_t *spare; | |
2511 | - mdp_super_t *sb = mddev->sb; | |
2512 | ||
2513 | spare = get_spare(mddev); | |
2514 | if (spare) { | |
2515 | @@ -384,6 +418,21 @@ | |
2516 | sb->spare_disks--; | |
2517 | } | |
2518 | } | |
2519 | + /* prevent unnecessary work in md_do_recovery() */ | |
2520 | + if (conf->working_disks) { | |
2521 | + conf->raid_disks = conf->working_disks | |
2522 | + = sb->raid_disks = sb->active_disks; | |
2523 | + } | |
2524 | + /* update alias disk info to insure we can do sb commit. */ | |
2525 | + ITERATE_RDEV(mddev,rdev,tmp) { | |
2526 | + if (first && disk_active(&sb->disks[rdev->desc_nr])) { | |
2527 | + rdev->alias_device = 0; | |
2528 | + first = 0; | |
2529 | + } else { | |
2530 | + if (!disk_faulty(&sb->disks[rdev->desc_nr])) | |
2531 | + rdev->alias_device = 1; | |
2532 | + } | |
2533 | + } | |
2534 | } | |
2535 | return 0; | |
2536 | } | |
2537 | @@ -677,9 +726,8 @@ | |
2538 | /* | |
2539 | * This is a kernel thread which: | |
2540 | * | |
2541 | - * 1. Retries failed read operations on working multipaths. | |
2542 | + * 1. Retries failed operations on working multipaths. | |
2543 | * 2. Updates the raid superblock when problems encounter. | |
2544 | - * 3. Performs writes following reads for array syncronising. | |
2545 | */ | |
2546 | ||
2547 | static void multipathd (void *data) | |
2548 | @@ -833,6 +881,7 @@ | |
2549 | mdk_rdev_t *rdev, *def_rdev = NULL; | |
2550 | struct md_list_head *tmp; | |
2551 | int num_rdevs = 0; | |
2552 | + int active_disks = 0, spare_disks = 0, faulty_disks = 0; | |
2553 | ||
2554 | MOD_INC_USE_COUNT; | |
2555 | ||
2556 | @@ -881,9 +930,7 @@ | |
2557 | printk(NOT_IN_SYNC, partition_name(rdev->dev)); | |
2558 | ||
2559 | /* | |
2560 | - * Mark all disks as spare to start with, then pick our | |
2561 | - * active disk. If we have a disk that is marked active | |
2562 | - * in the sb, then use it, else use the first rdev. | |
2563 | + * Mark all disks as spare to start with. | |
2564 | */ | |
2565 | disk->number = desc->number; | |
2566 | disk->raid_disk = desc->raid_disk; | |
2567 | @@ -894,20 +941,21 @@ | |
2568 | mark_disk_sync(desc); | |
2569 | ||
2570 | if (disk_active(desc)) { | |
2571 | - if(!conf->working_disks) { | |
2572 | - printk(OPERATIONAL, partition_name(rdev->dev), | |
2573 | - desc->raid_disk); | |
2574 | - disk->operational = 1; | |
2575 | - disk->spare = 0; | |
2576 | - conf->working_disks++; | |
2577 | - def_rdev = rdev; | |
2578 | - } else { | |
2579 | - mark_disk_spare(desc); | |
2580 | - } | |
2581 | - } else | |
2582 | - mark_disk_spare(desc); | |
2583 | + printk(OPERATIONAL, partition_name(rdev->dev), | |
2584 | + desc->raid_disk); | |
2585 | + disk->operational = 1; | |
2586 | + disk->spare = 0; | |
2587 | + conf->working_disks++; | |
2588 | + def_rdev = rdev; | |
2589 | + active_disks++; | |
2590 | + } else if (disk_faulty(desc)) { | |
2591 | + disk->spare = 0; | |
2592 | + faulty_disks++; | |
2593 | + } else { | |
2594 | + spare_disks++; | |
2595 | + } | |
2596 | ||
2597 | - if(!num_rdevs++) def_rdev = rdev; | |
2598 | + num_rdevs++; | |
2599 | } | |
2600 | if(!conf->working_disks && num_rdevs) { | |
2601 | desc = &sb->disks[def_rdev->desc_nr]; | |
2602 | @@ -918,11 +966,12 @@ | |
2603 | disk->spare = 0; | |
2604 | conf->working_disks++; | |
2605 | mark_disk_active(desc); | |
2606 | + active_disks++; | |
2607 | } | |
2608 | /* | |
2609 | - * Make sure our active path is in desc spot 0 | |
2610 | + * If there is only 1 active path ... make sure it is in desc spot 0 | |
2611 | */ | |
2612 | - if(def_rdev->desc_nr != 0) { | |
2613 | + if (active_disks == 1 && def_rdev->desc_nr != 0) { | |
2614 | rdev = find_rdev_nr(mddev, 0); | |
2615 | desc = &sb->disks[def_rdev->desc_nr]; | |
2616 | desc2 = sb->disks; | |
2617 | @@ -940,10 +989,10 @@ | |
2618 | def_rdev->desc_nr = 0; | |
2619 | } | |
2620 | } | |
2621 | - conf->raid_disks = sb->raid_disks = sb->active_disks = 1; | |
2622 | + conf->raid_disks = sb->raid_disks = sb->active_disks = active_disks; | |
2623 | conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs; | |
2624 | - sb->failed_disks = 0; | |
2625 | - sb->spare_disks = num_rdevs - 1; | |
2626 | + sb->failed_disks = faulty_disks; | |
2627 | + sb->spare_disks = spare_disks; | |
2628 | mddev->sb_dirty = 1; | |
2629 | conf->mddev = mddev; | |
2630 | conf->device_lock = MD_SPIN_LOCK_UNLOCKED; | |
2631 | diff -urN linux-2.4.24.org/fs/buffer.c linux-2.4.24/fs/buffer.c | |
2632 | --- linux-2.4.24.org/fs/buffer.c 2004-01-18 14:55:22.305275818 +0100 | |
2633 | +++ linux-2.4.24/fs/buffer.c 2004-01-18 15:57:55.602026171 +0100 | |
2634 | @@ -419,6 +419,34 @@ | |
2635 | fsync_dev(dev); | |
2636 | } | |
2637 | ||
2638 | +int fsync_dev_lockfs(kdev_t dev) | |
2639 | +{ | |
2640 | + /* you are not allowed to try locking all the filesystems | |
2641 | + ** on the system, your chances of getting through without | |
2642 | + ** total deadlock are slim to none. | |
2643 | + */ | |
2644 | + if (!dev) | |
2645 | + return fsync_dev(dev) ; | |
2646 | + | |
2647 | + sync_buffers(dev, 0); | |
2648 | + | |
2649 | + lock_kernel(); | |
2650 | + /* note, the FS might need to start transactions to | |
2651 | + ** sync the inodes, or the quota, no locking until | |
2652 | + ** after these are done | |
2653 | + */ | |
2654 | + sync_inodes(dev); | |
6ba999b3 | 2655 | + DQUOT_SYNC_DEV(dev); |
cdeda7f0 AM |
2656 | + /* if inodes or quotas could be dirtied during the |
2657 | + ** sync_supers_lockfs call, the FS is responsible for getting | |
2658 | + ** them on disk, without deadlocking against the lock | |
2659 | + */ | |
2660 | + sync_supers_lockfs(dev) ; | |
2661 | + unlock_kernel(); | |
2662 | + | |
2663 | + return sync_buffers(dev, 1) ; | |
2664 | +} | |
2665 | + | |
2666 | asmlinkage long sys_sync(void) | |
2667 | { | |
2668 | fsync_dev(0); | |
2669 | diff -urN linux-2.4.24.org/fs/reiserfs/super.c linux-2.4.24/fs/reiserfs/super.c | |
2670 | --- linux-2.4.24.org/fs/reiserfs/super.c 2004-01-18 14:55:18.875002271 +0100 | |
2671 | +++ linux-2.4.24/fs/reiserfs/super.c 2004-01-18 15:57:55.657014322 +0100 | |
2672 | @@ -84,7 +84,7 @@ | |
2673 | reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); | |
2674 | journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); | |
2675 | reiserfs_block_writes(&th) ; | |
2676 | - journal_end(&th, s, 1) ; | |
2677 | + journal_end_sync(&th, s, 1) ; | |
2678 | } | |
2679 | s->s_dirt = 0; | |
2680 | unlock_kernel() ; | |
2681 | diff -urN linux-2.4.24.org/fs/super.c linux-2.4.24/fs/super.c | |
2682 | --- linux-2.4.24.org/fs/super.c 2004-01-18 14:55:11.177633010 +0100 | |
2683 | +++ linux-2.4.24/fs/super.c 2004-01-18 15:57:55.687007859 +0100 | |
2684 | @@ -38,6 +38,13 @@ | |
2685 | LIST_HEAD(super_blocks); | |
2686 | spinlock_t sb_lock = SPIN_LOCK_UNLOCKED; | |
2687 | ||
2688 | +/* | |
2689 | + * lock/unlockfs grab a read lock on s_umount, but you need this lock to | |
2690 | + * make sure no lockfs runs are in progress before inserting/removing | |
2691 | + * supers from the list. | |
2692 | + */ | |
2693 | +static DECLARE_MUTEX(lockfs_sem); | |
2694 | + | |
2695 | /* | |
2696 | * Handling of filesystem drivers list. | |
2697 | * Rules: | |
2698 | @@ -436,6 +443,19 @@ | |
2699 | put_super(sb); | |
2700 | } | |
2701 | ||
2702 | +static void write_super_lockfs(struct super_block *sb) | |
2703 | +{ | |
2704 | + lock_super(sb); | |
2705 | + if (sb->s_root && sb->s_op) { | |
2706 | + if (sb->s_dirt && sb->s_op->write_super) | |
2707 | + sb->s_op->write_super(sb); | |
2708 | + if (sb->s_op->write_super_lockfs) { | |
2709 | + sb->s_op->write_super_lockfs(sb); | |
2710 | + } | |
2711 | + } | |
2712 | + unlock_super(sb); | |
2713 | +} | |
2714 | + | |
2715 | static inline void write_super(struct super_block *sb) | |
2716 | { | |
2717 | lock_super(sb); | |
2718 | @@ -483,6 +503,39 @@ | |
2719 | spin_unlock(&sb_lock); | |
2720 | } | |
2721 | ||
2722 | +/* | |
2723 | + * Note: don't check the dirty flag before waiting, we want the lock | |
2724 | + * to happen every time this is called. dev must be non-zero | |
2725 | + */ | |
2726 | +void sync_supers_lockfs(kdev_t dev) | |
2727 | +{ | |
2728 | + struct super_block * sb; | |
2729 | + | |
2730 | + down(&lockfs_sem) ; | |
2731 | + if (dev) { | |
2732 | + sb = get_super(dev); | |
2733 | + if (sb) { | |
2734 | + write_super_lockfs(sb); | |
2735 | + drop_super(sb); | |
2736 | + } | |
2737 | + } | |
2738 | +} | |
2739 | + | |
2740 | +void unlockfs(kdev_t dev) | |
2741 | +{ | |
2742 | + struct super_block * sb; | |
2743 | + | |
2744 | + if (dev) { | |
2745 | + sb = get_super(dev); | |
2746 | + if (sb) { | |
2747 | + if (sb->s_op && sb->s_op->unlockfs) | |
2748 | + sb->s_op->unlockfs(sb) ; | |
2749 | + drop_super(sb); | |
2750 | + } | |
2751 | + } | |
2752 | + up(&lockfs_sem) ; | |
2753 | +} | |
2754 | + | |
2755 | /** | |
2756 | * get_super - get the superblock of a device | |
2757 | * @dev: device to get the superblock for | |
2758 | @@ -702,6 +755,7 @@ | |
2759 | goto out1; | |
2760 | ||
2761 | error = -EBUSY; | |
2762 | + down(&lockfs_sem); | |
2763 | restart: | |
2764 | spin_lock(&sb_lock); | |
2765 | ||
2766 | @@ -713,6 +767,7 @@ | |
2767 | ((flags ^ old->s_flags) & MS_RDONLY)) { | |
2768 | spin_unlock(&sb_lock); | |
2769 | destroy_super(s); | |
2770 | + up(&lockfs_sem); | |
2771 | goto out1; | |
2772 | } | |
2773 | if (!grab_super(old)) | |
2774 | @@ -720,12 +775,14 @@ | |
2775 | destroy_super(s); | |
2776 | blkdev_put(bdev, BDEV_FS); | |
2777 | path_release(&nd); | |
2778 | + up(&lockfs_sem); | |
2779 | return old; | |
2780 | } | |
2781 | s->s_dev = dev; | |
2782 | s->s_bdev = bdev; | |
2783 | s->s_flags = flags; | |
2784 | insert_super(s, fs_type); | |
2785 | + up(&lockfs_sem); | |
2786 | if (!fs_type->read_super(s, data, flags & MS_VERBOSE ? 1 : 0)) | |
2787 | goto Einval; | |
2788 | s->s_flags |= MS_ACTIVE; | |
2789 | @@ -833,7 +890,10 @@ | |
2790 | if (!deactivate_super(sb)) | |
2791 | return; | |
2792 | ||
2793 | + down(&lockfs_sem); | |
2794 | down_write(&sb->s_umount); | |
2795 | + up(&lockfs_sem); | |
2796 | + | |
2797 | sb->s_root = NULL; | |
2798 | /* Need to clean after the sucker */ | |
2799 | if (fs->fs_flags & FS_LITTER) | |
2800 | diff -urN linux-2.4.24.org/include/linux/fs.h linux-2.4.24/include/linux/fs.h | |
2801 | --- linux-2.4.24.org/include/linux/fs.h 2004-01-18 14:55:29.014855364 +0100 | |
2802 | +++ linux-2.4.24/include/linux/fs.h 2004-01-18 15:59:11.694692181 +0100 | |
2803 | @@ -1287,6 +1287,7 @@ | |
2804 | extern int sync_buffers(kdev_t, int); | |
2805 | extern void sync_dev(kdev_t); | |
2806 | extern int fsync_dev(kdev_t); | |
2807 | +extern int fsync_dev_lockfs(kdev_t); | |
2808 | extern int fsync_super(struct super_block *); | |
2809 | extern int fsync_no_super(kdev_t); | |
2810 | extern void sync_inodes_sb(struct super_block *); | |
2811 | @@ -1305,6 +1306,8 @@ | |
2812 | extern int filemap_fdatasync(struct address_space *); | |
2813 | extern int filemap_fdatawait(struct address_space *); | |
2814 | extern void sync_supers(kdev_t dev, int wait); | |
2815 | +extern void sync_supers_lockfs(kdev_t); | |
2816 | +extern void unlockfs(kdev_t); | |
2817 | extern int bmap(struct inode *, int); | |
2818 | extern int notify_change(struct dentry *, struct iattr *); | |
2819 | extern int permission(struct inode *, int); | |
2820 | diff -urN linux-2.4.24.org/include/linux/raid/md_u.h linux-2.4.24/include/linux/raid/md_u.h | |
2821 | --- linux-2.4.24.org/include/linux/raid/md_u.h 2004-01-18 14:55:35.554471508 +0100 | |
2822 | +++ linux-2.4.24/include/linux/raid/md_u.h 2004-01-18 16:04:27.764887949 +0100 | |
2823 | @@ -50,6 +50,10 @@ | |
2824 | int patchlevel; | |
2825 | } mdu_version_t; | |
2826 | ||
2827 | +#define MD_ARRAY_CLEAN 0 | |
2828 | +#define MD_ARRAY_ERRORS 1 | |
2829 | +#define MD_ARRAY_RECOVERY_RUNNING 2 | |
2830 | + | |
2831 | typedef struct mdu_array_info_s { | |
2832 | /* | |
2833 | * Generic constant information | |
2834 | diff -urN linux-2.4.24.org/include/linux/raid/multipath.h linux-2.4.24/include/linux/raid/multipath.h | |
2835 | --- linux-2.4.24.org/include/linux/raid/multipath.h 2004-01-18 14:55:35.563469605 +0100 | |
2836 | +++ linux-2.4.24/include/linux/raid/multipath.h 2004-01-18 16:04:38.329683369 +0100 | |
2837 | @@ -15,6 +15,7 @@ | |
2838 | int spare; | |
2839 | ||
2840 | int used_slot; | |
2841 | + atomic_t nr_pending; /* number of pending requests */ | |
2842 | }; | |
2843 | ||
2844 | struct multipath_private_data { | |
2845 | @@ -63,6 +64,7 @@ | |
2846 | struct buffer_head *master_bh; | |
2847 | struct buffer_head bh_req; | |
2848 | struct multipath_bh *next_mp; /* next for retry or in free list */ | |
2849 | + struct multipath_info *multipath; /* allows end_request to easilly dec pending buffer count*/ | |
2850 | }; | |
2851 | /* bits for multipath_bh.state */ | |
2852 | #define MPBH_Uptodate 1 | |
2853 | diff -urN linux-2.4.24.org/kernel/ksyms.c linux-2.4.24/kernel/ksyms.c | |
2854 | --- linux-2.4.24.org/kernel/ksyms.c 2004-01-18 14:55:22.698192617 +0100 | |
2855 | +++ linux-2.4.24/kernel/ksyms.c 2004-01-18 15:57:55.824978130 +0100 | |
2856 | @@ -200,6 +200,8 @@ | |
2857 | EXPORT_SYMBOL(invalidate_inode_pages); | |
2858 | EXPORT_SYMBOL(truncate_inode_pages); | |
2859 | EXPORT_SYMBOL(fsync_dev); | |
2860 | +EXPORT_SYMBOL(fsync_dev_lockfs); | |
2861 | +EXPORT_SYMBOL(unlockfs); | |
2862 | EXPORT_SYMBOL(fsync_no_super); | |
2863 | EXPORT_SYMBOL(permission); | |
2864 | EXPORT_SYMBOL(vfs_permission); |