1 diff -urN linux.org/fs/buffer.c linux/fs/buffer.c
2 --- linux.org/fs/buffer.c Fri May 17 18:26:25 2002
3 +++ linux/fs/buffer.c Fri May 17 18:29:33 2002
7 /* Stage 3: start the IO */
8 - for (i = 0; i < nr; i++)
9 - submit_bh(READ, arr[i]);
11 + for (i = 0; i < nr; i++) {
12 + struct buffer_head * bh = arr[i];
13 + if (buffer_uptodate(bh))
14 + end_buffer_io_async(bh, 1);
16 + submit_bh(READ, bh);
22 diff -urN linux.org/fs/ext3/balloc.c linux/fs/ext3/balloc.c
23 --- linux.org/fs/ext3/balloc.c Fri May 17 18:26:25 2002
24 +++ linux/fs/ext3/balloc.c Fri May 17 18:30:16 2002
26 int i, j, k, tmp, alloctmp;
29 + int performed_allocation = 0;
30 struct super_block * sb;
31 struct ext3_group_desc * gdp;
32 struct ext3_super_block * es;
36 /* No space left on the device */
44 J_ASSERT_BH(bh, !ext3_test_bit(j, bh->b_data));
45 BUFFER_TRACE(bh, "setting bitmap bit");
46 ext3_set_bit(j, bh->b_data);
47 + performed_allocation = 1;
49 #ifdef CONFIG_JBD_DEBUG
52 ext3_std_error(sb, fatal);
56 + * Undo the block allocation
58 + if (!performed_allocation)
59 + DQUOT_FREE_BLOCK(inode, 1);
63 diff -urN linux.org/fs/ext3/file.c linux/fs/ext3/file.c
64 --- linux.org/fs/ext3/file.c Fri May 17 18:26:25 2002
65 +++ linux/fs/ext3/file.c Fri May 17 18:30:00 2002
68 ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
71 struct inode *inode = file->f_dentry->d_inode;
72 + extern kdev_t TRACE_DEV;
75 - * Nasty: if the file is subject to synchronous writes then we need
76 - * to force generic_osync_inode() to call ext3_write_inode().
77 - * We do that by marking the inode dirty. This adds much more
78 - * computational expense than we need, but we're going to sync
81 - if (IS_SYNC(inode) || (file->f_flags & O_SYNC))
82 - mark_inode_dirty(inode);
83 + ret = generic_file_write(file, buf, count, ppos);
85 - return generic_file_write(file, buf, count, ppos);
86 + /* Skip file flushing code if there was an error, or if nothing
91 + /* If the inode is IS_SYNC, or is O_SYNC and we are doing
92 + data-journaling, then we need to make sure that we force the
93 + transaction to disk to keep all metadata uptodate
96 + if (file->f_flags & O_SYNC) {
97 + /* If we are non-data-journaled, then the dirty data has
98 + already been flushed to backing store by
99 + generic_osync_inode, and the inode has been flushed
100 + too if there have been any modifications other than
101 + mere timestamp updates.
103 + Open question --- do we care about flushing
104 + timestamps too if the inode is IS_SYNC? */
105 + if (!ext3_should_journal_data(inode))
111 + /* So we know that there has been no forced data flush. If the
112 + inode is marked IS_SYNC, we need to force one ourselves. */
113 + if (!IS_SYNC(inode))
116 + /* Open question #2 --- should we force data to disk here too?
117 + If we don't, the only impact is that data=writeback
118 + filesystems won't flush data to disk automatically on
119 + IS_SYNC, only metadata (but historically, that is what ext2
123 + err = ext3_force_commit(inode->i_sb);
129 struct file_operations ext3_file_operations = {
130 diff -urN linux.org/fs/ext3/fsync.c linux/fs/ext3/fsync.c
131 --- linux.org/fs/ext3/fsync.c Fri May 17 18:26:25 2002
132 +++ linux/fs/ext3/fsync.c Fri May 17 18:29:20 2002
134 * we'll end up waiting on them in commit.
136 ret = fsync_inode_buffers(inode);
137 - ret |= fsync_inode_data_buffers(inode);
139 + /* In writeback mode, we need to force out data buffers too. In
140 + * the other modes, ext3_force_commit takes care of forcing out
141 + * just the right data blocks. */
142 + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
143 + ret |= fsync_inode_data_buffers(inode);
145 ext3_force_commit(inode->i_sb);
147 diff -urN linux.org/fs/ext3/ialloc.c linux/fs/ext3/ialloc.c
148 --- linux.org/fs/ext3/ialloc.c Fri May 17 18:26:25 2002
149 +++ linux/fs/ext3/ialloc.c Fri May 17 18:29:25 2002
158 bitmap_nr = load_inode_bitmap (sb, i);
163 + ext3_std_error(sb, err);
167 - ext3_std_error(sb, err);
171 diff -urN linux.org/fs/ext3/inode.c linux/fs/ext3/inode.c
172 --- linux.org/fs/ext3/inode.c Fri May 17 18:26:25 2002
173 +++ linux/fs/ext3/inode.c Fri May 17 18:30:26 2002
186 - if (IS_SYNC(inode))
187 - handle->h_sync = 1;
191 @@ -950,11 +949,13 @@
194 static int walk_page_buffers( handle_t *handle,
195 + struct inode *inode,
196 struct buffer_head *head,
200 int (*fn)( handle_t *handle,
201 + struct inode *inode,
202 struct buffer_head *bh))
204 struct buffer_head *bh;
209 - err = (*fn)(handle, bh);
210 + err = (*fn)(handle, inode, bh);
214 @@ -1005,7 +1006,7 @@
218 -static int do_journal_get_write_access(handle_t *handle,
219 +static int do_journal_get_write_access(handle_t *handle, struct inode *inode,
220 struct buffer_head *bh)
222 return ext3_journal_get_write_access(handle, bh);
223 @@ -1031,7 +1032,7 @@
224 goto prepare_write_failed;
226 if (ext3_should_journal_data(inode)) {
227 - ret = walk_page_buffers(handle, page->buffers,
228 + ret = walk_page_buffers(handle, inode, page->buffers,
229 from, to, NULL, do_journal_get_write_access);
232 @@ -1052,24 +1053,32 @@
236 -static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh)
237 +static int journal_dirty_sync_data(handle_t *handle, struct inode *inode,
238 + struct buffer_head *bh)
240 - return ext3_journal_dirty_data(handle, bh, 0);
241 + int ret = ext3_journal_dirty_data(handle, bh, 0);
242 + if (bh->b_inode != inode)
243 + buffer_insert_inode_data_queue(bh, inode);
248 * For ext3_writepage(). We also brelse() the buffer to account for
249 * the bget() which ext3_writepage() performs.
251 -static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh)
252 +static int journal_dirty_async_data(handle_t *handle, struct inode *inode,
253 + struct buffer_head *bh)
255 int ret = ext3_journal_dirty_data(handle, bh, 1);
256 + if (bh->b_inode != inode)
257 + buffer_insert_inode_data_queue(bh, inode);
262 /* For commit_write() in data=journal mode */
263 -static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
264 +static int commit_write_fn(handle_t *handle, struct inode *inode,
265 + struct buffer_head *bh)
267 set_bit(BH_Uptodate, &bh->b_state);
268 return ext3_journal_dirty_metadata(handle, bh);
269 @@ -1104,7 +1113,7 @@
271 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
273 - ret = walk_page_buffers(handle, page->buffers,
274 + ret = walk_page_buffers(handle, inode, page->buffers,
275 from, to, &partial, commit_write_fn);
277 SetPageUptodate(page);
278 @@ -1114,7 +1123,7 @@
279 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
281 if (ext3_should_order_data(inode)) {
282 - ret = walk_page_buffers(handle, page->buffers,
283 + ret = walk_page_buffers(handle, inode, page->buffers,
284 from, to, NULL, journal_dirty_sync_data);
286 /* Be careful here if generic_commit_write becomes a
287 @@ -1196,7 +1205,8 @@
288 return generic_block_bmap(mapping,block,ext3_get_block);
291 -static int bget_one(handle_t *handle, struct buffer_head *bh)
292 +static int bget_one(handle_t *handle, struct inode *inode,
293 + struct buffer_head *bh)
295 atomic_inc(&bh->b_count);
297 @@ -1295,7 +1305,7 @@
298 create_empty_buffers(page,
299 inode->i_dev, inode->i_sb->s_blocksize);
300 page_buffers = page->buffers;
301 - walk_page_buffers(handle, page_buffers, 0,
302 + walk_page_buffers(handle, inode, page_buffers, 0,
303 PAGE_CACHE_SIZE, NULL, bget_one);
306 @@ -1313,7 +1323,7 @@
308 /* And attach them to the current transaction */
310 - err = walk_page_buffers(handle, page_buffers,
311 + err = walk_page_buffers(handle, inode, page_buffers,
312 0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
315 diff -urN linux.org/fs/ext3/super.c linux/fs/ext3/super.c
316 --- linux.org/fs/ext3/super.c Fri May 17 18:26:25 2002
317 +++ linux/fs/ext3/super.c Fri May 17 18:29:29 2002
318 @@ -1589,8 +1589,10 @@
319 journal_t *journal = EXT3_SB(sb)->s_journal;
321 /* Now we set up the journal barrier. */
323 journal_lock_updates(journal);
324 journal_flush(journal);
327 /* Journal blocked and flushed, clear needs_recovery flag. */
328 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
329 diff -urN linux.org/fs/jbd/journal.c linux/fs/jbd/journal.c
330 --- linux.org/fs/jbd/journal.c Fri May 17 18:26:25 2002
331 +++ linux/fs/jbd/journal.c Fri May 17 18:30:21 2002
336 + reparent_to_init();
337 spin_lock_irq(&current->sigmask_lock);
338 sigfillset(&current->blocked);
339 recalc_sigpending(current);
342 journal->j_task = NULL;
343 wake_up(&journal->j_wait_done_commit);
345 jbd_debug(1, "Journal thread exiting.\n");
348 @@ -1486,6 +1488,49 @@
349 unlock_journal(journal);
354 + * Report any unexpected dirty buffers which turn up. Normally those
355 + * indicate an error, but they can occur if the user is running (say)
356 + * tune2fs to modify the live filesystem, so we need the option of
357 + * continuing as gracefully as possible.
359 + * The caller should already hold the journal lock and
360 + * journal_datalist_lock spinlock: most callers will need those anyway
361 + * in order to probe the buffer's journaling state safely.
363 +void __jbd_unexpected_dirty_buffer(char *function, int line,
364 + struct journal_head *jh)
366 + struct buffer_head *bh = jh2bh(jh);
369 + if (buffer_dirty(bh)) {
370 + printk ("%sUnexpected dirty buffer encountered at "
371 + "%s:%d (%s blocknr %lu)\n",
372 + KERN_WARNING, function, line,
373 + kdevname(bh->b_dev), bh->b_blocknr);
374 +#ifdef JBD_PARANOID_WRITES
375 + J_ASSERT (!buffer_dirty(bh));
378 + /* If this buffer is one which might reasonably be dirty
379 + * --- ie. data, or not part of this journal --- then
380 + * we're OK to leave it alone, but otherwise we need to
381 + * move the dirty bit to the journal's own internal
383 + jlist = jh->b_jlist;
385 + if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
386 + jlist == BJ_Shadow || jlist == BJ_Forget) {
387 + if (atomic_set_buffer_clean(jh2bh(jh))) {
388 + set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
395 int journal_blocks_per_page(struct inode *inode)
397 return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
398 diff -urN linux.org/fs/jbd/transaction.c linux/fs/jbd/transaction.c
399 --- linux.org/fs/jbd/transaction.c Fri May 17 18:26:25 2002
400 +++ linux/fs/jbd/transaction.c Fri May 17 18:29:51 2002
401 @@ -539,76 +539,67 @@
403 do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy)
405 + struct buffer_head *bh;
406 transaction_t *transaction = handle->h_transaction;
407 journal_t *journal = transaction->t_journal;
409 char *frozen_buffer = NULL;
414 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
416 JBUFFER_TRACE(jh, "entry");
420 /* @@@ Need to check for errors here at some point. */
423 - * AKPM: neither bdflush nor kupdate run with the BKL. There's
424 - * nothing we can do to prevent them from starting writeout of a
425 - * BUF_DIRTY buffer at any time. And checkpointing buffers are on
426 - * BUF_DIRTY. So. We no longer assert that the buffer is unlocked.
428 - * However. It is very wrong for us to allow ext3 to start directly
429 - * altering the ->b_data of buffers which may at that very time be
430 - * undergoing writeout to the client filesystem. This can leave
431 - * the filesystem in an inconsistent, transient state if we crash.
432 - * So what we do is to steal the buffer if it is in checkpoint
433 - * mode and dirty. The journal lock will keep out checkpoint-mode
434 - * state transitions within journal_remove_checkpoint() and the buffer
435 - * is locked to keep bdflush/kupdate/whoever away from it as well.
437 * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a
438 * simple lock_journal(). This code here will care for locked buffers.
441 - * The buffer_locked() || buffer_dirty() tests here are simply an
442 - * optimisation tweak. If anyone else in the system decides to
443 - * lock this buffer later on, we'll blow up. There doesn't seem
444 - * to be a good reason why they should do this.
446 - if (jh->b_cp_transaction &&
447 - (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) {
448 + locked = test_and_set_bit(BH_Lock, &bh->b_state);
450 + /* We can't reliably test the buffer state if we found
451 + * it already locked, so just wait for the lock and
453 unlock_journal(journal);
454 - lock_buffer(jh2bh(jh));
455 - spin_lock(&journal_datalist_lock);
456 - if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) {
457 - /* OK, we need to steal it */
458 - JBUFFER_TRACE(jh, "stealing from checkpoint mode");
459 - J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
460 - J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
462 - J_ASSERT(handle->h_buffer_credits > 0);
463 - handle->h_buffer_credits--;
465 - /* This will clear BH_Dirty and set BH_JBDDirty. */
466 - JBUFFER_TRACE(jh, "file as BJ_Reserved");
467 - __journal_file_buffer(jh, transaction, BJ_Reserved);
469 - /* And pull it off BUF_DIRTY, onto BUF_CLEAN */
470 - refile_buffer(jh2bh(jh));
471 + __wait_on_buffer(bh);
472 + lock_journal(journal);
476 + /* We now hold the buffer lock so it is safe to query the buffer
477 + * state. Is the buffer dirty?
479 + * If so, there are two possibilities. The buffer may be
480 + * non-journaled, and undergoing a quite legitimate writeback.
481 + * Otherwise, it is journaled, and we don't expect dirty buffers
482 + * in that state (the buffers should be marked BH_JBDDirty
483 + * instead.) So either the IO is being done under our own
484 + * control and this is a bug, or it's a third party IO such as
485 + * dump(8) (which may leave the buffer scheduled for read ---
486 + * ie. locked but not dirty) or tune2fs (which may actually have
487 + * the buffer dirtied, ugh.) */
490 - * The buffer is now hidden from bdflush. It is
491 - * metadata against the current transaction.
493 - JBUFFER_TRACE(jh, "steal from cp mode is complete");
494 + if (buffer_dirty(bh)) {
495 + spin_lock(&journal_datalist_lock);
496 + /* First question: is this buffer already part of the
497 + * current transaction or the existing committing
499 + if (jh->b_transaction) {
500 + J_ASSERT_JH(jh, jh->b_transaction == transaction ||
501 + jh->b_transaction == journal->j_committing_transaction);
502 + if (jh->b_next_transaction)
503 + J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
504 + JBUFFER_TRACE(jh, "Unexpected dirty buffer");
505 + jbd_unexpected_dirty_buffer(jh);
507 spin_unlock(&journal_datalist_lock);
508 - unlock_buffer(jh2bh(jh));
509 - lock_journal(journal);
513 - J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh)));
517 if (is_handle_aborted(handle))
518 @@ -1926,6 +1917,7 @@
519 transaction_t *transaction, int jlist)
521 struct journal_head **list = 0;
524 assert_spin_locked(&journal_datalist_lock);
526 @@ -1936,13 +1928,24 @@
527 J_ASSERT_JH(jh, jh->b_transaction == transaction ||
528 jh->b_transaction == 0);
530 - if (jh->b_transaction) {
531 - if (jh->b_jlist == jlist)
533 + if (jh->b_transaction && jh->b_jlist == jlist)
536 + /* The following list of buffer states needs to be consistent
537 + * with __jbd_unexpected_dirty_buffer()'s handling of dirty
540 + if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
541 + jlist == BJ_Shadow || jlist == BJ_Forget) {
542 + if (atomic_set_buffer_clean(jh2bh(jh)) ||
543 + test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state))
547 + if (jh->b_transaction)
548 __journal_unfile_buffer(jh);
551 jh->b_transaction = transaction;
556 @@ -1979,12 +1982,8 @@
557 __blist_add_buffer(list, jh);
560 - if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
561 - jlist == BJ_Shadow || jlist == BJ_Forget) {
562 - if (atomic_set_buffer_clean(jh2bh(jh))) {
563 - set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
567 + set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
570 void journal_file_buffer(struct journal_head *jh,
571 diff -urN linux.org/include/linux/ext3_fs.h linux/include/linux/ext3_fs.h
572 --- linux.org/include/linux/ext3_fs.h Fri May 17 18:26:25 2002
573 +++ linux/include/linux/ext3_fs.h Fri May 17 18:30:04 2002
576 * The second extended file system version
578 -#define EXT3FS_DATE "10 Jan 2002"
579 -#define EXT3FS_VERSION "2.4-0.9.17"
580 +#define EXT3FS_DATE "14 May 2002"
581 +#define EXT3FS_VERSION "2.4-0.9.18"
585 diff -urN linux.org/include/linux/jbd.h linux/include/linux/jbd.h
586 --- linux.org/include/linux/jbd.h Mon Feb 25 20:38:13 2002
587 +++ linux/include/linux/jbd.h Fri May 17 18:29:38 2002
590 #define journal_oom_retry 1
593 + * Define JBD_PARANOID_WRITES to cause a kernel BUG() check if ext3
594 + * finds a buffer unexpectedly dirty. This is useful for debugging, but
595 + * can cause spurious kernel panics if there are applications such as
596 + * tune2fs modifying our buffer_heads behind our backs.
598 +#undef JBD_PARANOID_WRITES
600 #ifdef CONFIG_JBD_DEBUG
602 * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal
607 +extern void __jbd_unexpected_dirty_buffer(char *, int, struct journal_head *);
608 +#define jbd_unexpected_dirty_buffer(jh) \
609 + __jbd_unexpected_dirty_buffer(__FUNCTION__, __LINE__, (jh))
614 diff -urN linux.org/kernel/ksyms.c linux/kernel/ksyms.c
615 --- linux.org/kernel/ksyms.c Fri May 17 18:37:07 2002
616 +++ linux/kernel/ksyms.c Fri May 17 20:38:05 2002
618 EXPORT_SYMBOL(insert_inode_hash);
619 EXPORT_SYMBOL(remove_inode_hash);
620 EXPORT_SYMBOL(buffer_insert_inode_queue);
621 +EXPORT_SYMBOL(buffer_insert_inode_data_queue);
622 EXPORT_SYMBOL(make_bad_inode);
623 EXPORT_SYMBOL(is_bad_inode);
624 EXPORT_SYMBOL(event);