diff -urN linux.org/fs/buffer.c linux/fs/buffer.c --- linux.org/fs/buffer.c Fri May 17 18:26:25 2002 +++ linux/fs/buffer.c Fri May 17 18:29:33 2002 @@ -1815,9 +1815,14 @@ } /* Stage 3: start the IO */ - for (i = 0; i < nr; i++) - submit_bh(READ, arr[i]); - + for (i = 0; i < nr; i++) { + struct buffer_head * bh = arr[i]; + if (buffer_uptodate(bh)) + end_buffer_io_async(bh, 1); + else + submit_bh(READ, bh); + } + return 0; } diff -urN linux.org/fs/ext3/balloc.c linux/fs/ext3/balloc.c --- linux.org/fs/ext3/balloc.c Fri May 17 18:26:25 2002 +++ linux/fs/ext3/balloc.c Fri May 17 18:30:16 2002 @@ -542,6 +542,7 @@ int i, j, k, tmp, alloctmp; int bitmap_nr; int fatal = 0, err; + int performed_allocation = 0; struct super_block * sb; struct ext3_group_desc * gdp; struct ext3_super_block * es; @@ -644,8 +645,7 @@ } /* No space left on the device */ - unlock_super (sb); - return 0; + goto out; search_back: /* @@ -694,6 +694,7 @@ J_ASSERT_BH(bh, !ext3_test_bit(j, bh->b_data)); BUFFER_TRACE(bh, "setting bitmap bit"); ext3_set_bit(j, bh->b_data); + performed_allocation = 1; #ifdef CONFIG_JBD_DEBUG { @@ -815,6 +816,11 @@ ext3_std_error(sb, fatal); } unlock_super (sb); + /* + * Undo the block allocation + */ + if (!performed_allocation) + DQUOT_FREE_BLOCK(inode, 1); return 0; } diff -urN linux.org/fs/ext3/file.c linux/fs/ext3/file.c --- linux.org/fs/ext3/file.c Fri May 17 18:26:25 2002 +++ linux/fs/ext3/file.c Fri May 17 18:30:00 2002 @@ -61,19 +61,53 @@ static ssize_t ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) { + int ret, err; struct inode *inode = file->f_dentry->d_inode; + extern kdev_t TRACE_DEV; - /* - * Nasty: if the file is subject to synchronous writes then we need - * to force generic_osync_inode() to call ext3_write_inode(). - * We do that by marking the inode dirty. This adds much more - * computational expense than we need, but we're going to sync - * anyway. - */ - if (IS_SYNC(inode) || (file->f_flags & O_SYNC)) - mark_inode_dirty(inode); + ret = generic_file_write(file, buf, count, ppos); - return generic_file_write(file, buf, count, ppos); + /* Skip file flushing code if there was an error, or if nothing + was written. */ + if (ret <= 0) + return ret; + + /* If the inode is IS_SYNC, or is O_SYNC and we are doing + data-journaling, then we need to make sure that we force the + transaction to disk to keep all metadata uptodate + synchronously. */ + + if (file->f_flags & O_SYNC) { + /* If we are non-data-journaled, then the dirty data has + already been flushed to backing store by + generic_osync_inode, and the inode has been flushed + too if there have been any modifications other than + mere timestamp updates. + + Open question --- do we care about flushing + timestamps too if the inode is IS_SYNC? */ + if (!ext3_should_journal_data(inode)) + return ret; + + goto force_commit; + } + + /* So we know that there has been no forced data flush. If the + inode is marked IS_SYNC, we need to force one ourselves. */ + if (!IS_SYNC(inode)) + return ret; + + /* Open question #2 --- should we force data to disk here too? + If we don't, the only impact is that data=writeback + filesystems won't flush data to disk automatically on + IS_SYNC, only metadata (but historically, that is what ext2 + has done.) */ + +force_commit: + err = ext3_force_commit(inode->i_sb); + if (err) + return err; + return ret; } struct file_operations ext3_file_operations = { diff -urN linux.org/fs/ext3/fsync.c linux/fs/ext3/fsync.c --- linux.org/fs/ext3/fsync.c Fri May 17 18:26:25 2002 +++ linux/fs/ext3/fsync.c Fri May 17 18:29:20 2002 @@ -62,7 +62,12 @@ * we'll end up waiting on them in commit. */ ret = fsync_inode_buffers(inode); - ret |= fsync_inode_data_buffers(inode); + + /* In writeback node, we need to force out data buffers too. In + * the other modes, ext3_force_commit takes care of forcing out + * just the right data blocks. */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) + ret |= fsync_inode_data_buffers(inode); ext3_force_commit(inode->i_sb); diff -urN linux.org/fs/ext3/ialloc.c linux/fs/ext3/ialloc.c --- linux.org/fs/ext3/ialloc.c Fri May 17 18:26:25 2002 +++ linux/fs/ext3/ialloc.c Fri May 17 18:29:25 2002 @@ -392,7 +392,7 @@ err = -ENOSPC; if (!gdp) - goto fail; + goto out; err = -EIO; bitmap_nr = load_inode_bitmap (sb, i); @@ -523,9 +523,10 @@ return inode; fail: + ext3_std_error(sb, err); +out: unlock_super(sb); iput(inode); - ext3_std_error(sb, err); return ERR_PTR(err); } diff -urN linux.org/fs/ext3/inode.c linux/fs/ext3/inode.c --- linux.org/fs/ext3/inode.c Fri May 17 18:26:25 2002 +++ linux/fs/ext3/inode.c Fri May 17 18:30:26 2002 @@ -412,6 +412,7 @@ return NULL; changed: + brelse(bh); *err = -EAGAIN; goto no_block; failure: @@ -581,8 +582,6 @@ parent = nr; } - if (IS_SYNC(inode)) - handle->h_sync = 1; } if (n == num) return 0; @@ -950,11 +949,13 @@ } static int walk_page_buffers( handle_t *handle, + struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)( handle_t *handle, + struct inode *inode, struct buffer_head *bh)) { struct buffer_head *bh; @@ -972,7 +973,7 @@ *partial = 1; continue; } - err = (*fn)(handle, bh); + err = (*fn)(handle, inode, bh); if (!ret) ret = err; } @@ -1005,7 +1006,7 @@ * write. */ -static int do_journal_get_write_access(handle_t *handle, +static int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { return ext3_journal_get_write_access(handle, bh); @@ -1031,7 +1032,7 @@ goto prepare_write_failed; if (ext3_should_journal_data(inode)) { - ret = walk_page_buffers(handle, page->buffers, + ret = walk_page_buffers(handle, inode, page->buffers, from, to, NULL, do_journal_get_write_access); if (ret) { /* @@ -1052,24 +1053,32 @@ return ret; } -static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh) +static int journal_dirty_sync_data(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { - return ext3_journal_dirty_data(handle, bh, 0); + int ret = ext3_journal_dirty_data(handle, bh, 0); + if (bh->b_inode != inode) + buffer_insert_inode_data_queue(bh, inode); + return ret; } /* * For ext3_writepage(). We also brelse() the buffer to account for * the bget() which ext3_writepage() performs. */ -static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh) +static int journal_dirty_async_data(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { int ret = ext3_journal_dirty_data(handle, bh, 1); + if (bh->b_inode != inode) + buffer_insert_inode_data_queue(bh, inode); __brelse(bh); return ret; } /* For commit_write() in data=journal mode */ -static int commit_write_fn(handle_t *handle, struct buffer_head *bh) +static int commit_write_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { set_bit(BH_Uptodate, &bh->b_state); return ext3_journal_dirty_metadata(handle, bh); @@ -1104,7 +1113,7 @@ int partial = 0; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - ret = walk_page_buffers(handle, page->buffers, + ret = walk_page_buffers(handle, inode, page->buffers, from, to, &partial, commit_write_fn); if (!partial) SetPageUptodate(page); @@ -1114,7 +1123,7 @@ EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; } else { if (ext3_should_order_data(inode)) { - ret = walk_page_buffers(handle, page->buffers, + ret = walk_page_buffers(handle, inode, page->buffers, from, to, NULL, journal_dirty_sync_data); } /* Be careful here if generic_commit_write becomes a @@ -1196,7 +1205,8 @@ return generic_block_bmap(mapping,block,ext3_get_block); } -static int bget_one(handle_t *handle, struct buffer_head *bh) +static int bget_one(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { atomic_inc(&bh->b_count); return 0; @@ -1295,7 +1305,7 @@ create_empty_buffers(page, inode->i_dev, inode->i_sb->s_blocksize); page_buffers = page->buffers; - walk_page_buffers(handle, page_buffers, 0, + walk_page_buffers(handle, inode, page_buffers, 0, PAGE_CACHE_SIZE, NULL, bget_one); } @@ -1313,7 +1323,7 @@ /* And attach them to the current transaction */ if (order_data) { - err = walk_page_buffers(handle, page_buffers, + err = walk_page_buffers(handle, inode, page_buffers, 0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data); if (!ret) ret = err; diff -urN linux.org/fs/ext3/super.c linux/fs/ext3/super.c --- linux.org/fs/ext3/super.c Fri May 17 18:26:25 2002 +++ linux/fs/ext3/super.c Fri May 17 18:29:29 2002 @@ -1589,8 +1589,10 @@ journal_t *journal = EXT3_SB(sb)->s_journal; /* Now we set up the journal barrier. */ + unlock_super(sb); journal_lock_updates(journal); journal_flush(journal); + lock_super(sb); /* Journal blocked and flushed, clear needs_recovery flag. */ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); diff -urN linux.org/fs/jbd/journal.c linux/fs/jbd/journal.c --- linux.org/fs/jbd/journal.c Fri May 17 18:26:25 2002 +++ linux/fs/jbd/journal.c Fri May 17 18:30:21 2002 @@ -204,6 +204,7 @@ lock_kernel(); daemonize(); + reparent_to_init(); spin_lock_irq(¤t->sigmask_lock); sigfillset(¤t->blocked); recalc_sigpending(current); @@ -267,6 +268,7 @@ journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); + unlock_kernel(); jbd_debug(1, "Journal thread exiting.\n"); return 0; } @@ -1486,6 +1488,49 @@ unlock_journal(journal); } + +/* + * Report any unexpected dirty buffers which turn up. Normally those + * indicate an error, but they can occur if the user is running (say) + * tune2fs to modify the live filesystem, so we need the option of + * continuing as gracefully as possible. # + * + * The caller should already hold the journal lock and + * journal_datalist_lock spinlock: most callers will need those anyway + * in order to probe the buffer's journaling state safely. + */ +void __jbd_unexpected_dirty_buffer(char *function, int line, + struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + int jlist; + + if (buffer_dirty(bh)) { + printk ("%sUnexpected dirty buffer encountered at " + "%s:%d (%s blocknr %lu)\n", + KERN_WARNING, function, line, + kdevname(bh->b_dev), bh->b_blocknr); +#ifdef JBD_PARANOID_WRITES + J_ASSERT (!buffer_dirty(bh)); +#endif + + /* If this buffer is one which might reasonably be dirty + * --- ie. data, or not part of this journal --- then + * we're OK to leave it alone, but otherwise we need to + * move the dirty bit to the journal's own internal + * JBDDirty bit. */ + jlist = jh->b_jlist; + + if (jlist == BJ_Metadata || jlist == BJ_Reserved || + jlist == BJ_Shadow || jlist == BJ_Forget) { + if (atomic_set_buffer_clean(jh2bh(jh))) { + set_bit(BH_JBDDirty, &jh2bh(jh)->b_state); + } + } + } +} + + int journal_blocks_per_page(struct inode *inode) { return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); diff -urN linux.org/fs/jbd/transaction.c linux/fs/jbd/transaction.c --- linux.org/fs/jbd/transaction.c Fri May 17 18:26:25 2002 +++ linux/fs/jbd/transaction.c Fri May 17 18:29:51 2002 @@ -539,76 +539,67 @@ static int do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) { + struct buffer_head *bh; transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; int error; char *frozen_buffer = NULL; int need_copy = 0; - + int locked; + jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); JBUFFER_TRACE(jh, "entry"); repeat: + bh = jh2bh(jh); + /* @@@ Need to check for errors here at some point. */ /* - * AKPM: neither bdflush nor kupdate run with the BKL. There's - * nothing we can do to prevent them from starting writeout of a - * BUF_DIRTY buffer at any time. And checkpointing buffers are on - * BUF_DIRTY. So. We no longer assert that the buffer is unlocked. - * - * However. It is very wrong for us to allow ext3 to start directly - * altering the ->b_data of buffers which may at that very time be - * undergoing writeout to the client filesystem. This can leave - * the filesystem in an inconsistent, transient state if we crash. - * So what we do is to steal the buffer if it is in checkpoint - * mode and dirty. The journal lock will keep out checkpoint-mode - * state transitions within journal_remove_checkpoint() and the buffer - * is locked to keep bdflush/kupdate/whoever away from it as well. - * * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a * simple lock_journal(). This code here will care for locked buffers. */ - /* - * The buffer_locked() || buffer_dirty() tests here are simply an - * optimisation tweak. If anyone else in the system decides to - * lock this buffer later on, we'll blow up. There doesn't seem - * to be a good reason why they should do this. - */ - if (jh->b_cp_transaction && - (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) { + locked = test_and_set_bit(BH_Lock, &bh->b_state); + if (locked) { + /* We can't reliably test the buffer state if we found + * it already locked, so just wait for the lock and + * retry. */ unlock_journal(journal); - lock_buffer(jh2bh(jh)); - spin_lock(&journal_datalist_lock); - if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) { - /* OK, we need to steal it */ - JBUFFER_TRACE(jh, "stealing from checkpoint mode"); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, jh->b_frozen_data == NULL); - - J_ASSERT(handle->h_buffer_credits > 0); - handle->h_buffer_credits--; - - /* This will clear BH_Dirty and set BH_JBDDirty. */ - JBUFFER_TRACE(jh, "file as BJ_Reserved"); - __journal_file_buffer(jh, transaction, BJ_Reserved); - - /* And pull it off BUF_DIRTY, onto BUF_CLEAN */ - refile_buffer(jh2bh(jh)); + __wait_on_buffer(bh); + lock_journal(journal); + goto repeat; + } + + /* We now hold the buffer lock so it is safe to query the buffer + * state. Is the buffer dirty? + * + * If so, there are two possibilities. The buffer may be + * non-journaled, and undergoing a quite legitimate writeback. + * Otherwise, it is journaled, and we don't expect dirty buffers + * in that state (the buffers should be marked JBD_Dirty + * instead.) So either the IO is being done under our own + * control and this is a bug, or it's a third party IO such as + * dump(8) (which may leave the buffer scheduled for read --- + * ie. locked but not dirty) or tune2fs (which may actually have + * the buffer dirtied, ugh.) */ - /* - * The buffer is now hidden from bdflush. It is - * metadata against the current transaction. - */ - JBUFFER_TRACE(jh, "steal from cp mode is complete"); + if (buffer_dirty(bh)) { + spin_lock(&journal_datalist_lock); + /* First question: is this buffer already part of the + * current transaction or the existing committing + * transaction? */ + if (jh->b_transaction) { + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_transaction == journal->j_committing_transaction); + if (jh->b_next_transaction) + J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + JBUFFER_TRACE(jh, "Unexpected dirty buffer"); + jbd_unexpected_dirty_buffer(jh); } spin_unlock(&journal_datalist_lock); - unlock_buffer(jh2bh(jh)); - lock_journal(journal); - goto repeat; } - J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh))); + unlock_buffer(bh); error = -EROFS; if (is_handle_aborted(handle)) @@ -1926,6 +1917,7 @@ transaction_t *transaction, int jlist) { struct journal_head **list = 0; + int was_dirty = 0; assert_spin_locked(&journal_datalist_lock); @@ -1936,13 +1928,24 @@ J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_transaction == 0); - if (jh->b_transaction) { - if (jh->b_jlist == jlist) - return; + if (jh->b_transaction && jh->b_jlist == jlist) + return; + + /* The following list of buffer states needs to be consistent + * with __jbd_unexpected_dirty_buffer()'s handling of dirty + * state. */ + + if (jlist == BJ_Metadata || jlist == BJ_Reserved || + jlist == BJ_Shadow || jlist == BJ_Forget) { + if (atomic_set_buffer_clean(jh2bh(jh)) || + test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state)) + was_dirty = 1; + } + + if (jh->b_transaction) __journal_unfile_buffer(jh); - } else { + else jh->b_transaction = transaction; - } switch (jlist) { case BJ_None: @@ -1979,12 +1982,8 @@ __blist_add_buffer(list, jh); jh->b_jlist = jlist; - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - if (atomic_set_buffer_clean(jh2bh(jh))) { - set_bit(BH_JBDDirty, &jh2bh(jh)->b_state); - } - } + if (was_dirty) + set_bit(BH_JBDDirty, &jh2bh(jh)->b_state); } void journal_file_buffer(struct journal_head *jh, diff -urN linux.org/include/linux/ext3_fs.h linux/include/linux/ext3_fs.h --- linux.org/include/linux/ext3_fs.h Fri May 17 18:26:25 2002 +++ linux/include/linux/ext3_fs.h Fri May 17 18:30:04 2002 @@ -36,8 +36,8 @@ /* * The second extended file system version */ -#define EXT3FS_DATE "10 Jan 2002" -#define EXT3FS_VERSION "2.4-0.9.17" +#define EXT3FS_DATE "14 May 2002" +#define EXT3FS_VERSION "2.4-0.9.18" /* * Debug code diff -urN linux.org/include/linux/jbd.h linux/include/linux/jbd.h --- linux.org/include/linux/jbd.h Mon Feb 25 20:38:13 2002 +++ linux/include/linux/jbd.h Fri May 17 18:29:38 2002 @@ -32,6 +32,14 @@ #define journal_oom_retry 1 +/* + * Define JBD_PARANOID_WRITES to cause a kernel BUG() check if ext3 + * finds a buffer unexpectedly dirty. This is useful for debugging, but + * can cause spurious kernel panics if there are applications such as + * tune2fs modifying our buffer_heads behind our backs. + */ +#undef JBD_PARANOID_WRITES + #ifdef CONFIG_JBD_DEBUG /* * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal @@ -730,6 +738,10 @@ schedule(); \ } while (1) +extern void __jbd_unexpected_dirty_buffer(char *, int, struct journal_head *); +#define jbd_unexpected_dirty_buffer(jh) \ + __jbd_unexpected_dirty_buffer(__FUNCTION__, __LINE__, (jh)) + /* * is_journal_abort * diff -urN linux.org/kernel/ksyms.c linux/kernel/ksyms.c --- linux.org/kernel/ksyms.c Fri May 17 18:37:07 2002 +++ linux/kernel/ksyms.c Fri May 17 20:38:05 2002 @@ -536,6 +536,7 @@ EXPORT_SYMBOL(insert_inode_hash); EXPORT_SYMBOL(remove_inode_hash); EXPORT_SYMBOL(buffer_insert_inode_queue); +EXPORT_SYMBOL(buffer_insert_inode_data_queue); EXPORT_SYMBOL(make_bad_inode); EXPORT_SYMBOL(is_bad_inode); EXPORT_SYMBOL(event);