2 ===================================================================
3 RCS file: /cvsroot/gkernel/ext3/fs/buffer.c,v
4 retrieving revision 1.8.2.16.2.11
5 retrieving revision 1.36.2.22
6 diff -u -r1.8.2.16.2.11 -r1.36.2.22
7 --- fs/buffer.c 9 May 2002 15:54:51 -0000 1.8.2.16.2.11
8 +++ fs/buffer.c 9 May 2002 16:10:09 -0000 1.36.2.22
12 /* Stage 3: start the IO */
13 - for (i = 0; i < nr; i++)
14 - submit_bh(READ, arr[i]);
16 + for (i = 0; i < nr; i++) {
17 + struct buffer_head * bh = arr[i];
18 + if (buffer_uptodate(bh))
19 + end_buffer_io_async(bh, 1);
21 + submit_bh(READ, bh);
28 ===================================================================
29 RCS file: /cvsroot/gkernel/ext3/fs/ext3/file.c,v
30 retrieving revision 1.11.2.3
31 retrieving revision 1.27.2.3
32 diff -u -r1.11.2.3 -r1.27.2.3
33 --- fs/ext3/file.c 16 Nov 2001 14:35:14 -0000 1.11.2.3
34 +++ fs/ext3/file.c 14 May 2002 15:14:27 -0000 1.27.2.3
37 ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
40 struct inode *inode = file->f_dentry->d_inode;
41 + extern kdev_t TRACE_DEV;
44 - * Nasty: if the file is subject to synchronous writes then we need
45 - * to force generic_osync_inode() to call ext3_write_inode().
46 - * We do that by marking the inode dirty. This adds much more
47 - * computational expense than we need, but we're going to sync
50 - if (IS_SYNC(inode) || (file->f_flags & O_SYNC))
51 - mark_inode_dirty(inode);
52 + ret = generic_file_write(file, buf, count, ppos);
54 - return generic_file_write(file, buf, count, ppos);
55 + /* Skip file flushing code if there was an error, or if nothing
60 + /* If the inode is IS_SYNC, or is O_SYNC and we are doing
61 + data-journaling, then we need to make sure that we force the
62 + transaction to disk to keep all metadata uptodate
65 + if (file->f_flags & O_SYNC) {
66 + /* If we are non-data-journaled, then the dirty data has
67 + already been flushed to backing store by
68 + generic_osync_inode, and the inode has been flushed
69 + too if there have been any modifications other than
70 + mere timestamp updates.
72 + Open question --- do we care about flushing
73 + timestamps too if the inode is IS_SYNC? */
74 + if (!ext3_should_journal_data(inode))
80 + /* So we know that there has been no forced data flush. If the
81 + inode is marked IS_SYNC, we need to force one ourselves. */
82 + if (!IS_SYNC(inode))
85 + /* Open question #2 --- should we force data to disk here too?
86 + If we don't, the only impact is that data=writeback
87 + filesystems won't flush data to disk automatically on
88 + IS_SYNC, only metadata (but historically, that is what ext2
92 + err = ext3_force_commit(inode->i_sb);
98 struct file_operations ext3_file_operations = {
99 Index: fs/ext3/fsync.c
100 ===================================================================
101 RCS file: /cvsroot/gkernel/ext3/fs/ext3/fsync.c,v
102 retrieving revision 1.2.2.3
103 retrieving revision 1.7.2.2
104 diff -u -r1.2.2.3 -r1.7.2.2
105 --- fs/ext3/fsync.c 21 Nov 2001 07:49:08 -0000 1.2.2.3
106 +++ fs/ext3/fsync.c 10 Apr 2002 17:00:50 -0000 1.7.2.2
108 * we'll end up waiting on them in commit.
110 ret = fsync_inode_buffers(inode);
111 - ret |= fsync_inode_data_buffers(inode);
113 + /* In writeback mode, we need to force out data buffers too. In
114 + * the other modes, ext3_force_commit takes care of forcing out
115 + * just the right data blocks. */
116 + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
117 + ret |= fsync_inode_data_buffers(inode);
119 ext3_force_commit(inode->i_sb);
121 Index: fs/ext3/ialloc.c
122 ===================================================================
123 RCS file: /cvsroot/gkernel/ext3/fs/ext3/ialloc.c,v
124 retrieving revision 1.5.2.4
125 retrieving revision 1.19.4.5
126 diff -u -r1.5.2.4 -r1.19.4.5
127 --- fs/ext3/ialloc.c 13 Mar 2002 11:53:43 -0000 1.5.2.4
128 +++ fs/ext3/ialloc.c 10 Apr 2002 17:02:19 -0000 1.19.4.5
137 bitmap_nr = load_inode_bitmap (sb, i);
142 + ext3_std_error(sb, err);
146 - ext3_std_error(sb, err);
150 Index: fs/ext3/inode.c
151 ===================================================================
152 RCS file: /cvsroot/gkernel/ext3/fs/ext3/inode.c,v
153 retrieving revision 1.12.2.6
154 retrieving revision 1.64.2.22
155 diff -u -r1.12.2.6 -r1.64.2.22
156 --- fs/ext3/inode.c 9 May 2002 15:54:51 -0000 1.12.2.6
157 +++ fs/ext3/inode.c 13 May 2002 17:10:02 -0000 1.64.2.22
166 @@ -948,11 +949,13 @@
169 static int walk_page_buffers( handle_t *handle,
170 + struct inode *inode,
171 struct buffer_head *head,
175 int (*fn)( handle_t *handle,
176 + struct inode *inode,
177 struct buffer_head *bh))
179 struct buffer_head *bh;
184 - err = (*fn)(handle, bh);
185 + err = (*fn)(handle, inode, bh);
189 @@ -1003,7 +1006,7 @@
193 -static int do_journal_get_write_access(handle_t *handle,
194 +static int do_journal_get_write_access(handle_t *handle, struct inode *inode,
195 struct buffer_head *bh)
197 return ext3_journal_get_write_access(handle, bh);
198 @@ -1029,7 +1032,7 @@
199 goto prepare_write_failed;
201 if (ext3_should_journal_data(inode)) {
202 - ret = walk_page_buffers(handle, page->buffers,
203 + ret = walk_page_buffers(handle, inode, page->buffers,
204 from, to, NULL, do_journal_get_write_access);
207 @@ -1050,24 +1053,32 @@
211 -static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh)
212 +static int journal_dirty_sync_data(handle_t *handle, struct inode *inode,
213 + struct buffer_head *bh)
215 - return ext3_journal_dirty_data(handle, bh, 0);
216 + int ret = ext3_journal_dirty_data(handle, bh, 0);
217 + if (bh->b_inode != inode)
218 + buffer_insert_inode_data_queue(bh, inode);
223 * For ext3_writepage(). We also brelse() the buffer to account for
224 * the bget() which ext3_writepage() performs.
226 -static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh)
227 +static int journal_dirty_async_data(handle_t *handle, struct inode *inode,
228 + struct buffer_head *bh)
230 int ret = ext3_journal_dirty_data(handle, bh, 1);
231 + if (bh->b_inode != inode)
232 + buffer_insert_inode_data_queue(bh, inode);
237 /* For commit_write() in data=journal mode */
238 -static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
239 +static int commit_write_fn(handle_t *handle, struct inode *inode,
240 + struct buffer_head *bh)
242 set_bit(BH_Uptodate, &bh->b_state);
243 return ext3_journal_dirty_metadata(handle, bh);
244 @@ -1102,7 +1113,7 @@
246 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
248 - ret = walk_page_buffers(handle, page->buffers,
249 + ret = walk_page_buffers(handle, inode, page->buffers,
250 from, to, &partial, commit_write_fn);
252 SetPageUptodate(page);
253 @@ -1112,7 +1123,7 @@
254 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
256 if (ext3_should_order_data(inode)) {
257 - ret = walk_page_buffers(handle, page->buffers,
258 + ret = walk_page_buffers(handle, inode, page->buffers,
259 from, to, NULL, journal_dirty_sync_data);
261 /* Be careful here if generic_commit_write becomes a
262 @@ -1194,7 +1205,8 @@
263 return generic_block_bmap(mapping,block,ext3_get_block);
266 -static int bget_one(handle_t *handle, struct buffer_head *bh)
267 +static int bget_one(handle_t *handle, struct inode *inode,
268 + struct buffer_head *bh)
270 atomic_inc(&bh->b_count);
272 @@ -1293,7 +1305,7 @@
273 create_empty_buffers(page,
274 inode->i_dev, inode->i_sb->s_blocksize);
275 page_buffers = page->buffers;
276 - walk_page_buffers(handle, page_buffers, 0,
277 + walk_page_buffers(handle, inode, page_buffers, 0,
278 PAGE_CACHE_SIZE, NULL, bget_one);
281 @@ -1311,7 +1323,7 @@
283 /* And attach them to the current transaction */
285 - err = walk_page_buffers(handle, page_buffers,
286 + err = walk_page_buffers(handle, inode, page_buffers,
287 0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
290 Index: fs/ext3/super.c
291 ===================================================================
292 RCS file: /cvsroot/gkernel/ext3/fs/ext3/super.c,v
293 retrieving revision 1.12.2.5
294 retrieving revision 1.34.2.21
295 diff -u -r1.12.2.5 -r1.34.2.21
296 --- fs/ext3/super.c 13 Mar 2002 11:53:43 -0000 1.12.2.5
297 +++ fs/ext3/super.c 15 Apr 2002 20:34:54 -0000 1.34.2.21
298 @@ -1589,8 +1589,10 @@
299 journal_t *journal = EXT3_SB(sb)->s_journal;
301 /* Now we set up the journal barrier. */
303 journal_lock_updates(journal);
304 journal_flush(journal);
307 /* Journal blocked and flushed, clear needs_recovery flag. */
308 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
309 Index: fs/jbd/journal.c
310 ===================================================================
311 RCS file: /cvsroot/gkernel/ext3/fs/jbd/journal.c,v
312 retrieving revision 1.11.2.5
313 retrieving revision 1.49.2.11
314 diff -u -r1.11.2.5 -r1.49.2.11
315 --- fs/jbd/journal.c 9 Apr 2002 17:30:41 -0000 1.11.2.5
316 +++ fs/jbd/journal.c 9 May 2002 15:05:59 -0000 1.49.2.11
317 @@ -1488,6 +1488,49 @@
318 unlock_journal(journal);
323 + * Report any unexpected dirty buffers which turn up. Normally those
324 + * indicate an error, but they can occur if the user is running (say)
325 + * tune2fs to modify the live filesystem, so we need the option of
326 + * continuing as gracefully as possible.
328 + * The caller should already hold the journal lock and
329 + * journal_datalist_lock spinlock: most callers will need those anyway
330 + * in order to probe the buffer's journaling state safely.
332 +void __jbd_unexpected_dirty_buffer(char *function, int line,
333 + struct journal_head *jh)
335 + struct buffer_head *bh = jh2bh(jh);
338 + if (buffer_dirty(bh)) {
339 + printk ("%sUnexpected dirty buffer encountered at "
340 + "%s:%d (%s blocknr %lu)\n",
341 + KERN_WARNING, function, line,
342 + kdevname(bh->b_dev), bh->b_blocknr);
343 +#ifdef JBD_PARANOID_WRITES
344 + J_ASSERT (!buffer_dirty(bh));
347 + /* If this buffer is one which might reasonably be dirty
348 + * --- ie. data, or not part of this journal --- then
349 + * we're OK to leave it alone, but otherwise we need to
350 + * move the dirty bit to the journal's own internal
352 + jlist = jh->b_jlist;
354 + if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
355 + jlist == BJ_Shadow || jlist == BJ_Forget) {
356 + if (atomic_set_buffer_clean(jh2bh(jh))) {
357 + set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
364 int journal_blocks_per_page(struct inode *inode)
366 return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
367 Index: fs/jbd/transaction.c
368 ===================================================================
369 RCS file: /cvsroot/gkernel/ext3/fs/jbd/transaction.c,v
370 retrieving revision 1.14.2.4
371 retrieving revision 1.64.2.9
372 diff -u -r1.14.2.4 -r1.64.2.9
373 --- fs/jbd/transaction.c 23 Jan 2002 07:26:47 -0000 1.14.2.4
374 +++ fs/jbd/transaction.c 9 May 2002 15:16:34 -0000 1.64.2.9
375 @@ -539,76 +539,67 @@
377 do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy)
379 + struct buffer_head *bh;
380 transaction_t *transaction = handle->h_transaction;
381 journal_t *journal = transaction->t_journal;
383 char *frozen_buffer = NULL;
388 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
390 JBUFFER_TRACE(jh, "entry");
394 /* @@@ Need to check for errors here at some point. */
397 - * AKPM: neither bdflush nor kupdate run with the BKL. There's
398 - * nothing we can do to prevent them from starting writeout of a
399 - * BUF_DIRTY buffer at any time. And checkpointing buffers are on
400 - * BUF_DIRTY. So. We no longer assert that the buffer is unlocked.
402 - * However. It is very wrong for us to allow ext3 to start directly
403 - * altering the ->b_data of buffers which may at that very time be
404 - * undergoing writeout to the client filesystem. This can leave
405 - * the filesystem in an inconsistent, transient state if we crash.
406 - * So what we do is to steal the buffer if it is in checkpoint
407 - * mode and dirty. The journal lock will keep out checkpoint-mode
408 - * state transitions within journal_remove_checkpoint() and the buffer
409 - * is locked to keep bdflush/kupdate/whoever away from it as well.
411 * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a
412 * simple lock_journal(). This code here will care for locked buffers.
415 - * The buffer_locked() || buffer_dirty() tests here are simply an
416 - * optimisation tweak. If anyone else in the system decides to
417 - * lock this buffer later on, we'll blow up. There doesn't seem
418 - * to be a good reason why they should do this.
420 - if (jh->b_cp_transaction &&
421 - (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) {
422 + locked = test_and_set_bit(BH_Lock, &bh->b_state);
424 + /* We can't reliably test the buffer state if we found
425 + * it already locked, so just wait for the lock and
427 unlock_journal(journal);
428 - lock_buffer(jh2bh(jh));
429 - spin_lock(&journal_datalist_lock);
430 - if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) {
431 - /* OK, we need to steal it */
432 - JBUFFER_TRACE(jh, "stealing from checkpoint mode");
433 - J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
434 - J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
436 - J_ASSERT(handle->h_buffer_credits > 0);
437 - handle->h_buffer_credits--;
439 - /* This will clear BH_Dirty and set BH_JBDDirty. */
440 - JBUFFER_TRACE(jh, "file as BJ_Reserved");
441 - __journal_file_buffer(jh, transaction, BJ_Reserved);
443 - /* And pull it off BUF_DIRTY, onto BUF_CLEAN */
444 - refile_buffer(jh2bh(jh));
445 + __wait_on_buffer(bh);
446 + lock_journal(journal);
450 + /* We now hold the buffer lock so it is safe to query the buffer
451 + * state. Is the buffer dirty?
453 + * If so, there are two possibilities. The buffer may be
454 + * non-journaled, and undergoing a quite legitimate writeback.
455 + * Otherwise, it is journaled, and we don't expect dirty buffers
456 + * in that state (the buffers should be marked BH_JBDDirty
457 + * instead.) So either the IO is being done under our own
458 + * control and this is a bug, or it's a third party IO such as
459 + * dump(8) (which may leave the buffer scheduled for read ---
460 + * ie. locked but not dirty) or tune2fs (which may actually have
461 + * the buffer dirtied, ugh.) */
464 - * The buffer is now hidden from bdflush. It is
465 - * metadata against the current transaction.
467 - JBUFFER_TRACE(jh, "steal from cp mode is complete");
468 + if (buffer_dirty(bh)) {
469 + spin_lock(&journal_datalist_lock);
470 + /* First question: is this buffer already part of the
471 + * current transaction or the existing committing
473 + if (jh->b_transaction) {
474 + J_ASSERT_JH(jh, jh->b_transaction == transaction ||
475 + jh->b_transaction == journal->j_committing_transaction);
476 + if (jh->b_next_transaction)
477 + J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
478 + JBUFFER_TRACE(jh, "Unexpected dirty buffer");
479 + jbd_unexpected_dirty_buffer(jh);
481 spin_unlock(&journal_datalist_lock);
482 - unlock_buffer(jh2bh(jh));
483 - lock_journal(journal);
487 - J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh)));
491 if (is_handle_aborted(handle))
492 @@ -1926,6 +1917,7 @@
493 transaction_t *transaction, int jlist)
495 struct journal_head **list = 0;
498 assert_spin_locked(&journal_datalist_lock);
500 @@ -1936,13 +1928,24 @@
501 J_ASSERT_JH(jh, jh->b_transaction == transaction ||
502 jh->b_transaction == 0);
504 - if (jh->b_transaction) {
505 - if (jh->b_jlist == jlist)
507 + if (jh->b_transaction && jh->b_jlist == jlist)
510 + /* The following list of buffer states needs to be consistent
511 + * with __jbd_unexpected_dirty_buffer()'s handling of dirty
514 + if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
515 + jlist == BJ_Shadow || jlist == BJ_Forget) {
516 + if (atomic_set_buffer_clean(jh2bh(jh)) ||
517 + test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state))
521 + if (jh->b_transaction)
522 __journal_unfile_buffer(jh);
525 jh->b_transaction = transaction;
530 @@ -1979,12 +1982,8 @@
531 __blist_add_buffer(list, jh);
534 - if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
535 - jlist == BJ_Shadow || jlist == BJ_Forget) {
536 - if (atomic_set_buffer_clean(jh2bh(jh))) {
537 - set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
541 + set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
544 void journal_file_buffer(struct journal_head *jh,
545 Index: include/linux/ext3_fs.h
546 ===================================================================
547 RCS file: /cvsroot/gkernel/ext3/include/linux/ext3_fs.h,v
548 retrieving revision 1.22.2.3
549 retrieving revision 1.20.2.17
550 diff -u -r1.22.2.3 -r1.20.2.17
551 --- include/linux/ext3_fs.h 23 Jan 2002 07:26:47 -0000 1.22.2.3
552 +++ include/linux/ext3_fs.h 14 May 2002 15:24:16 -0000 1.20.2.17
555 * The second extended file system version
557 -#define EXT3FS_DATE "10 Jan 2002"
558 -#define EXT3FS_VERSION "2.4-0.9.17"
559 +#define EXT3FS_DATE "14 May 2002"
560 +#define EXT3FS_VERSION "2.4-0.9.18"
564 Index: include/linux/jbd.h
565 ===================================================================
566 RCS file: /cvsroot/gkernel/ext3/include/linux/jbd.h,v
567 retrieving revision 1.38.2.5
568 diff -u -r1.38.2.5 jbd.h
569 --- include/linux/jbd.h 13 Mar 2002 11:53:44 -0000 1.38.2.5
570 +++ include/linux/jbd.h 14 May 2002 15:31:34 -0000
573 #define journal_oom_retry 1
576 + * Define JBD_PARANOID_WRITES to cause a kernel BUG() check if ext3
577 + * finds a buffer unexpectedly dirty. This is useful for debugging, but
578 + * can cause spurious kernel panics if there are applications such as
579 + * tune2fs modifying our buffer_heads behind our backs.
581 +#undef JBD_PARANOID_WRITES
583 #ifdef CONFIG_JBD_DEBUG
585 * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal
590 +extern void __jbd_unexpected_dirty_buffer(char *, int, struct journal_head *);
591 +#define jbd_unexpected_dirty_buffer(jh) \
592 + __jbd_unexpected_dirty_buffer(__FUNCTION__, __LINE__, (jh))