1 diff -urN linux-2.4.22.org/fs/buffer.c linux-2.4.22/fs/buffer.c
2 --- linux-2.4.22.org/fs/buffer.c 2003-11-21 15:08:24.000000000 +0100
3 +++ linux-2.4.22/fs/buffer.c 2003-11-21 15:14:23.000000000 +0100
5 spin_unlock(&lru_list_lock);
8 +void buffer_insert_list_journal_head(struct buffer_head *bh,
9 + struct list_head *list,
12 + spin_lock(&lru_list_lock);
13 + if (buffer_attached(bh))
14 + list_del(&bh->b_inode_buffers);
15 + set_buffer_attached(bh);
16 + list_add(&bh->b_inode_buffers, list);
17 + bh->b_journal_head = journal_head;
18 + spin_unlock(&lru_list_lock);
20 +EXPORT_SYMBOL(buffer_insert_list_journal_head);
23 * The caller must have the lru_list lock before calling the
24 * remove_inode_queue functions.
27 * Called when truncating a buffer on a page completely.
29 -static void discard_buffer(struct buffer_head * bh)
30 +void discard_buffer(struct buffer_head * bh)
32 if (buffer_mapped(bh) || buffer_delay(bh)) {
33 mark_buffer_clean(bh);
34 diff -urN linux-2.4.22.org/fs/inode.c linux-2.4.22/fs/inode.c
35 --- linux-2.4.22.org/fs/inode.c 2003-11-21 15:08:24.000000000 +0100
36 +++ linux-2.4.22/fs/inode.c 2003-11-21 15:14:23.000000000 +0100
41 -static void try_to_sync_unused_inodes(void * arg)
42 +static void try_to_sync_unused_inodes(void)
44 struct super_block * sb;
45 int nr_inodes = inodes_stat.nr_unused;
47 spin_unlock(&inode_lock);
50 -static struct tq_struct unused_inodes_flush_task;
51 +static DECLARE_WAIT_QUEUE_HEAD(kinoded_wait) ;
52 +static atomic_t kinoded_goal = ATOMIC_INIT(0) ;
55 * write_inode_now - write an inode to disk
57 !inode_has_buffers(inode))
58 #define INODE(entry) (list_entry(entry, struct inode, i_list))
60 -void prune_icache(int goal)
61 +static void _prune_icache(int goal)
64 struct list_head *entry, *freeable = &list;
66 spin_unlock(&inode_lock);
68 dispose_list(freeable);
69 + kmem_cache_shrink(inode_cachep);
72 - * If we didn't freed enough clean inodes schedule
73 - * a sync of the dirty inodes, we cannot do it
74 - * from here or we're either synchronously dogslow
75 - * or we deadlock with oom.
76 + * If we didn't free enough clean inodes
80 - schedule_task(&unused_inodes_flush_task);
81 + try_to_sync_unused_inodes();
84 +void prune_icache(int goal) {
85 + atomic_add(goal, &kinoded_goal);
86 + if (atomic_read(&kinoded_goal) > 16) {
87 + wake_up_interruptible(&kinoded_wait);
91 int shrink_icache_memory(int priority, int gfp_mask)
96 - * Nasty deadlock avoidance..
98 - * We may hold various FS locks, and we don't
99 - * want to recurse into the FS that called us
100 - * in clear_inode() and friends..
102 - if (!(gfp_mask & __GFP_FS))
105 count = inodes_stat.nr_unused / priority;
108 - return kmem_cache_shrink(inode_cachep);
113 @@ -1198,6 +1193,34 @@
117 +int kinoded(void *startup) {
119 + struct task_struct *tsk = current;
123 + strcpy(tsk->comm, "kinoded");
125 + /* avoid getting signals */
126 + spin_lock_irq(&tsk->sigmask_lock);
127 + flush_signals(tsk);
128 + sigfillset(&tsk->blocked);
129 + recalc_sigpending(tsk);
130 + spin_unlock_irq(&tsk->sigmask_lock);
132 + printk("kinoded started\n") ;
133 + complete((struct completion *)startup);
135 + wait_event_interruptible(kinoded_wait,
136 + atomic_read(&kinoded_goal));
137 + while((goal = atomic_read(&kinoded_goal))) {
138 + _prune_icache(goal);
139 + atomic_sub(goal, &kinoded_goal);
146 * Initialize the hash tables.
148 @@ -1249,8 +1272,17 @@
151 panic("cannot create inode slab cache");
154 - unused_inodes_flush_task.routine = try_to_sync_unused_inodes;
155 +/* we need to start a thread, and inode_init happens too early for that
156 +** to work. So, add a second init func through module_init
158 +static int __init inode_mod_init(void)
160 + static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
161 + kernel_thread(kinoded, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
162 + wait_for_completion(&startup);
167 @@ -1344,3 +1376,5 @@
172 +module_init(inode_mod_init) ;
173 diff -urN linux-2.4.22.org/fs/reiserfs/bitmap.c linux-2.4.22/fs/reiserfs/bitmap.c
174 --- linux-2.4.22.org/fs/reiserfs/bitmap.c 2003-11-21 15:08:29.000000000 +0100
175 +++ linux-2.4.22/fs/reiserfs/bitmap.c 2003-11-21 15:14:23.000000000 +0100
177 #include <linux/errno.h>
178 #include <linux/locks.h>
179 #include <linux/kernel.h>
180 +#include <linux/quotaops.h>
182 #include <linux/reiserfs_fs.h>
183 #include <linux/reiserfs_fs_sb.h>
187 static void _reiserfs_free_block (struct reiserfs_transaction_handle *th,
189 + struct inode *inode, b_blocknr_t block,
190 + int for_unformatted)
192 struct super_block * s = th->t_super;
193 struct reiserfs_super_block * rs;
197 PROC_INFO_INC( s, free_block );
199 rs = SB_DISK_SUPER_BLOCK (s);
200 sbh = SB_BUFFER_WITH_SB (s);
201 apbi = SB_AP_BITMAP(s);
203 block, bdevname(s->s_dev));
207 reiserfs_prepare_for_journal(s, apbi[nr].bh, 1 ) ;
209 /* clear bit for the given block in bit map */
210 @@ -329,39 +329,55 @@
211 set_sb_free_blocks( rs, sb_free_blocks(rs) + 1 );
213 journal_mark_dirty (th, s, sbh);
214 + if (for_unformatted) {
215 +#ifdef REISERQUOTA_DEBUG
216 + printk(KERN_DEBUG "reiserquota: freeing block id=%u\n", inode->i_uid);
218 + DQUOT_FREE_BLOCK_NODIRTY(inode, 1);
223 void reiserfs_free_block (struct reiserfs_transaction_handle *th,
224 - unsigned long block) {
225 + struct inode *inode, unsigned long block,
226 + int for_unformatted)
228 struct super_block * s = th->t_super;
230 RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
231 RFALSE(is_reusable (s, block, 1) == 0, "vs-4071: can not free such block");
232 /* mark it before we clear it, just in case */
233 journal_mark_freed(th, s, block) ;
234 - _reiserfs_free_block(th, block) ;
235 + _reiserfs_free_block(th, inode, block, for_unformatted) ;
238 /* preallocated blocks don't need to be run through journal_mark_freed */
239 void reiserfs_free_prealloc_block (struct reiserfs_transaction_handle *th,
240 - unsigned long block) {
241 + struct inode *inode,
242 + unsigned long block)
244 RFALSE(!th->t_super, "vs-4060: trying to free block on nonexistent device");
245 RFALSE(is_reusable (th->t_super, block, 1) == 0, "vs-4070: can not free such block");
246 - _reiserfs_free_block(th, block) ;
247 + _reiserfs_free_block(th, inode, block, 1) ;
250 static void __discard_prealloc (struct reiserfs_transaction_handle * th,
251 struct inode * inode)
253 unsigned long save = inode->u.reiserfs_i.i_prealloc_block ;
255 #ifdef CONFIG_REISERFS_CHECK
256 if (inode->u.reiserfs_i.i_prealloc_count < 0)
257 reiserfs_warning(th->t_super, "zam-4001:%s: inode has negative prealloc blocks count.\n", __FUNCTION__ );
259 while (inode->u.reiserfs_i.i_prealloc_count > 0) {
260 - reiserfs_free_prealloc_block(th,inode->u.reiserfs_i.i_prealloc_block);
261 + reiserfs_free_prealloc_block(th, inode, inode->u.reiserfs_i.i_prealloc_block);
262 inode->u.reiserfs_i.i_prealloc_block++;
263 inode->u.reiserfs_i.i_prealloc_count --;
267 + reiserfs_update_sd(th, inode) ;
269 inode->u.reiserfs_i.i_prealloc_block = save ;
270 list_del (&(inode->u.reiserfs_i.i_prealloc_list));
272 if (hint->formatted_node || hint->inode == NULL) {
276 hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id);
277 border = hint->beg + (unsigned long) keyed_hash(((char *) (&hash_in)), 4) % (hint->end - hint->beg - 1);
278 if (border > hint->search_start)
280 int nr_allocated = 0;
282 determine_prealloc_size(hint);
283 + if (!hint->formatted_node) {
285 +#ifdef REISERQUOTA_DEBUG
286 + printk(KERN_DEBUG "reiserquota: allocating %d blocks id=%u\n", amount_needed, hint->inode->i_uid);
288 + quota_ret = DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed);
289 + if (quota_ret) /* Quota exceeded? */
290 + return QUOTA_EXCEEDED;
291 + if (hint->preallocate && hint->prealloc_size ) {
292 +#ifdef REISERQUOTA_DEBUG
293 + printk(KERN_DEBUG "reiserquota: allocating (prealloc) %d blocks id=%u\n", hint->prealloc_size, hint->inode->i_uid);
295 + quota_ret = DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode, hint->prealloc_size);
297 + hint->preallocate=hint->prealloc_size=0;
302 += allocate_without_wrapping_disk(hint, new_blocknrs + nr_allocated, start, finish,
303 amount_needed - nr_allocated, hint->prealloc_size))
306 /* not all blocks were successfully allocated yet*/
307 if (second_pass) { /* it was a second pass; we must free all blocks */
308 + if (!hint->formatted_node) {
309 +#ifdef REISERQUOTA_DEBUG
310 + printk(KERN_DEBUG "reiserquota: freeing (nospace) %d blocks id=%u\n", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid);
312 + DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); /* Free not allocated blocks */
314 while (nr_allocated --)
315 - reiserfs_free_block(hint->th, new_blocknrs[nr_allocated]);
316 + reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node);
318 return NO_DISK_SPACE;
319 } else { /* refine search parameters for next pass */
324 + if ( !hint->formatted_node && amount_needed + hint->prealloc_size > nr_allocated + INODE_INFO(hint->inode)->i_prealloc_count) {
325 + /* Some of preallocation blocks were not allocated */
326 +#ifdef REISERQUOTA_DEBUG
327 + printk(KERN_DEBUG "reiserquota: freeing (failed prealloc) %d blocks id=%u\n", amount_needed + hint->prealloc_size - nr_allocated - INODE_INFO(hint->inode)->i_prealloc_count, hint->inode->i_uid);
329 + DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated - INODE_INFO(hint->inode)->i_prealloc_count);
336 if (ret != CARRY_ON) {
337 while (amount_needed ++ < initial_amount_needed) {
338 - reiserfs_free_block(hint->th, *(--new_blocknrs));
339 + reiserfs_free_block(hint->th, hint->inode, *(--new_blocknrs), 1);
343 diff -urN linux-2.4.22.org/fs/reiserfs/do_balan.c linux-2.4.22/fs/reiserfs/do_balan.c
344 --- linux-2.4.22.org/fs/reiserfs/do_balan.c 2003-11-21 15:08:29.000000000 +0100
345 +++ linux-2.4.22/fs/reiserfs/do_balan.c 2003-11-21 15:14:23.000000000 +0100
347 inline void do_balance_mark_leaf_dirty (struct tree_balance * tb,
348 struct buffer_head * bh, int flag)
350 - if (reiserfs_dont_log(tb->tb_sb)) {
351 - if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
352 - __mark_buffer_dirty(bh) ;
353 - tb->need_balance_dirty = 1;
356 - int windex = push_journal_writer("do_balance") ;
357 - journal_mark_dirty(tb->transaction_handle, tb->transaction_handle->t_super, bh) ;
358 - pop_journal_writer(windex) ;
360 + journal_mark_dirty(tb->transaction_handle,
361 + tb->transaction_handle->t_super, bh) ;
364 #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
365 @@ -1247,7 +1239,7 @@
366 if (buffer_dirty (tb->thrown[i]))
367 reiserfs_warning (tb->tb_sb, "free_thrown deals with dirty buffer %ld\n", blocknr);
368 brelse(tb->thrown[i]) ; /* incremented in store_thrown */
369 - reiserfs_free_block (tb->transaction_handle, blocknr);
370 + reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
374 @@ -1259,9 +1251,11 @@
375 set_blkh_level( blkh, FREE_LEVEL );
376 set_blkh_nr_item( blkh, 0 );
378 - mark_buffer_clean (bh);
379 + if (buffer_dirty(bh))
381 + // mark_buffer_clean (bh);
382 /* reiserfs_free_block is no longer schedule safe
383 - reiserfs_free_block (tb->transaction_handle, tb->tb_sb, bh->b_blocknr);
384 + reiserfs_free_block (tb->transaction_handle, NULL, tb->tb_sb, bh->b_blocknr, 0);
387 store_thrown (tb, bh);
388 @@ -1575,6 +1569,7 @@
390 tb->need_balance_dirty = 0;
392 + reiserfs_check_lock_depth("do balance");
393 if (FILESYSTEM_CHANGED_TB(tb)) {
394 reiserfs_panic(tb->tb_sb, "clm-6000: do_balance, fs generation has changed\n") ;
396 @@ -1605,5 +1600,6 @@
399 do_balance_completed (tb);
400 + reiserfs_check_lock_depth("do balance2");
403 diff -urN linux-2.4.22.org/fs/reiserfs/file.c linux-2.4.22/fs/reiserfs/file.c
404 --- linux-2.4.22.org/fs/reiserfs/file.c 2003-11-21 15:08:29.000000000 +0100
405 +++ linux-2.4.22/fs/reiserfs/file.c 2003-11-21 15:14:23.000000000 +0100
407 #include <linux/sched.h>
408 #include <linux/reiserfs_fs.h>
409 #include <linux/smp_lock.h>
410 +#include <linux/quotaops.h>
413 ** We pack the tails of files on file close, not at the time they are written.
416 down (&inode->i_sem);
417 journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3) ;
418 - reiserfs_update_inode_transaction(inode) ;
420 #ifdef REISERFS_PREALLOCATE
421 reiserfs_discard_prealloc (&th, inode);
423 static int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
424 struct inode *inode = dentry->d_inode ;
426 - if (attr->ia_valid & ATTR_SIZE) {
427 + unsigned int ia_valid = attr->ia_valid ;
429 + if (ia_valid & ATTR_SIZE) {
430 /* version 2 items will be caught by the s_maxbytes check
431 ** done for us in vmtruncate
434 attr->ia_size > MAX_NON_LFS)
437 + /* During a truncate, we have to make sure the new i_size is in
438 + ** the transaction before we start dropping updates to data logged
439 + ** or ordered write data pages.
441 + if (attr->ia_size < inode->i_size && reiserfs_file_data_log(inode)) {
442 + struct reiserfs_transaction_handle th ;
443 + journal_begin(&th, inode->i_sb, 1) ;
444 + reiserfs_update_sd_size(&th, inode, attr->ia_size) ;
445 + journal_end(&th, inode->i_sb, 1) ;
446 /* fill in hole pointers in the expanding truncate case. */
447 - if (attr->ia_size > inode->i_size) {
448 + } else if (attr->ia_size > inode->i_size) {
449 error = generic_cont_expand(inode, attr->ia_size) ;
450 if (inode->u.reiserfs_i.i_prealloc_count > 0) {
451 struct reiserfs_transaction_handle th ;
452 @@ -123,15 +134,35 @@
455 error = inode_change_ok(inode, attr) ;
457 - inode_setattr(inode, attr) ;
459 + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
460 + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
461 + error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
464 + error = inode_setattr(inode, attr) ;
470 +reiserfs_file_write(struct file *f, const char *b, size_t count, loff_t *ppos)
473 + struct inode *inode = f->f_dentry->d_inode;
475 + ret = generic_file_write(f, b, count, ppos);
476 + if (ret >= 0 && f->f_flags & O_SYNC) {
478 + reiserfs_commit_for_inode(inode);
484 struct file_operations reiserfs_file_operations = {
485 read: generic_file_read,
486 - write: generic_file_write,
487 + write: reiserfs_file_write,
488 ioctl: reiserfs_ioctl,
489 mmap: generic_file_mmap,
490 release: reiserfs_file_release,
491 diff -urN linux-2.4.22.org/fs/reiserfs/fix_node.c linux-2.4.22/fs/reiserfs/fix_node.c
492 --- linux-2.4.22.org/fs/reiserfs/fix_node.c 2003-11-21 15:08:29.000000000 +0100
493 +++ linux-2.4.22/fs/reiserfs/fix_node.c 2003-11-21 15:14:23.000000000 +0100
495 else /* If we have enough already then there is nothing to do. */
498 - if ( reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs,
499 - n_amount_needed) == NO_DISK_SPACE )
500 + /* No need to check quota - it is not allocated for blocks used for formatted nodes */
501 + if (reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs,
502 + n_amount_needed) == NO_DISK_SPACE)
503 return NO_DISK_SPACE;
505 /* for each blocknumber we just got, get a buffer and stick it on FEB */
506 @@ -2121,7 +2122,8 @@
508 static void clear_all_dirty_bits(struct super_block *s,
509 struct buffer_head *bh) {
510 - reiserfs_prepare_for_journal(s, bh, 0) ;
511 + // reiserfs_prepare_for_journal(s, bh, 0) ;
512 + set_bit(BH_JPrepared, &bh->b_state) ;
515 static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
516 @@ -2518,7 +2520,7 @@
517 /* de-allocated block which was not used by balancing and
518 bforget about buffer for it */
520 - reiserfs_free_block (tb->transaction_handle, blocknr);
521 + reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
524 /* release used as new nodes including a new root */
525 diff -urN linux-2.4.22.org/fs/reiserfs/ibalance.c linux-2.4.22/fs/reiserfs/ibalance.c
526 --- linux-2.4.22.org/fs/reiserfs/ibalance.c 2003-11-21 15:08:29.000000000 +0100
527 +++ linux-2.4.22/fs/reiserfs/ibalance.c 2003-11-21 15:14:23.000000000 +0100
529 /* use check_internal if new root is an internal node */
530 check_internal (new_root);
531 /*&&&&&&&&&&&&&&&&&&&&&&*/
532 - tb->tb_sb->s_dirt = 1;
534 /* do what is needed for buffer thrown from tree */
535 reiserfs_invalidate_buffer(tb, tbSh);
537 PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr );
538 PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 );
539 do_balance_mark_sb_dirty (tb, tb->tb_sb->u.reiserfs_sb.s_sbh, 1);
540 - tb->tb_sb->s_dirt = 1;
543 if ( tb->blknum[h] == 2 ) {
544 diff -urN linux-2.4.22.org/fs/reiserfs/inode.c linux-2.4.22/fs/reiserfs/inode.c
545 --- linux-2.4.22.org/fs/reiserfs/inode.c 2003-11-21 15:08:29.000000000 +0100
546 +++ linux-2.4.22/fs/reiserfs/inode.c 2003-11-21 15:14:23.000000000 +0100
549 #include <linux/config.h>
550 #include <linux/sched.h>
551 +#include <linux/fs.h>
552 #include <linux/reiserfs_fs.h>
553 #include <linux/locks.h>
554 #include <linux/smp_lock.h>
555 +#include <linux/quotaops.h>
556 #include <asm/uaccess.h>
557 #include <asm/unaligned.h>
560 #define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */
561 #define GET_BLOCK_NO_ISEM 8 /* i_sem is not held, don't preallocate */
563 +static int reiserfs_commit_write(struct file *, struct page *,
564 + unsigned from, unsigned to) ;
565 static int reiserfs_get_block (struct inode * inode, long block,
566 struct buffer_head * bh_result, int create);
572 + DQUOT_FREE_INODE(inode);
573 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
574 if (INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
575 down (&inode->i_sem);
579 static void add_to_flushlist(struct inode *inode, struct buffer_head *bh) {
580 - struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
581 + struct reiserfs_journal_list *jl = SB_JOURNAL(inode->i_sb)->j_current_jl;
582 + buffer_insert_list_journal_head(bh, &jl->j_ordered_bh_list, jl);
585 - buffer_insert_list(bh, &j->j_dirty_buffers) ;
586 +static void add_to_tail_list(struct inode *inode, struct buffer_head *bh) {
587 + struct reiserfs_journal_list *jl = SB_JOURNAL(inode->i_sb)->j_current_jl;
588 + buffer_insert_list_journal_head(bh, &jl->j_tail_bh_list, jl);
592 @@ -201,15 +210,16 @@
596 -/*static*/ void restart_transaction(struct reiserfs_transaction_handle *th,
597 - struct inode *inode, struct path *path) {
598 - struct super_block *s = th->t_super ;
599 - int len = th->t_blocks_allocated ;
601 +static void restart_transaction(struct reiserfs_transaction_handle *th,
602 + struct inode *inode, struct path *path,
603 + int jbegin_count) {
604 + /* we cannot restart while nested unless the parent allows it */
605 + if (!reiserfs_restartable_handle(th) && th->t_refcount > 1) {
609 reiserfs_update_sd(th, inode) ;
610 - journal_end(th, s, len) ;
611 - journal_begin(th, s, len) ;
612 + reiserfs_restart_transaction(th, jbegin_count) ;
613 reiserfs_update_inode_transaction(inode) ;
620 + if ((offset + inode->i_sb->s_blocksize) > PAGE_CACHE_SIZE) {
621 +printk("get_block_create_0 offset %lu too large\n", offset);
624 memset (p, 0, inode->i_sb->s_blocksize);
626 if (!is_direct_le_ih (ih)) {
627 @@ -421,10 +435,32 @@
628 static int reiserfs_get_block_direct_io (struct inode * inode, long block,
629 struct buffer_head * bh_result, int create) {
632 + struct reiserfs_transaction_handle *th;
634 + struct super_block *s = inode->i_sb;
636 + /* get_block might start a new transaction and leave it running.
637 + * test for that by checking for a transaction running right now
638 + * and recording its refcount. Run a journal_end if the refcount
639 + * after reiserfs_get_block is higher than it was before.
641 + if (reiserfs_transaction_running(s)) {
642 + th = current->journal_info;
643 + refcount = th->t_refcount;
645 bh_result->b_page = NULL;
646 ret = reiserfs_get_block(inode, block, bh_result, create) ;
648 + if (!ret && reiserfs_transaction_running(s)) {
649 + th = current->journal_info;
650 + if (th->t_refcount > refcount) {
652 + reiserfs_update_sd(th, inode) ;
653 + journal_end(th, s, th->t_blocks_allocated);
658 /* don't allow direct io onto tail pages */
659 if (ret == 0 && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
660 /* make sure future calls to the direct io funcs for this offset
662 struct buffer_head *bh_result,
663 loff_t tail_offset) {
664 unsigned long index ;
665 - unsigned long tail_end ;
666 unsigned long tail_start ;
667 struct page * tail_page ;
668 struct page * hole_page = bh_result->b_page ;
671 /* always try to read until the end of the block */
672 tail_start = tail_offset & (PAGE_CACHE_SIZE - 1) ;
673 - tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ;
675 index = tail_offset >> PAGE_CACHE_SHIFT ;
676 if ( !hole_page || index != hole_page->index) {
677 @@ -492,16 +526,13 @@
678 ** data that has been read directly into the page, and block_prepare_write
679 ** won't trigger a get_block in this case.
681 - fix_tail_page_for_writing(tail_page) ;
682 - retval = block_prepare_write(tail_page, tail_start, tail_end,
683 - reiserfs_get_block) ;
684 + retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_start) ;
688 /* tail conversion might change the data in the page */
689 flush_dcache_page(tail_page) ;
691 - retval = generic_commit_write(NULL, tail_page, tail_start, tail_end) ;
692 + retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_start) ;
695 if (tail_page != hole_page) {
696 @@ -541,20 +572,34 @@
700 - struct reiserfs_transaction_handle th ;
701 + struct reiserfs_transaction_handle *th = NULL ;
702 /* space reserved in transaction batch:
703 . 3 balancings in direct->indirect conversion
704 . 1 block involved into reiserfs_update_sd()
706 XXX in practically impossible worst case direct2indirect()
707 - can incur (much) more that 3 balancings. */
708 - int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1;
709 + can incur (much) more than 3 balancings, but we deal with
710 + direct2indirect lower down */
711 + int jbegin_count = JOURNAL_PER_BALANCE_CNT + 2;
713 - int transaction_started = 0 ;
715 loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ;
716 + int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
719 + /* if this block might contain a tail, we need to be more conservative */
720 + if (new_offset <= (loff_t)(16 * 1024)) {
721 + jbegin_count += JOURNAL_PER_BALANCE_CNT * 2;
723 + /* we might nest for the entire page, so we need to make sure
724 + * to reserve enough to insert pointers in the tree for each block
727 + jbegin_count *= blocks_per_page;
728 + if (reiserfs_file_data_log(inode)) {
729 + jbegin_count += blocks_per_page;
733 - th.t_trans_id = 0 ;
734 version = get_inode_item_key_version (inode);
741 + /* don't leave the trans running if we are already nested */
742 + if (reiserfs_transaction_running(inode->i_sb))
745 /* If file is of such a size, that it might have a tail and tails are enabled
746 ** we should mark it as possibly needing tail packing on close
748 @@ -591,10 +640,18 @@
749 /* set the key of the first byte in the 'block'-th block of file */
750 make_cpu_key (&key, inode, new_offset,
751 TYPE_ANY, 3/*key length*/);
753 + /* reiserfs_commit_write will close any transaction currently
754 + ** running. So, if we are nesting into someone else, we have to
755 + ** make sure to bump the refcount
757 if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
758 - journal_begin(&th, inode->i_sb, jbegin_count) ;
759 + th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count) ;
761 + retval = PTR_ERR(th) ;
764 reiserfs_update_inode_transaction(inode) ;
765 - transaction_started = 1 ;
769 @@ -614,28 +671,34 @@
771 if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) {
772 /* we have to allocate block for the unformatted node */
773 - if (!transaction_started) {
774 + if (!reiserfs_active_handle(th)) {
776 - journal_begin(&th, inode->i_sb, jbegin_count) ;
777 + th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count) ;
779 + retval = PTR_ERR(th) ;
782 reiserfs_update_inode_transaction(inode) ;
783 - transaction_started = 1 ;
787 - repeat = _allocate_block(&th, block, inode, &allocated_block_nr, &path, create);
788 + repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create);
790 - if (repeat == NO_DISK_SPACE) {
791 + if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
792 /* restart the transaction to give the journal a chance to free
793 ** some blocks. releases the path, so we have to go back to
794 ** research if we succeed on the second try
796 - restart_transaction(&th, inode, &path) ;
797 - repeat = _allocate_block(&th, block, inode, &allocated_block_nr, NULL, create);
798 + restart_transaction(th, inode, &path, jbegin_count) ;
799 + repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create);
801 - if (repeat != NO_DISK_SPACE) {
802 + if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
806 + if (repeat == QUOTA_EXCEEDED)
813 @@ -660,15 +723,12 @@
814 bh_result->b_state |= (1UL << BH_New);
815 put_block_num(item, pos_in_item, allocated_block_nr) ;
816 unfm_ptr = allocated_block_nr;
817 - journal_mark_dirty (&th, inode->i_sb, bh);
818 - inode->i_blocks += (inode->i_sb->s_blocksize / 512) ;
819 - reiserfs_update_sd(&th, inode) ;
820 + journal_mark_dirty (th, inode->i_sb, bh);
821 + reiserfs_update_sd(th, inode) ;
823 set_block_dev_mapped(bh_result, unfm_ptr, inode);
825 pop_journal_writer(windex) ;
826 - if (transaction_started)
827 - journal_end(&th, inode->i_sb, jbegin_count) ;
831 @@ -676,18 +736,23 @@
832 ** there is no need to make sure the inode is updated with this
835 + if (!dangle && reiserfs_active_handle(th))
836 + journal_end(th, inode->i_sb, jbegin_count) ;
840 - if (!transaction_started) {
841 + if (!reiserfs_active_handle(th)) {
842 /* if we don't pathrelse, we could vs-3050 on the buffer if
843 ** someone is waiting for it (they can't finish until the buffer
844 - ** is released, we can start a new transaction until they finish)
845 + ** is released, we can't start a new transaction until they finish)
848 - journal_begin(&th, inode->i_sb, jbegin_count) ;
849 + th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count) ;
851 + retval = PTR_ERR(th) ;
854 reiserfs_update_inode_transaction(inode) ;
855 - transaction_started = 1 ;
859 @@ -716,13 +781,11 @@
860 set_cpu_key_k_offset (&tmp_key, 1);
861 PATH_LAST_POSITION(&path) ++;
863 - retval = reiserfs_insert_item (&th, &path, &tmp_key, &tmp_ih, (char *)&unp);
864 + retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp);
866 - reiserfs_free_block (&th, allocated_block_nr);
867 - goto failure; // retval == -ENOSPC or -EIO or -EEXIST
868 + reiserfs_free_block (th, inode, allocated_block_nr, 1);
869 + goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
872 - inode->i_blocks += inode->i_sb->s_blocksize / 512;
873 //mark_tail_converted (inode);
874 } else if (is_direct_le_ih (ih)) {
875 /* direct item has to be converted */
877 node. FIXME: this should also get into page cache */
880 - journal_end(&th, inode->i_sb, jbegin_count) ;
881 - transaction_started = 0 ;
882 + /* ugly, but we should only end the transaction if
883 + ** we aren't nested
885 + if (th->t_refcount == 1) {
886 + journal_end(th, inode->i_sb, jbegin_count) ;
890 retval = convert_tail_for_hole(inode, bh_result, tail_offset) ;
892 @@ -751,20 +819,27 @@
893 reiserfs_warning(inode->i_sb, "clm-6004: convert tail failed inode %lu, error %d\n", inode->i_ino, retval) ;
894 if (allocated_block_nr) {
895 /* the bitmap, the super, and the stat data == 3 */
896 - journal_begin(&th, inode->i_sb, 3) ;
897 - reiserfs_free_block (&th, allocated_block_nr);
898 - transaction_started = 1 ;
899 + if (!reiserfs_active_handle(th)) {
900 + th = reiserfs_persistent_transaction(inode->i_sb,3);
903 + reiserfs_free_block(th,inode,allocated_block_nr,1);
911 - retval = direct2indirect (&th, inode, &path, unbh, tail_offset);
912 + retval = direct2indirect (th, inode, &path, unbh, tail_offset);
914 reiserfs_unmap_buffer(unbh);
915 - reiserfs_free_block (&th, allocated_block_nr);
916 + reiserfs_free_block (th, inode, allocated_block_nr, 1);
920 + reiserfs_update_sd(th, inode) ;
922 /* it is important the mark_buffer_uptodate is done after
923 ** the direct2indirect. The buffer might contain valid
924 ** data newer than the data on disk (read by readpage, changed,
925 @@ -775,24 +850,25 @@
927 mark_buffer_uptodate (unbh, 1);
929 - /* unbh->b_page == NULL in case of DIRECT_IO request, this means
930 - buffer will disappear shortly, so it should not be added to
932 + /* we've converted the tail, so we must
933 + ** flush unbh before the transaction commits.
934 + ** unbh->b_page will be NULL for direct io requests, and
935 + ** in that case there's no data to log, dirty or order
937 if ( unbh->b_page ) {
938 - /* we've converted the tail, so we must
939 - ** flush unbh before the transaction commits
941 - add_to_flushlist(inode, unbh) ;
943 - /* mark it dirty now to prevent commit_write from adding
944 - ** this buffer to the inode's dirty buffer list
946 - __mark_buffer_dirty(unbh) ;
947 + if (reiserfs_file_data_log(inode)) {
948 + reiserfs_prepare_for_journal(inode->i_sb, unbh, 1) ;
949 + journal_mark_dirty(th, inode->i_sb, unbh) ;
951 + /* mark it dirty now to prevent commit_write from adding
952 + ** this buffer to the inode's dirty buffer list
954 + __mark_buffer_dirty(unbh) ;
955 + /* note, this covers the data=ordered case too */
956 + add_to_tail_list(inode, unbh) ;
960 - //inode->i_blocks += inode->i_sb->s_blocksize / 512;
961 - //mark_tail_converted (inode);
963 /* append indirect item with holes if needed, when appending
964 pointer to 'block'-th block use block, which is already
965 @@ -840,18 +916,16 @@
966 only have space for one block */
967 blocks_needed=max_to_insert?max_to_insert:1;
969 - retval = reiserfs_paste_into_item (&th, &path, &tmp_key, (char *)un, UNFM_P_SIZE * blocks_needed);
970 + retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed);
972 if (blocks_needed != 1)
976 - reiserfs_free_block (&th, allocated_block_nr);
977 + reiserfs_free_block (th, inode, allocated_block_nr, 1);
981 - inode->i_blocks += inode->i_sb->s_blocksize / 512;
984 /* We need to mark new file size in case this function will be
985 interrupted/aborted later on. And we may do this only for
989 ** release the path so that anybody waiting on the path before
990 ** ending their transaction will be able to continue.
992 + ** this only happens when inserting holes into the file, so it
993 + ** does not affect data=ordered safety at all
995 - if (journal_transaction_should_end(&th, th.t_blocks_allocated)) {
996 - restart_transaction(&th, inode, &path) ;
997 + if (journal_transaction_should_end(th, jbegin_count)) {
998 + restart_transaction(th, inode, &path, jbegin_count) ;
1000 /* inserting indirect pointers for a hole can take a
1001 ** long time. reschedule if needed
1003 "%K should not be found\n", &key);
1005 if (allocated_block_nr)
1006 - reiserfs_free_block (&th, allocated_block_nr);
1007 + reiserfs_free_block (th, inode, allocated_block_nr, 1);
1011 @@ -902,20 +979,82 @@
1015 - reiserfs_check_path(&path) ;
1018 - if (transaction_started) {
1019 - reiserfs_update_sd(&th, inode) ;
1020 - journal_end(&th, inode->i_sb, jbegin_count) ;
1021 + pathrelse(&path) ;
1022 + /* if we had an error, end the transaction */
1023 + if (!IS_ERR(th) && reiserfs_active_handle(th)) {
1024 + if (retval != 0) {
1025 + reiserfs_update_sd(th, inode) ;
1026 + journal_end(th, inode->i_sb, jbegin_count) ;
1028 + } else if (!dangle) {
1029 + journal_end(th, inode->i_sb, jbegin_count) ;
1033 pop_journal_writer(windex) ;
1034 + if (retval == 0 && reiserfs_active_handle(th) &&
1035 + current->journal_info != th) {
1039 - reiserfs_check_path(&path) ;
1044 +/* Compute real number of used bytes by file
1045 + * Following three functions can go away when we'll have enough space in stat item
1047 +static int real_space_diff(struct inode *inode, int sd_size)
1050 + loff_t blocksize = inode->i_sb->s_blocksize ;
1052 + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
1055 + /* End of file is also in full block with indirect reference, so round
1056 + ** up to the next block.
1058 + ** there is just no way to know if the tail is actually packed
1059 + ** on the file, so we have to assume it isn't. When we pack the
1060 + ** tail, we add 4 bytes to pretend there really is an unformatted
1063 + bytes = ((inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + sd_size;
1067 +static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
1070 + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1071 + return inode->i_size + (loff_t)(real_space_diff(inode, sd_size)) ;
1073 + return ((loff_t)real_space_diff(inode, sd_size)) + (((loff_t)blocks) << 9);
1076 +/* Compute number of blocks used by file in ReiserFS counting */
1077 +static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
1079 + loff_t bytes = inode_get_bytes(inode) ;
1080 + loff_t real_space = real_space_diff(inode, sd_size) ;
1082 + /* keeps fsck and non-quota versions of reiserfs happy */
1083 + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1084 + bytes += (loff_t)511 ;
 1087 + /* files from before the quota patch might have i_blocks such that
1088 + ** bytes < real_space. Deal with that here to prevent it from
1089 + ** going negative.
1091 + if (bytes < real_space)
1093 + return (bytes - real_space) >> 9;
1097 // BAD: new directories have stat data of new type and all other items
1098 // of old type. Version stored in the inode says about body items, so
1099 @@ -971,6 +1110,14 @@
1101 rdev = sd_v1_rdev(sd);
1102 inode->u.reiserfs_i.i_first_direct_byte = sd_v1_first_direct_byte(sd);
1103 + /* an early bug in the quota code can give us an odd number for the
1104 + ** block count. This is incorrect, fix it here.
1106 + if (inode->i_blocks & 1) {
1107 + inode->i_blocks++ ;
1109 + inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1111 /* nopack is initially zero for v1 objects. For v2 objects,
1112 nopack is initialised from sd_attrs */
1113 inode->u.reiserfs_i.i_flags &= ~i_nopack_mask;
1114 @@ -1000,6 +1147,8 @@
1115 set_inode_item_key_version (inode, KEY_FORMAT_3_6);
1117 set_inode_sd_version (inode, STAT_DATA_V2);
1118 + inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1120 /* read persistent inode attributes from sd and initalise
1121 generic inode flags from them */
1122 inode -> u.reiserfs_i.i_attrs = sd_v2_attrs( sd );
1123 @@ -1026,7 +1175,7 @@
1126 // update new stat data with inode fields
1127 -static void inode2sd (void * sd, struct inode * inode)
1128 +static void inode2sd (void * sd, struct inode * inode, loff_t new_size)
1130 struct stat_data * sd_v2 = (struct stat_data *)sd;
1132 @@ -1034,12 +1183,12 @@
1133 set_sd_v2_mode(sd_v2, inode->i_mode );
1134 set_sd_v2_nlink(sd_v2, inode->i_nlink );
1135 set_sd_v2_uid(sd_v2, inode->i_uid );
1136 - set_sd_v2_size(sd_v2, inode->i_size );
1137 + set_sd_v2_size(sd_v2, new_size);
1138 set_sd_v2_gid(sd_v2, inode->i_gid );
1139 set_sd_v2_mtime(sd_v2, inode->i_mtime );
1140 set_sd_v2_atime(sd_v2, inode->i_atime );
1141 set_sd_v2_ctime(sd_v2, inode->i_ctime );
1142 - set_sd_v2_blocks(sd_v2, inode->i_blocks );
1143 + set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
1144 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1145 set_sd_v2_rdev(sd_v2, inode->i_rdev );
1147 @@ -1051,7 +1200,7 @@
1150 // used to copy inode's fields to old stat data
1151 -static void inode2sd_v1 (void * sd, struct inode * inode)
1152 +static void inode2sd_v1 (void * sd, struct inode * inode, loff_t new_size)
1154 struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd;
1156 @@ -1059,7 +1208,7 @@
1157 set_sd_v1_uid(sd_v1, inode->i_uid );
1158 set_sd_v1_gid(sd_v1, inode->i_gid );
1159 set_sd_v1_nlink(sd_v1, inode->i_nlink );
1160 - set_sd_v1_size(sd_v1, inode->i_size );
1161 + set_sd_v1_size(sd_v1, new_size);
1162 set_sd_v1_atime(sd_v1, inode->i_atime );
1163 set_sd_v1_ctime(sd_v1, inode->i_ctime );
1164 set_sd_v1_mtime(sd_v1, inode->i_mtime );
1165 @@ -1067,7 +1216,7 @@
1166 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1167 set_sd_v1_rdev(sd_v1, inode->i_rdev );
1169 - set_sd_v1_blocks(sd_v1, inode->i_blocks );
1170 + set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
1172 // Sigh. i_first_direct_byte is back
1173 set_sd_v1_first_direct_byte(sd_v1, inode->u.reiserfs_i.i_first_direct_byte);
1174 @@ -1077,7 +1226,8 @@
1175 /* NOTE, you must prepare the buffer head before sending it here,
1176 ** and then log it after the call
1178 -static void update_stat_data (struct path * path, struct inode * inode)
1179 +static void update_stat_data (struct path * path, struct inode * inode,
1182 struct buffer_head * bh;
1183 struct item_head * ih;
1184 @@ -1091,17 +1241,16 @@
1186 if (stat_data_v1 (ih)) {
1187 // path points to old stat data
1188 - inode2sd_v1 (B_I_PITEM (bh, ih), inode);
1189 + inode2sd_v1 (B_I_PITEM (bh, ih), inode, new_size);
1191 - inode2sd (B_I_PITEM (bh, ih), inode);
1192 + inode2sd (B_I_PITEM (bh, ih), inode, new_size);
1199 -void reiserfs_update_sd (struct reiserfs_transaction_handle *th,
1200 - struct inode * inode)
1201 +void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
1202 + struct inode *inode, loff_t new_size)
1205 INITIALIZE_PATH(path);
1206 @@ -1151,7 +1300,7 @@
1210 - update_stat_data (&path, inode);
1211 + update_stat_data (&path, inode, new_size);
1212 journal_mark_dirty(th, th->t_super, bh) ;
1215 @@ -1236,6 +1385,7 @@
1216 reiserfs_make_bad_inode( inode );
1219 + reiserfs_update_inode_transaction(inode);
1220 reiserfs_check_path(&path_to_sd) ; /* init inode should be relsing */
1223 @@ -1415,8 +1565,6 @@
1224 ** does something when called for a synchronous update.
1226 void reiserfs_write_inode (struct inode * inode, int do_sync) {
1227 - struct reiserfs_transaction_handle th ;
1228 - int jbegin_count = 1 ;
1230 if (inode->i_sb->s_flags & MS_RDONLY) {
1231 reiserfs_warning(inode->i_sb, "clm-6005: writing inode %lu on readonly FS\n",
1232 @@ -1430,9 +1578,7 @@
1234 if (do_sync && !(current->flags & PF_MEMALLOC)) {
1236 - journal_begin(&th, inode->i_sb, jbegin_count) ;
1237 - reiserfs_update_sd (&th, inode);
1238 - journal_end_sync(&th, inode->i_sb, jbegin_count) ;
1239 + reiserfs_commit_for_inode(inode) ;
1243 @@ -1450,6 +1596,7 @@
1244 /* stat data of new object is inserted already, this inserts the item
1245 containing "." and ".." entries */
1246 static int reiserfs_new_directory (struct reiserfs_transaction_handle *th,
1247 + struct inode *inode,
1248 struct item_head * ih, struct path * path,
1249 const struct inode * dir)
1251 @@ -1494,13 +1641,14 @@
1254 /* insert item, that is empty directory item */
1255 - return reiserfs_insert_item (th, path, &key, ih, body);
1256 + return reiserfs_insert_item (th, path, &key, ih, inode, body);
1260 /* stat data of object has been inserted, this inserts the item
1261 containing the body of symlink */
1262 static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th,
1263 + struct inode *inode, /* Inode of symlink */
1264 struct item_head * ih,
1265 struct path * path, const char * symname, int item_len)
1267 @@ -1530,7 +1678,7 @@
1270 /* insert item, that is body of symlink */
1271 - return reiserfs_insert_item (th, path, &key, ih, symname);
1272 + return reiserfs_insert_item (th, path, &key, ih, inode, symname);
1276 @@ -1604,7 +1752,8 @@
1278 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1279 inode->i_size = i_size;
1280 - inode->i_blocks = (inode->i_size + 511) >> 9;
1281 + inode->i_blocks = 0;
1282 + inode->i_bytes = 0;
1283 inode->u.reiserfs_i.i_first_direct_byte = S_ISLNK(mode) ? 1 :
1284 U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/;
1286 @@ -1638,9 +1787,9 @@
1290 - inode2sd_v1 (&sd, inode);
1291 + inode2sd_v1 (&sd, inode, inode->i_size);
1293 - inode2sd (&sd, inode);
1294 + inode2sd (&sd, inode, inode->i_size);
1296 // these do not go to on-disk stat data
1297 inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid);
1298 @@ -1665,7 +1814,7 @@
1299 if (dir->u.reiserfs_i.new_packing_locality)
1300 th->displace_new_blocks = 1;
1302 - retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, (char *)(&sd));
1303 + retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd));
1305 reiserfs_check_path(&path_to_key) ;
1307 @@ -1678,14 +1827,14 @@
1309 if (S_ISDIR(mode)) {
1310 /* insert item with "." and ".." */
1311 - retval = reiserfs_new_directory (th, &ih, &path_to_key, dir);
1312 + retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir);
1315 if (S_ISLNK(mode)) {
1316 /* insert body of symlink */
1317 if (!old_format_only (sb))
1318 i_size = ROUND_UP(i_size);
1319 - retval = reiserfs_new_symlink (th, &ih, &path_to_key, symname, i_size);
1320 + retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size);
1324 @@ -1705,6 +1854,9 @@
1326 /* dquot_drop must be done outside a transaction */
1327 journal_end(th, th->t_super, th->t_blocks_allocated) ;
1328 + DQUOT_FREE_INODE(inode);
1329 + DQUOT_DROP(inode);
1330 + inode->i_flags |= S_NOQUOTA;
1331 make_bad_inode(inode);
1334 @@ -1816,6 +1968,7 @@
1336 struct page *page = NULL ;
1338 + int need_balance_dirty = 0 ;
1339 struct buffer_head *bh = NULL ;
1341 if (p_s_inode->i_size > 0) {
1342 @@ -1848,34 +2001,58 @@
1343 transaction of truncating gets committed - on reboot the file
1344 either appears truncated properly or not truncated at all */
1345 add_save_link (&th, p_s_inode, 1);
1348 reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ;
1349 pop_journal_writer(windex) ;
1350 - journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ;
1352 - if (update_timestamps)
1353 - remove_save_link (p_s_inode, 1/* truncate */);
1356 + if (!PageLocked(page))
1358 length = offset & (blocksize - 1) ;
1359 /* if we are not on a block boundary */
1361 length = blocksize - length ;
1362 - memset((char *)kmap(page) + offset, 0, length) ;
1363 + if ((offset + length) > PAGE_CACHE_SIZE) {
1366 + memset((char *)page_address(page) + offset, 0, length) ;
1367 flush_dcache_page(page) ;
1369 if (buffer_mapped(bh) && bh->b_blocknr != 0) {
1370 - if (!atomic_set_buffer_dirty(bh)) {
1371 + if (reiserfs_file_data_log(p_s_inode)) {
1372 + reiserfs_prepare_for_journal(p_s_inode->i_sb, bh, 1) ;
1373 + journal_mark_dirty(&th, p_s_inode->i_sb, bh) ;
1375 + /* it is safe to block here, but it would be faster
1376 + ** to balance dirty after the journal lock is dropped
1378 + if (!atomic_set_buffer_dirty(bh)) {
1379 set_buffer_flushtime(bh);
1381 buffer_insert_inode_data_queue(bh, p_s_inode);
1383 + need_balance_dirty = 1;
1385 + if (reiserfs_data_ordered(p_s_inode->i_sb)) {
1386 + add_to_flushlist(p_s_inode, bh) ;
1394 + journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1) ;
1396 + if (update_timestamps)
1397 + remove_save_link(p_s_inode, 1/* truncate */);
1401 page_cache_release(page) ;
1404 + if (need_balance_dirty) {
1410 @@ -1944,6 +2121,8 @@
1414 + if (((B_I_PITEM(bh, ih) - bh->b_data) + pos_in_item + copy_size) > inode->i_sb->s_blocksize)
1416 memcpy( B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, copy_size) ;
1418 journal_mark_dirty(&th, inode->i_sb, bh) ;
1419 @@ -1971,9 +2150,37 @@
1421 /* this is where we fill in holes in the file. */
1422 if (use_get_block) {
1423 + int old_refcount = 0 ;
1424 + struct reiserfs_transaction_handle *hole_th ;
1425 + if (reiserfs_transaction_running(inode->i_sb)) {
1426 + hole_th = current->journal_info ;
1427 + old_refcount = hole_th->t_refcount ;
1429 retval = reiserfs_get_block(inode, block, bh_result,
1430 GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM) ;
1432 + /* did reiserfs_get_block leave us a running transaction? */
1433 + if (reiserfs_transaction_running(inode->i_sb)) {
1434 + hole_th = current->journal_info ;
1435 + if (old_refcount < hole_th->t_refcount) {
1437 + /* we've filled a hole, make sure the new block
1438 + * gets to disk before transaction commit
1440 + if (buffer_mapped(bh_result) && bh_result->b_blocknr != 0 &&
1441 + reiserfs_data_ordered(inode->i_sb))
1443 + __mark_buffer_dirty(bh_result) ;
1444 + mark_buffer_uptodate(bh_result, 1);
1445 + /* no need to update the inode trans, already done */
1446 + add_to_flushlist(inode, bh_result) ;
1448 + reiserfs_update_sd(hole_th, inode) ;
1449 + journal_end(hole_th, hole_th->t_super,
1450 + hole_th->t_blocks_allocated) ;
1454 if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) {
1455 /* get_block failed to find a mapped unformatted node. */
1457 @@ -1988,33 +2195,41 @@
1458 /* helper func to get a buffer head ready for writepage to send to
1461 -static inline void submit_bh_for_writepage(struct buffer_head **bhp, int nr) {
1462 +static void submit_bh_for_writepage(struct page *page,
1463 + struct buffer_head **bhp, int nr) {
1464 struct buffer_head *bh ;
1467 - /* lock them all first so the end_io handler doesn't unlock the page
1469 + /* lock them all first so the end_io handler doesn't
1470 + ** unlock too early
1472 + ** There's just no safe way to log the buffers during writepage,
1473 + ** we'll deadlock if kswapd tries to start a transaction.
1475 + ** There's also no useful way to tie them to a specific transaction,
1476 + ** so we just don't bother.
1478 for(i = 0 ; i < nr ; i++) {
1481 - set_buffer_async_io(bh) ;
1484 + set_buffer_async_io(bh);
1485 + set_bit(BH_Uptodate, &bh->b_state) ;
1487 for(i = 0 ; i < nr ; i++) {
1489 /* submit_bh doesn't care if the buffer is dirty, but nobody
1490 ** later on in the call chain will be cleaning it. So, we
1491 ** clean the buffer here, it still gets written either way.
1494 clear_bit(BH_Dirty, &bh->b_state) ;
1495 - set_bit(BH_Uptodate, &bh->b_state) ;
1496 submit_bh(WRITE, bh) ;
1500 static int reiserfs_write_full_page(struct page *page) {
1501 struct inode *inode = page->mapping->host ;
1502 - unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ;
1503 + loff_t size = inode->i_size;
1504 + unsigned long end_index = size >> PAGE_CACHE_SHIFT ;
1505 unsigned last_offset = PAGE_CACHE_SIZE;
1507 unsigned long block ;
1508 @@ -2024,21 +2239,36 @@
1509 struct buffer_head *arr[PAGE_CACHE_SIZE/512] ;
1512 + if (reiserfs_transaction_running(inode->i_sb)) {
1516 + if (!PageLocked(page))
1519 if (!page->buffers) {
1520 block_prepare_write(page, 0, 0, NULL) ;
1524 + if (reiserfs_transaction_running(inode->i_sb)) {
1527 /* last page in the file, zero out any contents past the
1528 ** last byte in the file
1530 if (page->index >= end_index) {
1531 - last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ;
1533 + last_offset = size & (PAGE_CACHE_SIZE - 1) ;
1534 /* no file contents in this page */
1535 if (page->index >= end_index + 1 || !last_offset) {
1539 - memset((char *)kmap(page)+last_offset, 0, PAGE_CACHE_SIZE-last_offset) ;
1541 + if (last_offset > PAGE_CACHE_SIZE)
1543 + memset(p + last_offset, 0, PAGE_CACHE_SIZE-last_offset) ;
1544 flush_dcache_page(page) ;
1547 @@ -2079,7 +2309,7 @@
1548 ** nr == 0 without there being any kind of error.
1551 - submit_bh_for_writepage(arr, nr) ;
1552 + submit_bh_for_writepage(page, arr, nr) ;
1553 wakeup_page_waiters(page);
1556 @@ -2091,7 +2321,7 @@
1560 - submit_bh_for_writepage(arr, nr) ;
1561 + submit_bh_for_writepage(page, arr, nr) ;
1565 @@ -2116,10 +2346,46 @@
1567 int reiserfs_prepare_write(struct file *f, struct page *page,
1568 unsigned from, unsigned to) {
1569 + int cur_refcount = 0 ;
1571 struct inode *inode = page->mapping->host ;
1572 + struct reiserfs_transaction_handle *th ;
1574 reiserfs_wait_on_write_block(inode->i_sb) ;
1575 fix_tail_page_for_writing(page) ;
1576 - return block_prepare_write(page, from, to, reiserfs_get_block) ;
1578 + /* we look for a running transaction before the block_prepare_write
1579 + ** call, and then again afterwards. This lets us know if
1580 + ** reiserfs_get_block added any additional transactions, so we can
1581 + ** let reiserfs_commit_write know if he needs to close them.
1582 + ** this is just nasty
1584 + if (reiserfs_transaction_running(inode->i_sb)) {
1585 + th = current->journal_info ;
1586 + cur_refcount = th->t_refcount ;
1588 + ret = block_prepare_write(page, from, to, reiserfs_get_block) ;
1590 + /* it is very important that we only set the dangling bit when
1591 + ** there is no chance of additional nested transactions.
1593 + if (reiserfs_transaction_running(inode->i_sb)) {
1594 + th = current->journal_info ;
1595 + if (th->t_refcount > cur_refcount) {
1596 + /* if we return an error, commit_write isn't going to get called
1597 + * we need to make sure we end any transactions
1598 + * reiserfs_get_block left hanging around
1602 + journal_end(th, th->t_super, th->t_blocks_allocated) ;
1605 + reiserfs_set_handle_dangling(th) ;
1613 @@ -2127,20 +2393,96 @@
1614 return generic_block_bmap(as, block, reiserfs_bmap) ;
1617 +/* taken from fs/buffer.c */
1618 +static int __commit_write(struct reiserfs_transaction_handle *th,
1619 + struct inode *inode, struct page *page,
1620 + unsigned from, unsigned to, int *balance)
1622 + unsigned block_start, block_end;
1624 + unsigned blocksize;
1625 + struct buffer_head *bh, *head;
1628 + blocksize = 1 << inode->i_blkbits;
1629 + if (reiserfs_file_data_log(inode)) {
1632 + /* one for each block + the stat data, the caller closes the handle */
1633 + journal_begin(th, inode->i_sb,(PAGE_CACHE_SIZE >> inode->i_blkbits)+1);
1634 + reiserfs_update_inode_transaction(inode) ;
1638 + for(bh = head = page->buffers, block_start = 0;
1639 + bh != head || !block_start;
1640 + block_start=block_end, bh = bh->b_this_page) {
1641 + block_end = block_start + blocksize;
1642 + if (block_end <= from || block_start >= to) {
1643 + if (!buffer_uptodate(bh))
1646 + set_bit(BH_Uptodate, &bh->b_state);
1649 + reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
1650 + journal_mark_dirty (th, inode->i_sb, bh);
1652 + } else if (!atomic_set_buffer_dirty(bh)) {
1654 + if (reiserfs_data_ordered(inode->i_sb)) {
1656 + add_to_flushlist(inode, bh);
1657 + /* if we don't update the inode trans information,
1658 + * an fsync(fd) might not catch these data blocks
1660 + reiserfs_update_inode_transaction(inode);
1663 + buffer_insert_inode_data_queue(bh, inode);
1671 + * is this a partial write that happened to make all buffers
1672 + * uptodate then we can optimize away a bogus readpage() for
 1673 + * the next read(). Here we 'discover' whether the page went
1674 + * uptodate as a result of this (potentially partial) write.
1677 + SetPageUptodate(page);
1681 static int reiserfs_commit_write(struct file *f, struct page *page,
1682 unsigned from, unsigned to) {
1683 struct inode *inode = page->mapping->host ;
1684 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1687 + int need_balance = 0;
1688 + struct reiserfs_transaction_handle th ;
1689 + struct reiserfs_transaction_handle *dth = NULL ;
1691 + /* we must do this before anything that might nest a transaction or
1692 + ** mess with the handle flags
1694 + if (reiserfs_transaction_running(inode->i_sb)) {
1695 + dth = current->journal_info ;
1696 + if (reiserfs_dangling_handle(dth)) {
1697 + reiserfs_clear_handle_dangling(dth) ;
1702 reiserfs_wait_on_write_block(inode->i_sb) ;
1705 + ret = __commit_write(&th, inode, page, from, to, &need_balance) ;
1707 - /* generic_commit_write does this for us, but does not update the
1708 - ** transaction tracking stuff when the size changes. So, we have
1709 - ** to do the i_size updates here.
1711 if (pos > inode->i_size) {
1712 - struct reiserfs_transaction_handle th ;
1714 /* If the file have grown beyond the border where it
1715 can have a tail, unmark it as needing a tail
1716 @@ -2149,24 +2491,135 @@
1717 (have_small_tails (inode->i_sb) && inode->i_size > block_size(inode)) )
1718 inode->u.reiserfs_i.i_flags &= ~i_pack_on_close_mask;
1720 - journal_begin(&th, inode->i_sb, 1) ;
1721 + if (!reiserfs_active_handle(&th)) {
1722 + journal_begin(&th, inode->i_sb, 1) ;
1724 reiserfs_update_inode_transaction(inode) ;
1725 inode->i_size = pos ;
1726 reiserfs_update_sd(&th, inode) ;
1727 - journal_end(&th, inode->i_sb, 1) ;
1729 + journal_end(&th, th.t_super, th.t_blocks_allocated) ;
1731 + } else if (reiserfs_active_handle(&th)) {
1732 + /* in case commit_write left one running and the i_size update did
1736 + journal_end(&th, th.t_super, th.t_blocks_allocated) ;
1740 - ret = generic_commit_write(f, page, from, to) ;
1742 - /* we test for O_SYNC here so we can commit the transaction
1743 - ** for any packed tails the file might have had
1744 + /* did reiserfs_get_block leave us with a running transaction?
1746 - if (f && (f->f_flags & O_SYNC)) {
1749 - reiserfs_commit_for_inode(inode) ;
1750 + journal_end(dth, dth->t_super, dth->t_blocks_allocated) ;
1762 +/* decide if this buffer needs to stay around for data logging or ordered
1765 +static int flushpage_can_drop(struct inode *inode, struct buffer_head *bh) {
1768 + if (!buffer_mapped(bh)) {
1771 + if (reiserfs_file_data_log(inode)) {
1773 + /* very conservative, leave the buffer pinned if anyone might need it.
1774 + ** this should be changed to drop the buffer if it is only in the
1775 + ** current transaction
1777 + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
1782 + if (reiserfs_data_ordered(inode->i_sb)) {
1783 + if (buffer_dirty(bh) && bh->b_journal_head) {
1784 + struct reiserfs_journal_list *jl = NULL;
1787 + /* we can race against fsync_inode_buffers if we aren't careful */
1788 + if (buffer_attached(bh) && buffer_dirty(bh))
1789 + jl = bh->b_journal_head;
1791 + /* why is this safe?
1792 + * reiserfs_setattr updates i_size in the on disk
1793 + * stat data before allowing vmtruncate to be called.
1795 + * If buffer was put onto the ordered list for this
1796 + * transaction, we know for sure either this transaction
1797 + * or an older one already has updated i_size on disk,
1798 + * and this ordered data won't be referenced in the file
1801 + * if the buffer was put onto the ordered list for an older
1802 + * transaction, we need to leave it around
1804 + if (jl != SB_JOURNAL(inode->i_sb)->j_current_jl) {
1813 +/* stolen from fs/buffer.c:discard_bh_page */
1814 +static int reiserfs_flushpage(struct page *page, unsigned long offset) {
1815 + struct buffer_head *head, *bh, *next;
1816 + struct inode *inode = page->mapping->host ;
1817 + unsigned int curr_off = 0;
1820 + if (!PageLocked(page))
1822 + if (!page->buffers)
1825 + head = page->buffers;
1828 + unsigned int next_off = curr_off + bh->b_size;
1829 + next = bh->b_this_page;
1831 + /* is this buffer to be completely truncated away? */
1832 + if (offset <= curr_off) {
1833 + if (flushpage_can_drop(inode, bh))
1834 + discard_buffer(bh);
1838 + curr_off = next_off;
1840 + } while (bh != head);
1843 + * subtle. We release buffer-heads only if this is
1844 + * the 'final' flushpage. We have invalidated the get_block
1845 + * cached value unconditionally, so real IO is not
1846 + * possible anymore.
1848 + * If the free doesn't work out, the buffers can be
1849 + * left around - they just turn into anonymous buffers
1853 + if (!ret || !try_to_free_buffers(page, 0))
1855 + if (page->buffers)
1861 @@ -2222,6 +2675,9 @@
1862 struct kiobuf *iobuf, unsigned long blocknr,
1865 + if (reiserfs_data_ordered(inode->i_sb) || reiserfs_file_data_log(inode)) {
1869 reiserfs_commit_for_tail(inode);
1871 @@ -2237,4 +2693,5 @@
1872 commit_write: reiserfs_commit_write,
1873 bmap: reiserfs_aop_bmap,
1874 direct_IO: reiserfs_direct_io,
1875 + flushpage: reiserfs_flushpage,
1877 diff -urN linux-2.4.22.org/fs/reiserfs/ioctl.c linux-2.4.22/fs/reiserfs/ioctl.c
1878 --- linux-2.4.22.org/fs/reiserfs/ioctl.c 2003-11-21 15:08:29.000000000 +0100
1879 +++ linux-2.4.22/fs/reiserfs/ioctl.c 2003-11-21 15:14:23.000000000 +0100
1882 case REISERFS_IOC_UNPACK:
1883 if( S_ISREG( inode -> i_mode ) ) {
1885 - return reiserfs_unpack (inode, filp);
1890 + result = reiserfs_unpack (inode, filp);
1891 + if (reiserfs_file_data_log(inode)) {
1892 + struct reiserfs_transaction_handle th;
1894 + journal_begin(&th, inode->i_sb, 1);
1895 + SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
1896 + journal_end_sync(&th, inode->i_sb, 1);
1905 * Following {G,S}ETFLAGS, and {G,S}ETVERSION are providing ext2
1906 * binary compatible interface (used by lsattr(1), and chattr(1)) and
1911 + struct address_space *mapping ;
1912 unsigned long write_from ;
1913 unsigned long blocksize = inode->i_sb->s_blocksize ;
1915 @@ -127,19 +137,20 @@
1916 ** reiserfs_get_block to unpack the tail for us.
1918 index = inode->i_size >> PAGE_CACHE_SHIFT ;
1919 - page = grab_cache_page(inode->i_mapping, index) ;
1920 + mapping = inode->i_mapping ;
1921 + page = grab_cache_page(mapping, index) ;
1926 - retval = reiserfs_prepare_write(NULL, page, write_from, blocksize) ;
1927 + retval = mapping->a_ops->prepare_write(NULL, page, write_from, write_from) ;
1931 /* conversion can change page contents, must flush */
1932 flush_dcache_page(page) ;
1933 inode->u.reiserfs_i.i_flags |= i_nopack_mask;
1934 - kunmap(page) ; /* mapped by prepare_write */
1935 + retval = mapping->a_ops->commit_write(NULL, page, write_from, write_from) ;
1939 diff -urN linux-2.4.22.org/fs/reiserfs/journal.c linux-2.4.22/fs/reiserfs/journal.c
1940 --- linux-2.4.22.org/fs/reiserfs/journal.c 2003-11-21 15:08:29.000000000 +0100
1941 +++ linux-2.4.22/fs/reiserfs/journal.c 2003-11-21 15:14:23.000000000 +0100
1943 ** -- Note, if you call this as an immediate flush from
1944 ** from within kupdate, it will ignore the immediate flag
1946 -** The commit thread -- a writer process for async commits. It allows a
1947 -** a process to request a log flush on a task queue.
1948 -** the commit will happen once the commit thread wakes up.
1949 -** The benefit here is the writer (with whatever
1950 -** related locks it has) doesn't have to wait for the
1951 -** log blocks to hit disk if it doesn't want to.
1952 +** The commit thread -- a writer process for metadata and async commits.
1953 +** this allows us to do less io with the journal lock
1957 +#define EXPORT_SYMTAB
1958 +#include <linux/module.h>
1959 #include <linux/config.h>
1960 #include <asm/uaccess.h>
1961 #include <asm/system.h>
1962 +#include <linux/init.h>
1964 #include <linux/sched.h>
1965 #include <asm/semaphore.h>
1967 #include <linux/string.h>
1968 #include <linux/smp_lock.h>
1970 +/* gets a struct reiserfs_journal_list * from a list head */
1971 +#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
1973 +#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
1976 /* the number of mounted filesystems. This is used to decide when to
1977 ** start and kill the commit thread
1979 static int reiserfs_mounted_fs_count = 0 ;
1981 -/* wake this up when you add something to the commit thread task queue */
1982 +static struct list_head kreiserfsd_supers = LIST_HEAD_INIT(kreiserfsd_supers);
1984 +/* wake this up when you want help from the commit thread */
1985 DECLARE_WAIT_QUEUE_HEAD(reiserfs_commit_thread_wait) ;
1987 -/* wait on this if you need to be sure you task queue entries have been run */
1988 +/* so we can wait for the commit thread to make progress */
1989 static DECLARE_WAIT_QUEUE_HEAD(reiserfs_commit_thread_done) ;
1990 -DECLARE_TASK_QUEUE(reiserfs_commit_thread_tq) ;
1991 +DECLARE_MUTEX(kreiserfsd_sem) ;
1993 #define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit
1997 #define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */
1999 +/* journal list state bits */
2000 +#define LIST_TOUCHED 1
2002 /* flags for do_journal_end */
2003 #define FLUSH_ALL 1 /* flush commit and real blocks */
2004 #define COMMIT_NOW 2 /* end and commit this transaction */
2007 /* state bits for the journal */
2008 #define WRITERS_BLOCKED 1 /* set when new writers not allowed */
2009 +#define WRITERS_QUEUED 2 /* set when log is full due to too many
2013 static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ;
2014 static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ;
2016 ** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for
2019 -static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) {
2020 +static inline int reiserfs_clean_and_file_buffer(struct buffer_head *bh) {
2022 clear_bit(BH_Dirty, &bh->b_state) ;
2025 int pop_journal_writer(int index) {
2026 #ifdef CONFIG_REISERFS_CHECK
2030 journal_writers[index] = NULL ;
2033 @@ -522,6 +538,12 @@
2037 + /* when data logging is on, no special action is needed for the data
2040 + if (reiserfs_data_log(p_s_sb))
2043 PROC_INFO_INC( p_s_sb, journal.in_journal );
2044 /* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
2045 ** if we crash before the transaction that freed it commits, this transaction won't
2048 /* is it in the current transaction. This should never happen */
2049 if ((cn = get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_hash_table, dev,bl,size))) {
2054 @@ -574,17 +597,12 @@
2055 /* lock the current transaction */
2056 inline static void lock_journal(struct super_block *p_s_sb) {
2057 PROC_INFO_INC( p_s_sb, journal.lock_journal );
2058 - while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) {
2059 - PROC_INFO_INC( p_s_sb, journal.lock_journal_wait );
2060 - sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
2062 - atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 1) ;
2063 + down(&SB_JOURNAL(p_s_sb)->j_lock);
2066 /* unlock the current transaction */
2067 inline static void unlock_journal(struct super_block *p_s_sb) {
2068 - atomic_dec(&(SB_JOURNAL(p_s_sb)->j_wlock)) ;
2069 - wake_up(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
2070 + up(&SB_JOURNAL(p_s_sb)->j_lock);
2074 @@ -602,6 +620,83 @@
2075 jl->j_list_bitmap = NULL ;
2078 +static int journal_list_still_alive(struct super_block *s,
2079 + unsigned long trans_id)
2081 + struct list_head *entry = &SB_JOURNAL(s)->j_journal_list;
2082 + struct reiserfs_journal_list *jl;
2084 + if (!list_empty(entry)) {
2085 + jl = JOURNAL_LIST_ENTRY(entry->next);
2086 + if (jl->j_trans_id <= trans_id) {
2093 +static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) {
2094 + struct reiserfs_journal_list *other_jl;
2095 + struct reiserfs_journal_list *first_jl;
2096 + struct list_head *entry;
2097 + unsigned long trans_id = jl->j_trans_id;
2098 + unsigned long other_trans_id;
2099 + unsigned long first_trans_id;
 2103 + * first we walk backwards to find the oldest uncommitted transaction
2106 + entry = jl->j_list.prev;
2108 + other_jl = JOURNAL_LIST_ENTRY(entry);
2109 + if (entry == &SB_JOURNAL(s)->j_journal_list ||
2110 + atomic_read(&other_jl->j_older_commits_done))
2113 + first_jl = other_jl;
2114 + entry = other_jl->j_list.prev;
2117 + /* if we didn't find any older uncommitted transactions, return now */
2118 + if (first_jl == jl) {
2122 + first_trans_id = first_jl->j_trans_id;
2124 + entry = &first_jl->j_list;
2126 + other_jl = JOURNAL_LIST_ENTRY(entry);
2127 + other_trans_id = other_jl->j_trans_id;
2129 + if (other_trans_id < trans_id) {
2130 + if (atomic_read(&other_jl->j_commit_left) != 0) {
2131 + flush_commit_list(s, other_jl, 0);
2133 + /* list we were called with is gone, return */
2134 + if (!journal_list_still_alive(s, trans_id))
2137 + /* the one we just flushed is gone, this means all
2138 + * older lists are also gone, so first_jl is no longer
2139 + * valid either. Go back to the beginning.
2141 + if (!journal_list_still_alive(s, other_trans_id)) {
2145 + entry = entry->next;
2146 + if (entry == &SB_JOURNAL(s)->j_journal_list)
2156 ** if this journal list still has commit blocks unflushed, send them to disk.
2158 @@ -611,16 +706,19 @@
2160 static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) {
2164 int retry_count = 0 ;
2165 int orig_commit_left = 0 ;
2166 struct buffer_head *tbh = NULL ;
2167 - struct reiserfs_journal_list *other_jl ;
2168 + unsigned long trans_id = jl->j_trans_id;
2170 reiserfs_check_lock_depth("flush_commit_list") ;
2172 if (atomic_read(&jl->j_older_commits_done)) {
2173 + if (!list_empty(&jl->j_ordered_bh_list))
2175 + if (!list_empty(&jl->j_tail_bh_list))
2180 @@ -628,50 +726,51 @@
2181 ** us is on disk too
2183 if (jl->j_len <= 0) {
2187 + if (trans_id == SB_JOURNAL(s)->j_trans_id)
2191 - /* we _must_ make sure the transactions are committed in order. Start with the
2192 - ** index after this one, wrap all the way around
2194 - index = (jl - SB_JOURNAL_LIST(s)) + 1 ;
2195 - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
2196 - other_jl = SB_JOURNAL_LIST(s) + ( (index + i) % JOURNAL_LIST_COUNT) ;
2197 - if (other_jl && other_jl != jl && other_jl->j_len > 0 && other_jl->j_trans_id > 0 &&
2198 - other_jl->j_trans_id <= jl->j_trans_id && (atomic_read(&(jl->j_older_commits_done)) == 0)) {
2199 - flush_commit_list(s, other_jl, 0) ;
2201 + if (flush_older_commits(s, jl) == 1) {
2202 + /* list disappeared during flush_older_commits. return */
2208 - /* don't flush the commit list for the current transactoin */
2209 - if (jl == ((SB_JOURNAL_LIST(s) + SB_JOURNAL_LIST_INDEX(s)))) {
2213 /* make sure nobody is trying to flush this one at the same time */
2214 - if (atomic_read(&(jl->j_commit_flushing))) {
2215 - sleep_on(&(jl->j_commit_wait)) ;
2217 - atomic_set(&(jl->j_older_commits_done), 1) ;
2220 + down(&jl->j_commit_lock);
2221 + if (!journal_list_still_alive(s, trans_id)) {
2222 + up(&jl->j_commit_lock);
2225 + if (jl->j_trans_id == 0)
2228 /* this commit is done, exit */
2229 if (atomic_read(&(jl->j_commit_left)) <= 0) {
2231 atomic_set(&(jl->j_older_commits_done), 1) ;
2233 + if (!list_empty(&jl->j_ordered_bh_list))
2235 + if (!list_empty(&jl->j_tail_bh_list))
2237 + up(&jl->j_commit_lock);
2240 - /* keeps others from flushing while we are flushing */
2241 - atomic_set(&(jl->j_commit_flushing), 1) ;
2244 + /* write any buffers that must hit disk before the commit is done */
2245 + while(!list_empty(&jl->j_ordered_bh_list)) {
2247 + fsync_buffers_list(&jl->j_ordered_bh_list);
2250 if (jl->j_len > SB_JOURNAL_TRANS_MAX(s)) {
2251 - reiserfs_panic(s, "journal-512: flush_commit_list: length is %lu, list number %d\n", jl->j_len, jl - SB_JOURNAL_LIST(s)) ;
2252 + reiserfs_panic(s, "journal-512: flush_commit_list: length is %lu, trans_id %lu\n", jl->j_len, jl->j_trans_id) ;
2257 if (buffer_dirty(tbh)) {
2258 reiserfs_warning(s, "journal-569: flush_commit_list, block already dirty!\n") ;
2260 - mark_buffer_dirty(tbh) ;
2261 + atomic_set_buffer_dirty(tbh);
2263 ll_rw_block(WRITE, 1, &tbh) ;
2265 @@ -745,16 +844,22 @@
2266 atomic_dec(&(jl->j_commit_left)) ;
2267 bforget(jl->j_commit_bh) ;
2269 + if (SB_JOURNAL(s)->j_last_commit_id != 0 &&
2270 + (jl->j_trans_id - SB_JOURNAL(s)->j_last_commit_id) != 1) {
2271 + reiserfs_warning(s, "clm-2200: dev %s, last commit %lu, current %lu\n",
2272 + kdevname(s->s_dev), SB_JOURNAL(s)->j_last_commit_id,
2273 + jl->j_trans_id);
2275 + SB_JOURNAL(s)->j_last_commit_id = jl->j_trans_id;
2277 /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */
2278 cleanup_freed_for_journal_list(s, jl) ;
2281 atomic_set(&(jl->j_older_commits_done), 1) ;
2283 - atomic_set(&(jl->j_commit_flushing), 0) ;
2284 - wake_up(&(jl->j_commit_wait)) ;
2285 + up(&jl->j_commit_lock);
2291 @@ -853,22 +958,27 @@
2292 ** flush any and all journal lists older than you are
2293 ** can only be called from flush_journal_list
2295 -static int flush_older_journal_lists(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, unsigned long trans_id) {
2297 - struct reiserfs_journal_list *other_jl ;
2299 - index = jl - SB_JOURNAL_LIST(p_s_sb) ;
2300 - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
2301 - other_jl = SB_JOURNAL_LIST(p_s_sb) + ((index + i) % JOURNAL_LIST_COUNT) ;
2302 - if (other_jl && other_jl->j_len > 0 &&
2303 - other_jl->j_trans_id > 0 &&
2304 - other_jl->j_trans_id < trans_id &&
2306 - /* do not flush all */
2307 - flush_journal_list(p_s_sb, other_jl, 0) ;
2308 +static int flush_older_journal_lists(struct super_block *p_s_sb,
2309 + struct reiserfs_journal_list *jl)
2311 + struct list_head *entry;
2312 + struct reiserfs_journal_list *other_jl ;
2313 + unsigned long trans_id = jl->j_trans_id;
2315 + /* we know we are the only ones flushing things, no extra race
2316 + * protection is required.
2319 + entry = SB_JOURNAL(p_s_sb)->j_journal_list.next;
2320 + other_jl = JOURNAL_LIST_ENTRY(entry);
2321 + if (other_jl->j_trans_id < trans_id) {
2322 + /* do not flush all */
2323 + flush_journal_list(p_s_sb, other_jl, 0) ;
2325 + /* other_jl is now deleted from the list */
2333 static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
2334 @@ -881,14 +991,23 @@
2337 static void submit_logged_buffer(struct buffer_head *bh) {
2340 bh->b_end_io = reiserfs_end_buffer_io_sync ;
2341 mark_buffer_notjournal_new(bh) ;
2342 clear_bit(BH_Dirty, &bh->b_state) ;
2343 + if (!buffer_uptodate(bh))
2345 submit_bh(WRITE, bh) ;
2348 +static void del_from_work_list(struct super_block *s,
2349 + struct reiserfs_journal_list *jl) {
2350 + if (!list_empty(&jl->j_working_list)) {
2351 + list_del_init(&jl->j_working_list);
2352 + SB_JOURNAL(s)->j_num_work_lists--;
2356 /* flush a journal list, both commit and real blocks
2358 ** always set flushall to 1, unless you are calling from inside
2359 @@ -909,29 +1028,27 @@
2360 unsigned long j_len_saved = jl->j_len ;
2362 if (j_len_saved <= 0) {
2367 if (atomic_read(&SB_JOURNAL(s)->j_wcount) != 0) {
2368 reiserfs_warning(s, "clm-2048: flush_journal_list called with wcount %d\n",
2369 atomic_read(&SB_JOURNAL(s)->j_wcount)) ;
2371 - /* if someone is getting the commit list, we must wait for them */
2372 - while (atomic_read(&(jl->j_commit_flushing))) {
2373 - sleep_on(&(jl->j_commit_wait)) ;
2375 - /* if someone is flushing this list, we must wait for them */
2376 - while (atomic_read(&(jl->j_flushing))) {
2377 - sleep_on(&(jl->j_flush_wait)) ;
2380 - /* this list is now ours, we can change anything we want */
2381 - atomic_set(&(jl->j_flushing), 1) ;
2382 + if (jl->j_trans_id == 0)
2385 + /* if flushall == 0, the lock is already held */
2387 + down(&SB_JOURNAL(s)->j_flush_sem);
2388 + } else if (!down_trylock(&SB_JOURNAL(s)->j_flush_sem)) {
2393 if (j_len_saved > SB_JOURNAL_TRANS_MAX(s)) {
2394 - reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, list number %d\n", j_len_saved, jl - SB_JOURNAL_LIST(s)) ;
2395 - atomic_dec(&(jl->j_flushing)) ;
2396 + reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, transid %lu\n", j_len_saved, jl->j_trans_id) ;
2400 @@ -981,13 +1098,13 @@
2403 if (buffer_journal_dirty(saved_bh)) {
2404 + if (!can_dirty(cn))
2407 - mark_buffer_notjournal_dirty(saved_bh) ;
2408 - /* undo the inc from journal_mark_dirty */
2409 - put_bh(saved_bh) ;
2411 - if (can_dirty(cn)) {
2414 + } else if (can_dirty(cn)) {
2415 + /* everything with !pjl && jwait should be writable */
2420 @@ -995,7 +1112,8 @@
2421 ** sure they are commited, and don't try writing it to disk
2424 - flush_commit_list(s, pjl, 1) ;
2425 + if (atomic_read(&pjl->j_commit_left))
2426 + flush_commit_list(s, pjl, 1) ;
2430 @@ -1029,7 +1147,12 @@
2431 /* we inc again because saved_bh gets decremented at free_cnode */
2433 set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
2434 + lock_buffer(saved_bh);
2435 submit_logged_buffer(saved_bh) ;
2436 + if (cn->blocknr != saved_bh->b_blocknr) {
2437 +printk("cn %lu does not match bh %lu\n", cn->blocknr, saved_bh->b_blocknr);
2442 reiserfs_warning(s, "clm-2082: Unable to flush buffer %lu in flush_journal_list\n",
2443 @@ -1057,9 +1180,23 @@
2445 reiserfs_panic(s, "journal-1012: cn->bh is NULL\n") ;
2447 + if (cn->blocknr != cn->bh->b_blocknr) {
2448 +printk("2cn %lu does not match bh %lu\n", cn->blocknr, cn->bh->b_blocknr);
2451 if (!buffer_uptodate(cn->bh)) {
2452 - reiserfs_panic(s, "journal-949: buffer write failed\n") ;
2453 + reiserfs_panic(s, "journal-949: buffer %lu write failed\n", cn->bh->b_blocknr) ;
2456 + /* note, we must clear the JDirty_wait bit after the up to date
2457 + ** check, otherwise we race against our flushpage routine
2459 + if (!test_and_clear_bit(BH_JDirty_wait, &cn->bh->b_state))
2462 + /* undo the inc from journal_mark_dirty */
2465 refile_buffer(cn->bh) ;
2468 @@ -1074,7 +1211,7 @@
2469 ** replayed after a crash
2472 - flush_older_journal_lists(s, jl, jl->j_trans_id) ;
2473 + flush_older_journal_lists(s, jl);
2476 /* before we can remove everything from the hash tables for this
2477 @@ -1089,46 +1226,137 @@
2478 update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ;
2480 remove_all_from_journal_list(s, jl, 0) ;
2481 + list_del(&jl->j_list);
2482 + SB_JOURNAL(s)->j_num_lists--;
2483 + del_from_work_list(s, jl);
2485 + if (SB_JOURNAL(s)->j_last_flush_id != 0 &&
2486 + (jl->j_trans_id - SB_JOURNAL(s)->j_last_flush_id) != 1) {
2487 + reiserfs_warning(s, "clm-2201: dev %s, last flush %lu, current %lu\n",
2488 + kdevname(s->s_dev), SB_JOURNAL(s)->j_last_flush_id,
2489 + jl->j_trans_id);
2491 + SB_JOURNAL(s)->j_last_flush_id = jl->j_trans_id;
2493 + /* not strictly required since we are freeing the list, but it should
2494 + * help find code using dead lists later on
2497 atomic_set(&(jl->j_nonzerolen), 0) ;
2499 jl->j_realblock = NULL ;
2500 jl->j_commit_bh = NULL ;
2501 jl->j_trans_id = 0 ;
2502 - atomic_dec(&(jl->j_flushing)) ;
2503 - wake_up(&(jl->j_flush_wait)) ;
2506 + if (!list_empty(&jl->j_ordered_bh_list))
2509 + if (!list_empty(&jl->j_tail_bh_list))
2512 + // kmem_cache_free(journal_list_cachep, jl);
2513 + reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s);
2516 + up(&SB_JOURNAL(s)->j_flush_sem);
2521 -static int kupdate_one_transaction(struct super_block *s,
2522 +#define CHUNK_SIZE 32
2523 +struct buffer_chunk {
2524 + struct buffer_head *bh[CHUNK_SIZE];
2528 +static void write_chunk(struct buffer_chunk *chunk) {
2530 + for (i = 0; i < chunk->nr ; i++) {
2531 + submit_logged_buffer(chunk->bh[i]) ;
2536 +static void add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh) {
2537 + if (chunk->nr >= CHUNK_SIZE)
2539 + chunk->bh[chunk->nr++] = bh;
2540 + if (chunk->nr >= CHUNK_SIZE)
2541 + write_chunk(chunk);
2544 +static int write_one_transaction(struct super_block *s,
2545 + struct reiserfs_journal_list *jl,
2546 + struct buffer_chunk *chunk)
2548 + struct reiserfs_journal_list *pjl ; /* previous list for this cn */
2549 + struct reiserfs_journal_cnode *cn;
2552 + jl->j_state |= LIST_TOUCHED;
2553 + if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
2554 + del_from_work_list(s, jl);
2557 + del_from_work_list(s, jl);
2559 + cn = jl->j_realblock ;
2561 + /* if the blocknr == 0, this has been cleared from the hash,
2564 + if (cn->blocknr == 0) {
2567 + /* look for a more recent transaction that logged this
2568 + ** buffer. Only the most recent transaction with a buffer in
2569 + ** it is allowed to send that buffer to disk
2571 + pjl = find_newer_jl_for_cn(cn) ;
2572 + if (!pjl && cn->bh && buffer_journal_dirty(cn->bh) && can_dirty(cn)) {
2573 + if (!test_bit(BH_JPrepared, &cn->bh->b_state)) {
2574 + struct buffer_head *tmp_bh;
2575 + /* we can race against journal_mark_freed when we try
2576 + * to lock_buffer(cn->bh), so we have to inc the buffer
2577 + * count, and recheck things after locking
2581 + set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
2582 + lock_buffer(tmp_bh);
2583 + if (cn->bh && buffer_journal_dirty(tmp_bh) &&
2584 + !test_bit(BH_JPrepared, &tmp_bh->b_state))
2586 + add_to_chunk(chunk, tmp_bh);
2589 + /* note, cn->bh might be null now */
2590 + unlock_buffer(tmp_bh);
2597 + if (current->need_resched)
2603 +static int wait_one_transaction(struct super_block *s,
2604 struct reiserfs_journal_list *jl)
2606 struct reiserfs_journal_list *pjl ; /* previous list for this cn */
2607 struct reiserfs_journal_cnode *cn, *walk_cn ;
2608 unsigned long blocknr ;
2610 - int orig_trans_id = jl->j_trans_id ;
2611 struct buffer_head *saved_bh ;
2614 - /* if someone is getting the commit list, we must wait for them */
2615 - while (atomic_read(&(jl->j_commit_flushing))) {
2616 - sleep_on(&(jl->j_commit_wait)) ;
2618 - /* if someone is flushing this list, we must wait for them */
2619 - while (atomic_read(&(jl->j_flushing))) {
2620 - sleep_on(&(jl->j_flush_wait)) ;
2622 - /* was it flushed while we slept? */
2623 - if (jl->j_len <= 0 || jl->j_trans_id != orig_trans_id) {
2625 + if (atomic_read(&jl->j_commit_left) != 0 || jl->j_len <= 0) {
2629 - /* this list is now ours, we can change anything we want */
2630 - atomic_set(&(jl->j_flushing), 1) ;
2633 cn = jl->j_realblock ;
2636 @@ -1143,27 +1371,14 @@
2637 ** it is allowed to send that buffer to disk
2639 pjl = find_newer_jl_for_cn(cn) ;
2640 - if (run == 0 && !pjl && cn->bh && buffer_journal_dirty(cn->bh) &&
2643 - if (!test_bit(BH_JPrepared, &cn->bh->b_state)) {
2644 - set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
2645 - submit_logged_buffer(cn->bh) ;
2647 - /* someone else is using this buffer. We can't
2648 - ** send it to disk right now because they might
2649 - ** be changing/logging it.
2653 - } else if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
2654 + if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
2655 clear_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
2656 if (!pjl && cn->bh) {
2657 wait_on_buffer(cn->bh) ;
2659 - /* check again, someone could have logged while we scheduled */
2660 - pjl = find_newer_jl_for_cn(cn) ;
2661 + /* check again, someone could have logged while we scheduled */
2662 + pjl = find_newer_jl_for_cn(cn) ;
2665 /* before the JDirty_wait bit is set, the
2666 ** buffer is added to the hash list. So, if we are
2667 ** run in the middle of a do_journal_end, we will notice
2668 @@ -1210,60 +1425,182 @@
2672 + if (current->need_resched)
2675 - /* the first run through the loop sends all the dirty buffers to
2677 - ** the second run through the loop does all the accounting
2684 +static int kupdate_transactions(struct super_block *s,
2685 + struct reiserfs_journal_list *jl,
2686 + struct reiserfs_journal_list **next_jl,
2687 + unsigned long *next_trans_id,
2692 + int transactions_flushed = 0;
2693 + unsigned long orig_trans_id = jl->j_trans_id;
2694 + struct reiserfs_journal_list *orig_jl = jl;
2695 + struct buffer_chunk chunk;
2696 + struct list_head *entry;
2699 + down(&SB_JOURNAL(s)->j_flush_sem);
2700 + if (!journal_list_still_alive(s, orig_trans_id)) {
2704 - atomic_set(&(jl->j_flushing), 0) ;
2705 - wake_up(&(jl->j_flush_wait)) ;
2707 + /* we've got j_flush_sem held, nobody is going to delete any
2708 + * of these lists out from underneath us
2710 + while((num_trans && transactions_flushed < num_trans) ||
2711 + (!num_trans && written < num_blocks)) {
2713 + if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
2714 + atomic_read(&jl->j_commit_left))
2716 + del_from_work_list(s, jl);
2719 + ret = write_one_transaction(s, jl, &chunk);
2723 + transactions_flushed++;
2725 + entry = jl->j_list.next;
2727 + /* did we wrap? */
2728 + if (entry == &SB_JOURNAL(s)->j_journal_list) {
2731 + jl = JOURNAL_LIST_ENTRY(entry);
2733 + /* don't bother with older transactions */
2734 + if (jl->j_trans_id <= orig_trans_id)
2738 + write_chunk(&chunk);
2743 + *next_trans_id = jl->j_trans_id;
2744 + ret = transactions_flushed;
2745 + while(transactions_flushed--) {
2747 + wait_one_transaction(s, jl);
2748 + entry = jl->j_list.next;
2749 + jl = JOURNAL_LIST_ENTRY(entry);
2751 + /* make sure we can really count */
2752 + if (jl->j_trans_id <= orig_trans_id && transactions_flushed > 0) {
2753 +printk("flushing %s %lu, orig_trans_id was %lu\n", kdevname(s->s_dev), jl->j_trans_id, orig_trans_id);
2757 + *next_trans_id = jl->j_trans_id;
2761 + up(&SB_JOURNAL(s)->j_flush_sem);
2765 +/* for o_sync and fsync heavy applications, they tend to use
2766 +** all the journal list slots with tiny transactions. These
2767 +** trigger lots and lots of calls to update the header block, which
2768 +** adds seeks and slows things down.
2770 +** This function tries to clear out a large chunk of the journal lists
2771 +** at once, which makes everything faster since only the newest journal
2772 +** list updates the header block
2774 +static int flush_used_journal_lists(struct super_block *s,
2775 + struct reiserfs_journal_list *jl) {
2776 + unsigned long len = 0;
2777 + unsigned long cur_len;
2780 + struct reiserfs_journal_list *tjl;
2781 + struct reiserfs_journal_list *flush_jl;
2782 + unsigned long trans_id;
2784 + flush_jl = tjl = jl;
2786 + /* flush for 256 transactions or 256 blocks, whichever comes first */
2787 + for(i = 0 ; i < 256 && len < 256 ; i++) {
2788 + if (atomic_read(&tjl->j_commit_left) ||
2789 + tjl->j_trans_id < jl->j_trans_id) {
2792 + cur_len = atomic_read(&tjl->j_nonzerolen);
2793 + if (cur_len > 0) {
2794 + tjl->j_state &= ~LIST_TOUCHED;
2798 + if (tjl->j_list.next == &SB_JOURNAL(s)->j_journal_list)
2800 + tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
2802 + /* try to find a group of blocks we can flush across all the
2803 + ** transactions, but only bother if we've actually spanned
2804 + ** across multiple lists
2806 + if (flush_jl != jl) {
2807 + ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
2809 + flush_journal_list(s, flush_jl, 1) ;
2814 /* since we never give dirty buffers to bdflush/kupdate, we have to
2815 ** flush them ourselves. This runs through the journal lists, finds
2816 ** old metadata in need of flushing and sends it to disk.
2817 ** this does not end transactions, commit anything, or free
2820 -** returns the highest transaction id that was flushed last time
2822 static unsigned long reiserfs_journal_kupdate(struct super_block *s) {
2823 - struct reiserfs_journal_list *jl ;
2826 + struct reiserfs_journal_list *jl, *next_jl;
2827 + unsigned long trans_id, next_trans_id;
2831 - start = SB_JOURNAL_LIST_INDEX(s) ;
2832 + jl = JOURNAL_WORK_ENTRY(SB_JOURNAL(s)->j_working_list.next);
2834 - /* safety check to prevent flush attempts during a mount */
2837 + /* kupdate transactions might not set next_trans_id, it must be
2838 + * initialized before each call
2840 + next_trans_id = 0;
2841 + if (list_empty(&SB_JOURNAL(s)->j_working_list)) {
2844 - i = (start + 1) % JOURNAL_LIST_COUNT ;
2845 - while(i != start) {
2846 - jl = SB_JOURNAL_LIST(s) + i ;
2847 - age = CURRENT_TIME - jl->j_timestamp ;
2848 - if (jl->j_len > 0 && // age >= (JOURNAL_MAX_COMMIT_AGE * 2) &&
2849 - atomic_read(&(jl->j_nonzerolen)) > 0 &&
2850 - atomic_read(&(jl->j_commit_left)) == 0) {
2851 + trans_id = jl->j_trans_id;
2853 - if (jl->j_trans_id == SB_JOURNAL(s)->j_trans_id) {
2856 - /* if ret was already 1, we want to preserve that */
2857 - ret |= kupdate_one_transaction(s, jl) ;
2859 - if (atomic_read(&(jl->j_nonzerolen)) > 0) {
2862 - i = (i + 1) % JOURNAL_LIST_COUNT ;
2863 + /* check for race with the code that frees lists */
2864 + if (jl->j_trans_id == 0)
2866 + age = CURRENT_TIME - jl->j_timestamp ;
2867 + if (age >= SB_JOURNAL_MAX_COMMIT_AGE(s) &&
2868 + atomic_read(&jl->j_nonzerolen) > 0 &&
2869 + atomic_read(&jl->j_commit_left) == 0)
2871 + if (kupdate_transactions(s, jl, &next_jl, &next_trans_id, 32, 32) < 0)
2873 + if (next_jl != JOURNAL_WORK_ENTRY(&SB_JOURNAL(s)->j_working_list) &&
2874 + next_trans_id > trans_id)
2876 + if (journal_list_still_alive(s, next_trans_id)) {
2887 @@ -1307,6 +1644,12 @@
2890 static void free_journal_ram(struct super_block *p_s_sb) {
2892 + // kmem_cache_free(journal_list_cachep, SB_JOURNAL(p_s_sb)->j_current_jl);
2893 + reiserfs_kfree(SB_JOURNAL(p_s_sb)->j_current_jl,
2894 + sizeof(struct reiserfs_journal_list), p_s_sb);
2895 + SB_JOURNAL(p_s_sb)->j_num_lists--;
2897 vfree(SB_JOURNAL(p_s_sb)->j_cnode_free_orig) ;
2898 free_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap) ;
2899 free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */
2900 @@ -1327,6 +1670,10 @@
2901 static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, int error) {
2902 struct reiserfs_transaction_handle myth ;
2904 + down(&kreiserfsd_sem);
2905 + list_del(&p_s_sb->u.reiserfs_sb.s_reiserfs_supers);
2906 + up(&kreiserfsd_sem);
2908 /* we only want to flush out transactions if we were called with error == 0
2910 if (!error && !(p_s_sb->s_flags & MS_RDONLY)) {
2911 @@ -1813,66 +2160,6 @@
2916 -struct reiserfs_journal_commit_task {
2917 - struct super_block *p_s_sb ;
2919 - int wake_on_finish ; /* if this is one, we wake the task_done queue, if it
2920 - ** is zero, we free the whole struct on finish
2922 - struct reiserfs_journal_commit_task *self ;
2923 - struct wait_queue *task_done ;
2924 - struct tq_struct task ;
2927 -static void reiserfs_journal_commit_task_func(struct reiserfs_journal_commit_task *ct) {
2929 - struct reiserfs_journal_list *jl ;
2930 - jl = SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex ;
2932 - flush_commit_list(ct->p_s_sb, SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex, 1) ;
2934 - if (jl->j_len > 0 && atomic_read(&(jl->j_nonzerolen)) > 0 &&
2935 - atomic_read(&(jl->j_commit_left)) == 0) {
2936 - kupdate_one_transaction(ct->p_s_sb, jl) ;
2938 - reiserfs_kfree(ct->self, sizeof(struct reiserfs_journal_commit_task), ct->p_s_sb) ;
2941 -static void setup_commit_task_arg(struct reiserfs_journal_commit_task *ct,
2942 - struct super_block *p_s_sb,
2945 - reiserfs_panic(NULL, "journal-1360: setup_commit_task_arg called with NULL struct\n") ;
2947 - ct->p_s_sb = p_s_sb ;
2948 - ct->jindex = jindex ;
2949 - ct->task_done = NULL ;
2950 - INIT_LIST_HEAD(&ct->task.list) ;
2951 - ct->task.sync = 0 ;
2952 - ct->task.routine = (void *)(void *)reiserfs_journal_commit_task_func ;
2954 - ct->task.data = (void *)ct ;
2957 -static void commit_flush_async(struct super_block *p_s_sb, int jindex) {
2958 - struct reiserfs_journal_commit_task *ct ;
2959 - /* using GFP_NOFS, GFP_KERNEL could try to flush inodes, which will try
2960 - ** to start/join a transaction, which will deadlock
2962 - ct = reiserfs_kmalloc(sizeof(struct reiserfs_journal_commit_task), GFP_NOFS, p_s_sb) ;
2964 - setup_commit_task_arg(ct, p_s_sb, jindex) ;
2965 - queue_task(&(ct->task), &reiserfs_commit_thread_tq);
2966 - wake_up(&reiserfs_commit_thread_wait) ;
2968 -#ifdef CONFIG_REISERFS_CHECK
2969 - reiserfs_warning(p_s_sb, "journal-1540: kmalloc failed, doing sync commit\n") ;
2971 - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ;
2976 ** this is the commit thread. It is started with kernel_thread on
2977 ** FS mount, and journal_release() waits for it to exit.
2978 @@ -1885,6 +2172,9 @@
2979 ** then run the per filesystem commit task queue when we wakeup.
2981 static int reiserfs_journal_commit_thread(void *nullp) {
2982 + struct list_head *entry, *safe ;
2983 + struct super_block *s;
2984 + time_t last_run = 0;
2988 @@ -1897,13 +2187,73 @@
2992 - while(TQ_ACTIVE(reiserfs_commit_thread_tq)) {
2993 - run_task_queue(&reiserfs_commit_thread_tq) ;
2995 + down(&kreiserfsd_sem);
2996 + list_for_each_safe(entry, safe, &kreiserfsd_supers) {
2997 + s = list_entry(entry, struct super_block,
2998 + u.reiserfs_sb.s_reiserfs_supers);
2999 + if (!(s->s_flags & MS_RDONLY)) {
3000 + flush_async_commits(s);
3002 + if (CURRENT_TIME - last_run > 5) {
3003 + reiserfs_flush_old_commits(s);
3006 + if (!list_empty(&SB_JOURNAL(s)->j_working_list)) {
3007 + struct reiserfs_journal_list *jl, *tjl;
3008 + unsigned long trans_id ;
3009 + unsigned long start;
3010 + unsigned long cur_start;
3011 + unsigned long nfract = SB_ONDISK_JOURNAL_SIZE(s) / 4;
3014 + jl = JOURNAL_WORK_ENTRY(SB_JOURNAL(s)->j_working_list.next);
3015 + cur_start = SB_JOURNAL(s)->j_start;
3016 + start = jl->j_start;
3018 + /* pretend the log doesn't actually wrap */
3019 + if (cur_start < start) {
3020 + cur_start = cur_start + SB_ONDISK_JOURNAL_SIZE(s);
3023 + /* if the first transaction on the working list is more
3024 + * than nfract blocks away from the current transaction start
3025 + * or there are more than 32 working lists, start
3026 + * a background flush
3028 + if (cur_start - start > nfract ||
3029 + SB_JOURNAL(s)->j_num_work_lists > 32) {
3030 + tjl=JOURNAL_LIST_ENTRY(SB_JOURNAL(s)->j_journal_list.next);
3031 + ret = kupdate_transactions(s, jl, &tjl, &trans_id,32,128);
3036 + /* check again for new async commits that need tending */
3037 + list_for_each_safe(entry, safe, &kreiserfsd_supers) {
3038 + s = list_entry(entry, struct super_block,
3039 + u.reiserfs_sb.s_reiserfs_supers);
3040 + if (!list_empty(&SB_JOURNAL(s)->j_journal_list)) {
3041 + struct reiserfs_journal_list *jl;
3042 + struct list_head *entry;
3044 + /* last entry is the youngest, commit it and you get everything */
3045 + entry = SB_JOURNAL(s)->j_journal_list.prev;
3046 + jl = JOURNAL_LIST_ENTRY(entry);
3047 + if (!atomic_read(&(jl->j_older_commits_done))) {
3048 + /* give new mounts a chance to come in */
3049 + up(&kreiserfsd_sem);
3050 + last_run = CURRENT_TIME;
3051 + wake_up_all(&reiserfs_commit_thread_done) ;
3056 + up(&kreiserfsd_sem);
3057 + last_run = CURRENT_TIME;
3059 /* if there aren't any more filesystems left, break */
3060 if (reiserfs_mounted_fs_count <= 0) {
3061 - run_task_queue(&reiserfs_commit_thread_tq) ;
3064 wake_up(&reiserfs_commit_thread_done) ;
3065 @@ -1914,12 +2264,28 @@
3069 +static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
3071 + struct reiserfs_journal_list *jl;
3073 + // jl = (struct reiserfs_journal_list *)kmem_cache_alloc(journal_list_cachep, SLAB_NOFS);
3074 + jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s);
3079 + memset(jl, 0, sizeof(*jl));
3080 + INIT_LIST_HEAD(&jl->j_list);
3081 + INIT_LIST_HEAD(&jl->j_working_list);
3082 + INIT_LIST_HEAD(&jl->j_ordered_bh_list);
3083 + INIT_LIST_HEAD(&jl->j_tail_bh_list);
3084 + sema_init(&jl->j_commit_lock, 1);
3085 + SB_JOURNAL(s)->j_num_lists++;
3089 static void journal_list_init(struct super_block *p_s_sb) {
3091 - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
3092 - init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_commit_wait)) ;
3093 - init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_flush_wait)) ;
3095 + SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
3098 static int release_journal_dev( struct super_block *super,
3099 @@ -1952,7 +2318,6 @@
3100 int blkdev_mode = FMODE_READ | FMODE_WRITE;
3104 journal -> j_dev_bd = NULL;
3105 journal -> j_dev_file = NULL;
3106 jdev = SB_JOURNAL_DEV( super ) =
3107 @@ -2030,7 +2395,6 @@
3108 printk( "journal_init_dev: journal device: %s", kdevname( SB_JOURNAL_DEV( super ) ) );
3113 ** must be called once on fs mount. calls journal_read for you
3115 @@ -2041,6 +2405,7 @@
3116 struct reiserfs_super_block * rs;
3117 struct reiserfs_journal_header *jh;
3118 struct reiserfs_journal *journal;
3119 + struct reiserfs_journal_list *jl;
3121 if (sizeof(struct reiserfs_journal_commit) != 4096 ||
3122 sizeof(struct reiserfs_journal_desc) != 4096) {
3123 @@ -2054,7 +2419,6 @@
3124 reiserfs_warning(p_s_sb, "Journal size %d is less than 512+1 blocks, which unsupported\n", SB_ONDISK_JOURNAL_SIZE(p_s_sb));
3128 journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ;
3130 reiserfs_warning(p_s_sb, "journal-1256: unable to get memory for journal structure\n") ;
3131 @@ -2155,15 +2519,9 @@
3132 SB_JOURNAL_MAX_BATCH(p_s_sb) = SB_JOURNAL_TRANS_MAX(p_s_sb)*9 / 10;
3138 SB_JOURNAL(p_s_sb)->j_list_bitmap_index = 0 ;
3139 - SB_JOURNAL_LIST_INDEX(p_s_sb) = -10000 ; /* make sure flush_old_commits does not try to flush a list while replay is on */
3141 - /* clear out the journal list array */
3142 - memset(SB_JOURNAL_LIST(p_s_sb), 0,
3143 - sizeof(struct reiserfs_journal_list) * JOURNAL_LIST_COUNT) ;
3145 journal_list_init(p_s_sb) ;
3147 @@ -2171,8 +2529,6 @@
3148 JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
3149 memset(journal_writers, 0, sizeof(char *) * 512) ; /* debug code */
3151 - INIT_LIST_HEAD(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
3153 SB_JOURNAL(p_s_sb)->j_start = 0 ;
3154 SB_JOURNAL(p_s_sb)->j_len = 0 ;
3155 SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ;
3156 @@ -2182,13 +2538,15 @@
3157 SB_JOURNAL(p_s_sb)->j_last = NULL ;
3158 SB_JOURNAL(p_s_sb)->j_first = NULL ;
3159 init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
3160 - init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
3162 + sema_init(&SB_JOURNAL(p_s_sb)->j_lock, 1);
3163 + sema_init(&SB_JOURNAL(p_s_sb)->j_flush_sem, 1);
3164 + INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_journal_list);
3165 + INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_working_list);
3167 SB_JOURNAL(p_s_sb)->j_trans_id = 10 ;
3168 SB_JOURNAL(p_s_sb)->j_mount_id = 10 ;
3169 SB_JOURNAL(p_s_sb)->j_state = 0 ;
3170 atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
3171 - atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 0) ;
3172 SB_JOURNAL(p_s_sb)->j_cnode_free_list = allocate_cnodes(num_cnodes) ;
3173 SB_JOURNAL(p_s_sb)->j_cnode_free_orig = SB_JOURNAL(p_s_sb)->j_cnode_free_list ;
3174 SB_JOURNAL(p_s_sb)->j_cnode_free = SB_JOURNAL(p_s_sb)->j_cnode_free_list ?
3175 @@ -2196,8 +2554,9 @@
3176 SB_JOURNAL(p_s_sb)->j_cnode_used = 0 ;
3177 SB_JOURNAL(p_s_sb)->j_must_wait = 0 ;
3178 init_journal_hash(p_s_sb) ;
3179 - SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb)) ;
3180 - if (!(SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap)) {
3181 + jl = SB_JOURNAL(p_s_sb)->j_current_jl;
3182 + jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl) ;
3183 + if (!jl->j_list_bitmap) {
3184 reiserfs_warning(p_s_sb, "journal-2005, get_list_bitmap failed for journal list 0\n") ;
3185 goto free_and_return;
3187 @@ -2205,8 +2564,6 @@
3188 reiserfs_warning(p_s_sb, "Replay Failure, unable to mount\n") ;
3189 goto free_and_return;
3191 - /* once the read is done, we can set this where it belongs */
3192 - SB_JOURNAL_LIST_INDEX(p_s_sb) = 0 ;
3194 if (reiserfs_dont_log (p_s_sb))
3196 @@ -2216,6 +2573,9 @@
3197 kernel_thread((void *)(void *)reiserfs_journal_commit_thread, NULL,
3198 CLONE_FS | CLONE_FILES | CLONE_VM) ;
3200 + down(&kreiserfsd_sem);
3201 + list_add(&p_s_sb->u.reiserfs_sb.s_reiserfs_supers, &kreiserfsd_supers);
3202 + up(&kreiserfsd_sem);
3206 @@ -2230,7 +2590,9 @@
3208 int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) {
3209 time_t now = CURRENT_TIME ;
3210 - if (reiserfs_dont_log(th->t_super))
3212 + /* cannot restart while nested unless the parent allows it */
3213 + if (!reiserfs_restartable_handle(th) && th->t_refcount > 1)
3215 if ( SB_JOURNAL(th->t_super)->j_must_wait > 0 ||
3216 (SB_JOURNAL(th->t_super)->j_len_alloc + new_alloc) >= SB_JOURNAL_MAX_BATCH(th->t_super) ||
3217 @@ -2239,9 +2601,48 @@
3218 SB_JOURNAL(th->t_super)->j_cnode_free < (SB_JOURNAL_TRANS_MAX(th->t_super) * 3)) {
3222 + /* we are allowing them to continue in the current transaction, so
3223 + * we have to bump the blocks allocated now.
3225 + th->t_blocks_allocated += new_alloc;
3226 + SB_JOURNAL(th->t_super)->j_len_alloc += new_alloc;
3232 +reiserfs_restart_transaction(struct reiserfs_transaction_handle *th, int num) {
3233 + int refcount = th->t_refcount ;
3234 + struct super_block *s = th->t_super ;
3235 + int flags = th->t_flags ;
3236 + int parent_flags = 0;
3237 + struct reiserfs_transaction_handle *saved_th = current->journal_info ;
3239 + /* if refcount is > 1, saved_th is the parent we've nested into, save
3240 + ** his flags as well. So far, only intermezzo needs this, 99% of the
3241 + ** time it is horribly unsafe.
3243 + if (refcount > 1) {
3244 + if (!reiserfs_restartable_handle(saved_th)) {
3247 + th->t_refcount = 1;
3248 + parent_flags = saved_th->t_flags ;
3251 + journal_end(th, s, th->t_blocks_allocated) ;
3252 + journal_begin(th, s, num) ;
3253 + th->t_flags = flags;
3254 + if (refcount > 1) {
3255 + current->journal_info = saved_th ;
3256 + th->t_refcount = refcount ;
3257 + memcpy(saved_th, th, sizeof(*th)) ;
3258 + saved_th->t_flags = parent_flags ;
3263 /* this must be called inside a transaction, and requires the
3264 ** kernel_lock to be held
3266 @@ -2268,6 +2669,37 @@
3267 !test_bit(WRITERS_BLOCKED, &SB_JOURNAL(s)->j_state)) ;
3270 +static void queue_log_writer(struct super_block *s) {
3271 + set_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state);
3272 + sleep_on(&SB_JOURNAL(s)->j_join_wait);
3275 +static void wake_queued_writers(struct super_block *s) {
3276 + if (test_and_clear_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state)) {
3277 + wake_up(&SB_JOURNAL(s)->j_join_wait);
3281 +static void let_transaction_grow(struct super_block *sb,
3282 + unsigned long trans_id)
3284 + unsigned long bcount = SB_JOURNAL(sb)->j_bcount;
3287 + while ((atomic_read(&SB_JOURNAL(sb)->j_wcount) > 0 ||
3288 + atomic_read(&SB_JOURNAL(sb)->j_jlock)) &&
3289 + SB_JOURNAL(sb)->j_trans_id == trans_id) {
3290 + queue_log_writer(sb);
3292 + if (SB_JOURNAL(sb)->j_trans_id != trans_id)
3294 + if (bcount == SB_JOURNAL(sb)->j_bcount)
3296 + bcount = SB_JOURNAL(sb)->j_bcount;
3301 /* join == true if you must join an existing transaction.
3302 ** join == false if you can deal with waiting for others to finish
3304 @@ -2275,8 +2707,10 @@
3305 ** expect to use in nblocks.
3307 static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) {
3308 - time_t now = CURRENT_TIME ;
3311 + struct reiserfs_transaction_handle myth ;
3312 + int sched_count = 0;
3314 reiserfs_check_lock_depth("journal_begin") ;
3315 RFALSE( p_s_sb->s_flags & MS_RDONLY,
3316 @@ -2287,9 +2721,14 @@
3319 PROC_INFO_INC( p_s_sb, journal.journal_being );
3320 + /* set here for journal_join */
3321 + th->t_refcount = 1;
3323 + th->t_super = p_s_sb ;
3326 lock_journal(p_s_sb) ;
3327 + SB_JOURNAL(p_s_sb)->j_bcount++ ;
3329 if (test_bit(WRITERS_BLOCKED, &SB_JOURNAL(p_s_sb)->j_state)) {
3330 unlock_journal(p_s_sb) ;
3331 @@ -2297,12 +2736,12 @@
3332 PROC_INFO_INC( p_s_sb, journal.journal_relock_writers );
3335 + now = CURRENT_TIME;
3337 /* if there is no room in the journal OR
3338 ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning
3339 ** we don't sleep if there aren't other writers
3342 if ( (!join && SB_JOURNAL(p_s_sb)->j_must_wait > 0) ||
3343 ( !join && (SB_JOURNAL(p_s_sb)->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) ||
3344 (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0 && SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 &&
3345 @@ -2310,54 +2749,128 @@
3346 (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) ) ||
3347 (!join && SB_JOURNAL(p_s_sb)->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) {
3349 + old_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
3350 unlock_journal(p_s_sb) ; /* allow others to finish this transaction */
3352 - /* if writer count is 0, we can just force this transaction to end, and start
3353 - ** a new one afterwards.
3355 - if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) {
3356 - struct reiserfs_transaction_handle myth ;
3357 - journal_join(&myth, p_s_sb, 1) ;
3358 - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3359 - journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3360 - do_journal_end(&myth, p_s_sb,1,COMMIT_NOW) ;
3361 + if (!join && (SB_JOURNAL(p_s_sb)->j_len_alloc + nblocks + 2) >=
3362 + SB_JOURNAL_MAX_BATCH(p_s_sb) &&
3363 + ((SB_JOURNAL(p_s_sb)->j_len + nblocks + 2) * 100) <
3364 + (SB_JOURNAL(p_s_sb)->j_len_alloc * 75))
3366 + if (atomic_read(&SB_JOURNAL(p_s_sb)->j_wcount) > 10) {
3368 + queue_log_writer(p_s_sb);
3372 + /* don't mess with joining the transaction if all we have to do is
3373 + * wait for someone else to do a commit
3375 + if (atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) {
3376 + while (SB_JOURNAL(p_s_sb)->j_trans_id == old_trans_id &&
3377 + atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) {
3378 + queue_log_writer(p_s_sb);
3382 + journal_join(&myth, p_s_sb, 1) ;
3384 + /* someone might have ended the transaction while we joined */
3385 + if (old_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
3386 + do_journal_end(&myth, p_s_sb, 1, 0) ;
3388 - /* but if the writer count isn't zero, we have to wait for the current writers to finish.
3389 - ** They won't batch on transaction end once we set j_jlock
3391 - atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
3392 - old_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
3393 - while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) &&
3394 - SB_JOURNAL(p_s_sb)->j_trans_id == old_trans_id) {
3395 - sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
3397 + do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ;
3399 PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount );
3403 if (SB_JOURNAL(p_s_sb)->j_trans_start_time == 0) { /* we are the first writer, set trans_id */
3404 - SB_JOURNAL(p_s_sb)->j_trans_start_time = now ;
3405 + SB_JOURNAL(p_s_sb)->j_trans_start_time = CURRENT_TIME;
3407 atomic_inc(&(SB_JOURNAL(p_s_sb)->j_wcount)) ;
3408 SB_JOURNAL(p_s_sb)->j_len_alloc += nblocks ;
3409 th->t_blocks_logged = 0 ;
3410 th->t_blocks_allocated = nblocks ;
3411 - th->t_super = p_s_sb ;
3412 th->t_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
3413 - th->t_caller = "Unknown" ;
3414 + reiserfs_set_handle_active(th) ;
3415 unlock_journal(p_s_sb) ;
3416 - p_s_sb->s_dirt = 1;
3420 +struct reiserfs_transaction_handle *
3421 +reiserfs_persistent_transaction(struct super_block *s, unsigned long nblocks) {
3423 + struct reiserfs_transaction_handle *th ;
3425 + /* if we're nesting into an existing transaction. It will be
3426 + ** persistent on its own
3428 + if (reiserfs_transaction_running(s)) {
3429 + th = current->journal_info ;
3430 + th->t_refcount++ ;
3431 + if (th->t_refcount < 2) {
3436 + th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS, s) ;
3438 + return ERR_PTR(-ENOMEM) ;
3440 + ret = journal_begin(th, s, nblocks) ;
3442 + reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
3443 + return ERR_PTR(ret) ;
3445 + /* do_journal_end is now responsible for freeing the handle */
3446 + reiserfs_set_handle_persistent(th) ;
3449 static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
3450 + struct reiserfs_transaction_handle *cur_th = current->journal_info;
3452 + /* this keeps do_journal_end from NULLing out the current->journal_info
3455 + th->t_handle_save = cur_th ;
3456 + if (cur_th && cur_th->t_refcount > 1) {
3459 return do_journal_begin_r(th, p_s_sb, nblocks, 1) ;
3462 int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) {
3463 - return do_journal_begin_r(th, p_s_sb, nblocks, 0) ;
3464 + struct reiserfs_transaction_handle *cur_th = current->journal_info ;
3467 + th->t_handle_save = NULL ;
3469 + /* we are nesting into the current transaction */
3470 + if (cur_th->t_super == p_s_sb) {
3471 + cur_th->t_refcount++ ;
3472 + memcpy(th, cur_th, sizeof(*th));
3474 + reiserfs_set_handle_active(th) ;
3475 + if (th->t_refcount <= 1)
3476 + printk("BAD: refcount <= 1, but journal_info != 0\n");
3479 + /* we've ended up with a handle from a different filesystem.
3480 + ** save it and restore on journal_end. This should never
3481 + ** really happen...
 3483 + reiserfs_warning(p_s_sb, "clm-2100: nesting into a different FS\n") ;
3484 + th->t_handle_save = current->journal_info ;
3485 + current->journal_info = th;
3488 + current->journal_info = th;
3490 + ret = do_journal_begin_r(th, p_s_sb, nblocks, 0) ;
3491 + if (current->journal_info != th)
3496 /* not used at all */
3497 @@ -2389,7 +2902,7 @@
3498 reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n",
3499 th->t_trans_id, SB_JOURNAL(p_s_sb)->j_trans_id);
3501 - p_s_sb->s_dirt = 1 ;
3502 + p_s_sb->s_dirt = 1;
3504 prepared = test_and_clear_bit(BH_JPrepared, &bh->b_state) ;
3505 /* already in this transaction, we are done */
3506 @@ -2413,6 +2926,7 @@
3508 if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) {
3509 reiserfs_warning(p_s_sb, "journal-1409: journal_mark_dirty returning because j_wcount was %d\n", atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount))) ;
3513 /* this error means I've screwed up, and we've overflowed the transaction.
3514 @@ -2479,25 +2993,36 @@
3519 -** if buffer already in current transaction, do a journal_mark_dirty
3520 -** otherwise, just mark it dirty and move on. Used for writes to meta blocks
3521 -** that don't need journaling
3523 -int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) {
3524 - if (reiserfs_dont_log(th->t_super) || buffer_journaled(bh) ||
3525 - buffer_journal_dirty(bh)) {
3526 - return journal_mark_dirty(th, p_s_sb, bh) ;
3528 - if (get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_list_hash_table, bh->b_dev,bh->b_blocknr,bh->b_size)) {
3529 - return journal_mark_dirty(th, p_s_sb, bh) ;
3531 - mark_buffer_dirty(bh) ;
3535 int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
3536 - return do_journal_end(th, p_s_sb, nblocks, 0) ;
3539 + if (!current->journal_info && th->t_refcount > 1)
3540 + printk("REISER-NESTING: th NULL, refcount %d\n", th->t_refcount);
3541 + if (th->t_refcount > 1) {
3542 + struct reiserfs_transaction_handle *cur_th = current->journal_info ;
3544 + /* we aren't allowed to close a nested transaction on a different
3545 + ** filesystem from the one in the task struct
3547 + if (cur_th->t_super != th->t_super)
3551 + if (th != cur_th) {
3552 + int flags = cur_th->t_flags ;
3553 + /* nested handles are never persistent */
3554 + if (reiserfs_persistent_handle(th)) {
3557 + memcpy(cur_th, th, sizeof(*th));
3559 + cur_th->t_flags = flags ;
3563 + ret = do_journal_end(th, p_s_sb, nblocks, 0) ;
3568 /* removes from the current transaction, relsing and descrementing any counters.
3569 @@ -2600,6 +3125,10 @@
3571 int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
3573 + /* you are not allowed to sync while nested, very, very bad */
3574 + if (th->t_refcount > 1) {
3577 if (SB_JOURNAL(p_s_sb)->j_len == 0) {
3578 reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3579 journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3580 @@ -2624,12 +3153,14 @@
3583 void flush_async_commits(struct super_block *p_s_sb) {
3585 + struct reiserfs_journal_list *jl;
3586 + struct list_head *entry;
3588 - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
3589 - if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) {
3590 - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ;
3592 + if (!list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) {
3593 + /* last entry is the youngest, commit it and you get everything */
3594 + entry = SB_JOURNAL(p_s_sb)->j_journal_list.prev;
3595 + jl = JOURNAL_LIST_ENTRY(entry);
3596 + flush_commit_list(p_s_sb, jl, 1);
3600 @@ -2637,58 +3168,39 @@
3601 ** flushes any old transactions to disk
3602 ** ends the current transaction if it is too old
3604 -** also calls flush_journal_list with old_only == 1, which allows me to reclaim
3605 -** memory and such from the journal lists whose real blocks are all on disk.
3607 -** called by sync_dev_journal from buffer.c
3609 -int flush_old_commits(struct super_block *p_s_sb, int immediate) {
3614 - struct reiserfs_transaction_handle th ;
3616 - start = SB_JOURNAL_LIST_INDEX(p_s_sb) ;
3617 - now = CURRENT_TIME ;
3618 +int reiserfs_flush_old_commits(struct super_block *p_s_sb) {
3620 + struct reiserfs_transaction_handle th ;
3622 + now = CURRENT_TIME ;
3623 + /* safety check so we don't flush while we are replaying the log during
3626 + if (list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) {
3630 - /* safety check so we don't flush while we are replaying the log during mount */
3631 - if (SB_JOURNAL_LIST_INDEX(p_s_sb) < 0) {
3634 - /* starting with oldest, loop until we get to the start */
3635 - i = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ;
3636 - while(i != start) {
3637 - if (SB_JOURNAL_LIST(p_s_sb)[i].j_len > 0 && ((now - SB_JOURNAL_LIST(p_s_sb)[i].j_timestamp) > SB_JOURNAL_MAX_COMMIT_AGE(p_s_sb) ||
3639 - /* we have to check again to be sure the current transaction did not change */
3640 - if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) {
3641 - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ;
3644 - i = (i + 1) % JOURNAL_LIST_COUNT ;
3647 - /* now, check the current transaction. If there are no writers, and it is too old, finish it, and
3648 - ** force the commit blocks to disk
3650 - if (!immediate && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 &&
3651 - SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 &&
3652 - SB_JOURNAL(p_s_sb)->j_len > 0 &&
3653 - (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) {
3654 - journal_join(&th, p_s_sb, 1) ;
3655 - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3656 - journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3657 - do_journal_end(&th, p_s_sb,1, COMMIT_NOW) ;
3658 - } else if (immediate) { /* belongs above, but I wanted this to be very explicit as a special case. If they say to
3659 - flush, we must be sure old transactions hit the disk too. */
3660 - journal_join(&th, p_s_sb, 1) ;
3661 - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3662 - journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3663 - do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
3665 - reiserfs_journal_kupdate(p_s_sb) ;
3667 + /* check the current transaction. If there are no writers, and it is
3668 + * too old, finish it, and force the commit blocks to disk
3670 + if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 &&
3671 + SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 &&
3672 + SB_JOURNAL(p_s_sb)->j_len > 0 &&
3673 + (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) >
3674 + SB_JOURNAL_MAX_TRANS_AGE(p_s_sb))
3676 + journal_join(&th, p_s_sb, 1) ;
3677 + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3678 + journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3680 + /* we're only being called from kreiserfsd, it makes no sense to do
3681 + ** an async commit so that kreiserfsd can do it later
3683 + do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
3685 + reiserfs_journal_kupdate(p_s_sb) ;
3686 + return p_s_sb->s_dirt;
3690 @@ -2709,6 +3221,7 @@
3691 int flush = flags & FLUSH_ALL ;
3692 int commit_now = flags & COMMIT_NOW ;
3693 int wait_on_commit = flags & WAIT ;
3694 + struct reiserfs_journal_list *jl;
3696 if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
3697 reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n",
3698 @@ -2727,8 +3240,9 @@
3699 if (SB_JOURNAL(p_s_sb)->j_len == 0) {
3700 int wcount = atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) ;
3701 unlock_journal(p_s_sb) ;
3702 - if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) > 0 && wcount <= 0) {
3703 - atomic_dec(&(SB_JOURNAL(p_s_sb)->j_jlock)) ;
3705 + if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) > 0 && wcount <= 0) {
3706 + atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
3707 wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
3710 @@ -2741,24 +3255,37 @@
3712 if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0) {
3713 if (flush || commit_now) {
3714 - int orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ;
3715 + unsigned trans_id ;
3717 + jl = SB_JOURNAL(p_s_sb)->j_current_jl;
3718 + trans_id = jl->j_trans_id;
3720 atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
3722 SB_JOURNAL(p_s_sb)->j_next_full_flush = 1 ;
3724 unlock_journal(p_s_sb) ;
3726 /* sleep while the current transaction is still j_jlocked */
3727 - while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) &&
3728 - SB_JOURNAL(p_s_sb)->j_trans_id == th->t_trans_id) {
3729 - sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
3732 - if (wait_on_commit) {
3733 - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
3735 - commit_flush_async(p_s_sb, orig_jindex) ;
3736 + while(SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
3737 + if (atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) {
3738 + queue_log_writer(p_s_sb);
3740 + lock_journal(p_s_sb);
3741 + if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
3742 + atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
3744 + unlock_journal(p_s_sb);
3747 + if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
3750 + if (commit_now && journal_list_still_alive(p_s_sb, trans_id) &&
3753 + flush_commit_list(p_s_sb, jl, 1) ;
3757 unlock_journal(p_s_sb) ;
3758 @@ -2776,8 +3303,8 @@
3759 if (!(SB_JOURNAL(p_s_sb)->j_must_wait > 0) && !(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock))) && !flush && !commit_now &&
3760 (SB_JOURNAL(p_s_sb)->j_len < SB_JOURNAL_MAX_BATCH(p_s_sb)) &&
3761 SB_JOURNAL(p_s_sb)->j_len_alloc < SB_JOURNAL_MAX_BATCH(p_s_sb) && SB_JOURNAL(p_s_sb)->j_cnode_free > (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3)) {
3762 - SB_JOURNAL(p_s_sb)->j_bcount++ ;
3763 unlock_journal(p_s_sb) ;
3768 @@ -2807,16 +3334,13 @@
3769 struct reiserfs_list_bitmap *jb = NULL ;
3772 - if (reiserfs_dont_log(th->t_super)) {
3773 - bh = sb_get_hash_table(p_s_sb, blocknr) ;
3774 - if (bh && buffer_dirty (bh)) {
3775 - reiserfs_warning (p_s_sb, "journal_mark_freed(dont_log): dirty buffer on hash list: %lx %ld\n", bh->b_state, blocknr);
3780 + cn = get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_hash_table, p_s_sb->s_dev,
3781 + blocknr, p_s_sb->s_blocksize) ;
3782 + if (cn && cn->bh) {
3786 - bh = sb_get_hash_table(p_s_sb, blocknr) ;
3788 /* if it is journal new, we just remove it from this transaction */
3789 if (bh && buffer_journal_new(bh)) {
3790 mark_buffer_notjournal_new(bh) ;
3791 @@ -2824,14 +3348,22 @@
3792 cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
3794 /* set the bit for this block in the journal bitmap for this transaction */
3795 - jb = SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap ;
3796 + jb = SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap;
3798 reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ;
3800 - set_bit_in_list_bitmap(p_s_sb, blocknr, jb) ;
3802 - /* Note, the entire while loop is not allowed to schedule. */
3803 + /* we set bits in the list bitmap so the block won't be reallocated
3804 + * as a data block which might get flushed before this transaction
3805 + * commits. When data logging is on, the block might get reallocated
3806 + * as a data block, but we know the data block won't get flushed before
3809 + if (!reiserfs_data_log(p_s_sb)) {
3810 + set_bit_in_list_bitmap(p_s_sb, blocknr, jb) ;
3813 + /* Note, the entire while loop is not allowed to schedule. */
3815 clear_prepared_bits(bh) ;
3817 @@ -2876,57 +3408,77 @@
3819 void reiserfs_update_inode_transaction(struct inode *inode) {
3821 - inode->u.reiserfs_i.i_trans_index = SB_JOURNAL_LIST_INDEX(inode->i_sb);
3823 + inode->u.reiserfs_i.i_jl = SB_JOURNAL(inode->i_sb)->j_current_jl;
3824 inode->u.reiserfs_i.i_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ;
3827 void reiserfs_update_tail_transaction(struct inode *inode) {
3829 - inode->u.reiserfs_i.i_tail_trans_index = SB_JOURNAL_LIST_INDEX(inode->i_sb);
3831 + inode->u.reiserfs_i.i_tail_jl = SB_JOURNAL(inode->i_sb)->j_current_jl;
3832 inode->u.reiserfs_i.i_tail_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ;
3835 -static void __commit_trans_index(struct inode *inode, unsigned long id,
3836 - unsigned long index)
3837 +static void __commit_trans_jl(struct inode *inode, unsigned long id,
3838 + struct reiserfs_journal_list *jl)
3840 - struct reiserfs_journal_list *jl ;
3841 struct reiserfs_transaction_handle th ;
3842 struct super_block *sb = inode->i_sb ;
3844 - jl = SB_JOURNAL_LIST(sb) + index;
3846 /* is it from the current transaction, or from an unknown transaction? */
3847 if (id == SB_JOURNAL(sb)->j_trans_id) {
3848 - journal_join(&th, sb, 1) ;
3849 + jl = SB_JOURNAL(sb)->j_current_jl;
3850 + /* try to let other writers come in and grow this transaction */
3851 + let_transaction_grow(sb, id);
3852 + if (SB_JOURNAL(sb)->j_trans_id != id) {
3853 + goto flush_commit_only;
3856 + journal_begin(&th, sb, 1) ;
3858 + /* someone might have ended this transaction while we joined */
3859 + if (SB_JOURNAL(sb)->j_trans_id != id) {
3860 + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ;
3861 + journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ;
3862 + journal_end(&th, sb, 1) ;
3863 + goto flush_commit_only;
3866 journal_end_sync(&th, sb, 1) ;
3867 - } else if (jl->j_trans_id == id) {
3868 - flush_commit_list(sb, jl, 1) ;
3871 + /* this gets tricky, we have to make sure the journal list in
3872 + * the inode still exists. We know the list is still around
3873 + * if we've got a larger transaction id than the oldest list
3876 + if (journal_list_still_alive(inode->i_sb, id)) {
3877 + flush_commit_list(sb, jl, 1) ;
3880 - /* if the transaction id does not match, this list is long since flushed
3881 - ** and we don't have to do anything here
3883 + /* otherwise the list is gone, and long since committed */
3885 void reiserfs_commit_for_tail(struct inode *inode) {
3886 unsigned long id = inode->u.reiserfs_i.i_tail_trans_id;
3887 - unsigned long index = inode->u.reiserfs_i.i_tail_trans_index;
3888 + struct reiserfs_journal_list *jl = inode->u.reiserfs_i.i_tail_jl;
3890 /* for tails, if this info is unset there's nothing to commit */
3892 - __commit_trans_index(inode, id, index);
3894 + __commit_trans_jl(inode, id, jl);
3896 void reiserfs_commit_for_inode(struct inode *inode) {
3897 unsigned long id = inode->u.reiserfs_i.i_trans_id;
3898 - unsigned long index = inode->u.reiserfs_i.i_trans_index;
3899 + struct reiserfs_journal_list *jl = inode->u.reiserfs_i.i_jl;
3901 - /* for the whole inode, assume unset id or index means it was
3902 + /* for the whole inode, assume unset id means it was
3903 * changed in the current transaction. More conservative
3905 - if (!id || !index)
3907 reiserfs_update_inode_transaction(inode) ;
3908 + id = inode->u.reiserfs_i.i_trans_id;
3909 + /* jl will be updated in __commit_trans_jl */
3912 - __commit_trans_index(inode, id, index);
3913 + __commit_trans_jl(inode, id, jl);
3916 void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb,
3917 @@ -2954,8 +3506,6 @@
3918 int retry_count = 0 ;
3920 PROC_INFO_INC( p_s_sb, journal.prepare );
3921 - if (reiserfs_dont_log (p_s_sb))
3924 while(!test_bit(BH_JPrepared, &bh->b_state) ||
3925 (wait && buffer_locked(bh))) {
3926 @@ -2964,16 +3514,37 @@
3929 set_bit(BH_JPrepared, &bh->b_state) ;
3932 RFALSE( buffer_locked(bh) && cur_tb != NULL,
3933 "waiting while do_balance was running\n") ;
3934 + /* only data buffers are allowed to come in dirty, and they
3935 + * never get run through restore_prepared_buffer. So we can
3936 + * just mark them clean here and know it is safe
3938 + mark_buffer_clean(bh);
3939 wait_on_buffer(bh) ;
3942 PROC_INFO_INC( p_s_sb, journal.prepare_retry );
3947 +static void flush_old_journal_lists(struct super_block *s) {
3948 + struct reiserfs_journal_list *jl;
3949 + struct list_head *entry;
3950 + time_t now = CURRENT_TIME;
3952 + while(!list_empty(&SB_JOURNAL(s)->j_journal_list)) {
3953 + entry = SB_JOURNAL(s)->j_journal_list.next;
3954 + jl = JOURNAL_LIST_ENTRY(entry);
3955 + /* this check should always be run, to send old lists to disk */
3956 + if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) {
3957 + flush_used_journal_lists(s, jl);
3964 ** long and ugly. If flush, will not return until all commit
3965 ** blocks and all real buffers in the trans are on disk.
3966 @@ -2990,18 +3561,30 @@
3967 struct buffer_head *c_bh ; /* commit bh */
3968 struct buffer_head *d_bh ; /* desc bh */
3969 int cur_write_start = 0 ; /* start index of current log write */
3970 - int cur_blocks_left = 0 ; /* number of journal blocks left to write */
3975 int flush = flags & FLUSH_ALL ;
3976 int commit_now = flags & COMMIT_NOW ;
3977 int wait_on_commit = flags & WAIT ;
3978 struct reiserfs_super_block *rs ;
3979 + struct reiserfs_journal_list *jl, *temp_jl;
3980 + struct list_head *entry, *safe;
3981 + int wakeup_kreiserfsd = 0;
3982 + unsigned long jindex;
3983 + unsigned long commit_trans_id;
3985 + if (th->t_refcount > 1)
3988 + reiserfs_check_lock_depth("journal end");
3989 + current->journal_info = th->t_handle_save;
3990 if (reiserfs_dont_log(th->t_super)) {
3995 + if (SB_JOURNAL(p_s_sb)->j_len == 0) {
3996 + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3997 + journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
4000 lock_journal(p_s_sb) ;
4001 @@ -3018,7 +3601,9 @@
4002 ** it tells us if we should continue with the journal_end, or just return
4004 if (!check_journal_end(th, p_s_sb, nblocks, flags)) {
4006 + p_s_sb->s_dirt = 1;
4007 + wake_queued_writers(p_s_sb);
4011 /* check_journal_end might set these, check again */
4012 @@ -3037,8 +3622,11 @@
4015 #ifdef REISERFS_PREALLOCATE
4016 + /* quota ops might need to nest, setup the journal_info pointer for them */
4017 + current->journal_info = th ;
4018 reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into
4019 * the transaction */
4020 + current->journal_info = th->t_handle_save ;
4023 rs = SB_DISK_SUPER_BLOCK(p_s_sb) ;
4024 @@ -3059,25 +3647,23 @@
4025 mark_buffer_uptodate(c_bh, 1) ;
4027 /* init this journal list */
4028 - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_older_commits_done), 0) ;
4029 - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
4030 - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ;
4031 - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_bh = c_bh ;
4032 - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_start = SB_JOURNAL(p_s_sb)->j_start ;
4033 - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len = SB_JOURNAL(p_s_sb)->j_len ;
4034 - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_nonzerolen), SB_JOURNAL(p_s_sb)->j_len) ;
4035 - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_left), SB_JOURNAL(p_s_sb)->j_len + 2);
4036 - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = NULL ;
4037 - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
4038 - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
4040 - /* which is faster, locking/unlocking at the start and end of the for
4041 - ** or locking once per iteration around the insert_journal_hash?
4042 - ** eitherway, we are write locking insert_journal_hash. The ENTIRE FOR
4043 - ** LOOP MUST not cause schedule to occur.
4045 + jl = SB_JOURNAL(p_s_sb)->j_current_jl;
4047 + /* save the transaction id in case we need to commit it later */
4048 + commit_trans_id = jl->j_trans_id;
4050 - /* for each real block, add it to the journal list hash,
4051 + atomic_set(&jl->j_older_commits_done, 0) ;
4052 + jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
4053 + jl->j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ;
4054 + jl->j_commit_bh = c_bh ;
4055 + jl->j_start = SB_JOURNAL(p_s_sb)->j_start ;
4056 + jl->j_len = SB_JOURNAL(p_s_sb)->j_len ;
4057 + atomic_set(&jl->j_nonzerolen, SB_JOURNAL(p_s_sb)->j_len) ;
4058 + atomic_set(&jl->j_commit_left, SB_JOURNAL(p_s_sb)->j_len + 2);
4059 + jl->j_realblock = NULL ;
4061 + /* The ENTIRE FOR LOOP MUST not cause schedule to occur.
4062 + ** for each real block, add it to the journal list hash,
4063 ** copy into real block index array in the commit or desc block
4065 for (i = 0, cn = SB_JOURNAL(p_s_sb)->j_first ; cn ; cn = cn->next, i++) {
4066 @@ -3087,7 +3673,7 @@
4067 reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ;
4070 - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = jl_cn ;
4071 + jl->j_realblock = jl_cn ;
4073 jl_cn->prev = last_cn ;
4074 jl_cn->next = NULL ;
4075 @@ -3105,7 +3691,7 @@
4077 jl_cn->dev = cn->bh->b_dev ;
4078 jl_cn->bh = cn->bh ;
4079 - jl_cn->jlist = SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb) ;
4080 + jl_cn->jlist = jl;
4081 insert_journal_hash(SB_JOURNAL(p_s_sb)->j_list_hash_table, jl_cn) ;
4082 if (i < JOURNAL_TRANS_HALF) {
4083 desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ;
4084 @@ -3130,29 +3716,34 @@
4085 reiserfs_warning(p_s_sb, "journal-2020: do_journal_end: BAD desc->j_len is ZERO\n") ;
4086 atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
4087 wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
4092 /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */
4093 cur_write_start = SB_JOURNAL(p_s_sb)->j_start ;
4094 - cur_blocks_left = SB_JOURNAL(p_s_sb)->j_len ;
4095 cn = SB_JOURNAL(p_s_sb)->j_first ;
4096 jindex = 1 ; /* start at one so we don't get the desc again */
4097 - while(cur_blocks_left > 0) {
4099 + clear_bit(BH_JNew, &(cn->bh->b_state)) ;
4100 /* copy all the real blocks into log area. dirty log blocks */
4101 if (test_bit(BH_JDirty, &cn->bh->b_state)) {
4102 struct buffer_head *tmp_bh ;
4103 tmp_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
4104 ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
4105 mark_buffer_uptodate(tmp_bh, 1) ;
4106 - memcpy(tmp_bh->b_data, cn->bh->b_data, cn->bh->b_size) ;
4107 + memcpy(tmp_bh->b_data, bh_kmap(cn->bh), cn->bh->b_size) ;
4108 + bh_kunmap(cn->bh);
4110 + set_bit(BH_JDirty_wait, &(cn->bh->b_state)) ;
4111 + clear_bit(BH_JDirty, &(cn->bh->b_state)) ;
4113 /* JDirty cleared sometime during transaction. don't log this one */
4114 reiserfs_warning(p_s_sb, "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ;
4118 - cur_blocks_left-- ;
4120 + free_cnode(p_s_sb, cn) ;
4124 /* we are done with both the c_bh and d_bh, but
4125 @@ -3160,47 +3751,19 @@
4126 ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
4129 - /* now loop through and mark all buffers from this transaction as JDirty_wait
4130 - ** clear the JDirty bit, clear BH_JNew too.
4131 - ** if they weren't JDirty, they weren't logged, just relse them and move on
4133 - cn = SB_JOURNAL(p_s_sb)->j_first ;
4135 - clear_bit(BH_JNew, &(cn->bh->b_state)) ;
4136 - if (test_bit(BH_JDirty, &(cn->bh->b_state))) {
4137 - set_bit(BH_JDirty_wait, &(cn->bh->b_state)) ;
4138 - clear_bit(BH_JDirty, &(cn->bh->b_state)) ;
4143 - free_cnode(p_s_sb, cn) ;
4147 - /* unlock the journal list for committing and flushing */
4148 - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 0) ;
4149 - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 0) ;
4151 - orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ;
4152 - jindex = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ;
4153 - SB_JOURNAL_LIST_INDEX(p_s_sb) = jindex ;
4154 + SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
4156 - /* write any buffers that must hit disk before this commit is done */
4157 - fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
4158 + /* we lock the commit before putting it onto the main list because
4159 + * we want to make sure nobody tries to run flush_commit_list until
4160 + * the new transaction is fully setup, and we've already flushed the
4163 + down(&jl->j_commit_lock);
4165 - /* honor the flush and async wishes from the caller */
4168 - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
4169 - flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex , 1) ;
4170 - } else if (commit_now) {
4171 - if (wait_on_commit) {
4172 - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
4174 - commit_flush_async(p_s_sb, orig_jindex) ;
4177 + /* now it is safe to insert this transaction on the main list */
4178 + list_add_tail(&jl->j_list, &SB_JOURNAL(p_s_sb)->j_journal_list);
4179 + list_add_tail(&jl->j_working_list, &SB_JOURNAL(p_s_sb)->j_working_list);
4180 + SB_JOURNAL(p_s_sb)->j_num_work_lists++;
4182 /* reset journal values for the next transaction */
4183 old_start = SB_JOURNAL(p_s_sb)->j_start ;
4184 @@ -3212,57 +3775,119 @@
4185 SB_JOURNAL(p_s_sb)->j_len = 0 ;
4186 SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ;
4187 SB_JOURNAL(p_s_sb)->j_trans_id++ ;
4188 + SB_JOURNAL(p_s_sb)->j_current_jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id;
4189 SB_JOURNAL(p_s_sb)->j_must_wait = 0 ;
4190 SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ;
4191 SB_JOURNAL(p_s_sb)->j_next_full_flush = 0 ;
4192 SB_JOURNAL(p_s_sb)->j_next_async_flush = 0 ;
4193 init_journal_hash(p_s_sb) ;
4195 + /* tail conversion targets have to hit the disk before we end the
4196 + * transaction. Otherwise a later transaction might repack the tail
4197 + * before this transaction commits, leaving the data block unflushed and
4198 + * clean, if we crash before the later transaction commits, the data block
4201 + while(!list_empty(&jl->j_tail_bh_list)) {
4203 + fsync_buffers_list(&jl->j_tail_bh_list);
4206 + up(&jl->j_commit_lock);
4208 + /* honor the flush wishes from the caller, simple commits can
4209 + ** be done outside the journal lock, they are done below
4212 + flush_commit_list(p_s_sb, jl, 1) ;
4213 + flush_journal_list(p_s_sb, jl, 1) ;
4217 /* if the next transaction has any chance of wrapping, flush
4218 ** transactions that might get overwritten. If any journal lists are very
4219 ** old flush them as well.
4221 - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
4223 - if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && SB_JOURNAL(p_s_sb)->j_start <= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
4224 - if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
4225 - flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ;
4227 - } else if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 &&
4228 - (SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
4229 - if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >=
4230 - SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
4231 - flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ;
4233 + list_for_each_safe(entry, safe, &SB_JOURNAL(p_s_sb)->j_journal_list) {
4234 + temp_jl = JOURNAL_LIST_ENTRY(entry);
4235 + if (SB_JOURNAL(p_s_sb)->j_start <= temp_jl->j_start) {
4236 + if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >=
4239 + flush_used_journal_lists(p_s_sb, temp_jl);
4240 + wakeup_kreiserfsd = 1;
4242 + } else if ((SB_JOURNAL(p_s_sb)->j_start +
4243 + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) <
4244 + SB_ONDISK_JOURNAL_SIZE(p_s_sb))
4246 + /* if we don't cross into the next transaction and we don't
4247 + * wrap, there is no way we can overlap any later transactions
4252 + } else if ((SB_JOURNAL(p_s_sb)->j_start +
4253 + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >
4254 + SB_ONDISK_JOURNAL_SIZE(p_s_sb))
4256 + if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) %
4257 + SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start)
4259 + flush_used_journal_lists(p_s_sb, temp_jl);
4260 + wakeup_kreiserfsd = 1;
4263 + /* we don't overlap anything from out start to the end of the
4264 + * log, and our wrapped portion doesn't overlap anything at
4265 + * the start of the log. We can break
4270 - /* this check should always be run, to send old lists to disk */
4271 - if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 &&
4272 - SB_JOURNAL_LIST(p_s_sb)[jindex].j_timestamp <
4273 - (CURRENT_TIME - (SB_JOURNAL_MAX_TRANS_AGE(p_s_sb) * 4))) {
4274 - flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ;
4277 + flush_old_journal_lists(p_s_sb);
4279 - /* if the next journal_list is still in use, flush it */
4280 - if (SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len != 0) {
4281 - flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb), 1) ;
4284 + if (SB_JOURNAL(p_s_sb)->j_num_work_lists > 128 || wakeup_kreiserfsd) {
4285 + wake_up(&reiserfs_commit_thread_wait) ;
4288 - /* we don't want anyone flushing the new transaction's list */
4289 - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
4290 - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
4291 - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb) +
4292 - SB_JOURNAL_LIST_INDEX(p_s_sb)) ;
4293 + SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL(p_s_sb)->j_current_jl) ;
4295 - if (!(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap)) {
4296 + if (!(SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap)) {
4297 reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ;
4299 - unlock_journal(p_s_sb) ;
4301 atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
4302 + unlock_journal(p_s_sb) ;
4303 /* wake up any body waiting to join. */
4304 + clear_bit(WRITERS_QUEUED, &SB_JOURNAL(p_s_sb)->j_state);
4305 wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
4307 + if (!flush && commit_now && wait_on_commit) {
4308 + if (current->need_resched) {
4311 + if (journal_list_still_alive(p_s_sb, commit_trans_id))
4312 + flush_commit_list(p_s_sb, jl, 1) ;
4314 + /* if we did an async commit, get kreiserfsd going on it */
4315 + if (!commit_now && !wait_on_commit) {
4316 + wake_up(&reiserfs_commit_thread_wait) ;
4320 + reiserfs_check_lock_depth("journal end2");
4321 + if (reiserfs_persistent_handle(th)) {
4322 + memset(th, 0, sizeof(*th));
4323 + reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), p_s_sb) ;
4331 +int __init reiserfs_journal_cache_init(void) {
4334 diff -urN linux-2.4.22.org/fs/reiserfs/Makefile linux-2.4.22/fs/reiserfs/Makefile
4335 --- linux-2.4.22.org/fs/reiserfs/Makefile 2003-11-21 15:08:29.000000000 +0100
4336 +++ linux-2.4.22/fs/reiserfs/Makefile 2003-11-21 15:14:23.000000000 +0100
4339 # Note 2! The CFLAGS definitions are now in the main makefile...
4341 +export-objs := super.o
4342 O_TARGET := reiserfs.o
4343 obj-y := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o super.o prints.o objectid.o \
4344 lbalance.o ibalance.o stree.o hashes.o buffer2.o tail_conversion.o journal.o resize.o item_ops.o ioctl.o procfs.o
4345 diff -urN linux-2.4.22.org/fs/reiserfs/namei.c linux-2.4.22/fs/reiserfs/namei.c
4346 --- linux-2.4.22.org/fs/reiserfs/namei.c 2003-11-21 15:08:29.000000000 +0100
4347 +++ linux-2.4.22/fs/reiserfs/namei.c 2003-11-21 15:14:23.000000000 +0100
4349 #include <linux/bitops.h>
4350 #include <linux/reiserfs_fs.h>
4351 #include <linux/smp_lock.h>
4352 +#include <linux/quotaops.h>
4354 #define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { i->i_nlink++; if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
4355 #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) i->i_nlink--;
4359 /* perform the insertion of the entry that we have prepared */
4360 - retval = reiserfs_paste_into_item (th, &path, &entry_key, buffer, paste_size);
4361 + retval = reiserfs_paste_into_item (th, &path, &entry_key, dir, buffer, paste_size);
4362 if (buffer != small_buf)
4363 reiserfs_kfree (buffer, buflen, dir->i_sb);
4368 dir->i_size += paste_size;
4369 - dir->i_blocks = ((dir->i_size + 511) >> 9);
4370 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
4371 if (!S_ISDIR (inode->i_mode) && visible)
4372 // reiserfs_mkdir or reiserfs_rename will do that by itself
4374 ** inserted into the tree yet.
4376 static int drop_new_inode(struct inode *inode) {
4377 + DQUOT_DROP(inode);
4378 make_bad_inode(inode) ;
4379 + inode->i_flags |= S_NOQUOTA;
4383 @@ -518,6 +520,11 @@
4385 inode->i_gid = current->fsgid;
4387 + DQUOT_INIT(inode);
4388 + if (DQUOT_ALLOC_INODE(inode)) {
4389 + drop_new_inode(inode);
4398 journal_begin(&th, dir->i_sb, jbegin_count) ;
4399 - th.t_caller = "create" ;
4400 retval = reiserfs_new_inode (&th, dir, mode, 0, 0/*i_size*/, dentry, inode);
4405 DEC_DIR_INODE_NLINK(dir)
4406 dir->i_size -= (DEH_SIZE + de.de_entrylen);
4407 - dir->i_blocks = ((dir->i_size + 511) >> 9);
4408 reiserfs_update_sd (&th, dir);
4410 /* prevent empty directory from getting lost */
4412 reiserfs_update_sd (&th, inode);
4414 dir->i_size -= (de.de_entrylen + DEH_SIZE);
4415 - dir->i_blocks = ((dir->i_size + 511) >> 9);
4416 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
4417 reiserfs_update_sd (&th, dir);
4419 @@ -1245,7 +1249,6 @@
4420 reiserfs_warning ((&th)->t_super, "vs-7060: reiserfs_rename: couldn't not cut old name. Fsck later?\n");
4422 old_dir->i_size -= DEH_SIZE + old_de.de_entrylen;
4423 - old_dir->i_blocks = ((old_dir->i_size + 511) >> 9);
4425 reiserfs_update_sd (&th, old_dir);
4426 reiserfs_update_sd (&th, new_dir);
4427 diff -urN linux-2.4.22.org/fs/reiserfs/objectid.c linux-2.4.22/fs/reiserfs/objectid.c
4428 --- linux-2.4.22.org/fs/reiserfs/objectid.c 2003-11-21 15:08:29.000000000 +0100
4429 +++ linux-2.4.22/fs/reiserfs/objectid.c 2003-11-21 15:14:23.000000000 +0100
4433 journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s));
4435 return unused_objectid;
4440 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
4441 journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s));
4445 /* start at the beginning of the objectid map (i = 0) and go to
4446 the end of it (i = disk_sb->s_oid_cursize). Linear search is
4447 diff -urN linux-2.4.22.org/fs/reiserfs/procfs.c linux-2.4.22/fs/reiserfs/procfs.c
4448 --- linux-2.4.22.org/fs/reiserfs/procfs.c 2003-11-21 15:08:29.000000000 +0100
4449 +++ linux-2.4.22/fs/reiserfs/procfs.c 2003-11-21 15:14:24.000000000 +0100
4451 "j_first_unflushed_offset: \t%lu\n"
4452 "j_last_flush_trans_id: \t%lu\n"
4453 "j_trans_start_time: \t%li\n"
4454 - "j_journal_list_index: \t%i\n"
4455 "j_list_bitmap_index: \t%i\n"
4456 "j_must_wait: \t%i\n"
4457 "j_next_full_flush: \t%i\n"
4459 JF( j_first_unflushed_offset ),
4460 JF( j_last_flush_trans_id ),
4461 JF( j_trans_start_time ),
4462 - JF( j_journal_list_index ),
4463 JF( j_list_bitmap_index ),
4465 JF( j_next_full_flush ),
4466 diff -urN linux-2.4.22.org/fs/reiserfs/stree.c linux-2.4.22/fs/reiserfs/stree.c
4467 --- linux-2.4.22.org/fs/reiserfs/stree.c 2003-11-21 15:08:29.000000000 +0100
4468 +++ linux-2.4.22/fs/reiserfs/stree.c 2003-11-21 15:14:25.000000000 +0100
4470 #include <linux/pagemap.h>
4471 #include <linux/reiserfs_fs.h>
4472 #include <linux/smp_lock.h>
4473 +#include <linux/quotaops.h>
4475 /* Does the buffer contain a disk block which is in the tree. */
4476 inline int B_IS_IN_TREE (const struct buffer_head * p_s_bh)
4478 return ( B_LEVEL (p_s_bh) != FREE_LEVEL );
4484 inline void copy_short_key (void * to, const void * from)
4486 memcpy (to, from, SHORT_KEY_SIZE);
4488 stop at leaf level - set to
4489 DISK_LEAF_NODE_LEVEL */
4491 - int n_block_number = SB_ROOT_BLOCK (p_s_sb),
4492 - expected_level = SB_TREE_HEIGHT (p_s_sb),
4493 - n_block_size = p_s_sb->s_blocksize;
4494 + int n_block_number,
4496 + n_block_size = p_s_sb->s_blocksize;
4497 struct buffer_head * p_s_bh;
4498 struct path_element * p_s_last_element;
4499 int n_node_level, n_retval;
4500 @@ -678,8 +676,11 @@
4501 /* With each iteration of this loop we search through the items in the
4502 current node, and calculate the next current node(next path element)
4503 for the next iteration of this loop.. */
4504 + n_block_number = SB_ROOT_BLOCK (p_s_sb);
4505 + expected_level = SB_TREE_HEIGHT (p_s_sb);
4508 + reiserfs_check_lock_depth("search_by_key");
4509 #ifdef CONFIG_REISERFS_CHECK
4510 if ( !(++n_repeat_counter % 50000) )
4511 reiserfs_warning (p_s_sb, "PAP-5100: search_by_key: %s:"
4512 @@ -1123,8 +1124,7 @@
4513 tmp = get_block_num(p_n_unfm_pointer,0);
4514 put_block_num(p_n_unfm_pointer, 0, 0);
4515 journal_mark_dirty (th, p_s_sb, p_s_bh);
4516 - inode->i_blocks -= p_s_sb->s_blocksize / 512;
4517 - reiserfs_free_block(th, tmp);
4518 + reiserfs_free_block(th, inode, tmp, 1);
4519 /* In case of big fragmentation it is possible that each block
4520 freed will cause dirtying of one more bitmap and then we will
4521 quickly overflow our transaction space. This is a
4522 @@ -1132,9 +1132,7 @@
4523 if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
4524 int orig_len_alloc = th->t_blocks_allocated ;
4525 pathrelse(p_s_path) ;
4527 - journal_end(th, p_s_sb, orig_len_alloc) ;
4528 - journal_begin(th, p_s_sb, orig_len_alloc) ;
4529 + reiserfs_restart_transaction(th, orig_len_alloc);
4530 reiserfs_update_inode_transaction(inode) ;
4533 @@ -1168,8 +1166,7 @@
4538 -/* Calculate bytes number which will be deleted or cutted in the balance. */
4539 +/* Calculate number of bytes which will be deleted or cut during balance */
4540 int calc_deleted_bytes_number(
4541 struct tree_balance * p_s_tb,
4543 @@ -1180,14 +1177,14 @@
4544 if ( is_statdata_le_ih (p_le_ih) )
4547 + n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0];
4548 if ( is_direntry_le_ih (p_le_ih) ) {
4549 // return EMPTY_DIR_SIZE; /* We delete emty directoris only. */
4550 // we can't use EMPTY_DIR_SIZE, as old format dirs have a different
4551 // empty size. ick. FIXME, is this right?
4553 - return ih_item_len(p_le_ih);
4554 + return n_del_size ;
4556 - n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0];
4558 if ( is_indirect_le_ih (p_le_ih) )
4559 n_del_size = (n_del_size/UNFM_P_SIZE)*
4560 @@ -1221,17 +1218,46 @@
4564 +#ifdef REISERQUOTA_DEBUG
4565 +char key2type(struct key *ih)
4567 + if (is_direntry_le_key(2, ih))
4569 + if (is_direct_le_key(2, ih))
4571 + if (is_indirect_le_key(2, ih))
4573 + if (is_statdata_le_key(2, ih))
4578 +char head2type(struct item_head *ih)
4580 + if (is_direntry_le_ih(ih))
4582 + if (is_direct_le_ih(ih))
4584 + if (is_indirect_le_ih(ih))
4586 + if (is_statdata_le_ih(ih))
4592 /* Delete object item. */
4593 int reiserfs_delete_item (struct reiserfs_transaction_handle *th,
4594 struct path * p_s_path, /* Path to the deleted item. */
4595 const struct cpu_key * p_s_item_key, /* Key to search for the deleted item. */
4596 - struct inode * p_s_inode,/* inode is here just to update i_blocks */
4597 + struct inode * p_s_inode,/* inode is here just to update i_blocks and quotas */
4598 struct buffer_head * p_s_un_bh) /* NULL or unformatted node pointer. */
4600 struct super_block * p_s_sb = p_s_inode->i_sb;
4601 struct tree_balance s_del_balance;
4602 struct item_head s_ih;
4603 + struct item_head *q_ih;
4604 + int quota_cut_bytes;
4608 @@ -1281,6 +1307,22 @@
4610 // reiserfs_delete_item returns item length when success
4611 n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
4612 + q_ih = get_ih(p_s_path) ;
4613 + quota_cut_bytes = ih_item_len(q_ih) ;
4615 + /* hack so the quota code doesn't have to guess if the file
4616 + ** has a tail. On tail insert, we allocate quota for 1 unformatted node.
4617 + ** We test the offset because the tail might have been
4618 + ** split into multiple items, and we only want to decrement for
4619 + ** the unfm node once
4621 + if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(q_ih)) {
4622 + if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) {
4623 + quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE;
4625 + quota_cut_bytes = 0 ;
4631 @@ -1312,10 +1354,14 @@
4633 B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), n_ret_value);
4636 /* Perform balancing after all resources have been collected at once. */
4637 do_balance(&s_del_balance, NULL, NULL, M_DELETE);
4639 +#ifdef REISERQUOTA_DEBUG
4640 + printk(KERN_DEBUG "reiserquota delete_item(): freeing %u, id=%u type=%c\n", quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih));
4642 + DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
4644 /* Return deleted body length */
4647 @@ -1340,14 +1386,16 @@
4649 /* this deletes item which never gets split */
4650 void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th,
4651 + struct inode *inode,
4654 struct tree_balance tb;
4655 INITIALIZE_PATH (path);
4659 struct cpu_key cpu_key;
4661 + int quota_cut_bytes = 0;
4663 le_key2cpu_key (&cpu_key, key);
4665 @@ -1371,6 +1419,7 @@
4666 item_len = ih_item_len( PATH_PITEM_HEAD(&path) );
4667 init_tb_struct (th, &tb, th->t_super, &path, - (IH_SIZE + item_len));
4669 + quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)) ;
4671 retval = fix_nodes (M_DELETE, &tb, NULL, 0);
4672 if (retval == REPEAT_SEARCH) {
4673 @@ -1380,6 +1429,12 @@
4675 if (retval == CARRY_ON) {
4676 do_balance (&tb, 0, 0, M_DELETE);
4677 + if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */
4678 +#ifdef REISERQUOTA_DEBUG
4679 + printk(KERN_DEBUG "reiserquota delete_solid_item(): freeing %u id=%u type=%c\n", quota_cut_bytes, inode->i_uid, key2type(key));
4681 + DQUOT_FREE_SPACE_NODIRTY(inode, quota_cut_bytes);
4686 @@ -1412,7 +1467,7 @@
4688 /* USE_INODE_GENERATION_COUNTER */
4690 - reiserfs_delete_solid_item (th, INODE_PKEY (inode));
4691 + reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode));
4695 @@ -1484,6 +1539,38 @@
4696 mark_inode_dirty (inode);
4700 +unmap_buffers(struct page *page, loff_t pos) {
4701 + struct buffer_head *bh ;
4702 + struct buffer_head *head ;
4703 + struct buffer_head *next ;
4704 + unsigned long tail_index ;
4705 + unsigned long cur_index ;
4707 + if (!page || !page->buffers)
4710 + tail_index = pos & (PAGE_CACHE_SIZE - 1) ;
4712 + head = page->buffers ;
4715 + next = bh->b_this_page ;
4717 + /* we want to unmap the buffers that contain the tail, and
4718 + ** all the buffers after it (since the tail must be at the
4719 + ** end of the file). We don't want to unmap file data
4720 + ** before the tail, since it might be dirty and waiting to
4723 + cur_index += bh->b_size ;
4724 + if (cur_index > tail_index) {
4725 + reiserfs_unmap_buffer(bh) ;
4728 + } while (bh != head) ;
4732 /* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
4733 int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th,
4734 @@ -1499,12 +1586,15 @@
4735 structure by using the init_tb_struct and fix_nodes functions.
4736 After that we can make tree balancing. */
4737 struct tree_balance s_cut_balance;
4738 + struct item_head *p_le_ih;
4739 + loff_t tail_pos = 0;
4740 int n_cut_size = 0, /* Amount to be cut. */
4741 n_ret_value = CARRY_ON,
4742 n_removed = 0, /* Number of the removed unformatted nodes. */
4743 n_is_inode_locked = 0;
4744 char c_mode; /* Mode of the balance. */
4746 + int quota_cut_bytes;
4749 init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, n_cut_size);
4750 @@ -1531,6 +1621,9 @@
4751 /* tail has been left in the unformatted node */
4754 + if (n_is_inode_locked) {
4755 +printk("inode locked twice\n");
4757 n_is_inode_locked = 1;
4759 /* removing of last unformatted node will change value we
4760 @@ -1545,6 +1638,7 @@
4761 set_cpu_key_k_type (p_s_item_key, TYPE_INDIRECT);
4762 p_s_item_key->key_length = 4;
4763 n_new_file_size -= (n_new_file_size & (p_s_sb->s_blocksize - 1));
4764 + tail_pos = n_new_file_size;
4765 set_cpu_key_k_offset (p_s_item_key, n_new_file_size + 1);
4766 if ( search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND ){
4767 print_block (PATH_PLAST_BUFFER (p_s_path), 3, PATH_LAST_POSITION (p_s_path) - 1, PATH_LAST_POSITION (p_s_path) + 1);
4768 @@ -1592,23 +1686,27 @@
4769 RFALSE( c_mode == M_PASTE || c_mode == M_INSERT, "illegal mode");
4771 /* Calculate number of bytes that need to be cut from the item. */
4772 + quota_cut_bytes = ( c_mode == M_DELETE ) ? ih_item_len(get_ih(p_s_path)) : -s_cut_balance.insert_size[0];
4774 n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode);
4776 n_ret_value = retval2;
4778 - if ( c_mode == M_DELETE ) {
4779 - struct item_head * p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
4781 - if ( is_direct_le_ih (p_le_ih) && (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) {
4782 - /* we delete first part of tail which was stored in direct
4786 + /* For direct items, we only change the quota when deleting the last
4789 + p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
4790 + if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) {
4791 + if (c_mode == M_DELETE &&
4792 + (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) {
4793 // FIXME: this is to keep 3.5 happy
4794 p_s_inode->u.reiserfs_i.i_first_direct_byte = U32_MAX;
4795 - p_s_inode->i_blocks -= p_s_sb->s_blocksize / 512;
4796 + quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE ;
4798 + quota_cut_bytes = 0 ;
4802 #ifdef CONFIG_REISERFS_CHECK
4803 if (n_is_inode_locked) {
4804 struct item_head * le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
4805 @@ -1642,7 +1740,12 @@
4806 ** deal with it here.
4808 p_s_inode->u.reiserfs_i.i_flags &= ~i_pack_on_close_mask;
4809 + unmap_buffers(page, tail_pos);
4811 +#ifdef REISERQUOTA_DEBUG
4812 + printk(KERN_DEBUG "reiserquota cut_from_item(): freeing %u id=%u type=%c\n", quota_cut_bytes, p_s_inode->i_uid, '?');
4814 + DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
4818 @@ -1654,8 +1757,8 @@
4820 set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), DOT_OFFSET);
4821 set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_DIRENTRY);
4822 - reiserfs_delete_solid_item (th, INODE_PKEY (inode));
4824 + reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode));
4825 + reiserfs_update_sd(th, inode) ;
4826 set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), SD_OFFSET);
4827 set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_STAT_DATA);
4829 @@ -1681,6 +1784,7 @@
4830 n_new_file_size;/* New file size. */
4831 int n_deleted; /* Number of deleted or truncated bytes. */
4833 + int jbegin_count = th->t_blocks_allocated;
4835 if ( ! (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode) || S_ISLNK(p_s_inode->i_mode)) )
4837 @@ -1760,17 +1864,14 @@
4838 ** sure the file is consistent before ending the current trans
4839 ** and starting a new one
4841 - if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
4842 - int orig_len_alloc = th->t_blocks_allocated ;
4843 + if (journal_transaction_should_end(th, jbegin_count)) {
4844 decrement_counters_in_path(&s_search_path) ;
4846 if (update_timestamps) {
4847 p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME;
4849 reiserfs_update_sd(th, p_s_inode) ;
4851 - journal_end(th, p_s_inode->i_sb, orig_len_alloc) ;
4852 - journal_begin(th, p_s_inode->i_sb, orig_len_alloc) ;
4853 + reiserfs_restart_transaction(th, jbegin_count) ;
4854 reiserfs_update_inode_transaction(p_s_inode) ;
4856 } while ( n_file_size > ROUND_UP (n_new_file_size) &&
4857 @@ -1822,18 +1923,37 @@
4858 int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th,
4859 struct path * p_s_search_path, /* Path to the pasted item. */
4860 const struct cpu_key * p_s_key, /* Key to search for the needed item.*/
4861 + struct inode * inode, /* Inode item belongs to */
4862 const char * p_c_body, /* Pointer to the bytes to paste. */
4863 int n_pasted_size) /* Size of pasted bytes. */
4865 struct tree_balance s_paste_balance;
4869 + fs_gen = get_generation(inode->i_sb) ;
4871 +#ifdef REISERQUOTA_DEBUG
4872 + printk(KERN_DEBUG "reiserquota paste_into_item(): allocating %u id=%u type=%c\n", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key)));
4875 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) {
4876 + pathrelse(p_s_search_path);
4879 init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, n_pasted_size);
4880 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
4881 s_paste_balance.key = p_s_key->on_disk_key;
4884 - while ( (retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) == REPEAT_SEARCH ) {
4886 + /* DQUOT_* can schedule, must check before the fix_nodes */
4887 + if (fs_changed(fs_gen, inode->i_sb)) {
4888 + goto search_again;
4891 + while ((retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) ==
4894 /* file system changed while we were in the fix_nodes */
4895 PROC_INFO_INC( th -> t_super, paste_into_item_restarted );
4896 retval = search_for_position_by_key (th->t_super, p_s_key, p_s_search_path);
4897 @@ -1862,6 +1982,10 @@
4899 /* this also releases the path */
4900 unfix_nodes(&s_paste_balance);
4901 +#ifdef REISERQUOTA_DEBUG
4902 + printk(KERN_DEBUG "reiserquota paste_into_item(): freeing %u id=%u type=%c\n", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key)));
4904 + DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size);
4908 @@ -1871,23 +1995,45 @@
4909 struct path * p_s_path, /* Path to the inserteded item. */
4910 const struct cpu_key * key,
4911 struct item_head * p_s_ih, /* Pointer to the item header to insert.*/
4912 + struct inode * inode,
4913 const char * p_c_body) /* Pointer to the bytes to insert. */
4915 struct tree_balance s_ins_balance;
4918 + int quota_bytes = 0 ;
4920 + if (inode) { /* Do we count quotas for item? */
4921 + fs_gen = get_generation(inode->i_sb);
4922 + quota_bytes = ih_item_len(p_s_ih);
4924 + /* hack so the quota code doesn't have to guess if the file has
4925 + ** a tail, links are always tails, so there's no guessing needed
4927 + if (!S_ISLNK (inode->i_mode) && is_direct_le_ih(p_s_ih)) {
4928 + quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE ;
4930 +#ifdef REISERQUOTA_DEBUG
4931 + printk(KERN_DEBUG "reiserquota insert_item(): allocating %u id=%u type=%c\n", quota_bytes, inode->i_uid, head2type(p_s_ih));
4933 + /* We can't dirty inode here. It would be immediately written but
4934 + * appropriate stat item isn't inserted yet... */
4935 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) {
4936 + pathrelse(p_s_path);
4940 init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, IH_SIZE + ih_item_len(p_s_ih));
4941 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
4942 s_ins_balance.key = key->on_disk_key;
4946 - if (p_c_body == 0)
4947 - n_zeros_num = ih_item_len(p_s_ih);
4949 - // le_key2cpu_key (&key, &(p_s_ih->ih_key));
4950 + /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */
4951 + if (inode && fs_changed(fs_gen, inode->i_sb)) {
4952 + goto search_again;
4955 while ( (retval = fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, p_c_body)) == REPEAT_SEARCH) {
4957 /* file system changed while we were in the fix_nodes */
4958 PROC_INFO_INC( th -> t_super, insert_item_restarted );
4959 retval = search_item (th->t_super, key, p_s_path);
4960 @@ -1902,7 +2048,7 @@
4966 /* make balancing after all resources will be collected at a time */
4967 if ( retval == CARRY_ON ) {
4968 do_balance (&s_ins_balance, p_s_ih, p_c_body, M_INSERT);
4969 @@ -1913,6 +2059,11 @@
4971 /* also releases the path */
4972 unfix_nodes(&s_ins_balance);
4973 +#ifdef REISERQUOTA_DEBUG
4974 + printk(KERN_DEBUG "reiserquota insert_item(): freeing %u id=%u type=%c\n", quota_bytes, inode->i_uid, head2type(p_s_ih));
4977 + DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes) ;
4981 diff -urN linux-2.4.22.org/fs/reiserfs/super.c linux-2.4.22/fs/reiserfs/super.c
4982 --- linux-2.4.22.org/fs/reiserfs/super.c 2003-11-21 15:08:29.000000000 +0100
4983 +++ linux-2.4.22/fs/reiserfs/super.c 2003-11-21 15:14:25.000000000 +0100
4985 #include <linux/locks.h>
4986 #include <linux/init.h>
4988 +EXPORT_SYMBOL(journal_begin) ;
4989 +EXPORT_SYMBOL(journal_end) ;
4991 #define REISERFS_OLD_BLOCKSIZE 4096
4992 #define REISERFS_SUPER_MAGIC_STRING_OFFSET_NJ 20
4995 static int reiserfs_remount (struct super_block * s, int * flags, char * data);
4996 static int reiserfs_statfs (struct super_block * s, struct statfs * buf);
4998 -static void reiserfs_write_super (struct super_block * s)
4999 +static int reiserfs_sync_fs (struct super_block * s)
5001 + struct reiserfs_transaction_handle th;
5003 + if (!(s->s_flags & MS_RDONLY)) {
5004 + journal_begin(&th, s, 1);
5005 + journal_end_sync(&th, s, 1);
5014 - if (!(s->s_flags & MS_RDONLY)) {
5015 - dirty = flush_old_commits(s, 1) ;
5017 - s->s_dirt = dirty;
5019 +static void reiserfs_write_super (struct super_block * s)
5021 + reiserfs_sync_fs(s);
5025 static void reiserfs_write_super_lockfs (struct super_block * s)
5029 struct reiserfs_transaction_handle th ;
5031 if (!(s->s_flags & MS_RDONLY)) {
5033 reiserfs_block_writes(&th) ;
5034 journal_end(&th, s, 1) ;
5036 - s->s_dirt = dirty;
5042 /* we are going to do one balancing */
5043 journal_begin (&th, s, JOURNAL_PER_BALANCE_CNT);
5045 - reiserfs_delete_solid_item (&th, key);
5046 + reiserfs_delete_solid_item (&th, NULL, key);
5048 /* removals are protected by direct items */
5049 reiserfs_release_objectid (&th, le32_to_cpu (key->k_objectid));
5051 /* body of "save" link */
5052 link = INODE_PKEY (inode)->k_dir_id;
5054 - /* put "save" link inot tree */
5055 - retval = reiserfs_insert_item (th, &path, &key, &ih, (char *)&link);
5056 + /* put "save" link inot tree, don't charge quota to anyone */
5057 + retval = reiserfs_insert_item (th, &path, &key, &ih, NULL, (char *)&link);
5059 if (retval != -ENOSPC)
5060 reiserfs_warning (inode->i_sb, "vs-2120: add_save_link: insert_item returned %d\n",
5062 ( inode -> u.reiserfs_i.i_flags & i_link_saved_truncate_mask ) ) ||
5064 ( inode -> u.reiserfs_i.i_flags & i_link_saved_unlink_mask ) ) )
5065 - reiserfs_delete_solid_item (&th, &key);
5066 + /* don't take quota bytes from anywhere */
5067 + reiserfs_delete_solid_item (&th, NULL, &key);
5069 reiserfs_release_objectid (&th, inode->i_ino);
5070 inode -> u.reiserfs_i.i_flags &= ~i_link_saved_unlink_mask;
5072 ** to do a journal_end
5074 journal_release(&th, s) ;
5077 for (i = 0; i < SB_BMAP_NR (s); i ++)
5078 brelse (SB_AP_BITMAP (s)[i].bh);
5080 put_super: reiserfs_put_super,
5081 write_super: reiserfs_write_super,
5082 write_super_lockfs: reiserfs_write_super_lockfs,
5083 + sync_fs: reiserfs_sync_fs,
5084 unlockfs: reiserfs_unlockfs,
5085 statfs: reiserfs_statfs,
5086 remount_fs: reiserfs_remount,
5087 @@ -463,6 +475,14 @@
5091 +/* possible values for -o data= */
5092 +static const arg_desc_t logging_mode[] = {
5093 + {"ordered", 1<<REISERFS_DATA_ORDERED, (1<<REISERFS_DATA_LOG|1<<REISERFS_DATA_WRITEBACK)},
5094 + {"journal", 1<<REISERFS_DATA_LOG, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_WRITEBACK)},
5095 + {"writeback", 1<<REISERFS_DATA_WRITEBACK, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_LOG)},
5100 /* possible values for "-o block-allocator=" and bits which are to be set in
5101 s_mount_opt of reiserfs specific part of in-core super block */
5102 @@ -612,10 +632,14 @@
5104 {"block-allocator", 'a', balloc, 0, 0},
5105 {"hash", 'h', hash, 1<<FORCE_HASH_DETECT, 0},
5106 + {"data", 'd', logging_mode, 0, 0},
5108 {"resize", 'r', 0, 0, 0},
5109 {"attrs", 0, 0, 1<<REISERFS_ATTRS, 0},
5110 {"noattrs", 0, 0, 0, 1<<REISERFS_ATTRS},
5111 + {"usrquota", 0, 0, 0, 0},
5112 + {"grpquota", 0, 0, 0, 0},
5117 @@ -672,6 +696,47 @@
5121 +static void switch_data_mode(struct super_block *s, unsigned long mode) {
5122 + struct reiserfs_transaction_handle th;
5123 + int sync_all = !reiserfs_data_log(s);
5125 + journal_begin(&th, s, 1);
5126 + SB_JOURNAL(s)->j_must_wait = 1;
5127 + journal_end_sync(&th, s, 1);
5129 + s->u.reiserfs_sb.s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
5130 + (1 << REISERFS_DATA_ORDERED) |
5131 + (1 << REISERFS_DATA_WRITEBACK));
5132 + s->u.reiserfs_sb.s_mount_opt |= (1 << mode);
5134 + journal_begin(&th, s, 1);
5135 + SB_JOURNAL(s)->j_must_wait = 1;
5136 + journal_end_sync(&th, s, 1);
5139 + fsync_no_super(s->s_dev);
5142 +static void handle_data_mode(struct super_block *s, unsigned long mount_options)
5144 + if (mount_options & (1 << REISERFS_DATA_LOG)) {
5145 + if (!reiserfs_data_log(s)) {
5146 + switch_data_mode(s, REISERFS_DATA_LOG);
5147 + printk("reiserfs: switching to journaled data mode\n");
5149 + } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
5150 + if (!reiserfs_data_ordered(s)) {
5151 + switch_data_mode(s, REISERFS_DATA_ORDERED);
5152 + printk("reiserfs: switching to ordered data mode\n");
5154 + } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
5155 + if (!reiserfs_data_writeback(s)) {
5156 + switch_data_mode(s, REISERFS_DATA_WRITEBACK);
5157 + printk("reiserfs: switching to writeback data mode\n");
5162 static int reiserfs_remount (struct super_block * s, int * mount_flags, char * data)
5164 struct reiserfs_super_block * rs;
5165 @@ -723,9 +788,10 @@
5168 /* remount read-write */
5169 - if (!(s->s_flags & MS_RDONLY))
5170 + if (!(s->s_flags & MS_RDONLY)) {
5171 + handle_data_mode(s, mount_options);
5172 return 0; /* We are read-write already */
5175 s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */
5176 journal_begin(&th, s, 10) ;
5178 @@ -743,9 +809,10 @@
5179 SB_JOURNAL(s)->j_must_wait = 1 ;
5180 journal_end(&th, s, 10) ;
5182 - if (!( *mount_flags & MS_RDONLY ) )
5183 + if (!( *mount_flags & MS_RDONLY ) ) {
5184 finish_unfinished( s );
5186 + handle_data_mode(s, mount_options);
5191 @@ -1172,9 +1239,6 @@
5193 if (reiserfs_parse_options (s, (char *) data, &(s->u.reiserfs_sb.s_mount_opt), &blocks) == 0) {
5201 @@ -1222,9 +1286,22 @@
5202 printk("reiserfs:warning: - it is slow mode for debugging.\n");
5207 + /* make data=ordered the default */
5208 + if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
5209 + !reiserfs_data_writeback(s))
5211 + s->u.reiserfs_sb.s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
5214 + if (reiserfs_data_log(s)) {
5215 + printk("reiserfs: using journaled data mode\n");
5216 + } else if (reiserfs_data_ordered(s)) {
5217 + printk("reiserfs: using ordered data mode\n");
5219 + printk("reiserfs: using writeback data mode\n");
5223 if( journal_init(s, jdev_name, old_format) ) {
5224 reiserfs_warning(s, "sh-2022: reiserfs_read_super: unable to initialize journal space\n") ;
5226 @@ -1364,16 +1441,19 @@
5228 static int __init init_reiserfs_fs (void)
5231 reiserfs_proc_info_global_init();
5232 reiserfs_proc_register_global( "version",
5233 reiserfs_global_version_in_proc );
5234 + ret = reiserfs_journal_cache_init();
5237 return register_filesystem(&reiserfs_fs_type);
5240 MODULE_DESCRIPTION("ReiserFS journaled filesystem");
5241 MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>");
5242 MODULE_LICENSE("GPL");
5245 static void __exit exit_reiserfs_fs(void)
5247 diff -urN linux-2.4.22.org/fs/reiserfs/tail_conversion.c linux-2.4.22/fs/reiserfs/tail_conversion.c
5248 --- linux-2.4.22.org/fs/reiserfs/tail_conversion.c 2003-11-21 15:08:29.000000000 +0100
5249 +++ linux-2.4.22/fs/reiserfs/tail_conversion.c 2003-11-21 15:14:25.000000000 +0100
5251 set_ih_free_space (&ind_ih, 0); /* delete at nearest future */
5252 put_ih_item_len( &ind_ih, UNFM_P_SIZE );
5253 PATH_LAST_POSITION (path)++;
5254 - n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih,
5255 + n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih, inode,
5258 /* Paste into last indirect item of an object. */
5259 - n_retval = reiserfs_paste_into_item(th, path, &end_key,
5260 + n_retval = reiserfs_paste_into_item(th, path, &end_key, inode,
5261 (char *)&unfm_ptr, UNFM_P_SIZE);
5264 @@ -152,39 +152,6 @@
5269 -unmap_buffers(struct page *page, loff_t pos) {
5270 - struct buffer_head *bh ;
5271 - struct buffer_head *head ;
5272 - struct buffer_head *next ;
5273 - unsigned long tail_index ;
5274 - unsigned long cur_index ;
5277 - if (page->buffers) {
5278 - tail_index = pos & (PAGE_CACHE_SIZE - 1) ;
5280 - head = page->buffers ;
5283 - next = bh->b_this_page ;
5285 - /* we want to unmap the buffers that contain the tail, and
5286 - ** all the buffers after it (since the tail must be at the
5287 - ** end of the file). We don't want to unmap file data
5288 - ** before the tail, since it might be dirty and waiting to
5291 - cur_index += bh->b_size ;
5292 - if (cur_index > tail_index) {
5293 - reiserfs_unmap_buffer(bh) ;
5296 - } while (bh != head) ;
5301 /* this first locks inode (neither reads nor sync are permitted),
5302 reads tail through page cache, insert direct item. When direct item
5303 inserted successfully inode is left locked. Return value is always
5305 set_cpu_key_k_type (&key, TYPE_DIRECT);
5307 /* Insert tail as new direct item in the tree */
5308 - if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih,
5309 + if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode,
5310 tail ? tail : NULL) < 0 ) {
5311 /* No disk memory. So we can not convert last unformatted node
5312 to the direct item. In this case we used to adjust
5313 @@ -274,10 +241,8 @@
5317 - /* this will invalidate all the buffers in the page after
5320 - unmap_buffers(page, pos1) ;
5321 + /* make sure to get the i_blocks changes from reiserfs_insert_item */
5322 + reiserfs_update_sd(th, p_s_inode);
5324 // note: we have now the same as in above direct2indirect
5325 // conversion: there are two keys which have matching first three
5328 /* We have inserted new direct item and must remove last
5329 unformatted node. */
5330 - p_s_inode->i_blocks += (p_s_sb->s_blocksize / 512);
5333 /* we store position of first direct item in the in-core inode */
5334 diff -urN linux-2.4.22.org/include/linux/fs.h linux-2.4.22/include/linux/fs.h
5335 --- linux-2.4.22.org/include/linux/fs.h 2003-11-21 15:08:34.000000000 +0100
5336 +++ linux-2.4.22/include/linux/fs.h 2003-11-21 15:14:25.000000000 +0100
5337 @@ -1222,6 +1222,8 @@
5338 return test_and_set_bit(BH_Dirty, &bh->b_state);
5341 +extern void buffer_insert_list_journal_head(struct buffer_head *bh, struct list_head *list, void *journal_head);
5343 static inline void mark_buffer_async(struct buffer_head * bh, int on)
5346 @@ -1508,6 +1510,7 @@
5347 /* Generic buffer handling for block filesystems.. */
5348 extern int try_to_release_page(struct page * page, int gfp_mask);
5349 extern int discard_bh_page(struct page *, unsigned long, int);
5350 +extern void discard_buffer(struct buffer_head *bh) ;
5351 #define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
5352 #define block_invalidate_page(page) discard_bh_page(page, 0, 0)
5353 extern int block_symlink(struct inode *, const char *, int);
5354 diff -urN linux-2.4.22.org/include/linux/reiserfs_fs.h linux-2.4.22/include/linux/reiserfs_fs.h
5355 --- linux-2.4.22.org/include/linux/reiserfs_fs.h 2003-11-21 15:08:34.000000000 +0100
5356 +++ linux-2.4.22/include/linux/reiserfs_fs.h 2003-11-21 15:14:25.000000000 +0100
5358 #define NO_DISK_SPACE -3
5359 #define NO_BALANCING_NEEDED (-4)
5360 #define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5)
5361 +#define QUOTA_EXCEEDED -6
5363 typedef unsigned long b_blocknr_t;
5364 typedef __u32 unp_t;
5365 @@ -1329,8 +1330,7 @@
5366 #define fs_generation(s) ((s)->u.reiserfs_sb.s_generation_counter)
5367 #define get_generation(s) atomic_read (&fs_generation(s))
5368 #define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen)
5369 -#define fs_changed(gen,s) (gen != get_generation (s))
5371 +#define fs_changed(gen,s) (gen != get_generation(s))
5373 /***************************************************************************/
5375 @@ -1653,6 +1653,86 @@
5376 /* 12 */ struct journal_params jh_journal;
5380 +reiserfs_file_data_log(struct inode *inode) {
5381 + if (reiserfs_data_log(inode->i_sb) ||
5382 + (inode->u.reiserfs_i.i_flags & i_data_log))
5389 +/* flags for the nested transaction handle */
5390 +#define REISERFS_PERSISTENT_HANDLE 1
5391 +#define REISERFS_ACTIVE_HANDLE 2
5392 +#define REISERFS_CLOSE_NESTED 4
5393 +#define REISERFS_DANGLING_HANDLE 8
5395 +** transaction handle which is passed around for all journal calls
5397 +struct reiserfs_transaction_handle {
5398 + struct super_block *t_super ; /* super for this FS when journal_begin was
5399 + called. saves calls to reiserfs_get_super
5400 + also used by nested transactions to make
5401 + sure they are nesting on the right FS
5402 + _must_ be first in the handle
5405 + int t_blocks_logged ; /* number of blocks this writer has logged */
5406 + int t_blocks_allocated ; /* number of blocks this writer allocated */
5407 + unsigned long t_trans_id ; /* sanity check, equals the current trans id */
5409 + void *t_handle_save ; /* save existing current->journal_info */
5410 + int displace_new_blocks:1; /* if new block allocation occurs, that
5411 + block should be displaced from others */
5415 +reiserfs_dangling_handle(struct reiserfs_transaction_handle *th) {
5416 + return (th && (th->t_flags & REISERFS_DANGLING_HANDLE)) ;
5420 +reiserfs_set_handle_dangling(struct reiserfs_transaction_handle *th) {
5421 + th->t_flags |= REISERFS_DANGLING_HANDLE ;
5425 +reiserfs_clear_handle_dangling(struct reiserfs_transaction_handle *th) {
5426 + th->t_flags &= ~REISERFS_DANGLING_HANDLE ;
5430 +reiserfs_persistent_handle(struct reiserfs_transaction_handle *th) {
5431 + return (th && (th->t_flags & REISERFS_PERSISTENT_HANDLE)) ;
5435 +reiserfs_set_handle_persistent(struct reiserfs_transaction_handle *th) {
5436 + th->t_flags |= REISERFS_PERSISTENT_HANDLE ;
5440 +reiserfs_active_handle(struct reiserfs_transaction_handle *th) {
5441 + return (th && (th->t_flags & REISERFS_ACTIVE_HANDLE)) ;
5445 +reiserfs_set_handle_active(struct reiserfs_transaction_handle *th) {
5446 + th->t_flags |= REISERFS_ACTIVE_HANDLE ;
5450 +reiserfs_restartable_handle(struct reiserfs_transaction_handle *th) {
5451 + return (th && (th->t_flags & REISERFS_CLOSE_NESTED)) ;
5455 +reiserfs_set_handle_restartable(struct reiserfs_transaction_handle *th) {
5456 + th->t_flags |= REISERFS_CLOSE_NESTED ;
5459 extern task_queue reiserfs_commit_thread_tq ;
5460 extern wait_queue_head_t reiserfs_commit_thread_wait ;
5462 @@ -1693,6 +1773,8 @@
5464 #define JOURNAL_BUFFER(j,n) ((j)->j_ap_blocks[((j)->j_start + (n)) % JOURNAL_BLOCK_COUNT])
5466 +int reiserfs_journal_cache_init(void);
5467 +int reiserfs_flush_old_commits(struct super_block *);
5468 void reiserfs_commit_for_inode(struct inode *) ;
5469 void reiserfs_commit_for_tail(struct inode *) ;
5470 void reiserfs_update_inode_transaction(struct inode *) ;
5471 @@ -1701,6 +1783,18 @@
5472 void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ;
5473 void reiserfs_allow_writes(struct super_block *s) ;
5474 void reiserfs_check_lock_depth(char *caller) ;
5475 +int journal_mark_dirty(struct reiserfs_transaction_handle *,
5476 + struct super_block *, struct buffer_head *bh) ;
5478 +static inline int reiserfs_transaction_running(struct super_block *s) {
5479 + struct reiserfs_transaction_handle *th = current->journal_info ;
5480 + if (th && th->t_super == s)
5482 + if (th && th->t_super == NULL)
5487 void reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
5488 void reiserfs_restore_prepared_buffer(struct super_block *, struct buffer_head *bh) ;
5489 struct buffer_head * journal_bread (struct super_block *s, int block);
5490 @@ -1716,8 +1810,14 @@
5491 int push_journal_writer(char *w) ;
5492 int pop_journal_writer(int windex) ;
5493 int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
5494 +int reiserfs_restart_transaction(struct reiserfs_transaction_handle *, int) ;
5495 int reiserfs_in_journal(struct super_block *p_s_sb, kdev_t dev, int bmap_nr, int bit_nr, int size, int searchall, unsigned int *next) ;
5496 int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ;
5498 +/* allocates a transaction handle, and starts a new transaction it */
5499 +struct reiserfs_transaction_handle *
5500 +reiserfs_persistent_transaction(struct super_block *p_s_sb, unsigned long) ;
5502 struct super_block *reiserfs_get_super(kdev_t dev) ;
5503 void flush_async_commits(struct super_block *p_s_sb) ;
5505 @@ -1833,11 +1933,13 @@
5506 int reiserfs_insert_item (struct reiserfs_transaction_handle *th,
5508 const struct cpu_key * key,
5509 - struct item_head * ih, const char * body);
5510 + struct item_head * ih,
5511 + struct inode *inode, const char * body);
5513 int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th,
5515 const struct cpu_key * key,
5516 + struct inode *inode,
5517 const char * body, int paste_size);
5519 int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th,
5520 @@ -1854,7 +1956,7 @@
5521 struct buffer_head * p_s_un_bh);
5523 void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th,
5524 - struct key * key);
5525 + struct inode *inode, struct key * key);
5526 void reiserfs_delete_object (struct reiserfs_transaction_handle *th, struct inode * p_s_inode);
5527 void reiserfs_do_truncate (struct reiserfs_transaction_handle *th,
5528 struct inode * p_s_inode, struct page *,
5529 @@ -1895,8 +1997,18 @@
5531 struct dentry *dentry,
5532 struct inode *inode);
5533 -int reiserfs_sync_inode (struct reiserfs_transaction_handle *th, struct inode * inode);
5534 -void reiserfs_update_sd (struct reiserfs_transaction_handle *th, struct inode * inode);
5536 +int reiserfs_sync_inode (struct reiserfs_transaction_handle *th,
5537 + struct inode * inode);
5539 +void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th,
5540 + struct inode * inode, loff_t size);
5542 +static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
5543 + struct inode *inode)
5545 + reiserfs_update_sd_size(th, inode, inode->i_size) ;
5548 void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode );
5549 void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs );
5550 @@ -1981,7 +2093,7 @@
5551 extern struct inode_operations reiserfs_file_inode_operations;
5552 extern struct file_operations reiserfs_file_operations;
5553 extern struct address_space_operations reiserfs_address_space_operations ;
5554 -int get_new_buffer (struct reiserfs_transaction_handle *th, struct buffer_head *,
5555 +int get_new_buffer (struct reiserfs_transaction_handle *th, struct inode *, struct buffer_head *,
5556 struct buffer_head **, struct path *);
5559 @@ -2095,7 +2207,7 @@
5561 int reiserfs_parse_alloc_options (struct super_block *, char *);
5562 int is_reusable (struct super_block * s, unsigned long block, int bit_value);
5563 -void reiserfs_free_block (struct reiserfs_transaction_handle *th, unsigned long);
5564 +void reiserfs_free_block (struct reiserfs_transaction_handle *th, struct inode *inode, unsigned long, int);
5565 int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t * , int, int);
5566 extern inline int reiserfs_new_form_blocknrs (struct tree_balance * tb,
5567 b_blocknr_t *new_blocknrs, int amount_needed)
5568 diff -urN linux-2.4.22.org/include/linux/reiserfs_fs_i.h linux-2.4.22/include/linux/reiserfs_fs_i.h
5569 --- linux-2.4.22.org/include/linux/reiserfs_fs_i.h 2003-11-21 15:08:34.000000000 +0100
5570 +++ linux-2.4.22/include/linux/reiserfs_fs_i.h 2003-11-21 15:14:25.000000000 +0100
5573 #include <linux/list.h>
5575 +struct reiserfs_journal_list;
5577 /** bitmasks for i_flags field in reiserfs-specific part of inode */
5579 /** this says what format of key do all items (but stat data) of
5581 truncate or unlink. Safe link is used to avoid leakage of disk
5582 space on crash with some files open, but unlinked. */
5583 i_link_saved_unlink_mask = 0x0010,
5584 - i_link_saved_truncate_mask = 0x0020
5585 + i_link_saved_truncate_mask = 0x0020,
5586 + /** are we logging data blocks for this file? */
5587 + i_data_log = 0x0040,
5588 } reiserfs_inode_flags;
5592 ** needs to be committed in order for this inode to be properly
5594 unsigned long i_trans_id ;
5595 - unsigned long i_trans_index ;
5596 + struct reiserfs_journal_list *i_jl;
5598 /* direct io needs to make sure the tail is on disk to avoid
5599 * buffer alias problems. This records the transaction last
5600 * involved in a direct->indirect conversion for this file
5602 unsigned long i_tail_trans_id;
5603 - unsigned long i_tail_trans_index;
5604 + struct reiserfs_journal_list *i_tail_jl;
5608 diff -urN linux-2.4.22.org/include/linux/reiserfs_fs_sb.h linux-2.4.22/include/linux/reiserfs_fs_sb.h
5609 --- linux-2.4.22.org/include/linux/reiserfs_fs_sb.h 2003-11-21 15:08:34.000000000 +0100
5610 +++ linux-2.4.22/include/linux/reiserfs_fs_sb.h 2003-11-21 15:14:25.000000000 +0100
5612 #define JOURNAL_MAX_CNODE 1500 /* max cnodes to allocate. */
5613 #define JOURNAL_HASH_SIZE 8192
5614 #define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating. Must be >= 2 */
5615 -#define JOURNAL_LIST_COUNT 64
5617 /* these are bh_state bit flag offset numbers, for use in the buffer head */
5619 @@ -167,20 +166,27 @@
5620 struct reiserfs_bitmap_node **bitmaps ;
5624 -** transaction handle which is passed around for all journal calls
5626 -struct reiserfs_transaction_handle {
5627 - /* ifdef it. -Hans */
5628 - char *t_caller ; /* debugging use */
5629 - int t_blocks_logged ; /* number of blocks this writer has logged */
5630 - int t_blocks_allocated ; /* number of blocks this writer allocated */
5631 - unsigned long t_trans_id ; /* sanity check, equals the current trans id */
5632 - struct super_block *t_super ; /* super for this FS when journal_begin was
5633 - called. saves calls to reiserfs_get_super */
5634 - int displace_new_blocks:1; /* if new block allocation occurres, that block
5635 - should be displaced from others */
5637 +struct reiserfs_journal_list;
5639 +/* so, we're using fsync_buffers_list to do the ordered buffer writes,
5640 + * but we don't want to have a full inode on each buffer list, it is
5641 + * a big waste of space.
5643 + * instead we copy the very head of the inode into a list here, a kludge
5644 + * but much smaller.
5646 +struct reiserfs_inode_list {
5647 + struct list_head i_hash;
5648 + struct list_head i_list;
5649 + struct list_head i_dentry;
5650 + struct list_head i_dirty_buffers;
5652 + /* we could be very smart and do math based on the location
5653 + * of the inode list in the journal list struct.
5654 + * lets do that after this works properly
5656 + struct reiserfs_journal_list *jl;
5660 ** one of these for each transaction. The most important part here is the j_realblock.
5661 @@ -190,20 +196,32 @@
5662 ** to be overwritten */
5663 struct reiserfs_journal_list {
5664 unsigned long j_start ;
5665 + unsigned long j_state ;
5666 unsigned long j_len ;
5667 atomic_t j_nonzerolen ;
5668 atomic_t j_commit_left ;
5669 - atomic_t j_flushing ;
5670 - atomic_t j_commit_flushing ;
5671 atomic_t j_older_commits_done ; /* all commits older than this on disk*/
5672 + struct semaphore j_commit_lock ;
5673 unsigned long j_trans_id ;
5674 time_t j_timestamp ;
5675 struct reiserfs_list_bitmap *j_list_bitmap ;
5676 struct buffer_head *j_commit_bh ; /* commit buffer head */
5677 struct reiserfs_journal_cnode *j_realblock ;
5678 struct reiserfs_journal_cnode *j_freedlist ; /* list of buffers that were freed during this trans. free each of these on flush */
5679 - wait_queue_head_t j_commit_wait ; /* wait for all the commit blocks to be flushed */
5680 - wait_queue_head_t j_flush_wait ; /* wait for all the real blocks to be flushed */
5682 + /* time ordered list of all the active transactions */
5683 + struct list_head j_list;
5685 + /* time ordered list of all transactions not touched by kreiserfsd */
5686 + struct list_head j_working_list;
5688 + /* for data=ordered support */
5689 + struct list_head j_ordered_bh_list;
5691 + /* sigh, the tails have slightly different rules for flushing, they
5692 + * need their own list
5694 + struct list_head j_tail_bh_list;
5697 struct reiserfs_page_list ; /* defined in reiserfs_fs.h */
5698 @@ -230,16 +248,11 @@
5699 unsigned long j_last_flush_trans_id ; /* last fully flushed journal timestamp */
5700 struct buffer_head *j_header_bh ;
5702 - /* j_flush_pages must be flushed before the current transaction can
5705 - struct reiserfs_page_list *j_flush_pages ;
5706 time_t j_trans_start_time ; /* time this transaction started */
5707 - wait_queue_head_t j_wait ; /* wait journal_end to finish I/O */
5708 - atomic_t j_wlock ; /* lock for j_wait */
5709 + struct semaphore j_lock ;
5710 + struct semaphore j_flush_sem ;
5711 wait_queue_head_t j_join_wait ; /* wait for current transaction to finish before starting new one */
5712 atomic_t j_jlock ; /* lock for j_join_wait */
5713 - int j_journal_list_index ; /* journal list number of the current trans */
5714 int j_list_bitmap_index ; /* number of next list bitmap to use */
5715 int j_must_wait ; /* no more journal begins allowed. MUST sleep on j_join_wait */
5716 int j_next_full_flush ; /* next journal_end will flush all journal list */
5717 @@ -255,13 +268,28 @@
5719 struct reiserfs_journal_cnode *j_cnode_free_list ;
5720 struct reiserfs_journal_cnode *j_cnode_free_orig ; /* orig pointer returned from vmalloc */
5721 + struct reiserfs_journal_list *j_current_jl;
5723 int j_free_bitmap_nodes ;
5724 int j_used_bitmap_nodes ;
5725 + int j_num_lists; /* total number of active transactions */
5726 + int j_num_work_lists; /* number that need attention from kreiserfsd */
5728 + /* debugging to make sure things are flushed in order */
5729 + int j_last_flush_id;
5731 + /* debugging to make sure things are committed in order */
5732 + int j_last_commit_id;
5734 struct list_head j_bitmap_nodes ;
5735 - struct list_head j_dirty_buffers ;
5737 + /* list of all active transactions */
5738 + struct list_head j_journal_list;
5740 + /* lists that haven't been touched by kreiserfsd */
5741 + struct list_head j_working_list;
5743 struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS] ; /* array of bitmaps to record the deleted blocks */
5744 - struct reiserfs_journal_list j_journal_list[JOURNAL_LIST_COUNT] ; /* array of all the journal lists */
5745 struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for real buffer heads in current trans */
5746 struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for all the real buffer heads in all
5749 reiserfs_proc_info_data_t s_proc_info_data;
5750 struct proc_dir_entry *procdir;
5751 int reserved_blocks; /* amount of blocks reserved for further allocations */
5752 + struct list_head s_reiserfs_supers;
5755 /* Definitions of reiserfs on-disk properties: */
5756 @@ -420,11 +449,12 @@
5757 #define REISERFS_3_6 1
5760 -#define REISERFS_LARGETAIL 0 /* large tails will be created in a session */
5761 -#define REISERFS_SMALLTAIL 17 /* small (for files less than block size) tails will be created in a session */
5762 -#define REPLAYONLY 3 /* replay journal and return 0. Use by fsck */
5763 -#define REISERFS_NOLOG 4 /* -o nolog: turn journalling off */
5764 -#define REISERFS_CONVERT 5 /* -o conv: causes conversion of old
5766 + REISERFS_LARGETAIL, /* large tails will be created in a session */
5767 + REISERFS_SMALLTAIL, /* small (for files less than block size) tails will be created in a session */
5768 + REPLAYONLY, /* replay journal and return 0. Use by fsck */
5769 + REISERFS_NOLOG, /* -o nolog: turn journalling off */
5770 + REISERFS_CONVERT, /* -o conv: causes conversion of old
5771 format super block to the new
5772 format. If not specified - old
5773 partition will be dealt with in a
5774 @@ -438,27 +468,25 @@
5775 ** the existing hash on the FS, so if you have a tea hash disk, and mount
5776 ** with -o hash=rupasov, the mount will fail.
5778 -#define FORCE_TEA_HASH 6 /* try to force tea hash on mount */
5779 -#define FORCE_RUPASOV_HASH 7 /* try to force rupasov hash on mount */
5780 -#define FORCE_R5_HASH 8 /* try to force rupasov hash on mount */
5781 -#define FORCE_HASH_DETECT 9 /* try to detect hash function on mount */
5782 + FORCE_TEA_HASH, /* try to force tea hash on mount */
5783 + FORCE_RUPASOV_HASH, /* try to force rupasov hash on mount */
5784 + FORCE_R5_HASH, /* try to force rupasov hash on mount */
5785 + FORCE_HASH_DETECT, /* try to detect hash function on mount */
5788 /* used for testing experimental features, makes benchmarking new
5789 features with and without more convenient, should never be used by
5790 users in any code shipped to users (ideally) */
5792 -#define REISERFS_NO_BORDER 11
5793 -#define REISERFS_NO_UNHASHED_RELOCATION 12
5794 -#define REISERFS_HASHED_RELOCATION 13
5795 -#define REISERFS_TEST4 14
5797 -#define REISERFS_TEST1 11
5798 -#define REISERFS_TEST2 12
5799 -#define REISERFS_TEST3 13
5800 -#define REISERFS_TEST4 14
5802 -#define REISERFS_ATTRS (15)
5803 + REISERFS_NO_BORDER,
5804 + REISERFS_NO_UNHASHED_RELOCATION,
5805 + REISERFS_HASHED_RELOCATION,
5806 + REISERFS_DATA_LOG,
5807 + REISERFS_DATA_ORDERED,
5808 + REISERFS_DATA_WRITEBACK,
5813 #define reiserfs_r5_hash(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << FORCE_R5_HASH))
5814 #define reiserfs_rupasov_hash(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << FORCE_RUPASOV_HASH))
5816 #define reiserfs_no_border(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_NO_BORDER))
5817 #define reiserfs_no_unhashed_relocation(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION))
5818 #define reiserfs_hashed_relocation(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_HASHED_RELOCATION))
5819 +#define reiserfs_data_log(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_DATA_LOG))
5820 +#define reiserfs_data_ordered(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_DATA_ORDERED))
5821 +#define reiserfs_data_writeback(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
5822 #define reiserfs_test4(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_TEST4))
5824 #define have_large_tails(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_LARGETAIL))
5827 void reiserfs_file_buffer (struct buffer_head * bh, int list);
5828 int reiserfs_is_super(struct super_block *s) ;
5829 -int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
5830 -int flush_old_commits(struct super_block *s, int) ;
5831 int show_reiserfs_locks(void) ;
5832 int reiserfs_resize(struct super_block *, unsigned long) ;
5835 #define SB_BUFFER_WITH_SB(s) ((s)->u.reiserfs_sb.s_sbh)
5836 #define SB_JOURNAL(s) ((s)->u.reiserfs_sb.s_journal)
5837 #define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
5838 -#define SB_JOURNAL_LIST(s) (SB_JOURNAL(s)->j_journal_list)
5839 -#define SB_JOURNAL_LIST_INDEX(s) (SB_JOURNAL(s)->j_journal_list_index)
5840 #define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free)
5841 #define SB_AP_BITMAP(s) ((s)->u.reiserfs_sb.s_ap_bitmap)
5843 diff -urN linux-2.4.22.org/kernel/ksyms.c linux-2.4.22/kernel/ksyms.c
5844 --- linux-2.4.22.org/kernel/ksyms.c 2003-11-21 15:08:31.000000000 +0100
5845 +++ linux-2.4.22/kernel/ksyms.c 2003-11-21 15:15:21.000000000 +0100
5847 EXPORT_SYMBOL(end_buffer_io_async);
5848 EXPORT_SYMBOL(__mark_buffer_dirty);
5849 EXPORT_SYMBOL(__mark_inode_dirty);
5850 +EXPORT_SYMBOL(discard_buffer); /* for FS flushpage funcs */
5851 EXPORT_SYMBOL(fd_install);
5852 EXPORT_SYMBOL(get_empty_filp);
5853 EXPORT_SYMBOL(init_private_file);
5854 diff -urN linux-2.4.22.org/mm/filemap.c linux-2.4.22/mm/filemap.c
5855 --- linux-2.4.22.org/mm/filemap.c 2003-11-21 15:08:31.000000000 +0100
5856 +++ linux-2.4.22/mm/filemap.c 2003-11-21 15:14:25.000000000 +0100
5857 @@ -3041,6 +3041,14 @@
5861 +static void update_inode_times(struct inode *inode)
5863 + time_t now = CURRENT_TIME;
5864 + if (inode->i_ctime != now || inode->i_mtime != now) {
5865 + inode->i_ctime = inode->i_mtime = now;
5866 + mark_inode_dirty_sync(inode);
5870 * precheck_file_write():
5871 * Check the conditions on a file descriptor prior to beginning a write
5872 @@ -3302,8 +3310,7 @@
5876 - inode->i_ctime = inode->i_mtime = CURRENT_TIME;
5877 - mark_inode_dirty_sync(inode);
5878 + update_inode_times(inode);
5880 written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);