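linux-2.4.22-data-loging+quota.patch

ReiserFS data-logging and quota support for the Linux 2.4.22 kernel (PLD kernel
package; the repository has since marked this patch obsolete). In outline:

 * fs/buffer.c: adds buffer_insert_list_journal_head(), which queues a buffer on a
   journal-owned list while recording the owning journal head, and un-statics
   discard_buffer() for use by the journal code.
 * fs/inode.c: replaces unused_inodes_flush_task with a "kinoded" kernel thread so
   inode pruning and the follow-up sync run outside the caller's locking context.
 * fs/reiserfs/bitmap.c, fix_node.c, do_balan.c, ibalance.c: thread the owning inode
   and a for_unformatted flag through block allocation and freeing so quota is
   charged for data blocks only.
 * fs/reiserfs/file.c, inode.c: quota transfer on chown, O_SYNC journal commits,
   data=journal / data=ordered support via per-journal-list ordered and tail buffer
   lists, and byte-accurate space accounting (inode_set_bytes/inode_get_bytes).

The short notes between file sections below are explanatory and are skipped by
patch(1); only the diff lines are applied.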
diff -urN linux-2.4.22.org/fs/buffer.c linux-2.4.22/fs/buffer.c
--- linux-2.4.22.org/fs/buffer.c	2003-11-21 15:08:24.000000000 +0100
+++ linux-2.4.22/fs/buffer.c	2003-11-21 15:14:23.000000000 +0100
@@ -659,6 +659,20 @@
 	spin_unlock(&lru_list_lock);
 }
 
+void buffer_insert_list_journal_head(struct buffer_head *bh,
+				     struct list_head *list,
+				     void *journal_head)
+{
+	spin_lock(&lru_list_lock);
+	if (buffer_attached(bh))
+		list_del(&bh->b_inode_buffers);
+	set_buffer_attached(bh);
+	list_add(&bh->b_inode_buffers, list);
+	bh->b_journal_head = journal_head;
+	spin_unlock(&lru_list_lock);
+}
+EXPORT_SYMBOL(buffer_insert_list_journal_head);
+
 /*
  * The caller must have the lru_list lock before calling the
  * remove_inode_queue functions.
@@ -1370,7 +1384,7 @@
 /*
  * Called when truncating a buffer on a page completely.
  */
-static void discard_buffer(struct buffer_head * bh)
+void discard_buffer(struct buffer_head * bh)
 {
 	if (buffer_mapped(bh) || buffer_delay(bh)) {
 		mark_buffer_clean(bh);
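The buffer.c helper above is the hook the rest of the patch hangs ordered-data
writes on. A minimal caller sketch (illustrative only, not part of the patch; it
mirrors the add_to_flushlist() hunk in fs/reiserfs/inode.c further down):

	static void queue_ordered_data(struct inode *inode, struct buffer_head *bh)
	{
		/* pin bh on the current journal list and remember its owner */
		struct reiserfs_journal_list *jl = SB_JOURNAL(inode->i_sb)->j_current_jl;
		buffer_insert_list_journal_head(bh, &jl->j_ordered_bh_list, jl);
	}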
diff -urN linux-2.4.22.org/fs/inode.c linux-2.4.22/fs/inode.c
--- linux-2.4.22.org/fs/inode.c	2003-11-21 15:08:24.000000000 +0100
+++ linux-2.4.22/fs/inode.c	2003-11-21 15:14:23.000000000 +0100
@@ -476,7 +476,7 @@
 	}
 }
 
-static void try_to_sync_unused_inodes(void * arg)
+static void try_to_sync_unused_inodes(void)
 {
 	struct super_block * sb;
 	int nr_inodes = inodes_stat.nr_unused;
@@ -495,7 +495,8 @@
 	spin_unlock(&inode_lock);
 }
 
-static struct tq_struct unused_inodes_flush_task;
+static DECLARE_WAIT_QUEUE_HEAD(kinoded_wait) ;
+static atomic_t kinoded_goal = ATOMIC_INIT(0) ;
 
 /**
  *	write_inode_now	-	write an inode to disk
@@ -758,7 +759,7 @@
 		!inode_has_buffers(inode))
 #define INODE(entry)	(list_entry(entry, struct inode, i_list))
 
-void prune_icache(int goal)
+static void _prune_icache(int goal)
 {
 	LIST_HEAD(list);
 	struct list_head *entry, *freeable = &list;
@@ -792,35 +793,29 @@
 	spin_unlock(&inode_lock);
 
 	dispose_list(freeable);
+	kmem_cache_shrink(inode_cachep);
 
 	/*
-	 * If we didn't freed enough clean inodes schedule
-	 * a sync of the dirty inodes, we cannot do it
-	 * from here or we're either synchronously dogslow
-	 * or we deadlock with oom.
+	 * If we didn't free enough clean inodes
+	 * start a sync now
 	 */
 	if (goal)
-		schedule_task(&unused_inodes_flush_task);
+		try_to_sync_unused_inodes();
+}
+
+void prune_icache(int goal) {
+	atomic_add(goal, &kinoded_goal);
+	if (atomic_read(&kinoded_goal) > 16) {
+		wake_up_interruptible(&kinoded_wait);
+	}
 }
 
 int shrink_icache_memory(int priority, int gfp_mask)
 {
 	int count = 0;
-
-	/*
-	 * Nasty deadlock avoidance..
-	 *
-	 * We may hold various FS locks, and we don't
-	 * want to recurse into the FS that called us
-	 * in clear_inode() and friends..
-	 */
-	if (!(gfp_mask & __GFP_FS))
-		return 0;
-
 	count = inodes_stat.nr_unused / priority;
-
 	prune_icache(count);
-	return kmem_cache_shrink(inode_cachep);
+	return 0;
 }
 
 /*
@@ -1198,6 +1193,34 @@
 	return res;
 }
 
+int kinoded(void *startup) {
+
+	struct task_struct *tsk = current;
+	int goal ;
+
+	daemonize();
+	strcpy(tsk->comm, "kinoded");
+
+	/* avoid getting signals */
+	spin_lock_irq(&tsk->sigmask_lock);
+	flush_signals(tsk);
+	sigfillset(&tsk->blocked);
+	recalc_sigpending(tsk);
+	spin_unlock_irq(&tsk->sigmask_lock);
+
+	printk("kinoded started\n") ;
+	complete((struct completion *)startup);
+	while(1) {
+		wait_event_interruptible(kinoded_wait,
+					 atomic_read(&kinoded_goal));
+		while((goal = atomic_read(&kinoded_goal))) {
+			_prune_icache(goal);
+			atomic_sub(goal, &kinoded_goal);
+			cond_resched();
+		}
+	}
+}
+
 /*
  * Initialize the hash tables.
  */
@@ -1249,8 +1272,17 @@
 					 NULL);
 	if (!inode_cachep)
 		panic("cannot create inode slab cache");
+}
 
-	unused_inodes_flush_task.routine = try_to_sync_unused_inodes;
+/* we need to start a thread, and inode_init happens too early for that
+** to work.  So, add a second init func through module_init
+*/
+static int __init inode_mod_init(void)
+{
+	static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
+	kernel_thread(kinoded, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+	wait_for_completion(&startup);
+	return 0;
 }
 
 /**
@@ -1344,3 +1376,5 @@
 }
 
 #endif
+
+module_init(inode_mod_init) ;
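With the hunks above, prune_icache() no longer prunes inline: callers only add to
kinoded_goal and wake kinoded once more than 16 inodes are pending, which is why
shrink_icache_memory() can drop its __GFP_FS deadlock-avoidance bail-out. The
contract between the two sides, restated from the diff (illustrative only):

	/* caller side (may hold FS locks): cheap, never recurses into the FS */
	atomic_add(goal, &kinoded_goal);
	if (atomic_read(&kinoded_goal) > 16)
		wake_up_interruptible(&kinoded_wait);

	/* kinoded side: drains the goal and syncs, in its own process context */
	while ((goal = atomic_read(&kinoded_goal))) {
		_prune_icache(goal);
		atomic_sub(goal, &kinoded_goal);
	}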
diff -urN linux-2.4.22.org/fs/reiserfs/bitmap.c linux-2.4.22/fs/reiserfs/bitmap.c
--- linux-2.4.22.org/fs/reiserfs/bitmap.c	2003-11-21 15:08:29.000000000 +0100
+++ linux-2.4.22/fs/reiserfs/bitmap.c	2003-11-21 15:14:23.000000000 +0100
@@ -10,6 +10,7 @@
 #include <linux/errno.h>
 #include <linux/locks.h>
 #include <linux/kernel.h>
+#include <linux/quotaops.h>
 
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_fs_sb.h>
@@ -287,7 +288,8 @@
 }
 
 static void _reiserfs_free_block (struct reiserfs_transaction_handle *th,
-				  b_blocknr_t block)
+				  struct inode *inode, b_blocknr_t block,
+				  int for_unformatted)
 {
     struct super_block * s = th->t_super;
     struct reiserfs_super_block * rs;
@@ -296,7 +298,6 @@
     int nr, offset;
 
     PROC_INFO_INC( s, free_block );
-
     rs = SB_DISK_SUPER_BLOCK (s);
     sbh = SB_BUFFER_WITH_SB (s);
     apbi = SB_AP_BITMAP(s);
@@ -309,7 +310,6 @@
 		block, bdevname(s->s_dev));
 	return;
     }
-
     reiserfs_prepare_for_journal(s, apbi[nr].bh, 1 ) ;
 
     /* clear bit for the given block in bit map */
@@ -329,39 +329,55 @@
     set_sb_free_blocks( rs, sb_free_blocks(rs) + 1 );
 
     journal_mark_dirty (th, s, sbh);
+    if (for_unformatted) {
+#ifdef REISERQUOTA_DEBUG
+	printk(KERN_DEBUG "reiserquota: freeing block id=%u\n", inode->i_uid);
+#endif
+	DQUOT_FREE_BLOCK_NODIRTY(inode, 1);
+    }
+
 }
 
 void reiserfs_free_block (struct reiserfs_transaction_handle *th,
-			  unsigned long block) {
+			  struct inode *inode, unsigned long block,
+			  int for_unformatted)
+{
     struct super_block * s = th->t_super;
 
     RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
     RFALSE(is_reusable (s, block, 1) == 0, "vs-4071: can not free such block");
     /* mark it before we clear it, just in case */
     journal_mark_freed(th, s, block) ;
-    _reiserfs_free_block(th, block) ;
+    _reiserfs_free_block(th, inode, block, for_unformatted) ;
 }
 
 /* preallocated blocks don't need to be run through journal_mark_freed */
 void reiserfs_free_prealloc_block (struct reiserfs_transaction_handle *th,
-			  unsigned long block) {
+				   struct inode *inode,
+				   unsigned long block)
+{
     RFALSE(!th->t_super, "vs-4060: trying to free block on nonexistent device");
     RFALSE(is_reusable (th->t_super, block, 1) == 0, "vs-4070: can not free such block");
-    _reiserfs_free_block(th, block) ;
+    _reiserfs_free_block(th, inode, block, 1) ;
 }
 
 static void __discard_prealloc (struct reiserfs_transaction_handle * th,
 				struct inode * inode)
 {
     unsigned long save = inode->u.reiserfs_i.i_prealloc_block ;
+    int dirty=0;
 #ifdef CONFIG_REISERFS_CHECK
     if (inode->u.reiserfs_i.i_prealloc_count < 0)
 	reiserfs_warning(th->t_super, "zam-4001:%s: inode has negative prealloc blocks count.\n", __FUNCTION__ );
 #endif
     while (inode->u.reiserfs_i.i_prealloc_count > 0) {
-	reiserfs_free_prealloc_block(th,inode->u.reiserfs_i.i_prealloc_block);
+	reiserfs_free_prealloc_block(th, inode, inode->u.reiserfs_i.i_prealloc_block);
 	inode->u.reiserfs_i.i_prealloc_block++;
 	inode->u.reiserfs_i.i_prealloc_count --;
+	dirty = 1 ;
+    }
+    if (dirty) {
+	reiserfs_update_sd(th, inode) ;
     }
     inode->u.reiserfs_i.i_prealloc_block = save ;
     list_del (&(inode->u.reiserfs_i.i_prealloc_list));
@@ -599,7 +615,6 @@
     if (hint->formatted_node || hint->inode == NULL) {
 	return 0;
     }
-
     hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id);
     border = hint->beg + (unsigned long) keyed_hash(((char *) (&hash_in)), 4) % (hint->end - hint->beg - 1);
     if (border > hint->search_start)
@@ -776,6 +791,24 @@
     int nr_allocated = 0;
 
     determine_prealloc_size(hint);
+    if (!hint->formatted_node) {
+	int quota_ret;
+#ifdef REISERQUOTA_DEBUG
+	printk(KERN_DEBUG "reiserquota: allocating %d blocks id=%u\n", amount_needed, hint->inode->i_uid);
+#endif
+	quota_ret = DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed);
+	if (quota_ret)	/* Quota exceeded? */
+	    return QUOTA_EXCEEDED;
+	if (hint->preallocate && hint->prealloc_size ) {
+#ifdef REISERQUOTA_DEBUG
+	    printk(KERN_DEBUG "reiserquota: allocating (prealloc) %d blocks id=%u\n", hint->prealloc_size, hint->inode->i_uid);
+#endif
+	    quota_ret = DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode, hint->prealloc_size);
+	    if (quota_ret)
+		hint->preallocate=hint->prealloc_size=0;
+	}
+    }
+
     while((nr_allocated
 	   += allocate_without_wrapping_disk(hint, new_blocknrs + nr_allocated, start, finish,
 					     amount_needed - nr_allocated, hint->prealloc_size))
@@ -783,8 +816,14 @@
 
 	/* not all blocks were successfully allocated yet*/
 	if (second_pass) {	/* it was a second pass; we must free all blocks */
+	    if (!hint->formatted_node) {
+#ifdef REISERQUOTA_DEBUG
+		printk(KERN_DEBUG "reiserquota: freeing (nospace) %d blocks id=%u\n", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid);
+#endif
+		DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); /* Free not allocated blocks */
+	    }
 	    while (nr_allocated --)
-		reiserfs_free_block(hint->th, new_blocknrs[nr_allocated]);
+		reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node);
 
 	    return NO_DISK_SPACE;
 	} else {	/* refine search parameters for next pass */
@@ -794,6 +833,13 @@
 	    continue;
 	}
     }
+    if ( !hint->formatted_node && amount_needed + hint->prealloc_size > nr_allocated + INODE_INFO(hint->inode)->i_prealloc_count) {
+	/* Some of preallocation blocks were not allocated */
+#ifdef REISERQUOTA_DEBUG
+	printk(KERN_DEBUG "reiserquota: freeing (failed prealloc) %d blocks id=%u\n", amount_needed + hint->prealloc_size - nr_allocated - INODE_INFO(hint->inode)->i_prealloc_count, hint->inode->i_uid);
+#endif
+	DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated - INODE_INFO(hint->inode)->i_prealloc_count);
+    }
     return CARRY_ON;
 }
 
@@ -862,7 +908,7 @@
 
     if (ret != CARRY_ON) {
 	while (amount_needed ++ < initial_amount_needed) {
-	    reiserfs_free_block(hint->th, *(--new_blocknrs));
+	    reiserfs_free_block(hint->th, hint->inode, *(--new_blocknrs), 1);
 	}
     }
     return ret;
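The allocator above charges quota only for unformatted (data) blocks; formatted
tree nodes are never charged, and every failure path returns exactly what was
charged. Reduced to its shape (illustrative sketch, not a verbatim excerpt):

	if (!hint->formatted_node &&
	    DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed))
		return QUOTA_EXCEEDED;	/* nothing allocated yet, nothing to undo */
	/* ... try to allocate; on a failed second pass roll the charge back ... */
	if (!hint->formatted_node)
		DQUOT_FREE_BLOCK_NODIRTY(hint->inode,
					 amount_needed + hint->prealloc_size - nr_allocated);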
diff -urN linux-2.4.22.org/fs/reiserfs/do_balan.c linux-2.4.22/fs/reiserfs/do_balan.c
--- linux-2.4.22.org/fs/reiserfs/do_balan.c	2003-11-21 15:08:29.000000000 +0100
+++ linux-2.4.22/fs/reiserfs/do_balan.c	2003-11-21 15:14:23.000000000 +0100
@@ -33,16 +33,8 @@
 inline void do_balance_mark_leaf_dirty (struct tree_balance * tb,
 					struct buffer_head * bh, int flag)
 {
-    if (reiserfs_dont_log(tb->tb_sb)) {
-	if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
-	    __mark_buffer_dirty(bh) ;
-	    tb->need_balance_dirty = 1;
-	}
-    } else {
-	int windex = push_journal_writer("do_balance") ;
-	journal_mark_dirty(tb->transaction_handle, tb->transaction_handle->t_super, bh) ;
-	pop_journal_writer(windex) ;
-    }
+    journal_mark_dirty(tb->transaction_handle,
+		       tb->transaction_handle->t_super, bh) ;
 }
 
 #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
@@ -1247,7 +1239,7 @@
 	    if (buffer_dirty (tb->thrown[i]))
 		reiserfs_warning (tb->tb_sb, "free_thrown deals with dirty buffer %ld\n", blocknr);
 	    brelse(tb->thrown[i]) ; /* incremented in store_thrown */
-	    reiserfs_free_block (tb->transaction_handle, blocknr);
+	    reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
 	}
     }
 }
@@ -1259,9 +1251,11 @@
     set_blkh_level( blkh, FREE_LEVEL );
     set_blkh_nr_item( blkh, 0 );
 
-    mark_buffer_clean (bh);
+    if (buffer_dirty(bh))
+	BUG();
+    // mark_buffer_clean (bh);
     /* reiserfs_free_block is no longer schedule safe
-    reiserfs_free_block (tb->transaction_handle, tb->tb_sb, bh->b_blocknr);
+    reiserfs_free_block (tb->transaction_handle, NULL, tb->tb_sb, bh->b_blocknr, 0);
     */
 
     store_thrown (tb, bh);
@@ -1575,6 +1569,7 @@
     tb->tb_mode = flag;
     tb->need_balance_dirty = 0;
 
+    reiserfs_check_lock_depth("do balance");
     if (FILESYSTEM_CHANGED_TB(tb)) {
 	reiserfs_panic(tb->tb_sb, "clm-6000: do_balance, fs generation has changed\n") ;
     }
@@ -1605,5 +1600,6 @@
 
 
     do_balance_completed (tb);
+    reiserfs_check_lock_depth("do balance2");
 
 }
diff -urN linux-2.4.22.org/fs/reiserfs/file.c linux-2.4.22/fs/reiserfs/file.c
--- linux-2.4.22.org/fs/reiserfs/file.c	2003-11-21 15:08:29.000000000 +0100
+++ linux-2.4.22/fs/reiserfs/file.c	2003-11-21 15:14:23.000000000 +0100
@@ -6,6 +6,7 @@
 #include <linux/sched.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/smp_lock.h>
+#include <linux/quotaops.h>
 
 /*
 ** We pack the tails of files on file close, not at the time they are written.
@@ -42,7 +43,6 @@
     lock_kernel() ;
     down (&inode->i_sem);
     journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3) ;
-    reiserfs_update_inode_transaction(inode) ;
 
 #ifdef REISERFS_PREALLOCATE
     reiserfs_discard_prealloc (&th, inode);
@@ -93,7 +93,9 @@
 static int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
     struct inode *inode = dentry->d_inode ;
     int error ;
-    if (attr->ia_valid & ATTR_SIZE) {
+    unsigned int ia_valid = attr->ia_valid ;
+
+    if (ia_valid & ATTR_SIZE) {
 	/* version 2 items will be caught by the s_maxbytes check
 	** done for us in vmtruncate
 	*/
@@ -101,8 +103,17 @@
 	    attr->ia_size > MAX_NON_LFS)
 	    return -EFBIG ;
 
+	/* During a truncate, we have to make sure the new i_size is in
+	** the transaction before we start dropping updates to data logged
+	** or ordered write data pages.
+	*/
+	if (attr->ia_size < inode->i_size && reiserfs_file_data_log(inode)) {
+	    struct reiserfs_transaction_handle th ;
+	    journal_begin(&th, inode->i_sb, 1) ;
+	    reiserfs_update_sd_size(&th, inode, attr->ia_size) ;
+	    journal_end(&th, inode->i_sb, 1) ;
 	/* fill in hole pointers in the expanding truncate case. */
-	if (attr->ia_size > inode->i_size) {
+	} else if (attr->ia_size > inode->i_size) {
 	    error = generic_cont_expand(inode, attr->ia_size) ;
 	    if (inode->u.reiserfs_i.i_prealloc_count > 0) {
 		struct reiserfs_transaction_handle th ;
@@ -123,15 +134,35 @@
 	return -EINVAL;
 
     error = inode_change_ok(inode, attr) ;
-    if (!error)
-        inode_setattr(inode, attr) ;
+    if (!error) {
+	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+	    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
+	    error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
 
+	if (!error)
+	    error = inode_setattr(inode, attr) ;
+    }
     return error ;
 }
 
+static ssize_t
+reiserfs_file_write(struct file *f, const char *b, size_t count, loff_t *ppos)
+{
+    ssize_t ret;
+    struct inode *inode = f->f_dentry->d_inode;
+
+    ret = generic_file_write(f, b, count, ppos);
+    if (ret >= 0 && f->f_flags & O_SYNC) {
+	lock_kernel();
+	reiserfs_commit_for_inode(inode);
+	unlock_kernel();
+    }
+    return ret;
+}
+
 struct file_operations reiserfs_file_operations = {
     read:	generic_file_read,
-    write:	generic_file_write,
+    write:	reiserfs_file_write,
     ioctl:	reiserfs_ioctl,
     mmap:	generic_file_mmap,
     release:	reiserfs_file_release,
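Besides the O_SYNC commit in reiserfs_file_write(), the setattr hunk enforces the
quota-transfer rule: when ownership changes, the inode's usage must move between
quota structures before the new attributes are committed, and a failed transfer
aborts the chown (sketch of the hunk above, illustrative only):

	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
	    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
		error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;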
diff -urN linux-2.4.22.org/fs/reiserfs/fix_node.c linux-2.4.22/fs/reiserfs/fix_node.c
--- linux-2.4.22.org/fs/reiserfs/fix_node.c	2003-11-21 15:08:29.000000000 +0100
+++ linux-2.4.22/fs/reiserfs/fix_node.c	2003-11-21 15:14:23.000000000 +0100
@@ -795,8 +795,9 @@
     else /* If we have enough already then there is nothing to do. */
 	return CARRY_ON;
 
-    if ( reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs,
-				     n_amount_needed) == NO_DISK_SPACE )
+    /* No need to check quota - it is not allocated for blocks used for formatted nodes */
+    if (reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs,
+				    n_amount_needed) == NO_DISK_SPACE)
 	return NO_DISK_SPACE;
 
     /* for each blocknumber we just got, get a buffer and stick it on FEB */
@@ -2121,7 +2122,8 @@
 
 static void clear_all_dirty_bits(struct super_block *s,
                                  struct buffer_head *bh) {
-  reiserfs_prepare_for_journal(s, bh, 0) ;
+  // reiserfs_prepare_for_journal(s, bh, 0) ;
+  set_bit(BH_JPrepared, &bh->b_state) ;
 }
 
 static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
@@ -2518,7 +2520,7 @@
 		/* de-allocated block which was not used by balancing and
 		   bforget about buffer for it */
 		brelse (tb->FEB[i]);
-		reiserfs_free_block (tb->transaction_handle, blocknr);
+		reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
 	    }
 	    if (tb->used[i]) {
 		/* release used as new nodes including a new root */
diff -urN linux-2.4.22.org/fs/reiserfs/ibalance.c linux-2.4.22/fs/reiserfs/ibalance.c
--- linux-2.4.22.org/fs/reiserfs/ibalance.c	2003-11-21 15:08:29.000000000 +0100
+++ linux-2.4.22/fs/reiserfs/ibalance.c	2003-11-21 15:14:23.000000000 +0100
@@ -632,7 +632,6 @@
     /* use check_internal if new root is an internal node */
     check_internal (new_root);
     /*&&&&&&&&&&&&&&&&&&&&&&*/
-    tb->tb_sb->s_dirt = 1;
 
     /* do what is needed for buffer thrown from tree */
     reiserfs_invalidate_buffer(tb, tbSh);
@@ -950,7 +949,6 @@
 	PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr );
 	PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 );
 	do_balance_mark_sb_dirty (tb, tb->tb_sb->u.reiserfs_sb.s_sbh, 1);
-	tb->tb_sb->s_dirt = 1;
     }
 
     if ( tb->blknum[h] == 2 ) {
diff -urN linux-2.4.22.org/fs/reiserfs/inode.c linux-2.4.22/fs/reiserfs/inode.c
--- linux-2.4.22.org/fs/reiserfs/inode.c	2003-11-21 15:08:29.000000000 +0100
+++ linux-2.4.22/fs/reiserfs/inode.c	2003-11-21 15:14:23.000000000 +0100
@@ -4,9 +4,11 @@
 
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/fs.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/locks.h>
 #include <linux/smp_lock.h>
+#include <linux/quotaops.h>
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 
@@ -17,6 +19,8 @@
 #define GET_BLOCK_READ_DIRECT 4  /* read the tail if indirect item not found */
 #define GET_BLOCK_NO_ISEM     8  /* i_sem is not held, don't preallocate */
 
+static int reiserfs_commit_write(struct file *, struct page *,
+				 unsigned from, unsigned to) ;
 static int reiserfs_get_block (struct inode * inode, long block,
 			       struct buffer_head * bh_result, int create);
 
@@ -33,6 +37,7 @@
 
     lock_kernel() ;
 
+    DQUOT_FREE_INODE(inode);
     /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
     if (INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
 	down (&inode->i_sem);
@@ -106,9 +111,13 @@
 }
 
 static void add_to_flushlist(struct inode *inode, struct buffer_head *bh) {
-    struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
+    struct reiserfs_journal_list *jl = SB_JOURNAL(inode->i_sb)->j_current_jl;
+    buffer_insert_list_journal_head(bh, &jl->j_ordered_bh_list, jl);
+}
 
-    buffer_insert_list(bh, &j->j_dirty_buffers) ;
+static void add_to_tail_list(struct inode *inode, struct buffer_head *bh) {
+    struct reiserfs_journal_list *jl = SB_JOURNAL(inode->i_sb)->j_current_jl;
+    buffer_insert_list_journal_head(bh, &jl->j_tail_bh_list, jl);
 }
 
 //
@@ -201,15 +210,16 @@
     return 0;
 }
 
-/*static*/ void restart_transaction(struct reiserfs_transaction_handle *th,
-				struct inode *inode, struct path *path) {
-  struct super_block *s = th->t_super ;
-  int len = th->t_blocks_allocated ;
-
+static void restart_transaction(struct reiserfs_transaction_handle *th,
+				struct inode *inode, struct path *path,
+				int jbegin_count) {
+  /* we cannot restart while nested unless the parent allows it */
+  if (!reiserfs_restartable_handle(th) && th->t_refcount > 1) {
+      return ;
+  }
   pathrelse(path) ;
   reiserfs_update_sd(th, inode) ;
-  journal_end(th, s, len) ;
-  journal_begin(th, s, len) ;
+  reiserfs_restart_transaction(th, jbegin_count) ;
   reiserfs_update_inode_transaction(inode) ;
 }
 
@@ -327,6 +337,10 @@
 	}
     }
     p += offset ;
+    if ((offset + inode->i_sb->s_blocksize) > PAGE_CACHE_SIZE) {
+printk("get_block_create_0 offset %lu too large\n", offset);
+    }
+
     memset (p, 0, inode->i_sb->s_blocksize);
     do {
 	if (!is_direct_le_ih (ih)) {
@@ -421,10 +435,32 @@
 static int reiserfs_get_block_direct_io (struct inode * inode, long block,
 			struct buffer_head * bh_result, int create) {
     int ret ;
-
+    struct reiserfs_transaction_handle *th;
+    int refcount = 0;
+    struct super_block *s = inode->i_sb;
+
+    /* get_block might start a new transaction and leave it running.
+     * test for that by checking for a transaction running right now
+     * and recording its refcount.  Run a journal_end if the refcount
+     * after reiserfs_get_block is higher than it was before.
+     */
+    if (reiserfs_transaction_running(s)) {
+	th = current->journal_info;
+	refcount = th->t_refcount;
+    }
     bh_result->b_page = NULL;
     ret = reiserfs_get_block(inode, block, bh_result, create) ;
 
+    if (!ret && reiserfs_transaction_running(s)) {
+	th = current->journal_info;
+	if (th->t_refcount > refcount) {
+	    lock_kernel();
+	    reiserfs_update_sd(th, inode) ;
+	    journal_end(th, s, th->t_blocks_allocated);
+	    unlock_kernel();
+	}
+    }
+
     /* don't allow direct io onto tail pages */
     if (ret == 0 && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
         /* make sure future calls to the direct io funcs for this offset
@@ -459,7 +495,6 @@
 				   struct buffer_head *bh_result,
 				   loff_t tail_offset) {
     unsigned long index ;
-    unsigned long tail_end ;
     unsigned long tail_start ;
     struct page * tail_page ;
     struct page * hole_page = bh_result->b_page ;
@@ -470,7 +505,6 @@
 
     /* always try to read until the end of the block */
     tail_start = tail_offset & (PAGE_CACHE_SIZE - 1) ;
-    tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ;
 
     index = tail_offset >> PAGE_CACHE_SHIFT ;
     if ( !hole_page || index != hole_page->index) {
@@ -492,16 +526,13 @@
     ** data that has been read directly into the page, and block_prepare_write
     ** won't trigger a get_block in this case.
     */
-    fix_tail_page_for_writing(tail_page) ;
-    retval = block_prepare_write(tail_page, tail_start, tail_end,
-                                 reiserfs_get_block) ;
+    retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_start) ;
     if (retval)
         goto unlock ;
 
     /* tail conversion might change the data in the page */
     flush_dcache_page(tail_page) ;
-
-    retval = generic_commit_write(NULL, tail_page, tail_start, tail_end) ;
+    retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_start) ;
 
 unlock:
     if (tail_page != hole_page) {
@@ -541,20 +572,34 @@
     int done;
     int fs_gen;
     int windex ;
-    struct reiserfs_transaction_handle th ;
+    struct reiserfs_transaction_handle *th = NULL ;
     /* space reserved in transaction batch:
        . 3 balancings in direct->indirect conversion
        . 1 block involved into reiserfs_update_sd()
+       . 1 bitmap block
        XXX in practically impossible worst case direct2indirect()
-       can incur (much) more that 3 balancings. */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1;
+       can incur (much) more than 3 balancings, but we deal with
+       direct2indirect lower down */
+    int jbegin_count = JOURNAL_PER_BALANCE_CNT + 2;
     int version;
-    int transaction_started = 0 ;
+    int dangle = 1;
     loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ;
+    int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
 
-				/* bad.... */
+    /* if this block might contain a tail, we need to be more conservative */
+    if (new_offset <= (loff_t)(16 * 1024)) {
+	jbegin_count += JOURNAL_PER_BALANCE_CNT * 2;
+    }
+    /* we might nest for the entire page, so we need to make sure
+     * to reserve enough to insert pointers in the tree for each block
+     * in the file
+     */
+    jbegin_count *= blocks_per_page;
+    if (reiserfs_file_data_log(inode)) {
+	jbegin_count += blocks_per_page;
+
+    }
     lock_kernel() ;
-    th.t_trans_id = 0 ;
     version = get_inode_item_key_version (inode);
 
     if (block < 0) {
@@ -579,6 +624,10 @@
 	return ret;
     }
 
+    /* don't leave the trans running if we are already nested */
+    if (reiserfs_transaction_running(inode->i_sb))
+	dangle = 0;
+
     /* If file is of such a size, that it might have a tail and tails are enabled
     ** we should mark it as possibly needing tail packing on close
     */
@@ -591,10 +640,18 @@
     /* set the key of the first byte in the 'block'-th block of file */
     make_cpu_key (&key, inode, new_offset,
 		  TYPE_ANY, 3/*key length*/);
+
+    /* reiserfs_commit_write will close any transaction currently
+    ** running.  So, if we are nesting into someone else, we have to
+    ** make sure and bump the refcount
+    */
     if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
-	journal_begin(&th, inode->i_sb, jbegin_count) ;
+	th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count) ;
+	if (IS_ERR(th)) {
+	    retval = PTR_ERR(th) ;
+	    goto failure ;
+	}
 	reiserfs_update_inode_transaction(inode) ;
-	transaction_started = 1 ;
     }
  research:
 
@@ -614,28 +671,34 @@
 
     if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) {
 	/* we have to allocate block for the unformatted node */
-	if (!transaction_started) {
+	if (!reiserfs_active_handle(th)) {
 	    pathrelse(&path) ;
-	    journal_begin(&th, inode->i_sb, jbegin_count) ;
+	    th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count) ;
+	    if (IS_ERR(th)) {
+		retval = PTR_ERR(th) ;
+		goto failure ;
+	    }
 	    reiserfs_update_inode_transaction(inode) ;
-	    transaction_started = 1 ;
 	    goto research ;
 	}
 
-	repeat = _allocate_block(&th, block, inode, &allocated_block_nr, &path, create);
+	repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create);
 
-	if (repeat == NO_DISK_SPACE) {
+	if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
 	    /* restart the transaction to give the journal a chance to free
 	    ** some blocks.  releases the path, so we have to go back to
 	    ** research if we succeed on the second try
 	    */
-	    restart_transaction(&th, inode, &path) ;
-	    repeat = _allocate_block(&th, block, inode, &allocated_block_nr, NULL, create);
+	    restart_transaction(th, inode, &path, jbegin_count) ;
+	    repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create);
 
-	    if (repeat != NO_DISK_SPACE) {
+	    if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
 		goto research ;
 	    }
-	    retval = -ENOSPC;
+	    if (repeat == QUOTA_EXCEEDED)
+		retval = -EDQUOT;
+	    else
+		retval = -ENOSPC;
 	    goto failure;
 	}
 
@@ -660,15 +723,12 @@
 	    bh_result->b_state |= (1UL << BH_New);
 	    put_block_num(item, pos_in_item, allocated_block_nr) ;
 	    unfm_ptr = allocated_block_nr;
-	    journal_mark_dirty (&th, inode->i_sb, bh);
-	    inode->i_blocks += (inode->i_sb->s_blocksize / 512) ;
-	    reiserfs_update_sd(&th, inode) ;
+	    journal_mark_dirty (th, inode->i_sb, bh);
+	    reiserfs_update_sd(th, inode) ;
 	}
 	set_block_dev_mapped(bh_result, unfm_ptr, inode);
 	pathrelse (&path);
 	pop_journal_writer(windex) ;
-	if (transaction_started)
-	    journal_end(&th, inode->i_sb, jbegin_count) ;
 
 	unlock_kernel() ;
 
@@ -676,18 +736,23 @@
 	** there is no need to make sure the inode is updated with this
 	** transaction
 	*/
+	if (!dangle && reiserfs_active_handle(th))
+	    journal_end(th, inode->i_sb, jbegin_count) ;
 	return 0;
     }
 
-    if (!transaction_started) {
+    if (!reiserfs_active_handle(th)) {
 	/* if we don't pathrelse, we could vs-3050 on the buffer if
 	** someone is waiting for it (they can't finish until the buffer
-	** is released, we can start a new transaction until they finish)
+	** is released, we can't start a new transaction until they finish)
 	*/
 	pathrelse(&path) ;
-	journal_begin(&th, inode->i_sb, jbegin_count) ;
+	th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count) ;
+	if (IS_ERR(th)) {
+	    retval = PTR_ERR(th) ;
+	    goto failure ;
+	}
 	reiserfs_update_inode_transaction(inode) ;
-	transaction_started = 1 ;
 	goto research;
     }
 
@@ -716,13 +781,11 @@
 	    set_cpu_key_k_offset (&tmp_key, 1);
 	    PATH_LAST_POSITION(&path) ++;
 
-	    retval = reiserfs_insert_item (&th, &path, &tmp_key, &tmp_ih, (char *)&unp);
+	    retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp);
 	    if (retval) {
-		reiserfs_free_block (&th, allocated_block_nr);
-		goto failure; // retval == -ENOSPC or -EIO or -EEXIST
+		reiserfs_free_block (th, inode, allocated_block_nr, 1);
+		goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
 	    }
-	    if (unp)
-		inode->i_blocks += inode->i_sb->s_blocksize / 512;
 	    //mark_tail_converted (inode);
 	} else if (is_direct_le_ih (ih)) {
 	    /* direct item has to be converted */
@@ -742,8 +805,13 @@
 		   node. FIXME: this should also get into page cache */
 
 		pathrelse(&path) ;
-		journal_end(&th, inode->i_sb, jbegin_count) ;
-		transaction_started = 0 ;
+		/* ugly, but we should only end the transaction if
+		** we aren't nested
+		*/
+		if (th->t_refcount == 1) {
+		    journal_end(th, inode->i_sb, jbegin_count) ;
+		    th = NULL ;
+		}
 
 		retval = convert_tail_for_hole(inode, bh_result, tail_offset) ;
 		if (retval) {
@@ -751,20 +819,27 @@
 			reiserfs_warning(inode->i_sb, "clm-6004: convert tail failed inode %lu, error %d\n", inode->i_ino, retval) ;
 		    if (allocated_block_nr) {
 			/* the bitmap, the super, and the stat data == 3 */
-			journal_begin(&th, inode->i_sb, 3) ;
-			reiserfs_free_block (&th, allocated_block_nr);
-			transaction_started = 1 ;
+			if (!reiserfs_active_handle(th)) {
+			    th = reiserfs_persistent_transaction(inode->i_sb,3);
+			}
+			if (!IS_ERR(th)) {
+			    reiserfs_free_block(th,inode,allocated_block_nr,1);
+			}
+
 		    }
 		    goto failure ;
 		}
 		goto research ;
 	    }
-	    retval = direct2indirect (&th, inode, &path, unbh, tail_offset);
+	    retval = direct2indirect (th, inode, &path, unbh, tail_offset);
 	    if (retval) {
 		reiserfs_unmap_buffer(unbh);
-		reiserfs_free_block (&th, allocated_block_nr);
+		reiserfs_free_block (th, inode, allocated_block_nr, 1);
 		goto failure;
 	    }
+
+	    reiserfs_update_sd(th, inode) ;
+
 	    /* it is important the mark_buffer_uptodate is done after
 	    ** the direct2indirect.  The buffer might contain valid
 	    ** data newer than the data on disk (read by readpage, changed,
@@ -775,24 +850,25 @@
 	    */
 	    mark_buffer_uptodate (unbh, 1);
 
-	    /* unbh->b_page == NULL in case of DIRECT_IO request, this means
-	       buffer will disappear shortly, so it should not be added to
-	       any of our lists.
+	    /* we've converted the tail, so we must
+	    ** flush unbh before the transaction commits.
+	    ** unbh->b_page will be NULL for direct io requests, and
+	    ** in that case there's no data to log, dirty or order
 	    */
 	    if ( unbh->b_page ) {
-		/* we've converted the tail, so we must
-		** flush unbh before the transaction commits
-		*/
-		add_to_flushlist(inode, unbh) ;
-
-		/* mark it dirty now to prevent commit_write from adding
-		** this buffer to the inode's dirty buffer list
-		*/
-		__mark_buffer_dirty(unbh) ;
+		if (reiserfs_file_data_log(inode)) {
+		    reiserfs_prepare_for_journal(inode->i_sb, unbh, 1) ;
+		    journal_mark_dirty(th, inode->i_sb, unbh) ;
+		} else {
+		    /* mark it dirty now to prevent commit_write from adding
+		    ** this buffer to the inode's dirty buffer list
+		    */
+		    __mark_buffer_dirty(unbh) ;
+		    /* note, this covers the data=ordered case too */
+		    add_to_tail_list(inode, unbh) ;
+		}
 	    }
 
-	    //inode->i_blocks += inode->i_sb->s_blocksize / 512;
-	    //mark_tail_converted (inode);
 	} else {
 	    /* append indirect item with holes if needed, when appending
 	       pointer to 'block'-th block use block, which is already
@@ -840,18 +916,16 @@
 		   only have space for one block */
 		blocks_needed=max_to_insert?max_to_insert:1;
 	    }
-	    retval = reiserfs_paste_into_item (&th, &path, &tmp_key, (char *)un, UNFM_P_SIZE * blocks_needed);
+	    retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed);
 
 	    if (blocks_needed != 1)
 		kfree(un);
 
 	    if (retval) {
-		reiserfs_free_block (&th, allocated_block_nr);
+		reiserfs_free_block (th, inode, allocated_block_nr, 1);
 		goto failure;
 	    }
-	    if (done) {
-		inode->i_blocks += inode->i_sb->s_blocksize / 512;
-	    } else {
+	    if (!done) {
 		/* We need to mark new file size in case this function will be
 		   interrupted/aborted later on. And we may do this only for
 		   holes. */
@@ -870,9 +944,12 @@
 	    **
 	    ** release the path so that anybody waiting on the path before
 	    ** ending their transaction will be able to continue.
+	    **
+	    ** this only happens when inserting holes into the file, so it
+	    ** does not affect data=ordered safety at all
 	    */
-	    if (journal_transaction_should_end(&th, th.t_blocks_allocated)) {
-		restart_transaction(&th, inode, &path) ;
+	    if (journal_transaction_should_end(th, jbegin_count)) {
+		restart_transaction(th, inode, &path, jbegin_count) ;
 	    }
 	    /* inserting indirect pointers for a hole can take a
 	    ** long time.  reschedule if needed
@@ -890,7 +967,7 @@
 			      "%K should not be found\n", &key);
 	    retval = -EEXIST;
 	    if (allocated_block_nr)
-	        reiserfs_free_block (&th, allocated_block_nr);
+	        reiserfs_free_block (th, inode, allocated_block_nr, 1);
 	    pathrelse(&path) ;
 	    goto failure;
 	}
@@ -902,20 +979,82 @@
 
 
     retval = 0;
-    reiserfs_check_path(&path) ;
 
  failure:
-    if (transaction_started) {
-	reiserfs_update_sd(&th, inode) ;
-	journal_end(&th, inode->i_sb, jbegin_count) ;
+    pathrelse(&path) ;
+    /* if we had an error, end the transaction */
+    if (!IS_ERR(th) && reiserfs_active_handle(th)) {
+	if (retval != 0) {
+	    reiserfs_update_sd(th, inode) ;
+	    journal_end(th, inode->i_sb, jbegin_count) ;
+	    th = NULL ;
+	} else if (!dangle) {
+	    journal_end(th, inode->i_sb, jbegin_count) ;
+	    th = NULL ;
+	}
     }
     pop_journal_writer(windex) ;
+    if (retval == 0 && reiserfs_active_handle(th) &&
+        current->journal_info != th) {
+	BUG() ;
+    }
     unlock_kernel() ;
-    reiserfs_check_path(&path) ;
     return retval;
 }
 
 
+/* Compute real number of used bytes by file
+ * Following three functions can go away when we'll have enough space in stat item
+ */
+static int real_space_diff(struct inode *inode, int sd_size)
+{
+    int bytes;
+    loff_t blocksize = inode->i_sb->s_blocksize ;
+
+    if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
+        return sd_size ;
+
+    /* End of file is also in full block with indirect reference, so round
+    ** up to the next block.
+    **
+    ** there is just no way to know if the tail is actually packed
+    ** on the file, so we have to assume it isn't.  When we pack the
+    ** tail, we add 4 bytes to pretend there really is an unformatted
+    ** node pointer
+    */
+    bytes = ((inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + sd_size;
+    return bytes ;
+}
+
+static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
+                                        int sd_size)
+{
+    if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
+        return inode->i_size + (loff_t)(real_space_diff(inode, sd_size)) ;
+    }
+    return ((loff_t)real_space_diff(inode, sd_size)) + (((loff_t)blocks) << 9);
+}
+
+/* Compute number of blocks used by file in ReiserFS counting */
+static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
+{
+    loff_t bytes = inode_get_bytes(inode) ;
+    loff_t real_space = real_space_diff(inode, sd_size) ;
+
+    /* keeps fsck and non-quota versions of reiserfs happy */
+    if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
+        bytes += (loff_t)511 ;
+    }
+
+    /* files from before the quota patch might have i_blocks such that
1088+ ** bytes < real_space. Deal with that here to prevent it from
1089+ ** going negative.
1090+ */
1091+ if (bytes < real_space)
1092+ return 0 ;
1093+ return (bytes - real_space) >> 9;
1094+}
1095+
1096 //
1097 // BAD: new directories have stat data of new type and all other items
1098 // of old type. Version stored in the inode says about body items, so
1099@@ -971,6 +1110,14 @@
1100
1101 rdev = sd_v1_rdev(sd);
1102 inode->u.reiserfs_i.i_first_direct_byte = sd_v1_first_direct_byte(sd);
1103+ /* an early bug in the quota code can give us an odd number for the
1104+ ** block count. This is incorrect, fix it here.
1105+ */
1106+ if (inode->i_blocks & 1) {
1107+ inode->i_blocks++ ;
1108+ }
1109+ inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1110+ SD_V1_SIZE));
1111 /* nopack is initially zero for v1 objects. For v2 objects,
1112 nopack is initialised from sd_attrs */
1113 inode->u.reiserfs_i.i_flags &= ~i_nopack_mask;
1114@@ -1000,6 +1147,8 @@
1115 set_inode_item_key_version (inode, KEY_FORMAT_3_6);
1116
1117 set_inode_sd_version (inode, STAT_DATA_V2);
1118+ inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1119+ SD_V2_SIZE));
1120 /* read persistent inode attributes from sd and initalise
1121 generic inode flags from them */
1122 inode -> u.reiserfs_i.i_attrs = sd_v2_attrs( sd );
1123@@ -1026,7 +1175,7 @@
1124
1125
1126 // update new stat data with inode fields
1127-static void inode2sd (void * sd, struct inode * inode)
1128+static void inode2sd (void * sd, struct inode * inode, loff_t new_size)
1129 {
1130 struct stat_data * sd_v2 = (struct stat_data *)sd;
1131 __u16 flags;
1132@@ -1034,12 +1183,12 @@
1133 set_sd_v2_mode(sd_v2, inode->i_mode );
1134 set_sd_v2_nlink(sd_v2, inode->i_nlink );
1135 set_sd_v2_uid(sd_v2, inode->i_uid );
1136- set_sd_v2_size(sd_v2, inode->i_size );
1137+ set_sd_v2_size(sd_v2, new_size);
1138 set_sd_v2_gid(sd_v2, inode->i_gid );
1139 set_sd_v2_mtime(sd_v2, inode->i_mtime );
1140 set_sd_v2_atime(sd_v2, inode->i_atime );
1141 set_sd_v2_ctime(sd_v2, inode->i_ctime );
1142- set_sd_v2_blocks(sd_v2, inode->i_blocks );
1143+ set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
1144 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1145 set_sd_v2_rdev(sd_v2, inode->i_rdev );
1146 else
1147@@ -1051,7 +1200,7 @@
1148
1149
1150 // used to copy inode's fields to old stat data
1151-static void inode2sd_v1 (void * sd, struct inode * inode)
1152+static void inode2sd_v1 (void * sd, struct inode * inode, loff_t new_size)
1153 {
1154 struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd;
1155
1156@@ -1059,7 +1208,7 @@
1157 set_sd_v1_uid(sd_v1, inode->i_uid );
1158 set_sd_v1_gid(sd_v1, inode->i_gid );
1159 set_sd_v1_nlink(sd_v1, inode->i_nlink );
1160- set_sd_v1_size(sd_v1, inode->i_size );
1161+ set_sd_v1_size(sd_v1, new_size);
1162 set_sd_v1_atime(sd_v1, inode->i_atime );
1163 set_sd_v1_ctime(sd_v1, inode->i_ctime );
1164 set_sd_v1_mtime(sd_v1, inode->i_mtime );
1165@@ -1067,7 +1216,7 @@
1166 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1167 set_sd_v1_rdev(sd_v1, inode->i_rdev );
1168 else
1169- set_sd_v1_blocks(sd_v1, inode->i_blocks );
1170+ set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
1171
1172 // Sigh. i_first_direct_byte is back
1173 set_sd_v1_first_direct_byte(sd_v1, inode->u.reiserfs_i.i_first_direct_byte);
1174@@ -1077,7 +1226,8 @@
1175 /* NOTE, you must prepare the buffer head before sending it here,
1176 ** and then log it after the call
1177 */
1178-static void update_stat_data (struct path * path, struct inode * inode)
1179+static void update_stat_data (struct path * path, struct inode * inode,
1180+ loff_t new_size)
1181 {
1182 struct buffer_head * bh;
1183 struct item_head * ih;
1184@@ -1091,17 +1241,16 @@
1185
1186 if (stat_data_v1 (ih)) {
1187 // path points to old stat data
1188- inode2sd_v1 (B_I_PITEM (bh, ih), inode);
1189+ inode2sd_v1 (B_I_PITEM (bh, ih), inode, new_size);
1190 } else {
1191- inode2sd (B_I_PITEM (bh, ih), inode);
1192+ inode2sd (B_I_PITEM (bh, ih), inode, new_size);
1193 }
1194
1195 return;
1196 }
1197
1198-
1199-void reiserfs_update_sd (struct reiserfs_transaction_handle *th,
1200- struct inode * inode)
1201+void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
1202+ struct inode *inode, loff_t new_size)
1203 {
1204 struct cpu_key key;
1205 INITIALIZE_PATH(path);
1206@@ -1151,7 +1300,7 @@
1207 }
1208 break;
1209 }
1210- update_stat_data (&path, inode);
1211+ update_stat_data (&path, inode, new_size);
1212 journal_mark_dirty(th, th->t_super, bh) ;
1213 pathrelse (&path);
1214 return;
1215@@ -1236,6 +1385,7 @@
1216 reiserfs_make_bad_inode( inode );
1217 }
1218
1219+ reiserfs_update_inode_transaction(inode);
1220 reiserfs_check_path(&path_to_sd) ; /* init inode should be relsing */
1221
1222 }
1223@@ -1415,8 +1565,6 @@
1224 ** does something when called for a synchronous update.
1225 */
1226 void reiserfs_write_inode (struct inode * inode, int do_sync) {
1227- struct reiserfs_transaction_handle th ;
1228- int jbegin_count = 1 ;
1229
1230 if (inode->i_sb->s_flags & MS_RDONLY) {
1231 reiserfs_warning(inode->i_sb, "clm-6005: writing inode %lu on readonly FS\n",
1232@@ -1430,9 +1578,7 @@
1233 */
1234 if (do_sync && !(current->flags & PF_MEMALLOC)) {
1235 lock_kernel() ;
1236- journal_begin(&th, inode->i_sb, jbegin_count) ;
1237- reiserfs_update_sd (&th, inode);
1238- journal_end_sync(&th, inode->i_sb, jbegin_count) ;
1239+ reiserfs_commit_for_inode(inode) ;
1240 unlock_kernel() ;
1241 }
1242 }
1243@@ -1450,6 +1596,7 @@
1244 /* stat data of new object is inserted already, this inserts the item
1245 containing "." and ".." entries */
1246 static int reiserfs_new_directory (struct reiserfs_transaction_handle *th,
1247+ struct inode *inode,
1248 struct item_head * ih, struct path * path,
1249 const struct inode * dir)
1250 {
1251@@ -1494,13 +1641,14 @@
1252 }
1253
1254 /* insert item, that is empty directory item */
1255- return reiserfs_insert_item (th, path, &key, ih, body);
1256+ return reiserfs_insert_item (th, path, &key, ih, inode, body);
1257 }
1258
1259
1260 /* stat data of object has been inserted, this inserts the item
1261 containing the body of symlink */
1262 static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th,
1263+ struct inode *inode, /* Inode of symlink */
1264 struct item_head * ih,
1265 struct path * path, const char * symname, int item_len)
1266 {
1267@@ -1530,7 +1678,7 @@
1268 }
1269
1270 /* insert item, that is body of symlink */
1271- return reiserfs_insert_item (th, path, &key, ih, symname);
1272+ return reiserfs_insert_item (th, path, &key, ih, inode, symname);
1273 }
1274
1275
1276@@ -1604,7 +1752,8 @@
1277
1278 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1279 inode->i_size = i_size;
1280- inode->i_blocks = (inode->i_size + 511) >> 9;
1281+ inode->i_blocks = 0;
1282+ inode->i_bytes = 0;
1283 inode->u.reiserfs_i.i_first_direct_byte = S_ISLNK(mode) ? 1 :
1284 U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/;
1285
1286@@ -1638,9 +1787,9 @@
1287 err = -EINVAL;
1288 goto out_bad_inode;
1289 }
1290- inode2sd_v1 (&sd, inode);
1291+ inode2sd_v1 (&sd, inode, inode->i_size);
1292 } else
1293- inode2sd (&sd, inode);
1294+ inode2sd (&sd, inode, inode->i_size);
1295
1296 // these do not go to on-disk stat data
1297 inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid);
1298@@ -1665,7 +1814,7 @@
1299 if (dir->u.reiserfs_i.new_packing_locality)
1300 th->displace_new_blocks = 1;
1301 #endif
1302- retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, (char *)(&sd));
1303+ retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd));
1304 if (retval) {
1305 reiserfs_check_path(&path_to_key) ;
1306 err = retval;
1307@@ -1678,14 +1827,14 @@
1308 #endif
1309 if (S_ISDIR(mode)) {
1310 /* insert item with "." and ".." */
1311- retval = reiserfs_new_directory (th, &ih, &path_to_key, dir);
1312+ retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir);
1313 }
1314
1315 if (S_ISLNK(mode)) {
1316 /* insert body of symlink */
1317 if (!old_format_only (sb))
1318 i_size = ROUND_UP(i_size);
1319- retval = reiserfs_new_symlink (th, &ih, &path_to_key, symname, i_size);
1320+ retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size);
1321 }
1322 if (retval) {
1323 err = retval;
1324@@ -1705,6 +1854,9 @@
1325
1326 /* dquot_drop must be done outside a transaction */
1327 journal_end(th, th->t_super, th->t_blocks_allocated) ;
1328+ DQUOT_FREE_INODE(inode);
1329+ DQUOT_DROP(inode);
1330+ inode->i_flags |= S_NOQUOTA;
1331 make_bad_inode(inode);
1332
1333 out_inserted_sd:
1334@@ -1816,6 +1968,7 @@
1335 unsigned length ;
1336 struct page *page = NULL ;
1337 int error ;
1338+ int need_balance_dirty = 0 ;
1339 struct buffer_head *bh = NULL ;
1340
1341 if (p_s_inode->i_size > 0) {
1342@@ -1848,34 +2001,58 @@
1343 transaction of truncating gets committed - on reboot the file
1344 either appears truncated properly or not truncated at all */
1345 add_save_link (&th, p_s_inode, 1);
1346+ if (page)
1347+ kmap(page);
1348 reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ;
1349 pop_journal_writer(windex) ;
1350- journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ;
1351-
1352- if (update_timestamps)
1353- remove_save_link (p_s_inode, 1/* truncate */);
1354
1355 if (page) {
1356+ if (!PageLocked(page))
1357+ BUG();
1358 length = offset & (blocksize - 1) ;
1359 /* if we are not on a block boundary */
1360 if (length) {
1361 length = blocksize - length ;
1362- memset((char *)kmap(page) + offset, 0, length) ;
1363+ if ((offset + length) > PAGE_CACHE_SIZE) {
1364+ BUG();
1365+ }
1366+ memset((char *)page_address(page) + offset, 0, length) ;
1367 flush_dcache_page(page) ;
1368- kunmap(page) ;
1369 if (buffer_mapped(bh) && bh->b_blocknr != 0) {
1370- if (!atomic_set_buffer_dirty(bh)) {
1371+ if (reiserfs_file_data_log(p_s_inode)) {
1372+ reiserfs_prepare_for_journal(p_s_inode->i_sb, bh, 1) ;
1373+ journal_mark_dirty(&th, p_s_inode->i_sb, bh) ;
1374+ } else {
1375+ /* it is safe to block here, but it would be faster
1376+ ** to balance dirty after the journal lock is dropped
1377+ */
1378+ if (!atomic_set_buffer_dirty(bh)) {
1379 set_buffer_flushtime(bh);
1380 refile_buffer(bh);
1381 buffer_insert_inode_data_queue(bh, p_s_inode);
1382- balance_dirty();
1383+ need_balance_dirty = 1;
1384+
1385+ if (reiserfs_data_ordered(p_s_inode->i_sb)) {
1386+ add_to_flushlist(p_s_inode, bh) ;
1387+ }
1388+ }
1389 }
1390 }
1391 }
1392+ kunmap(page);
1393+ }
1394+ journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1) ;
1395+
1396+ if (update_timestamps)
1397+ remove_save_link(p_s_inode, 1/* truncate */);
1398+
1399+ if (page) {
1400 UnlockPage(page) ;
1401 page_cache_release(page) ;
1402 }
1403-
1404+ if (need_balance_dirty) {
1405+ balance_dirty() ;
1406+ }
1407 return ;
1408 }
1409
1410@@ -1944,6 +2121,8 @@
1411 goto research;
1412 }
1413
1414+ if (((B_I_PITEM(bh, ih) - bh->b_data) + pos_in_item + copy_size) > inode->i_sb->s_blocksize)
1415+ BUG();
1416 memcpy( B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, copy_size) ;
1417
1418 journal_mark_dirty(&th, inode->i_sb, bh) ;
1419@@ -1971,9 +2150,37 @@
1420
1421 /* this is where we fill in holes in the file. */
1422 if (use_get_block) {
1423+ int old_refcount = 0 ;
1424+ struct reiserfs_transaction_handle *hole_th ;
1425+ if (reiserfs_transaction_running(inode->i_sb)) {
1426+ hole_th = current->journal_info ;
1427+ old_refcount = hole_th->t_refcount ;
1428+ }
1429 retval = reiserfs_get_block(inode, block, bh_result,
1430 GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM) ;
1431 if (!retval) {
1432+ /* did reiserfs_get_block leave us a running transaction? */
1433+ if (reiserfs_transaction_running(inode->i_sb)) {
1434+ hole_th = current->journal_info ;
1435+ if (old_refcount < hole_th->t_refcount) {
1436+ lock_kernel() ;
1437+ /* we've filled a hole, make sure the new block
1438+ * gets to disk before transaction commit
1439+ */
1440+ if (buffer_mapped(bh_result) && bh_result->b_blocknr != 0 &&
1441+ reiserfs_data_ordered(inode->i_sb))
1442+ {
1443+ __mark_buffer_dirty(bh_result) ;
1444+ mark_buffer_uptodate(bh_result, 1);
1445+ /* no need to update the inode trans, already done */
1446+ add_to_flushlist(inode, bh_result) ;
1447+ }
1448+ reiserfs_update_sd(hole_th, inode) ;
1449+ journal_end(hole_th, hole_th->t_super,
1450+ hole_th->t_blocks_allocated) ;
1451+ unlock_kernel() ;
1452+ }
1453+ }
1454 if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) {
1455 /* get_block failed to find a mapped unformatted node. */
1456 use_get_block = 0 ;
1457@@ -1988,33 +2195,41 @@
1458 /* helper func to get a buffer head ready for writepage to send to
1459 ** ll_rw_block
1460 */
1461-static inline void submit_bh_for_writepage(struct buffer_head **bhp, int nr) {
1462+static void submit_bh_for_writepage(struct page *page,
1463+ struct buffer_head **bhp, int nr) {
1464 struct buffer_head *bh ;
1465 int i;
1466
1467- /* lock them all first so the end_io handler doesn't unlock the page
1468- ** too early
1469+ /* lock them all first so the end_io handler doesn't
1470+ ** unlock too early
1471+ **
1472+ ** There's just no safe way to log the buffers during writepage,
1473+ ** we'll deadlock if kswapd tries to start a transaction.
1474+ **
1475+ ** There's also no useful way to tie them to a specific transaction,
1476+ ** so we just don't bother.
1477 */
1478 for(i = 0 ; i < nr ; i++) {
1479- bh = bhp[i] ;
1480- lock_buffer(bh) ;
1481- set_buffer_async_io(bh) ;
1482+ bh = bhp[i] ;
1483+ lock_buffer(bh);
1484+ set_buffer_async_io(bh);
1485+ set_bit(BH_Uptodate, &bh->b_state) ;
1486 }
1487 for(i = 0 ; i < nr ; i++) {
1488+ bh = bhp[i] ;
1489 /* submit_bh doesn't care if the buffer is dirty, but nobody
1490 ** later on in the call chain will be cleaning it. So, we
1491 ** clean the buffer here, it still gets written either way.
1492 */
1493- bh = bhp[i] ;
1494 clear_bit(BH_Dirty, &bh->b_state) ;
1495- set_bit(BH_Uptodate, &bh->b_state) ;
1496 submit_bh(WRITE, bh) ;
1497 }
1498 }
1499
1500 static int reiserfs_write_full_page(struct page *page) {
1501 struct inode *inode = page->mapping->host ;
1502- unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ;
1503+ loff_t size = inode->i_size;
1504+ unsigned long end_index = size >> PAGE_CACHE_SHIFT ;
1505 unsigned last_offset = PAGE_CACHE_SIZE;
1506 int error = 0;
1507 unsigned long block ;
1508@@ -2024,21 +2239,36 @@
1509 struct buffer_head *arr[PAGE_CACHE_SIZE/512] ;
1510 int nr = 0 ;
1511
1512+ if (reiserfs_transaction_running(inode->i_sb)) {
1513+ BUG();
1514+ }
1515+
1516+ if (!PageLocked(page))
1517+ BUG();
1518+
1519 if (!page->buffers) {
1520 block_prepare_write(page, 0, 0, NULL) ;
1521 kunmap(page) ;
1522 }
1523+
1524+ if (reiserfs_transaction_running(inode->i_sb)) {
1525+ BUG();
1526+ }
1527 /* last page in the file, zero out any contents past the
1528 ** last byte in the file
1529 */
1530 if (page->index >= end_index) {
1531- last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ;
1532+ char *p ;
1533+ last_offset = size & (PAGE_CACHE_SIZE - 1) ;
1534 /* no file contents in this page */
1535 if (page->index >= end_index + 1 || !last_offset) {
1536 error = -EIO ;
1537 goto fail ;
1538 }
1539- memset((char *)kmap(page)+last_offset, 0, PAGE_CACHE_SIZE-last_offset) ;
1540+ p = kmap(page);
1541+ if (last_offset > PAGE_CACHE_SIZE)
1542+ BUG();
1543+ memset(p + last_offset, 0, PAGE_CACHE_SIZE-last_offset) ;
1544 flush_dcache_page(page) ;
1545 kunmap(page) ;
1546 }
1547@@ -2079,7 +2309,7 @@
1548 ** nr == 0 without there being any kind of error.
1549 */
1550 if (nr) {
1551- submit_bh_for_writepage(arr, nr) ;
1552+ submit_bh_for_writepage(page, arr, nr) ;
1553 wakeup_page_waiters(page);
1554 } else {
1555 UnlockPage(page) ;
1556@@ -2091,7 +2321,7 @@
1557
1558 fail:
1559 if (nr) {
1560- submit_bh_for_writepage(arr, nr) ;
1561+ submit_bh_for_writepage(page, arr, nr) ;
1562 } else {
1563 UnlockPage(page) ;
1564 }
1565@@ -2116,10 +2346,46 @@
1566
1567 int reiserfs_prepare_write(struct file *f, struct page *page,
1568 unsigned from, unsigned to) {
1569+ int cur_refcount = 0 ;
1570+ int ret ;
1571 struct inode *inode = page->mapping->host ;
1572+ struct reiserfs_transaction_handle *th ;
1573+
1574 reiserfs_wait_on_write_block(inode->i_sb) ;
1575 fix_tail_page_for_writing(page) ;
1576- return block_prepare_write(page, from, to, reiserfs_get_block) ;
1577+
1578+ /* we look for a running transaction before the block_prepare_write
1579+ ** call, and then again afterwards. This lets us know if
1580+ ** reiserfs_get_block added any additional transactions, so we can
1581+ ** let reiserfs_commit_write know if he needs to close them.
1582+ ** this is just nasty
1583+ */
1584+ if (reiserfs_transaction_running(inode->i_sb)) {
1585+ th = current->journal_info ;
1586+ cur_refcount = th->t_refcount ;
1587+ }
1588+ ret = block_prepare_write(page, from, to, reiserfs_get_block) ;
1589+
1590+ /* it is very important that we only set the dangling bit when
1591+ ** there is no chance of additional nested transactions.
1592+ */
1593+ if (reiserfs_transaction_running(inode->i_sb)) {
1594+ th = current->journal_info ;
1595+ if (th->t_refcount > cur_refcount) {
1596+ /* if we return an error, commit_write isn't going to get called
1597+ * we need to make sure we end any transactions
1598+ * reiserfs_get_block left hanging around
1599+ */
1600+ if (ret) {
1601+ lock_kernel();
1602+ journal_end(th, th->t_super, th->t_blocks_allocated) ;
1603+ unlock_kernel();
1604+ } else {
1605+ reiserfs_set_handle_dangling(th) ;
1606+ }
1607+ }
1608+ }
1609+ return ret ;
1610 }
1611
1612
1613@@ -2127,20 +2393,96 @@
1614 return generic_block_bmap(as, block, reiserfs_bmap) ;
1615 }
1616
1617+/* taken from fs/buffer.c */
1618+static int __commit_write(struct reiserfs_transaction_handle *th,
1619+ struct inode *inode, struct page *page,
1620+ unsigned from, unsigned to, int *balance)
1621+{
1622+ unsigned block_start, block_end;
1623+ int partial = 0;
1624+ unsigned blocksize;
1625+ struct buffer_head *bh, *head;
1626+ int logbh = 0 ;
1627+
1628+ blocksize = 1 << inode->i_blkbits;
1629+ if (reiserfs_file_data_log(inode)) {
1630+ logbh = 1 ;
1631+ lock_kernel() ;
1632+ /* one for each block + the stat data, the caller closes the handle */
1633+ journal_begin(th, inode->i_sb,(PAGE_CACHE_SIZE >> inode->i_blkbits)+1);
1634+ reiserfs_update_inode_transaction(inode) ;
1635+ unlock_kernel() ;
1636+ }
1637+
1638+ for(bh = head = page->buffers, block_start = 0;
1639+ bh != head || !block_start;
1640+ block_start=block_end, bh = bh->b_this_page) {
1641+ block_end = block_start + blocksize;
1642+ if (block_end <= from || block_start >= to) {
1643+ if (!buffer_uptodate(bh))
1644+ partial = 1;
1645+ } else {
1646+ set_bit(BH_Uptodate, &bh->b_state);
1647+ if (logbh) {
1648+ lock_kernel() ;
1649+ reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
1650+ journal_mark_dirty (th, inode->i_sb, bh);
1651+ unlock_kernel() ;
1652+ } else if (!atomic_set_buffer_dirty(bh)) {
1653+ __mark_dirty(bh);
1654+ if (reiserfs_data_ordered(inode->i_sb)) {
1655+ lock_kernel();
1656+ add_to_flushlist(inode, bh);
1657+ /* if we don't update the inode trans information,
1658+ * an fsync(fd) might not catch these data blocks
1659+ */
1660+ reiserfs_update_inode_transaction(inode);
1661+ unlock_kernel();
1662+ } else {
1663+ buffer_insert_inode_data_queue(bh, inode);
1664+ }
1665+ *balance = 1;
1666+ }
1667+ }
1668+ }
1669+
1670+ /*
1671+ * if this is a partial write that happened to make all buffers
1672+ * uptodate then we can optimize away a bogus readpage() for
1673+ * the next read(). Here we 'discover' whether the page went
1674+ * uptodate as a result of this (potentially partial) write.
1675+ */
1676+ if (!partial)
1677+ SetPageUptodate(page);
1678+ return 0;
1679+}
1680+
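The loop in __commit_write above walks the page's circular ring of buffer heads (linked through b_this_page) and only dirties buffers overlapping [from, to). A user-space sketch of the same walk, assuming a 4K page split into four 1K buffers; the types are made up:

    #include <stdio.h>

    struct buf { unsigned start, end; int dirty; struct buf *next; };

    static void commit_range(struct buf *head, unsigned from, unsigned to)
    {
        struct buf *bh = head;
        do {
            if (bh->end > from && bh->start < to)
                bh->dirty = 1;          /* overlaps the written range */
            bh = bh->next;
        } while (bh != head);
    }

    int main(void)
    {
        struct buf b[4];
        int i;
        for (i = 0; i < 4; i++) {
            b[i].start = i * 1024;
            b[i].end = b[i].start + 1024;
            b[i].dirty = 0;
            b[i].next = &b[(i + 1) % 4];    /* circular, like b_this_page */
        }
        commit_range(&b[0], 1500, 2600);    /* touches buffers 1 and 2 */
        for (i = 0; i < 4; i++)
            printf("buf %d dirty=%d\n", i, b[i].dirty);
        return 0;
    }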
1681 static int reiserfs_commit_write(struct file *f, struct page *page,
1682 unsigned from, unsigned to) {
1683 struct inode *inode = page->mapping->host ;
1684 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1685 int ret ;
1686-
1687+ int need_balance = 0;
1688+ struct reiserfs_transaction_handle th ;
1689+ struct reiserfs_transaction_handle *dth = NULL ;
1690+
1691+ /* we must do this before anything that might nest a transaction or
1692+ ** mess with the handle flags
1693+ */
1694+ if (reiserfs_transaction_running(inode->i_sb)) {
1695+ dth = current->journal_info ;
1696+ if (reiserfs_dangling_handle(dth)) {
1697+ reiserfs_clear_handle_dangling(dth) ;
1698+ } else {
1699+ dth = NULL ;
1700+ }
1701+ }
1702 reiserfs_wait_on_write_block(inode->i_sb) ;
1703+
1704+ th.t_flags = 0 ;
1705+ ret = __commit_write(&th, inode, page, from, to, &need_balance) ;
1706
1707- /* generic_commit_write does this for us, but does not update the
1708- ** transaction tracking stuff when the size changes. So, we have
1709- ** to do the i_size updates here.
1710- */
1711 if (pos > inode->i_size) {
1712- struct reiserfs_transaction_handle th ;
1713 lock_kernel();
1714 /* If the file have grown beyond the border where it
1715 can have a tail, unmark it as needing a tail
1716@@ -2149,24 +2491,135 @@
1717 (have_small_tails (inode->i_sb) && inode->i_size > block_size(inode)) )
1718 inode->u.reiserfs_i.i_flags &= ~i_pack_on_close_mask;
1719
1720- journal_begin(&th, inode->i_sb, 1) ;
1721+ if (!reiserfs_active_handle(&th)) {
1722+ journal_begin(&th, inode->i_sb, 1) ;
1723+ }
1724 reiserfs_update_inode_transaction(inode) ;
1725 inode->i_size = pos ;
1726 reiserfs_update_sd(&th, inode) ;
1727- journal_end(&th, inode->i_sb, 1) ;
1728- unlock_kernel();
1729+ journal_end(&th, th.t_super, th.t_blocks_allocated) ;
1730+ unlock_kernel() ;
1731+ } else if (reiserfs_active_handle(&th)) {
1732+ /* in case commit_write left one running and the i_size update did
1733+ ** not close it
1734+ */
1735+ lock_kernel() ;
1736+ journal_end(&th, th.t_super, th.t_blocks_allocated) ;
1737+ unlock_kernel() ;
1738 }
1739-
1740- ret = generic_commit_write(f, page, from, to) ;
1741
1742- /* we test for O_SYNC here so we can commit the transaction
1743- ** for any packed tails the file might have had
1744+ /* did reiserfs_get_block leave us with a running transaction?
1745 */
1746- if (f && (f->f_flags & O_SYNC)) {
1747+ if (dth) {
1748 lock_kernel() ;
1749- reiserfs_commit_for_inode(inode) ;
1750+ journal_end(dth, dth->t_super, dth->t_blocks_allocated) ;
1751 unlock_kernel();
1752 }
1753+
1754+ kunmap(page) ;
1755+
1756+ if (need_balance)
1757+ balance_dirty();
1758+
1759+ return ret ;
1760+}
1761+
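The i_size update above hinges on pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to, the absolute file offset one byte past the end of this write. A quick standalone check of that arithmetic, assuming 4K pages:

    #include <stdio.h>

    int main(void)
    {
        const int PAGE_CACHE_SHIFT = 12;    /* 4096-byte pages assumed */
        unsigned long index = 3, to = 513;
        long long pos = ((long long)index << PAGE_CACHE_SHIFT) + to;
        printf("write ends at byte %lld\n", pos);  /* 12288 + 513 = 12801 */
        return 0;
    }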
1762+/* decide if this buffer needs to stay around for data logging or ordered
1763+** write purposes
1764+*/
1765+static int flushpage_can_drop(struct inode *inode, struct buffer_head *bh) {
1766+ int ret = 1 ;
1767+
1768+ if (!buffer_mapped(bh)) {
1769+ return 1 ;
1770+ }
1771+ if (reiserfs_file_data_log(inode)) {
1772+ lock_kernel() ;
1773+ /* very conservative, leave the buffer pinned if anyone might need it.
1774+ ** this should be changed to drop the buffer if it is only in the
1775+ ** current transaction
1776+ */
1777+ if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
1778+ ret = 0 ;
1779+ }
1780+ unlock_kernel() ;
1781+ }
1782+ if (reiserfs_data_ordered(inode->i_sb)) {
1783+ if (buffer_dirty(bh) && bh->b_journal_head) {
1784+ struct reiserfs_journal_list *jl = NULL;
1785+ lock_kernel();
1786+
1787+ /* we can race against fsync_inode_buffers if we aren't careful */
1788+ if (buffer_attached(bh) && buffer_dirty(bh))
1789+ jl = bh->b_journal_head;
1790+
1791+ /* why is this safe?
1792+ * reiserfs_setattr updates i_size in the on disk
1793+ * stat data before allowing vmtruncate to be called.
1794+ *
1795+ * If buffer was put onto the ordered list for this
1796+ * transaction, we know for sure either this transaction
1797+ * or an older one already has updated i_size on disk,
1798+ * and this ordered data won't be referenced in the file
1799+ * if we crash.
1800+ *
1801+ * if the buffer was put onto the ordered list for an older
1802+ * transaction, we need to leave it around
1803+ */
1804+ if (jl != SB_JOURNAL(inode->i_sb)->j_current_jl) {
1805+ ret = 0;
1806+ }
1807+ unlock_kernel();
1808+ }
1809+ }
1810+ return ret ;
1811+}
1812+
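For ordered data, the rule in flushpage_can_drop reduces to: a dirty buffer may be dropped only if it was ordered under the currently running transaction, while anything queued for an older list stays pinned until that list commits. A compressed model of the rule, with plain transaction ids standing in for journal lists:

    #include <stdio.h>

    static int can_drop(int dirty, long buf_trans, long cur_trans)
    {
        if (!dirty)
            return 1;                   /* nothing ordered to protect */
        return buf_trans == cur_trans;  /* older lists must keep it */
    }

    int main(void)
    {
        printf("%d\n", can_drop(1, 41, 42)); /* 0: pinned for older list */
        printf("%d\n", can_drop(1, 42, 42)); /* 1: current list, droppable */
        printf("%d\n", can_drop(0, 41, 42)); /* 1: clean, always droppable */
        return 0;
    }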
1813+/* stolen from fs/buffer.c:discard_bh_page */
1814+static int reiserfs_flushpage(struct page *page, unsigned long offset) {
1815+ struct buffer_head *head, *bh, *next;
1816+ struct inode *inode = page->mapping->host ;
1817+ unsigned int curr_off = 0;
1818+ int ret = 1;
1819+
1820+ if (!PageLocked(page))
1821+ BUG();
1822+ if (!page->buffers)
1823+ return 1;
1824+
1825+ head = page->buffers;
1826+ bh = head;
1827+ do {
1828+ unsigned int next_off = curr_off + bh->b_size;
1829+ next = bh->b_this_page;
1830+
1831+ /* is this buffer to be completely truncated away? */
1832+ if (offset <= curr_off) {
1833+ if (flushpage_can_drop(inode, bh))
1834+ discard_buffer(bh);
1835+ else
1836+ ret = 0 ;
1837+ }
1838+ curr_off = next_off;
1839+ bh = next;
1840+ } while (bh != head);
1841+
1842+ /*
1843+ * subtle. We release buffer-heads only if this is
1844+ * the 'final' flushpage. We have invalidated the get_block
1845+ * cached value unconditionally, so real IO is not
1846+ * possible anymore.
1847+ *
1848+ * If the free doesn't work out, the buffers can be
1849+ * left around - they just turn into anonymous buffers
1850+ * instead.
1851+ */
1852+ if (!offset) {
1853+ if (!ret || !try_to_free_buffers(page, 0))
1854+ return 0;
1855+ if (page->buffers)
1856+ BUG();
1857+ }
1858 return ret ;
1859 }
1860
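The truncation loop above discards exactly those buffers whose first byte sits at or beyond the truncation offset (offset <= curr_off); the buffer straddling the offset is kept. A standalone demonstration of that boundary test for a 4K page of 1K buffers:

    #include <stdio.h>

    int main(void)
    {
        unsigned bsize = 1024, offset = 2048, curr = 0;
        int i;
        for (i = 0; i < 4; i++, curr += bsize)
            printf("buf %d (%u..%u): %s\n", i, curr, curr + bsize - 1,
                   offset <= curr ? "discard" : "keep");
        return 0;
    }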
1861@@ -2222,6 +2675,9 @@
1862 struct kiobuf *iobuf, unsigned long blocknr,
1863 int blocksize)
1864 {
1865+ if (reiserfs_data_ordered(inode->i_sb) || reiserfs_file_data_log(inode)) {
1866+ return -EINVAL;
1867+ }
1868 lock_kernel();
1869 reiserfs_commit_for_tail(inode);
1870 unlock_kernel();
1871@@ -2237,4 +2693,5 @@
1872 commit_write: reiserfs_commit_write,
1873 bmap: reiserfs_aop_bmap,
1874 direct_IO: reiserfs_direct_io,
1875+ flushpage: reiserfs_flushpage,
1876 } ;
1877diff -urN linux-2.4.22.org/fs/reiserfs/ioctl.c linux-2.4.22/fs/reiserfs/ioctl.c
1878--- linux-2.4.22.org/fs/reiserfs/ioctl.c 2003-11-21 15:08:29.000000000 +0100
1879+++ linux-2.4.22/fs/reiserfs/ioctl.c 2003-11-21 15:14:23.000000000 +0100
1880@@ -25,12 +25,22 @@
1881 switch (cmd) {
1882 case REISERFS_IOC_UNPACK:
1883 if( S_ISREG( inode -> i_mode ) ) {
1884- if (arg)
1885- return reiserfs_unpack (inode, filp);
1886- else
1887- return 0;
1888+ if (arg) {
1889+ int result;
1890+ result = reiserfs_unpack (inode, filp);
1891+ if (reiserfs_file_data_log(inode)) {
1892+ struct reiserfs_transaction_handle th;
1893+ lock_kernel();
1894+ journal_begin(&th, inode->i_sb, 1);
1895+ SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
1896+ journal_end_sync(&th, inode->i_sb, 1);
1897+ unlock_kernel();
1898+ }
1899+ return result;
1900+ } else
1901+ return 0;
1901 } else
1902- return -ENOTTY;
1903+ return -ENOTTY;
1904 /*
1905 * Following {G,S}ETFLAGS, and {G,S}ETVERSION are providing ext2
1906 * binary compatible interface (used by lsattr(1), and chattr(1)) and
1907@@ -97,6 +106,7 @@
1908 int retval = 0;
1909 int index ;
1910 struct page *page ;
1911+ struct address_space *mapping ;
1912 unsigned long write_from ;
1913 unsigned long blocksize = inode->i_sb->s_blocksize ;
1914
1915@@ -127,19 +137,20 @@
1916 ** reiserfs_get_block to unpack the tail for us.
1917 */
1918 index = inode->i_size >> PAGE_CACHE_SHIFT ;
1919- page = grab_cache_page(inode->i_mapping, index) ;
1920+ mapping = inode->i_mapping ;
1921+ page = grab_cache_page(mapping, index) ;
1922 retval = -ENOMEM;
1923 if (!page) {
1924 goto out ;
1925 }
1926- retval = reiserfs_prepare_write(NULL, page, write_from, blocksize) ;
1927+ retval = mapping->a_ops->prepare_write(NULL, page, write_from, write_from) ;
1928 if (retval)
1929 goto out_unlock ;
1930
1931 /* conversion can change page contents, must flush */
1932 flush_dcache_page(page) ;
1933 inode->u.reiserfs_i.i_flags |= i_nopack_mask;
1934- kunmap(page) ; /* mapped by prepare_write */
1935+ retval = mapping->a_ops->commit_write(NULL, page, write_from, write_from) ;
1936
1937 out_unlock:
1938 UnlockPage(page) ;
1939diff -urN linux-2.4.22.org/fs/reiserfs/journal.c linux-2.4.22/fs/reiserfs/journal.c
1940--- linux-2.4.22.org/fs/reiserfs/journal.c 2003-11-21 15:08:29.000000000 +0100
1941+++ linux-2.4.22/fs/reiserfs/journal.c 2003-11-21 15:14:23.000000000 +0100
1942@@ -33,17 +33,17 @@
1943 ** -- Note, if you call this as an immediate flush from
1944 ** from within kupdate, it will ignore the immediate flag
1945 **
1946-** The commit thread -- a writer process for async commits. It allows a
1947-** a process to request a log flush on a task queue.
1948-** the commit will happen once the commit thread wakes up.
1949-** The benefit here is the writer (with whatever
1950-** related locks it has) doesn't have to wait for the
1951-** log blocks to hit disk if it doesn't want to.
1952+** The commit thread -- a writer process for metadata and async commits.
1953+** this allows us to do less I/O with the journal lock
1954+** held.
1955 */
1956
1957+#define EXPORT_SYMTAB
1958+#include <linux/module.h>
1959 #include <linux/config.h>
1960 #include <asm/uaccess.h>
1961 #include <asm/system.h>
1962+#include <linux/init.h>
1963
1964 #include <linux/sched.h>
1965 #include <asm/semaphore.h>
1966@@ -59,17 +59,25 @@
1967 #include <linux/string.h>
1968 #include <linux/smp_lock.h>
1969
1970+/* gets a struct reiserfs_journal_list * from a list head */
1971+#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
1972+ j_list))
1973+#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
1974+ j_working_list))
1975+
1976 /* the number of mounted filesystems. This is used to decide when to
1977 ** start and kill the commit thread
1978 */
1979 static int reiserfs_mounted_fs_count = 0 ;
1980
1981-/* wake this up when you add something to the commit thread task queue */
1982+static struct list_head kreiserfsd_supers = LIST_HEAD_INIT(kreiserfsd_supers);
1983+
1984+/* wake this up when you want help from the commit thread */
1985 DECLARE_WAIT_QUEUE_HEAD(reiserfs_commit_thread_wait) ;
1986
1987-/* wait on this if you need to be sure you task queue entries have been run */
1988+/* so we can wait for the commit thread to make progress */
1989 static DECLARE_WAIT_QUEUE_HEAD(reiserfs_commit_thread_done) ;
1990-DECLARE_TASK_QUEUE(reiserfs_commit_thread_tq) ;
1991+DECLARE_MUTEX(kreiserfsd_sem) ;
1992
1993 #define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit
1994 structs at 4k */
1995@@ -82,6 +90,9 @@
1996
1997 #define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */
1998
1999+/* journal list state bits */
2000+#define LIST_TOUCHED 1
2001+
2002 /* flags for do_journal_end */
2003 #define FLUSH_ALL 1 /* flush commit and real blocks */
2004 #define COMMIT_NOW 2 /* end and commit this transaction */
2005@@ -89,6 +100,9 @@
2006
2007 /* state bits for the journal */
2008 #define WRITERS_BLOCKED 1 /* set when new writers not allowed */
2009+#define WRITERS_QUEUED 2 /* set when log is full due to too many
2010+ * writers
2011+ */
2012
2013 static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ;
2014 static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ;
2015@@ -107,7 +121,7 @@
2016 ** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for
2017 ** more details.
2018 */
2019-static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) {
2020+static inline int reiserfs_clean_and_file_buffer(struct buffer_head *bh) {
2021 if (bh) {
2022 clear_bit(BH_Dirty, &bh->b_state) ;
2023 refile_buffer(bh) ;
2024@@ -473,6 +487,8 @@
2025 int pop_journal_writer(int index) {
2026 #ifdef CONFIG_REISERFS_CHECK
2027 if (index >= 0) {
2028+ if (index >= 512)
2029+ BUG();
2030 journal_writers[index] = NULL ;
2031 }
2032 #endif
2033@@ -522,6 +538,12 @@
2034 return 0 ;
2035 }
2036
2037+ /* when data logging is on, no special action is needed for the data
2038+ * blocks
2039+ */
2040+ if (reiserfs_data_log(p_s_sb))
2041+ search_all = 0;
2042+
2043 PROC_INFO_INC( p_s_sb, journal.in_journal );
2044 /* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
2045 ** if we crash before the transaction that freed it commits, this transaction won't
2046@@ -549,6 +571,7 @@
2047
2048 /* is it in the current transaction. This should never happen */
2049 if ((cn = get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_hash_table, dev,bl,size))) {
2050+ BUG();
2051 return 1;
2052 }
2053
2054@@ -574,17 +597,12 @@
2055 /* lock the current transaction */
2056 inline static void lock_journal(struct super_block *p_s_sb) {
2057 PROC_INFO_INC( p_s_sb, journal.lock_journal );
2058- while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) {
2059- PROC_INFO_INC( p_s_sb, journal.lock_journal_wait );
2060- sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
2061- }
2062- atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 1) ;
2063+ down(&SB_JOURNAL(p_s_sb)->j_lock);
2064 }
2065
2066 /* unlock the current transaction */
2067 inline static void unlock_journal(struct super_block *p_s_sb) {
2068- atomic_dec(&(SB_JOURNAL(p_s_sb)->j_wlock)) ;
2069- wake_up(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
2070+ up(&SB_JOURNAL(p_s_sb)->j_lock);
2071 }
2072
2073 /*
2074@@ -602,6 +620,83 @@
2075 jl->j_list_bitmap = NULL ;
2076 }
2077
2078+static int journal_list_still_alive(struct super_block *s,
2079+ unsigned long trans_id)
2080+{
2081+ struct list_head *entry = &SB_JOURNAL(s)->j_journal_list;
2082+ struct reiserfs_journal_list *jl;
2083+
2084+ if (!list_empty(entry)) {
2085+ jl = JOURNAL_LIST_ENTRY(entry->next);
2086+ if (jl->j_trans_id <= trans_id) {
2087+ return 1;
2088+ }
2089+ }
2090+ return 0;
2091+}
2092+
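journal_list_still_alive leans on the invariant that journal lists are freed strictly oldest-first: if the oldest list still on the chain has an id no greater than the one being asked about, that list must still exist. The same check over a plain array of ids:

    #include <stdio.h>

    static long ids[] = { 40, 41, 42 };     /* on-chain lists, oldest first */

    static int still_alive(long trans_id)
    {
        return ids[0] <= trans_id;          /* freed oldest-first */
    }

    int main(void)
    {
        printf("%d %d\n", still_alive(41), still_alive(39));  /* 1 0 */
        return 0;
    }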
2093+static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) {
2094+ struct reiserfs_journal_list *other_jl;
2095+ struct reiserfs_journal_list *first_jl;
2096+ struct list_head *entry;
2097+ unsigned long trans_id = jl->j_trans_id;
2098+ unsigned long other_trans_id;
2099+ unsigned long first_trans_id;
2100+
2101+find_first:
2102+ /*
2103+ * first we walk backwards to find the oldest uncommitted transaction
2104+ */
2105+ first_jl = jl;
2106+ entry = jl->j_list.prev;
2107+ while(1) {
2108+ other_jl = JOURNAL_LIST_ENTRY(entry);
2109+ if (entry == &SB_JOURNAL(s)->j_journal_list ||
2110+ atomic_read(&other_jl->j_older_commits_done))
2111+ break;
2112+
2113+ first_jl = other_jl;
2114+ entry = other_jl->j_list.prev;
2115+ }
2116+
2117+ /* if we didn't find any older uncommitted transactions, return now */
2118+ if (first_jl == jl) {
2119+ return 0;
2120+ }
2121+
2122+ first_trans_id = first_jl->j_trans_id;
2123+
2124+ entry = &first_jl->j_list;
2125+ while(1) {
2126+ other_jl = JOURNAL_LIST_ENTRY(entry);
2127+ other_trans_id = other_jl->j_trans_id;
2128+
2129+ if (other_trans_id < trans_id) {
2130+ if (atomic_read(&other_jl->j_commit_left) != 0) {
2131+ flush_commit_list(s, other_jl, 0);
2132+
2133+ /* list we were called with is gone, return */
2134+ if (!journal_list_still_alive(s, trans_id))
2135+ return 1;
2136+
2137+ /* the one we just flushed is gone, this means all
2138+ * older lists are also gone, so first_jl is no longer
2139+ * valid either. Go back to the beginning.
2140+ */
2141+ if (!journal_list_still_alive(s, other_trans_id)) {
2142+ goto find_first;
2143+ }
2144+ }
2145+ entry = entry->next;
2146+ if (entry == &SB_JOURNAL(s)->j_journal_list)
2147+ return 0;
2148+ } else {
2149+ return 0;
2150+ }
2151+ }
2152+ return 0;
2153+}
2154+
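Note how flush_older_commits restarts its scan from scratch whenever a flush may have freed lists out from under it, instead of trying to repair stale pointers. The retry shape, reduced to an array of pending flags; flush() here is a stub that pretends one flush invalidates the walk:

    #include <stdio.h>

    static int pending[6] = { 1, 1, 0, 1, 0, 0 };

    /* returns 1 if lists were freed, invalidating the walk */
    static int flush(int i)
    {
        pending[i] = 0;
        return i == 1;          /* pretend flushing list 1 frees others */
    }

    int main(void)
    {
        int target = 3;         /* flush everything older than this */
    restart:
        for (int i = 0; i < target; i++) {
            if (pending[i] && flush(i))
                goto restart;   /* lists changed, rescan */
        }
        printf("all commits older than %d flushed\n", target);
        return 0;
    }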
2155 /*
2156 ** if this journal list still has commit blocks unflushed, send them to disk.
2157 **
2158@@ -611,16 +706,19 @@
2159 */
2160 static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) {
2161 int i, count ;
2162- int index = 0 ;
2163 int bn ;
2164 int retry_count = 0 ;
2165 int orig_commit_left = 0 ;
2166 struct buffer_head *tbh = NULL ;
2167- struct reiserfs_journal_list *other_jl ;
2168+ unsigned long trans_id = jl->j_trans_id;
2169
2170 reiserfs_check_lock_depth("flush_commit_list") ;
2171
2172 if (atomic_read(&jl->j_older_commits_done)) {
2173+ if (!list_empty(&jl->j_ordered_bh_list))
2174+ BUG();
2175+ if (!list_empty(&jl->j_tail_bh_list))
2176+ BUG();
2177 return 0 ;
2178 }
2179
2180@@ -628,50 +726,51 @@
2181 ** us is on disk too
2182 */
2183 if (jl->j_len <= 0) {
2184+ BUG();
2185 return 0 ;
2186 }
2187+ if (trans_id == SB_JOURNAL(s)->j_trans_id)
2188+ BUG();
2189+
2190 if (flushall) {
2191- /* we _must_ make sure the transactions are committed in order. Start with the
2192- ** index after this one, wrap all the way around
2193- */
2194- index = (jl - SB_JOURNAL_LIST(s)) + 1 ;
2195- for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
2196- other_jl = SB_JOURNAL_LIST(s) + ( (index + i) % JOURNAL_LIST_COUNT) ;
2197- if (other_jl && other_jl != jl && other_jl->j_len > 0 && other_jl->j_trans_id > 0 &&
2198- other_jl->j_trans_id <= jl->j_trans_id && (atomic_read(&(jl->j_older_commits_done)) == 0)) {
2199- flush_commit_list(s, other_jl, 0) ;
2200- }
2201+ if (flush_older_commits(s, jl) == 1) {
2202+ /* list disappeared during flush_older_commits. return */
2203+ return 0;
2204 }
2205 }
2206
2207 count = 0 ;
2208- /* don't flush the commit list for the current transactoin */
2209- if (jl == ((SB_JOURNAL_LIST(s) + SB_JOURNAL_LIST_INDEX(s)))) {
2210- return 0 ;
2211- }
2212
2213 /* make sure nobody is trying to flush this one at the same time */
2214- if (atomic_read(&(jl->j_commit_flushing))) {
2215- sleep_on(&(jl->j_commit_wait)) ;
2216- if (flushall) {
2217- atomic_set(&(jl->j_older_commits_done), 1) ;
2218- }
2219- return 0 ;
2220+ down(&jl->j_commit_lock);
2221+ if (!journal_list_still_alive(s, trans_id)) {
2222+ up(&jl->j_commit_lock);
2223+ return 0;
2224 }
2225+ if (jl->j_trans_id == 0)
2226+ BUG();
2227
2228 /* this commit is done, exit */
2229 if (atomic_read(&(jl->j_commit_left)) <= 0) {
2230 if (flushall) {
2231 atomic_set(&(jl->j_older_commits_done), 1) ;
2232 }
2233+ if (!list_empty(&jl->j_ordered_bh_list))
2234+ BUG();
2235+ if (!list_empty(&jl->j_tail_bh_list))
2236+ BUG();
2237+ up(&jl->j_commit_lock);
2238 return 0 ;
2239 }
2240- /* keeps others from flushing while we are flushing */
2241- atomic_set(&(jl->j_commit_flushing), 1) ;
2242-
2243
2244+ /* write any buffers that must hit disk before the commit is done */
2245+ while(!list_empty(&jl->j_ordered_bh_list)) {
2246+ unlock_kernel();
2247+ fsync_buffers_list(&jl->j_ordered_bh_list);
2248+ lock_kernel();
2249+ }
2250 if (jl->j_len > SB_JOURNAL_TRANS_MAX(s)) {
2251- reiserfs_panic(s, "journal-512: flush_commit_list: length is %lu, list number %d\n", jl->j_len, jl - SB_JOURNAL_LIST(s)) ;
2252+ reiserfs_panic(s, "journal-512: flush_commit_list: length is %lu, trans_id %lu\n", jl->j_len, jl->j_trans_id) ;
2253 return 0 ;
2254 }
2255
2256@@ -701,7 +800,7 @@
2257 if (buffer_dirty(tbh)) {
2258 reiserfs_warning(s, "journal-569: flush_commit_list, block already dirty!\n") ;
2259 } else {
2260- mark_buffer_dirty(tbh) ;
2261+ atomic_set_buffer_dirty(tbh);
2262 }
2263 ll_rw_block(WRITE, 1, &tbh) ;
2264 count++ ;
2265@@ -745,16 +844,22 @@
2266 atomic_dec(&(jl->j_commit_left)) ;
2267 bforget(jl->j_commit_bh) ;
2268
2269+ if (SB_JOURNAL(s)->j_last_commit_id != 0 &&
2270+ (jl->j_trans_id - SB_JOURNAL(s)->j_last_commit_id) != 1) {
2271+ reiserfs_warning(s, "clm-2200: dev %s, last commit %lu, current %lu\n",
2272+ kdevname(s->s_dev), SB_JOURNAL(s)->j_last_commit_id,
2273+ jl->j_trans_id);
2274+ }
2275+ SB_JOURNAL(s)->j_last_commit_id = jl->j_trans_id;
2276+
2277 /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */
2278 cleanup_freed_for_journal_list(s, jl) ;
2279
2280 if (flushall) {
2281 atomic_set(&(jl->j_older_commits_done), 1) ;
2282 }
2283- atomic_set(&(jl->j_commit_flushing), 0) ;
2284- wake_up(&(jl->j_commit_wait)) ;
2285+ up(&jl->j_commit_lock);
2286
2287- s->s_dirt = 1 ;
2288 return 0 ;
2289 }
2290
2291@@ -853,22 +958,27 @@
2292 ** flush any and all journal lists older than you are
2293 ** can only be called from flush_journal_list
2294 */
2295-static int flush_older_journal_lists(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, unsigned long trans_id) {
2296- int i, index ;
2297- struct reiserfs_journal_list *other_jl ;
2298-
2299- index = jl - SB_JOURNAL_LIST(p_s_sb) ;
2300- for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
2301- other_jl = SB_JOURNAL_LIST(p_s_sb) + ((index + i) % JOURNAL_LIST_COUNT) ;
2302- if (other_jl && other_jl->j_len > 0 &&
2303- other_jl->j_trans_id > 0 &&
2304- other_jl->j_trans_id < trans_id &&
2305- other_jl != jl) {
2306- /* do not flush all */
2307- flush_journal_list(p_s_sb, other_jl, 0) ;
2308+static int flush_older_journal_lists(struct super_block *p_s_sb,
2309+ struct reiserfs_journal_list *jl)
2310+{
2311+ struct list_head *entry;
2312+ struct reiserfs_journal_list *other_jl ;
2313+ unsigned long trans_id = jl->j_trans_id;
2314+
2315+ /* we know we are the only ones flushing things, no extra race
2316+ * protection is required.
2317+ */
2318+restart:
2319+ entry = SB_JOURNAL(p_s_sb)->j_journal_list.next;
2320+ other_jl = JOURNAL_LIST_ENTRY(entry);
2321+ if (other_jl->j_trans_id < trans_id) {
2322+ /* do not flush all */
2323+ flush_journal_list(p_s_sb, other_jl, 0) ;
2324+
2325+ /* other_jl is now deleted from the list */
2326+ goto restart;
2327 }
2328- }
2329- return 0 ;
2330+ return 0 ;
2331 }
2332
2333 static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
2334@@ -881,14 +991,23 @@
2335 put_bh(bh) ;
2336 }
2337 static void submit_logged_buffer(struct buffer_head *bh) {
2338- lock_buffer(bh) ;
2339 get_bh(bh) ;
2340 bh->b_end_io = reiserfs_end_buffer_io_sync ;
2341 mark_buffer_notjournal_new(bh) ;
2342 clear_bit(BH_Dirty, &bh->b_state) ;
2343+ if (!buffer_uptodate(bh))
2344+ BUG();
2345 submit_bh(WRITE, bh) ;
2346 }
2347
2348+static void del_from_work_list(struct super_block *s,
2349+ struct reiserfs_journal_list *jl) {
2350+ if (!list_empty(&jl->j_working_list)) {
2351+ list_del_init(&jl->j_working_list);
2352+ SB_JOURNAL(s)->j_num_work_lists--;
2353+ }
2354+}
2355+
2356 /* flush a journal list, both commit and real blocks
2357 **
2358 ** always set flushall to 1, unless you are calling from inside
2359@@ -909,29 +1028,27 @@
2360 unsigned long j_len_saved = jl->j_len ;
2361
2362 if (j_len_saved <= 0) {
2363- return 0 ;
2364+ BUG();
2365 }
2366
2367 if (atomic_read(&SB_JOURNAL(s)->j_wcount) != 0) {
2368 reiserfs_warning(s, "clm-2048: flush_journal_list called with wcount %d\n",
2369 atomic_read(&SB_JOURNAL(s)->j_wcount)) ;
2370 }
2371- /* if someone is getting the commit list, we must wait for them */
2372- while (atomic_read(&(jl->j_commit_flushing))) {
2373- sleep_on(&(jl->j_commit_wait)) ;
2374- }
2375- /* if someone is flushing this list, we must wait for them */
2376- while (atomic_read(&(jl->j_flushing))) {
2377- sleep_on(&(jl->j_flush_wait)) ;
2378- }
2379
2380- /* this list is now ours, we can change anything we want */
2381- atomic_set(&(jl->j_flushing), 1) ;
2382+ if (jl->j_trans_id == 0)
2383+ BUG();
2384+
2385+ /* if flushall == 0, the lock is already held */
2386+ if (flushall) {
2387+ down(&SB_JOURNAL(s)->j_flush_sem);
2388+ } else if (!down_trylock(&SB_JOURNAL(s)->j_flush_sem)) {
2389+ BUG();
2390+ }
2391
2392 count = 0 ;
2393 if (j_len_saved > SB_JOURNAL_TRANS_MAX(s)) {
2394- reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, list number %d\n", j_len_saved, jl - SB_JOURNAL_LIST(s)) ;
2395- atomic_dec(&(jl->j_flushing)) ;
2396+ reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, transid %lu\n", j_len_saved, jl->j_trans_id) ;
2397 return 0 ;
2398 }
2399
2400@@ -981,13 +1098,13 @@
2401 get_bh(saved_bh) ;
2402
2403 if (buffer_journal_dirty(saved_bh)) {
2404+ if (!can_dirty(cn))
2405+ BUG();
2406 was_jwait = 1 ;
2407- mark_buffer_notjournal_dirty(saved_bh) ;
2408- /* undo the inc from journal_mark_dirty */
2409- put_bh(saved_bh) ;
2410- }
2411- if (can_dirty(cn)) {
2412- was_dirty = 1 ;
2413+ was_dirty = 1;
2414+ } else if (can_dirty(cn)) {
2415+ /* everything with !pjl && jwait should be writable */
2416+ BUG();
2417 }
2418 }
2419
2420@@ -995,7 +1112,8 @@
2421 ** sure they are commited, and don't try writing it to disk
2422 */
2423 if (pjl) {
2424- flush_commit_list(s, pjl, 1) ;
2425+ if (atomic_read(&pjl->j_commit_left))
2426+ flush_commit_list(s, pjl, 1) ;
2427 goto free_cnode ;
2428 }
2429
2430@@ -1029,7 +1147,12 @@
2431 /* we inc again because saved_bh gets decremented at free_cnode */
2432 get_bh(saved_bh) ;
2433 set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
2434+ lock_buffer(saved_bh);
2435 submit_logged_buffer(saved_bh) ;
2436+ if (cn->blocknr != saved_bh->b_blocknr) {
2437+printk("cn %lu does not match bh %lu\n", cn->blocknr, saved_bh->b_blocknr);
2438+ BUG();
2439+ }
2440 count++ ;
2441 } else {
2442 reiserfs_warning(s, "clm-2082: Unable to flush buffer %lu in flush_journal_list\n",
2443@@ -1057,9 +1180,23 @@
2444 if (!cn->bh) {
2445 reiserfs_panic(s, "journal-1012: cn->bh is NULL\n") ;
2446 }
2447+ if (cn->blocknr != cn->bh->b_blocknr) {
2448+printk("2cn %lu does not match bh %lu\n", cn->blocknr, cn->bh->b_blocknr);
2449+ BUG();
2450+ }
2451 if (!buffer_uptodate(cn->bh)) {
2452- reiserfs_panic(s, "journal-949: buffer write failed\n") ;
2453+ reiserfs_panic(s, "journal-949: buffer %lu write failed\n", cn->bh->b_blocknr) ;
2454 }
2455+
2456+ /* note, we must clear the JDirty_wait bit after the up to date
2457+ ** check, otherwise we race against our flushpage routine
2458+ */
2459+ if (!test_and_clear_bit(BH_JDirty_wait, &cn->bh->b_state))
2460+ BUG();
2461+
2462+ /* undo the inc from journal_mark_dirty */
2463+ put_bh(cn->bh) ;
2464+
2465 refile_buffer(cn->bh) ;
2466 brelse(cn->bh) ;
2467 }
2468@@ -1074,7 +1211,7 @@
2469 ** replayed after a crash
2470 */
2471 if (flushall) {
2472- flush_older_journal_lists(s, jl, jl->j_trans_id) ;
2473+ flush_older_journal_lists(s, jl);
2474 }
2475
2476 /* before we can remove everything from the hash tables for this
2477@@ -1089,46 +1226,137 @@
2478 update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ;
2479 }
2480 remove_all_from_journal_list(s, jl, 0) ;
2481+ list_del(&jl->j_list);
2482+ SB_JOURNAL(s)->j_num_lists--;
2483+ del_from_work_list(s, jl);
2484+
2485+ if (SB_JOURNAL(s)->j_last_flush_id != 0 &&
2486+ (jl->j_trans_id - SB_JOURNAL(s)->j_last_flush_id) != 1) {
2487+ reiserfs_warning(s, "clm-2201: dev %s, last flush %lu, current %lu\n",
2488+ kdevname(s->s_dev), SB_JOURNAL(s)->j_last_flush_id,
2489+ jl->j_trans_id);
2490+ }
2491+ SB_JOURNAL(s)->j_last_flush_id = jl->j_trans_id;
2492+
2493+ /* not strictly required since we are freeing the list, but it should
2494+ * help find code using dead lists later on
2495+ */
2496 jl->j_len = 0 ;
2497 atomic_set(&(jl->j_nonzerolen), 0) ;
2498 jl->j_start = 0 ;
2499 jl->j_realblock = NULL ;
2500 jl->j_commit_bh = NULL ;
2501 jl->j_trans_id = 0 ;
2502- atomic_dec(&(jl->j_flushing)) ;
2503- wake_up(&(jl->j_flush_wait)) ;
2504+ jl->j_state = 0;
2505+
2506+ if (!list_empty(&jl->j_ordered_bh_list))
2507+ BUG();
2508+
2509+ if (!list_empty(&jl->j_tail_bh_list))
2510+ BUG();
2511+
2512+ // kmem_cache_free(journal_list_cachep, jl);
2513+ reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s);
2514+
2515+ if (flushall)
2516+ up(&SB_JOURNAL(s)->j_flush_sem);
2517 return 0 ;
2518 }
2519
2520
2521-static int kupdate_one_transaction(struct super_block *s,
2522+#define CHUNK_SIZE 32
2523+struct buffer_chunk {
2524+ struct buffer_head *bh[CHUNK_SIZE];
2525+ int nr;
2526+};
2527+
2528+static void write_chunk(struct buffer_chunk *chunk) {
2529+ int i;
2530+ for (i = 0; i < chunk->nr ; i++) {
2531+ submit_logged_buffer(chunk->bh[i]) ;
2532+ }
2533+ chunk->nr = 0;
2534+}
2535+
2536+static void add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh) {
2537+ if (chunk->nr >= CHUNK_SIZE)
2538+ BUG();
2539+ chunk->bh[chunk->nr++] = bh;
2540+ if (chunk->nr >= CHUNK_SIZE)
2541+ write_chunk(chunk);
2542+}
2543+
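write_chunk/add_to_chunk above form a straightforward batcher: accumulate up to CHUNK_SIZE buffers, auto-submit when full, and let the caller drain whatever is left over. The same pattern in user space, with printf standing in for submit_logged_buffer:

    #include <stdio.h>

    #define CHUNK_SIZE 32

    struct chunk { int items[CHUNK_SIZE]; int nr; };

    static void flush_chunk(struct chunk *c)
    {
        for (int i = 0; i < c->nr; i++)
            printf("submit %d\n", c->items[i]);
        c->nr = 0;
    }

    static void add(struct chunk *c, int item)
    {
        c->items[c->nr++] = item;
        if (c->nr >= CHUNK_SIZE)
            flush_chunk(c);             /* auto-flush when full */
    }

    int main(void)
    {
        struct chunk c = { .nr = 0 };
        for (int i = 0; i < 40; i++)
            add(&c, i);
        if (c.nr)
            flush_chunk(&c);            /* caller drains the remainder */
        return 0;
    }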
2544+static int write_one_transaction(struct super_block *s,
2545+ struct reiserfs_journal_list *jl,
2546+ struct buffer_chunk *chunk)
2547+{
2548+ struct reiserfs_journal_list *pjl ; /* previous list for this cn */
2549+ struct reiserfs_journal_cnode *cn;
2550+ int ret = 0 ;
2551+
2552+ jl->j_state |= LIST_TOUCHED;
2553+ if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
2554+ del_from_work_list(s, jl);
2555+ return 0;
2556+ }
2557+ del_from_work_list(s, jl);
2558+
2559+ cn = jl->j_realblock ;
2560+ while(cn) {
2561+ /* if the blocknr == 0, this has been cleared from the hash,
2562+ ** skip it
2563+ */
2564+ if (cn->blocknr == 0) {
2565+ goto next ;
2566+ }
2567+ /* look for a more recent transaction that logged this
2568+ ** buffer. Only the most recent transaction with a buffer in
2569+ ** it is allowed to send that buffer to disk
2570+ */
2571+ pjl = find_newer_jl_for_cn(cn) ;
2572+ if (!pjl && cn->bh && buffer_journal_dirty(cn->bh) && can_dirty(cn)) {
2573+ if (!test_bit(BH_JPrepared, &cn->bh->b_state)) {
2574+ struct buffer_head *tmp_bh;
2575+ /* we can race against journal_mark_freed when we try
2576+ * to lock_buffer(cn->bh), so we have to inc the buffer
2577+ * count, and recheck things after locking
2578+ */
2579+ tmp_bh = cn->bh;
2580+ get_bh(tmp_bh);
2581+ set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
2582+ lock_buffer(tmp_bh);
2583+ if (cn->bh && buffer_journal_dirty(tmp_bh) &&
2584+ !test_bit(BH_JPrepared, &tmp_bh->b_state))
2585+ {
2586+ add_to_chunk(chunk, tmp_bh);
2587+ ret++;
2588+ } else {
2589+ /* note, cn->bh might be null now */
2590+ unlock_buffer(tmp_bh);
2591+ }
2592+ put_bh(tmp_bh);
2593+ }
2594+ }
2595+next:
2596+ cn = cn->next ;
2597+ if (current->need_resched)
2598+ schedule();
2599+ }
2600+ return ret ;
2601+}
2602+
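The find_newer_jl_for_cn test above encodes a newest-owner-wins rule: a block that was logged again by a newer transaction must not be written from an older list, or stale data could overwrite the newer copy on disk. The rule in miniature, with hypothetical block lists:

    #include <stdio.h>

    int main(void)
    {
        long blocks[] = { 100, 101, 102 };  /* blocks in this old list */
        long newer[]  = { 101 };            /* also logged by a newer list */
        int i, j, skip;

        for (i = 0; i < 3; i++) {
            skip = 0;
            for (j = 0; j < 1; j++)
                if (blocks[i] == newer[j])
                    skip = 1;               /* the newer owner writes it */
            printf("block %ld: %s\n", blocks[i], skip ? "skip" : "write");
        }
        return 0;
    }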
2603+static int wait_one_transaction(struct super_block *s,
2604 struct reiserfs_journal_list *jl)
2605 {
2606 struct reiserfs_journal_list *pjl ; /* previous list for this cn */
2607 struct reiserfs_journal_cnode *cn, *walk_cn ;
2608 unsigned long blocknr ;
2609- int run = 0 ;
2610- int orig_trans_id = jl->j_trans_id ;
2611 struct buffer_head *saved_bh ;
2612 int ret = 0 ;
2613
2614- /* if someone is getting the commit list, we must wait for them */
2615- while (atomic_read(&(jl->j_commit_flushing))) {
2616- sleep_on(&(jl->j_commit_wait)) ;
2617- }
2618- /* if someone is flushing this list, we must wait for them */
2619- while (atomic_read(&(jl->j_flushing))) {
2620- sleep_on(&(jl->j_flush_wait)) ;
2621- }
2622- /* was it flushed while we slept? */
2623- if (jl->j_len <= 0 || jl->j_trans_id != orig_trans_id) {
2624- return 0 ;
2625+ if (atomic_read(&jl->j_commit_left) != 0 || jl->j_len <= 0) {
2626+ BUG();
2627 }
2628-
2629- /* this list is now ours, we can change anything we want */
2630- atomic_set(&(jl->j_flushing), 1) ;
2631-
2632-loop_start:
2633 cn = jl->j_realblock ;
2634 while(cn) {
2635 saved_bh = NULL ;
2636@@ -1143,27 +1371,14 @@
2637 ** it is allowed to send that buffer to disk
2638 */
2639 pjl = find_newer_jl_for_cn(cn) ;
2640- if (run == 0 && !pjl && cn->bh && buffer_journal_dirty(cn->bh) &&
2641- can_dirty(cn))
2642- {
2643- if (!test_bit(BH_JPrepared, &cn->bh->b_state)) {
2644- set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
2645- submit_logged_buffer(cn->bh) ;
2646- } else {
2647- /* someone else is using this buffer. We can't
2648- ** send it to disk right now because they might
2649- ** be changing/logging it.
2650- */
2651- ret = 1 ;
2652- }
2653- } else if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
2654+ if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
2655 clear_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
2656 if (!pjl && cn->bh) {
2657 wait_on_buffer(cn->bh) ;
2658- }
2659- /* check again, someone could have logged while we scheduled */
2660- pjl = find_newer_jl_for_cn(cn) ;
2661+ /* check again, someone could have logged while we scheduled */
2662+ pjl = find_newer_jl_for_cn(cn) ;
2663
2664+ }
2665 /* before the JDirty_wait bit is set, the
2666 ** buffer is added to the hash list. So, if we are
2667 ** run in the middle of a do_journal_end, we will notice
2668@@ -1210,60 +1425,182 @@
2669 }
2670 next:
2671 cn = cn->next ;
2672+ if (current->need_resched)
2673+ schedule();
2674 }
2675- /* the first run through the loop sends all the dirty buffers to
2676- ** ll_rw_block.
2677- ** the second run through the loop does all the accounting
2678- */
2679- if (run++ == 0) {
2680- goto loop_start ;
2681+ return ret ;
2682+}
2683+
2684+static int kupdate_transactions(struct super_block *s,
2685+ struct reiserfs_journal_list *jl,
2686+ struct reiserfs_journal_list **next_jl,
2687+ unsigned long *next_trans_id,
2688+ int num_blocks,
2689+ int num_trans) {
2690+ int ret = 0;
2691+ int written = 0 ;
2692+ int transactions_flushed = 0;
2693+ unsigned long orig_trans_id = jl->j_trans_id;
2694+ struct reiserfs_journal_list *orig_jl = jl;
2695+ struct buffer_chunk chunk;
2696+ struct list_head *entry;
2697+ chunk.nr = 0;
2698+
2699+ down(&SB_JOURNAL(s)->j_flush_sem);
2700+ if (!journal_list_still_alive(s, orig_trans_id)) {
2701+ goto done;
2702 }
2703
2704- atomic_set(&(jl->j_flushing), 0) ;
2705- wake_up(&(jl->j_flush_wait)) ;
2706- return ret ;
2707+ /* we've got j_flush_sem held, nobody is going to delete any
2708+ * of these lists out from underneath us
2709+ */
2710+ while((num_trans && transactions_flushed < num_trans) ||
2711+ (!num_trans && written < num_blocks)) {
2712+
2713+ if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
2714+ atomic_read(&jl->j_commit_left))
2715+ {
2716+ del_from_work_list(s, jl);
2717+ break;
2718+ }
2719+ ret = write_one_transaction(s, jl, &chunk);
2720+
2721+ if (ret < 0)
2722+ goto done;
2723+ transactions_flushed++;
2724+ written += ret;
2725+ entry = jl->j_list.next;
2726+
2727+ /* did we wrap? */
2728+ if (entry == &SB_JOURNAL(s)->j_journal_list) {
2729+ break;
2730+ }
2731+ jl = JOURNAL_LIST_ENTRY(entry);
2732+
2733+ /* don't bother with older transactions */
2734+ if (jl->j_trans_id <= orig_trans_id)
2735+ break;
2736+ }
2737+ if (chunk.nr) {
2738+ write_chunk(&chunk);
2739+ }
2740+
2741+ jl = orig_jl;
2742+ *next_jl = jl;
2743+ *next_trans_id = jl->j_trans_id;
2744+ ret = transactions_flushed;
2745+ while(transactions_flushed--) {
2746+
2747+ wait_one_transaction(s, jl);
2748+ entry = jl->j_list.next;
2749+ jl = JOURNAL_LIST_ENTRY(entry);
2750+
2751+ /* make sure we can really count */
2752+ if (jl->j_trans_id <= orig_trans_id && transactions_flushed > 0) {
2753+printk("flushing %s %lu, orig_trans_id was %lu\n", kdevname(s->s_dev), jl->j_trans_id, orig_trans_id);
2754+ BUG();
2755+ }
2756+ *next_jl = jl;
2757+ *next_trans_id = jl->j_trans_id;
2758+ }
2759+
2760+done:
2761+ up(&SB_JOURNAL(s)->j_flush_sem);
2762+ return ret;
2763 }
2764+
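kupdate_transactions is deliberately two-phase: the first pass submits writes across several lists, and the second waits on exactly as many transactions as were submitted, so the waiting overlaps the in-flight I/O instead of serializing list by list. The shape of that split, with stub I/O calls:

    #include <stdio.h>

    static void start_io(int t) { printf("start %d\n", t); }
    static void wait_io(int t)  { printf("wait  %d\n", t); }

    int main(void)
    {
        int first = 5, count = 0, t;

        for (t = first; t < first + 3; t++) {   /* phase 1: submit all */
            start_io(t);
            count++;
        }
        for (t = first; count--; t++)           /* phase 2: wait on each */
            wait_io(t);
        return 0;
    }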
2765+/* O_SYNC and fsync heavy applications tend to use up
2766+** all the journal list slots with tiny transactions. These
2767+** trigger lots and lots of calls to update the header block, which
2768+** adds seeks and slows things down.
2769+**
2770+** This function tries to clear out a large chunk of the journal lists
2771+** at once, which makes everything faster since only the newest journal
2772+** list updates the header block
2773+*/
2774+static int flush_used_journal_lists(struct super_block *s,
2775+ struct reiserfs_journal_list *jl) {
2776+ unsigned long len = 0;
2777+ unsigned long cur_len;
2778+ int ret;
2779+ int i;
2780+ struct reiserfs_journal_list *tjl;
2781+ struct reiserfs_journal_list *flush_jl;
2782+ unsigned long trans_id;
2783+
2784+ flush_jl = tjl = jl;
2785+
2786+ /* flush for 256 transactions or 256 blocks, whichever comes first */
2787+ for(i = 0 ; i < 256 && len < 256 ; i++) {
2788+ if (atomic_read(&tjl->j_commit_left) ||
2789+ tjl->j_trans_id < jl->j_trans_id) {
2790+ break;
2791+ }
2792+ cur_len = atomic_read(&tjl->j_nonzerolen);
2793+ if (cur_len > 0) {
2794+ tjl->j_state &= ~LIST_TOUCHED;
2795+ }
2796+ len += cur_len;
2797+ flush_jl = tjl;
2798+ if (tjl->j_list.next == &SB_JOURNAL(s)->j_journal_list)
2799+ break;
2800+ tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
2801+ }
2802+ /* try to find a group of blocks we can flush across all the
2803+ ** transactions, but only bother if we've actually spanned
2804+ ** across multiple lists
2805+ */
2806+ if (flush_jl != jl) {
2807+ ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
2808+ }
2809+ flush_journal_list(s, flush_jl, 1) ;
2810+ return 0;
2811+}
2812+
2813+
2814 /* since we never give dirty buffers to bdflush/kupdate, we have to
2815 ** flush them ourselves. This runs through the journal lists, finds
2816 ** old metadata in need of flushing and sends it to disk.
2817 ** this does not end transactions, commit anything, or free
2818 ** cnodes.
2819-**
2820-** returns the highest transaction id that was flushed last time
2821 */
2822 static unsigned long reiserfs_journal_kupdate(struct super_block *s) {
2823- struct reiserfs_journal_list *jl ;
2824- int i ;
2825- int start ;
2826+ struct reiserfs_journal_list *jl, *next_jl;
2827+ unsigned long trans_id, next_trans_id;
2828 time_t age ;
2829- int ret = 0 ;
2830
2831- start = SB_JOURNAL_LIST_INDEX(s) ;
2832+ jl = JOURNAL_WORK_ENTRY(SB_JOURNAL(s)->j_working_list.next);
2833
2834- /* safety check to prevent flush attempts during a mount */
2835- if (start < 0) {
2836+restart:
2837+ /* kupdate transactions might not set next_trans_id, it must be
2838+ * initialized before each call
2839+ */
2840+ next_trans_id = 0;
2841+ if (list_empty(&SB_JOURNAL(s)->j_working_list)) {
2842 return 0 ;
2843 }
2844- i = (start + 1) % JOURNAL_LIST_COUNT ;
2845- while(i != start) {
2846- jl = SB_JOURNAL_LIST(s) + i ;
2847- age = CURRENT_TIME - jl->j_timestamp ;
2848- if (jl->j_len > 0 && // age >= (JOURNAL_MAX_COMMIT_AGE * 2) &&
2849- atomic_read(&(jl->j_nonzerolen)) > 0 &&
2850- atomic_read(&(jl->j_commit_left)) == 0) {
2851+ trans_id = jl->j_trans_id;
2852
2853- if (jl->j_trans_id == SB_JOURNAL(s)->j_trans_id) {
2854- break ;
2855- }
2856- /* if ret was already 1, we want to preserve that */
2857- ret |= kupdate_one_transaction(s, jl) ;
2858- }
2859- if (atomic_read(&(jl->j_nonzerolen)) > 0) {
2860- ret |= 1 ;
2861- }
2862- i = (i + 1) % JOURNAL_LIST_COUNT ;
2863+ /* check for race with the code that frees lists */
2864+ if (jl->j_trans_id == 0)
2865+ BUG();
2866+ age = CURRENT_TIME - jl->j_timestamp ;
2867+ if (age >= SB_JOURNAL_MAX_COMMIT_AGE(s) &&
2868+ atomic_read(&jl->j_nonzerolen) > 0 &&
2869+ atomic_read(&jl->j_commit_left) == 0)
2870+ {
2871+ if (kupdate_transactions(s, jl, &next_jl, &next_trans_id, 32, 32) < 0)
2872+ return 0;
2873+ if (next_jl != JOURNAL_WORK_ENTRY(&SB_JOURNAL(s)->j_working_list) &&
2874+ next_trans_id > trans_id)
2875+ {
2876+ if (journal_list_still_alive(s, next_trans_id)) {
2877+ jl = next_jl;
2878+ goto restart;
2879+ }
2880+ }
2881 }
2882- return ret ;
2883+ return 0;
2884 }
2885
2886 /*
2887@@ -1307,6 +1644,12 @@
2888 }
2889
2890 static void free_journal_ram(struct super_block *p_s_sb) {
2891+
2892+ // kmem_cache_free(journal_list_cachep, SB_JOURNAL(p_s_sb)->j_current_jl);
2893+ reiserfs_kfree(SB_JOURNAL(p_s_sb)->j_current_jl,
2894+ sizeof(struct reiserfs_journal_list), p_s_sb);
2895+ SB_JOURNAL(p_s_sb)->j_num_lists--;
2896+
2897 vfree(SB_JOURNAL(p_s_sb)->j_cnode_free_orig) ;
2898 free_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap) ;
2899 free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */
2900@@ -1327,6 +1670,10 @@
2901 static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, int error) {
2902 struct reiserfs_transaction_handle myth ;
2903
2904+ down(&kreiserfsd_sem);
2905+ list_del(&p_s_sb->u.reiserfs_sb.s_reiserfs_supers);
2906+ up(&kreiserfsd_sem);
2907+
2908 /* we only want to flush out transactions if we were called with error == 0
2909 */
2910 if (!error && !(p_s_sb->s_flags & MS_RDONLY)) {
2911@@ -1813,66 +2160,6 @@
2912 return 0 ;
2913 }
2914
2915-
2916-struct reiserfs_journal_commit_task {
2917- struct super_block *p_s_sb ;
2918- int jindex ;
2919- int wake_on_finish ; /* if this is one, we wake the task_done queue, if it
2920- ** is zero, we free the whole struct on finish
2921- */
2922- struct reiserfs_journal_commit_task *self ;
2923- struct wait_queue *task_done ;
2924- struct tq_struct task ;
2925-} ;
2926-
2927-static void reiserfs_journal_commit_task_func(struct reiserfs_journal_commit_task *ct) {
2928-
2929- struct reiserfs_journal_list *jl ;
2930- jl = SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex ;
2931-
2932- flush_commit_list(ct->p_s_sb, SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex, 1) ;
2933-
2934- if (jl->j_len > 0 && atomic_read(&(jl->j_nonzerolen)) > 0 &&
2935- atomic_read(&(jl->j_commit_left)) == 0) {
2936- kupdate_one_transaction(ct->p_s_sb, jl) ;
2937- }
2938- reiserfs_kfree(ct->self, sizeof(struct reiserfs_journal_commit_task), ct->p_s_sb) ;
2939-}
2940-
2941-static void setup_commit_task_arg(struct reiserfs_journal_commit_task *ct,
2942- struct super_block *p_s_sb,
2943- int jindex) {
2944- if (!ct) {
2945- reiserfs_panic(NULL, "journal-1360: setup_commit_task_arg called with NULL struct\n") ;
2946- }
2947- ct->p_s_sb = p_s_sb ;
2948- ct->jindex = jindex ;
2949- ct->task_done = NULL ;
2950- INIT_LIST_HEAD(&ct->task.list) ;
2951- ct->task.sync = 0 ;
2952- ct->task.routine = (void *)(void *)reiserfs_journal_commit_task_func ;
2953- ct->self = ct ;
2954- ct->task.data = (void *)ct ;
2955-}
2956-
2957-static void commit_flush_async(struct super_block *p_s_sb, int jindex) {
2958- struct reiserfs_journal_commit_task *ct ;
2959- /* using GFP_NOFS, GFP_KERNEL could try to flush inodes, which will try
2960- ** to start/join a transaction, which will deadlock
2961- */
2962- ct = reiserfs_kmalloc(sizeof(struct reiserfs_journal_commit_task), GFP_NOFS, p_s_sb) ;
2963- if (ct) {
2964- setup_commit_task_arg(ct, p_s_sb, jindex) ;
2965- queue_task(&(ct->task), &reiserfs_commit_thread_tq);
2966- wake_up(&reiserfs_commit_thread_wait) ;
2967- } else {
2968-#ifdef CONFIG_REISERFS_CHECK
2969- reiserfs_warning(p_s_sb, "journal-1540: kmalloc failed, doing sync commit\n") ;
2970-#endif
2971- flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ;
2972- }
2973-}
2974-
2975 /*
2976 ** this is the commit thread. It is started with kernel_thread on
2977 ** FS mount, and journal_release() waits for it to exit.
2978@@ -1885,6 +2172,9 @@
2979 ** then run the per filesystem commit task queue when we wakeup.
2980 */
2981 static int reiserfs_journal_commit_thread(void *nullp) {
2982+ struct list_head *entry, *safe ;
2983+ struct super_block *s;
2984+ time_t last_run = 0;
2985
2986 daemonize() ;
2987
2988@@ -1897,13 +2187,73 @@
2989 lock_kernel() ;
2990 while(1) {
2991
2992- while(TQ_ACTIVE(reiserfs_commit_thread_tq)) {
2993- run_task_queue(&reiserfs_commit_thread_tq) ;
2994+restart:
2995+ down(&kreiserfsd_sem);
2996+ list_for_each_safe(entry, safe, &kreiserfsd_supers) {
2997+ s = list_entry(entry, struct super_block,
2998+ u.reiserfs_sb.s_reiserfs_supers);
2999+ if (!(s->s_flags & MS_RDONLY)) {
3000+ flush_async_commits(s);
3001+
3002+ if (CURRENT_TIME - last_run > 5) {
3003+ reiserfs_flush_old_commits(s);
3004+ }
3005+
3006+ if (!list_empty(&SB_JOURNAL(s)->j_working_list)) {
3007+ struct reiserfs_journal_list *jl, *tjl;
3008+ unsigned long trans_id ;
3009+ unsigned long start;
3010+ unsigned long cur_start;
3011+ unsigned long nfract = SB_ONDISK_JOURNAL_SIZE(s) / 4;
3012+ int ret;
3013+
3014+ jl = JOURNAL_WORK_ENTRY(SB_JOURNAL(s)->j_working_list.next);
3015+ cur_start = SB_JOURNAL(s)->j_start;
3016+ start = jl->j_start;
3017+
3018+ /* pretend the log doesn't actually wrap */
3019+ if (cur_start < start) {
3020+ cur_start = cur_start + SB_ONDISK_JOURNAL_SIZE(s);
3021+ }
3022+
3023+ /* if the first transaction on the working list is more
3024+ * than nfract blocks away from the current transaction start
3025+ * or there are more than 32 working lists, start
3026+ * a background flush
3027+ */
3028+ if (cur_start - start > nfract ||
3029+ SB_JOURNAL(s)->j_num_work_lists > 32) {
3030+ tjl=JOURNAL_LIST_ENTRY(SB_JOURNAL(s)->j_journal_list.next);
3031+ ret = kupdate_transactions(s, jl, &tjl, &trans_id,32,128);
3032+ }
3033+ }
3034+ }
3035 }
3036+ /* check again for new async commits that need tending */
3037+ list_for_each_safe(entry, safe, &kreiserfsd_supers) {
3038+ s = list_entry(entry, struct super_block,
3039+ u.reiserfs_sb.s_reiserfs_supers);
3040+ if (!list_empty(&SB_JOURNAL(s)->j_journal_list)) {
3041+ struct reiserfs_journal_list *jl;
3042+ struct list_head *entry;
3043+
3044+ /* last entry is the youngest, commit it and you get everything */
3045+ entry = SB_JOURNAL(s)->j_journal_list.prev;
3046+ jl = JOURNAL_LIST_ENTRY(entry);
3047+ if (!atomic_read(&(jl->j_older_commits_done))) {
3048+ /* give new mounts a chance to come in */
3049+ up(&kreiserfsd_sem);
3050+ last_run = CURRENT_TIME;
3051+ wake_up_all(&reiserfs_commit_thread_done) ;
3052+ goto restart;
3053+ }
3054+ }
3055+ }
3056+ up(&kreiserfsd_sem);
3057+ last_run = CURRENT_TIME;
3058
3059 /* if there aren't any more filesystems left, break */
3060 if (reiserfs_mounted_fs_count <= 0) {
3061- run_task_queue(&reiserfs_commit_thread_tq) ;
3062 break ;
3063 }
3064 wake_up(&reiserfs_commit_thread_done) ;
3065@@ -1914,12 +2264,28 @@
3066 return 0 ;
3067 }
3068
3069+static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
3070+{
3071+ struct reiserfs_journal_list *jl;
3072+retry:
3073+ // jl = (struct reiserfs_journal_list *)kmem_cache_alloc(journal_list_cachep, SLAB_NOFS);
3074+ jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s);
3075+ if (!jl) {
3076+ yield();
3077+ goto retry;
3078+ }
3079+ memset(jl, 0, sizeof(*jl));
3080+ INIT_LIST_HEAD(&jl->j_list);
3081+ INIT_LIST_HEAD(&jl->j_working_list);
3082+ INIT_LIST_HEAD(&jl->j_ordered_bh_list);
3083+ INIT_LIST_HEAD(&jl->j_tail_bh_list);
3084+ sema_init(&jl->j_commit_lock, 1);
3085+ SB_JOURNAL(s)->j_num_lists++;
3086+ return jl;
3087+}
3088+
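alloc_journal_list never returns NULL: it yields the CPU and retries until the allocation succeeds, on the theory that running transactions will eventually free memory. The same retry idiom in plain C, with a fake allocator standing in for reiserfs_kmalloc:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sched.h>

    static void *try_alloc(size_t n)
    {
        static int fails = 2;           /* pretend the first tries fail */
        return fails-- > 0 ? NULL : malloc(n);
    }

    int main(void)
    {
        void *p;
        while ((p = try_alloc(128)) == NULL)
            sched_yield();              /* let others make progress */
        printf("allocated after retries\n");
        free(p);
        return 0;
    }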
3089 static void journal_list_init(struct super_block *p_s_sb) {
3090- int i ;
3091- for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
3092- init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_commit_wait)) ;
3093- init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_flush_wait)) ;
3094- }
3095+ SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
3096 }
3097
3098 static int release_journal_dev( struct super_block *super,
3099@@ -1952,7 +2318,6 @@
3100 int blkdev_mode = FMODE_READ | FMODE_WRITE;
3101
3102 result = 0;
3103-
3104 journal -> j_dev_bd = NULL;
3105 journal -> j_dev_file = NULL;
3106 jdev = SB_JOURNAL_DEV( super ) =
3107@@ -2030,7 +2395,6 @@
3108 printk( "journal_init_dev: journal device: %s", kdevname( SB_JOURNAL_DEV( super ) ) );
3109 return result;
3110 }
3111-
3112 /*
3113 ** must be called once on fs mount. calls journal_read for you
3114 */
3115@@ -2041,6 +2405,7 @@
3116 struct reiserfs_super_block * rs;
3117 struct reiserfs_journal_header *jh;
3118 struct reiserfs_journal *journal;
3119+ struct reiserfs_journal_list *jl;
3120
3121 if (sizeof(struct reiserfs_journal_commit) != 4096 ||
3122 sizeof(struct reiserfs_journal_desc) != 4096) {
3123@@ -2054,7 +2419,6 @@
3124 reiserfs_warning(p_s_sb, "Journal size %d is less than 512+1 blocks, which unsupported\n", SB_ONDISK_JOURNAL_SIZE(p_s_sb));
3125 return 1 ;
3126 }
3127-
3128 journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ;
3129 if (!journal) {
3130 reiserfs_warning(p_s_sb, "journal-1256: unable to get memory for journal structure\n") ;
3131@@ -2155,15 +2519,9 @@
3132 SB_JOURNAL_MAX_BATCH(p_s_sb) = SB_JOURNAL_TRANS_MAX(p_s_sb)*9 / 10;
3133 }
3134 }
3135-
3136 brelse (bhjh);
3137
3138 SB_JOURNAL(p_s_sb)->j_list_bitmap_index = 0 ;
3139- SB_JOURNAL_LIST_INDEX(p_s_sb) = -10000 ; /* make sure flush_old_commits does not try to flush a list while replay is on */
3140-
3141- /* clear out the journal list array */
3142- memset(SB_JOURNAL_LIST(p_s_sb), 0,
3143- sizeof(struct reiserfs_journal_list) * JOURNAL_LIST_COUNT) ;
3144
3145 journal_list_init(p_s_sb) ;
3146
3147@@ -2171,8 +2529,6 @@
3148 JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
3149 memset(journal_writers, 0, sizeof(char *) * 512) ; /* debug code */
3150
3151- INIT_LIST_HEAD(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
3152-
3153 SB_JOURNAL(p_s_sb)->j_start = 0 ;
3154 SB_JOURNAL(p_s_sb)->j_len = 0 ;
3155 SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ;
3156@@ -2182,13 +2538,15 @@
3157 SB_JOURNAL(p_s_sb)->j_last = NULL ;
3158 SB_JOURNAL(p_s_sb)->j_first = NULL ;
3159 init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
3160- init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
3161-
3162+ sema_init(&SB_JOURNAL(p_s_sb)->j_lock, 1);
3163+ sema_init(&SB_JOURNAL(p_s_sb)->j_flush_sem, 1);
3164+ INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_journal_list);
3165+ INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_working_list);
3166+
3167 SB_JOURNAL(p_s_sb)->j_trans_id = 10 ;
3168 SB_JOURNAL(p_s_sb)->j_mount_id = 10 ;
3169 SB_JOURNAL(p_s_sb)->j_state = 0 ;
3170 atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
3171- atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 0) ;
3172 SB_JOURNAL(p_s_sb)->j_cnode_free_list = allocate_cnodes(num_cnodes) ;
3173 SB_JOURNAL(p_s_sb)->j_cnode_free_orig = SB_JOURNAL(p_s_sb)->j_cnode_free_list ;
3174 SB_JOURNAL(p_s_sb)->j_cnode_free = SB_JOURNAL(p_s_sb)->j_cnode_free_list ?
3175@@ -2196,8 +2554,9 @@
3176 SB_JOURNAL(p_s_sb)->j_cnode_used = 0 ;
3177 SB_JOURNAL(p_s_sb)->j_must_wait = 0 ;
3178 init_journal_hash(p_s_sb) ;
3179- SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb)) ;
3180- if (!(SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap)) {
3181+ jl = SB_JOURNAL(p_s_sb)->j_current_jl;
3182+ jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl) ;
3183+ if (!jl->j_list_bitmap) {
3184 reiserfs_warning(p_s_sb, "journal-2005, get_list_bitmap failed for journal list 0\n") ;
3185 goto free_and_return;
3186 }
3187@@ -2205,8 +2564,6 @@
3188 reiserfs_warning(p_s_sb, "Replay Failure, unable to mount\n") ;
3189 goto free_and_return;
3190 }
3191- /* once the read is done, we can set this where it belongs */
3192- SB_JOURNAL_LIST_INDEX(p_s_sb) = 0 ;
3193
3194 if (reiserfs_dont_log (p_s_sb))
3195 return 0;
3196@@ -2216,6 +2573,9 @@
3197 kernel_thread((void *)(void *)reiserfs_journal_commit_thread, NULL,
3198 CLONE_FS | CLONE_FILES | CLONE_VM) ;
3199 }
3200+ down(&kreiserfsd_sem);
3201+ list_add(&p_s_sb->u.reiserfs_sb.s_reiserfs_supers, &kreiserfsd_supers);
3202+ up(&kreiserfsd_sem);
3203 return 0 ;
3204
3205 free_and_return:
3206@@ -2230,7 +2590,9 @@
3207 */
3208 int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) {
3209 time_t now = CURRENT_TIME ;
3210- if (reiserfs_dont_log(th->t_super))
3211+
3212+ /* cannot restart while nested unless the parent allows it */
3213+ if (!reiserfs_restartable_handle(th) && th->t_refcount > 1)
3214 return 0 ;
3215 if ( SB_JOURNAL(th->t_super)->j_must_wait > 0 ||
3216 (SB_JOURNAL(th->t_super)->j_len_alloc + new_alloc) >= SB_JOURNAL_MAX_BATCH(th->t_super) ||
3217@@ -2239,9 +2601,48 @@
3218 SB_JOURNAL(th->t_super)->j_cnode_free < (SB_JOURNAL_TRANS_MAX(th->t_super) * 3)) {
3219 return 1 ;
3220 }
3221+
3222+ /* we are allowing them to continue in the current transaction, so
3223+ * we have to bump the blocks allocated now.
3224+ */
3225+ th->t_blocks_allocated += new_alloc;
3226+ SB_JOURNAL(th->t_super)->j_len_alloc += new_alloc;
3227+
3228 return 0 ;
3229 }
3230
3231+int
3232+reiserfs_restart_transaction(struct reiserfs_transaction_handle *th, int num) {
3233+ int refcount = th->t_refcount ;
3234+ struct super_block *s = th->t_super ;
3235+ int flags = th->t_flags ;
3236+ int parent_flags = 0;
3237+ struct reiserfs_transaction_handle *saved_th = current->journal_info ;
3238+
3239+ /* if refcount is > 1, saved_th is the parent we've nested into, save
3240+ ** its flags as well. So far, only intermezzo needs this, 99% of the
3241+ ** time it is horribly unsafe.
3242+ */
3243+ if (refcount > 1) {
3244+ if (!reiserfs_restartable_handle(saved_th)) {
3245+ BUG() ;
3246+ }
3247+ th->t_refcount = 1;
3248+ parent_flags = saved_th->t_flags ;
3249+ }
3250+ th->t_flags = 0 ;
3251+ journal_end(th, s, th->t_blocks_allocated) ;
3252+ journal_begin(th, s, num) ;
3253+ th->t_flags = flags;
3254+ if (refcount > 1) {
3255+ current->journal_info = saved_th ;
3256+ th->t_refcount = refcount ;
3257+ memcpy(saved_th, th, sizeof(*th)) ;
3258+ saved_th->t_flags = parent_flags ;
3259+ }
3260+ return 0 ;
3261+}
3262+
3263 /* this must be called inside a transaction, and requires the
3264 ** kernel_lock to be held
3265 */
3266@@ -2268,6 +2669,37 @@
3267 !test_bit(WRITERS_BLOCKED, &SB_JOURNAL(s)->j_state)) ;
3268 }
3269
3270+static void queue_log_writer(struct super_block *s) {
3271+ set_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state);
3272+ sleep_on(&SB_JOURNAL(s)->j_join_wait);
3273+}
3274+
3275+static void wake_queued_writers(struct super_block *s) {
3276+ if (test_and_clear_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state)) {
3277+ wake_up(&SB_JOURNAL(s)->j_join_wait);
3278+ }
3279+}
3280+
3281+static void let_transaction_grow(struct super_block *sb,
3282+ unsigned long trans_id)
3283+{
3284+ unsigned long bcount = SB_JOURNAL(sb)->j_bcount;
3285+ while(1) {
3286+ yield();
3287+ while ((atomic_read(&SB_JOURNAL(sb)->j_wcount) > 0 ||
3288+ atomic_read(&SB_JOURNAL(sb)->j_jlock)) &&
3289+ SB_JOURNAL(sb)->j_trans_id == trans_id) {
3290+ queue_log_writer(sb);
3291+ }
3292+ if (SB_JOURNAL(sb)->j_trans_id != trans_id)
3293+ break;
3294+ if (bcount == SB_JOURNAL(sb)->j_bcount)
3295+ break;
3296+ bcount = SB_JOURNAL(sb)->j_bcount;
3297+ }
3298+}
3299+
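let_transaction_grow keeps waiting only while there is forward progress: it stops once the transaction id moves on, or once j_bcount stalls, meaning no new writers are joining and further waiting is pointless. A reduced model of that double exit condition:

    #include <stdio.h>

    int main(void)
    {
        long trans = 7, bcount = 100, seen = bcount;
        int step;

        for (step = 0; ; step++) {
            if (step < 2)
                bcount++;               /* pretend writers join twice */
            if (trans != 7)
                break;                  /* transaction ended */
            if (bcount == seen)
                break;                  /* no forward progress, give up */
            seen = bcount;
        }
        printf("stopped waiting at bcount=%ld\n", bcount);
        return 0;
    }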
3300+
3301 /* join == true if you must join an existing transaction.
3302 ** join == false if you can deal with waiting for others to finish
3303 **
3304@@ -2275,8 +2707,10 @@
3305 ** expect to use in nblocks.
3306 */
3307 static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) {
3308- time_t now = CURRENT_TIME ;
3309+ time_t now ;
3310 int old_trans_id ;
3311+ struct reiserfs_transaction_handle myth ;
3312+ int sched_count = 0;
3313
3314 reiserfs_check_lock_depth("journal_begin") ;
3315 RFALSE( p_s_sb->s_flags & MS_RDONLY,
3316@@ -2287,9 +2721,14 @@
3317 return 0 ;
3318 }
3319 PROC_INFO_INC( p_s_sb, journal.journal_being );
3320+ /* set here for journal_join */
3321+ th->t_refcount = 1;
3322+ th->t_flags = 0 ;
3323+ th->t_super = p_s_sb ;
3324
3325 relock:
3326 lock_journal(p_s_sb) ;
3327+ SB_JOURNAL(p_s_sb)->j_bcount++ ;
3328
3329 if (test_bit(WRITERS_BLOCKED, &SB_JOURNAL(p_s_sb)->j_state)) {
3330 unlock_journal(p_s_sb) ;
3331@@ -2297,12 +2736,12 @@
3332 PROC_INFO_INC( p_s_sb, journal.journal_relock_writers );
3333 goto relock ;
3334 }
3335+ now = CURRENT_TIME;
3336
3337 /* if there is no room in the journal OR
3338 ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning
3339 ** we don't sleep if there aren't other writers
3340 */
3341-
3342 if ( (!join && SB_JOURNAL(p_s_sb)->j_must_wait > 0) ||
3343 ( !join && (SB_JOURNAL(p_s_sb)->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) ||
3344 (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0 && SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 &&
3345@@ -2310,54 +2749,128 @@
3346 (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) ) ||
3347 (!join && SB_JOURNAL(p_s_sb)->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) {
3348
3349+ old_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
3350 unlock_journal(p_s_sb) ; /* allow others to finish this transaction */
3351
3352- /* if writer count is 0, we can just force this transaction to end, and start
3353- ** a new one afterwards.
3354- */
3355- if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) {
3356- struct reiserfs_transaction_handle myth ;
3357- journal_join(&myth, p_s_sb, 1) ;
3358- reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3359- journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3360- do_journal_end(&myth, p_s_sb,1,COMMIT_NOW) ;
3361+ if (!join && (SB_JOURNAL(p_s_sb)->j_len_alloc + nblocks + 2) >=
3362+ SB_JOURNAL_MAX_BATCH(p_s_sb) &&
3363+ ((SB_JOURNAL(p_s_sb)->j_len + nblocks + 2) * 100) <
3364+ (SB_JOURNAL(p_s_sb)->j_len_alloc * 75))
3365+ {
3366+ if (atomic_read(&SB_JOURNAL(p_s_sb)->j_wcount) > 10) {
3367+ sched_count++;
3368+ queue_log_writer(p_s_sb);
3369+ goto relock;
3370+ }
3371+ }
3372+ /* don't mess with joining the transaction if all we have to do is
3373+ * wait for someone else to do a commit
3374+ */
3375+ if (atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) {
3376+ while (SB_JOURNAL(p_s_sb)->j_trans_id == old_trans_id &&
3377+ atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) {
3378+ queue_log_writer(p_s_sb);
3379+ }
3380+ goto relock;
3381+ }
3382+ journal_join(&myth, p_s_sb, 1) ;
3383+
3384+ /* someone might have ended the transaction while we joined */
3385+ if (old_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
3386+ do_journal_end(&myth, p_s_sb, 1, 0) ;
3387 } else {
3388- /* but if the writer count isn't zero, we have to wait for the current writers to finish.
3389- ** They won't batch on transaction end once we set j_jlock
3390- */
3391- atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
3392- old_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
3393- while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) &&
3394- SB_JOURNAL(p_s_sb)->j_trans_id == old_trans_id) {
3395- sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
3396- }
3397+ do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ;
3398 }
3399 PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount );
3400 goto relock ;
3401 }
3402
3403 if (SB_JOURNAL(p_s_sb)->j_trans_start_time == 0) { /* we are the first writer, set trans_id */
3404- SB_JOURNAL(p_s_sb)->j_trans_start_time = now ;
3405+ SB_JOURNAL(p_s_sb)->j_trans_start_time = CURRENT_TIME;
3406 }
3407 atomic_inc(&(SB_JOURNAL(p_s_sb)->j_wcount)) ;
3408 SB_JOURNAL(p_s_sb)->j_len_alloc += nblocks ;
3409 th->t_blocks_logged = 0 ;
3410 th->t_blocks_allocated = nblocks ;
3411- th->t_super = p_s_sb ;
3412 th->t_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
3413- th->t_caller = "Unknown" ;
3414+ reiserfs_set_handle_active(th) ;
3415 unlock_journal(p_s_sb) ;
3416- p_s_sb->s_dirt = 1;
3417 return 0 ;
3418 }
3419
3420+struct reiserfs_transaction_handle *
3421+reiserfs_persistent_transaction(struct super_block *s, unsigned long nblocks) {
3422+ int ret ;
3423+ struct reiserfs_transaction_handle *th ;
3424
3425+ /* if we're nesting into an existing transaction, it will be
3426+ ** persistent on its own
3427+ */
3428+ if (reiserfs_transaction_running(s)) {
3429+ th = current->journal_info ;
3430+ th->t_refcount++ ;
3431+ if (th->t_refcount < 2) {
3432+ BUG() ;
3433+ }
3434+ return th ;
3435+ }
3436+ th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS, s) ;
3437+ if (!th) {
3438+ return ERR_PTR(-ENOMEM) ;
3439+ }
3440+ ret = journal_begin(th, s, nblocks) ;
3441+ if (ret) {
3442+ reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
3443+ return ERR_PTR(ret) ;
3444+ }
3445+ /* do_journal_end is now responsible for freeing the handle */
3446+ reiserfs_set_handle_persistent(th) ;
3447+ return th ;
3448+}
3449 static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
3450+ struct reiserfs_transaction_handle *cur_th = current->journal_info;
3451+
3452+ /* this keeps do_journal_end from NULLing out the current->journal_info
3453+ ** pointer
3454+ */
3455+ th->t_handle_save = cur_th ;
3456+ if (cur_th && cur_th->t_refcount > 1) {
3457+ BUG() ;
3458+ }
3459 return do_journal_begin_r(th, p_s_sb, nblocks, 1) ;
3460 }
3461
3462 int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) {
3463- return do_journal_begin_r(th, p_s_sb, nblocks, 0) ;
3464+ struct reiserfs_transaction_handle *cur_th = current->journal_info ;
3465+ int ret ;
3466+
3467+ th->t_handle_save = NULL ;
3468+ if (cur_th) {
3469+ /* we are nesting into the current transaction */
3470+ if (cur_th->t_super == p_s_sb) {
3471+ cur_th->t_refcount++ ;
3472+ memcpy(th, cur_th, sizeof(*th));
3473+ th->t_flags = 0 ;
3474+ reiserfs_set_handle_active(th) ;
3475+ if (th->t_refcount <= 1)
3476+ printk("BAD: refcount <= 1, but journal_info != 0\n");
3477+ return 0;
3478+ } else {
3479+ /* we've ended up with a handle from a different filesystem.
3480+ ** save it and restore on journal_end. This should never
3481+ ** really happen...
3482+ */
3483+ reiserfs_warning(p_s_sb, "clm-2100: nesting into a different FS\n") ;
3484+ th->t_handle_save = current->journal_info ;
3485+ current->journal_info = th;
3486+ }
3487+ } else {
3488+ current->journal_info = th;
3489+ }
3490+ ret = do_journal_begin_r(th, p_s_sb, nblocks, 0) ;
3491+ if (current->journal_info != th)
3492+ BUG() ;
3493+ return ret ;
3494 }
3495
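The journal_begin()/journal_end() pair above makes handles nestable through current->journal_info: a second begin against the same super block just bumps t_refcount, and only the outermost end really closes the transaction. A stripped-down model of the refcount protocol; it is single-task, so journal_info is a plain global, and it omits the handle-state memcpy the real code performs:

    #include <assert.h>
    #include <stdio.h>

    struct toy_handle { int t_refcount; };

    /* models current->journal_info for one task */
    static struct toy_handle *journal_info;

    /* journal_begin: reuse the running handle when one exists,
     * otherwise install a fresh one */
    static void toy_begin(struct toy_handle *th)
    {
        if (journal_info) {                /* nested call */
            journal_info->t_refcount++;
            return;
        }
        th->t_refcount = 1;
        journal_info = th;
    }

    /* journal_end: only the outermost end commits */
    static void toy_end(void)
    {
        assert(journal_info);
        if (--journal_info->t_refcount == 0) {
            printf("commit!\n");
            journal_info = NULL;
        }
    }

    int main(void)
    {
        struct toy_handle outer, inner;
        toy_begin(&outer);   /* starts the transaction */
        toy_begin(&inner);   /* nests: refcount 2, no new transaction */
        toy_end();           /* drops to 1, nothing committed yet */
        toy_end();           /* refcount 0: the commit happens here */
        return 0;
    }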
3496 /* not used at all */
3497@@ -2389,7 +2902,7 @@
3498 reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n",
3499 th->t_trans_id, SB_JOURNAL(p_s_sb)->j_trans_id);
3500 }
3501- p_s_sb->s_dirt = 1 ;
3502+ p_s_sb->s_dirt = 1;
3503
3504 prepared = test_and_clear_bit(BH_JPrepared, &bh->b_state) ;
3505 /* already in this transaction, we are done */
3506@@ -2413,6 +2926,7 @@
3507
3508 if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) {
3509 reiserfs_warning(p_s_sb, "journal-1409: journal_mark_dirty returning because j_wcount was %d\n", atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount))) ;
3510+ BUG();
3511 return 1 ;
3512 }
3513 /* this error means I've screwed up, and we've overflowed the transaction.
3514@@ -2479,25 +2993,36 @@
3515 return 0 ;
3516 }
3517
3518-/*
3519-** if buffer already in current transaction, do a journal_mark_dirty
3520-** otherwise, just mark it dirty and move on. Used for writes to meta blocks
3521-** that don't need journaling
3522-*/
3523-int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) {
3524- if (reiserfs_dont_log(th->t_super) || buffer_journaled(bh) ||
3525- buffer_journal_dirty(bh)) {
3526- return journal_mark_dirty(th, p_s_sb, bh) ;
3527- }
3528- if (get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_list_hash_table, bh->b_dev,bh->b_blocknr,bh->b_size)) {
3529- return journal_mark_dirty(th, p_s_sb, bh) ;
3530- }
3531- mark_buffer_dirty(bh) ;
3532- return 0 ;
3533-}
3534-
3535 int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
3536- return do_journal_end(th, p_s_sb, nblocks, 0) ;
3537+
3538+ int ret;
3539+ if (!current->journal_info && th->t_refcount > 1)
3540+ printk("REISER-NESTING: th NULL, refcount %d\n", th->t_refcount);
3541+ if (th->t_refcount > 1) {
3542+ struct reiserfs_transaction_handle *cur_th = current->journal_info ;
3543+
3544+ /* we aren't allowed to close a nested transaction on a different
3545+ ** filesystem from the one in the task struct
3546+ */
3547+ if (cur_th->t_super != th->t_super)
3548+ BUG() ;
3549+
3550+ th->t_refcount--;
3551+ if (th != cur_th) {
3552+ int flags = cur_th->t_flags ;
3553+ /* nested handles are never persistent */
3554+ if (reiserfs_persistent_handle(th)) {
3555+ BUG() ;
3556+ }
3557+ memcpy(cur_th, th, sizeof(*th));
3558+ th->t_flags = 0 ;
3559+ cur_th->t_flags = flags ;
3560+ }
3561+ ret = 0;
3562+ } else {
3563+ ret = do_journal_end(th, p_s_sb, nblocks, 0) ;
3564+ }
3565+ return ret;
3566 }
3567
3568 /* removes from the current transaction, releasing and decrementing any counters.
3569@@ -2600,6 +3125,10 @@
3570 */
3571 int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
3572
3573+ /* you are not allowed to sync while nested, very, very bad */
3574+ if (th->t_refcount > 1) {
3575+ BUG() ;
3576+ }
3577 if (SB_JOURNAL(p_s_sb)->j_len == 0) {
3578 reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3579 journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3580@@ -2624,12 +3153,14 @@
3581 **
3582 */
3583 void flush_async_commits(struct super_block *p_s_sb) {
3584- int i ;
3585+ struct reiserfs_journal_list *jl;
3586+ struct list_head *entry;
3587
3588- for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
3589- if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) {
3590- flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ;
3591- }
3592+ if (!list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) {
3593+ /* last entry is the youngest, commit it and you get everything */
3594+ entry = SB_JOURNAL(p_s_sb)->j_journal_list.prev;
3595+ jl = JOURNAL_LIST_ENTRY(entry);
3596+ flush_commit_list(p_s_sb, jl, 1);
3597 }
3598 }
3599
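The rewrite of flush_async_commits() above leans on the ordering its comment states: commits reach disk oldest-first, so flushing the youngest entry on j_journal_list covers every older one too. The invariant in a few lines, with array indices standing in for list age (nothing here is reiserfs API):

    #include <stdio.h>

    #define NLISTS 4

    static int committed[NLISTS];

    /* flushing list i pushes out every older commit first,
     * so lists 0..i all end up committed */
    static void toy_flush_commit(int i)
    {
        for (int k = 0; k <= i; k++)
            committed[k] = 1;
    }

    int main(void)
    {
        toy_flush_commit(NLISTS - 1);    /* commit only the youngest */
        for (int k = 0; k < NLISTS; k++)
            printf("list %d committed=%d\n", k, committed[k]);
        return 0;
    }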
3600@@ -2637,58 +3168,39 @@
3601 ** flushes any old transactions to disk
3602 ** ends the current transaction if it is too old
3603 **
3604-** also calls flush_journal_list with old_only == 1, which allows me to reclaim
3605-** memory and such from the journal lists whose real blocks are all on disk.
3606-**
3607-** called by sync_dev_journal from buffer.c
3608 */
3609-int flush_old_commits(struct super_block *p_s_sb, int immediate) {
3610- int i ;
3611- int count = 0;
3612- int start ;
3613- time_t now ;
3614- struct reiserfs_transaction_handle th ;
3615-
3616- start = SB_JOURNAL_LIST_INDEX(p_s_sb) ;
3617- now = CURRENT_TIME ;
3618+int reiserfs_flush_old_commits(struct super_block *p_s_sb) {
3619+ time_t now ;
3620+ struct reiserfs_transaction_handle th ;
3621+
3622+ now = CURRENT_TIME ;
3623+ /* safety check so we don't flush while we are replaying the log during
3624+ * mount
3625+ */
3626+ if (list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) {
3627+ return 0 ;
3628+ }
3629
3630- /* safety check so we don't flush while we are replaying the log during mount */
3631- if (SB_JOURNAL_LIST_INDEX(p_s_sb) < 0) {
3632- return 0 ;
3633- }
3634- /* starting with oldest, loop until we get to the start */
3635- i = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ;
3636- while(i != start) {
3637- if (SB_JOURNAL_LIST(p_s_sb)[i].j_len > 0 && ((now - SB_JOURNAL_LIST(p_s_sb)[i].j_timestamp) > SB_JOURNAL_MAX_COMMIT_AGE(p_s_sb) ||
3638- immediate)) {
3639- /* we have to check again to be sure the current transaction did not change */
3640- if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) {
3641- flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ;
3642- }
3643- }
3644- i = (i + 1) % JOURNAL_LIST_COUNT ;
3645- count++ ;
3646- }
3647- /* now, check the current transaction. If there are no writers, and it is too old, finish it, and
3648- ** force the commit blocks to disk
3649- */
3650- if (!immediate && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 &&
3651- SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 &&
3652- SB_JOURNAL(p_s_sb)->j_len > 0 &&
3653- (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) {
3654- journal_join(&th, p_s_sb, 1) ;
3655- reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3656- journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3657- do_journal_end(&th, p_s_sb,1, COMMIT_NOW) ;
3658- } else if (immediate) { /* belongs above, but I wanted this to be very explicit as a special case. If they say to
3659- flush, we must be sure old transactions hit the disk too. */
3660- journal_join(&th, p_s_sb, 1) ;
3661- reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3662- journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3663- do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
3664- }
3665- reiserfs_journal_kupdate(p_s_sb) ;
3666- return 0 ;
3667+ /* check the current transaction. If there are no writers, and it is
3668+ * too old, finish it, and force the commit blocks to disk
3669+ */
3670+ if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 &&
3671+ SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 &&
3672+ SB_JOURNAL(p_s_sb)->j_len > 0 &&
3673+ (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) >
3674+ SB_JOURNAL_MAX_TRANS_AGE(p_s_sb))
3675+ {
3676+ journal_join(&th, p_s_sb, 1) ;
3677+ reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3678+ journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3679+
3680+ /* we're only being called from kreiserfsd, so it makes no sense to do
3681+ ** an async commit just so that kreiserfsd can do it later
3682+ */
3683+ do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
3684+ }
3685+ reiserfs_journal_kupdate(p_s_sb) ;
3686+ return p_s_sb->s_dirt;
3687 }
3688
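reiserfs_flush_old_commits() above ends the running transaction only when it is idle, non-empty, and past the maximum transaction age. That compound test, extracted into a standalone predicate; the field names mirror the journal struct, but this is an illustrative model rather than kernel code:

    #include <stdio.h>
    #include <time.h>

    struct toy_journal {
        int  j_wcount;             /* writers in the running transaction */
        long j_trans_start_time;   /* 0 when no transaction is open */
        unsigned long j_len;       /* blocks logged so far */
    };

    /* mirror of the condition above: finish the transaction only
     * when it is idle, non-empty, and older than max_age seconds */
    static int toy_should_end(const struct toy_journal *j,
                              time_t now, long max_age)
    {
        return j->j_wcount <= 0 &&
               j->j_trans_start_time > 0 &&
               j->j_len > 0 &&
               (now - j->j_trans_start_time) > max_age;
    }

    int main(void)
    {
        time_t now = time(NULL);
        struct toy_journal j = { 0, (long)now - 60, 12 };
        printf("end transaction? %s\n",
               toy_should_end(&j, now, 30) ? "yes" : "no");
        return 0;
    }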
3689 /*
3690@@ -2709,6 +3221,7 @@
3691 int flush = flags & FLUSH_ALL ;
3692 int commit_now = flags & COMMIT_NOW ;
3693 int wait_on_commit = flags & WAIT ;
3694+ struct reiserfs_journal_list *jl;
3695
3696 if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
3697 reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n",
3698@@ -2727,8 +3240,9 @@
3699 if (SB_JOURNAL(p_s_sb)->j_len == 0) {
3700 int wcount = atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) ;
3701 unlock_journal(p_s_sb) ;
3702- if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) > 0 && wcount <= 0) {
3703- atomic_dec(&(SB_JOURNAL(p_s_sb)->j_jlock)) ;
3704+ BUG();
3705+ if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) > 0 && wcount <= 0) {
3706+ atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
3707 wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
3708 }
3709 return 0 ;
3710@@ -2741,24 +3255,37 @@
3711 */
3712 if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0) {
3713 if (flush || commit_now) {
3714- int orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ;
3715+ unsigned trans_id ;
3716+
3717+ jl = SB_JOURNAL(p_s_sb)->j_current_jl;
3718+ trans_id = jl->j_trans_id;
3719+
3720 atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
3721 if (flush) {
3722 SB_JOURNAL(p_s_sb)->j_next_full_flush = 1 ;
3723 }
3724 unlock_journal(p_s_sb) ;
3725+
3726 /* sleep while the current transaction is still j_jlocked */
3727- while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) &&
3728- SB_JOURNAL(p_s_sb)->j_trans_id == th->t_trans_id) {
3729- sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
3730- }
3731- if (commit_now) {
3732- if (wait_on_commit) {
3733- flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
3734- } else {
3735- commit_flush_async(p_s_sb, orig_jindex) ;
3736+ while(SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
3737+ if (atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) {
3738+ queue_log_writer(p_s_sb);
3739+ } else {
3740+ lock_journal(p_s_sb);
3741+ if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
3742+ atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
3743+ }
3744+ unlock_journal(p_s_sb);
3745 }
3746 }
3747+ if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
3748+ BUG();
3749+ }
3750+ if (commit_now && journal_list_still_alive(p_s_sb, trans_id) &&
3751+ wait_on_commit)
3752+ {
3753+ flush_commit_list(p_s_sb, jl, 1) ;
3754+ }
3755 return 0 ;
3756 }
3757 unlock_journal(p_s_sb) ;
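The wait loop above captures the journal list's trans_id up front and spins until j_trans_id moves past it, either sleeping behind a committer that already holds j_jlock or re-checking under the journal lock and claiming j_jlock itself. A sketch of that state machine; toy_sleep() stands in for queue_log_writer() by letting the fake committer finish:

    #include <stdio.h>

    struct toy_journal { unsigned long j_trans_id; int j_jlock; };

    /* stand-in for queue_log_writer(): sleeping lets the committer
     * run, so the fake journal finishes the transaction */
    static void toy_sleep(struct toy_journal *j)
    {
        j->j_jlock = 0;
        j->j_trans_id++;
    }

    /* wait until transaction trans_id has ended, claiming the
     * commit ourselves when nobody else is doing it */
    static void toy_wait_for_end(struct toy_journal *j, unsigned long trans_id)
    {
        while (j->j_trans_id == trans_id) {
            if (j->j_jlock)
                toy_sleep(j);                    /* someone is committing */
            else if (j->j_trans_id == trans_id)  /* re-check, as under the lock */
                j->j_jlock = 1;                  /* claim the commit ourselves */
        }
    }

    int main(void)
    {
        struct toy_journal j = { 42, 1 };
        toy_wait_for_end(&j, 42);
        printf("transaction 42 ended, now at %lu\n", j.j_trans_id);
        return 0;
    }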
3758@@ -2776,8 +3303,8 @@
3759 if (!(SB_JOURNAL(p_s_sb)->j_must_wait > 0) && !(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock))) && !flush && !commit_now &&
3760 (SB_JOURNAL(p_s_sb)->j_len < SB_JOURNAL_MAX_BATCH(p_s_sb)) &&
3761 SB_JOURNAL(p_s_sb)->j_len_alloc < SB_JOURNAL_MAX_BATCH(p_s_sb) && SB_JOURNAL(p_s_sb)->j_cnode_free > (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3)) {
3762- SB_JOURNAL(p_s_sb)->j_bcount++ ;
3763 unlock_journal(p_s_sb) ;
3764+
3765 return 0 ;
3766 }
3767
3768@@ -2807,16 +3334,13 @@
3769 struct reiserfs_list_bitmap *jb = NULL ;
3770 int cleaned = 0 ;
3771
3772- if (reiserfs_dont_log(th->t_super)) {
3773- bh = sb_get_hash_table(p_s_sb, blocknr) ;
3774- if (bh && buffer_dirty (bh)) {
3775- reiserfs_warning (p_s_sb, "journal_mark_freed(dont_log): dirty buffer on hash list: %lx %ld\n", bh->b_state, blocknr);
3776- BUG ();
3777- }
3778- brelse (bh);
3779- return 0 ;
3780+ cn = get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_hash_table, p_s_sb->s_dev,
3781+ blocknr, p_s_sb->s_blocksize) ;
3782+ if (cn && cn->bh) {
3783+ bh = cn->bh ;
3784+ get_bh(bh) ;
3785 }
3786- bh = sb_get_hash_table(p_s_sb, blocknr) ;
3787+
3788 /* if it is journal new, we just remove it from this transaction */
3789 if (bh && buffer_journal_new(bh)) {
3790 mark_buffer_notjournal_new(bh) ;
3791@@ -2824,14 +3348,22 @@
3792 cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
3793 } else {
3794 /* set the bit for this block in the journal bitmap for this transaction */
3795- jb = SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap ;
3796+ jb = SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap;
3797 if (!jb) {
3798 reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ;
3799 }
3800- set_bit_in_list_bitmap(p_s_sb, blocknr, jb) ;
3801
3802- /* Note, the entire while loop is not allowed to schedule. */
3803+ /* we set bits in the list bitmap so the block won't be reallocated
3804+ * as a data block which might get flushed before this transaction
3805+ * commits. When data logging is on, the block might get reallocated
3806+ * as a data block, but we know the data block won't get flushed before
3807+ * we commit
3808+ */
3809+ if (!reiserfs_data_log(p_s_sb)) {
3810+ set_bit_in_list_bitmap(p_s_sb, blocknr, jb) ;
3811+ }
3812
3813+ /* Note, the entire while loop is not allowed to schedule. */
3814 if (bh) {
3815 clear_prepared_bits(bh) ;
3816 }
3817@@ -2876,57 +3408,77 @@
3818
3819 void reiserfs_update_inode_transaction(struct inode *inode) {
3820
3821- inode->u.reiserfs_i.i_trans_index = SB_JOURNAL_LIST_INDEX(inode->i_sb);
3822-
3823+ inode->u.reiserfs_i.i_jl = SB_JOURNAL(inode->i_sb)->j_current_jl;
3824 inode->u.reiserfs_i.i_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ;
3825 }
3826
3827 void reiserfs_update_tail_transaction(struct inode *inode) {
3828
3829- inode->u.reiserfs_i.i_tail_trans_index = SB_JOURNAL_LIST_INDEX(inode->i_sb);
3830-
3831+ inode->u.reiserfs_i.i_tail_jl = SB_JOURNAL(inode->i_sb)->j_current_jl;
3832 inode->u.reiserfs_i.i_tail_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ;
3833 }
3834
3835-static void __commit_trans_index(struct inode *inode, unsigned long id,
3836- unsigned long index)
3837+static void __commit_trans_jl(struct inode *inode, unsigned long id,
3838+ struct reiserfs_journal_list *jl)
3839 {
3840- struct reiserfs_journal_list *jl ;
3841 struct reiserfs_transaction_handle th ;
3842 struct super_block *sb = inode->i_sb ;
3843
3844- jl = SB_JOURNAL_LIST(sb) + index;
3845-
3846 /* is it from the current transaction, or from an unknown transaction? */
3847 if (id == SB_JOURNAL(sb)->j_trans_id) {
3848- journal_join(&th, sb, 1) ;
3849+ jl = SB_JOURNAL(sb)->j_current_jl;
3850+ /* try to let other writers come in and grow this transaction */
3851+ let_transaction_grow(sb, id);
3852+ if (SB_JOURNAL(sb)->j_trans_id != id) {
3853+ goto flush_commit_only;
3854+ }
3855+
3856+ journal_begin(&th, sb, 1) ;
3857+
3858+ /* someone might have ended this transaction while we joined */
3859+ if (SB_JOURNAL(sb)->j_trans_id != id) {
3860+ reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ;
3861+ journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ;
3862+ journal_end(&th, sb, 1) ;
3863+ goto flush_commit_only;
3864+ }
3865+
3866 journal_end_sync(&th, sb, 1) ;
3867- } else if (jl->j_trans_id == id) {
3868- flush_commit_list(sb, jl, 1) ;
3869+
3870+ } else {
3871+ /* this gets tricky: we have to make sure the journal list in
3872+ * the inode still exists. We know the list is still around
3873+ * if we've got a larger transaction id than the oldest list
3874+ */
3875+flush_commit_only:
3876+ if (journal_list_still_alive(inode->i_sb, id)) {
3877+ flush_commit_list(sb, jl, 1) ;
3878+ }
3879 }
3880- /* if the transaction id does not match, this list is long since flushed
3881- ** and we don't have to do anything here
3882- */
3883+ /* otherwise the list is gone, and long since committed */
3884 }
3885 void reiserfs_commit_for_tail(struct inode *inode) {
3886 unsigned long id = inode->u.reiserfs_i.i_tail_trans_id;
3887- unsigned long index = inode->u.reiserfs_i.i_tail_trans_index;
3888+ struct reiserfs_journal_list *jl = inode->u.reiserfs_i.i_tail_jl;
3889
3890 /* for tails, if this info is unset there's nothing to commit */
3891- if (id && index)
3892- __commit_trans_index(inode, id, index);
3893+ if (id && jl)
3894+ __commit_trans_jl(inode, id, jl);
3895 }
3896 void reiserfs_commit_for_inode(struct inode *inode) {
3897 unsigned long id = inode->u.reiserfs_i.i_trans_id;
3898- unsigned long index = inode->u.reiserfs_i.i_trans_index;
3899+ struct reiserfs_journal_list *jl = inode->u.reiserfs_i.i_jl;
3900
3901- /* for the whole inode, assume unset id or index means it was
3902+ /* for the whole inode, assume unset id means it was
3903 * changed in the current transaction. More conservative
3904 */
3905- if (!id || !index)
3906+ if (!id || !jl) {
3907 reiserfs_update_inode_transaction(inode) ;
3908+ id = inode->u.reiserfs_i.i_trans_id;
3909+ /* jl will be updated in __commit_trans_jl */
3910+ }
3911
3912- __commit_trans_index(inode, id, index);
3913+ __commit_trans_jl(inode, id, jl);
3914 }
3915
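__commit_trans_jl() above has three outcomes: join and synchronously end the transaction when the inode's id is still the running one, flush the saved list's commit when journal_list_still_alive() says it has not been recycled, or do nothing because the commit long since reached disk. The decision table as a tiny function; per the comment above, a list is treated as alive when its id is at least the oldest live list's id, and every name here is an illustrative stand-in:

    #include <stdio.h>

    enum commit_action { SYNC_CURRENT, FLUSH_OLD_LIST, NOTHING_TO_DO };

    static enum commit_action toy_commit_action(unsigned long id,
                                                unsigned long running_id,
                                                unsigned long oldest_live_id)
    {
        if (id == running_id)
            return SYNC_CURRENT;       /* still the open transaction */
        if (id >= oldest_live_id)      /* journal_list_still_alive() */
            return FLUSH_OLD_LIST;
        return NOTHING_TO_DO;          /* committed and recycled long ago */
    }

    int main(void)
    {
        printf("%d %d %d\n",
               toy_commit_action(9, 9, 5),   /* sync the current one */
               toy_commit_action(7, 9, 5),   /* flush an old list */
               toy_commit_action(2, 9, 5));  /* nothing left to do */
        return 0;
    }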
3916 void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb,
3917@@ -2954,8 +3506,6 @@
3918 int retry_count = 0 ;
3919
3920 PROC_INFO_INC( p_s_sb, journal.prepare );
3921- if (reiserfs_dont_log (p_s_sb))
3922- return;
3923
3924 while(!test_bit(BH_JPrepared, &bh->b_state) ||
3925 (wait && buffer_locked(bh))) {
3926@@ -2964,16 +3514,37 @@
3927 return ;
3928 }
3929 set_bit(BH_JPrepared, &bh->b_state) ;
3930+
3931 if (wait) {
3932 RFALSE( buffer_locked(bh) && cur_tb != NULL,
3933 "waiting while do_balance was running\n") ;
3934+ /* only data buffers are allowed to come in dirty, and they
3935+ * never get run through restore_prepared_buffer. So we can
3936+ * just mark them clean here and know it is safe
3937+ */
3938+ mark_buffer_clean(bh);
3939 wait_on_buffer(bh) ;
3940- }
3941+ }
3942 PROC_INFO_INC( p_s_sb, journal.prepare_retry );
3943 retry_count++ ;
3944 }
3945 }
3946-
3947+static void flush_old_journal_lists(struct super_block *s) {
3948+ struct reiserfs_journal_list *jl;
3949+ struct list_head *entry;
3950+ time_t now = CURRENT_TIME;
3951+
3952+ while(!list_empty(&SB_JOURNAL(s)->j_journal_list)) {
3953+ entry = SB_JOURNAL(s)->j_journal_list.next;
3954+ jl = JOURNAL_LIST_ENTRY(entry);
3955+ /* this check should always be run, to send old lists to disk */
3956+ if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) {
3957+ flush_used_journal_lists(s, jl);
3958+ } else {
3959+ break;
3960+ }
3961+ }
3962+}
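flush_old_journal_lists() above walks j_journal_list oldest-first and flushes every list whose timestamp is older than four times the max transaction age, stopping at the first young one because later entries are younger still. The same walk over a plain array of timestamps (the 4x cutoff mirrors JOURNAL_MAX_TRANS_AGE * 4; toy code, not the reiserfs API):

    #include <stdio.h>

    #define NLISTS 5

    /* oldest first, as on j_journal_list */
    static long timestamps[NLISTS] = { 100, 200, 900, 950, 1000 };

    /* flush everything older than the cutoff; the first young list
     * ends the walk, since everything after it is younger */
    static void toy_flush_old_lists(long now, long max_age)
    {
        for (int i = 0; i < NLISTS; i++) {
            if (timestamps[i] < now - 4 * max_age)
                printf("flush list %d (age %ld)\n", i, now - timestamps[i]);
            else
                break;
        }
    }

    int main(void)
    {
        toy_flush_old_lists(1000, 100);   /* flushes lists 0 and 1 only */
        return 0;
    }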
3963 /*
3964 ** long and ugly. If flush, will not return until all commit
3965 ** blocks and all real buffers in the trans are on disk.
3966@@ -2990,18 +3561,30 @@
3967 struct buffer_head *c_bh ; /* commit bh */
3968 struct buffer_head *d_bh ; /* desc bh */
3969 int cur_write_start = 0 ; /* start index of current log write */
3970- int cur_blocks_left = 0 ; /* number of journal blocks left to write */
3971 int old_start ;
3972 int i ;
3973- int jindex ;
3974- int orig_jindex ;
3975 int flush = flags & FLUSH_ALL ;
3976 int commit_now = flags & COMMIT_NOW ;
3977 int wait_on_commit = flags & WAIT ;
3978 struct reiserfs_super_block *rs ;
3979+ struct reiserfs_journal_list *jl, *temp_jl;
3980+ struct list_head *entry, *safe;
3981+ int wakeup_kreiserfsd = 0;
3982+ unsigned long jindex;
3983+ unsigned long commit_trans_id;
3984+
3985+ if (th->t_refcount > 1)
3986+ BUG() ;
3987
3988+ reiserfs_check_lock_depth("journal end");
3989+ current->journal_info = th->t_handle_save;
3990 if (reiserfs_dont_log(th->t_super)) {
3991- return 0 ;
3992+ goto out ;
3993+ }
3994+
3995+ if (SB_JOURNAL(p_s_sb)->j_len == 0) {
3996+ reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3997+ journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3998 }
3999
4000 lock_journal(p_s_sb) ;
4001@@ -3018,7 +3601,9 @@
4002 ** it tells us if we should continue with the journal_end, or just return
4003 */
4004 if (!check_journal_end(th, p_s_sb, nblocks, flags)) {
4005- return 0 ;
4006+ p_s_sb->s_dirt = 1;
4007+ wake_queued_writers(p_s_sb);
4008+ goto out ;
4009 }
4010
4011 /* check_journal_end might set these, check again */
4012@@ -3037,8 +3622,11 @@
4013 }
4014
4015 #ifdef REISERFS_PREALLOCATE
4016+ /* quota ops might need to nest, setup the journal_info pointer for them */
4017+ current->journal_info = th ;
4018 reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into
4019 * the transaction */
4020+ current->journal_info = th->t_handle_save ;
4021 #endif
4022
4023 rs = SB_DISK_SUPER_BLOCK(p_s_sb) ;
4024@@ -3059,25 +3647,23 @@
4025 mark_buffer_uptodate(c_bh, 1) ;
4026
4027 /* init this journal list */
4028- atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_older_commits_done), 0) ;
4029- SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
4030- SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ;
4031- SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_bh = c_bh ;
4032- SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_start = SB_JOURNAL(p_s_sb)->j_start ;
4033- SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len = SB_JOURNAL(p_s_sb)->j_len ;
4034- atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_nonzerolen), SB_JOURNAL(p_s_sb)->j_len) ;
4035- atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_left), SB_JOURNAL(p_s_sb)->j_len + 2);
4036- SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = NULL ;
4037- atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
4038- atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
4039-
4040- /* which is faster, locking/unlocking at the start and end of the for
4041- ** or locking once per iteration around the insert_journal_hash?
4042- ** eitherway, we are write locking insert_journal_hash. The ENTIRE FOR
4043- ** LOOP MUST not cause schedule to occur.
4044- */
4045+ jl = SB_JOURNAL(p_s_sb)->j_current_jl;
4046+
4047+ /* save the transaction id in case we need to commit it later */
4048+ commit_trans_id = jl->j_trans_id;
4049
4050- /* for each real block, add it to the journal list hash,
4051+ atomic_set(&jl->j_older_commits_done, 0) ;
4052+ jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
4053+ jl->j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ;
4054+ jl->j_commit_bh = c_bh ;
4055+ jl->j_start = SB_JOURNAL(p_s_sb)->j_start ;
4056+ jl->j_len = SB_JOURNAL(p_s_sb)->j_len ;
4057+ atomic_set(&jl->j_nonzerolen, SB_JOURNAL(p_s_sb)->j_len) ;
4058+ atomic_set(&jl->j_commit_left, SB_JOURNAL(p_s_sb)->j_len + 2);
4059+ jl->j_realblock = NULL ;
4060+
4061+ /* The ENTIRE FOR LOOP MUST not cause schedule to occur.
4062+ ** for each real block, add it to the journal list hash,
4063 ** copy into real block index array in the commit or desc block
4064 */
4065 for (i = 0, cn = SB_JOURNAL(p_s_sb)->j_first ; cn ; cn = cn->next, i++) {
4066@@ -3087,7 +3673,7 @@
4067 reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ;
4068 }
4069 if (i == 0) {
4070- SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = jl_cn ;
4071+ jl->j_realblock = jl_cn ;
4072 }
4073 jl_cn->prev = last_cn ;
4074 jl_cn->next = NULL ;
4075@@ -3105,7 +3691,7 @@
4076 jl_cn->state = 0 ;
4077 jl_cn->dev = cn->bh->b_dev ;
4078 jl_cn->bh = cn->bh ;
4079- jl_cn->jlist = SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb) ;
4080+ jl_cn->jlist = jl;
4081 insert_journal_hash(SB_JOURNAL(p_s_sb)->j_list_hash_table, jl_cn) ;
4082 if (i < JOURNAL_TRANS_HALF) {
4083 desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ;
4084@@ -3130,29 +3716,34 @@
4085 reiserfs_warning(p_s_sb, "journal-2020: do_journal_end: BAD desc->j_len is ZERO\n") ;
4086 atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
4087 wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
4088- return 0 ;
4089+ goto out ;
4090 }
4091
4092 /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */
4093 cur_write_start = SB_JOURNAL(p_s_sb)->j_start ;
4094- cur_blocks_left = SB_JOURNAL(p_s_sb)->j_len ;
4095 cn = SB_JOURNAL(p_s_sb)->j_first ;
4096 jindex = 1 ; /* start at one so we don't get the desc again */
4097- while(cur_blocks_left > 0) {
4098+ while(cn) {
4099+ clear_bit(BH_JNew, &(cn->bh->b_state)) ;
4100 /* copy all the real blocks into log area. dirty log blocks */
4101 if (test_bit(BH_JDirty, &cn->bh->b_state)) {
4102 struct buffer_head *tmp_bh ;
4103 tmp_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
4104 ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
4105 mark_buffer_uptodate(tmp_bh, 1) ;
4106- memcpy(tmp_bh->b_data, cn->bh->b_data, cn->bh->b_size) ;
4107+ memcpy(tmp_bh->b_data, bh_kmap(cn->bh), cn->bh->b_size) ;
4108+ bh_kunmap(cn->bh);
4109 jindex++ ;
4110+ set_bit(BH_JDirty_wait, &(cn->bh->b_state)) ;
4111+ clear_bit(BH_JDirty, &(cn->bh->b_state)) ;
4112 } else {
4113 /* JDirty cleared sometime during transaction. don't log this one */
4114 reiserfs_warning(p_s_sb, "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ;
4115+ brelse(cn->bh) ;
4116 }
4117- cn = cn->next ;
4118- cur_blocks_left-- ;
4119+ next = cn->next ;
4120+ free_cnode(p_s_sb, cn) ;
4121+ cn = next ;
4122 }
4123
4124 /* we are done with both the c_bh and d_bh, but
4125@@ -3160,47 +3751,19 @@
4126 ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
4127 */
4128
4129- /* now loop through and mark all buffers from this transaction as JDirty_wait
4130- ** clear the JDirty bit, clear BH_JNew too.
4131- ** if they weren't JDirty, they weren't logged, just relse them and move on
4132- */
4133- cn = SB_JOURNAL(p_s_sb)->j_first ;
4134- while(cn) {
4135- clear_bit(BH_JNew, &(cn->bh->b_state)) ;
4136- if (test_bit(BH_JDirty, &(cn->bh->b_state))) {
4137- set_bit(BH_JDirty_wait, &(cn->bh->b_state)) ;
4138- clear_bit(BH_JDirty, &(cn->bh->b_state)) ;
4139- } else {
4140- brelse(cn->bh) ;
4141- }
4142- next = cn->next ;
4143- free_cnode(p_s_sb, cn) ;
4144- cn = next ;
4145- }
4146-
4147- /* unlock the journal list for committing and flushing */
4148- atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 0) ;
4149- atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 0) ;
4150-
4151- orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ;
4152- jindex = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ;
4153- SB_JOURNAL_LIST_INDEX(p_s_sb) = jindex ;
4154+ SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
4155
4156- /* write any buffers that must hit disk before this commit is done */
4157- fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
4158+ /* we lock the commit before putting it onto the main list because
4159+ * we want to make sure nobody tries to run flush_commit_list until
4160+ * the new transaction is fully set up, and we've already flushed the
4161+ * ordered bh list
4162+ */
4163+ down(&jl->j_commit_lock);
4164
4165- /* honor the flush and async wishes from the caller */
4166- if (flush) {
4167-
4168- flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
4169- flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex , 1) ;
4170- } else if (commit_now) {
4171- if (wait_on_commit) {
4172- flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
4173- } else {
4174- commit_flush_async(p_s_sb, orig_jindex) ;
4175- }
4176- }
4177+ /* now it is safe to insert this transaction on the main list */
4178+ list_add_tail(&jl->j_list, &SB_JOURNAL(p_s_sb)->j_journal_list);
4179+ list_add_tail(&jl->j_working_list, &SB_JOURNAL(p_s_sb)->j_working_list);
4180+ SB_JOURNAL(p_s_sb)->j_num_work_lists++;
4181
4182 /* reset journal values for the next transaction */
4183 old_start = SB_JOURNAL(p_s_sb)->j_start ;
4184@@ -3212,57 +3775,119 @@
4185 SB_JOURNAL(p_s_sb)->j_len = 0 ;
4186 SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ;
4187 SB_JOURNAL(p_s_sb)->j_trans_id++ ;
4188+ SB_JOURNAL(p_s_sb)->j_current_jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id;
4189 SB_JOURNAL(p_s_sb)->j_must_wait = 0 ;
4190 SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ;
4191 SB_JOURNAL(p_s_sb)->j_next_full_flush = 0 ;
4192 SB_JOURNAL(p_s_sb)->j_next_async_flush = 0 ;
4193 init_journal_hash(p_s_sb) ;
4194
4195+ /* tail conversion targets have to hit the disk before we end the
4196+ * transaction. Otherwise a later transaction might repack the tail
4197+ * before this transaction commits, leaving the data block unflushed and
4198+ * clean; if we crash before the later transaction commits, the data block
4199+ * is lost.
4200+ */
4201+ while(!list_empty(&jl->j_tail_bh_list)) {
4202+ unlock_kernel();
4203+ fsync_buffers_list(&jl->j_tail_bh_list);
4204+ lock_kernel();
4205+ }
4206+ up(&jl->j_commit_lock);
4207+
4208+ /* honor the flush wishes from the caller; simple commits can
4209+ ** be done outside the journal lock, and they are done below
4210+ */
4211+ if (flush) {
4212+ flush_commit_list(p_s_sb, jl, 1) ;
4213+ flush_journal_list(p_s_sb, jl, 1) ;
4214+ }
4215+
4216+
4217 /* if the next transaction has any chance of wrapping, flush
4218 ** transactions that might get overwritten. If any journal lists are very
4219 ** old, flush them as well.
4220 */
4221- for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
4222- jindex = i ;
4223- if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && SB_JOURNAL(p_s_sb)->j_start <= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
4224- if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
4225- flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ;
4226- }
4227- } else if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 &&
4228- (SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
4229- if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >=
4230- SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
4231- flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ;
4232+first_jl:
4233+ list_for_each_safe(entry, safe, &SB_JOURNAL(p_s_sb)->j_journal_list) {
4234+ temp_jl = JOURNAL_LIST_ENTRY(entry);
4235+ if (SB_JOURNAL(p_s_sb)->j_start <= temp_jl->j_start) {
4236+ if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >=
4237+ temp_jl->j_start)
4238+ {
4239+ flush_used_journal_lists(p_s_sb, temp_jl);
4240+ wakeup_kreiserfsd = 1;
4241+ goto first_jl;
4242+ } else if ((SB_JOURNAL(p_s_sb)->j_start +
4243+ SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) <
4244+ SB_ONDISK_JOURNAL_SIZE(p_s_sb))
4245+ {
4246+ /* if we don't cross into the next transaction and we don't
4247+ * wrap, there is no way we can overlap any later transactions;
4248+ * break now
4249+ */
4250+ break;
4251+ }
4252+ } else if ((SB_JOURNAL(p_s_sb)->j_start +
4253+ SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >
4254+ SB_ONDISK_JOURNAL_SIZE(p_s_sb))
4255+ {
4256+ if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) %
4257+ SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start)
4258+ {
4259+ flush_used_journal_lists(p_s_sb, temp_jl);
4260+ wakeup_kreiserfsd = 1;
4261+ goto first_jl;
4262+ } else {
4263+ /* we don't overlap anything from our start to the end of the
4264+ * log, and our wrapped portion doesn't overlap anything at
4265+ * the start of the log. We can break
4266+ */
4267+ break;
4268 }
4269- }
4270- /* this check should always be run, to send old lists to disk */
4271- if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 &&
4272- SB_JOURNAL_LIST(p_s_sb)[jindex].j_timestamp <
4273- (CURRENT_TIME - (SB_JOURNAL_MAX_TRANS_AGE(p_s_sb) * 4))) {
4274- flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ;
4275 }
4276 }
4277+ flush_old_journal_lists(p_s_sb);
4278
4279- /* if the next journal_list is still in use, flush it */
4280- if (SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len != 0) {
4281- flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb), 1) ;
4282- }
4283+ /* soft limit */
4284+ if (SB_JOURNAL(p_s_sb)->j_num_work_lists > 128 || wakeup_kreiserfsd) {
4285+ wake_up(&reiserfs_commit_thread_wait) ;
4286+ }
4287
4288- /* we don't want anyone flushing the new transaction's list */
4289- atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
4290- atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
4291- SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb) +
4292- SB_JOURNAL_LIST_INDEX(p_s_sb)) ;
4293+ SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL(p_s_sb)->j_current_jl) ;
4294
4295- if (!(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap)) {
4296+ if (!(SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap)) {
4297 reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ;
4298 }
4299- unlock_journal(p_s_sb) ;
4300+
4301 atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
4302+ unlock_journal(p_s_sb) ;
4303 /* wake up any body waiting to join. */
4304+ clear_bit(WRITERS_QUEUED, &SB_JOURNAL(p_s_sb)->j_state);
4305 wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
4306+
4307+ if (!flush && commit_now && wait_on_commit) {
4308+ if (current->need_resched) {
4309+ schedule() ;
4310+ }
4311+ if (journal_list_still_alive(p_s_sb, commit_trans_id))
4312+ flush_commit_list(p_s_sb, jl, 1) ;
4313+ }
4314+ /* if we did an async commit, get kreiserfsd going on it */
4315+ if (!commit_now && !wait_on_commit) {
4316+ wake_up(&reiserfs_commit_thread_wait) ;
4317+ schedule();
4318+ }
4319+out:
4320+ reiserfs_check_lock_depth("journal end2");
4321+ if (reiserfs_persistent_handle(th)) {
4322+ memset(th, 0, sizeof(*th));
4323+ reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), p_s_sb) ;
4324+ } else
4325+ th->t_flags = 0 ;
4326 return 0 ;
4327 }
4328
4329-
4330-
4331+int __init reiserfs_journal_cache_init(void) {
4332+ return 0;
4333+}
4334diff -urN linux-2.4.22.org/fs/reiserfs/Makefile linux-2.4.22/fs/reiserfs/Makefile
4335--- linux-2.4.22.org/fs/reiserfs/Makefile 2003-11-21 15:08:29.000000000 +0100
4336+++ linux-2.4.22/fs/reiserfs/Makefile 2003-11-21 15:14:23.000000000 +0100
4337@@ -7,6 +7,7 @@
4338 #
4339 # Note 2! The CFLAGS definitions are now in the main makefile...
4340
4341+export-objs := super.o
4342 O_TARGET := reiserfs.o
4343 obj-y := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o super.o prints.o objectid.o \
4344 lbalance.o ibalance.o stree.o hashes.o buffer2.o tail_conversion.o journal.o resize.o item_ops.o ioctl.o procfs.o
4345diff -urN linux-2.4.22.org/fs/reiserfs/namei.c linux-2.4.22/fs/reiserfs/namei.c
4346--- linux-2.4.22.org/fs/reiserfs/namei.c 2003-11-21 15:08:29.000000000 +0100
4347+++ linux-2.4.22/fs/reiserfs/namei.c 2003-11-21 15:14:23.000000000 +0100
4348@@ -7,6 +7,7 @@
4349 #include <linux/bitops.h>
4350 #include <linux/reiserfs_fs.h>
4351 #include <linux/smp_lock.h>
4352+#include <linux/quotaops.h>
4353
4354 #define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { i->i_nlink++; if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
4355 #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) i->i_nlink--;
4356@@ -469,7 +470,7 @@
4357 }
4358
4359 /* perform the insertion of the entry that we have prepared */
4360- retval = reiserfs_paste_into_item (th, &path, &entry_key, buffer, paste_size);
4361+ retval = reiserfs_paste_into_item (th, &path, &entry_key, dir, buffer, paste_size);
4362 if (buffer != small_buf)
4363 reiserfs_kfree (buffer, buflen, dir->i_sb);
4364 if (retval) {
4365@@ -478,7 +479,6 @@
4366 }
4367
4368 dir->i_size += paste_size;
4369- dir->i_blocks = ((dir->i_size + 511) >> 9);
4370 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
4371 if (!S_ISDIR (inode->i_mode) && visible)
4372 // reiserfs_mkdir or reiserfs_rename will do that by itself
4373@@ -494,7 +494,9 @@
4374 ** inserted into the tree yet.
4375 */
4376 static int drop_new_inode(struct inode *inode) {
4377+ DQUOT_DROP(inode);
4378 make_bad_inode(inode) ;
4379+ inode->i_flags |= S_NOQUOTA;
4380 iput(inode) ;
4381 return 0 ;
4382 }
4383@@ -518,6 +520,11 @@
4384 } else
4385 inode->i_gid = current->fsgid;
4386
4387+ DQUOT_INIT(inode);
4388+ if (DQUOT_ALLOC_INODE(inode)) {
4389+ drop_new_inode(inode);
4390+ return -EDQUOT;
4391+ }
4392 return 0 ;
4393 }
4394
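The hunk above charges a newly created inode to its owner's quota as soon as the uid/gid are settled: DQUOT_INIT() attaches the dquots, DQUOT_ALLOC_INODE() does the charge, and on failure drop_new_inode() releases the quota reference (via DQUOT_DROP) before the bad inode is put. The shape of that error path, modeled with a toy one-user inode counter; the limit and names are made up for the example:

    #include <stdio.h>

    /* toy single-user quota: two inodes allowed */
    static int inodes_used;
    static const int inodes_limit = 2;

    /* stand-in for DQUOT_ALLOC_INODE(): charge one inode or refuse */
    static int toy_alloc_inode(void)
    {
        if (inodes_used >= inodes_limit)
            return -1;
        inodes_used++;
        return 0;
    }

    /* charge right after ownership is known; fail the create
     * (-EDQUOT in the patch) when the charge is refused */
    static int toy_new_inode_init(void)
    {
        if (toy_alloc_inode())
            return -1;    /* the drop_new_inode() path */
        return 0;
    }

    int main(void)
    {
        for (int i = 0; i < 3; i++)
            printf("create #%d -> %d\n", i, toy_new_inode_init());
        return 0;
    }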
4395@@ -536,7 +543,6 @@
4396 return retval ;
4397
4398 journal_begin(&th, dir->i_sb, jbegin_count) ;
4399- th.t_caller = "create" ;
4400 retval = reiserfs_new_inode (&th, dir, mode, 0, 0/*i_size*/, dentry, inode);
4401 if (retval) {
4402 goto out_failed ;
4403@@ -750,7 +756,6 @@
4404
4405 DEC_DIR_INODE_NLINK(dir)
4406 dir->i_size -= (DEH_SIZE + de.de_entrylen);
4407- dir->i_blocks = ((dir->i_size + 511) >> 9);
4408 reiserfs_update_sd (&th, dir);
4409
4410 /* prevent empty directory from getting lost */
4411@@ -835,7 +840,6 @@
4412 reiserfs_update_sd (&th, inode);
4413
4414 dir->i_size -= (de.de_entrylen + DEH_SIZE);
4415- dir->i_blocks = ((dir->i_size + 511) >> 9);
4416 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
4417 reiserfs_update_sd (&th, dir);
4418
4419@@ -1245,7 +1249,6 @@
4420 reiserfs_warning ((&th)->t_super, "vs-7060: reiserfs_rename: could not cut old name. Fsck later?\n");
4421
4422 old_dir->i_size -= DEH_SIZE + old_de.de_entrylen;
4423- old_dir->i_blocks = ((old_dir->i_size + 511) >> 9);
4424
4425 reiserfs_update_sd (&th, old_dir);
4426 reiserfs_update_sd (&th, new_dir);
4427diff -urN linux-2.4.22.org/fs/reiserfs/objectid.c linux-2.4.22/fs/reiserfs/objectid.c
4428--- linux-2.4.22.org/fs/reiserfs/objectid.c 2003-11-21 15:08:29.000000000 +0100
4429+++ linux-2.4.22/fs/reiserfs/objectid.c 2003-11-21 15:14:23.000000000 +0100
4430@@ -87,7 +87,6 @@
4431 }
4432
4433 journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s));
4434- s->s_dirt = 1;
4435 return unused_objectid;
4436 }
4437
4438@@ -106,8 +105,6 @@
4439
4440 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
4441 journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s));
4442- s->s_dirt = 1;
4443-
4444
4445 /* start at the beginning of the objectid map (i = 0) and go to
4446 the end of it (i = disk_sb->s_oid_cursize). Linear search is
4447diff -urN linux-2.4.22.org/fs/reiserfs/procfs.c linux-2.4.22/fs/reiserfs/procfs.c
4448--- linux-2.4.22.org/fs/reiserfs/procfs.c 2003-11-21 15:08:29.000000000 +0100
4449+++ linux-2.4.22/fs/reiserfs/procfs.c 2003-11-21 15:14:24.000000000 +0100
4450@@ -497,7 +497,6 @@
4451 "j_first_unflushed_offset: \t%lu\n"
4452 "j_last_flush_trans_id: \t%lu\n"
4453 "j_trans_start_time: \t%li\n"
4454- "j_journal_list_index: \t%i\n"
4455 "j_list_bitmap_index: \t%i\n"
4456 "j_must_wait: \t%i\n"
4457 "j_next_full_flush: \t%i\n"
4458@@ -543,7 +542,6 @@
4459 JF( j_first_unflushed_offset ),
4460 JF( j_last_flush_trans_id ),
4461 JF( j_trans_start_time ),
4462- JF( j_journal_list_index ),
4463 JF( j_list_bitmap_index ),
4464 JF( j_must_wait ),
4465 JF( j_next_full_flush ),
052932c9
AM
4466diff -urN linux-2.4.22.org/fs/reiserfs/stree.c linux-2.4.22/fs/reiserfs/stree.c
4467--- linux-2.4.22.org/fs/reiserfs/stree.c 2003-11-21 15:08:29.000000000 +0100
4468+++ linux-2.4.22/fs/reiserfs/stree.c 2003-11-21 15:14:25.000000000 +0100
4469@@ -60,6 +60,7 @@
4470 #include <linux/pagemap.h>
4471 #include <linux/reiserfs_fs.h>
4472 #include <linux/smp_lock.h>
4473+#include <linux/quotaops.h>
4474
4475 /* Does the buffer contain a disk block which is in the tree. */
4476 inline int B_IS_IN_TREE (const struct buffer_head * p_s_bh)
4477@@ -71,9 +72,6 @@
4478 return ( B_LEVEL (p_s_bh) != FREE_LEVEL );
4479 }
4480
4481-
4482-
4483-
4484 inline void copy_short_key (void * to, const void * from)
4485 {
4486 memcpy (to, from, SHORT_KEY_SIZE);
4487@@ -652,9 +650,9 @@
4488 stop at leaf level - set to
4489 DISK_LEAF_NODE_LEVEL */
4490 ) {
4491- int n_block_number = SB_ROOT_BLOCK (p_s_sb),
4492- expected_level = SB_TREE_HEIGHT (p_s_sb),
4493- n_block_size = p_s_sb->s_blocksize;
4494+ int n_block_number,
4495+ expected_level,
4496+ n_block_size = p_s_sb->s_blocksize;
4497 struct buffer_head * p_s_bh;
4498 struct path_element * p_s_last_element;
4499 int n_node_level, n_retval;
4500@@ -678,8 +676,11 @@
4501 /* With each iteration of this loop we search through the items in the
4502 current node, and calculate the next current node(next path element)
4503 for the next iteration of this loop.. */
4504+ n_block_number = SB_ROOT_BLOCK (p_s_sb);
4505+ expected_level = SB_TREE_HEIGHT (p_s_sb);
4506 while ( 1 ) {
4507
4508+ reiserfs_check_lock_depth("search_by_key");
4509 #ifdef CONFIG_REISERFS_CHECK
4510 if ( !(++n_repeat_counter % 50000) )
4511 reiserfs_warning (p_s_sb, "PAP-5100: search_by_key: %s:"
4512@@ -1123,8 +1124,7 @@
4513 tmp = get_block_num(p_n_unfm_pointer,0);
4514 put_block_num(p_n_unfm_pointer, 0, 0);
4515 journal_mark_dirty (th, p_s_sb, p_s_bh);
4516- inode->i_blocks -= p_s_sb->s_blocksize / 512;
4517- reiserfs_free_block(th, tmp);
4518+ reiserfs_free_block(th, inode, tmp, 1);
4519 /* In case of big fragmentation it is possible that each block
4520 freed will cause dirtying of one more bitmap and then we will
4521 quickly overflow our transaction space. This is a
4522@@ -1132,9 +1132,7 @@
4523 if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
4524 int orig_len_alloc = th->t_blocks_allocated ;
4525 pathrelse(p_s_path) ;
4526-
4527- journal_end(th, p_s_sb, orig_len_alloc) ;
4528- journal_begin(th, p_s_sb, orig_len_alloc) ;
4529+ reiserfs_restart_transaction(th, orig_len_alloc);
4530 reiserfs_update_inode_transaction(inode) ;
4531 need_research = 1;
4532 break;
4533@@ -1168,8 +1166,7 @@
4534 }
4535 }
4536
4537-
4538-/* Calculate bytes number which will be deleted or cutted in the balance. */
4539+/* Calculate number of bytes which will be deleted or cut during balance */
4540 int calc_deleted_bytes_number(
4541 struct tree_balance * p_s_tb,
4542 char c_mode
4543@@ -1180,14 +1177,14 @@
4544 if ( is_statdata_le_ih (p_le_ih) )
4545 return 0;
4546
4547+ n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0];
4548 if ( is_direntry_le_ih (p_le_ih) ) {
4549 // return EMPTY_DIR_SIZE; /* We delete empty directories only. */
4550 // we can't use EMPTY_DIR_SIZE, as old format dirs have a different
4551 // empty size. ick. FIXME, is this right?
4552 //
4553- return ih_item_len(p_le_ih);
4554+ return n_del_size ;
4555 }
4556- n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0];
4557
4558 if ( is_indirect_le_ih (p_le_ih) )
4559 n_del_size = (n_del_size/UNFM_P_SIZE)*
4560@@ -1221,17 +1218,46 @@
4561 item [--i] = 0;
4562 }
4563
4564+#ifdef REISERQUOTA_DEBUG
4565+char key2type(struct key *ih)
4566+{
4567+ if (is_direntry_le_key(2, ih))
4568+ return 'd';
4569+ if (is_direct_le_key(2, ih))
4570+ return 'D';
4571+ if (is_indirect_le_key(2, ih))
4572+ return 'i';
4573+ if (is_statdata_le_key(2, ih))
4574+ return 's';
4575+ return 'u';
4576+}
4577+
4578+char head2type(struct item_head *ih)
4579+{
4580+ if (is_direntry_le_ih(ih))
4581+ return 'd';
4582+ if (is_direct_le_ih(ih))
4583+ return 'D';
4584+ if (is_indirect_le_ih(ih))
4585+ return 'i';
4586+ if (is_statdata_le_ih(ih))
4587+ return 's';
4588+ return 'u';
4589+}
4590+#endif
4591
4592 /* Delete object item. */
4593 int reiserfs_delete_item (struct reiserfs_transaction_handle *th,
4594 struct path * p_s_path, /* Path to the deleted item. */
4595 const struct cpu_key * p_s_item_key, /* Key to search for the deleted item. */
4596- struct inode * p_s_inode,/* inode is here just to update i_blocks */
4597+ struct inode * p_s_inode,/* inode is here just to update i_blocks and quotas */
4598 struct buffer_head * p_s_un_bh) /* NULL or unformatted node pointer. */
4599 {
4600 struct super_block * p_s_sb = p_s_inode->i_sb;
4601 struct tree_balance s_del_balance;
4602 struct item_head s_ih;
4603+ struct item_head *q_ih;
4604+ int quota_cut_bytes;
4605 int n_ret_value,
4606 n_del_size,
4607 n_removed;
4608@@ -1281,6 +1307,22 @@
4609
4610 // reiserfs_delete_item returns item length when success
4611 n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
4612+ q_ih = get_ih(p_s_path) ;
4613+ quota_cut_bytes = ih_item_len(q_ih) ;
4614+
4615+ /* hack so the quota code doesn't have to guess if the file
4616+ ** has a tail. On tail insert, we allocate quota for 1 unformatted node.
4617+ ** We test the offset because the tail might have been
4618+ ** split into multiple items, and we only want to decrement for
4619+ ** the unfm node once
4620+ */
4621+ if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(q_ih)) {
4622+ if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) {
4623+ quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE;
4624+ } else {
4625+ quota_cut_bytes = 0 ;
4626+ }
4627+ }
4628
4629 if ( p_s_un_bh ) {
4630 int off;
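The quota hack above keeps tail accounting simple: when a direct (tail) item is deleted, only the piece whose key offset lands on byte 1 of a block gives back a whole unformatted node's worth of quota (blocksize plus one pointer), and other pieces of a split tail give back nothing, so the node is decremented exactly once. The offset test in isolation, assuming a 4-byte on-disk block pointer for UNFM_P_SIZE:

    #include <stdio.h>

    #define UNFM_P_SIZE 4   /* assumed size of one unformatted-node pointer */

    /* quota returned for one deleted tail piece */
    static unsigned toy_tail_cut_bytes(unsigned long offset, unsigned blocksize)
    {
        if ((offset & (blocksize - 1)) == 1)
            return blocksize + UNFM_P_SIZE;   /* first piece of the tail */
        return 0;                             /* later split-off pieces */
    }

    int main(void)
    {
        printf("%u %u\n",
               toy_tail_cut_bytes(4097, 4096),    /* 4100 */
               toy_tail_cut_bytes(4200, 4096));   /* 0 */
        return 0;
    }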
4631@@ -1312,10 +1354,14 @@
4632 memcpy(data + off,
4633 B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), n_ret_value);
4634 }
4635-
4636 /* Perform balancing after all resources have been collected at once. */
4637 do_balance(&s_del_balance, NULL, NULL, M_DELETE);
4638
4639+#ifdef REISERQUOTA_DEBUG
4640+ printk(KERN_DEBUG "reiserquota delete_item(): freeing %u, id=%u type=%c\n", quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih));
4641+#endif
4642+ DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
4643+
4644 /* Return deleted body length */
4645 return n_ret_value;
4646 }
4647@@ -1340,14 +1386,16 @@
4648
4649 /* this deletes item which never gets split */
4650 void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th,
4651+ struct inode *inode,
4652 struct key * key)
4653 {
4654 struct tree_balance tb;
4655 INITIALIZE_PATH (path);
4656- int item_len;
4657+ int item_len = 0;
4658 int tb_init = 0 ;
4659 struct cpu_key cpu_key;
4660 int retval;
4661+ int quota_cut_bytes = 0;
4662
4663 le_key2cpu_key (&cpu_key, key);
4664
4665@@ -1371,6 +1419,7 @@
4666 item_len = ih_item_len( PATH_PITEM_HEAD(&path) );
4667 init_tb_struct (th, &tb, th->t_super, &path, - (IH_SIZE + item_len));
4668 }
4669+ quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)) ;
4670
4671 retval = fix_nodes (M_DELETE, &tb, NULL, 0);
4672 if (retval == REPEAT_SEARCH) {
4673@@ -1380,6 +1429,12 @@
4674
4675 if (retval == CARRY_ON) {
4676 do_balance (&tb, 0, 0, M_DELETE);
4677+ if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */
4678+#ifdef REISERQUOTA_DEBUG
4679+ printk(KERN_DEBUG "reiserquota delete_solid_item(): freeing %u id=%u type=%c\n", quota_cut_bytes, inode->i_uid, key2type(key));
4680+#endif
4681+ DQUOT_FREE_SPACE_NODIRTY(inode, quota_cut_bytes);
4682+ }
4683 break;
4684 }
4685
4686@@ -1412,7 +1467,7 @@
4687 }
4688 /* USE_INODE_GENERATION_COUNTER */
4689 #endif
4690- reiserfs_delete_solid_item (th, INODE_PKEY (inode));
4691+ reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode));
4692 }
4693
4694
4695@@ -1484,6 +1539,38 @@
4696 mark_inode_dirty (inode);
4697 }
4698
4699+static void
4700+unmap_buffers(struct page *page, loff_t pos) {
4701+ struct buffer_head *bh ;
4702+ struct buffer_head *head ;
4703+ struct buffer_head *next ;
4704+ unsigned long tail_index ;
4705+ unsigned long cur_index ;
4706+
4707+ if (!page || !page->buffers)
4708+ return;
4709+
4710+ tail_index = pos & (PAGE_CACHE_SIZE - 1) ;
4711+ cur_index = 0 ;
4712+ head = page->buffers ;
4713+ bh = head ;
4714+ do {
4715+ next = bh->b_this_page ;
4716+
4717+ /* we want to unmap the buffers that contain the tail, and
4718+ ** all the buffers after it (since the tail must be at the
4719+ ** end of the file). We don't want to unmap file data
4720+ ** before the tail, since it might be dirty and waiting to
4721+ ** reach disk
4722+ */
4723+ cur_index += bh->b_size ;
4724+ if (cur_index > tail_index) {
4725+ reiserfs_unmap_buffer(bh) ;
4726+ }
4727+ bh = next ;
4728+ } while (bh != head) ;
4729+}
4730+
4731
4732 /* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
4733 int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th,
4734@@ -1499,12 +1586,15 @@
4735 structure by using the init_tb_struct and fix_nodes functions.
4736 After that we can make tree balancing. */
4737 struct tree_balance s_cut_balance;
4738+ struct item_head *p_le_ih;
4739+ loff_t tail_pos = 0;
4740 int n_cut_size = 0, /* Amount to be cut. */
4741 n_ret_value = CARRY_ON,
4742 n_removed = 0, /* Number of the removed unformatted nodes. */
4743 n_is_inode_locked = 0;
4744 char c_mode; /* Mode of the balance. */
4745 int retval2 = -1;
4746+ int quota_cut_bytes;
4747
4748
4749 init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, n_cut_size);
4750@@ -1531,6 +1621,9 @@
4751 /* tail has been left in the unformatted node */
4752 return n_ret_value;
4753
4754+ if (n_is_inode_locked) {
4755+ printk("inode locked twice\n");
4756+ }
4757 n_is_inode_locked = 1;
4758
4759 /* removing of last unformatted node will change value we
4760@@ -1545,6 +1638,7 @@
4761 set_cpu_key_k_type (p_s_item_key, TYPE_INDIRECT);
4762 p_s_item_key->key_length = 4;
4763 n_new_file_size -= (n_new_file_size & (p_s_sb->s_blocksize - 1));
4764+ tail_pos = n_new_file_size;
4765 set_cpu_key_k_offset (p_s_item_key, n_new_file_size + 1);
4766 if ( search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND ){
4767 print_block (PATH_PLAST_BUFFER (p_s_path), 3, PATH_LAST_POSITION (p_s_path) - 1, PATH_LAST_POSITION (p_s_path) + 1);
4768@@ -1592,23 +1686,27 @@
4769 RFALSE( c_mode == M_PASTE || c_mode == M_INSERT, "illegal mode");
4770
4771 /* Calculate number of bytes that need to be cut from the item. */
4772+ quota_cut_bytes = ( c_mode == M_DELETE ) ? ih_item_len(get_ih(p_s_path)) : -s_cut_balance.insert_size[0];
4773 if (retval2 == -1)
4774 n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode);
4775 else
4776 n_ret_value = retval2;
4777-
4778- if ( c_mode == M_DELETE ) {
4779- struct item_head * p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
4780-
4781- if ( is_direct_le_ih (p_le_ih) && (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) {
4782- /* we delete first part of tail which was stored in direct
4783- item(s) */
4784+
4785+
4786+ /* For direct items, we only change the quota when deleting the last
4787+ ** item.
4788+ */
4789+ p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
4790+ if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) {
4791+ if (c_mode == M_DELETE &&
4792+ (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) {
4793 // FIXME: this is to keep 3.5 happy
4794 p_s_inode->u.reiserfs_i.i_first_direct_byte = U32_MAX;
4795- p_s_inode->i_blocks -= p_s_sb->s_blocksize / 512;
4796+ quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE ;
4797+ } else {
4798+ quota_cut_bytes = 0 ;
4799 }
4800 }
4801-
4802 #ifdef CONFIG_REISERFS_CHECK
4803 if (n_is_inode_locked) {
4804 struct item_head * le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
4805@@ -1642,7 +1740,12 @@
4806 ** deal with it here.
4807 */
4808 p_s_inode->u.reiserfs_i.i_flags &= ~i_pack_on_close_mask;
4809+ unmap_buffers(page, tail_pos);
4810 }
4811+#ifdef REISERQUOTA_DEBUG
4812+ printk(KERN_DEBUG "reiserquota cut_from_item(): freeing %u id=%u type=%c\n", quota_cut_bytes, p_s_inode->i_uid, '?');
4813+#endif
4814+ DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
4815 return n_ret_value;
4816 }
4817
4818@@ -1654,8 +1757,8 @@
4819
4820 set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), DOT_OFFSET);
4821 set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_DIRENTRY);
4822- reiserfs_delete_solid_item (th, INODE_PKEY (inode));
4823-
4824+ reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode));
4825+ reiserfs_update_sd(th, inode) ;
4826 set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), SD_OFFSET);
4827 set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_STAT_DATA);
4828 }
4829@@ -1681,6 +1784,7 @@
4830 n_new_file_size;/* New file size. */
4831 int n_deleted; /* Number of deleted or truncated bytes. */
4832 int retval;
4833+ int jbegin_count = th->t_blocks_allocated;
4834
4835 if ( ! (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode) || S_ISLNK(p_s_inode->i_mode)) )
4836 return;
4837@@ -1760,17 +1864,14 @@
4838 ** sure the file is consistent before ending the current trans
4839 ** and starting a new one
4840 */
4841- if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
4842- int orig_len_alloc = th->t_blocks_allocated ;
4843+ if (journal_transaction_should_end(th, jbegin_count)) {
4844 decrement_counters_in_path(&s_search_path) ;
4845
4846 if (update_timestamps) {
4847 p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME;
4848 }
4849 reiserfs_update_sd(th, p_s_inode) ;
4850-
4851- journal_end(th, p_s_inode->i_sb, orig_len_alloc) ;
4852- journal_begin(th, p_s_inode->i_sb, orig_len_alloc) ;
4853+ reiserfs_restart_transaction(th, jbegin_count) ;
4854 reiserfs_update_inode_transaction(p_s_inode) ;
4855 }
4856 } while ( n_file_size > ROUND_UP (n_new_file_size) &&
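This hunk folds the journal_end/journal_begin pair into reiserfs_restart_transaction, keeping the original jbegin_count so a long truncate is split across many small transactions instead of pinning one huge one. A toy model of what such a helper plausibly does (stub functions, not the kernel API):

#include <stdio.h>

struct handle { int blocks; unsigned long trans_id; };

static unsigned long next_trans_id = 1;

static void journal_begin(struct handle *th, int nblocks)
{
    th->blocks = nblocks;
    th->trans_id = next_trans_id++;
}

static void journal_end(struct handle *th) { th->blocks = 0; }

/* end the running transaction, immediately begin a fresh one with the
 * same reservation */
static void restart_transaction(struct handle *th, int jbegin_count)
{
    journal_end(th);
    journal_begin(th, jbegin_count);
}

int main(void)
{
    struct handle th;
    journal_begin(&th, 10);
    for (int chunk = 0; chunk < 3; chunk++)   /* one "chunk" of the truncate */
        restart_transaction(&th, 10);
    printf("finished in transaction %lu\n", th.trans_id);  /* prints 4 */
    return 0;
}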
4857@@ -1822,18 +1923,37 @@
4858 int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th,
4859 struct path * p_s_search_path, /* Path to the pasted item. */
4860 const struct cpu_key * p_s_key, /* Key to search for the needed item.*/
4861+ struct inode * inode, /* Inode item belongs to */
4862 const char * p_c_body, /* Pointer to the bytes to paste. */
4863 int n_pasted_size) /* Size of pasted bytes. */
4864 {
4865 struct tree_balance s_paste_balance;
4866 int retval;
4867+ int fs_gen;
4868+
4869+ fs_gen = get_generation(inode->i_sb) ;
4870+
4871+#ifdef REISERQUOTA_DEBUG
4872+ printk(KERN_DEBUG "reiserquota paste_into_item(): allocating %u id=%u type=%c\n", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key)));
4873+#endif
4874
4875+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) {
4876+ pathrelse(p_s_search_path);
4877+ return -EDQUOT;
4878+ }
4879 init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, n_pasted_size);
4880 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
4881 s_paste_balance.key = p_s_key->on_disk_key;
4882 #endif
4883-
4884- while ( (retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) == REPEAT_SEARCH ) {
4885+
4886+ /* DQUOT_* can schedule, must check before the fix_nodes */
4887+ if (fs_changed(fs_gen, inode->i_sb)) {
4888+ goto search_again;
4889+ }
4890+
4891+ while ((retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) ==
4892+ REPEAT_SEARCH ) {
4893+search_again:
4894 /* file system changed while we were in the fix_nodes */
4895 PROC_INFO_INC( th -> t_super, paste_into_item_restarted );
4896 retval = search_for_position_by_key (th->t_super, p_s_key, p_s_search_path);
4897@@ -1862,6 +1982,10 @@
4898 error_out:
4899 /* this also releases the path */
4900 unfix_nodes(&s_paste_balance);
4901+#ifdef REISERQUOTA_DEBUG
4902+ printk(KERN_DEBUG "reiserquota paste_into_item(): freeing %u id=%u type=%c\n", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key)));
4903+#endif
4904+ DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size);
4905 return retval ;
4906 }
4907
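reiserfs_paste_into_item now brackets the tree operation with a quota reservation: charge before touching the tree, bail out with -EDQUOT if the charge fails, and give the reservation back on the error path. A compact model of that bracketing, assuming POSIX EDQUOT and stand-in names for the DQUOT_* macros:

#include <errno.h>
#include <stdio.h>

static long quota_used;
static const long quota_limit = 1 << 20;

static int quota_alloc(long bytes)
{
    if (quota_used + bytes > quota_limit)
        return -EDQUOT;
    quota_used += bytes;
    return 0;
}

static void quota_free(long bytes) { quota_used -= bytes; }

static int paste_bytes(long n, int (*tree_op)(long))
{
    int err = quota_alloc(n);
    if (err)
        return err;          /* mirrors pathrelse() + return -EDQUOT */
    err = tree_op(n);
    if (err)
        quota_free(n);       /* mirrors the error_out: rollback */
    return err;
}

static int failing_op(long n) { (void)n; return -1; }

int main(void)
{
    paste_bytes(4096, failing_op);
    printf("quota used after failed paste: %ld\n", quota_used);  /* 0 */
    return 0;
}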
4908@@ -1871,23 +1995,45 @@
4909 struct path * p_s_path, /* Path to the inserted item. */
4910 const struct cpu_key * key,
4911 struct item_head * p_s_ih, /* Pointer to the item header to insert.*/
4912+ struct inode * inode,
4913 const char * p_c_body) /* Pointer to the bytes to insert. */
4914 {
4915 struct tree_balance s_ins_balance;
4916 int retval;
4917+ int fs_gen = 0 ;
4918+ int quota_bytes = 0 ;
4919
4920+ if (inode) { /* Do we count quotas for item? */
4921+ fs_gen = get_generation(inode->i_sb);
4922+ quota_bytes = ih_item_len(p_s_ih);
4923+
4924+ /* hack so the quota code doesn't have to guess if the file has
4925+ ** a tail, links are always tails, so there's no guessing needed
4926+ */
4927+ if (!S_ISLNK (inode->i_mode) && is_direct_le_ih(p_s_ih)) {
4928+ quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE ;
4929+ }
4930+#ifdef REISERQUOTA_DEBUG
4931+ printk(KERN_DEBUG "reiserquota insert_item(): allocating %u id=%u type=%c\n", quota_bytes, inode->i_uid, head2type(p_s_ih));
4932+#endif
4933+ /* We can't dirty inode here. It would be immediately written but
4934+ * appropriate stat item isn't inserted yet... */
4935+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) {
4936+ pathrelse(p_s_path);
4937+ return -EDQUOT;
4938+ }
4939+ }
4940 init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, IH_SIZE + ih_item_len(p_s_ih));
4941 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
4942 s_ins_balance.key = key->on_disk_key;
4943 #endif
4944-
4945- /*
4946- if (p_c_body == 0)
4947- n_zeros_num = ih_item_len(p_s_ih);
4948- */
4949- // le_key2cpu_key (&key, &(p_s_ih->ih_key));
4950+ /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */
4951+ if (inode && fs_changed(fs_gen, inode->i_sb)) {
4952+ goto search_again;
4953+ }
4954
4955 while ( (retval = fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, p_c_body)) == REPEAT_SEARCH) {
4956+search_again:
4957 /* file system changed while we were in the fix_nodes */
4958 PROC_INFO_INC( th -> t_super, insert_item_restarted );
4959 retval = search_item (th->t_super, key, p_s_path);
4960@@ -1902,7 +2048,7 @@
4961 goto error_out;
4962 }
4963 }
4964-
4965+
4966 /* make balancing after all resources will be collected at a time */
4967 if ( retval == CARRY_ON ) {
4968 do_balance (&s_ins_balance, p_s_ih, p_c_body, M_INSERT);
4969@@ -1913,6 +2059,11 @@
4970 error_out:
4971 /* also releases the path */
4972 unfix_nodes(&s_ins_balance);
4973+#ifdef REISERQUOTA_DEBUG
4974+ printk(KERN_DEBUG "reiserquota insert_item(): freeing %u id=%u type=%c\n", quota_bytes, inode->i_uid, head2type(p_s_ih));
4975+#endif
4976+ if (inode)
4977+ DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes) ;
4978 return retval;
4979 }
4980
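Both the paste and insert paths above rely on the same retry idiom: DQUOT_* calls can sleep, so the filesystem generation is sampled first and, if it moved, control jumps into the fix_nodes retry loop to redo the search. A skeleton of that control flow, with stubs simulating one forced retry:

#include <stdio.h>

static int generation = 1;  /* bumped by concurrent tree changes */

static int retries;
static int fix_nodes(void) { return 0; }           /* 0: CARRY_ON */
static int research(void) { retries++; return 0; } /* redo the search */

static int insert_with_retry(int saved_gen)
{
    if (saved_gen != generation)   /* fs_changed(): cached path may be stale */
        goto search_again;

    while (fix_nodes() != 0) {
search_again:
        if (research() < 0)
            return -1;             /* path could not be re-found */
    }
    return 0;
}

int main(void)
{
    int gen = generation;
    generation++;                  /* something slept and the tree moved */
    insert_with_retry(gen);
    printf("searches redone: %d\n", retries);  /* prints 1 */
    return 0;
}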
4981diff -urN linux-2.4.22.org/fs/reiserfs/super.c linux-2.4.22/fs/reiserfs/super.c
4982--- linux-2.4.22.org/fs/reiserfs/super.c 2003-11-21 15:08:29.000000000 +0100
4983+++ linux-2.4.22/fs/reiserfs/super.c 2003-11-21 15:14:25.000000000 +0100
4984@@ -13,6 +13,9 @@
4985 #include <linux/locks.h>
4986 #include <linux/init.h>
4987
4988+EXPORT_SYMBOL(journal_begin) ;
4989+EXPORT_SYMBOL(journal_end) ;
4990+
4991 #define REISERFS_OLD_BLOCKSIZE 4096
4992 #define REISERFS_SUPER_MAGIC_STRING_OFFSET_NJ 20
4993
4994@@ -50,22 +53,28 @@
4995 static int reiserfs_remount (struct super_block * s, int * flags, char * data);
4996 static int reiserfs_statfs (struct super_block * s, struct statfs * buf);
4997
4998-static void reiserfs_write_super (struct super_block * s)
4999+static int reiserfs_sync_fs (struct super_block * s)
5000 {
5001+ struct reiserfs_transaction_handle th;
5002+ lock_kernel() ;
5003+ if (!(s->s_flags & MS_RDONLY)) {
5004+ journal_begin(&th, s, 1);
5005+ journal_end_sync(&th, s, 1);
5006+ s->s_dirt = 0;
5007+ }
5008+ unlock_kernel() ;
5009+ return 0;
5010+}
5011
5012- int dirty = 0 ;
5013- lock_kernel() ;
5014- if (!(s->s_flags & MS_RDONLY)) {
5015- dirty = flush_old_commits(s, 1) ;
5016- }
5017- s->s_dirt = dirty;
5018- unlock_kernel() ;
5019+static void reiserfs_write_super (struct super_block * s)
5020+{
5021+ reiserfs_sync_fs(s);
5022 }
5023
5024+
5025 static void reiserfs_write_super_lockfs (struct super_block * s)
5026 {
5027
5028- int dirty = 0 ;
5029 struct reiserfs_transaction_handle th ;
5030 lock_kernel() ;
5031 if (!(s->s_flags & MS_RDONLY)) {
5032@@ -75,7 +84,7 @@
5033 reiserfs_block_writes(&th) ;
5034 journal_end(&th, s, 1) ;
5035 }
5036- s->s_dirt = dirty;
5037+ s->s_dirt = 0;
5038 unlock_kernel() ;
5039 }
5040
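The rewritten write_super path reduces to one idea: an empty transaction committed synchronously is enough to push all earlier journal state to disk. In outline (journal_* here are stubs, not the kernel calls):

#include <stdio.h>

struct handle { int blocks; };

static void journal_begin(struct handle *th, int nblocks) { th->blocks = nblocks; }

static void journal_end_sync(struct handle *th)
{
    printf("synchronous commit, %d reserved block(s)\n", th->blocks);
}

static int sync_fs(int read_only)
{
    struct handle th;

    if (!read_only) {
        journal_begin(&th, 1);    /* nothing to log, just join the journal */
        journal_end_sync(&th);    /* ...and wait for the commit to hit disk */
    }
    return 0;                     /* s_dirt is cleared in the real code */
}

int main(void) { return sync_fs(0); }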
5041@@ -100,7 +109,7 @@
5042 /* we are going to do one balancing */
5043 journal_begin (&th, s, JOURNAL_PER_BALANCE_CNT);
5044
5045- reiserfs_delete_solid_item (&th, key);
5046+ reiserfs_delete_solid_item (&th, NULL, key);
5047 if (oid_free)
5048 /* removals are protected by direct items */
5049 reiserfs_release_objectid (&th, le32_to_cpu (key->k_objectid));
5050@@ -286,8 +295,8 @@
5051 /* body of "save" link */
5052 link = INODE_PKEY (inode)->k_dir_id;
5053
5054- /* put "save" link inot tree */
5055- retval = reiserfs_insert_item (th, &path, &key, &ih, (char *)&link);
5056+ /* put "save" link into tree, don't charge quota to anyone */
5057+ retval = reiserfs_insert_item (th, &path, &key, &ih, NULL, (char *)&link);
5058 if (retval) {
5059 if (retval != -ENOSPC)
5060 reiserfs_warning (inode->i_sb, "vs-2120: add_save_link: insert_item returned %d\n",
5061@@ -329,7 +338,8 @@
5062 ( inode -> u.reiserfs_i.i_flags & i_link_saved_truncate_mask ) ) ||
5063 ( !truncate &&
5064 ( inode -> u.reiserfs_i.i_flags & i_link_saved_unlink_mask ) ) )
5065- reiserfs_delete_solid_item (&th, &key);
5066+ /* don't take quota bytes from anywhere */
5067+ reiserfs_delete_solid_item (&th, NULL, &key);
5068 if (!truncate) {
5069 reiserfs_release_objectid (&th, inode->i_ino);
5070 inode -> u.reiserfs_i.i_flags &= ~i_link_saved_unlink_mask;
5071@@ -357,6 +367,7 @@
5072 ** to do a journal_end
5073 */
5074 journal_release(&th, s) ;
5075+ s->s_dirt = 0;
5076
5077 for (i = 0; i < SB_BMAP_NR (s); i ++)
5078 brelse (SB_AP_BITMAP (s)[i].bh);
5079@@ -418,6 +429,7 @@
5080 put_super: reiserfs_put_super,
5081 write_super: reiserfs_write_super,
5082 write_super_lockfs: reiserfs_write_super_lockfs,
5083+ sync_fs: reiserfs_sync_fs,
5084 unlockfs: reiserfs_unlockfs,
5085 statfs: reiserfs_statfs,
5086 remount_fs: reiserfs_remount,
5087@@ -463,6 +475,14 @@
5088 {NULL, 0, 0}
5089 };
5090
5091+/* possible values for -o data= */
5092+static const arg_desc_t logging_mode[] = {
5093+ {"ordered", 1<<REISERFS_DATA_ORDERED, (1<<REISERFS_DATA_LOG|1<<REISERFS_DATA_WRITEBACK)},
5094+ {"journal", 1<<REISERFS_DATA_LOG, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_WRITEBACK)},
5095+ {"writeback", 1<<REISERFS_DATA_WRITEBACK, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_LOG)},
5096+ {NULL, 0}
5097+};
5098+
5099
5100 /* possible values for "-o block-allocator=" and bits which are to be set in
5101 s_mount_opt of reiserfs specific part of in-core super block */
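Each entry in the new logging_mode[] table pairs a data= value with the bit it sets and the competing bits it excludes, keeping the three modes mutually exclusive. A self-contained model of that table-driven parse (field names only loosely mirror the patch):

#include <stdio.h>
#include <string.h>

struct arg_desc {
    const char *value;
    unsigned set;
    unsigned conflict;
};

enum { DATA_ORDERED = 1 << 0, DATA_LOG = 1 << 1, DATA_WRITEBACK = 1 << 2 };

static const struct arg_desc logging_mode[] = {
    { "ordered",   DATA_ORDERED,   DATA_LOG | DATA_WRITEBACK },
    { "journal",   DATA_LOG,       DATA_ORDERED | DATA_WRITEBACK },
    { "writeback", DATA_WRITEBACK, DATA_ORDERED | DATA_LOG },
    { NULL, 0, 0 }
};

static int apply_data_mode(unsigned *opts, const char *value)
{
    const struct arg_desc *d;

    for (d = logging_mode; d->value; d++) {
        if (strcmp(d->value, value) == 0) {
            *opts &= ~d->conflict;  /* clear the competing mode bits */
            *opts |= d->set;
            return 0;
        }
    }
    return -1;                      /* unknown data= value */
}

int main(void)
{
    unsigned opts = 0;
    apply_data_mode(&opts, "journal");
    apply_data_mode(&opts, "ordered");  /* later option wins */
    printf("opts=%#x\n", opts);         /* prints opts=0x1 */
    return 0;
}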
5102@@ -612,10 +632,14 @@
5103
5104 {"block-allocator", 'a', balloc, 0, 0},
5105 {"hash", 'h', hash, 1<<FORCE_HASH_DETECT, 0},
5106+ {"data", 'd', logging_mode, 0, 0},
5107
5108 {"resize", 'r', 0, 0, 0},
5109 {"attrs", 0, 0, 1<<REISERFS_ATTRS, 0},
5110 {"noattrs", 0, 0, 0, 1<<REISERFS_ATTRS},
5111+ {"usrquota", 0, 0, 0, 0},
5112+ {"grpquota", 0, 0, 0, 0},
5113+
5114 {NULL, 0, 0, 0, 0}
5115 };
5116
5117@@ -672,6 +696,47 @@
5118 }
5119 }
5120
5121+static void switch_data_mode(struct super_block *s, unsigned long mode) {
5122+ struct reiserfs_transaction_handle th;
5123+ int sync_all = !reiserfs_data_log(s);
5124+
5125+ journal_begin(&th, s, 1);
5126+ SB_JOURNAL(s)->j_must_wait = 1;
5127+ journal_end_sync(&th, s, 1);
5128+
5129+ s->u.reiserfs_sb.s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
5130+ (1 << REISERFS_DATA_ORDERED) |
5131+ (1 << REISERFS_DATA_WRITEBACK));
5132+ s->u.reiserfs_sb.s_mount_opt |= (1 << mode);
5133+
5134+ journal_begin(&th, s, 1);
5135+ SB_JOURNAL(s)->j_must_wait = 1;
5136+ journal_end_sync(&th, s, 1);
5137+
5138+ if (sync_all)
5139+ fsync_no_super(s->s_dev);
5140+}
5141+
5142+static void handle_data_mode(struct super_block *s, unsigned long mount_options)
5143+{
5144+ if (mount_options & (1 << REISERFS_DATA_LOG)) {
5145+ if (!reiserfs_data_log(s)) {
5146+ switch_data_mode(s, REISERFS_DATA_LOG);
5147+ printk("reiserfs: switching to journaled data mode\n");
5148+ }
5149+ } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
5150+ if (!reiserfs_data_ordered(s)) {
5151+ switch_data_mode(s, REISERFS_DATA_ORDERED);
5152+ printk("reiserfs: switching to ordered data mode\n");
5153+ }
5154+ } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
5155+ if (!reiserfs_data_writeback(s)) {
5156+ switch_data_mode(s, REISERFS_DATA_WRITEBACK);
5157+ printk("reiserfs: switching to writeback data mode\n");
5158+ }
5159+ }
5160+}
5161+
5162 static int reiserfs_remount (struct super_block * s, int * mount_flags, char * data)
5163 {
5164 struct reiserfs_super_block * rs;
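switch_data_mode above puts a synchronous barrier commit on both sides of flipping the mount-option bits, so no transaction spans two data-journaling modes, and flushes remaining data blocks when leaving a non-journaled-data mode. Roughly, with stand-in names:

#include <stdio.h>

enum { DATA_LOG = 1, DATA_ORDERED = 2, DATA_WRITEBACK = 4 };

static void sync_commit(const char *why) { printf("barrier: %s\n", why); }

static void switch_data_mode(unsigned *opts, unsigned mode)
{
    int was_log = (*opts & DATA_LOG) != 0;

    sync_commit("drain transactions begun in the old mode");
    *opts &= ~(DATA_LOG | DATA_ORDERED | DATA_WRITEBACK);
    *opts |= mode;
    sync_commit("start clean in the new mode");

    if (!was_log)   /* old mode never logged data: flush it now */
        printf("fsync remaining data blocks\n");
}

int main(void)
{
    unsigned opts = DATA_ORDERED;
    switch_data_mode(&opts, DATA_LOG);
    return 0;
}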
5165@@ -723,9 +788,10 @@
5166 s->s_dirt = 0;
5167 } else {
5168 /* remount read-write */
5169- if (!(s->s_flags & MS_RDONLY))
5170+ if (!(s->s_flags & MS_RDONLY)) {
5171+ handle_data_mode(s, mount_options);
5172 return 0; /* We are read-write already */
5173-
5174+ }
5175 s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */
5176 journal_begin(&th, s, 10) ;
5177
5178@@ -743,9 +809,10 @@
5179 SB_JOURNAL(s)->j_must_wait = 1 ;
5180 journal_end(&th, s, 10) ;
5181
5182- if (!( *mount_flags & MS_RDONLY ) )
5183+ if (!( *mount_flags & MS_RDONLY ) ) {
5184 finish_unfinished( s );
5185-
5186+ handle_data_mode(s, mount_options);
5187+ }
5188 return 0;
5189 }
5190
5191@@ -1172,9 +1239,6 @@
5192
5193 if (reiserfs_parse_options (s, (char *) data, &(s->u.reiserfs_sb.s_mount_opt), &blocks) == 0) {
5194 return NULL;
5195-
5196-
5197-
5198 }
5199
5200 if (blocks) {
5201@@ -1222,9 +1286,22 @@
5202 printk("reiserfs:warning: - it is slow mode for debugging.\n");
5203 #endif
5204
5205- /* fixme */
5206- jdev_name = NULL;
5207+ /* make data=ordered the default */
5208+ if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
5209+ !reiserfs_data_writeback(s))
5210+ {
5211+ s->u.reiserfs_sb.s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
5212+ }
5213+
5214+ if (reiserfs_data_log(s)) {
5215+ printk("reiserfs: using journaled data mode\n");
5216+ } else if (reiserfs_data_ordered(s)) {
5217+ printk("reiserfs: using ordered data mode\n");
5218+ } else {
5219+ printk("reiserfs: using writeback data mode\n");
5220+ }
5221
5222+ jdev_name = NULL;
5223 if( journal_init(s, jdev_name, old_format) ) {
5224 reiserfs_warning(s, "sh-2022: reiserfs_read_super: unable to initialize journal space\n") ;
5225 goto error ;
5226@@ -1364,16 +1441,19 @@
5227
5228 static int __init init_reiserfs_fs (void)
5229 {
5230+ int ret;
5231 reiserfs_proc_info_global_init();
5232 reiserfs_proc_register_global( "version",
5233 reiserfs_global_version_in_proc );
5234+ ret = reiserfs_journal_cache_init();
5235+ if (ret)
5236+ return ret;
5237 return register_filesystem(&reiserfs_fs_type);
5238 }
5239
5240 MODULE_DESCRIPTION("ReiserFS journaled filesystem");
5241 MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>");
5242 MODULE_LICENSE("GPL");
5243-EXPORT_NO_SYMBOLS;
5244
5245 static void __exit exit_reiserfs_fs(void)
5246 {
5247diff -urN linux-2.4.22.org/fs/reiserfs/tail_conversion.c linux-2.4.22/fs/reiserfs/tail_conversion.c
5248--- linux-2.4.22.org/fs/reiserfs/tail_conversion.c 2003-11-21 15:08:29.000000000 +0100
5249+++ linux-2.4.22/fs/reiserfs/tail_conversion.c 2003-11-21 15:14:25.000000000 +0100
5250@@ -66,11 +66,11 @@
5251 set_ih_free_space (&ind_ih, 0); /* delete at nearest future */
5252 put_ih_item_len( &ind_ih, UNFM_P_SIZE );
5253 PATH_LAST_POSITION (path)++;
5254- n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih,
5255+ n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih, inode,
5256 (char *)&unfm_ptr);
5257 } else {
5258 /* Paste into last indirect item of an object. */
5259- n_retval = reiserfs_paste_into_item(th, path, &end_key,
5260+ n_retval = reiserfs_paste_into_item(th, path, &end_key, inode,
5261 (char *)&unfm_ptr, UNFM_P_SIZE);
5262 }
5263 if ( n_retval ) {
5264@@ -152,39 +152,6 @@
5265 }
5266 }
5267
5268-static void
5269-unmap_buffers(struct page *page, loff_t pos) {
5270- struct buffer_head *bh ;
5271- struct buffer_head *head ;
5272- struct buffer_head *next ;
5273- unsigned long tail_index ;
5274- unsigned long cur_index ;
5275-
5276- if (page) {
5277- if (page->buffers) {
5278- tail_index = pos & (PAGE_CACHE_SIZE - 1) ;
5279- cur_index = 0 ;
5280- head = page->buffers ;
5281- bh = head ;
5282- do {
5283- next = bh->b_this_page ;
5284-
5285- /* we want to unmap the buffers that contain the tail, and
5286- ** all the buffers after it (since the tail must be at the
5287- ** end of the file). We don't want to unmap file data
5288- ** before the tail, since it might be dirty and waiting to
5289- ** reach disk
5290- */
5291- cur_index += bh->b_size ;
5292- if (cur_index > tail_index) {
5293- reiserfs_unmap_buffer(bh) ;
5294- }
5295- bh = next ;
5296- } while (bh != head) ;
5297- }
5298- }
5299-}
5300-
5301 /* this first locks inode (neither reads nor sync are permitted),
5302 reads tail through page cache, insert direct item. When direct item
5303 inserted successfully inode is left locked. Return value is always
5304@@ -261,7 +228,7 @@
5305 set_cpu_key_k_type (&key, TYPE_DIRECT);
5306 key.key_length = 4;
5307 /* Insert tail as new direct item in the tree */
5308- if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih,
5309+ if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode,
5310 tail ? tail : NULL) < 0 ) {
5311 /* No disk memory. So we can not convert last unformatted node
5312 to the direct item. In this case we used to adjust
5313@@ -274,10 +241,8 @@
5314 }
5315 kunmap(page) ;
5316
5317- /* this will invalidate all the buffers in the page after
5318- ** pos1
5319- */
5320- unmap_buffers(page, pos1) ;
5321+ /* make sure to get the i_blocks changes from reiserfs_insert_item */
5322+ reiserfs_update_sd(th, p_s_inode);
5323
5324 // note: we have now the same as in above direct2indirect
5325 // conversion: there are two keys which have matching first three
5326@@ -285,7 +250,6 @@
5327
5328 /* We have inserted new direct item and must remove last
5329 unformatted node. */
5330- p_s_inode->i_blocks += (p_s_sb->s_blocksize / 512);
5331 *p_c_mode = M_CUT;
5332
5333 /* we store position of first direct item in the in-core inode */
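The unmap_buffers loop deleted here (its caller moved into reiserfs_cut_from_item earlier in the patch, so the definition presumably lives elsewhere now) walks the page's circular ring of buffer heads and unmaps every buffer at or past the tail, leaving possibly-dirty file data before it untouched. A userspace rendition of that walk:

#include <stdio.h>

struct buffer {
    struct buffer *next;   /* b_this_page: circular */
    unsigned size;         /* b_size */
    int mapped;
};

static void unmap_past_tail(struct buffer *head, unsigned long tail_index)
{
    struct buffer *bh = head;
    unsigned long cur = 0;

    do {
        struct buffer *next = bh->next;
        cur += bh->size;
        if (cur > tail_index)    /* overlaps or follows the tail: unmap */
            bh->mapped = 0;
        bh = next;
    } while (bh != head);
}

int main(void)
{
    struct buffer b[4];
    for (int i = 0; i < 4; i++)
        b[i] = (struct buffer){ &b[(i + 1) % 4], 1024, 1 };

    unmap_past_tail(&b[0], 2048 & (4096 - 1));  /* tail starts in buffer 2 */
    for (int i = 0; i < 4; i++)
        printf("buffer %d mapped=%d\n", i, b[i].mapped);
    return 0;
}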
5334diff -urN linux-2.4.22.org/include/linux/fs.h linux-2.4.22/include/linux/fs.h
5335--- linux-2.4.22.org/include/linux/fs.h 2003-11-21 15:08:34.000000000 +0100
5336+++ linux-2.4.22/include/linux/fs.h 2003-11-21 15:14:25.000000000 +0100
5337@@ -1222,6 +1222,8 @@
5338 return test_and_set_bit(BH_Dirty, &bh->b_state);
5339 }
5340
5341+extern void buffer_insert_list_journal_head(struct buffer_head *bh, struct list_head *list, void *journal_head);
5342+
5343 static inline void mark_buffer_async(struct buffer_head * bh, int on)
5344 {
5345 if (on)
5346@@ -1508,6 +1510,7 @@
5347 /* Generic buffer handling for block filesystems.. */
5348 extern int try_to_release_page(struct page * page, int gfp_mask);
5349 extern int discard_bh_page(struct page *, unsigned long, int);
5350+extern void discard_buffer(struct buffer_head *bh) ;
5351 #define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
5352 #define block_invalidate_page(page) discard_bh_page(page, 0, 0)
5353 extern int block_symlink(struct inode *, const char *, int);
5354diff -urN linux-2.4.22.org/include/linux/reiserfs_fs.h linux-2.4.22/include/linux/reiserfs_fs.h
5355--- linux-2.4.22.org/include/linux/reiserfs_fs.h 2003-11-21 15:08:34.000000000 +0100
5356+++ linux-2.4.22/include/linux/reiserfs_fs.h 2003-11-21 15:14:25.000000000 +0100
5357@@ -266,6 +266,7 @@
5358 #define NO_DISK_SPACE -3
5359 #define NO_BALANCING_NEEDED (-4)
5360 #define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5)
5361+#define QUOTA_EXCEEDED -6
5362
5363 typedef unsigned long b_blocknr_t;
5364 typedef __u32 unp_t;
5365@@ -1329,8 +1330,7 @@
5366 #define fs_generation(s) ((s)->u.reiserfs_sb.s_generation_counter)
5367 #define get_generation(s) atomic_read (&fs_generation(s))
5368 #define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen)
5369-#define fs_changed(gen,s) (gen != get_generation (s))
5370-
5371+#define fs_changed(gen,s) (gen != get_generation(s))
5372
5373 /***************************************************************************/
5374 /* FIXATE NODES */
5375@@ -1653,6 +1653,86 @@
5376 /* 12 */ struct journal_params jh_journal;
5377 } ;
5378
5379+static inline int
5380+reiserfs_file_data_log(struct inode *inode) {
5381+ if (reiserfs_data_log(inode->i_sb) ||
5382+ (inode->u.reiserfs_i.i_flags & i_data_log))
5383+ {
5384+ return 1 ;
5385+ }
5386+ return 0 ;
5387+}
5388+
5389+/* flags for the nested transaction handle */
5390+#define REISERFS_PERSISTENT_HANDLE 1
5391+#define REISERFS_ACTIVE_HANDLE 2
5392+#define REISERFS_CLOSE_NESTED 4
5393+#define REISERFS_DANGLING_HANDLE 8
5394+/*
5395+** transaction handle which is passed around for all journal calls
5396+*/
5397+struct reiserfs_transaction_handle {
5398+ struct super_block *t_super ; /* super for this FS when journal_begin was
5399+ called. saves calls to reiserfs_get_super
5400+ also used by nested transactions to make
5401+ sure they are nesting on the right FS
5402+ _must_ be first in the handle
5403+ */
5404+ int t_refcount;
5405+ int t_blocks_logged ; /* number of blocks this writer has logged */
5406+ int t_blocks_allocated ; /* number of blocks this writer allocated */
5407+ unsigned long t_trans_id ; /* sanity check, equals the current trans id */
5408+ int t_flags ;
5409+ void *t_handle_save ; /* save existing current->journal_info */
5410+ int displace_new_blocks:1; /* if new block allocation occurs, that
5411+ block should be displaced from others */
5412+} ;
5413+
5414+static inline int
5415+reiserfs_dangling_handle(struct reiserfs_transaction_handle *th) {
5416+ return (th && (th->t_flags & REISERFS_DANGLING_HANDLE)) ;
5417+}
5418+
5419+static inline void
5420+reiserfs_set_handle_dangling(struct reiserfs_transaction_handle *th) {
5421+ th->t_flags |= REISERFS_DANGLING_HANDLE ;
5422+}
5423+
5424+static inline void
5425+reiserfs_clear_handle_dangling(struct reiserfs_transaction_handle *th) {
5426+ th->t_flags &= ~REISERFS_DANGLING_HANDLE ;
5427+}
5428+
5429+static inline int
5430+reiserfs_persistent_handle(struct reiserfs_transaction_handle *th) {
5431+ return (th && (th->t_flags & REISERFS_PERSISTENT_HANDLE)) ;
5432+}
5433+
5434+static inline void
5435+reiserfs_set_handle_persistent(struct reiserfs_transaction_handle *th) {
5436+ th->t_flags |= REISERFS_PERSISTENT_HANDLE ;
5437+}
5438+
5439+static inline int
5440+reiserfs_active_handle(struct reiserfs_transaction_handle *th) {
5441+ return (th && (th->t_flags & REISERFS_ACTIVE_HANDLE)) ;
5442+}
5443+
5444+static inline void
5445+reiserfs_set_handle_active(struct reiserfs_transaction_handle *th) {
5446+ th->t_flags |= REISERFS_ACTIVE_HANDLE ;
5447+}
5448+
5449+static inline int
5450+reiserfs_restartable_handle(struct reiserfs_transaction_handle *th) {
5451+ return (th && (th->t_flags & REISERFS_CLOSE_NESTED)) ;
5452+}
5453+
5454+static inline void
5455+reiserfs_set_handle_restartable(struct reiserfs_transaction_handle *th) {
5456+ th->t_flags |= REISERFS_CLOSE_NESTED ;
5457+}
5458+
5459 extern task_queue reiserfs_commit_thread_tq ;
5460 extern wait_queue_head_t reiserfs_commit_thread_wait ;
5461
5462@@ -1693,6 +1773,8 @@
5463 */
5464 #define JOURNAL_BUFFER(j,n) ((j)->j_ap_blocks[((j)->j_start + (n)) % JOURNAL_BLOCK_COUNT])
5465
5466+int reiserfs_journal_cache_init(void);
5467+int reiserfs_flush_old_commits(struct super_block *);
5468 void reiserfs_commit_for_inode(struct inode *) ;
5469 void reiserfs_commit_for_tail(struct inode *) ;
5470 void reiserfs_update_inode_transaction(struct inode *) ;
5471@@ -1701,6 +1783,18 @@
5472 void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ;
5473 void reiserfs_allow_writes(struct super_block *s) ;
5474 void reiserfs_check_lock_depth(char *caller) ;
5475+int journal_mark_dirty(struct reiserfs_transaction_handle *,
5476+ struct super_block *, struct buffer_head *bh) ;
5477+
5478+static inline int reiserfs_transaction_running(struct super_block *s) {
5479+ struct reiserfs_transaction_handle *th = current->journal_info ;
5480+ if (th && th->t_super == s)
5481+ return 1 ;
5482+ if (th && th->t_super == NULL)
5483+ BUG();
5484+ return 0 ;
5485+}
5486+
5487 void reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
5488 void reiserfs_restore_prepared_buffer(struct super_block *, struct buffer_head *bh) ;
5489 struct buffer_head * journal_bread (struct super_block *s, int block);
5490@@ -1716,8 +1810,14 @@
5491 int push_journal_writer(char *w) ;
5492 int pop_journal_writer(int windex) ;
5493 int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
5494+int reiserfs_restart_transaction(struct reiserfs_transaction_handle *, int) ;
5495 int reiserfs_in_journal(struct super_block *p_s_sb, kdev_t dev, int bmap_nr, int bit_nr, int size, int searchall, unsigned int *next) ;
5496 int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ;
5497+
5498+/* allocates a transaction handle, and starts a new transaction it */
5499+struct reiserfs_transaction_handle *
5500+reiserfs_persistent_transaction(struct super_block *p_s_sb, unsigned long) ;
5501+
5502 struct super_block *reiserfs_get_super(kdev_t dev) ;
5503 void flush_async_commits(struct super_block *p_s_sb) ;
5504
5505@@ -1833,11 +1933,13 @@
5506 int reiserfs_insert_item (struct reiserfs_transaction_handle *th,
5507 struct path * path,
5508 const struct cpu_key * key,
5509- struct item_head * ih, const char * body);
5510+ struct item_head * ih,
5511+ struct inode *inode, const char * body);
5512
5513 int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th,
5514 struct path * path,
5515 const struct cpu_key * key,
5516+ struct inode *inode,
5517 const char * body, int paste_size);
5518
5519 int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th,
5520@@ -1854,7 +1956,7 @@
5521 struct buffer_head * p_s_un_bh);
5522
5523 void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th,
5524- struct key * key);
5525+ struct inode *inode, struct key * key);
5526 void reiserfs_delete_object (struct reiserfs_transaction_handle *th, struct inode * p_s_inode);
5527 void reiserfs_do_truncate (struct reiserfs_transaction_handle *th,
5528 struct inode * p_s_inode, struct page *,
5529@@ -1895,8 +1997,18 @@
5530 int i_size,
5531 struct dentry *dentry,
5532 struct inode *inode);
5533-int reiserfs_sync_inode (struct reiserfs_transaction_handle *th, struct inode * inode);
5534-void reiserfs_update_sd (struct reiserfs_transaction_handle *th, struct inode * inode);
5535+
5536+int reiserfs_sync_inode (struct reiserfs_transaction_handle *th,
5537+ struct inode * inode);
5538+
5539+void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th,
5540+ struct inode * inode, loff_t size);
5541+
5542+static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
5543+ struct inode *inode)
5544+{
5545+ reiserfs_update_sd_size(th, inode, inode->i_size) ;
5546+}
5547
5548 void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode );
5549 void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs );
5550@@ -1981,7 +2093,7 @@
5551 extern struct inode_operations reiserfs_file_inode_operations;
5552 extern struct file_operations reiserfs_file_operations;
5553 extern struct address_space_operations reiserfs_address_space_operations ;
5554-int get_new_buffer (struct reiserfs_transaction_handle *th, struct buffer_head *,
5555+int get_new_buffer (struct reiserfs_transaction_handle *th, struct inode *, struct buffer_head *,
5556 struct buffer_head **, struct path *);
5557
5558
5559@@ -2095,7 +2207,7 @@
5560
5561 int reiserfs_parse_alloc_options (struct super_block *, char *);
5562 int is_reusable (struct super_block * s, unsigned long block, int bit_value);
5563-void reiserfs_free_block (struct reiserfs_transaction_handle *th, unsigned long);
5564+void reiserfs_free_block (struct reiserfs_transaction_handle *th, struct inode *inode, unsigned long, int);
5565 int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t * , int, int);
5566 extern inline int reiserfs_new_form_blocknrs (struct tree_balance * tb,
5567 b_blocknr_t *new_blocknrs, int amount_needed)
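The relocated transaction handle gains t_refcount, t_handle_save, and per-handle flags, which together point at nested transactions: an inner journal_begin on the same filesystem can ride the outer handle, and only the outermost end commits. One plausible model of that refcounting (not the patch's actual journal code):

#include <assert.h>
#include <stdio.h>

struct handle { int refcount; };

static struct handle *current_handle;   /* models current->journal_info */

static struct handle *begin(struct handle *h)
{
    if (current_handle) {               /* nest on the running transaction */
        current_handle->refcount++;
        return current_handle;
    }
    h->refcount = 1;
    current_handle = h;
    return h;
}

static int end(struct handle *h)
{
    assert(h == current_handle && h->refcount > 0);
    if (--h->refcount)
        return 0;                       /* inner end: nothing to do yet */
    current_handle = NULL;
    return 1;                           /* outermost end: commit here */
}

int main(void)
{
    struct handle th;
    struct handle *h = begin(&th);
    begin(&th);                         /* nested begin */
    printf("inner end commits? %d\n", end(h));   /* 0 */
    printf("outer end commits? %d\n", end(h));   /* 1 */
    return 0;
}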
5568diff -urN linux-2.4.22.org/include/linux/reiserfs_fs_i.h linux-2.4.22/include/linux/reiserfs_fs_i.h
5569--- linux-2.4.22.org/include/linux/reiserfs_fs_i.h 2003-11-21 15:08:34.000000000 +0100
5570+++ linux-2.4.22/include/linux/reiserfs_fs_i.h 2003-11-21 15:14:25.000000000 +0100
5571@@ -6,6 +6,8 @@
5572
5573 #include <linux/list.h>
5574
5575+struct reiserfs_journal_list;
5576+
5577 /** bitmasks for i_flags field in reiserfs-specific part of inode */
5578 typedef enum {
5579 /** this says what format of key do all items (but stat data) of
5580@@ -23,7 +25,9 @@
5581 truncate or unlink. Safe link is used to avoid leakage of disk
5582 space on crash with some files open, but unlinked. */
5583 i_link_saved_unlink_mask = 0x0010,
5584- i_link_saved_truncate_mask = 0x0020
5585+ i_link_saved_truncate_mask = 0x0020,
5586+ /** are we logging data blocks for this file? */
5587+ i_data_log = 0x0040,
5588 } reiserfs_inode_flags;
5589
5590
5591@@ -52,14 +56,14 @@
5592 ** needs to be committed in order for this inode to be properly
5593 ** flushed */
5594 unsigned long i_trans_id ;
5595- unsigned long i_trans_index ;
5596+ struct reiserfs_journal_list *i_jl;
5597
5598 /* direct io needs to make sure the tail is on disk to avoid
5599 * buffer alias problems. This records the transaction last
5600 * involved in a direct->indirect conversion for this file
5601 */
5602 unsigned long i_tail_trans_id;
5603- unsigned long i_tail_trans_index;
5604+ struct reiserfs_journal_list *i_tail_jl;
5605 };
5606
5607 #endif
5608diff -urN linux-2.4.22.org/include/linux/reiserfs_fs_sb.h linux-2.4.22/include/linux/reiserfs_fs_sb.h
5609--- linux-2.4.22.org/include/linux/reiserfs_fs_sb.h 2003-11-21 15:08:34.000000000 +0100
5610+++ linux-2.4.22/include/linux/reiserfs_fs_sb.h 2003-11-21 15:14:25.000000000 +0100
5611@@ -120,7 +120,6 @@
5612 #define JOURNAL_MAX_CNODE 1500 /* max cnodes to allocate. */
5613 #define JOURNAL_HASH_SIZE 8192
5614 #define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating. Must be >= 2 */
5615-#define JOURNAL_LIST_COUNT 64
5616
5617 /* these are bh_state bit flag offset numbers, for use in the buffer head */
5618
5619@@ -167,20 +166,27 @@
5620 struct reiserfs_bitmap_node **bitmaps ;
5621 } ;
5622
5623-/*
5624-** transaction handle which is passed around for all journal calls
5625-*/
5626-struct reiserfs_transaction_handle {
5627- /* ifdef it. -Hans */
5628- char *t_caller ; /* debugging use */
5629- int t_blocks_logged ; /* number of blocks this writer has logged */
5630- int t_blocks_allocated ; /* number of blocks this writer allocated */
5631- unsigned long t_trans_id ; /* sanity check, equals the current trans id */
5632- struct super_block *t_super ; /* super for this FS when journal_begin was
5633- called. saves calls to reiserfs_get_super */
5634- int displace_new_blocks:1; /* if new block allocation occurres, that block
5635- should be displaced from others */
5636-} ;
5637+struct reiserfs_journal_list;
5638+
5639+/* so, we're using fsync_buffers_list to do the ordered buffer writes,
5640+ * but we don't want to have a full inode on each buffer list, it is
5641+ * a big waste of space.
5642+ *
5643+ * instead we copy the very head of the inode into a list here, a kludge
5644+ * but much smaller.
5645+ */
5646+struct reiserfs_inode_list {
5647+ struct list_head i_hash;
5648+ struct list_head i_list;
5649+ struct list_head i_dentry;
5650+ struct list_head i_dirty_buffers;
5651+
5652+ /* we could be very smart and do math based on the location
5653+ * of the inode list in the journal list struct.
5654+ * let's do that after this works properly
5655+ */
5656+ struct reiserfs_journal_list *jl;
5657+};
5658
5659 /*
5660 ** one of these for each transaction. The most important part here is the j_realblock.
5661@@ -190,20 +196,32 @@
5662 ** to be overwritten */
5663 struct reiserfs_journal_list {
5664 unsigned long j_start ;
5665+ unsigned long j_state ;
5666 unsigned long j_len ;
5667 atomic_t j_nonzerolen ;
5668 atomic_t j_commit_left ;
5669- atomic_t j_flushing ;
5670- atomic_t j_commit_flushing ;
5671 atomic_t j_older_commits_done ; /* all commits older than this on disk*/
5672+ struct semaphore j_commit_lock ;
5673 unsigned long j_trans_id ;
5674 time_t j_timestamp ;
5675 struct reiserfs_list_bitmap *j_list_bitmap ;
5676 struct buffer_head *j_commit_bh ; /* commit buffer head */
5677 struct reiserfs_journal_cnode *j_realblock ;
5678 struct reiserfs_journal_cnode *j_freedlist ; /* list of buffers that were freed during this trans. free each of these on flush */
5679- wait_queue_head_t j_commit_wait ; /* wait for all the commit blocks to be flushed */
5680- wait_queue_head_t j_flush_wait ; /* wait for all the real blocks to be flushed */
5681+
5682+ /* time ordered list of all the active transactions */
5683+ struct list_head j_list;
5684+
5685+ /* time ordered list of all transactions not touched by kreiserfsd */
5686+ struct list_head j_working_list;
5687+
5688+ /* for data=ordered support */
5689+ struct list_head j_ordered_bh_list;
5690+
5691+ /* sigh, the tails have slightly different rules for flushing, they
5692+ * need their own list
5693+ */
5694+ struct list_head j_tail_bh_list;
5695 } ;
5696
5697 struct reiserfs_page_list ; /* defined in reiserfs_fs.h */
5698@@ -230,16 +248,11 @@
5699 unsigned long j_last_flush_trans_id ; /* last fully flushed journal timestamp */
5700 struct buffer_head *j_header_bh ;
5701
5702- /* j_flush_pages must be flushed before the current transaction can
5703- ** commit
5704- */
5705- struct reiserfs_page_list *j_flush_pages ;
5706 time_t j_trans_start_time ; /* time this transaction started */
5707- wait_queue_head_t j_wait ; /* wait journal_end to finish I/O */
5708- atomic_t j_wlock ; /* lock for j_wait */
5709+ struct semaphore j_lock ;
5710+ struct semaphore j_flush_sem ;
5711 wait_queue_head_t j_join_wait ; /* wait for current transaction to finish before starting new one */
5712 atomic_t j_jlock ; /* lock for j_join_wait */
5713- int j_journal_list_index ; /* journal list number of the current trans */
5714 int j_list_bitmap_index ; /* number of next list bitmap to use */
5715 int j_must_wait ; /* no more journal begins allowed. MUST sleep on j_join_wait */
5716 int j_next_full_flush ; /* next journal_end will flush all journal list */
5717@@ -255,13 +268,28 @@
5718
5719 struct reiserfs_journal_cnode *j_cnode_free_list ;
5720 struct reiserfs_journal_cnode *j_cnode_free_orig ; /* orig pointer returned from vmalloc */
5721+ struct reiserfs_journal_list *j_current_jl;
5722
5723 int j_free_bitmap_nodes ;
5724 int j_used_bitmap_nodes ;
5725+ int j_num_lists; /* total number of active transactions */
5726+ int j_num_work_lists; /* number that need attention from kreiserfsd */
5727+
5728+ /* debugging to make sure things are flushed in order */
5729+ int j_last_flush_id;
5730+
5731+ /* debugging to make sure things are committed in order */
5732+ int j_last_commit_id;
5733+
5734 struct list_head j_bitmap_nodes ;
5735- struct list_head j_dirty_buffers ;
5736+
5737+ /* list of all active transactions */
5738+ struct list_head j_journal_list;
5739+
5740+ /* lists that haven't been touched by kreiserfsd */
5741+ struct list_head j_working_list;
5742+
5743 struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS] ; /* array of bitmaps to record the deleted blocks */
5744- struct reiserfs_journal_list j_journal_list[JOURNAL_LIST_COUNT] ; /* array of all the journal lists */
5745 struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for real buffer heads in current trans */
5746 struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for all the real buffer heads in all
5747 the transactions */
5748@@ -413,6 +441,7 @@
5749 reiserfs_proc_info_data_t s_proc_info_data;
5750 struct proc_dir_entry *procdir;
5751 int reserved_blocks; /* amount of blocks reserved for further allocations */
5752+ struct list_head s_reiserfs_supers;
5753 };
5754
5755 /* Definitions of reiserfs on-disk properties: */
5756@@ -420,11 +449,12 @@
5757 #define REISERFS_3_6 1
5758
5759 /* Mount options */
5760-#define REISERFS_LARGETAIL 0 /* large tails will be created in a session */
5761-#define REISERFS_SMALLTAIL 17 /* small (for files less than block size) tails will be created in a session */
5762-#define REPLAYONLY 3 /* replay journal and return 0. Use by fsck */
5763-#define REISERFS_NOLOG 4 /* -o nolog: turn journalling off */
5764-#define REISERFS_CONVERT 5 /* -o conv: causes conversion of old
5765+enum {
5766+ REISERFS_LARGETAIL, /* large tails will be created in a session */
5767+ REISERFS_SMALLTAIL, /* small (for files less than block size) tails will be created in a session */
5768+ REPLAYONLY, /* replay journal and return 0. Used by fsck */
5769+ REISERFS_NOLOG, /* -o nolog: turn journalling off */
5770+ REISERFS_CONVERT, /* -o conv: causes conversion of old
5771 format super block to the new
5772 format. If not specified - old
5773 partition will be dealt with in a
5774@@ -438,27 +468,25 @@
5775 ** the existing hash on the FS, so if you have a tea hash disk, and mount
5776 ** with -o hash=rupasov, the mount will fail.
5777 */
5778-#define FORCE_TEA_HASH 6 /* try to force tea hash on mount */
5779-#define FORCE_RUPASOV_HASH 7 /* try to force rupasov hash on mount */
5780-#define FORCE_R5_HASH 8 /* try to force rupasov hash on mount */
5781-#define FORCE_HASH_DETECT 9 /* try to detect hash function on mount */
5782+ FORCE_TEA_HASH, /* try to force tea hash on mount */
5783+ FORCE_RUPASOV_HASH, /* try to force rupasov hash on mount */
5784+ FORCE_R5_HASH, /* try to force r5 hash on mount */
5785+ FORCE_HASH_DETECT, /* try to detect hash function on mount */
5786
5787
5788 /* used for testing experimental features, makes benchmarking new
5789 features with and without more convenient, should never be used by
5790 users in any code shipped to users (ideally) */
5791
5792-#define REISERFS_NO_BORDER 11
5793-#define REISERFS_NO_UNHASHED_RELOCATION 12
5794-#define REISERFS_HASHED_RELOCATION 13
5795-#define REISERFS_TEST4 14
5796-
5797-#define REISERFS_TEST1 11
5798-#define REISERFS_TEST2 12
5799-#define REISERFS_TEST3 13
5800-#define REISERFS_TEST4 14
5801-
5802-#define REISERFS_ATTRS (15)
5803+ REISERFS_NO_BORDER,
5804+ REISERFS_NO_UNHASHED_RELOCATION,
5805+ REISERFS_HASHED_RELOCATION,
5806+ REISERFS_DATA_LOG,
5807+ REISERFS_DATA_ORDERED,
5808+ REISERFS_DATA_WRITEBACK,
5809+ REISERFS_ATTRS,
5810+ REISERFS_TEST4,
5811+};
5812
5813 #define reiserfs_r5_hash(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << FORCE_R5_HASH))
5814 #define reiserfs_rupasov_hash(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << FORCE_RUPASOV_HASH))
5815@@ -467,6 +495,9 @@
5816 #define reiserfs_no_border(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_NO_BORDER))
5817 #define reiserfs_no_unhashed_relocation(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION))
5818 #define reiserfs_hashed_relocation(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_HASHED_RELOCATION))
5819+#define reiserfs_data_log(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_DATA_LOG))
5820+#define reiserfs_data_ordered(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_DATA_ORDERED))
5821+#define reiserfs_data_writeback(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
5822 #define reiserfs_test4(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_TEST4))
5823
5824 #define have_large_tails(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_LARGETAIL))
5825@@ -480,8 +511,6 @@
5826
5827 void reiserfs_file_buffer (struct buffer_head * bh, int list);
5828 int reiserfs_is_super(struct super_block *s) ;
5829-int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
5830-int flush_old_commits(struct super_block *s, int) ;
5831 int show_reiserfs_locks(void) ;
5832 int reiserfs_resize(struct super_block *, unsigned long) ;
5833
5834@@ -492,8 +521,6 @@
5835 #define SB_BUFFER_WITH_SB(s) ((s)->u.reiserfs_sb.s_sbh)
5836 #define SB_JOURNAL(s) ((s)->u.reiserfs_sb.s_journal)
5837 #define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
5838-#define SB_JOURNAL_LIST(s) (SB_JOURNAL(s)->j_journal_list)
5839-#define SB_JOURNAL_LIST_INDEX(s) (SB_JOURNAL(s)->j_journal_list_index)
5840 #define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free)
5841 #define SB_AP_BITMAP(s) ((s)->u.reiserfs_sb.s_ap_bitmap)
5842
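The superblock hunks above drop the fixed j_journal_list[JOURNAL_LIST_COUNT] array in favor of dynamically allocated journal lists threaded onto j_journal_list, with j_current_jl and j_num_lists tracking them. In miniature, with a plain pointer chain standing in for list_head:

#include <stdio.h>
#include <stdlib.h>

struct journal_list {
    unsigned long trans_id;
    struct journal_list *next;
};

struct journal {
    struct journal_list *current_jl;  /* j_current_jl */
    struct journal_list *all;         /* j_journal_list, newest first here */
    int num_lists;                    /* j_num_lists */
};

static struct journal_list *new_transaction(struct journal *j, unsigned long id)
{
    struct journal_list *jl = calloc(1, sizeof(*jl));

    if (!jl)
        return NULL;
    jl->trans_id = id;
    jl->next = j->all;                /* no fixed 64-entry limit any more */
    j->all = jl;
    j->current_jl = jl;
    j->num_lists++;
    return jl;
}

int main(void)
{
    struct journal j = { 0 };

    for (unsigned long id = 1; id <= 3; id++)
        new_transaction(&j, id);
    printf("active transactions: %d, current id: %lu\n",
           j.num_lists, j.current_jl->trans_id);
    return 0;
}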
5843diff -urN linux-2.4.22.org/kernel/ksyms.c linux-2.4.22/kernel/ksyms.c
5844--- linux-2.4.22.org/kernel/ksyms.c 2003-11-21 15:08:31.000000000 +0100
5845+++ linux-2.4.22/kernel/ksyms.c 2003-11-21 15:15:21.000000000 +0100
5846@@ -182,6 +182,7 @@
5847 EXPORT_SYMBOL(end_buffer_io_async);
5848 EXPORT_SYMBOL(__mark_buffer_dirty);
5849 EXPORT_SYMBOL(__mark_inode_dirty);
5850+EXPORT_SYMBOL(discard_buffer); /* for FS flushpage funcs */
5851 EXPORT_SYMBOL(fd_install);
5852 EXPORT_SYMBOL(get_empty_filp);
5853 EXPORT_SYMBOL(init_private_file);
5854diff -urN linux-2.4.22.org/mm/filemap.c linux-2.4.22/mm/filemap.c
5855--- linux-2.4.22.org/mm/filemap.c 2003-11-21 15:08:31.000000000 +0100
5856+++ linux-2.4.22/mm/filemap.c 2003-11-21 15:14:25.000000000 +0100
5857@@ -3041,6 +3041,14 @@
5858 }
5859 }
5860
5861+static void update_inode_times(struct inode *inode)
5862+{
5863+ time_t now = CURRENT_TIME;
5864+ if (inode->i_ctime != now || inode->i_mtime != now) {
5865+ inode->i_ctime = inode->i_mtime = now;
5866+ mark_inode_dirty_sync(inode);
5867+ }
5868+}
5869 /*
5870 * precheck_file_write():
5871 * Check the conditions on a file descriptor prior to beginning a write
5872@@ -3302,8 +3310,7 @@
5873 BUG();
5874
5875 remove_suid(inode);
5876- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
5877- mark_inode_dirty_sync(inode);
5878+ update_inode_times(inode);
5879
5880 written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
5881 if (written > 0) {
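The update_inode_times helper added to mm/filemap.c only dirties the inode when the second-granularity timestamps actually change, saving a mark_inode_dirty_sync per write for back-to-back writes within one second. A userspace equivalent:

#include <stdio.h>
#include <time.h>

struct inode { time_t ctime, mtime; int dirty_count; };  /* stand-in struct */

static void update_inode_times(struct inode *inode)
{
    time_t now = time(NULL);

    if (inode->ctime != now || inode->mtime != now) {
        inode->ctime = inode->mtime = now;
        inode->dirty_count++;   /* stands in for mark_inode_dirty_sync() */
    }
}

int main(void)
{
    struct inode i = { 0, 0, 0 };

    update_inode_times(&i);
    update_inode_times(&i);     /* same second: inode not dirtied again */
    printf("dirtied %d time(s)\n", i.dirty_count);  /* prints 1 */
    return 0;
}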