linux-2.4.22-data-loging+quota.patch

   1 diff -urN linux-2.4.22.org/fs/buffer.c linux-2.4.22/fs/buffer.c
   2 --- linux-2.4.22.org/fs/buffer.c        2003-11-21 15:08:24.000000000 +0100
   3 +++ linux-2.4.22/fs/buffer.c    2003-11-21 15:14:23.000000000 +0100
   4 @@ -659,6 +659,20 @@
   5         spin_unlock(&lru_list_lock);
   6  }
   7
   8 +void buffer_insert_list_journal_head(struct buffer_head *bh,
   9 +                                     struct list_head *list,
  10 +                                    void *journal_head)
  11 +{
  12 +       spin_lock(&lru_list_lock);
  13 +       if (buffer_attached(bh))
  14 +               list_del(&bh->b_inode_buffers);
  15 +       set_buffer_attached(bh);
  16 +       list_add(&bh->b_inode_buffers, list);
  17 +       bh->b_journal_head = journal_head;
  18 +       spin_unlock(&lru_list_lock);
  19 +}
  20 +EXPORT_SYMBOL(buffer_insert_list_journal_head);
  21 +
  22  /*
  23   * The caller must have the lru_list lock before calling the
  24   * remove_inode_queue functions.
  25 @@ -1370,7 +1384,7 @@
  26  /*
  27   * Called when truncating a buffer on a page completely.
  28   */
  29 -static void discard_buffer(struct buffer_head * bh)
  30 +void discard_buffer(struct buffer_head * bh)
  31  {
  32         if (buffer_mapped(bh) || buffer_delay(bh)) {
  33                 mark_buffer_clean(bh);
  34 diff -urN linux-2.4.22.org/fs/inode.c linux-2.4.22/fs/inode.c
  35 --- linux-2.4.22.org/fs/inode.c 2003-11-21 15:08:24.000000000 +0100
  36 +++ linux-2.4.22/fs/inode.c     2003-11-21 15:14:23.000000000 +0100
  37 @@ -476,7 +476,7 @@
  38         }
  39  }
  40
  41 -static void try_to_sync_unused_inodes(void * arg)
  42 +static void try_to_sync_unused_inodes(void)
  43  {
  44         struct super_block * sb;
  45         int nr_inodes = inodes_stat.nr_unused;
  46 @@ -495,7 +495,8 @@
  47         spin_unlock(&inode_lock);
  48  }
  49
  50 -static struct tq_struct unused_inodes_flush_task;
  51 +static DECLARE_WAIT_QUEUE_HEAD(kinoded_wait) ;
  52 +static atomic_t kinoded_goal = ATOMIC_INIT(0) ;
  53
  54  /**
  55   *     write_inode_now -       write an inode to disk
  56 @@ -758,7 +759,7 @@
  57          !inode_has_buffers(inode))
  58  #define INODE(entry)   (list_entry(entry, struct inode, i_list))
  59
  60 -void prune_icache(int goal)
  61 +static void _prune_icache(int goal)
  62  {
  63         LIST_HEAD(list);
  64         struct list_head *entry, *freeable = &list;
  65 @@ -792,35 +793,29 @@
  66         spin_unlock(&inode_lock);
  67
  68         dispose_list(freeable);
  69 +       kmem_cache_shrink(inode_cachep);
  70
  71         /*
  72 -        * If we didn't freed enough clean inodes schedule
  73 -        * a sync of the dirty inodes, we cannot do it
  74 -        * from here or we're either synchronously dogslow
  75 -        * or we deadlock with oom.
  76 +        * If we didn't free enough clean inodes
  77 +        * start a sync now
  78          */
  79         if (goal)
  80 -               schedule_task(&unused_inodes_flush_task);
  81 +               try_to_sync_unused_inodes();
  82 +}
  83 +
  84 +void prune_icache(int goal) {
  85 +       atomic_add(goal, &kinoded_goal);
  86 +       if (atomic_read(&kinoded_goal) > 16) {
  87 +               wake_up_interruptible(&kinoded_wait);
  88 +       }
  89  }
  90
  91  int shrink_icache_memory(int priority, int gfp_mask)
  92  {
  93         int count = 0;
  94 -
  95 -       /*
  96 -        * Nasty deadlock avoidance..
  97 -        *
  98 -        * We may hold various FS locks, and we don't
  99 -        * want to recurse into the FS that called us
 100 -        * in clear_inode() and friends..
 101 -        */
 102 -       if (!(gfp_mask & __GFP_FS))
 103 -               return 0;
 104 -
 105         count = inodes_stat.nr_unused / priority;
 106 -
 107         prune_icache(count);
 108 -       return kmem_cache_shrink(inode_cachep);
 109 +       return 0;
 110  }
 111
 112  /*
 113 @@ -1198,6 +1193,34 @@
 114         return res;
 115  }
 116
 117 +int kinoded(void *startup) {
 118 +
 119 +       struct task_struct *tsk = current;
 120 +       int goal ;
 121 +
 122 +       daemonize();
 123 +       strcpy(tsk->comm, "kinoded");
 124 +
 125 +       /* avoid getting signals */
 126 +       spin_lock_irq(&tsk->sigmask_lock);
 127 +       flush_signals(tsk);
 128 +       sigfillset(&tsk->blocked);
 129 +       recalc_sigpending(tsk);
 130 +       spin_unlock_irq(&tsk->sigmask_lock);
 131 +
 132 +       printk("kinoded started\n") ;
 133 +       complete((struct completion *)startup);
 134 +       while(1) {
 135 +               wait_event_interruptible(kinoded_wait,
 136 +                                       atomic_read(&kinoded_goal));
 137 +               while((goal = atomic_read(&kinoded_goal))) {
 138 +                       _prune_icache(goal);
 139 +                       atomic_sub(goal, &kinoded_goal);
 140 +                       cond_resched();
 141 +               }
 142 +       }
 143 +}
 144 +
 145  /*
 146   * Initialize the hash tables.
 147   */
 148 @@ -1249,8 +1272,17 @@
 149                                          NULL);
 150         if (!inode_cachep)
 151                 panic("cannot create inode slab cache");
 152 +}
 153
 154 -       unused_inodes_flush_task.routine = try_to_sync_unused_inodes;
 155 +/* we need to start a thread, and inode_init happens too early for that
 156 +** to work.  So, add a second init func through module_init
 157 +*/
 158 +static int __init inode_mod_init(void)
 159 +{
 160 +       static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
 161 +       kernel_thread(kinoded, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 162 +       wait_for_completion(&startup);
 163 +       return 0;
 164  }
 165
 166  /**
 167 @@ -1344,3 +1376,5 @@
 168  }
 169
 170  #endif
 171 +
 172 +module_init(inode_mod_init) ;
 173 diff -urN linux-2.4.22.org/fs/reiserfs/bitmap.c linux-2.4.22/fs/reiserfs/bitmap.c
 174 --- linux-2.4.22.org/fs/reiserfs/bitmap.c       2003-11-21 15:08:29.000000000 +0100
 175 +++ linux-2.4.22/fs/reiserfs/bitmap.c   2003-11-21 15:14:23.000000000 +0100
 176 @@ -10,6 +10,7 @@
 177  #include <linux/errno.h>
 178  #include <linux/locks.h>
 179  #include <linux/kernel.h>
 180 +#include <linux/quotaops.h>
 181
 182  #include <linux/reiserfs_fs.h>
 183  #include <linux/reiserfs_fs_sb.h>
 184 @@ -287,7 +288,8 @@
 185  }
 186
 187  static void _reiserfs_free_block (struct reiserfs_transaction_handle *th,
 188 -                         b_blocknr_t block)
 189 +                                 struct inode *inode, b_blocknr_t block,
 190 +                                 int for_unformatted)
 191  {
 192      struct super_block * s = th->t_super;
 193      struct reiserfs_super_block * rs;
 194 @@ -296,7 +298,6 @@
 195      int nr, offset;
 196
 197      PROC_INFO_INC( s, free_block );
 198 -
 199      rs = SB_DISK_SUPER_BLOCK (s);
 200      sbh = SB_BUFFER_WITH_SB (s);
 201      apbi = SB_AP_BITMAP(s);
 202 @@ -309,7 +310,6 @@
 203                           block, bdevname(s->s_dev));
 204         return;
 205      }
 206 -
 207      reiserfs_prepare_for_journal(s, apbi[nr].bh, 1 ) ;
 208
 209      /* clear bit for the given block in bit map */
 210 @@ -329,39 +329,55 @@
 211      set_sb_free_blocks( rs, sb_free_blocks(rs) + 1 );
 212
 213      journal_mark_dirty (th, s, sbh);
 214 +    if (for_unformatted) {
 215 +#ifdef REISERQUOTA_DEBUG
 216 +      printk(KERN_DEBUG "reiserquota: freeing block id=%u\n", inode->i_uid);
 217 +#endif
 218 +      DQUOT_FREE_BLOCK_NODIRTY(inode, 1);
 219 +    }
 220 +
 221  }
 222
 223  void reiserfs_free_block (struct reiserfs_transaction_handle *th,
 224 -                         unsigned long block) {
 225 +                         struct inode *inode, unsigned long block,
 226 +                         int for_unformatted)
 227 +{
 228      struct super_block * s = th->t_super;
 229
 230      RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
 231      RFALSE(is_reusable (s, block, 1) == 0, "vs-4071: can not free such block");
 232      /* mark it before we clear it, just in case */
 233      journal_mark_freed(th, s, block) ;
 234 -    _reiserfs_free_block(th, block) ;
 235 +    _reiserfs_free_block(th, inode, block, for_unformatted) ;
 236  }
 237
 238  /* preallocated blocks don't need to be run through journal_mark_freed */
 239  void reiserfs_free_prealloc_block (struct reiserfs_transaction_handle *th,
 240 -                          unsigned long block) {
 241 +                         struct inode *inode,
 242 +                          unsigned long block)
 243 +{
 244      RFALSE(!th->t_super, "vs-4060: trying to free block on nonexistent device");
 245      RFALSE(is_reusable (th->t_super, block, 1) == 0, "vs-4070: can not free such block");
 246 -    _reiserfs_free_block(th, block) ;
 247 +    _reiserfs_free_block(th, inode, block, 1) ;
 248  }
 249
 250  static void __discard_prealloc (struct reiserfs_transaction_handle * th,
 251                                 struct inode * inode)
 252  {
 253      unsigned long save = inode->u.reiserfs_i.i_prealloc_block ;
 254 +    int dirty=0;
 255  #ifdef CONFIG_REISERFS_CHECK
 256      if (inode->u.reiserfs_i.i_prealloc_count < 0)
 257         reiserfs_warning(th->t_super, "zam-4001:%s: inode has negative prealloc blocks count.\n", __FUNCTION__ );
 258  #endif
 259      while (inode->u.reiserfs_i.i_prealloc_count > 0) {
 260 -       reiserfs_free_prealloc_block(th,inode->u.reiserfs_i.i_prealloc_block);
 261 +       reiserfs_free_prealloc_block(th, inode, inode->u.reiserfs_i.i_prealloc_block);
 262         inode->u.reiserfs_i.i_prealloc_block++;
 263         inode->u.reiserfs_i.i_prealloc_count --;
 264 +       dirty = 1 ;
 265 +    }
 266 +    if (dirty) {
 267 +       reiserfs_update_sd(th, inode) ;
 268      }
 269      inode->u.reiserfs_i.i_prealloc_block = save ;
 270      list_del (&(inode->u.reiserfs_i.i_prealloc_list));
 271 @@ -599,7 +615,6 @@
 272      if (hint->formatted_node || hint->inode == NULL) {
 273         return 0;
 274      }
 275 -
 276      hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id);
 277      border = hint->beg + (unsigned long) keyed_hash(((char *) (&hash_in)), 4) % (hint->end - hint->beg - 1);
 278      if (border > hint->search_start)
 279 @@ -776,6 +791,24 @@
 280      int nr_allocated = 0;
 281
 282      determine_prealloc_size(hint);
 283 +    if (!hint->formatted_node) {
 284 +        int quota_ret;
 285 +#ifdef REISERQUOTA_DEBUG
 286 +       printk(KERN_DEBUG "reiserquota: allocating %d blocks id=%u\n", amount_needed, hint->inode->i_uid);
 287 +#endif
 288 +       quota_ret = DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed);
 289 +       if (quota_ret)    /* Quota exceeded? */
 290 +           return QUOTA_EXCEEDED;
 291 +       if (hint->preallocate && hint->prealloc_size ) {
 292 +#ifdef REISERQUOTA_DEBUG
 293 +           printk(KERN_DEBUG "reiserquota: allocating (prealloc) %d blocks id=%u\n", hint->prealloc_size, hint->inode->i_uid);
 294 +#endif
 295 +           quota_ret = DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode, hint->prealloc_size);
 296 +           if (quota_ret)
 297 +               hint->preallocate=hint->prealloc_size=0;
 298 +       }
 299 +    }
 300 +
 301      while((nr_allocated
 302           += allocate_without_wrapping_disk(hint, new_blocknrs + nr_allocated, start, finish,
 303                                           amount_needed - nr_allocated, hint->prealloc_size))
 304 @@ -783,8 +816,14 @@
 305
 306         /* not all blocks were successfully allocated yet*/
 307         if (second_pass) {      /* it was a second pass; we must free all blocks */
 308 +           if (!hint->formatted_node) {
 309 +#ifdef REISERQUOTA_DEBUG
 310 +               printk(KERN_DEBUG "reiserquota: freeing (nospace) %d blocks id=%u\n", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid);
 311 +#endif
 312 +               DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated);     /* Free not allocated blocks */
 313 +           }
 314             while (nr_allocated --)
 315 -               reiserfs_free_block(hint->th, new_blocknrs[nr_allocated]);
 316 +               reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node);
 317
 318             return NO_DISK_SPACE;
 319         } else {                /* refine search parameters for next pass */
 320 @@ -794,6 +833,13 @@
 321             continue;
 322         }
 323      }
 324 +    if ( !hint->formatted_node && amount_needed + hint->prealloc_size > nr_allocated + INODE_INFO(hint->inode)->i_prealloc_count) {
 325 +    /* Some of the preallocation blocks were not allocated */
 326 +#ifdef REISERQUOTA_DEBUG
 327 +       printk(KERN_DEBUG "reiserquota: freeing (failed prealloc) %d blocks id=%u\n", amount_needed + hint->prealloc_size - nr_allocated - INODE_INFO(hint->inode)->i_prealloc_count, hint->inode->i_uid);
 328 +#endif
 329 +       DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated - INODE_INFO(hint->inode)->i_prealloc_count);
 330 +    }
 331      return CARRY_ON;
 332  }
 333
 334 @@ -862,7 +908,7 @@
 335
 336      if (ret != CARRY_ON) {
 337         while (amount_needed ++ < initial_amount_needed) {
 338 -           reiserfs_free_block(hint->th, *(--new_blocknrs));
 339 +           reiserfs_free_block(hint->th, hint->inode, *(--new_blocknrs), 1);
 340         }
 341      }
 342      return ret;
 343 diff -urN linux-2.4.22.org/fs/reiserfs/do_balan.c linux-2.4.22/fs/reiserfs/do_balan.c
 344 --- linux-2.4.22.org/fs/reiserfs/do_balan.c     2003-11-21 15:08:29.000000000 +0100
 345 +++ linux-2.4.22/fs/reiserfs/do_balan.c 2003-11-21 15:14:23.000000000 +0100
 346 @@ -33,16 +33,8 @@
 347  inline void do_balance_mark_leaf_dirty (struct tree_balance * tb,
 348                                         struct buffer_head * bh, int flag)
 349  {
 350 -    if (reiserfs_dont_log(tb->tb_sb)) {
 351 -       if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
 352 -           __mark_buffer_dirty(bh) ;
 353 -           tb->need_balance_dirty = 1;
 354 -       }
 355 -    } else {
 356 -       int windex = push_journal_writer("do_balance") ;
 357 -       journal_mark_dirty(tb->transaction_handle, tb->transaction_handle->t_super, bh) ;
 358 -       pop_journal_writer(windex) ;
 359 -    }
 360 +    journal_mark_dirty(tb->transaction_handle,
 361 +                       tb->transaction_handle->t_super, bh) ;
 362  }
 363
 364  #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
 365 @@ -1247,7 +1239,7 @@
 366             if (buffer_dirty (tb->thrown[i]))
 367               reiserfs_warning (tb->tb_sb, "free_thrown deals with dirty buffer %ld\n", blocknr);
 368             brelse(tb->thrown[i]) ; /* incremented in store_thrown */
 369 -           reiserfs_free_block (tb->transaction_handle, blocknr);
 370 +           reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
 371         }
 372      }
 373  }
 374 @@ -1259,9 +1251,11 @@
 375      set_blkh_level( blkh, FREE_LEVEL );
 376      set_blkh_nr_item( blkh, 0 );
 377
 378 -    mark_buffer_clean (bh);
 379 +    if (buffer_dirty(bh))
 380 +        BUG();
 381 +    // mark_buffer_clean (bh);
 382      /* reiserfs_free_block is no longer schedule safe
 383 -    reiserfs_free_block (tb->transaction_handle, tb->tb_sb, bh->b_blocknr);
 384 +    reiserfs_free_block (tb->transaction_handle, NULL, bh->b_blocknr, 0);
 385      */
 386
 387      store_thrown (tb, bh);
 388 @@ -1575,6 +1569,7 @@
 389      tb->tb_mode = flag;
 390      tb->need_balance_dirty = 0;
 391
 392 +    reiserfs_check_lock_depth("do balance");
 393      if (FILESYSTEM_CHANGED_TB(tb)) {
 394          reiserfs_panic(tb->tb_sb, "clm-6000: do_balance, fs generation has changed\n") ;
 395      }
 396 @@ -1605,5 +1600,6 @@
 397
 398
 399      do_balance_completed (tb);
 400 +    reiserfs_check_lock_depth("do balance2");
 401
 402  }
 403 diff -urN linux-2.4.22.org/fs/reiserfs/file.c linux-2.4.22/fs/reiserfs/file.c
 404 --- linux-2.4.22.org/fs/reiserfs/file.c 2003-11-21 15:08:29.000000000 +0100
 405 +++ linux-2.4.22/fs/reiserfs/file.c     2003-11-21 15:14:23.000000000 +0100
 406 @@ -6,6 +6,7 @@
 407  #include <linux/sched.h>
 408  #include <linux/reiserfs_fs.h>
 409  #include <linux/smp_lock.h>
 410 +#include <linux/quotaops.h>
 411
 412  /*
 413  ** We pack the tails of files on file close, not at the time they are written.
 414 @@ -42,7 +43,6 @@
 415      lock_kernel() ;
 416      down (&inode->i_sem);
 417      journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3) ;
 418 -    reiserfs_update_inode_transaction(inode) ;
 419
 420  #ifdef REISERFS_PREALLOCATE
 421      reiserfs_discard_prealloc (&th, inode);
 422 @@ -93,7 +93,9 @@
 423  static int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
 424      struct inode *inode = dentry->d_inode ;
 425      int error ;
 426 -    if (attr->ia_valid & ATTR_SIZE) {
 427 +    unsigned int ia_valid = attr->ia_valid ;
 428 +
 429 +    if (ia_valid & ATTR_SIZE) {
 430         /* version 2 items will be caught by the s_maxbytes check
 431         ** done for us in vmtruncate
 432         */
 433 @@ -101,8 +103,17 @@
 434             attr->ia_size > MAX_NON_LFS)
 435              return -EFBIG ;
 436
 437 +        /* During a truncate, we have to make sure the new i_size is in
 438 +       ** the transaction before we start dropping updates to data logged
 439 +       ** or ordered write data pages.
 440 +       */
 441 +       if (attr->ia_size < inode->i_size && reiserfs_file_data_log(inode)) {
 442 +           struct reiserfs_transaction_handle th ;
 443 +           journal_begin(&th, inode->i_sb, 1) ;
 444 +           reiserfs_update_sd_size(&th, inode, attr->ia_size) ;
 445 +           journal_end(&th, inode->i_sb, 1) ;
 446         /* fill in hole pointers in the expanding truncate case. */
 447 -        if (attr->ia_size > inode->i_size) {
 448 +        } else if (attr->ia_size > inode->i_size) {
 449             error = generic_cont_expand(inode, attr->ia_size) ;
 450             if (inode->u.reiserfs_i.i_prealloc_count > 0) {
 451                 struct reiserfs_transaction_handle th ;
 452 @@ -123,15 +134,35 @@
 453             return -EINVAL;
 454
 455      error = inode_change_ok(inode, attr) ;
 456 -    if (!error)
 457 -        inode_setattr(inode, attr) ;
 458 +    if (!error) {
 459 +       if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 460 +           (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
 461 +               error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
 462
 463 +       if (!error)
 464 +           error = inode_setattr(inode, attr) ;
 465 +    }
 466      return error ;
 467  }
 468
 469 +static ssize_t
 470 +reiserfs_file_write(struct file *f, const char *b, size_t count, loff_t *ppos)
 471 +{
 472 +    ssize_t ret;
 473 +    struct inode *inode = f->f_dentry->d_inode;
 474 +
 475 +    ret = generic_file_write(f, b, count, ppos);
 476 +    if (ret >= 0 && f->f_flags & O_SYNC) {
 477 +        lock_kernel();
 478 +       reiserfs_commit_for_inode(inode);
 479 +       unlock_kernel();
 480 +    }
 481 +    return ret;
 482 +}
 483 +
 484  struct file_operations reiserfs_file_operations = {
 485      read:      generic_file_read,
 486 -    write:     generic_file_write,
 487 +    write:     reiserfs_file_write,
 488      ioctl:     reiserfs_ioctl,
 489      mmap:      generic_file_mmap,
 490      release:   reiserfs_file_release,
 491 diff -urN linux-2.4.22.org/fs/reiserfs/fix_node.c linux-2.4.22/fs/reiserfs/fix_node.c
 492 --- linux-2.4.22.org/fs/reiserfs/fix_node.c     2003-11-21 15:08:29.000000000 +0100
 493 +++ linux-2.4.22/fs/reiserfs/fix_node.c 2003-11-21 15:14:23.000000000 +0100
 494 @@ -795,8 +795,9 @@
 495    else /* If we have enough already then there is nothing to do. */
 496      return CARRY_ON;
 497
 498 -  if ( reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs,
 499 -                                   n_amount_needed) == NO_DISK_SPACE )
 500 +  /* No need to check quota - it is not charged for blocks used for formatted nodes */
 501 +  if (reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs,
 502 +                                   n_amount_needed) == NO_DISK_SPACE)
 503      return NO_DISK_SPACE;
 504
 505    /* for each blocknumber we just got, get a buffer and stick it on FEB */
 506 @@ -2121,7 +2122,8 @@
 507
 508  static void clear_all_dirty_bits(struct super_block *s,
 509                                   struct buffer_head *bh) {
 510 -  reiserfs_prepare_for_journal(s, bh, 0) ;
 511 +  // reiserfs_prepare_for_journal(s, bh, 0) ;
 512 +  set_bit(BH_JPrepared, &bh->b_state) ;
 513  }
 514
 515  static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
 516 @@ -2518,7 +2520,7 @@
 517             /* de-allocated block which was not used by balancing and
 518                 bforget about buffer for it */
 519             brelse (tb->FEB[i]);
 520 -           reiserfs_free_block (tb->transaction_handle, blocknr);
 521 +           reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
 522         }
 523         if (tb->used[i]) {
 524             /* release used as new nodes including a new root */
 525 diff -urN linux-2.4.22.org/fs/reiserfs/ibalance.c linux-2.4.22/fs/reiserfs/ibalance.c
 526 --- linux-2.4.22.org/fs/reiserfs/ibalance.c     2003-11-21 15:08:29.000000000 +0100
 527 +++ linux-2.4.22/fs/reiserfs/ibalance.c 2003-11-21 15:14:23.000000000 +0100
 528 @@ -632,7 +632,6 @@
 529                 /* use check_internal if new root is an internal node */
 530                 check_internal (new_root);
 531             /*&&&&&&&&&&&&&&&&&&&&&&*/
 532 -           tb->tb_sb->s_dirt = 1;
 533
 534             /* do what is needed for buffer thrown from tree */
 535             reiserfs_invalidate_buffer(tb, tbSh);
 536 @@ -950,7 +949,6 @@
 537          PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr );
 538          PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 );
 539         do_balance_mark_sb_dirty (tb, tb->tb_sb->u.reiserfs_sb.s_sbh, 1);
 540 -       tb->tb_sb->s_dirt = 1;
 541      }
 542
 543      if ( tb->blknum[h] == 2 ) {
 544 diff -urN linux-2.4.22.org/fs/reiserfs/inode.c linux-2.4.22/fs/reiserfs/inode.c
 545 --- linux-2.4.22.org/fs/reiserfs/inode.c        2003-11-21 15:08:29.000000000 +0100
 546 +++ linux-2.4.22/fs/reiserfs/inode.c    2003-11-21 15:14:23.000000000 +0100
 547 @@ -4,9 +4,11 @@
 548
 549  #include <linux/config.h>
 550  #include <linux/sched.h>
 551 +#include <linux/fs.h>
 552  #include <linux/reiserfs_fs.h>
 553  #include <linux/locks.h>
 554  #include <linux/smp_lock.h>
 555 +#include <linux/quotaops.h>
 556  #include <asm/uaccess.h>
 557  #include <asm/unaligned.h>
 558
 559 @@ -17,6 +19,8 @@
 560  #define GET_BLOCK_READ_DIRECT 4  /* read the tail if indirect item not found */
 561  #define GET_BLOCK_NO_ISEM     8 /* i_sem is not held, don't preallocate */
 562
 563 +static int reiserfs_commit_write(struct file *, struct page *,
 564 +                                 unsigned from, unsigned to) ;
 565  static int reiserfs_get_block (struct inode * inode, long block,
 566                                struct buffer_head * bh_result, int create);
 567
 568 @@ -33,6 +37,7 @@
 569
 570      lock_kernel() ;
 571
 572 +    DQUOT_FREE_INODE(inode);
 573      /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
 574      if (INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
 575         down (&inode->i_sem);
 576 @@ -106,9 +111,13 @@
 577  }
 578
 579  static void add_to_flushlist(struct inode *inode, struct buffer_head *bh) {
 580 -    struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
 581 +    struct reiserfs_journal_list *jl = SB_JOURNAL(inode->i_sb)->j_current_jl;
 582 +    buffer_insert_list_journal_head(bh, &jl->j_ordered_bh_list, jl);
 583 +}
 584
 585 -    buffer_insert_list(bh, &j->j_dirty_buffers) ;
 586 +static void add_to_tail_list(struct inode *inode, struct buffer_head *bh) {
 587 +    struct reiserfs_journal_list *jl = SB_JOURNAL(inode->i_sb)->j_current_jl;
 588 +    buffer_insert_list_journal_head(bh, &jl->j_tail_bh_list, jl);
 589  }
 590
 591  //
 592 @@ -201,15 +210,16 @@
 593      return 0;
 594  }
 595
 596 -/*static*/ void restart_transaction(struct reiserfs_transaction_handle *th,
 597 -                               struct inode *inode, struct path *path) {
 598 -  struct super_block *s = th->t_super ;
 599 -  int len = th->t_blocks_allocated ;
 600 -
 601 +static void restart_transaction(struct reiserfs_transaction_handle *th,
 602 +                               struct inode *inode, struct path *path,
 603 +                               int jbegin_count) {
 604 +  /* we cannot restart while nested unless the parent allows it */
 605 +  if (!reiserfs_restartable_handle(th) && th->t_refcount > 1) {
 606 +      return  ;
 607 +  }
 608    pathrelse(path) ;
 609    reiserfs_update_sd(th, inode) ;
 610 -  journal_end(th, s, len) ;
 611 -  journal_begin(th, s, len) ;
 612 +  reiserfs_restart_transaction(th, jbegin_count) ;
 613    reiserfs_update_inode_transaction(inode) ;
 614  }
 615
 616 @@ -327,6 +337,10 @@
 617         }
 618      }
 619      p += offset ;
 620 +    if ((offset + inode->i_sb->s_blocksize) > PAGE_CACHE_SIZE) {
 621 +printk("get_block_create_0 offset %lu too large\n", offset);
 622 +    }
 623 +
 624      memset (p, 0, inode->i_sb->s_blocksize);
 625      do {
 626         if (!is_direct_le_ih (ih)) {
 627 @@ -421,10 +435,32 @@
 628  static int reiserfs_get_block_direct_io (struct inode * inode, long block,
 629                         struct buffer_head * bh_result, int create) {
 630      int ret ;
 631 -
 632 +    struct reiserfs_transaction_handle *th;
 633 +    int refcount = 0;
 634 +    struct super_block *s = inode->i_sb;
 635 +
 636 +    /* get_block might start a new transaction and leave it running.
 637 +     * test for that by checking for a transaction running right now
 638 +     * and recording its refcount.  Run a journal_end if the refcount
 639 +     * after reiserfs_get_block is higher than it was before.
 640 +     */
 641 +    if (reiserfs_transaction_running(s)) {
 642 +       th = current->journal_info;
 643 +       refcount = th->t_refcount;
 644 +    }
 645      bh_result->b_page = NULL;
 646      ret = reiserfs_get_block(inode, block, bh_result, create) ;
 647
 648 +    if (!ret && reiserfs_transaction_running(s)) {
 649 +       th = current->journal_info;
 650 +       if (th->t_refcount > refcount) {
 651 +           lock_kernel();
 652 +           reiserfs_update_sd(th, inode) ;
 653 +           journal_end(th, s, th->t_blocks_allocated);
 654 +           unlock_kernel();
 655 +       }
 656 +    }
 657 +
 658      /* don't allow direct io onto tail pages */
 659      if (ret == 0 && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
 660         /* make sure future calls to the direct io funcs for this offset
 661 @@ -459,7 +495,6 @@
 662                                   struct buffer_head *bh_result,
 663                                  loff_t tail_offset) {
 664      unsigned long index ;
 665 -    unsigned long tail_end ;
 666      unsigned long tail_start ;
 667      struct page * tail_page ;
 668      struct page * hole_page = bh_result->b_page ;
 669 @@ -470,7 +505,6 @@
 670
 671      /* always try to read until the end of the block */
 672      tail_start = tail_offset & (PAGE_CACHE_SIZE - 1) ;
 673 -    tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ;
 674
 675      index = tail_offset >> PAGE_CACHE_SHIFT ;
 676      if ( !hole_page || index != hole_page->index) {
 677 @@ -492,16 +526,13 @@
 678      ** data that has been read directly into the page, and block_prepare_write
 679      ** won't trigger a get_block in this case.
 680      */
 681 -    fix_tail_page_for_writing(tail_page) ;
 682 -    retval = block_prepare_write(tail_page, tail_start, tail_end,
 683 -                                 reiserfs_get_block) ;
 684 +    retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_start) ;
 685      if (retval)
 686          goto unlock ;
 687
 688      /* tail conversion might change the data in the page */
 689      flush_dcache_page(tail_page) ;
 690 -
 691 -    retval = generic_commit_write(NULL, tail_page, tail_start, tail_end) ;
 692 +    retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_start) ;
 693
 694  unlock:
 695      if (tail_page != hole_page) {
 696 @@ -541,20 +572,34 @@
 697      int done;
 698      int fs_gen;
 699      int windex ;
 700 -    struct reiserfs_transaction_handle th ;
 701 +    struct reiserfs_transaction_handle *th = NULL ;
 702      /* space reserved in transaction batch:
 703          . 3 balancings in direct->indirect conversion
 704          . 1 block involved into reiserfs_update_sd()
 705 +       . 1 bitmap block
 706         XXX in practically impossible worst case direct2indirect()
 707 -       can incur (much) more that 3 balancings. */
 708 -    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1;
 709 +       can incur (much) more than 3 balancings, but we deal with
 710 +       direct2indirect lower down */
 711 +    int jbegin_count = JOURNAL_PER_BALANCE_CNT + 2;
 712      int version;
 713 -    int transaction_started = 0 ;
 714 +    int dangle = 1;
 715      loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ;
 716 +    int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
 717
 718 -                               /* bad.... */
 719 +    /* if this block might contain a tail, we need to be more conservative */
 720 +    if (new_offset <= (loff_t)(16 * 1024)) {
 721 +        jbegin_count += JOURNAL_PER_BALANCE_CNT * 2;
 722 +    }
 723 +    /* we might nest for the entire page, so we need to make sure
 724 +     * to reserve enough to insert pointers in the tree for each block
 725 +     * in the file
 726 +     */
 727 +    jbegin_count *= blocks_per_page;
 728 +    if (reiserfs_file_data_log(inode)) {
 729 +        jbegin_count += blocks_per_page;
 730 +
 731 +    }
 732      lock_kernel() ;
 733 -    th.t_trans_id = 0 ;
 734      version = get_inode_item_key_version (inode);
 735
 736      if (block < 0) {
 737 @@ -579,6 +624,10 @@
 738         return ret;
 739      }
 740
 741 +    /* don't leave the trans running if we are already nested */
 742 +    if (reiserfs_transaction_running(inode->i_sb))
 743 +       dangle = 0;
 744 +
 745      /* If file is of such a size, that it might have a tail and tails are enabled
 746      ** we should mark it as possibly needing tail packing on close
 747      */
 748 @@ -591,10 +640,18 @@
 749      /* set the key of the first byte in the 'block'-th block of file */
 750      make_cpu_key (&key, inode, new_offset,
 751                   TYPE_ANY, 3/*key length*/);
 752 +
 753 +    /* reiserfs_commit_write will close any transaction currently
 754 +    ** running.  So, if we are nesting into someone else, we have to
 755 +    ** make sure and bump the refcount
 756 +    */
 757      if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
 758 -       journal_begin(&th, inode->i_sb, jbegin_count) ;
 759 +       th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count) ;
 760 +       if (IS_ERR(th)) {
 761 +           retval = PTR_ERR(th) ;
 762 +           goto failure ;
 763 +       }
 764         reiserfs_update_inode_transaction(inode) ;
 765 -       transaction_started = 1 ;
 766      }
 767   research:
 768
 769 @@ -614,28 +671,34 @@
 770
 771      if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) {
 772         /* we have to allocate block for the unformatted node */
 773 -       if (!transaction_started) {
 774 +       if (!reiserfs_active_handle(th)) {
 775             pathrelse(&path) ;
 776 -           journal_begin(&th, inode->i_sb, jbegin_count) ;
 777 +           th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count) ;
 778 +           if (IS_ERR(th)) {
 779 +               retval = PTR_ERR(th) ;
 780 +               goto failure ;
 781 +           }
 782             reiserfs_update_inode_transaction(inode) ;
 783 -           transaction_started = 1 ;
 784             goto research ;
 785         }
 786
 787 -       repeat = _allocate_block(&th, block, inode, &allocated_block_nr, &path, create);
 788 +       repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create);
 789
 790 -       if (repeat == NO_DISK_SPACE) {
 791 +       if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
 792             /* restart the transaction to give the journal a chance to free
 793             ** some blocks.  releases the path, so we have to go back to
 794             ** research if we succeed on the second try
 795             */
 796 -           restart_transaction(&th, inode, &path) ;
 797 -           repeat = _allocate_block(&th, block, inode, &allocated_block_nr, NULL, create);
 798 +           restart_transaction(th, inode, &path, jbegin_count) ;
 799 +           repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create);
 800
 801 -           if (repeat != NO_DISK_SPACE) {
 802 +           if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
 803                 goto research ;
 804             }
 805 -           retval = -ENOSPC;
 806 +           if (repeat == QUOTA_EXCEEDED)
 807 +               retval = -EDQUOT;
 808 +           else
 809 +               retval = -ENOSPC;
 810             goto failure;
 811         }
 812
 813 @@ -660,15 +723,12 @@
 814             bh_result->b_state |= (1UL << BH_New);
 815             put_block_num(item, pos_in_item, allocated_block_nr) ;
 816              unfm_ptr = allocated_block_nr;
 817 -           journal_mark_dirty (&th, inode->i_sb, bh);
 818 -           inode->i_blocks += (inode->i_sb->s_blocksize / 512) ;
 819 -           reiserfs_update_sd(&th, inode) ;
 820 +           journal_mark_dirty (th, inode->i_sb, bh);
 821 +           reiserfs_update_sd(th, inode) ;
 822         }
 823         set_block_dev_mapped(bh_result, unfm_ptr, inode);
 824         pathrelse (&path);
 825         pop_journal_writer(windex) ;
 826 -       if (transaction_started)
 827 -           journal_end(&th, inode->i_sb, jbegin_count) ;
 828
 829         unlock_kernel() ;
 830
 831 @@ -676,18 +736,23 @@
 832         ** there is no need to make sure the inode is updated with this
 833         ** transaction
 834         */
 835 +       if (!dangle && reiserfs_active_handle(th))
 836 +           journal_end(th, inode->i_sb, jbegin_count) ;
 837         return 0;
 838      }
 839
 840 -    if (!transaction_started) {
 841 +    if (!reiserfs_active_handle(th)) {
 842         /* if we don't pathrelse, we could vs-3050 on the buffer if
 843         ** someone is waiting for it (they can't finish until the buffer
 844 -       ** is released, we can start a new transaction until they finish)
 845 +       ** is released, we can't start a new transaction until they finish)
 846         */
 847         pathrelse(&path) ;
 848 -       journal_begin(&th, inode->i_sb, jbegin_count) ;
 849 +       th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count) ;
 850 +       if (IS_ERR(th)) {
 851 +           retval = PTR_ERR(th) ;
 852 +           goto failure ;
 853 +       }
 854         reiserfs_update_inode_transaction(inode) ;
 855 -       transaction_started = 1 ;
 856         goto research;
 857      }
 858
 859 @@ -716,13 +781,11 @@
 860             set_cpu_key_k_offset (&tmp_key, 1);
 861             PATH_LAST_POSITION(&path) ++;
 862
 863 -           retval = reiserfs_insert_item (&th, &path, &tmp_key, &tmp_ih, (char *)&unp);
 864 +           retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp);
 865             if (retval) {
 866 -               reiserfs_free_block (&th, allocated_block_nr);
 867 -               goto failure; // retval == -ENOSPC or -EIO or -EEXIST
 868 +               reiserfs_free_block (th, inode, allocated_block_nr, 1);
 869 +               goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
 870             }
 871 -           if (unp)
 872 -               inode->i_blocks += inode->i_sb->s_blocksize / 512;
 873             //mark_tail_converted (inode);
 874         } else if (is_direct_le_ih (ih)) {
 875             /* direct item has to be converted */
 876 @@ -742,8 +805,13 @@
 877                    node. FIXME: this should also get into page cache */
 878
 879                 pathrelse(&path) ;
 880 -               journal_end(&th, inode->i_sb, jbegin_count) ;
 881 -               transaction_started = 0 ;
 882 +               /* ugly, but we should only end the transaction if
 883 +               ** we aren't nested
 884 +               */
 885 +               if (th->t_refcount == 1) {
 886 +                   journal_end(th, inode->i_sb, jbegin_count) ;
 887 +                   th = NULL ;
 888 +               }
 889
 890                 retval = convert_tail_for_hole(inode, bh_result, tail_offset) ;
 891                 if (retval) {
 892 @@ -751,20 +819,27 @@
 893                         reiserfs_warning(inode->i_sb, "clm-6004: convert tail failed inode %lu, error %d\n", inode->i_ino, retval) ;
 894                     if (allocated_block_nr) {
 895                         /* the bitmap, the super, and the stat data == 3 */
 896 -                       journal_begin(&th, inode->i_sb, 3) ;
 897 -                       reiserfs_free_block (&th, allocated_block_nr);
 898 -                       transaction_started = 1 ;
 899 +                       if (!reiserfs_active_handle(th)) {
 900 +                           th = reiserfs_persistent_transaction(inode->i_sb,3);
 901 +                       }
 902 +                       if (!IS_ERR(th)) {
 903 +                           reiserfs_free_block(th,inode,allocated_block_nr,1);
 904 +                       }
 905 +
 906                     }
 907                     goto failure ;
 908                 }
 909                 goto research ;
 910             }
 911 -           retval = direct2indirect (&th, inode, &path, unbh, tail_offset);
 912 +           retval = direct2indirect (th, inode, &path, unbh, tail_offset);
 913             if (retval) {
 914                 reiserfs_unmap_buffer(unbh);
 915 -               reiserfs_free_block (&th, allocated_block_nr);
 916 +               reiserfs_free_block (th, inode, allocated_block_nr, 1);
 917                 goto failure;
 918             }
 919 +
 920 +           reiserfs_update_sd(th, inode) ;
 921 +
 922             /* it is important the mark_buffer_uptodate is done after
 923             ** the direct2indirect.  The buffer might contain valid
 924             ** data newer than the data on disk (read by readpage, changed,
 925 @@ -775,24 +850,25 @@
 926             */
 927             mark_buffer_uptodate (unbh, 1);
 928
 929 -           /* unbh->b_page == NULL in case of DIRECT_IO request, this means
 930 -              buffer will disappear shortly, so it should not be added to
 931 -              any of our lists.
 932 +           /* we've converted the tail, so we must
 933 +           ** flush unbh before the transaction commits.
 934 +           ** unbh->b_page will be NULL for direct io requests, and
 935 +           ** in that case there's no data to log, dirty or order
 936             */
 937             if ( unbh->b_page ) {
 938 -               /* we've converted the tail, so we must
 939 -               ** flush unbh before the transaction commits
 940 -               */
 941 -               add_to_flushlist(inode, unbh) ;
 942 -
 943 -               /* mark it dirty now to prevent commit_write from adding
 944 -                ** this buffer to the inode's dirty buffer list
 945 -                */
 946 -               __mark_buffer_dirty(unbh) ;
 947 +               if (reiserfs_file_data_log(inode)) {
 948 +                   reiserfs_prepare_for_journal(inode->i_sb, unbh, 1) ;
 949 +                   journal_mark_dirty(th, inode->i_sb, unbh) ;
 950 +               } else {
 951 +                   /* mark it dirty now to prevent commit_write from adding
 952 +                   ** this buffer to the inode's dirty buffer list
 953 +                   */
 954 +                   __mark_buffer_dirty(unbh) ;
 955 +                   /* note, this covers the data=ordered case too */
 956 +                   add_to_tail_list(inode, unbh) ;
 957 +               }
 958             }
 959
 960 -           //inode->i_blocks += inode->i_sb->s_blocksize / 512;
 961 -           //mark_tail_converted (inode);
 962         } else {
 963             /* append indirect item with holes if needed, when appending
 964                pointer to 'block'-th block use block, which is already
 965 @@ -840,18 +916,16 @@
 966                    only have space for one block */
 967                 blocks_needed=max_to_insert?max_to_insert:1;
 968             }
 969 -           retval = reiserfs_paste_into_item (&th, &path, &tmp_key, (char *)un, UNFM_P_SIZE * blocks_needed);
 970 +           retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed);
 971
 972             if (blocks_needed != 1)
 973                  kfree(un);
 974
 975             if (retval) {
 976 -               reiserfs_free_block (&th, allocated_block_nr);
 977 +               reiserfs_free_block (th, inode, allocated_block_nr, 1);
 978                 goto failure;
 979             }
 980 -           if (done) {
 981 -               inode->i_blocks += inode->i_sb->s_blocksize / 512;
 982 -           } else {
 983 +           if (!done) {
 984                 /* We need to mark new file size in case this function will be
 985                    interrupted/aborted later on. And we may do this only for
 986                    holes. */
 987 @@ -870,9 +944,12 @@
 988         **
 989         ** release the path so that anybody waiting on the path before
 990         ** ending their transaction will be able to continue.
 991 +       **
 992 +       ** this only happens when inserting holes into the file, so it
 993 +       ** does not affect data=ordered safety at all
 994         */
 995 -       if (journal_transaction_should_end(&th, th.t_blocks_allocated)) {
 996 -         restart_transaction(&th, inode, &path) ;
 997 +       if (journal_transaction_should_end(th, jbegin_count)) {
 998 +           restart_transaction(th, inode, &path, jbegin_count) ;
 999         }
1000         /* inserting indirect pointers for a hole can take a
1001         ** long time.  reschedule if needed
1002 @@ -890,7 +967,7 @@
1003                               "%K should not be found\n", &key);
1004             retval = -EEXIST;
1005             if (allocated_block_nr)
1006 -               reiserfs_free_block (&th, allocated_block_nr);
1007 +               reiserfs_free_block (th, inode, allocated_block_nr, 1);
1008             pathrelse(&path) ;
1009             goto failure;
1010         }
1011 @@ -902,20 +979,82 @@
1012
1013
1014      retval = 0;
1015 -    reiserfs_check_path(&path) ;
1016
1017   failure:
1018 -    if (transaction_started) {
1019 -      reiserfs_update_sd(&th, inode) ;
1020 -      journal_end(&th, inode->i_sb, jbegin_count) ;
1021 +    pathrelse(&path) ;
1022 +    /* if we had an error, end the transaction */
1023 +    if (!IS_ERR(th) && reiserfs_active_handle(th)) {
1024 +        if (retval != 0) {
1025 +           reiserfs_update_sd(th, inode) ;
1026 +           journal_end(th, inode->i_sb, jbegin_count) ;
1027 +           th = NULL ;
1028 +       } else if (!dangle) {
1029 +           journal_end(th, inode->i_sb, jbegin_count) ;
1030 +           th = NULL ;
1031 +       }
1032      }
1033      pop_journal_writer(windex) ;
1034 +    if (retval == 0 && reiserfs_active_handle(th) &&
1035 +        current->journal_info != th) {
1036 +        BUG() ;
1037 +    }
1038      unlock_kernel() ;
1039 -    reiserfs_check_path(&path) ;
1040      return retval;
1041  }
1042
1043
1044 +/* Compute the real number of bytes used by a file.
1045 + * The following three functions can go away once there is enough space in the stat item.
1046 + */
1047 +static int real_space_diff(struct inode *inode, int sd_size)
1048 +{
1049 +    int bytes;
1050 +    loff_t blocksize = inode->i_sb->s_blocksize ;
1051 +
1052 +    if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
1053 +        return sd_size ;
1054 +
1055 +    /* End of file is also in full block with indirect reference, so round
1056 +    ** up to the next block.
1057 +    **
1058 +    ** there is just no way to know if the tail is actually packed
1059 +    ** on the file, so we have to assume it isn't.  When we pack the
1060 +    ** tail, we add 4 bytes to pretend there really is an unformatted
1061 +    ** node pointer
1062 +    */
1063 +    bytes = ((inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + sd_size;
1064 +    return bytes ;
1065 +}
1066 +
1067 +static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
1068 +                                        int sd_size)
1069 +{
1070 +    if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1071 +        return inode->i_size + (loff_t)(real_space_diff(inode, sd_size)) ;
1072 +    }
1073 +    return ((loff_t)real_space_diff(inode, sd_size)) + (((loff_t)blocks) << 9);
1074 +}
1075 +
1076 +/* Compute number of blocks used by file in ReiserFS counting */
1077 +static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
1078 +{
1079 +    loff_t bytes = inode_get_bytes(inode) ;
1080 +    loff_t real_space = real_space_diff(inode, sd_size) ;
1081 +
1082 +    /* keeps fsck and non-quota versions of reiserfs happy */
1083 +    if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1084 +        bytes += (loff_t)511 ;
1085 +    }
1086 +
1087 +    /* files from before the quota patch might have i_blocks such that
1088 +    ** bytes < real_space.  Deal with that here to prevent it from
1089 +    ** going negative.
1090 +    */
1091 +    if (bytes < real_space)
1092 +        return 0 ;
1093 +    return (bytes - real_space) >> 9;
1094 +}
1095 +
1096  //
1097  // BAD: new directories have stat data of new type and all other items
1098  // of old type. Version stored in the inode says about body items, so
1099 @@ -971,6 +1110,14 @@
1100
1101          rdev = sd_v1_rdev(sd);
1102         inode->u.reiserfs_i.i_first_direct_byte = sd_v1_first_direct_byte(sd);
1103 +       /* an early bug in the quota code can give us an odd number for the
1104 +       ** block count.  This is incorrect, fix it here.
1105 +       */
1106 +       if (inode->i_blocks & 1) {
1107 +           inode->i_blocks++ ;
1108 +       }
1109 +       inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1110 +                                                 SD_V1_SIZE));
1111         /* nopack is initially zero for v1 objects. For v2 objects,
1112            nopack is initialised from sd_attrs */
1113         inode->u.reiserfs_i.i_flags &= ~i_nopack_mask;
1114 @@ -1000,6 +1147,8 @@
1115              set_inode_item_key_version (inode, KEY_FORMAT_3_6);
1116
1117          set_inode_sd_version (inode, STAT_DATA_V2);
1118 +       inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1119 +                                                 SD_V2_SIZE));
1120         /* read persistent inode attributes from sd and initalise
1121            generic inode flags from them */
1122         inode -> u.reiserfs_i.i_attrs = sd_v2_attrs( sd );
1123 @@ -1026,7 +1175,7 @@
1124
1125
1126  // update new stat data with inode fields
1127 -static void inode2sd (void * sd, struct inode * inode)
1128 +static void inode2sd (void * sd, struct inode * inode, loff_t new_size)
1129  {
1130      struct stat_data * sd_v2 = (struct stat_data *)sd;
1131      __u16 flags;
1132 @@ -1034,12 +1183,12 @@
1133      set_sd_v2_mode(sd_v2, inode->i_mode );
1134      set_sd_v2_nlink(sd_v2, inode->i_nlink );
1135      set_sd_v2_uid(sd_v2, inode->i_uid );
1136 -    set_sd_v2_size(sd_v2, inode->i_size );
1137 +    set_sd_v2_size(sd_v2, new_size);
1138      set_sd_v2_gid(sd_v2, inode->i_gid );
1139      set_sd_v2_mtime(sd_v2, inode->i_mtime );
1140      set_sd_v2_atime(sd_v2, inode->i_atime );
1141      set_sd_v2_ctime(sd_v2, inode->i_ctime );
1142 -    set_sd_v2_blocks(sd_v2, inode->i_blocks );
1143 +    set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
1144      if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1145          set_sd_v2_rdev(sd_v2, inode->i_rdev );
1146      else
1147 @@ -1051,7 +1200,7 @@
1148
1149
1150  // used to copy inode's fields to old stat data
1151 -static void inode2sd_v1 (void * sd, struct inode * inode)
1152 +static void inode2sd_v1 (void * sd, struct inode * inode, loff_t new_size)
1153  {
1154      struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd;
1155
1156 @@ -1059,7 +1208,7 @@
1157      set_sd_v1_uid(sd_v1, inode->i_uid );
1158      set_sd_v1_gid(sd_v1, inode->i_gid );
1159      set_sd_v1_nlink(sd_v1, inode->i_nlink );
1160 -    set_sd_v1_size(sd_v1, inode->i_size );
1161 +    set_sd_v1_size(sd_v1, new_size);
1162      set_sd_v1_atime(sd_v1, inode->i_atime );
1163      set_sd_v1_ctime(sd_v1, inode->i_ctime );
1164      set_sd_v1_mtime(sd_v1, inode->i_mtime );
1165 @@ -1067,7 +1216,7 @@
1166      if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1167          set_sd_v1_rdev(sd_v1, inode->i_rdev );
1168      else
1169 -        set_sd_v1_blocks(sd_v1, inode->i_blocks );
1170 +        set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
1171
1172      // Sigh. i_first_direct_byte is back
1173      set_sd_v1_first_direct_byte(sd_v1, inode->u.reiserfs_i.i_first_direct_byte);
1174 @@ -1077,7 +1226,8 @@
1175  /* NOTE, you must prepare the buffer head before sending it here,
1176  ** and then log it after the call
1177  */
1178 -static void update_stat_data (struct path * path, struct inode * inode)
1179 +static void update_stat_data (struct path * path, struct inode * inode,
1180 +                              loff_t new_size)
1181  {
1182      struct buffer_head * bh;
1183      struct item_head * ih;
1184 @@ -1091,17 +1241,16 @@
1185
1186      if (stat_data_v1 (ih)) {
1187         // path points to old stat data
1188 -       inode2sd_v1 (B_I_PITEM (bh, ih), inode);
1189 +       inode2sd_v1 (B_I_PITEM (bh, ih), inode, new_size);
1190      } else {
1191 -       inode2sd (B_I_PITEM (bh, ih), inode);
1192 +       inode2sd (B_I_PITEM (bh, ih), inode, new_size);
1193      }
1194
1195      return;
1196  }
1197
1198 -
1199 -void reiserfs_update_sd (struct reiserfs_transaction_handle *th,
1200 -                        struct inode * inode)
1201 +void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
1202 +                             struct inode *inode, loff_t new_size)
1203  {
1204      struct cpu_key key;
1205      INITIALIZE_PATH(path);
1206 @@ -1151,7 +1300,7 @@
1207         }
1208         break;
1209      }
1210 -    update_stat_data (&path, inode);
1211 +    update_stat_data (&path, inode, new_size);
1212      journal_mark_dirty(th, th->t_super, bh) ;
1213      pathrelse (&path);
1214      return;
1215 @@ -1236,6 +1385,7 @@
1216             reiserfs_make_bad_inode( inode );
1217      }
1218
1219 +    reiserfs_update_inode_transaction(inode);
1220      reiserfs_check_path(&path_to_sd) ; /* init inode should be relsing */
1221
1222  }
1223 @@ -1415,8 +1565,6 @@
1224  ** does something when called for a synchronous update.
1225  */
1226  void reiserfs_write_inode (struct inode * inode, int do_sync) {
1227 -    struct reiserfs_transaction_handle th ;
1228 -    int jbegin_count = 1 ;
1229
1230      if (inode->i_sb->s_flags & MS_RDONLY) {
1231          reiserfs_warning(inode->i_sb, "clm-6005: writing inode %lu on readonly FS\n",
1232 @@ -1430,9 +1578,7 @@
1233      */
1234      if (do_sync && !(current->flags & PF_MEMALLOC)) {
1235         lock_kernel() ;
1236 -       journal_begin(&th, inode->i_sb, jbegin_count) ;
1237 -       reiserfs_update_sd (&th, inode);
1238 -       journal_end_sync(&th, inode->i_sb, jbegin_count) ;
1239 +       reiserfs_commit_for_inode(inode) ;
1240         unlock_kernel() ;
1241      }
1242  }
1243 @@ -1450,6 +1596,7 @@
1244  /* stat data of new object is inserted already, this inserts the item
1245     containing "." and ".." entries */
1246  static int reiserfs_new_directory (struct reiserfs_transaction_handle *th,
1247 +                                  struct inode *inode,
1248                                    struct item_head * ih, struct path * path,
1249                                    const struct inode * dir)
1250  {
1251 @@ -1494,13 +1641,14 @@
1252      }
1253
1254      /* insert item, that is empty directory item */
1255 -    return reiserfs_insert_item (th, path, &key, ih, body);
1256 +    return reiserfs_insert_item (th, path, &key, ih, inode, body);
1257  }
1258
1259
1260  /* stat data of object has been inserted, this inserts the item
1261     containing the body of symlink */
1262  static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th,
1263 +                                struct inode *inode,   /* Inode of symlink */
1264                                  struct item_head * ih,
1265                                  struct path * path, const char * symname, int item_len)
1266  {
1267 @@ -1530,7 +1678,7 @@
1268      }
1269
1270      /* insert item, that is body of symlink */
1271 -    return reiserfs_insert_item (th, path, &key, ih, symname);
1272 +    return reiserfs_insert_item (th, path, &key, ih, inode, symname);
1273  }
1274
1275
1276 @@ -1604,7 +1752,8 @@
1277
1278      inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1279      inode->i_size = i_size;
1280 -    inode->i_blocks = (inode->i_size + 511) >> 9;
1281 +    inode->i_blocks = 0;
1282 +    inode->i_bytes = 0;
1283      inode->u.reiserfs_i.i_first_direct_byte = S_ISLNK(mode) ? 1 :
1284        U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/;
1285
1286 @@ -1638,9 +1787,9 @@
1287             err = -EINVAL;
1288             goto out_bad_inode;
1289         }
1290 -       inode2sd_v1 (&sd, inode);
1291 +       inode2sd_v1 (&sd, inode, inode->i_size);
1292      } else
1293 -       inode2sd (&sd, inode);
1294 +       inode2sd (&sd, inode, inode->i_size);
1295
1296      // these do not go to on-disk stat data
1297      inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid);
1298 @@ -1665,7 +1814,7 @@
1299      if (dir->u.reiserfs_i.new_packing_locality)
1300         th->displace_new_blocks = 1;
1301  #endif
1302 -    retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, (char *)(&sd));
1303 +    retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd));
1304      if (retval) {
1305         reiserfs_check_path(&path_to_key) ;
1306         err = retval;
1307 @@ -1678,14 +1827,14 @@
1308  #endif
1309      if (S_ISDIR(mode)) {
1310         /* insert item with "." and ".." */
1311 -       retval = reiserfs_new_directory (th, &ih, &path_to_key, dir);
1312 +       retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir);
1313      }
1314
1315      if (S_ISLNK(mode)) {
1316         /* insert body of symlink */
1317         if (!old_format_only (sb))
1318             i_size = ROUND_UP(i_size);
1319 -       retval = reiserfs_new_symlink (th, &ih, &path_to_key, symname, i_size);
1320 +       retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size);
1321      }
1322      if (retval) {
1323         err = retval;
1324 @@ -1705,6 +1854,9 @@
1325
1326      /* dquot_drop must be done outside a transaction */
1327      journal_end(th, th->t_super, th->t_blocks_allocated) ;
1328 +    DQUOT_FREE_INODE(inode);
1329 +    DQUOT_DROP(inode);
1330 +    inode->i_flags |= S_NOQUOTA;
1331      make_bad_inode(inode);
1332
1333  out_inserted_sd:
1334 @@ -1816,6 +1968,7 @@
1335      unsigned length ;
1336      struct page *page = NULL ;
1337      int error ;
1338 +    int need_balance_dirty = 0 ;
1339      struct buffer_head *bh = NULL ;
1340
1341      if (p_s_inode->i_size > 0) {
1342 @@ -1848,34 +2001,58 @@
1343                transaction of truncating gets committed - on reboot the file
1344                either appears truncated properly or not truncated at all */
1345         add_save_link (&th, p_s_inode, 1);
1346 +    if (page)
1347 +       kmap(page);
1348      reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ;
1349      pop_journal_writer(windex) ;
1350 -    journal_end(&th, p_s_inode->i_sb,  JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ;
1351 -
1352 -    if (update_timestamps)
1353 -       remove_save_link (p_s_inode, 1/* truncate */);
1354
1355      if (page) {
1356 +       if (!PageLocked(page))
1357 +           BUG();
1358          length = offset & (blocksize - 1) ;
1359         /* if we are not on a block boundary */
1360         if (length) {
1361             length = blocksize - length ;
1362 -           memset((char *)kmap(page) + offset, 0, length) ;
1363 +           if ((offset + length) > PAGE_CACHE_SIZE) {
1364 +               BUG();
1365 +           }
1366 +           memset((char *)page_address(page) + offset, 0, length) ;
1367             flush_dcache_page(page) ;
1368 -           kunmap(page) ;
1369             if (buffer_mapped(bh) && bh->b_blocknr != 0) {
1370 -               if (!atomic_set_buffer_dirty(bh)) {
1371 +               if (reiserfs_file_data_log(p_s_inode)) {
1372 +                   reiserfs_prepare_for_journal(p_s_inode->i_sb, bh, 1) ;
1373 +                   journal_mark_dirty(&th, p_s_inode->i_sb, bh) ;
1374 +               } else {
1375 +                   /* it is safe to block here, but it would be faster
1376 +                   ** to balance dirty after the journal lock is dropped
1377 +                   */
1378 +                   if (!atomic_set_buffer_dirty(bh)) {
1379                         set_buffer_flushtime(bh);
1380                         refile_buffer(bh);
1381                         buffer_insert_inode_data_queue(bh, p_s_inode);
1382 -                       balance_dirty();
1383 +                       need_balance_dirty = 1;
1384 +
1385 +                       if (reiserfs_data_ordered(p_s_inode->i_sb)) {
1386 +                           add_to_flushlist(p_s_inode, bh) ;
1387 +                       }
1388 +                   }
1389                 }
1390             }
1391         }
1392 +       kunmap(page);
1393 +    }
1394 +    journal_end(&th, p_s_inode->i_sb,  JOURNAL_PER_BALANCE_CNT * 2 + 1) ;
1395 +
1396 +    if (update_timestamps)
1397 +       remove_save_link(p_s_inode, 1/* truncate */);
1398 +
1399 +    if (page) {
1400         UnlockPage(page) ;
1401         page_cache_release(page) ;
1402      }
1403 -
1404 +    if (need_balance_dirty) {
1405 +       balance_dirty() ;
1406 +    }
1407      return ;
1408  }
1409
1410 @@ -1944,6 +2121,8 @@
1411             goto research;
1412         }
1413
1414 +       if (((B_I_PITEM(bh, ih) - bh->b_data) + pos_in_item + copy_size) > inode->i_sb->s_blocksize)
1415 +           BUG();
1416         memcpy( B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, copy_size) ;
1417
1418         journal_mark_dirty(&th, inode->i_sb, bh) ;
1419 @@ -1971,9 +2150,37 @@
1420
1421      /* this is where we fill in holes in the file. */
1422      if (use_get_block) {
1423 +       int old_refcount = 0 ;
1424 +       struct reiserfs_transaction_handle *hole_th ;
1425 +       if (reiserfs_transaction_running(inode->i_sb)) {
1426 +           hole_th = current->journal_info ;
1427 +           old_refcount = hole_th->t_refcount ;
1428 +       }
1429         retval = reiserfs_get_block(inode, block, bh_result,
1430                                     GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM) ;
1431         if (!retval) {
1432 +           /* did reiserfs_get_block leave us a running transaction? */
1433 +           if (reiserfs_transaction_running(inode->i_sb)) {
1434 +               hole_th = current->journal_info ;
1435 +               if (old_refcount < hole_th->t_refcount) {
1436 +                   lock_kernel() ;
1437 +                   /* we've filled a hole, make sure the new block
1438 +                    * gets to disk before transaction commit
1439 +                    */
1440 +                   if (buffer_mapped(bh_result) && bh_result->b_blocknr != 0 &&
1441 +                       reiserfs_data_ordered(inode->i_sb))
1442 +                   {
1443 +                       __mark_buffer_dirty(bh_result) ;
1444 +                       mark_buffer_uptodate(bh_result, 1);
1445 +                       /* no need to update the inode trans, already done */
1446 +                       add_to_flushlist(inode, bh_result) ;
1447 +                   }
1448 +                   reiserfs_update_sd(hole_th, inode) ;
1449 +                   journal_end(hole_th, hole_th->t_super,
1450 +                               hole_th->t_blocks_allocated) ;
1451 +                   unlock_kernel() ;
1452 +               }
1453 +           }
1454             if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) {
1455                 /* get_block failed to find a mapped unformatted node. */
1456                 use_get_block = 0 ;
1457 @@ -1988,33 +2195,41 @@
1458  /* helper func to get a buffer head ready for writepage to send to
1459  ** ll_rw_block
1460  */
1461 -static inline void submit_bh_for_writepage(struct buffer_head **bhp, int nr) {
1462 +static void submit_bh_for_writepage(struct page *page,
1463 +                                    struct buffer_head **bhp, int nr) {
1464      struct buffer_head *bh ;
1465      int i;
1466
1467 -    /* lock them all first so the end_io handler doesn't unlock the page
1468 -    ** too early
1469 +    /* lock them all first so the end_io handler doesn't
1470 +    ** unlock too early
1471 +    **
1472 +    ** There's just no safe way to log the buffers during writepage,
1473 +    ** we'll deadlock if kswapd tries to start a transaction.
1474 +    **
1475 +    ** There's also no useful way to tie them to a specific transaction,
1476 +    ** so we just don't bother.
1477      */
1478      for(i = 0 ; i < nr ; i++) {
1479 -        bh = bhp[i] ;
1480 -       lock_buffer(bh) ;
1481 -       set_buffer_async_io(bh) ;
1482 +       bh = bhp[i] ;
1483 +       lock_buffer(bh);
1484 +       set_buffer_async_io(bh);
1485 +       set_bit(BH_Uptodate, &bh->b_state) ;
1486      }
1487      for(i = 0 ; i < nr ; i++) {
1488 +       bh = bhp[i] ;
1489         /* submit_bh doesn't care if the buffer is dirty, but nobody
1490         ** later on in the call chain will be cleaning it.  So, we
1491         ** clean the buffer here, it still gets written either way.
1492         */
1493 -        bh = bhp[i] ;
1494         clear_bit(BH_Dirty, &bh->b_state) ;
1495 -       set_bit(BH_Uptodate, &bh->b_state) ;
1496         submit_bh(WRITE, bh) ;
1497      }
1498  }
1499
1500  static int reiserfs_write_full_page(struct page *page) {
1501      struct inode *inode = page->mapping->host ;
1502 -    unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ;
1503 +    loff_t size = inode->i_size;
1504 +    unsigned long end_index = size >> PAGE_CACHE_SHIFT ;
1505      unsigned last_offset = PAGE_CACHE_SIZE;
1506      int error = 0;
1507      unsigned long block ;
1508 @@ -2024,21 +2239,36 @@
1509      struct buffer_head *arr[PAGE_CACHE_SIZE/512] ;
1510      int nr = 0 ;
1511
1512 +    if (reiserfs_transaction_running(inode->i_sb)) {
1513 +        BUG();
1514 +    }
1515 +
1516 +    if (!PageLocked(page))
1517 +        BUG();
1518 +
1519      if (!page->buffers) {
1520          block_prepare_write(page, 0, 0, NULL) ;
1521         kunmap(page) ;
1522      }
1523 +
1524 +    if (reiserfs_transaction_running(inode->i_sb)) {
1525 +        BUG();
1526 +    }
1527      /* last page in the file, zero out any contents past the
1528      ** last byte in the file
1529      */
1530      if (page->index >= end_index) {
1531 -        last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ;
1532 +        char *p ;
1533 +        last_offset = size & (PAGE_CACHE_SIZE - 1) ;
1534         /* no file contents in this page */
1535         if (page->index >= end_index + 1 || !last_offset) {
1536             error =  -EIO ;
1537             goto fail ;
1538         }
1539 -       memset((char *)kmap(page)+last_offset, 0, PAGE_CACHE_SIZE-last_offset) ;
1540 +       p = kmap(page);
1541 +       if (last_offset > PAGE_CACHE_SIZE)
1542 +           BUG();
1543 +       memset(p + last_offset, 0, PAGE_CACHE_SIZE-last_offset) ;
1544         flush_dcache_page(page) ;
1545         kunmap(page) ;
1546      }
1547 @@ -2079,7 +2309,7 @@
1548      ** nr == 0 without there being any kind of error.
1549      */
1550      if (nr) {
1551 -        submit_bh_for_writepage(arr, nr) ;
1552 +        submit_bh_for_writepage(page, arr, nr) ;
1553         wakeup_page_waiters(page);
1554      } else {
1555          UnlockPage(page) ;
1556 @@ -2091,7 +2321,7 @@
1557
1558  fail:
1559      if (nr) {
1560 -        submit_bh_for_writepage(arr, nr) ;
1561 +        submit_bh_for_writepage(page, arr, nr) ;
1562      } else {
1563          UnlockPage(page) ;
1564      }
1565 @@ -2116,10 +2346,46 @@
1566
1567  int reiserfs_prepare_write(struct file *f, struct page *page,
1568                            unsigned from, unsigned to) {
1569 +    int cur_refcount = 0 ;
1570 +    int ret ;
1571      struct inode *inode = page->mapping->host ;
1572 +    struct reiserfs_transaction_handle *th ;
1573 +
1574      reiserfs_wait_on_write_block(inode->i_sb) ;
1575      fix_tail_page_for_writing(page) ;
1576 -    return block_prepare_write(page, from, to, reiserfs_get_block) ;
1577 +
1578 +    /* we look for a running transaction before the block_prepare_write
1579 +    ** call, and then again afterwards.  This lets us know if
1580 +    ** reiserfs_get_block added any additional transactions, so we can
1581 +    ** let reiserfs_commit_write know if he needs to close them.
1582 +    ** this is just nasty
1583 +    */
1584 +    if (reiserfs_transaction_running(inode->i_sb)) {
1585 +       th = current->journal_info ;
1586 +       cur_refcount = th->t_refcount ;
1587 +    }
1588 +    ret =  block_prepare_write(page, from, to, reiserfs_get_block) ;
1589 +
1590 +    /* it is very important that we only set the dangling bit when
1591 +    ** there is no chance of additional nested transactions.
1592 +    */
1593 +    if (reiserfs_transaction_running(inode->i_sb)) {
1594 +        th = current->journal_info ;
1595 +       if (th->t_refcount > cur_refcount) {
1596 +           /* if we return an error, commit_write isn't going to get called
1597 +            * we need to make sure we end any transactions
1598 +            * reiserfs_get_block left hanging around
1599 +            */
1600 +           if (ret) {
1601 +               lock_kernel();
1602 +               journal_end(th, th->t_super, th->t_blocks_allocated) ;
1603 +               unlock_kernel();
1604 +           } else {
1605 +               reiserfs_set_handle_dangling(th) ;
1606 +           }
1607 +       }
1608 +    }
1609 +    return ret ;
1610  }
1611
1612
1613 @@ -2127,20 +2393,96 @@
1614    return generic_block_bmap(as, block, reiserfs_bmap) ;
1615  }
1616
1617 +/* taken from fs/buffer.c */
1618 +static int __commit_write(struct reiserfs_transaction_handle *th,
1619 +                          struct inode *inode, struct page *page,
1620 +                         unsigned from, unsigned to, int *balance)
1621 +{
1622 +    unsigned block_start, block_end;
1623 +    int partial = 0;
1624 +    unsigned blocksize;
1625 +    struct buffer_head *bh, *head;
1626 +    int logbh = 0 ;
1627 +
1628 +    blocksize = 1 << inode->i_blkbits;
1629 +    if (reiserfs_file_data_log(inode)) {
1630 +        logbh = 1 ;
1631 +       lock_kernel() ;
1632 +       /* one for each block + the stat data, the caller closes the handle */
1633 +       journal_begin(th, inode->i_sb,(PAGE_CACHE_SIZE >> inode->i_blkbits)+1);
1634 +       reiserfs_update_inode_transaction(inode) ;
1635 +       unlock_kernel() ;
1636 +    }
1637 +
1638 +    for(bh = head = page->buffers, block_start = 0;
1639 +        bh != head || !block_start;
1640 +        block_start=block_end, bh = bh->b_this_page) {
1641 +       block_end = block_start + blocksize;
1642 +       if (block_end <= from || block_start >= to) {
1643 +           if (!buffer_uptodate(bh))
1644 +                   partial = 1;
1645 +       } else {
1646 +           set_bit(BH_Uptodate, &bh->b_state);
1647 +           if (logbh) {
1648 +               lock_kernel() ;
1649 +               reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
1650 +               journal_mark_dirty (th, inode->i_sb, bh);
1651 +               unlock_kernel() ;
1652 +           } else if (!atomic_set_buffer_dirty(bh)) {
1653 +               __mark_dirty(bh);
1654 +               if (reiserfs_data_ordered(inode->i_sb)) {
1655 +                   lock_kernel();
1656 +                   add_to_flushlist(inode, bh);
1657 +                   /* if we don't update the inode trans information,
1658 +                    * an fsync(fd) might not catch these data blocks
1659 +                    */
1660 +                   reiserfs_update_inode_transaction(inode);
1661 +                   unlock_kernel();
1662 +               } else {
1663 +                   buffer_insert_inode_data_queue(bh, inode);
1664 +               }
1665 +               *balance = 1;
1666 +           }
1667 +       }
1668 +    }
1669 +
1670 +    /*
1671 +     * is this a partial write that happened to make all buffers
1672 +     * uptodate then we can optimize away a bogus readpage() for
1673 +     * the next read(). Here we 'discover' whether the page went
1674 +     * uptodate as a result of this (potentially partial) write.
1675 +     */
1676 +    if (!partial)
1677 +       SetPageUptodate(page);
1678 +    return 0;
1679 +}
1680 +
1681  static int reiserfs_commit_write(struct file *f, struct page *page,
1682                                   unsigned from, unsigned to) {
1683      struct inode *inode = page->mapping->host ;
1684      loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1685      int ret ;
1686 -
1687 +    int need_balance = 0;
1688 +    struct reiserfs_transaction_handle th ;
1689 +    struct reiserfs_transaction_handle *dth = NULL ;
1690 +
1691 +    /* we must do this before anything that might nest a transaction or
1692 +    ** mess with the handle flags
1693 +    */
1694 +    if (reiserfs_transaction_running(inode->i_sb)) {
1695 +       dth = current->journal_info ;
1696 +       if (reiserfs_dangling_handle(dth)) {
1697 +           reiserfs_clear_handle_dangling(dth) ;
1698 +       } else {
1699 +           dth = NULL ;
1700 +       }
1701 +    }
1702      reiserfs_wait_on_write_block(inode->i_sb) ;
1703 +
1704 +    th.t_flags = 0 ;
1705 +    ret = __commit_write(&th, inode, page, from, to, &need_balance) ;
1706
1707 -    /* generic_commit_write does this for us, but does not update the
1708 -    ** transaction tracking stuff when the size changes.  So, we have
1709 -    ** to do the i_size updates here.
1710 -    */
1711      if (pos > inode->i_size) {
1712 -       struct reiserfs_transaction_handle th ;
1713         lock_kernel();
1714         /* If the file have grown beyond the border where it
1715            can have a tail, unmark it as needing a tail
1716 @@ -2149,24 +2491,135 @@
1717              (have_small_tails (inode->i_sb) && inode->i_size > block_size(inode)) )
1718             inode->u.reiserfs_i.i_flags &= ~i_pack_on_close_mask;
1719
1720 -       journal_begin(&th, inode->i_sb, 1) ;
1721 +       if (!reiserfs_active_handle(&th)) {
1722 +           journal_begin(&th, inode->i_sb, 1) ;
1723 +       }
1724         reiserfs_update_inode_transaction(inode) ;
1725         inode->i_size = pos ;
1726         reiserfs_update_sd(&th, inode) ;
1727 -       journal_end(&th, inode->i_sb, 1) ;
1728 -       unlock_kernel();
1729 +       journal_end(&th, th.t_super, th.t_blocks_allocated) ;
1730 +       unlock_kernel() ;
1731 +    } else if (reiserfs_active_handle(&th)) {
1732 +       /* in case commit_write left one running and the i_size update did
1733 +       ** not close it
1734 +       */
1735 +       lock_kernel() ;
1736 +       journal_end(&th, th.t_super, th.t_blocks_allocated) ;
1737 +       unlock_kernel() ;
1738      }
1739 -
1740 -    ret = generic_commit_write(f, page, from, to) ;
1741
1742 -    /* we test for O_SYNC here so we can commit the transaction
1743 -    ** for any packed tails the file might have had
1744 +    /* did reiserfs_get_block leave us with a running transaction?
1745      */
1746 -    if (f && (f->f_flags & O_SYNC)) {
1747 +    if (dth) {
1748         lock_kernel() ;
1749 -       reiserfs_commit_for_inode(inode) ;
1750 +       journal_end(dth, dth->t_super, dth->t_blocks_allocated) ;
1751         unlock_kernel();
1752      }
1753 +
1754 +    kunmap(page) ;
1755 +
1756 +    if (need_balance)
1757 +       balance_dirty();
1758 +
1759 +    return ret ;
1760 +}
1761 +
1762 +/* decide if this buffer needs to stay around for data logging or ordered
1763 +** write purposes.  returns 1 if the buffer may be discarded, 0 to keep it
1764 +*/
1765 +static int flushpage_can_drop(struct inode *inode, struct buffer_head *bh) {
1766 +    int ret = 1 ;
1767 +
1768 +    if (!buffer_mapped(bh)) {
1769 +        return 1 ;
1770 +    }
1771 +    if (reiserfs_file_data_log(inode)) {
1772 +       lock_kernel() ;
1773 +       /* very conservative, leave the buffer pinned if anyone might need it.
1774 +       ** this should be changed to drop the buffer if it is only in the
1775 +       ** current transaction
1776 +       */
1777 +        if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
1778 +           ret = 0 ;
1779 +       }
1780 +       unlock_kernel() ;
1781 +    }
1782 +    if (reiserfs_data_ordered(inode->i_sb)) {
1783 +        if (buffer_dirty(bh) && bh->b_journal_head) {
1784 +           struct reiserfs_journal_list *jl = NULL;
1785 +           lock_kernel();
1786 +
1787 +           /* we can race against fsync_inode_buffers if we aren't careful */
1788 +           if (buffer_attached(bh) && buffer_dirty(bh))
1789 +               jl = bh->b_journal_head;
1790 +
1791 +           /* why is this safe?
1792 +            * reiserfs_setattr updates i_size in the on disk
1793 +            * stat data before allowing vmtruncate to be called.
1794 +            *
1795 +            * If buffer was put onto the ordered list for this
1796 +            * transaction, we know for sure either this transaction
1797 +            * or an older one already has updated i_size on disk,
1798 +            * and this ordered data won't be referenced in the file
1799 +            * if we crash.
1800 +            *
1801 +            * if the buffer was put onto the ordered list for an older
1802 +            * transaction, we need to leave it around
1803 +            */
1804 +           if (jl != SB_JOURNAL(inode->i_sb)->j_current_jl) { /* jl is still NULL here if the recheck under the kernel lock failed */
1805 +               ret = 0;
1806 +           }
1807 +           unlock_kernel();
1808 +       }
1809 +    }
1810 +    return ret ;
1811 +}
1812 +
1813 +/* stolen from fs/buffer.c:discard_bh_page */
1814 +static int reiserfs_flushpage(struct page *page, unsigned long offset) {
1815 +    struct buffer_head *head, *bh, *next;
1816 +    struct inode *inode = page->mapping->host ;
1817 +    unsigned int curr_off = 0;
1818 +    int ret = 1;
1819 +
1820 +    if (!PageLocked(page))
1821 +       BUG();
1822 +    if (!page->buffers)
1823 +       return 1;
1824 +
1825 +    head = page->buffers;
1826 +    bh = head;
1827 +    do {
1828 +       unsigned int next_off = curr_off + bh->b_size;
1829 +       next = bh->b_this_page;
1830 +
1831 +       /* is this buffer to be completely truncated away? */
1832 +       if (offset <= curr_off) {
1833 +            if (flushpage_can_drop(inode, bh))
1834 +               discard_buffer(bh);
1835 +           else
1836 +               ret = 0 ;
1837 +       }
1838 +       curr_off = next_off;
1839 +       bh = next;
1840 +    } while (bh != head);
1841 +
1842 +    /*
1843 +     * subtle. We release buffer-heads only if this is
1844 +     * the 'final' flushpage. We have invalidated the get_block
1845 +     * cached value unconditionally, so real IO is not
1846 +     * possible anymore.
1847 +     *
1848 +     * If the free doesn't work out, the buffers can be
1849 +     * left around - they just turn into anonymous buffers
1850 +     * instead.
1851 +     */
1852 +    if (!offset) {
1853 +       if (!ret || !try_to_free_buffers(page, 0))
1854 +           return 0;
1855 +        if (page->buffers)
1856 +           BUG();
1857 +    }
1858      return ret ;
1859  }
1860
1861 @@ -2222,6 +2675,9 @@
1862                                struct kiobuf *iobuf, unsigned long blocknr,
1863                               int blocksize)
1864  {
1865 +    if (reiserfs_data_ordered(inode->i_sb) || reiserfs_file_data_log(inode)) {
1866 +       return -EINVAL;
1867 +    }
1868      lock_kernel();
1869      reiserfs_commit_for_tail(inode);
1870      unlock_kernel();
1871 @@ -2237,4 +2693,5 @@
1872      commit_write: reiserfs_commit_write,
1873      bmap: reiserfs_aop_bmap,
1874      direct_IO: reiserfs_direct_io,
1875 +    flushpage: reiserfs_flushpage,
1876  } ;
1877 diff -urN linux-2.4.22.org/fs/reiserfs/ioctl.c linux-2.4.22/fs/reiserfs/ioctl.c
1878 --- linux-2.4.22.org/fs/reiserfs/ioctl.c        2003-11-21 15:08:29.000000000 +0100
1879 +++ linux-2.4.22/fs/reiserfs/ioctl.c    2003-11-21 15:14:23.000000000 +0100
1880 @@ -25,12 +25,21 @@
1881         switch (cmd) {
1882             case REISERFS_IOC_UNPACK:
1883                 if( S_ISREG( inode -> i_mode ) ) {
1884 -               if (arg)
1885 -                   return reiserfs_unpack (inode, filp);
1886 -                       else
1887 -                               return 0;
1888 +                   if (arg) {
1889 +                       int result = reiserfs_unpack (inode, filp);
1890 +                       if (reiserfs_file_data_log(inode)) { /* data-logged file: run a synchronous transaction after the unpack */
1891 +                           struct reiserfs_transaction_handle th;
1892 +                           lock_kernel();
1893 +                           journal_begin(&th, inode->i_sb, 1);
1894 +                           SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
1895 +                           journal_end_sync(&th, inode->i_sb, 1);
1896 +                           unlock_kernel();
1897 +                       }
1898 +                       return result; /* hand back the unpack status; do not fall through into the next ioctl case */
1899 +                   } else
1900 +                       return 0;
1901                 } else
1902 -                       return -ENOTTY;
1903 +                   return -ENOTTY;
1904         /*
1905          * Following {G,S}ETFLAGS, and {G,S}ETVERSION are providing ext2
1906          * binary compatible interface (used by lsattr(1), and chattr(1)) and
1907 @@ -97,6 +106,7 @@
1908      int retval = 0;
1909      int index ;
1910      struct page *page ;
1911 +    struct address_space *mapping ;
1912      unsigned long write_from ;
1913      unsigned long blocksize = inode->i_sb->s_blocksize ;
1914
1915 @@ -127,19 +137,20 @@
1916      ** reiserfs_get_block to unpack the tail for us.
1917      */
1918      index = inode->i_size >> PAGE_CACHE_SHIFT ;
1919 -    page = grab_cache_page(inode->i_mapping, index) ;
1920 +    mapping = inode->i_mapping ;
1921 +    page = grab_cache_page(mapping, index) ;
1922      retval = -ENOMEM;
1923      if (!page) {
1924          goto out ;
1925      }
1926 -    retval = reiserfs_prepare_write(NULL, page, write_from, blocksize) ;
1927 +    retval = mapping->a_ops->prepare_write(NULL, page, write_from, write_from) ;
1928      if (retval)
1929          goto out_unlock ;
1930
1931      /* conversion can change page contents, must flush */
1932      flush_dcache_page(page) ;
1933      inode->u.reiserfs_i.i_flags |= i_nopack_mask;
1934 -    kunmap(page) ; /* mapped by prepare_write */
1935 +    retval = mapping->a_ops->commit_write(NULL, page, write_from, write_from) ;
1936
1937  out_unlock:
1938      UnlockPage(page) ;
1939 diff -urN linux-2.4.22.org/fs/reiserfs/journal.c linux-2.4.22/fs/reiserfs/journal.c
1940 --- linux-2.4.22.org/fs/reiserfs/journal.c      2003-11-21 15:08:29.000000000 +0100
1941 +++ linux-2.4.22/fs/reiserfs/journal.c  2003-11-21 15:14:23.000000000 +0100
1942 @@ -33,17 +33,17 @@
1943  **                  -- Note, if you call this as an immediate flush from
1944  **                     from within kupdate, it will ignore the immediate flag
1945  **
1946 -** The commit thread -- a writer process for async commits.  It allows a
1947 -**                      a process to request a log flush on a task queue.
1948 -**                      the commit will happen once the commit thread wakes up.
1949 -**                      The benefit here is the writer (with whatever
1950 -**                      related locks it has) doesn't have to wait for the
1951 -**                      log blocks to hit disk if it doesn't want to.
1952 +** The commit thread -- a writer process  for metadata and async commits.
1953 +**                     this allows us to do less io with the journal lock
1954 +**                     held.
1955  */
1956
1957 +#define EXPORT_SYMTAB
1958 +#include <linux/module.h>
1959  #include <linux/config.h>
1960  #include <asm/uaccess.h>
1961  #include <asm/system.h>
1962 +#include <linux/init.h>
1963
1964  #include <linux/sched.h>
1965  #include <asm/semaphore.h>
1966 @@ -59,17 +59,25 @@
1967  #include <linux/string.h>
1968  #include <linux/smp_lock.h>
1969
1970 +/* gets a struct reiserfs_journal_list * from a list head */
1971 +#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
1972 +                               j_list))
1973 +#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
1974 +                               j_working_list))
1975 +
1976  /* the number of mounted filesystems.  This is used to decide when to
1977  ** start and kill the commit thread
1978  */
1979  static int reiserfs_mounted_fs_count = 0 ;
1980
1981 -/* wake this up when you add something to the commit thread task queue */
1982 +static struct list_head kreiserfsd_supers = LIST_HEAD_INIT(kreiserfsd_supers);
1983 +
1984 +/* wake this up when you want help from the commit thread */
1985  DECLARE_WAIT_QUEUE_HEAD(reiserfs_commit_thread_wait) ;
1986
1987 -/* wait on this if you need to be sure you task queue entries have been run */
1988 +/* so we can wait for the commit thread to make progress */
1989  static DECLARE_WAIT_QUEUE_HEAD(reiserfs_commit_thread_done) ;
1990 -DECLARE_TASK_QUEUE(reiserfs_commit_thread_tq) ;
1991 +DECLARE_MUTEX(kreiserfsd_sem) ;
1992
1993  #define JOURNAL_TRANS_HALF 1018   /* must be correct to keep the desc and commit
1994                                      structs at 4k */
1995 @@ -82,6 +90,9 @@
1996
1997  #define BLOCK_NEEDS_FLUSH 4    /* used in flush_journal_list */
1998
1999 +/* journal list state bits */
2000 +#define LIST_TOUCHED 1
2001 +
2002  /* flags for do_journal_end */
2003  #define FLUSH_ALL   1          /* flush commit and real blocks */
2004  #define COMMIT_NOW  2          /* end and commit this transaction */
2005 @@ -89,6 +100,9 @@
2006
2007  /* state bits for the journal */
2008  #define WRITERS_BLOCKED 1      /* set when new writers not allowed */
2009 +#define WRITERS_QUEUED 2       /* set when log is full due to too many
2010 +                                *  writers
2011 +                               */
2012
2013  static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ;
2014  static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ;
2015 @@ -107,7 +121,7 @@
2016  ** make schedule happen after I've freed a block.  Look at remove_from_transaction and journal_mark_freed for
2017  ** more details.
2018  */
2019 -static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) {
2020 +static inline int reiserfs_clean_and_file_buffer(struct buffer_head *bh) {
2021    if (bh) {
2022      clear_bit(BH_Dirty, &bh->b_state) ;
2023      refile_buffer(bh) ;
2024 @@ -473,6 +487,8 @@
2025  int pop_journal_writer(int index) {
2026  #ifdef CONFIG_REISERFS_CHECK
2027    if (index >= 0) {
2028 +    if (index >= 512)
2029 +        BUG();
2030      journal_writers[index] = NULL ;
2031    }
2032  #endif
2033 @@ -522,6 +538,12 @@
2034      return 0 ;
2035    }
2036
2037 +  /* when data logging is on, no special action is needed for the data
2038 +   * blocks
2039 +   */
2040 +  if (reiserfs_data_log(p_s_sb))
2041 +      search_all = 0;
2042 +
2043    PROC_INFO_INC( p_s_sb, journal.in_journal );
2044    /* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
2045    ** if we crash before the transaction that freed it commits,  this transaction won't
2046 @@ -549,6 +571,7 @@
2047
2048    /* is it in the current transaction.  This should never happen */
2049    if ((cn = get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_hash_table, dev,bl,size))) {
2050 +    BUG();
2051      return 1;
2052    }
2053
2054 @@ -574,17 +597,12 @@
2055  /* lock the current transaction */
2056  inline static void lock_journal(struct super_block *p_s_sb) {
2057    PROC_INFO_INC( p_s_sb, journal.lock_journal );
2058 -  while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) {
2059 -    PROC_INFO_INC( p_s_sb, journal.lock_journal_wait );
2060 -    sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
2061 -  }
2062 -  atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 1) ;
2063 +  down(&SB_JOURNAL(p_s_sb)->j_lock);
2064  }
2065
2066  /* unlock the current transaction */
2067  inline static void unlock_journal(struct super_block *p_s_sb) {
2068 -  atomic_dec(&(SB_JOURNAL(p_s_sb)->j_wlock)) ;
2069 -  wake_up(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
2070 +  up(&SB_JOURNAL(p_s_sb)->j_lock);
2071  }
2072
2073  /*
2074 @@ -602,6 +620,83 @@
2075    jl->j_list_bitmap = NULL ;
2076  }
2077
2078 +static int journal_list_still_alive(struct super_block *s,
2079 +                                    unsigned long trans_id)
2080 +{ /* returns 1 if the first entry on j_journal_list has j_trans_id <= trans_id, else 0 */
2081 +    struct list_head *entry = &SB_JOURNAL(s)->j_journal_list; /* the list head itself, not a journal list */
2082 +    struct reiserfs_journal_list *jl;
2083 +
2084 +    if (!list_empty(entry)) {
2085 +        jl = JOURNAL_LIST_ENTRY(entry->next); /* first list after the head -- presumably the oldest; TODO confirm ordering */
2086 +       if (jl->j_trans_id <= trans_id) {
2087 +           return 1;
2088 +       }
2089 +    }
2090 +    return 0; /* list is empty, or its first entry has a larger trans id than trans_id */
2091 +}
2092 +
2093 +static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) {
2094 +    struct reiserfs_journal_list *other_jl;
2095 +    struct reiserfs_journal_list *first_jl;
2096 +    struct list_head *entry;
2097 +    unsigned long trans_id = jl->j_trans_id;
2098 +    unsigned long other_trans_id;
2099 +    unsigned long first_trans_id;
2100 +
2101 +find_first:
2102 +    /*
2103 +     * first we walk backwards to find the oldest uncommitted transaction
2104 +     */
2105 +    first_jl = jl;
2106 +    entry = jl->j_list.prev;
2107 +    while(1) {
2108 +       other_jl = JOURNAL_LIST_ENTRY(entry);
2109 +       if (entry == &SB_JOURNAL(s)->j_journal_list ||
2110 +           atomic_read(&other_jl->j_older_commits_done))
2111 +           break;
2112 +
2113 +        first_jl = other_jl;
2114 +       entry = other_jl->j_list.prev;
2115 +    }
2116 +
2117 +    /* if we didn't find any older uncommitted transactions, return now */
2118 +    if (first_jl == jl) {
2119 +        return 0;
2120 +    }
2121 +
2122 +    first_trans_id = first_jl->j_trans_id;
2123 +
2124 +    entry = &first_jl->j_list;
2125 +    while(1) {
2126 +       other_jl = JOURNAL_LIST_ENTRY(entry);
2127 +       other_trans_id = other_jl->j_trans_id;
2128 +
2129 +       if (other_trans_id < trans_id) {
2130 +           if (atomic_read(&other_jl->j_commit_left) != 0) {
2131 +               flush_commit_list(s, other_jl, 0);
2132 +
2133 +               /* list we were called with is gone, return */
2134 +               if (!journal_list_still_alive(s, trans_id))
2135 +                   return 1;
2136 +
2137 +               /* the one we just flushed is gone, this means all
2138 +                * older lists are also gone, so first_jl is no longer
2139 +                * valid either.  Go back to the beginning.
2140 +                */
2141 +               if (!journal_list_still_alive(s, other_trans_id)) {
2142 +                   goto find_first;
2143 +               }
2144 +           }
2145 +           entry = entry->next;
2146 +           if (entry == &SB_JOURNAL(s)->j_journal_list)
2147 +               return 0;
2148 +       } else {
2149 +           return 0;
2150 +       }
2151 +    }
2152 +    return 0;
2153 +}
2154 +
2155  /*
2156  ** if this journal list still has commit blocks unflushed, send them to disk.
2157  **
2158 @@ -611,16 +706,19 @@
2159  */
2160  static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) {
2161    int i, count ;
2162 -  int index = 0 ;
2163    int bn ;
2164    int retry_count = 0 ;
2165    int orig_commit_left = 0 ;
2166    struct buffer_head *tbh = NULL ;
2167 -  struct reiserfs_journal_list *other_jl ;
2168 +  unsigned long trans_id = jl->j_trans_id;
2169
2170    reiserfs_check_lock_depth("flush_commit_list") ;
2171
2172    if (atomic_read(&jl->j_older_commits_done)) {
2173 +    if (!list_empty(&jl->j_ordered_bh_list))
2174 +        BUG();
2175 +    if (!list_empty(&jl->j_tail_bh_list))
2176 +        BUG();
2177      return 0 ;
2178    }
2179
2180 @@ -628,50 +726,51 @@
2181    ** us is on disk too
2182    */
2183    if (jl->j_len <= 0) {
2184 +    BUG();
2185      return 0 ;
2186    }
2187 +  if (trans_id == SB_JOURNAL(s)->j_trans_id)
2188 +      BUG();
2189 +
2190    if (flushall) {
2191 -    /* we _must_ make sure the transactions are committed in order.  Start with the
2192 -    ** index after this one, wrap all the way around
2193 -    */
2194 -    index = (jl - SB_JOURNAL_LIST(s)) + 1 ;
2195 -    for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
2196 -      other_jl = SB_JOURNAL_LIST(s) + ( (index + i) % JOURNAL_LIST_COUNT) ;
2197 -      if (other_jl && other_jl != jl && other_jl->j_len > 0 && other_jl->j_trans_id > 0 &&
2198 -          other_jl->j_trans_id <= jl->j_trans_id && (atomic_read(&(jl->j_older_commits_done)) == 0)) {
2199 -        flush_commit_list(s, other_jl, 0) ;
2200 -      }
2201 +    if (flush_older_commits(s, jl) == 1) {
2202 +        /* list disappeared during flush_older_commits.  return */
2203 +        return 0;
2204      }
2205    }
2206
2207    count = 0 ;
2208 -  /* don't flush the commit list for the current transactoin */
2209 -  if (jl == ((SB_JOURNAL_LIST(s) + SB_JOURNAL_LIST_INDEX(s)))) {
2210 -    return 0 ;
2211 -  }
2212
2213    /* make sure nobody is trying to flush this one at the same time */
2214 -  if (atomic_read(&(jl->j_commit_flushing))) {
2215 -    sleep_on(&(jl->j_commit_wait)) ;
2216 -    if (flushall) {
2217 -      atomic_set(&(jl->j_older_commits_done), 1) ;
2218 -    }
2219 -    return 0 ;
2220 +  down(&jl->j_commit_lock);
2221 +  if (!journal_list_still_alive(s, trans_id)) {
2222 +      up(&jl->j_commit_lock);
2223 +      return 0;
2224    }
2225 +  if (jl->j_trans_id == 0)
2226 +      BUG();
2227
2228    /* this commit is done, exit */
2229    if (atomic_read(&(jl->j_commit_left)) <= 0) {
2230      if (flushall) {
2231        atomic_set(&(jl->j_older_commits_done), 1) ;
2232      }
2233 +    if (!list_empty(&jl->j_ordered_bh_list))
2234 +        BUG();
2235 +    if (!list_empty(&jl->j_tail_bh_list))
2236 +        BUG();
2237 +    up(&jl->j_commit_lock);
2238      return 0 ;
2239    }
2240 -  /* keeps others from flushing while we are flushing */
2241 -  atomic_set(&(jl->j_commit_flushing), 1) ;
2242 -
2243
2244 +  /* write any buffers that must hit disk before the commit is done */
2245 +  while(!list_empty(&jl->j_ordered_bh_list)) {
2246 +      unlock_kernel();
2247 +      fsync_buffers_list(&jl->j_ordered_bh_list);
2248 +      lock_kernel();
2249 +  }
2250    if (jl->j_len > SB_JOURNAL_TRANS_MAX(s)) {
2251 -    reiserfs_panic(s, "journal-512: flush_commit_list: length is %lu, list number %d\n", jl->j_len, jl - SB_JOURNAL_LIST(s)) ;
2252 +    reiserfs_panic(s, "journal-512: flush_commit_list: length is %lu, trans_id %lu\n", jl->j_len, jl->j_trans_id) ;
2253      return 0 ;
2254    }
2255
2256 @@ -701,7 +800,7 @@
2257        if (buffer_dirty(tbh)) {
2258         reiserfs_warning(s, "journal-569: flush_commit_list, block already dirty!\n") ;
2259        } else {
2260 -       mark_buffer_dirty(tbh) ;
2261 +       atomic_set_buffer_dirty(tbh);
2262        }
2263        ll_rw_block(WRITE, 1, &tbh) ;
2264        count++ ;
2265 @@ -745,16 +844,22 @@
2266    atomic_dec(&(jl->j_commit_left)) ;
2267    bforget(jl->j_commit_bh) ;
2268
2269 +  if (SB_JOURNAL(s)->j_last_commit_id != 0 &&
2270 +     (jl->j_trans_id - SB_JOURNAL(s)->j_last_commit_id) != 1) { /* warn when this commit's trans id does not directly follow the last one */
2271 +      reiserfs_warning(s, "clm-2200: dev %s, last commit %lu, current %lu\n",
2272 +                       kdevname(s->s_dev), SB_JOURNAL(s)->j_last_commit_id,
2273 +                      jl->j_trans_id);
2274 +  }
2275 +  SB_JOURNAL(s)->j_last_commit_id = jl->j_trans_id;
2276 +
2277    /* now, every commit block is on the disk.  It is safe to allow blocks freed during this transaction to be reallocated */
2278    cleanup_freed_for_journal_list(s, jl) ;
2279
2280    if (flushall) {
2281      atomic_set(&(jl->j_older_commits_done), 1) ;
2282    }
2283 -  atomic_set(&(jl->j_commit_flushing), 0) ;
2284 -  wake_up(&(jl->j_commit_wait)) ;
2285 +  up(&jl->j_commit_lock);
2286
2287 -  s->s_dirt = 1 ;
2288    return 0 ;
2289  }
2290
2291 @@ -853,22 +958,27 @@
2292  ** flush any and all journal lists older than you are
2293  ** can only be called from flush_journal_list
2294  */
2295 -static int flush_older_journal_lists(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, unsigned long trans_id) {
2296 -  int i, index ;
2297 -  struct reiserfs_journal_list *other_jl ;
2298 -
2299 -  index = jl - SB_JOURNAL_LIST(p_s_sb) ;
2300 -  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
2301 -    other_jl = SB_JOURNAL_LIST(p_s_sb) + ((index + i) % JOURNAL_LIST_COUNT) ;
2302 -    if (other_jl && other_jl->j_len > 0 &&
2303 -        other_jl->j_trans_id > 0 &&
2304 -       other_jl->j_trans_id < trans_id &&
2305 -        other_jl != jl) {
2306 -      /* do not flush all */
2307 -      flush_journal_list(p_s_sb, other_jl, 0) ;
2308 +static int flush_older_journal_lists(struct super_block *p_s_sb,
2309 +                                     struct reiserfs_journal_list *jl)
2310 +{
2311 +    struct list_head *entry;
2312 +    struct reiserfs_journal_list *other_jl ;
2313 +    unsigned long trans_id = jl->j_trans_id;
2314 +
2315 +    /* we know we are the only ones flushing things, no extra race
2316 +     * protection is required.
2317 +     */
2318 +restart:
2319 +    entry = SB_JOURNAL(p_s_sb)->j_journal_list.next;
2320 +    other_jl = JOURNAL_LIST_ENTRY(entry);
2321 +    if (other_jl->j_trans_id < trans_id) {
2322 +       /* do not flush all */
2323 +       flush_journal_list(p_s_sb, other_jl, 0) ;
2324 +
2325 +       /* other_jl is now deleted from the list */
2326 +       goto restart;
2327      }
2328 -  }
2329 -  return 0 ;
2330 +    return 0 ;
2331  }
2332
2333  static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
2334 @@ -881,14 +991,23 @@
2335      put_bh(bh) ;
2336  }
2337  static void submit_logged_buffer(struct buffer_head *bh) {
2338 -    lock_buffer(bh) ;
2339      get_bh(bh) ;
2340      bh->b_end_io = reiserfs_end_buffer_io_sync ;
2341      mark_buffer_notjournal_new(bh) ;
2342      clear_bit(BH_Dirty, &bh->b_state) ;
2343 +    if (!buffer_uptodate(bh))
2344 +        BUG();
2345      submit_bh(WRITE, bh) ;
2346  }
2347
2348 +static void del_from_work_list(struct super_block *s,
2349 +                               struct reiserfs_journal_list *jl) {
2350 +    if (!list_empty(&jl->j_working_list)) {
2351 +        list_del_init(&jl->j_working_list);
2352 +       SB_JOURNAL(s)->j_num_work_lists--;
2353 +    }
2354 +}
2355 +
2356  /* flush a journal list, both commit and real blocks
2357  **
2358  ** always set flushall to 1, unless you are calling from inside
2359 @@ -909,29 +1028,27 @@
2360    unsigned long j_len_saved = jl->j_len ;
2361
2362    if (j_len_saved <= 0) {
2363 -    return 0 ;
2364 +    BUG();
2365    }
2366
2367    if (atomic_read(&SB_JOURNAL(s)->j_wcount) != 0) {
2368      reiserfs_warning(s, "clm-2048: flush_journal_list called with wcount %d\n",
2369                        atomic_read(&SB_JOURNAL(s)->j_wcount)) ;
2370    }
2371 -  /* if someone is getting the commit list, we must wait for them */
2372 -  while (atomic_read(&(jl->j_commit_flushing))) {
2373 -    sleep_on(&(jl->j_commit_wait)) ;
2374 -  }
2375 -  /* if someone is flushing this list, we must wait for them */
2376 -  while (atomic_read(&(jl->j_flushing))) {
2377 -    sleep_on(&(jl->j_flush_wait)) ;
2378 -  }
2379
2380 -  /* this list is now ours, we can change anything we want */
2381 -  atomic_set(&(jl->j_flushing), 1) ;
2382 +  if (jl->j_trans_id == 0)
2383 +      BUG();
2384 +
2385 +  /* if flushall == 0, the lock is already held */
2386 +  if (flushall) {
2387 +      down(&SB_JOURNAL(s)->j_flush_sem);
2388 +  } else if (!down_trylock(&SB_JOURNAL(s)->j_flush_sem)) {
2389 +      BUG();
2390 +  }
2391
2392    count = 0 ;
2393    if (j_len_saved > SB_JOURNAL_TRANS_MAX(s)) {
2394 -    reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, list number %d\n", j_len_saved, jl - SB_JOURNAL_LIST(s)) ;
2395 -    atomic_dec(&(jl->j_flushing)) ;
2396 +    reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, transid %lu\n", j_len_saved, jl->j_trans_id) ;
2397      return 0 ;
2398    }
2399
2400 @@ -981,13 +1098,13 @@
2401        get_bh(saved_bh) ;
2402
2403        if (buffer_journal_dirty(saved_bh)) {
2404 +       if (!can_dirty(cn))
2405 +           BUG();
2406          was_jwait = 1 ;
2407 -       mark_buffer_notjournal_dirty(saved_bh) ;
2408 -        /* undo the inc from journal_mark_dirty */
2409 -       put_bh(saved_bh) ;
2410 -      }
2411 -      if (can_dirty(cn)) {
2412 -        was_dirty = 1 ;
2413 +       was_dirty = 1;
2414 +      } else if (can_dirty(cn)) {
2415 +         /* everything with !pjl && jwait should be writable */
2416 +          BUG();
2417        }
2418      }
2419
2420 @@ -995,7 +1112,8 @@
2421      ** sure they are commited, and don't try writing it to disk
2422      */
2423      if (pjl) {
2424 -      flush_commit_list(s, pjl, 1) ;
2425 +      if (atomic_read(&pjl->j_commit_left))
2426 +         flush_commit_list(s, pjl, 1) ;
2427        goto free_cnode ;
2428      }
2429
2430 @@ -1029,7 +1147,12 @@
2431        /* we inc again because saved_bh gets decremented at free_cnode */
2432        get_bh(saved_bh) ;
2433        set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
2434 +      lock_buffer(saved_bh);
2435        submit_logged_buffer(saved_bh) ;
2436 +      if (cn->blocknr != saved_bh->b_blocknr) {
2437 +printk("cn %lu does not match bh %lu\n", cn->blocknr, saved_bh->b_blocknr);
2438 +      BUG();
2439 +      }
2440        count++ ;
2441      } else {
2442        reiserfs_warning(s, "clm-2082: Unable to flush buffer %lu in flush_journal_list\n",
2443 @@ -1057,9 +1180,23 @@
2444         if (!cn->bh) {
2445           reiserfs_panic(s, "journal-1012: cn->bh is NULL\n") ;
2446         }
2447 +        if (cn->blocknr != cn->bh->b_blocknr) {
2448 +printk("2cn %lu does not match bh %lu\n", cn->blocknr, cn->bh->b_blocknr);
2449 +           BUG();
2450 +        }
2451         if (!buffer_uptodate(cn->bh)) {
2452 -         reiserfs_panic(s, "journal-949: buffer write failed\n") ;
2453 +         reiserfs_panic(s, "journal-949: buffer %lu write failed\n", cn->bh->b_blocknr) ;
2454         }
2455 +
2456 +       /* note, we must clear the JDirty_wait bit after the up to date
2457 +       ** check, otherwise we race against our flushpage routine
2458 +       */
2459 +       if (!test_and_clear_bit(BH_JDirty_wait, &cn->bh->b_state))
2460 +           BUG();
2461 +
2462 +        /* undo the inc from journal_mark_dirty */
2463 +       put_bh(cn->bh) ;
2464 +
2465         refile_buffer(cn->bh) ;
2466          brelse(cn->bh) ;
2467        }
2468 @@ -1074,7 +1211,7 @@
2469    ** replayed after a crash
2470    */
2471    if (flushall) {
2472 -    flush_older_journal_lists(s, jl, jl->j_trans_id) ;
2473 +    flush_older_journal_lists(s, jl);
2474    }
2475
2476    /* before we can remove everything from the hash tables for this
2477 @@ -1089,46 +1226,137 @@
2478      update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ;
2479    }
2480    remove_all_from_journal_list(s, jl, 0) ;
2481 +  list_del(&jl->j_list);
2482 +  SB_JOURNAL(s)->j_num_lists--;
2483 +  del_from_work_list(s, jl);
2484 +
2485 +  if (SB_JOURNAL(s)->j_last_flush_id != 0 &&
2486 +     (jl->j_trans_id - SB_JOURNAL(s)->j_last_flush_id) != 1) {
2487 +      reiserfs_warning(s, "clm-2201: dev %s, last flush %lu, current %lu\n",
2488 +                       kdevname(s->s_dev), SB_JOURNAL(s)->j_last_flush_id,
2489 +                      jl->j_trans_id);
2490 +  }
2491 +  SB_JOURNAL(s)->j_last_flush_id = jl->j_trans_id;
2492 +
2493 +  /* not strictly required since we are freeing the list, but it should
2494 +   * help find code using dead lists later on
2495 +   */
2496    jl->j_len = 0 ;
2497    atomic_set(&(jl->j_nonzerolen), 0) ;
2498    jl->j_start = 0 ;
2499    jl->j_realblock = NULL ;
2500    jl->j_commit_bh = NULL ;
2501    jl->j_trans_id = 0 ;
2502 -  atomic_dec(&(jl->j_flushing)) ;
2503 -  wake_up(&(jl->j_flush_wait)) ;
2504 +  jl->j_state = 0;
2505 +
2506 +  if (!list_empty(&jl->j_ordered_bh_list))
2507 +      BUG();
2508 +
2509 +  if (!list_empty(&jl->j_tail_bh_list))
2510 +      BUG();
2511 +
2512 +  // kmem_cache_free(journal_list_cachep, jl);
2513 +  reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s);
2514 +
2515 +  if (flushall)
2516 +      up(&SB_JOURNAL(s)->j_flush_sem);
2517    return 0 ;
2518  }
2519
2520
2521 -static int kupdate_one_transaction(struct super_block *s,
2522 +#define CHUNK_SIZE 32
2523 +struct buffer_chunk {
2524 +    struct buffer_head *bh[CHUNK_SIZE];
2525 +    int nr;
2526 +};
2527 +
2528 +static void write_chunk(struct buffer_chunk *chunk) {
2529 +    int i;
2530 +    for (i = 0; i < chunk->nr ; i++) {
2531 +       submit_logged_buffer(chunk->bh[i]) ;
2532 +    }
2533 +    chunk->nr = 0;
2534 +}
2535 +
2536 +static void add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh) {
2537 +    if (chunk->nr >= CHUNK_SIZE)
2538 +        BUG();
2539 +    chunk->bh[chunk->nr++] = bh;
2540 +    if (chunk->nr >= CHUNK_SIZE)
2541 +        write_chunk(chunk);
2542 +}
2543 +
2544 +static int write_one_transaction(struct super_block *s,
2545 +                                 struct reiserfs_journal_list *jl,
2546 +                                struct buffer_chunk *chunk)
2547 +{
2548 +    struct reiserfs_journal_list *pjl ; /* previous list for this cn */
2549 +    struct reiserfs_journal_cnode *cn;
2550 +    int ret = 0 ;
2551 +
2552 +    jl->j_state |= LIST_TOUCHED;
2553 +    if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
2554 +       del_from_work_list(s, jl);
2555 +        return 0;
2556 +    }
2557 +    del_from_work_list(s, jl);
2558 +
2559 +    cn = jl->j_realblock ;
2560 +    while(cn) {
2561 +        /* if the blocknr == 0, this has been cleared from the hash,
2562 +        ** skip it
2563 +        */
2564 +        if (cn->blocknr == 0) {
2565 +            goto next ;
2566 +        }
2567 +        /* look for a more recent transaction that logged this
2568 +        ** buffer.  Only the most recent transaction with a buffer in
2569 +        ** it is allowed to send that buffer to disk
2570 +        */
2571 +        pjl = find_newer_jl_for_cn(cn) ;
2572 +        if (!pjl && cn->bh && buffer_journal_dirty(cn->bh) && can_dirty(cn)) {
2573 +            if (!test_bit(BH_JPrepared, &cn->bh->b_state)) {
2574 +               struct buffer_head *tmp_bh;
2575 +               /* we can race against journal_mark_freed when we try
2576 +                * to lock_buffer(cn->bh), so we have to inc the buffer
2577 +                * count, and recheck things after locking
2578 +                */
2579 +               tmp_bh = cn->bh;
2580 +               get_bh(tmp_bh);
2581 +                set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
2582 +               lock_buffer(tmp_bh);
2583 +               if (cn->bh && buffer_journal_dirty(tmp_bh) &&
2584 +                   !test_bit(BH_JPrepared, &tmp_bh->b_state))
2585 +               {
2586 +                   add_to_chunk(chunk, tmp_bh);
2587 +                   ret++;
2588 +               } else {
2589 +                   /* note, cn->bh might be null now */
2590 +                   unlock_buffer(tmp_bh);
2591 +               }
2592 +               put_bh(tmp_bh);
2593 +            }
2594 +        }
2595 +next:
2596 +        cn = cn->next ;
2597 +       if (current->need_resched)
2598 +           schedule();
2599 +    }
2600 +    return ret ;
2601 +}
2602 +
2603 +static int wait_one_transaction(struct super_block *s,
2604                                      struct reiserfs_journal_list *jl)
2605  {
2606      struct reiserfs_journal_list *pjl ; /* previous list for this cn */
2607      struct reiserfs_journal_cnode *cn, *walk_cn ;
2608      unsigned long blocknr ;
2609 -    int run = 0 ;
2610 -    int orig_trans_id = jl->j_trans_id ;
2611      struct buffer_head *saved_bh ;
2612      int ret = 0 ;
2613
2614 -    /* if someone is getting the commit list, we must wait for them */
2615 -    while (atomic_read(&(jl->j_commit_flushing))) {
2616 -        sleep_on(&(jl->j_commit_wait)) ;
2617 -    }
2618 -    /* if someone is flushing this list, we must wait for them */
2619 -    while (atomic_read(&(jl->j_flushing))) {
2620 -        sleep_on(&(jl->j_flush_wait)) ;
2621 -    }
2622 -    /* was it flushed while we slept? */
2623 -    if (jl->j_len <= 0 || jl->j_trans_id != orig_trans_id) {
2624 -        return 0 ;
2625 +    if (atomic_read(&jl->j_commit_left) != 0 || jl->j_len <= 0) {
2626 +        BUG();
2627      }
2628 -
2629 -    /* this list is now ours, we can change anything we want */
2630 -    atomic_set(&(jl->j_flushing), 1) ;
2631 -
2632 -loop_start:
2633      cn = jl->j_realblock ;
2634      while(cn) {
2635          saved_bh = NULL ;
2636 @@ -1143,27 +1371,14 @@
2637          ** it is allowed to send that buffer to disk
2638          */
2639          pjl = find_newer_jl_for_cn(cn) ;
2640 -        if (run == 0 && !pjl && cn->bh && buffer_journal_dirty(cn->bh) &&
2641 -            can_dirty(cn))
2642 -        {
2643 -            if (!test_bit(BH_JPrepared, &cn->bh->b_state)) {
2644 -                set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
2645 -               submit_logged_buffer(cn->bh) ;
2646 -            } else {
2647 -                /* someone else is using this buffer.  We can't
2648 -                ** send it to disk right now because they might
2649 -                ** be changing/logging it.
2650 -                */
2651 -                ret = 1 ;
2652 -            }
2653 -        } else if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
2654 +        if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
2655              clear_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
2656              if (!pjl && cn->bh) {
2657                  wait_on_buffer(cn->bh) ;
2658 -            }
2659 -            /* check again, someone could have logged while we scheduled */
2660 -            pjl = find_newer_jl_for_cn(cn) ;
2661 +               /* check again, someone could have logged while we scheduled */
2662 +               pjl = find_newer_jl_for_cn(cn) ;
2663
2664 +            }
2665              /* before the JDirty_wait bit is set, the
2666              ** buffer is added to the hash list.  So, if we are
2667              ** run in the middle of a do_journal_end, we will notice
2668 @@ -1210,60 +1425,182 @@
2669          }
2670  next:
2671          cn = cn->next ;
2672 +       if (current->need_resched)
2673 +           schedule();
2674      }
2675 -    /* the first run through the loop sends all the dirty buffers to
2676 -    ** ll_rw_block.
2677 -    ** the second run through the loop does all the accounting
2678 -    */
2679 -    if (run++ == 0) {
2680 -        goto loop_start ;
2681 +    return ret ;
2682 +}
2683 +
2684 +static int kupdate_transactions(struct super_block *s,
2685 +                                   struct reiserfs_journal_list *jl,
2686 +                                  struct reiserfs_journal_list **next_jl,
2687 +                                  unsigned long *next_trans_id,
2688 +                                  int num_blocks,
2689 +                                  int num_trans) {
2690 +    int ret = 0;
2691 +    int written = 0 ;
2692 +    int transactions_flushed = 0;
2693 +    unsigned long orig_trans_id = jl->j_trans_id;
2694 +    struct reiserfs_journal_list *orig_jl = jl;
2695 +    struct buffer_chunk chunk;
2696 +    struct list_head *entry;
2697 +    chunk.nr = 0;
2698 +
2699 +    down(&SB_JOURNAL(s)->j_flush_sem);
2700 +    if (!journal_list_still_alive(s, orig_trans_id)) {
2701 +       goto done;
2702      }
2703
2704 -    atomic_set(&(jl->j_flushing), 0) ;
2705 -    wake_up(&(jl->j_flush_wait)) ;
2706 -    return ret ;
2707 +    /* we've got j_flush_sem held, nobody is going to delete any
2708 +     * of these lists out from underneath us
2709 +     */
2710 +    while((num_trans && transactions_flushed < num_trans) ||
2711 +          (!num_trans && written < num_blocks)) {
2712 +
2713 +       if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
2714 +           atomic_read(&jl->j_commit_left))
2715 +       {
2716 +           del_from_work_list(s, jl);
2717 +           break;
2718 +       }
2719 +       ret = write_one_transaction(s, jl, &chunk);
2720 +
2721 +       if (ret < 0)
2722 +           goto done;
2723 +       transactions_flushed++;
2724 +       written += ret;
2725 +       entry = jl->j_list.next;
2726 +
2727 +       /* did we wrap? */
2728 +       if (entry == &SB_JOURNAL(s)->j_journal_list) {
2729 +           break;
2730 +        }
2731 +       jl = JOURNAL_LIST_ENTRY(entry);
2732 +
2733 +       /* don't bother with older transactions */
2734 +       if (jl->j_trans_id <= orig_trans_id)
2735 +           break;
2736 +    }
2737 +    if (chunk.nr) {
2738 +        write_chunk(&chunk);
2739 +    }
2740 +
2741 +    jl = orig_jl;
2742 +    *next_jl = jl;
2743 +    *next_trans_id = jl->j_trans_id;
2744 +    ret = transactions_flushed;
2745 +    while(transactions_flushed--) {
2746 +
2747 +       wait_one_transaction(s, jl);
2748 +       entry = jl->j_list.next;
2749 +       jl = JOURNAL_LIST_ENTRY(entry);
2750 +
2751 +       /* make sure we can really count */
2752 +       if (jl->j_trans_id <= orig_trans_id && transactions_flushed > 0) {
2753 +printk("flushing %s %lu, orig_trans_id was %lu\n", kdevname(s->s_dev), jl->j_trans_id, orig_trans_id);
2754 +           BUG();
2755 +        }
2756 +       *next_jl = jl;
2757 +       *next_trans_id = jl->j_trans_id;
2758 +    }
2759 +
2760 +done:
2761 +    up(&SB_JOURNAL(s)->j_flush_sem);
2762 +    return ret;
2763  }
2764 +
2765 +/* for o_sync and fsync heavy applications, they tend to use
2766 +** all the journal list slots with tiny transactions.  These
2767 +** trigger lots and lots of calls to update the header block, which
2768 +** adds seeks and slows things down.
2769 +**
2770 +** This function tries to clear out a large chunk of the journal lists
2771 +** at once, which makes everything faster since only the newest journal
2772 +** list updates the header block
2773 +*/
2774 +static int flush_used_journal_lists(struct super_block *s,
2775 +                                    struct reiserfs_journal_list *jl) {
2776 +    unsigned long len = 0;
2777 +    unsigned long cur_len;
2778 +    int ret;
2779 +    int i;
2780 +    struct reiserfs_journal_list *tjl;
2781 +    struct reiserfs_journal_list *flush_jl;
2782 +    unsigned long trans_id;
2783 +
2784 +    flush_jl = tjl = jl;
2785 +
2786 +    /* flush for 256 transactions or 256 blocks, whichever comes first */
2787 +    for(i = 0 ; i < 256 && len < 256 ; i++) {
2788 +       if (atomic_read(&tjl->j_commit_left) ||
2789 +           tjl->j_trans_id < jl->j_trans_id) {
2790 +           break;
2791 +       }
2792 +       cur_len = atomic_read(&tjl->j_nonzerolen);
2793 +       if (cur_len > 0) {
2794 +           tjl->j_state &= ~LIST_TOUCHED;
2795 +       }
2796 +       len += cur_len;
2797 +       flush_jl = tjl;
2798 +       if (tjl->j_list.next == &SB_JOURNAL(s)->j_journal_list)
2799 +           break;
2800 +       tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
2801 +    }
2802 +    /* try to find a group of blocks we can flush across all the
2803 +    ** transactions, but only bother if we've actually spanned
2804 +    ** across multiple lists
2805 +    */
2806 +    if (flush_jl != jl) {
2807 +       ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
2808 +    }
2809 +    flush_journal_list(s, flush_jl, 1) ;
2810 +    return 0;
2811 +}
2812 +
2813 +
2814  /* since we never give dirty buffers to bdflush/kupdate, we have to
2815  ** flush them ourselves.  This runs through the journal lists, finds
2816  ** old metadata in need of flushing and sends it to disk.
2817  ** this does not end transactions, commit anything, or free
2818  ** cnodes.
2819 -**
2820 -** returns the highest transaction id that was flushed last time
2821  */
2822  static unsigned long reiserfs_journal_kupdate(struct super_block *s) {
2823 -    struct reiserfs_journal_list *jl ;
2824 -    int i ;
2825 -    int start ;
2826 +    struct reiserfs_journal_list *jl, *next_jl;
2827 +    unsigned long trans_id, next_trans_id;
2828      time_t age ;
2829 -    int ret = 0 ;
2830
2831 -    start = SB_JOURNAL_LIST_INDEX(s) ;
2832 +    jl = JOURNAL_WORK_ENTRY(SB_JOURNAL(s)->j_working_list.next);
2833
2834 -    /* safety check to prevent flush attempts during a mount */
2835 -    if (start < 0) {
2836 +restart:
2837 +    /* kupdate transactions might not set next_trans_id, it must be
2838 +     * initialized before each call
2839 +     */
2840 +    next_trans_id = 0;
2841 +    if (list_empty(&SB_JOURNAL(s)->j_working_list)) {
2842          return 0 ;
2843      }
2844 -    i = (start + 1) % JOURNAL_LIST_COUNT ;
2845 -    while(i != start) {
2846 -        jl = SB_JOURNAL_LIST(s) + i  ;
2847 -        age = CURRENT_TIME - jl->j_timestamp ;
2848 -        if (jl->j_len > 0 && // age >= (JOURNAL_MAX_COMMIT_AGE * 2) &&
2849 -            atomic_read(&(jl->j_nonzerolen)) > 0 &&
2850 -            atomic_read(&(jl->j_commit_left)) == 0) {
2851 +    trans_id = jl->j_trans_id;
2852
2853 -            if (jl->j_trans_id == SB_JOURNAL(s)->j_trans_id) {
2854 -                break ;
2855 -            }
2856 -            /* if ret was already 1, we want to preserve that */
2857 -            ret |= kupdate_one_transaction(s, jl) ;
2858 -        }
2859 -        if (atomic_read(&(jl->j_nonzerolen)) > 0) {
2860 -            ret |= 1 ;
2861 -        }
2862 -        i = (i + 1) % JOURNAL_LIST_COUNT ;
2863 +    /* check for race with the code that frees lists */
2864 +    if (jl->j_trans_id == 0)
2865 +        BUG();
2866 +    age = CURRENT_TIME - jl->j_timestamp ;
2867 +    if (age >= SB_JOURNAL_MAX_COMMIT_AGE(s) &&
2868 +        atomic_read(&jl->j_nonzerolen) > 0 &&
2869 +       atomic_read(&jl->j_commit_left) == 0)
2870 +    {
2871 +        if (kupdate_transactions(s, jl, &next_jl, &next_trans_id, 32, 32) < 0)
2872 +           return 0;
2873 +       if (next_trans_id > trans_id &&
2874 +           next_jl != JOURNAL_WORK_ENTRY(&SB_JOURNAL(s)->j_working_list))
2875 +       {
2876 +           if (journal_list_still_alive(s, next_trans_id)) {
2877 +               jl = next_jl;
2878 +               goto restart;
2879 +           }
2880 +       }
2881      }
2882 -    return ret ;
2883 +    return 0;
2884  }
2885
2886  /*
2887 @@ -1307,6 +1644,12 @@
2888  }
2889
2890  static void free_journal_ram(struct super_block *p_s_sb) {
2891 +
2892 +  // kmem_cache_free(journal_list_cachep, SB_JOURNAL(p_s_sb)->j_current_jl);
2893 +  reiserfs_kfree(SB_JOURNAL(p_s_sb)->j_current_jl,
2894 +                 sizeof(struct reiserfs_journal_list), p_s_sb);
2895 +  SB_JOURNAL(p_s_sb)->j_num_lists--;
2896 +
2897    vfree(SB_JOURNAL(p_s_sb)->j_cnode_free_orig) ;
2898    free_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap) ;
2899    free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */
2900 @@ -1327,6 +1670,10 @@
2901  static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, int error) {
2902    struct reiserfs_transaction_handle myth ;
2903
2904 +  down(&kreiserfsd_sem);
2905 +  list_del(&p_s_sb->u.reiserfs_sb.s_reiserfs_supers);
2906 +  up(&kreiserfsd_sem);
2907 +
2908    /* we only want to flush out transactions if we were called with error == 0
2909    */
2910    if (!error && !(p_s_sb->s_flags & MS_RDONLY)) {
2911 @@ -1813,66 +2160,6 @@
2912    return 0 ;
2913  }
2914
2915 -
2916 -struct reiserfs_journal_commit_task {
2917 -  struct super_block *p_s_sb ;
2918 -  int jindex ;
2919 -  int wake_on_finish ; /* if this is one, we wake the task_done queue, if it
2920 -                       ** is zero, we free the whole struct on finish
2921 -                      */
2922 -  struct reiserfs_journal_commit_task *self ;
2923 -  struct wait_queue *task_done ;
2924 -  struct tq_struct task ;
2925 -} ;
2926 -
2927 -static void reiserfs_journal_commit_task_func(struct reiserfs_journal_commit_task *ct) {
2928 -
2929 -  struct reiserfs_journal_list *jl ;
2930 -  jl = SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex ;
2931 -
2932 -  flush_commit_list(ct->p_s_sb, SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex, 1) ;
2933 -
2934 -  if (jl->j_len > 0 && atomic_read(&(jl->j_nonzerolen)) > 0 &&
2935 -      atomic_read(&(jl->j_commit_left)) == 0) {
2936 -    kupdate_one_transaction(ct->p_s_sb, jl) ;
2937 -  }
2938 -  reiserfs_kfree(ct->self, sizeof(struct reiserfs_journal_commit_task), ct->p_s_sb) ;
2939 -}
2940 -
2941 -static void setup_commit_task_arg(struct reiserfs_journal_commit_task *ct,
2942 -                                  struct super_block *p_s_sb,
2943 -                                 int jindex) {
2944 -  if (!ct) {
2945 -    reiserfs_panic(NULL, "journal-1360: setup_commit_task_arg called with NULL struct\n") ;
2946 -  }
2947 -  ct->p_s_sb = p_s_sb ;
2948 -  ct->jindex = jindex ;
2949 -  ct->task_done = NULL ;
2950 -  INIT_LIST_HEAD(&ct->task.list) ;
2951 -  ct->task.sync = 0 ;
2952 -  ct->task.routine = (void *)(void *)reiserfs_journal_commit_task_func ;
2953 -  ct->self = ct ;
2954 -  ct->task.data = (void *)ct ;
2955 -}
2956 -
2957 -static void commit_flush_async(struct super_block *p_s_sb, int jindex) {
2958 -  struct reiserfs_journal_commit_task *ct ;
2959 -  /* using GFP_NOFS, GFP_KERNEL could try to flush inodes, which will try
2960 -  ** to start/join a transaction, which will deadlock
2961 -  */
2962 -  ct = reiserfs_kmalloc(sizeof(struct reiserfs_journal_commit_task), GFP_NOFS, p_s_sb) ;
2963 -  if (ct) {
2964 -    setup_commit_task_arg(ct, p_s_sb, jindex) ;
2965 -    queue_task(&(ct->task), &reiserfs_commit_thread_tq);
2966 -    wake_up(&reiserfs_commit_thread_wait) ;
2967 -  } else {
2968 -#ifdef CONFIG_REISERFS_CHECK
2969 -    reiserfs_warning(p_s_sb, "journal-1540: kmalloc failed, doing sync commit\n") ;
2970 -#endif
2971 -    flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ;
2972 -  }
2973 -}
2974 -
2975  /*
2976  ** this is the commit thread.  It is started with kernel_thread on
2977  ** FS mount, and journal_release() waits for it to exit.
2978 @@ -1885,6 +2172,9 @@
2979  ** then run the per filesystem commit task queue when we wakeup.
2980  */
2981  static int reiserfs_journal_commit_thread(void *nullp) {
2982 +  struct list_head *entry, *safe ;
2983 +  struct super_block *s;
2984 +  time_t last_run = 0;
2985
2986    daemonize() ;
2987
2988 @@ -1897,13 +2187,73 @@
2989    lock_kernel() ;
2990    while(1) {
2991
2992 -    while(TQ_ACTIVE(reiserfs_commit_thread_tq)) {
2993 -      run_task_queue(&reiserfs_commit_thread_tq) ;
2994 +restart:
2995 +    down(&kreiserfsd_sem);
2996 +    list_for_each_safe(entry, safe, &kreiserfsd_supers) {
2997 +       s = list_entry(entry, struct super_block,
2998 +                      u.reiserfs_sb.s_reiserfs_supers);
2999 +       if (!(s->s_flags & MS_RDONLY)) {
3000 +           flush_async_commits(s);
3001 +
3002 +           if (CURRENT_TIME - last_run > 5) {
3003 +               reiserfs_flush_old_commits(s);
3004 +           }
3005 +
3006 +           if (!list_empty(&SB_JOURNAL(s)->j_working_list)) {
3007 +               struct reiserfs_journal_list *jl, *tjl;
3008 +               unsigned long trans_id ;
3009 +               unsigned long start;
3010 +               unsigned long cur_start;
3011 +               unsigned long nfract = SB_ONDISK_JOURNAL_SIZE(s) / 4;
3012 +               int ret;
3013 +
3014 +               jl = JOURNAL_WORK_ENTRY(SB_JOURNAL(s)->j_working_list.next);
3015 +               cur_start = SB_JOURNAL(s)->j_start;
3016 +               start = jl->j_start;
3017 +
3018 +               /* pretend the log doesn't actually wrap */
3019 +               if (cur_start < start) {
3020 +                   cur_start = cur_start + SB_ONDISK_JOURNAL_SIZE(s);
3021 +               }
3022 +
3023 +               /* if the first transaction on the working list is more
3024 +                * than nfract blocks away from the current transaction start
3025 +                * or there are more than 32 working lists, start
3026 +                * a background flush
3027 +                */
3028 +               if (cur_start - start > nfract ||
3029 +                   SB_JOURNAL(s)->j_num_work_lists > 32) {
3030 +                   tjl=JOURNAL_LIST_ENTRY(SB_JOURNAL(s)->j_journal_list.next);
3031 +                   ret = kupdate_transactions(s, jl, &tjl, &trans_id,32,128);
3032 +               }
3033 +           }
3034 +       }
3035      }
3036 +    /* check again for new async commits that need tending */
3037 +    list_for_each_safe(entry, safe, &kreiserfsd_supers) {
3038 +       s = list_entry(entry, struct super_block,
3039 +                      u.reiserfs_sb.s_reiserfs_supers);
3040 +       if (!list_empty(&SB_JOURNAL(s)->j_journal_list)) {
3041 +           struct reiserfs_journal_list *jl;
3042 +           struct list_head *entry;
3043 +
3044 +           /* last entry is the youngest, commit it and you get everything */
3045 +           entry = SB_JOURNAL(s)->j_journal_list.prev;
3046 +           jl = JOURNAL_LIST_ENTRY(entry);
3047 +           if (!atomic_read(&(jl->j_older_commits_done))) {
3048 +               /* give new mounts a chance to come in */
3049 +               up(&kreiserfsd_sem);
3050 +               last_run = CURRENT_TIME;
3051 +               wake_up_all(&reiserfs_commit_thread_done) ;
3052 +               goto restart;
3053 +           }
3054 +       }
3055 +    }
3056 +    up(&kreiserfsd_sem);
3057 +    last_run = CURRENT_TIME;
3058
3059      /* if there aren't any more filesystems left, break */
3060      if (reiserfs_mounted_fs_count <= 0) {
3061 -      run_task_queue(&reiserfs_commit_thread_tq) ;
3062        break ;
3063      }
3064      wake_up(&reiserfs_commit_thread_done) ;
3065 @@ -1914,12 +2264,28 @@
3066    return 0 ;
3067  }
3068
3069 +static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
3070 +{
3071 +    struct reiserfs_journal_list *jl;
3072 +retry:
3073 +    // jl = (struct reiserfs_journal_list *)kmem_cache_alloc(journal_list_cachep, SLAB_NOFS);
3074 +    jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s);
3075 +    if (!jl) {
3076 +       yield();
3077 +       goto retry;
3078 +    }
3079 +    memset(jl, 0, sizeof(*jl));
3080 +    INIT_LIST_HEAD(&jl->j_list);
3081 +    INIT_LIST_HEAD(&jl->j_working_list);
3082 +    INIT_LIST_HEAD(&jl->j_ordered_bh_list);
3083 +    INIT_LIST_HEAD(&jl->j_tail_bh_list);
3084 +    sema_init(&jl->j_commit_lock, 1);
3085 +    SB_JOURNAL(s)->j_num_lists++;
3086 +    return jl;
3087 +}
3088 +
3089  static void journal_list_init(struct super_block *p_s_sb) {
3090 -  int i ;
3091 -  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
3092 -    init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_commit_wait)) ;
3093 -    init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_flush_wait)) ;
3094 -  }
3095 +    SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
3096  }
3097
3098  static int release_journal_dev( struct super_block *super,
3099 @@ -1952,7 +2318,6 @@
3100         int blkdev_mode = FMODE_READ | FMODE_WRITE;
3101
3102         result = 0;
3103 -
3104         journal -> j_dev_bd = NULL;
3105         journal -> j_dev_file = NULL;
3106         jdev = SB_JOURNAL_DEV( super ) =
3107 @@ -2030,7 +2395,6 @@
3108         printk( "journal_init_dev: journal device: %s", kdevname( SB_JOURNAL_DEV( super ) ) );
3109         return result;
3110  }
3111 -
3112  /*
3113  ** must be called once on fs mount.  calls journal_read for you
3114  */
3115 @@ -2041,6 +2405,7 @@
3116      struct reiserfs_super_block * rs;
3117      struct reiserfs_journal_header *jh;
3118      struct reiserfs_journal *journal;
3119 +    struct reiserfs_journal_list *jl;
3120
3121      if (sizeof(struct reiserfs_journal_commit) != 4096 ||
3122         sizeof(struct reiserfs_journal_desc) != 4096) {
3123 @@ -2054,7 +2419,6 @@
3124         reiserfs_warning(p_s_sb, "Journal size %d is less than 512+1 blocks, which unsupported\n", SB_ONDISK_JOURNAL_SIZE(p_s_sb));
3125         return 1 ;
3126      }
3127 -
3128      journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ;
3129      if (!journal) {
3130         reiserfs_warning(p_s_sb, "journal-1256: unable to get memory for journal structure\n") ;
3131 @@ -2155,15 +2519,9 @@
3132             SB_JOURNAL_MAX_BATCH(p_s_sb) = SB_JOURNAL_TRANS_MAX(p_s_sb)*9 / 10;
3133         }
3134      }
3135 -
3136      brelse (bhjh);
3137
3138      SB_JOURNAL(p_s_sb)->j_list_bitmap_index = 0 ;
3139 -    SB_JOURNAL_LIST_INDEX(p_s_sb) = -10000 ; /* make sure flush_old_commits does not try to flush a list while replay is on */
3140 -
3141 -    /* clear out the journal list array */
3142 -    memset(SB_JOURNAL_LIST(p_s_sb), 0,
3143 -           sizeof(struct reiserfs_journal_list) * JOURNAL_LIST_COUNT) ;
3144
3145      journal_list_init(p_s_sb) ;
3146
3147 @@ -2171,8 +2529,6 @@
3148             JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
3149      memset(journal_writers, 0, sizeof(char *) * 512) ; /* debug code */
3150
3151 -    INIT_LIST_HEAD(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
3152 -
3153      SB_JOURNAL(p_s_sb)->j_start = 0 ;
3154      SB_JOURNAL(p_s_sb)->j_len = 0 ;
3155      SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ;
3156 @@ -2182,13 +2538,15 @@
3157      SB_JOURNAL(p_s_sb)->j_last = NULL ;
3158      SB_JOURNAL(p_s_sb)->j_first = NULL ;
3159      init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
3160 -    init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
3161 -
3162 +    sema_init(&SB_JOURNAL(p_s_sb)->j_lock, 1);
3163 +    sema_init(&SB_JOURNAL(p_s_sb)->j_flush_sem, 1);
3164 +    INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_journal_list);
3165 +    INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_working_list);
3166 +
3167      SB_JOURNAL(p_s_sb)->j_trans_id = 10 ;
3168      SB_JOURNAL(p_s_sb)->j_mount_id = 10 ;
3169      SB_JOURNAL(p_s_sb)->j_state = 0 ;
3170      atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
3171 -    atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 0) ;
3172      SB_JOURNAL(p_s_sb)->j_cnode_free_list = allocate_cnodes(num_cnodes) ;
3173      SB_JOURNAL(p_s_sb)->j_cnode_free_orig = SB_JOURNAL(p_s_sb)->j_cnode_free_list ;
3174      SB_JOURNAL(p_s_sb)->j_cnode_free = SB_JOURNAL(p_s_sb)->j_cnode_free_list ?
3175 @@ -2196,8 +2554,9 @@
3176      SB_JOURNAL(p_s_sb)->j_cnode_used = 0 ;
3177      SB_JOURNAL(p_s_sb)->j_must_wait = 0 ;
3178      init_journal_hash(p_s_sb) ;
3179 -    SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb)) ;
3180 -    if (!(SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap)) {
3181 +    jl = SB_JOURNAL(p_s_sb)->j_current_jl;
3182 +    jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl) ;
3183 +    if (!jl->j_list_bitmap) {
3184         reiserfs_warning(p_s_sb, "journal-2005, get_list_bitmap failed for journal list 0\n") ;
3185         goto free_and_return;
3186      }
3187 @@ -2205,8 +2564,6 @@
3188         reiserfs_warning(p_s_sb, "Replay Failure, unable to mount\n") ;
3189         goto free_and_return;
3190      }
3191 -    /* once the read is done, we can set this where it belongs */
3192 -    SB_JOURNAL_LIST_INDEX(p_s_sb) = 0 ;
3193
3194      if (reiserfs_dont_log (p_s_sb))
3195         return 0;
3196 @@ -2216,6 +2573,9 @@
3197         kernel_thread((void *)(void *)reiserfs_journal_commit_thread, NULL,
3198                       CLONE_FS | CLONE_FILES | CLONE_VM) ;
3199      }
3200 +    down(&kreiserfsd_sem);
3201 +    list_add(&p_s_sb->u.reiserfs_sb.s_reiserfs_supers, &kreiserfsd_supers);
3202 +    up(&kreiserfsd_sem);
3203      return 0 ;
3204
3205  free_and_return:
3206 @@ -2230,7 +2590,9 @@
3207  */
3208  int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) {
3209    time_t now = CURRENT_TIME ;
3210 -  if (reiserfs_dont_log(th->t_super))
3211 +
3212 +  /* cannot restart while nested unless the parent allows it */
3213 +  if (!reiserfs_restartable_handle(th) && th->t_refcount > 1)
3214      return 0 ;
3215    if ( SB_JOURNAL(th->t_super)->j_must_wait > 0 ||
3216         (SB_JOURNAL(th->t_super)->j_len_alloc + new_alloc) >= SB_JOURNAL_MAX_BATCH(th->t_super) ||
3217 @@ -2239,9 +2601,48 @@
3218         SB_JOURNAL(th->t_super)->j_cnode_free < (SB_JOURNAL_TRANS_MAX(th->t_super) * 3)) {
3219      return 1 ;
3220    }
3221 +
3222 +  /* we are allowing them to continue in the current transaction, so
3223 +  * we have to bump the blocks allocated now.
3224 +  */
3225 +  th->t_blocks_allocated += new_alloc;
3226 +  SB_JOURNAL(th->t_super)->j_len_alloc += new_alloc;
3227 +
3228    return 0 ;
3229  }
3230
3231 +int
3232 +reiserfs_restart_transaction(struct reiserfs_transaction_handle *th, int num) {
3233 +    int refcount = th->t_refcount ;
3234 +    struct super_block *s = th->t_super ;
3235 +    int flags = th->t_flags ; /* handle flags, put back after the restart */
3236 +    int parent_flags = 0;
3237 +    struct reiserfs_transaction_handle *saved_th = current->journal_info ;
3238 +
3239 +    /* End the transaction behind th and begin a new one reserving num blocks.
3240 +    ** If refcount is > 1, saved_th is the parent we've nested into; save its
3241 +    ** flags as well.  So far only intermezzo needs this, 99% of the time
3242 +    ** restarting while nested is horribly unsafe. */
3243 +    if (refcount > 1) {
3244 +       if (!reiserfs_restartable_handle(saved_th)) {
3245 +           BUG() ; /* the parent handle is not restartable */
3246 +       }
3247 +       th->t_refcount = 1; /* so journal_end() below takes its do_journal_end() path */
3248 +       parent_flags = saved_th->t_flags ;
3249 +    }
3250 +    th->t_flags = 0 ;
3251 +    journal_end(th, s, th->t_blocks_allocated) ;
3252 +    journal_begin(th, s, num) ; /* NOTE(review): results of journal_end/journal_begin are not checked */
3253 +    th->t_flags = flags;
3254 +    if (refcount > 1) {
3255 +       current->journal_info = saved_th ; /* the parent is the task's handle again */
3256 +        th->t_refcount = refcount ;
3257 +       memcpy(saved_th, th, sizeof(*th)) ; /* parent takes over the restarted handle's state */
3258 +       saved_th->t_flags = parent_flags ; /* but keeps its own flags */
3259 +    }
3260 +    return 0 ; /* always 0 */
3261 +}
3262 +
3263  /* this must be called inside a transaction, and requires the
3264  ** kernel_lock to be held
3265  */
3266 @@ -2268,6 +2669,37 @@
3267                 !test_bit(WRITERS_BLOCKED, &SB_JOURNAL(s)->j_state)) ;
3268  }
3269
3270 +static void queue_log_writer(struct super_block *s) { /* sleep on j_join_wait as a queued writer */
3271 +    set_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state); /* the bit wake_queued_writers() tests before waking */
3272 +    sleep_on(&SB_JOURNAL(s)->j_join_wait);
3273 +}
3274 +
3275 +static void wake_queued_writers(struct super_block *s) { /* wake j_join_wait sleepers, if any were queued */
3276 +    if (test_and_clear_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state)) { /* skip the wake_up when the bit was not set */
3277 +        wake_up(&SB_JOURNAL(s)->j_join_wait);
3278 +    }
3279 +}
3280 +
3281 +static void let_transaction_grow(struct super_block *sb,
3282 +                                 unsigned long trans_id)
3283 +{ /* yield/sleep so that other writers can add to transaction trans_id */
3284 +    unsigned long bcount = SB_JOURNAL(sb)->j_bcount; /* j_bcount is bumped in do_journal_begin_r() */
3285 +    while(1) {
3286 +       yield();
3287 +        while ((atomic_read(&SB_JOURNAL(sb)->j_wcount) > 0 ||
3288 +               atomic_read(&SB_JOURNAL(sb)->j_jlock)) &&
3289 +              SB_JOURNAL(sb)->j_trans_id == trans_id) { /* writers active or j_jlock set, same transaction */
3290 +           queue_log_writer(sb);
3291 +       }
3292 +       if (SB_JOURNAL(sb)->j_trans_id != trans_id) /* the transaction has already ended */
3293 +           break;
3294 +       if (bcount == SB_JOURNAL(sb)->j_bcount) /* j_bcount did not change during this pass, stop waiting */
3295 +           break;
3296 +       bcount = SB_JOURNAL(sb)->j_bcount;
3297 +    }
3298 +}
3299 +
3300 +
3301  /* join == true if you must join an existing transaction.
3302  ** join == false if you can deal with waiting for others to finish
3303  **
3304 @@ -2275,8 +2707,10 @@
3305  ** expect to use in nblocks.
3306  */
3307  static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) {
3308 -  time_t now = CURRENT_TIME ;
3309 +  time_t now ;
3310    int old_trans_id  ;
3311 +  struct reiserfs_transaction_handle myth ;
3312 +  int sched_count = 0;
3313
3314    reiserfs_check_lock_depth("journal_begin") ;
3315    RFALSE( p_s_sb->s_flags & MS_RDONLY,
3316 @@ -2287,9 +2721,14 @@
3317      return 0 ;
3318    }
3319    PROC_INFO_INC( p_s_sb, journal.journal_being );
3320 +  /* set here for journal_join */
3321 +  th->t_refcount = 1;
3322 +  th->t_flags = 0 ;
3323 +  th->t_super = p_s_sb ;
3324
3325  relock:
3326    lock_journal(p_s_sb) ;
3327 +  SB_JOURNAL(p_s_sb)->j_bcount++ ;
3328
3329    if (test_bit(WRITERS_BLOCKED, &SB_JOURNAL(p_s_sb)->j_state)) {
3330      unlock_journal(p_s_sb) ;
3331 @@ -2297,12 +2736,12 @@
3332      PROC_INFO_INC( p_s_sb, journal.journal_relock_writers );
3333      goto relock ;
3334    }
3335 +  now = CURRENT_TIME;
3336
3337    /* if there is no room in the journal OR
3338    ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning
3339    ** we don't sleep if there aren't other writers
3340    */
3341 -
3342    if (  (!join && SB_JOURNAL(p_s_sb)->j_must_wait > 0) ||
3343       ( !join && (SB_JOURNAL(p_s_sb)->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) ||
3344       (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0 && SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 &&
3345 @@ -2310,54 +2749,128 @@
3346       (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) ) ||
3347       (!join && SB_JOURNAL(p_s_sb)->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) {
3348
3349 +    old_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
3350      unlock_journal(p_s_sb) ; /* allow others to finish this transaction */
3351
3352 -    /* if writer count is 0, we can just force this transaction to end, and start
3353 -    ** a new one afterwards.
3354 -    */
3355 -    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) {
3356 -      struct reiserfs_transaction_handle myth ;
3357 -      journal_join(&myth, p_s_sb, 1) ;
3358 -      reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3359 -      journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3360 -      do_journal_end(&myth, p_s_sb,1,COMMIT_NOW) ;
3361 +    if (!join && (SB_JOURNAL(p_s_sb)->j_len_alloc + nblocks + 2) >=
3362 +        SB_JOURNAL_MAX_BATCH(p_s_sb) &&
3363 +       ((SB_JOURNAL(p_s_sb)->j_len + nblocks + 2) * 100) <
3364 +       (SB_JOURNAL(p_s_sb)->j_len_alloc * 75))
3365 +    {
3366 +       if (atomic_read(&SB_JOURNAL(p_s_sb)->j_wcount) > 10) {
3367 +           sched_count++;
3368 +           queue_log_writer(p_s_sb);
3369 +           goto relock;
3370 +       }
3371 +    }
3372 +    /* don't mess with joining the transaction if all we have to do is
3373 +     * wait for someone else to do a commit
3374 +     */
3375 +    if (atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) {
3376 +       while (SB_JOURNAL(p_s_sb)->j_trans_id == old_trans_id &&
3377 +              atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) {
3378 +           queue_log_writer(p_s_sb);
3379 +        }
3380 +       goto relock;
3381 +    }
3382 +    journal_join(&myth, p_s_sb, 1) ;
3383 +
3384 +    /* someone might have ended the transaction while we joined */
3385 +    if (old_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
3386 +        do_journal_end(&myth, p_s_sb, 1, 0) ;
3387      } else {
3388 -      /* but if the writer count isn't zero, we have to wait for the current writers to finish.
3389 -      ** They won't batch on transaction end once we set j_jlock
3390 -      */
3391 -      atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
3392 -      old_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
3393 -      while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) &&
3394 -            SB_JOURNAL(p_s_sb)->j_trans_id == old_trans_id) {
3395 -       sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
3396 -      }
3397 +        do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ;
3398      }
3399      PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount );
3400      goto relock ;
3401    }
3402
3403    if (SB_JOURNAL(p_s_sb)->j_trans_start_time == 0) { /* we are the first writer, set trans_id */
3404 -    SB_JOURNAL(p_s_sb)->j_trans_start_time = now ;
3405 +    SB_JOURNAL(p_s_sb)->j_trans_start_time = CURRENT_TIME;
3406    }
3407    atomic_inc(&(SB_JOURNAL(p_s_sb)->j_wcount)) ;
3408    SB_JOURNAL(p_s_sb)->j_len_alloc += nblocks ;
3409    th->t_blocks_logged = 0 ;
3410    th->t_blocks_allocated = nblocks ;
3411 -  th->t_super = p_s_sb ;
3412    th->t_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
3413 -  th->t_caller = "Unknown" ;
3414 +  reiserfs_set_handle_active(th) ;
3415    unlock_journal(p_s_sb) ;
3416 -  p_s_sb->s_dirt = 1;
3417    return 0 ;
3418  }
3419
3420 +struct reiserfs_transaction_handle *
3421 +reiserfs_persistent_transaction(struct super_block *s, unsigned long nblocks) {
3422 +    int ret ;
3423 +    struct reiserfs_transaction_handle *th ;
3424
3425 +    /* Returns a transaction handle for nblocks, or an ERR_PTR on failure.
3426 +    ** If we're nesting into an existing transaction, reuse its handle (it
3427 +    ** will be persistent on its own) and just take another reference. */
3428 +    if (reiserfs_transaction_running(s)) {
3429 +        th = current->journal_info ;
3430 +       th->t_refcount++ ;
3431 +       if (th->t_refcount < 2) { /* a running handle must already have had a reference */
3432 +           BUG() ;
3433 +       }
3434 +       return th ;
3435 +    }
3436 +    th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS, s) ;
3437 +    if (!th) {
3438 +       return ERR_PTR(-ENOMEM) ;
3439 +    }
3440 +    ret = journal_begin(th, s, nblocks) ;
3441 +    if (ret) { /* begin failed: free the handle here and hand back the error */
3442 +       reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
3443 +        return ERR_PTR(ret) ;
3444 +    }
3445 +    /* do_journal_end is now responsible for freeing the handle */
3446 +    reiserfs_set_handle_persistent(th) ;
3447 +    return th ;
3448 +}
3449  static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
3450 +  struct reiserfs_transaction_handle *cur_th = current->journal_info;
3451 +
3452 +  /* Record the task's current handle (possibly NULL) in t_handle_save;
3453 +  ** this keeps do_journal_end from NULLing out the current->journal_info
3454 +  ** pointer */
3455 +  th->t_handle_save = cur_th ;
3456 +  if (cur_th && cur_th->t_refcount > 1) { /* joining while the task's handle is nested is a bug */
3457 +      BUG() ;
3458 +  }
3459    return do_journal_begin_r(th, p_s_sb, nblocks, 1) ;
3460  }
3461
3462  int journal_begin(struct reiserfs_transaction_handle *th, struct super_block  * p_s_sb, unsigned long nblocks) {
3463 -  return do_journal_begin_r(th, p_s_sb, nblocks, 0) ;
3464 +    struct reiserfs_transaction_handle *cur_th = current->journal_info ; /* handle already open in this task, if any */
3465 +    int ret ;
3466 +
3467 +    th->t_handle_save = NULL ;
3468 +    if (cur_th) {
3469 +       /* we are nesting into the current transaction */
3470 +       if (cur_th->t_super == p_s_sb) {
3471 +             cur_th->t_refcount++ ; /* one more user of the running handle */
3472 +             memcpy(th, cur_th, sizeof(*th)); /* th becomes a copy of the running handle */
3473 +             th->t_flags = 0 ; /* the copy does not inherit the parent's flags */
3474 +             reiserfs_set_handle_active(th) ;
3475 +             if (th->t_refcount <= 1)
3476 +                     printk("BAD: refcount <= 1, but journal_info != 0\n");
3477 +             return 0; /* nested: do_journal_begin_r() is not called */
3478 +       } else {
3479 +           /* we've ended up with a handle from a different filesystem.
3480 +           ** save it and restore on journal_end.  This should never
3481 +           ** really happen...
3482 +           */
3483 +           reiserfs_warning(p_s_sb, "clm-2100: nesting into a different FS\n") ;
3484 +           th->t_handle_save = current->journal_info ;
3485 +           current->journal_info = th;
3486 +       }
3487 +    } else {
3488 +       current->journal_info = th; /* not nested: th becomes the task's handle */
3489 +    }
3490 +    ret = do_journal_begin_r(th, p_s_sb, nblocks, 0) ; /* join == 0 */
3491 +    if (current->journal_info != th) /* journal_info must still point at th here */
3492 +        BUG() ;
3493 +    return ret ;
3494  }
3495
3496  /* not used at all */
3497 @@ -2389,7 +2902,7 @@
3498      reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n",
3499                     th->t_trans_id, SB_JOURNAL(p_s_sb)->j_trans_id);
3500    }
3501 -  p_s_sb->s_dirt = 1 ;
3502 +  p_s_sb->s_dirt = 1;
3503
3504    prepared = test_and_clear_bit(BH_JPrepared, &bh->b_state) ;
3505    /* already in this transaction, we are done */
3506 @@ -2413,6 +2926,7 @@
3507
3508    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) {
3509      reiserfs_warning(p_s_sb, "journal-1409: journal_mark_dirty returning because j_wcount was %d\n", atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount))) ;
3510 +    BUG();
3511      return 1 ;
3512    }
3513    /* this error means I've screwed up, and we've overflowed the transaction.
3514 @@ -2479,25 +2993,36 @@
3515    return 0 ;
3516  }
3517
3518 -/*
3519 -** if buffer already in current transaction, do a journal_mark_dirty
3520 -** otherwise, just mark it dirty and move on.  Used for writes to meta blocks
3521 -** that don't need journaling
3522 -*/
3523 -int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) {
3524 -  if (reiserfs_dont_log(th->t_super) || buffer_journaled(bh) ||
3525 -      buffer_journal_dirty(bh)) {
3526 -    return journal_mark_dirty(th, p_s_sb, bh) ;
3527 -  }
3528 -  if (get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_list_hash_table, bh->b_dev,bh->b_blocknr,bh->b_size)) {
3529 -    return journal_mark_dirty(th, p_s_sb, bh) ;
3530 -  }
3531 -  mark_buffer_dirty(bh) ;
3532 -  return 0 ;
3533 -}
3534 -
3535  int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
3536 -  return do_journal_end(th, p_s_sb, nblocks, 0) ;
3537 +
3538 +    int ret;
3539 +    if (!current->journal_info && th->t_refcount > 1) /* NOTE(review): in this case cur_th below is NULL and is dereferenced */
3540 +       printk("REISER-NESTING: th NULL, refcount %d\n", th->t_refcount);
3541 +    if (th->t_refcount > 1) { /* nested: only drop a reference, the transaction stays open */
3542 +       struct reiserfs_transaction_handle *cur_th = current->journal_info ;
3543 +
3544 +       /* we aren't allowed to close a nested transaction on a different
3545 +       ** filesystem from the one in the task struct
3546 +       */
3547 +       if (cur_th->t_super != th->t_super)
3548 +           BUG() ;
3549 +
3550 +       th->t_refcount--;
3551 +       if (th != cur_th) { /* th is a separate copy of the task's handle */
3552 +           int flags = cur_th->t_flags ;
3553 +           /* nested handles are never persistent */
3554 +           if (reiserfs_persistent_handle(th)) {
3555 +               BUG() ;
3556 +           }
3557 +           memcpy(cur_th, th, sizeof(*th)); /* copy th's state, including the new refcount, into the task's handle */
3558 +           th->t_flags = 0 ;
3559 +           cur_th->t_flags = flags ; /* the task's handle keeps its own flags */
3560 +       }
3561 +       ret = 0;
3562 +    } else {
3563 +       ret = do_journal_end(th, p_s_sb, nblocks, 0) ; /* last reference: end it with flags == 0 */
3564 +    }
3565 +    return ret;
3566  }
3567
3568  /* removes from the current transaction, relsing and descrementing any counters.
3569 @@ -2600,6 +3125,10 @@
3570  */
3571  int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
3572
3573 +  /* you are not allowed to sync while nested, very, very bad */
3574 +  if (th->t_refcount > 1) {
3575 +    BUG() ;
3576 +  }
3577    if (SB_JOURNAL(p_s_sb)->j_len == 0) {
3578      reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3579      journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3580 @@ -2624,12 +3153,14 @@
3581  **
3582  */
3583  void flush_async_commits(struct super_block *p_s_sb) {
3584 -  int i ;
3585 +  struct reiserfs_journal_list *jl;
3586 +  struct list_head *entry;
3587
3588 -  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
3589 -    if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) {
3590 -      flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ;
3591 -    }
3592 +  if (!list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) { /* nothing to do when there are no journal lists */
3593 +      /* the last entry (list.prev) is the youngest, commit it and you get everything */
3594 +      entry = SB_JOURNAL(p_s_sb)->j_journal_list.prev;
3595 +      jl = JOURNAL_LIST_ENTRY(entry);
3596 +      flush_commit_list(p_s_sb, jl, 1);
3597    }
3598  }
3599
3600 @@ -2637,58 +3168,39 @@
3601  ** flushes any old transactions to disk
3602  ** ends the current transaction if it is too old
3603  **
3604 -** also calls flush_journal_list with old_only == 1, which allows me to reclaim
3605 -** memory and such from the journal lists whose real blocks are all on disk.
3606 -**
3607 -** called by sync_dev_journal from buffer.c
3608  */
3609 -int flush_old_commits(struct super_block *p_s_sb, int immediate) {
3610 -  int i ;
3611 -  int count = 0;
3612 -  int start ;
3613 -  time_t now ;
3614 -  struct reiserfs_transaction_handle th ;
3615 -
3616 -  start =  SB_JOURNAL_LIST_INDEX(p_s_sb) ;
3617 -  now = CURRENT_TIME ;
3618 +int reiserfs_flush_old_commits(struct super_block *p_s_sb) {
3619 +    time_t now ;
3620 +    struct reiserfs_transaction_handle th ;
3621 +
3622 +    now = CURRENT_TIME ;
3623 +    /* safety check so we don't flush while we are replaying the log during
3624 +     * mount: an empty j_journal_list means there is nothing to do yet
3625 +     */
3626 +    if (list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) {
3627 +       return 0  ;
3628 +    }
3629
3630 -  /* safety check so we don't flush while we are replaying the log during mount */
3631 -  if (SB_JOURNAL_LIST_INDEX(p_s_sb) < 0) {
3632 -    return 0  ;
3633 -  }
3634 -  /* starting with oldest, loop until we get to the start */
3635 -  i = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ;
3636 -  while(i != start) {
3637 -    if (SB_JOURNAL_LIST(p_s_sb)[i].j_len > 0 && ((now - SB_JOURNAL_LIST(p_s_sb)[i].j_timestamp) > SB_JOURNAL_MAX_COMMIT_AGE(p_s_sb) ||
3638 -       immediate)) {
3639 -      /* we have to check again to be sure the current transaction did not change */
3640 -      if (i != SB_JOURNAL_LIST_INDEX(p_s_sb))  {
3641 -       flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ;
3642 -      }
3643 -    }
3644 -    i = (i + 1) % JOURNAL_LIST_COUNT ;
3645 -    count++ ;
3646 -  }
3647 -  /* now, check the current transaction.  If there are no writers, and it is too old, finish it, and
3648 -  ** force the commit blocks to disk
3649 -  */
3650 -  if (!immediate && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 &&
3651 -     SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 &&
3652 -     SB_JOURNAL(p_s_sb)->j_len > 0 &&
3653 -     (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) {
3654 -    journal_join(&th, p_s_sb, 1) ;
3655 -    reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3656 -    journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3657 -    do_journal_end(&th, p_s_sb,1, COMMIT_NOW) ;
3658 -  } else if (immediate) { /* belongs above, but I wanted this to be very explicit as a special case.  If they say to
3659 -                             flush, we must be sure old transactions hit the disk too. */
3660 -    journal_join(&th, p_s_sb, 1) ;
3661 -    reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3662 -    journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3663 -    do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
3664 -  }
3665 -   reiserfs_journal_kupdate(p_s_sb) ;
3666 -   return 0 ;
3667 +    /* check the current transaction.  If there are no writers, it has logged
3668 +     * blocks, and it is too old, finish it and force the commit blocks to disk
3669 +     */
3670 +    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 &&
3671 +        SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 &&
3672 +        SB_JOURNAL(p_s_sb)->j_len > 0 &&
3673 +        (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) >
3674 +       SB_JOURNAL_MAX_TRANS_AGE(p_s_sb))
3675 +    {
3676 +       journal_join(&th, p_s_sb, 1) ;
3677 +       reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3678 +       journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; /* log the buffer returned by SB_BUFFER_WITH_SB */
3679 +
3680 +       /* we're only being called from kreiserfsd, it makes no sense to do
3681 +       ** an async commit so that kreiserfsd can do it later
3682 +       */
3683 +       do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
3684 +    }
3685 +    reiserfs_journal_kupdate(p_s_sb) ;
3686 +    return p_s_sb->s_dirt; /* the superblock's current s_dirt value; the early exit above returns 0 */
3687  }
3688
3689  /*
3690 @@ -2709,6 +3221,7 @@
3691    int flush = flags & FLUSH_ALL ;
3692    int commit_now = flags & COMMIT_NOW ;
3693    int wait_on_commit = flags & WAIT ;
3694 +  struct reiserfs_journal_list *jl;
3695
3696    if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
3697      reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n",
3698 @@ -2727,8 +3240,9 @@
3699    if (SB_JOURNAL(p_s_sb)->j_len == 0) {
3700      int wcount = atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) ;
3701      unlock_journal(p_s_sb) ;
3702 -    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock))  > 0 && wcount <= 0) {
3703 -      atomic_dec(&(SB_JOURNAL(p_s_sb)->j_jlock)) ;
3704 +    BUG();
3705 +    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) > 0 && wcount <= 0) {
3706 +      atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
3707        wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
3708      }
3709      return 0 ;
3710 @@ -2741,24 +3255,37 @@
3711    */
3712    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0) {
3713      if (flush || commit_now) {
3714 -      int orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ;
3715 +      unsigned trans_id ;
3716 +
3717 +      jl = SB_JOURNAL(p_s_sb)->j_current_jl;
3718 +      trans_id = jl->j_trans_id;
3719 +
3720        atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
3721        if (flush) {
3722          SB_JOURNAL(p_s_sb)->j_next_full_flush = 1 ;
3723        }
3724        unlock_journal(p_s_sb) ;
3725 +
3726        /* sleep while the current transaction is still j_jlocked */
3727 -      while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) &&
3728 -            SB_JOURNAL(p_s_sb)->j_trans_id == th->t_trans_id) {
3729 -       sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
3730 -      }
3731 -      if (commit_now) {
3732 -       if (wait_on_commit) {
3733 -         flush_commit_list(p_s_sb,  SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
3734 -       } else {
3735 -         commit_flush_async(p_s_sb, orig_jindex) ;
3736 +      while(SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
3737 +       if (atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) {
3738 +           queue_log_writer(p_s_sb);
3739 +        } else {
3740 +           lock_journal(p_s_sb);
3741 +           if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
3742 +               atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
3743 +           }
3744 +           unlock_journal(p_s_sb);
3745         }
3746        }
3747 +      if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
3748 +          BUG();
3749 +      }
3750 +      if (commit_now && journal_list_still_alive(p_s_sb, trans_id) &&
3751 +          wait_on_commit)
3752 +      {
3753 +         flush_commit_list(p_s_sb, jl, 1) ;
3754 +      }
3755        return 0 ;
3756      }
3757      unlock_journal(p_s_sb) ;
3758 @@ -2776,8 +3303,8 @@
3759    if (!(SB_JOURNAL(p_s_sb)->j_must_wait > 0) && !(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock))) && !flush && !commit_now &&
3760        (SB_JOURNAL(p_s_sb)->j_len < SB_JOURNAL_MAX_BATCH(p_s_sb))  &&
3761        SB_JOURNAL(p_s_sb)->j_len_alloc < SB_JOURNAL_MAX_BATCH(p_s_sb) && SB_JOURNAL(p_s_sb)->j_cnode_free > (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3)) {
3762 -    SB_JOURNAL(p_s_sb)->j_bcount++ ;
3763      unlock_journal(p_s_sb) ;
3764 +
3765      return 0 ;
3766    }
3767
3768 @@ -2807,16 +3334,13 @@
3769    struct reiserfs_list_bitmap *jb = NULL ;
3770    int cleaned = 0 ;
3771
3772 -  if (reiserfs_dont_log(th->t_super)) {
3773 -    bh = sb_get_hash_table(p_s_sb, blocknr) ;
3774 -    if (bh && buffer_dirty (bh)) {
3775 -      reiserfs_warning (p_s_sb, "journal_mark_freed(dont_log): dirty buffer on hash list: %lx %ld\n", bh->b_state, blocknr);
3776 -      BUG ();
3777 -    }
3778 -    brelse (bh);
3779 -    return 0 ;
3780 +  cn = get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_hash_table, p_s_sb->s_dev,
3781 +                                       blocknr, p_s_sb->s_blocksize) ;
3782 +  if (cn && cn->bh) {
3783 +      bh = cn->bh ;
3784 +      get_bh(bh) ;
3785    }
3786 -  bh = sb_get_hash_table(p_s_sb, blocknr) ;
3787 +
3788    /* if it is journal new, we just remove it from this transaction */
3789    if (bh && buffer_journal_new(bh)) {
3790      mark_buffer_notjournal_new(bh) ;
3791 @@ -2824,14 +3348,22 @@
3792      cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
3793    } else {
3794      /* set the bit for this block in the journal bitmap for this transaction */
3795 -    jb = SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap ;
3796 +    jb = SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap;
3797      if (!jb) {
3798        reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ;
3799      }
3800 -    set_bit_in_list_bitmap(p_s_sb, blocknr, jb) ;
3801
3802 -    /* Note, the entire while loop is not allowed to schedule.  */
3803 +    /* we set bits in the list bitmap so the block won't be reallocated
3804 +     * as a data block which might get flushed before this transaction
3805 +     * commits.  When data logging is on, the block might get reallocated
3806 +     * as a data block, but we know the data block won't get flushed before
3807 +     * we commit
3808 +     */
3809 +    if (!reiserfs_data_log(p_s_sb)) {
3810 +       set_bit_in_list_bitmap(p_s_sb, blocknr, jb) ;
3811 +    }
3812
3813 +    /* Note, the entire while loop is not allowed to schedule.  */
3814      if (bh) {
3815        clear_prepared_bits(bh) ;
3816      }
3817 @@ -2876,57 +3408,77 @@
3818
3819  void reiserfs_update_inode_transaction(struct inode *inode) {
3820
3821 -  inode->u.reiserfs_i.i_trans_index = SB_JOURNAL_LIST_INDEX(inode->i_sb);
3822 -
3823 +  inode->u.reiserfs_i.i_jl = SB_JOURNAL(inode->i_sb)->j_current_jl; /* remember the current journal list along with the trans id */
3824    inode->u.reiserfs_i.i_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ;
3825  }
3826
3827  void reiserfs_update_tail_transaction(struct inode *inode) {
3828
3829 -  inode->u.reiserfs_i.i_tail_trans_index = SB_JOURNAL_LIST_INDEX(inode->i_sb);
3830 -
3831 +  inode->u.reiserfs_i.i_tail_jl = SB_JOURNAL(inode->i_sb)->j_current_jl; /* current journal list, read back by reiserfs_commit_for_tail() */
3832    inode->u.reiserfs_i.i_tail_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ;
3833  }
3834
3835 -static void __commit_trans_index(struct inode *inode, unsigned long id,
3836 -                                 unsigned long index)
3837 +static void __commit_trans_jl(struct inode *inode, unsigned long id,
3838 +                                 struct reiserfs_journal_list *jl) /* id and jl: the transaction id and journal list recorded in the inode */
3839  {
3840 -    struct reiserfs_journal_list *jl ;
3841      struct reiserfs_transaction_handle th ;
3842      struct super_block *sb = inode->i_sb ;
3843
3844 -    jl = SB_JOURNAL_LIST(sb) + index;
3845 -
3846      /* is it from the current transaction, or from an unknown transaction? */
3847      if (id == SB_JOURNAL(sb)->j_trans_id) {
3848 -       journal_join(&th, sb, 1) ;
3849 +       jl = SB_JOURNAL(sb)->j_current_jl; /* id is the running transaction, so use its journal list */
3850 +       /* try to let other writers come in and grow this transaction */
3851 +       let_transaction_grow(sb, id);
3852 +       if (SB_JOURNAL(sb)->j_trans_id != id) { /* it ended while we waited, only the commit is left to flush */
3853 +           goto flush_commit_only;
3854 +       }
3855 +
3856 +       journal_begin(&th, sb, 1) ;
3857 +
3858 +       /* someone might have ended this transaction while we joined */
3859 +       if (SB_JOURNAL(sb)->j_trans_id != id) {
3860 +           reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ;
3861 +           journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ;
3862 +           journal_end(&th, sb, 1) ; /* close the handle we just opened on the newer transaction */
3863 +           goto flush_commit_only;
3864 +       }
3865 +
3866         journal_end_sync(&th, sb, 1) ;
3867 -    } else if (jl->j_trans_id == id) {
3868 -       flush_commit_list(sb, jl, 1) ;
3869 +
3870 +    } else {
3871 +       /* this gets tricky, we have to make sure the journal list in
3872 +        * the inode still exists.  We know the list is still around
3873 +        * if we've got a larger transaction id than the oldest list
3874 +        */
3875 +flush_commit_only:
3876 +       if (journal_list_still_alive(inode->i_sb, id)) { /* jl is only touched when this check passes */
3877 +           flush_commit_list(sb, jl, 1) ;
3878 +       }
3879      }
3880 -    /* if the transaction id does not match, this list is long since flushed
3881 -    ** and we don't have to do anything here
3882 -    */
3883 +    /* otherwise the list is gone, and long since committed */
3884  }
3885  void reiserfs_commit_for_tail(struct inode *inode) {
3886      unsigned long id = inode->u.reiserfs_i.i_tail_trans_id;
3887 -    unsigned long index = inode->u.reiserfs_i.i_tail_trans_index;
3888 +    struct reiserfs_journal_list *jl = inode->u.reiserfs_i.i_tail_jl; /* stored by reiserfs_update_tail_transaction() */
3889
3890      /* for tails, if this info is unset there's nothing to commit */
3891 -    if (id && index)
3892 -       __commit_trans_index(inode, id, index);
3893 +    if (id && jl) /* both the id and the list pointer must be set */
3894 +       __commit_trans_jl(inode, id, jl);
3895  }
3896  void reiserfs_commit_for_inode(struct inode *inode) {
3897      unsigned long id = inode->u.reiserfs_i.i_trans_id;
3898 -    unsigned long index = inode->u.reiserfs_i.i_trans_index;
3899 +    struct reiserfs_journal_list *jl = inode->u.reiserfs_i.i_jl; /* stored by reiserfs_update_inode_transaction() */
3900
3901 -    /* for the whole inode, assume unset id or index means it was
3902 +    /* for the whole inode, assume unset id means it was
3903       * changed in the current transaction.  More conservative
3904       */
3905 -    if (!id || !index)
3906 +    if (!id || !jl) {
3907         reiserfs_update_inode_transaction(inode) ;
3908 +       id = inode->u.reiserfs_i.i_trans_id; /* re-read the id that was just stored in the inode */
3909 +       /* the local jl is not re-read here; it will be updated in __commit_trans_jl */
3910 +    }
3911
3912 -    __commit_trans_index(inode, id, index);
3913 +    __commit_trans_jl(inode, id, jl);
3914  }
3915
3916  void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb,
3917 @@ -2954,8 +3506,6 @@
3918    int retry_count = 0 ;
3919
3920    PROC_INFO_INC( p_s_sb, journal.prepare );
3921 -  if (reiserfs_dont_log (p_s_sb))
3922 -    return;
3923
3924    while(!test_bit(BH_JPrepared, &bh->b_state) ||
3925          (wait && buffer_locked(bh))) {
3926 @@ -2964,16 +3514,37 @@
3927        return ;
3928      }
3929      set_bit(BH_JPrepared, &bh->b_state) ;
3930 +
3931      if (wait) {
3932        RFALSE( buffer_locked(bh) && cur_tb != NULL,
3933               "waiting while do_balance was running\n") ;
3934 +      /* only data buffers are allowed to come in dirty, and they
3935 +       * never get run through restore_prepared_buffer.  So we can
3936 +       * just mark them clean here and know it is safe
3937 +       */
3938 +      mark_buffer_clean(bh);
3939        wait_on_buffer(bh) ;
3940 -    }
3941 +    }
3942      PROC_INFO_INC( p_s_sb, journal.prepare_retry );
3943      retry_count++ ;
3944    }
3945  }
3946 -
3947 +static void flush_old_journal_lists(struct super_block *s) {
3948 +    struct reiserfs_journal_list *jl;
3949 +    struct list_head *entry;
3950 +    time_t now = CURRENT_TIME;
3951 +
3952 +    while(!list_empty(&SB_JOURNAL(s)->j_journal_list)) {
3953 +        entry = SB_JOURNAL(s)->j_journal_list.next;
3954 +       jl = JOURNAL_LIST_ENTRY(entry);
3955 +       /* this check should always be run, to send old lists to disk */
3956 +       if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) {
3957 +           flush_used_journal_lists(s, jl);
3958 +       } else {
3959 +           break;
3960 +       }
3961 +    }
3962 +}
3963  /*
3964  ** long and ugly.  If flush, will not return until all commit
3965  ** blocks and all real buffers in the trans are on disk.
3966 @@ -2990,18 +3561,30 @@
3967    struct buffer_head *c_bh ; /* commit bh */
3968    struct buffer_head *d_bh ; /* desc bh */
3969    int cur_write_start = 0 ; /* start index of current log write */
3970 -  int cur_blocks_left = 0 ; /* number of journal blocks left to write */
3971    int old_start ;
3972    int i ;
3973 -  int jindex ;
3974 -  int orig_jindex ;
3975    int flush = flags & FLUSH_ALL ;
3976    int commit_now = flags & COMMIT_NOW ;
3977    int wait_on_commit = flags & WAIT ;
3978    struct reiserfs_super_block *rs ;
3979 +  struct reiserfs_journal_list *jl, *temp_jl;
3980 +  struct list_head *entry, *safe;
3981 +  int wakeup_kreiserfsd = 0;
3982 +  unsigned long jindex;
3983 +  unsigned long commit_trans_id;
3984 +
3985 +  if (th->t_refcount > 1)
3986 +    BUG() ;
3987
3988 +  reiserfs_check_lock_depth("journal end");
3989 +  current->journal_info = th->t_handle_save;
3990    if (reiserfs_dont_log(th->t_super)) {
3991 -    return 0 ;
3992 +    goto out ;
3993 +  }
3994 +
3995 +  if (SB_JOURNAL(p_s_sb)->j_len == 0) {
3996 +      reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3997 +      journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3998    }
3999
4000    lock_journal(p_s_sb) ;
4001 @@ -3018,7 +3601,9 @@
4002    ** it tells us if we should continue with the journal_end, or just return
4003    */
4004    if (!check_journal_end(th, p_s_sb, nblocks, flags)) {
4005 -    return 0 ;
4006 +    p_s_sb->s_dirt = 1;
4007 +    wake_queued_writers(p_s_sb);
4008 +    goto out ;
4009    }
4010
4011    /* check_journal_end might set these, check again */
4012 @@ -3037,8 +3622,11 @@
4013    }
4014
4015  #ifdef REISERFS_PREALLOCATE
4016 +  /* quota ops might need to nest, set up the journal_info pointer for them */
4017 +  current->journal_info = th ;
4018    reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into
4019                                       * the transaction */
4020 +  current->journal_info = th->t_handle_save ;
4021  #endif
4022
4023    rs = SB_DISK_SUPER_BLOCK(p_s_sb) ;
4024 @@ -3059,25 +3647,23 @@
4025    mark_buffer_uptodate(c_bh, 1) ;
4026
4027    /* init this journal list */
4028 -  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_older_commits_done), 0) ;
4029 -  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
4030 -  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ;
4031 -  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_bh = c_bh ;
4032 -  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_start = SB_JOURNAL(p_s_sb)->j_start ;
4033 -  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len = SB_JOURNAL(p_s_sb)->j_len ;
4034 -  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_nonzerolen), SB_JOURNAL(p_s_sb)->j_len) ;
4035 -  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_left), SB_JOURNAL(p_s_sb)->j_len + 2);
4036 -  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = NULL ;
4037 -  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
4038 -  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
4039 -
4040 -  /* which is faster, locking/unlocking at the start and end of the for
4041 -  ** or locking once per iteration around the insert_journal_hash?
4042 -  ** eitherway, we are write locking insert_journal_hash.  The ENTIRE FOR
4043 -  ** LOOP MUST not cause schedule to occur.
4044 -  */
4045 +  jl = SB_JOURNAL(p_s_sb)->j_current_jl;
4046 +
4047 +  /* save the transaction id in case we need to commit it later */
4048 +  commit_trans_id = jl->j_trans_id;
4049
4050 -  /* for each real block, add it to the journal list hash,
4051 +  atomic_set(&jl->j_older_commits_done, 0) ;
4052 +  jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
4053 +  jl->j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ;
4054 +  jl->j_commit_bh = c_bh ;
4055 +  jl->j_start = SB_JOURNAL(p_s_sb)->j_start ;
4056 +  jl->j_len = SB_JOURNAL(p_s_sb)->j_len ;
4057 +  atomic_set(&jl->j_nonzerolen, SB_JOURNAL(p_s_sb)->j_len) ;
4058 +  atomic_set(&jl->j_commit_left, SB_JOURNAL(p_s_sb)->j_len + 2);
4059 +  jl->j_realblock = NULL ;
4060 +
4061 +  /* The ENTIRE FOR LOOP MUST not cause schedule to occur.
4062 +  **  for each real block, add it to the journal list hash,
4063    ** copy into real block index array in the commit or desc block
4064    */
4065    for (i = 0, cn = SB_JOURNAL(p_s_sb)->j_first ; cn ; cn = cn->next, i++) {
4066 @@ -3087,7 +3673,7 @@
4067          reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ;
4068        }
4069        if (i == 0) {
4070 -        SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = jl_cn ;
4071 +        jl->j_realblock = jl_cn ;
4072        }
4073        jl_cn->prev = last_cn ;
4074        jl_cn->next = NULL ;
4075 @@ -3105,7 +3691,7 @@
4076        jl_cn->state = 0 ;
4077        jl_cn->dev = cn->bh->b_dev ;
4078        jl_cn->bh = cn->bh ;
4079 -      jl_cn->jlist = SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb) ;
4080 +      jl_cn->jlist = jl;
4081        insert_journal_hash(SB_JOURNAL(p_s_sb)->j_list_hash_table, jl_cn) ;
4082        if (i < JOURNAL_TRANS_HALF) {
4083         desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ;
4084 @@ -3130,29 +3716,34 @@
4085  reiserfs_warning(p_s_sb, "journal-2020: do_journal_end: BAD desc->j_len is ZERO\n") ;
4086      atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
4087      wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
4088 -    return 0 ;
4089 +    goto out ;
4090    }
4091
4092    /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */
4093    cur_write_start = SB_JOURNAL(p_s_sb)->j_start ;
4094 -  cur_blocks_left = SB_JOURNAL(p_s_sb)->j_len  ;
4095    cn = SB_JOURNAL(p_s_sb)->j_first ;
4096    jindex = 1 ; /* start at one so we don't get the desc again */
4097 -  while(cur_blocks_left > 0) {
4098 +  while(cn) {
4099 +    clear_bit(BH_JNew, &(cn->bh->b_state)) ;
4100      /* copy all the real blocks into log area.  dirty log blocks */
4101      if (test_bit(BH_JDirty, &cn->bh->b_state)) {
4102        struct buffer_head *tmp_bh ;
4103        tmp_bh =  journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
4104                        ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
4105        mark_buffer_uptodate(tmp_bh, 1) ;
4106 -      memcpy(tmp_bh->b_data, cn->bh->b_data, cn->bh->b_size) ;
4107 +      memcpy(tmp_bh->b_data, bh_kmap(cn->bh), cn->bh->b_size) ;
4108 +      bh_kunmap(cn->bh);
4109        jindex++ ;
4110 +      set_bit(BH_JDirty_wait, &(cn->bh->b_state)) ;
4111 +      clear_bit(BH_JDirty, &(cn->bh->b_state)) ;
4112      } else {
4113        /* JDirty cleared sometime during transaction.  don't log this one */
4114        reiserfs_warning(p_s_sb, "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ;
4115 +      brelse(cn->bh) ;
4116      }
4117 -    cn = cn->next ;
4118 -    cur_blocks_left-- ;
4119 +    next = cn->next ;
4120 +    free_cnode(p_s_sb, cn) ;
4121 +    cn = next ;
4122    }
4123
4124    /* we are done  with both the c_bh and d_bh, but
4125 @@ -3160,47 +3751,19 @@
4126    ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
4127    */
4128
4129 -  /* now loop through and mark all buffers from this transaction as JDirty_wait
4130 -  ** clear the JDirty bit, clear BH_JNew too.
4131 -  ** if they weren't JDirty, they weren't logged, just relse them and move on
4132 -  */
4133 -  cn = SB_JOURNAL(p_s_sb)->j_first ;
4134 -  while(cn) {
4135 -    clear_bit(BH_JNew, &(cn->bh->b_state)) ;
4136 -    if (test_bit(BH_JDirty, &(cn->bh->b_state))) {
4137 -      set_bit(BH_JDirty_wait, &(cn->bh->b_state)) ;
4138 -      clear_bit(BH_JDirty, &(cn->bh->b_state)) ;
4139 -    } else {
4140 -      brelse(cn->bh) ;
4141 -    }
4142 -    next = cn->next ;
4143 -    free_cnode(p_s_sb, cn) ;
4144 -    cn = next ;
4145 -  }
4146 -
4147 -  /* unlock the journal list for committing and flushing */
4148 -  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 0) ;
4149 -  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 0) ;
4150 -
4151 -  orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ;
4152 -  jindex = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ;
4153 -  SB_JOURNAL_LIST_INDEX(p_s_sb) = jindex ;
4154 +  SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
4155
4156 -  /* write any buffers that must hit disk before this commit is done */
4157 -  fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
4158 +  /* we lock the commit before putting it onto the main list because
4159 +   * we want to make sure nobody tries to run flush_commit_list until
4160 +   * the new transaction is fully set up, and we've already flushed the
4161 +   * ordered bh list
4162 +   */
4163 +  down(&jl->j_commit_lock);
4164
4165 -  /* honor the flush and async wishes from the caller */
4166 -  if (flush) {
4167 -
4168 -    flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
4169 -    flush_journal_list(p_s_sb,  SB_JOURNAL_LIST(p_s_sb) + orig_jindex , 1) ;
4170 -  } else if (commit_now) {
4171 -    if (wait_on_commit) {
4172 -      flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
4173 -    } else {
4174 -      commit_flush_async(p_s_sb, orig_jindex) ;
4175 -    }
4176 -  }
4177 +  /* now it is safe to insert this transaction on the main list */
4178 +  list_add_tail(&jl->j_list, &SB_JOURNAL(p_s_sb)->j_journal_list);
4179 +  list_add_tail(&jl->j_working_list, &SB_JOURNAL(p_s_sb)->j_working_list);
4180 +  SB_JOURNAL(p_s_sb)->j_num_work_lists++;
4181
4182    /* reset journal values for the next transaction */
4183    old_start = SB_JOURNAL(p_s_sb)->j_start ;
4184 @@ -3212,57 +3775,119 @@
4185    SB_JOURNAL(p_s_sb)->j_len = 0 ;
4186    SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ;
4187    SB_JOURNAL(p_s_sb)->j_trans_id++ ;
4188 +  SB_JOURNAL(p_s_sb)->j_current_jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id;
4189    SB_JOURNAL(p_s_sb)->j_must_wait = 0 ;
4190    SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ;
4191    SB_JOURNAL(p_s_sb)->j_next_full_flush = 0 ;
4192    SB_JOURNAL(p_s_sb)->j_next_async_flush = 0 ;
4193    init_journal_hash(p_s_sb) ;
4194
4195 +  /* tail conversion targets have to hit the disk before we end the
4196 +   * transaction.  Otherwise a later transaction might repack the tail
4197 +   * before this transaction commits, leaving the data block unflushed and
4198 +   * clean.  If we crash before the later transaction commits, the data block
4199 +   * is lost.
4200 +   */
4201 +  while(!list_empty(&jl->j_tail_bh_list)) {
4202 +      unlock_kernel();
4203 +      fsync_buffers_list(&jl->j_tail_bh_list);
4204 +      lock_kernel();
4205 +  }
4206 +  up(&jl->j_commit_lock);
4207 +
4208 +  /* honor the flush wishes from the caller, simple commits can
4209 +  ** be done outside the journal lock, they are done below
4210 +  */
4211 +  if (flush) {
4212 +    flush_commit_list(p_s_sb, jl, 1) ;
4213 +    flush_journal_list(p_s_sb, jl, 1) ;
4214 +  }
4215 +
4216 +
4217    /* if the next transaction has any chance of wrapping, flush
4218    ** transactions that might get overwritten.  If any journal lists are very
4219    ** old flush them as well.
4220    */
4221 -  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
4222 -    jindex = i ;
4223 -    if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && SB_JOURNAL(p_s_sb)->j_start <= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
4224 -      if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
4225 -       flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ;
4226 -      }
4227 -    } else if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 &&
4228 -              (SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
4229 -      if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >=
4230 -            SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
4231 -       flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ;
4232 +first_jl:
4233 +  list_for_each_safe(entry, safe, &SB_JOURNAL(p_s_sb)->j_journal_list) {
4234 +    temp_jl = JOURNAL_LIST_ENTRY(entry);
4235 +    if (SB_JOURNAL(p_s_sb)->j_start <= temp_jl->j_start) {
4236 +      if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >=
4237 +          temp_jl->j_start)
4238 +      {
4239 +       flush_used_journal_lists(p_s_sb, temp_jl);
4240 +       wakeup_kreiserfsd = 1;
4241 +       goto first_jl;
4242 +      } else if ((SB_JOURNAL(p_s_sb)->j_start +
4243 +                  SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) <
4244 +                 SB_ONDISK_JOURNAL_SIZE(p_s_sb))
4245 +      {
4246 +          /* if we don't cross into the next transaction and we don't
4247 +          * wrap, there is no way we can overlap any later transactions;
4248 +          * break now
4249 +          */
4250 +         break;
4251 +      }
4252 +    } else if ((SB_JOURNAL(p_s_sb)->j_start +
4253 +                SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >
4254 +               SB_ONDISK_JOURNAL_SIZE(p_s_sb))
4255 +    {
4256 +      if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) %
4257 +            SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start)
4258 +      {
4259 +       flush_used_journal_lists(p_s_sb, temp_jl);
4260 +       wakeup_kreiserfsd = 1;
4261 +       goto first_jl;
4262 +      } else {
4263 +         /* we don't overlap anything from our start to the end of the
4264 +          * log, and our wrapped portion doesn't overlap anything at
4265 +          * the start of the log.  We can break
4266 +          */
4267 +         break;
4268        }
4269 -    }
4270 -    /* this check should always be run, to send old lists to disk */
4271 -    if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 &&
4272 -              SB_JOURNAL_LIST(p_s_sb)[jindex].j_timestamp <
4273 -             (CURRENT_TIME - (SB_JOURNAL_MAX_TRANS_AGE(p_s_sb) * 4))) {
4274 -       flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ;
4275      }
4276    }
4277 +  flush_old_journal_lists(p_s_sb);
4278
4279 -  /* if the next journal_list is still in use, flush it */
4280 -  if (SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len != 0) {
4281 -    flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb), 1) ;
4282 -  }
4283 +  /* soft limit */
4284 +  if (SB_JOURNAL(p_s_sb)->j_num_work_lists > 128 || wakeup_kreiserfsd) {
4285 +      wake_up(&reiserfs_commit_thread_wait) ;
4286 +  }
4287
4288 -  /* we don't want anyone flushing the new transaction's list */
4289 -  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
4290 -  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
4291 -  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb) +
4292 -                                                                                        SB_JOURNAL_LIST_INDEX(p_s_sb)) ;
4293 +  SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL(p_s_sb)->j_current_jl) ;
4294
4295 -  if (!(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap)) {
4296 +  if (!(SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap)) {
4297      reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ;
4298    }
4299 -  unlock_journal(p_s_sb) ;
4300 +
4301    atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
4302 +  unlock_journal(p_s_sb) ;
4303    /* wake up any body waiting to join. */
4304 +  clear_bit(WRITERS_QUEUED, &SB_JOURNAL(p_s_sb)->j_state);
4305    wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
4306 +
4307 +  if (!flush && commit_now && wait_on_commit) {
4308 +      if (current->need_resched) {
4309 +          schedule() ;
4310 +      }
4311 +      if (journal_list_still_alive(p_s_sb, commit_trans_id))
4312 +         flush_commit_list(p_s_sb, jl, 1) ;
4313 +  }
4314 +  /* if we did an async commit, get kreiserfsd going on it */
4315 +  if (!commit_now && !wait_on_commit) {
4316 +      wake_up(&reiserfs_commit_thread_wait) ;
4317 +      schedule();
4318 +  }
4319 +out:
4320 +  reiserfs_check_lock_depth("journal end2");
4321 +  if (reiserfs_persistent_handle(th)) {
4322 +      memset(th, 0, sizeof(*th));
4323 +      reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), p_s_sb) ;
4324 +  } else
4325 +      th->t_flags = 0 ;
4326    return 0 ;
4327  }
4328
4329 -
4330 -
4331 +int __init reiserfs_journal_cache_init(void) {
4332 +    return 0;
4333 +}
4334 diff -urN linux-2.4.22.org/fs/reiserfs/Makefile linux-2.4.22/fs/reiserfs/Makefile
4335 --- linux-2.4.22.org/fs/reiserfs/Makefile       2003-11-21 15:08:29.000000000 +0100
4336 +++ linux-2.4.22/fs/reiserfs/Makefile   2003-11-21 15:14:23.000000000 +0100
4337 @@ -7,6 +7,7 @@
4338  #
4339  # Note 2! The CFLAGS definitions are now in the main makefile...
4340
4341 +export-objs := super.o
4342  O_TARGET := reiserfs.o
4343  obj-y   := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o super.o prints.o objectid.o \
4344  lbalance.o ibalance.o stree.o hashes.o buffer2.o tail_conversion.o journal.o resize.o item_ops.o ioctl.o procfs.o
4345 diff -urN linux-2.4.22.org/fs/reiserfs/namei.c linux-2.4.22/fs/reiserfs/namei.c
4346 --- linux-2.4.22.org/fs/reiserfs/namei.c        2003-11-21 15:08:29.000000000 +0100
4347 +++ linux-2.4.22/fs/reiserfs/namei.c    2003-11-21 15:14:23.000000000 +0100
4348 @@ -7,6 +7,7 @@
4349  #include <linux/bitops.h>
4350  #include <linux/reiserfs_fs.h>
4351  #include <linux/smp_lock.h>
4352 +#include <linux/quotaops.h>
4353
4354  #define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { i->i_nlink++; if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
4355  #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) i->i_nlink--;
4356 @@ -469,7 +470,7 @@
4357      }
4358
4359      /* perform the insertion of the entry that we have prepared */
4360 -    retval = reiserfs_paste_into_item (th, &path, &entry_key, buffer, paste_size);
4361 +    retval = reiserfs_paste_into_item (th, &path, &entry_key, dir, buffer, paste_size);
4362      if (buffer != small_buf)
4363         reiserfs_kfree (buffer, buflen, dir->i_sb);
4364      if (retval) {
4365 @@ -478,7 +479,6 @@
4366      }
4367
4368      dir->i_size += paste_size;
4369 -    dir->i_blocks = ((dir->i_size + 511) >> 9);
4370      dir->i_mtime = dir->i_ctime = CURRENT_TIME;
4371      if (!S_ISDIR (inode->i_mode) && visible)
4372         // reiserfs_mkdir or reiserfs_rename will do that by itself
4373 @@ -494,7 +494,9 @@
4374  ** inserted into the tree yet.
4375  */
4376  static int drop_new_inode(struct inode *inode) {
4377 +    DQUOT_DROP(inode);
4378      make_bad_inode(inode) ;
4379 +    inode->i_flags |= S_NOQUOTA;
4380      iput(inode) ;
4381      return 0 ;
4382  }
4383 @@ -518,6 +520,11 @@
4384      } else
4385          inode->i_gid = current->fsgid;
4386
4387 +    DQUOT_INIT(inode);
4388 +    if (DQUOT_ALLOC_INODE(inode)) {
4389 +        drop_new_inode(inode);
4390 +       return -EDQUOT;
4391 +    }
4392      return 0 ;
4393  }
4394
4395 @@ -536,7 +543,6 @@
4396         return retval ;
4397
4398      journal_begin(&th, dir->i_sb, jbegin_count) ;
4399 -    th.t_caller = "create" ;
4400      retval = reiserfs_new_inode (&th, dir, mode, 0, 0/*i_size*/, dentry, inode);
4401      if (retval) {
4402         goto out_failed ;
4403 @@ -750,7 +756,6 @@
4404
4405      DEC_DIR_INODE_NLINK(dir)
4406      dir->i_size -= (DEH_SIZE + de.de_entrylen);
4407 -    dir->i_blocks = ((dir->i_size + 511) >> 9);
4408      reiserfs_update_sd (&th, dir);
4409
4410      /* prevent empty directory from getting lost */
4411 @@ -835,7 +840,6 @@
4412      reiserfs_update_sd (&th, inode);
4413
4414      dir->i_size -= (de.de_entrylen + DEH_SIZE);
4415 -    dir->i_blocks = ((dir->i_size + 511) >> 9);
4416      dir->i_ctime = dir->i_mtime = CURRENT_TIME;
4417      reiserfs_update_sd (&th, dir);
4418
4419 @@ -1245,7 +1249,6 @@
4420         reiserfs_warning ((&th)->t_super, "vs-7060: reiserfs_rename: couldn't not cut old name. Fsck later?\n");
4421
4422      old_dir->i_size -= DEH_SIZE + old_de.de_entrylen;
4423 -    old_dir->i_blocks = ((old_dir->i_size + 511) >> 9);
4424
4425      reiserfs_update_sd (&th, old_dir);
4426      reiserfs_update_sd (&th, new_dir);
4427 diff -urN linux-2.4.22.org/fs/reiserfs/objectid.c linux-2.4.22/fs/reiserfs/objectid.c
4428 --- linux-2.4.22.org/fs/reiserfs/objectid.c     2003-11-21 15:08:29.000000000 +0100
4429 +++ linux-2.4.22/fs/reiserfs/objectid.c 2003-11-21 15:14:23.000000000 +0100
4430 @@ -87,7 +87,6 @@
4431      }
4432
4433      journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s));
4434 -    s->s_dirt = 1;
4435      return unused_objectid;
4436  }
4437
4438 @@ -106,8 +105,6 @@
4439
4440      reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
4441      journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s));
4442 -    s->s_dirt = 1;
4443 -
4444
4445      /* start at the beginning of the objectid map (i = 0) and go to
4446         the end of it (i = disk_sb->s_oid_cursize).  Linear search is
4447 diff -urN linux-2.4.22.org/fs/reiserfs/procfs.c linux-2.4.22/fs/reiserfs/procfs.c
4448 --- linux-2.4.22.org/fs/reiserfs/procfs.c       2003-11-21 15:08:29.000000000 +0100
4449 +++ linux-2.4.22/fs/reiserfs/procfs.c   2003-11-21 15:14:24.000000000 +0100
4450 @@ -497,7 +497,6 @@
4451                         "j_first_unflushed_offset: \t%lu\n"
4452                         "j_last_flush_trans_id: \t%lu\n"
4453                         "j_trans_start_time: \t%li\n"
4454 -                       "j_journal_list_index: \t%i\n"
4455                         "j_list_bitmap_index: \t%i\n"
4456                         "j_must_wait: \t%i\n"
4457                         "j_next_full_flush: \t%i\n"
4458 @@ -543,7 +542,6 @@
4459                         JF( j_first_unflushed_offset ),
4460                         JF( j_last_flush_trans_id ),
4461                         JF( j_trans_start_time ),
4462 -                       JF( j_journal_list_index ),
4463                         JF( j_list_bitmap_index ),
4464                         JF( j_must_wait ),
4465                         JF( j_next_full_flush ),
4466 diff -urN linux-2.4.22.org/fs/reiserfs/stree.c linux-2.4.22/fs/reiserfs/stree.c
4467 --- linux-2.4.22.org/fs/reiserfs/stree.c        2003-11-21 15:08:29.000000000 +0100
4468 +++ linux-2.4.22/fs/reiserfs/stree.c    2003-11-21 15:14:25.000000000 +0100
4469 @@ -60,6 +60,7 @@
4470  #include <linux/pagemap.h>
4471  #include <linux/reiserfs_fs.h>
4472  #include <linux/smp_lock.h>
4473 +#include <linux/quotaops.h>
4474
4475  /* Does the buffer contain a disk block which is in the tree. */
4476  inline int B_IS_IN_TREE (const struct buffer_head * p_s_bh)
4477 @@ -71,9 +72,6 @@
4478    return ( B_LEVEL (p_s_bh) != FREE_LEVEL );
4479  }
4480
4481 -
4482 -
4483 -
4484  inline void copy_short_key (void * to, const void * from)
4485  {
4486      memcpy (to, from, SHORT_KEY_SIZE);
4487 @@ -652,9 +650,9 @@
4488                                         stop at leaf level - set to
4489                                         DISK_LEAF_NODE_LEVEL */
4490      ) {
4491 -    int  n_block_number = SB_ROOT_BLOCK (p_s_sb),
4492 -      expected_level = SB_TREE_HEIGHT (p_s_sb),
4493 -      n_block_size    = p_s_sb->s_blocksize;
4494 +    int  n_block_number,
4495 +         expected_level,
4496 +         n_block_size    = p_s_sb->s_blocksize;
4497      struct buffer_head  *       p_s_bh;
4498      struct path_element *       p_s_last_element;
4499      int                                n_node_level, n_retval;
4500 @@ -678,8 +676,11 @@
4501      /* With each iteration of this loop we search through the items in the
4502         current node, and calculate the next current node(next path element)
4503         for the next iteration of this loop.. */
4504 +    n_block_number = SB_ROOT_BLOCK (p_s_sb);
4505 +    expected_level = SB_TREE_HEIGHT (p_s_sb);
4506      while ( 1 ) {
4507
4508 +        reiserfs_check_lock_depth("search_by_key");
4509  #ifdef CONFIG_REISERFS_CHECK
4510         if ( !(++n_repeat_counter % 50000) )
4511             reiserfs_warning (p_s_sb, "PAP-5100: search_by_key: %s:"
4512 @@ -1123,8 +1124,7 @@
4513                 tmp = get_block_num(p_n_unfm_pointer,0);
4514                 put_block_num(p_n_unfm_pointer, 0, 0);
4515                 journal_mark_dirty (th, p_s_sb, p_s_bh);
4516 -               inode->i_blocks -= p_s_sb->s_blocksize / 512;
4517 -               reiserfs_free_block(th, tmp);
4518 +               reiserfs_free_block(th, inode, tmp, 1);
4519                 /* In case of big fragmentation it is possible that each block
4520                    freed will cause dirtying of one more bitmap and then we will
4521                    quickly overflow our transaction space. This is a
4522 @@ -1132,9 +1132,7 @@
4523                 if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
4524                     int orig_len_alloc = th->t_blocks_allocated ;
4525                     pathrelse(p_s_path) ;
4526 -
4527 -                   journal_end(th, p_s_sb, orig_len_alloc) ;
4528 -                   journal_begin(th, p_s_sb, orig_len_alloc) ;
4529 +                   reiserfs_restart_transaction(th, orig_len_alloc);
4530                     reiserfs_update_inode_transaction(inode) ;
4531                     need_research = 1;
4532                     break;
4533 @@ -1168,8 +1166,7 @@
4534      }
4535  }
4536
4537 -
4538 -/* Calculate bytes number which will be deleted or cutted in the balance. */
4539 +/* Calculate number of bytes which will be deleted or cut during balance */
4540  int calc_deleted_bytes_number(
4541      struct  tree_balance  * p_s_tb,
4542      char                    c_mode
4543 @@ -1180,14 +1177,14 @@
4544      if ( is_statdata_le_ih (p_le_ih) )
4545         return 0;
4546
4547 +    n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0];
4548      if ( is_direntry_le_ih (p_le_ih) ) {
4549         // return EMPTY_DIR_SIZE; /* We delete emty directoris only. */
4550         // we can't use EMPTY_DIR_SIZE, as old format dirs have a different
4551         // empty size.  ick. FIXME, is this right?
4552         //
4553 -       return ih_item_len(p_le_ih);
4554 +       return n_del_size ;
4555      }
4556 -    n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0];
4557
4558      if ( is_indirect_le_ih (p_le_ih) )
4559         n_del_size = (n_del_size/UNFM_P_SIZE)*
4560 @@ -1221,17 +1218,46 @@
4561         item [--i] = 0;
4562  }
4563
4564 +#ifdef REISERQUOTA_DEBUG
4565 +char key2type(struct key *ih)
4566 +{
4567 +  if (is_direntry_le_key(2, ih))
4568 +    return 'd';
4569 +  if (is_direct_le_key(2, ih))
4570 +    return 'D';
4571 +  if (is_indirect_le_key(2, ih))
4572 +    return 'i';
4573 +  if (is_statdata_le_key(2, ih))
4574 +    return 's';
4575 +  return 'u';
4576 +}
4577 +
4578 +char head2type(struct item_head *ih)
4579 +{
4580 +  if (is_direntry_le_ih(ih))
4581 +    return 'd';
4582 +  if (is_direct_le_ih(ih))
4583 +    return 'D';
4584 +  if (is_indirect_le_ih(ih))
4585 +    return 'i';
4586 +  if (is_statdata_le_ih(ih))
4587 +    return 's';
4588 +  return 'u';
4589 +}
4590 +#endif
4591
4592  /* Delete object item. */
4593  int reiserfs_delete_item (struct reiserfs_transaction_handle *th,
4594                           struct path * p_s_path, /* Path to the deleted item. */
4595                           const struct cpu_key * p_s_item_key, /* Key to search for the deleted item.  */
4596 -                         struct inode * p_s_inode,/* inode is here just to update i_blocks */
4597 +                         struct inode * p_s_inode,/* inode is here just to update i_blocks and quotas */
4598                           struct buffer_head  * p_s_un_bh)    /* NULL or unformatted node pointer.    */
4599  {
4600      struct super_block * p_s_sb = p_s_inode->i_sb;
4601      struct tree_balance   s_del_balance;
4602      struct item_head      s_ih;
4603 +    struct item_head      *q_ih;
4604 +    int                          quota_cut_bytes;
4605      int                   n_ret_value,
4606         n_del_size,
4607         n_removed;
4608 @@ -1281,6 +1307,22 @@
4609
4610      // reiserfs_delete_item returns item length when success
4611      n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
4612 +    q_ih = get_ih(p_s_path) ;
4613 +    quota_cut_bytes = ih_item_len(q_ih) ;
4614 +
4615 +    /* hack so the quota code doesn't have to guess if the file
4616 +    ** has a tail.  On tail insert, we allocate quota for 1 unformatted node.
4617 +    ** We test the offset because the tail might have been
4618 +    ** split into multiple items, and we only want to decrement for
4619 +    ** the unfm node once
4620 +    */
4621 +    if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(q_ih)) {
4622 +        if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) {
4623 +            quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE;
4624 +        } else {
4625 +           quota_cut_bytes = 0 ;
4626 +       }
4627 +    }
4628
4629      if ( p_s_un_bh )  {
4630         int off;
4631 @@ -1312,10 +1354,14 @@
4632         memcpy(data + off,
4633                B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), n_ret_value);
4634      }
4635 -
4636      /* Perform balancing after all resources have been collected at once. */
4637      do_balance(&s_del_balance, NULL, NULL, M_DELETE);
4638
4639 +#ifdef REISERQUOTA_DEBUG
4640 +    printk(KERN_DEBUG "reiserquota delete_item(): freeing %u, id=%u type=%c\n", quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih));
4641 +#endif
4642 +    DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
4643 +
4644      /* Return deleted body length */
4645      return n_ret_value;
4646  }
4647 @@ -1340,14 +1386,16 @@
4648
4649  /* this deletes item which never gets split */
4650  void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th,
4651 +                                struct inode *inode,
4652                                  struct key * key)
4653  {
4654      struct tree_balance tb;
4655      INITIALIZE_PATH (path);
4656 -    int item_len;
4657 +    int item_len = 0;
4658      int tb_init = 0 ;
4659      struct cpu_key cpu_key;
4660      int retval;
4661 +    int quota_cut_bytes = 0;
4662
4663      le_key2cpu_key (&cpu_key, key);
4664
4665 @@ -1371,6 +1419,7 @@
4666             item_len = ih_item_len( PATH_PITEM_HEAD(&path) );
4667             init_tb_struct (th, &tb, th->t_super, &path, - (IH_SIZE + item_len));
4668         }
4669 +       quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)) ;
4670
4671         retval = fix_nodes (M_DELETE, &tb, NULL, 0);
4672         if (retval == REPEAT_SEARCH) {
4673 @@ -1380,6 +1429,12 @@
4674
4675         if (retval == CARRY_ON) {
4676             do_balance (&tb, 0, 0, M_DELETE);
4677 +           if (inode) {        /* Should we count quota for item? (we don't count quotas for save-links) */
4678 +#ifdef REISERQUOTA_DEBUG
4679 +               printk(KERN_DEBUG "reiserquota delete_solid_item(): freeing %u id=%u type=%c\n", quota_cut_bytes, inode->i_uid, key2type(key));
4680 +#endif
4681 +               DQUOT_FREE_SPACE_NODIRTY(inode, quota_cut_bytes);
4682 +           }
4683             break;
4684         }
4685
4686 @@ -1412,7 +1467,7 @@
4687        }
4688  /* USE_INODE_GENERATION_COUNTER */
4689  #endif
4690 -    reiserfs_delete_solid_item (th, INODE_PKEY (inode));
4691 +    reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode));
4692  }
4693
4694
4695 @@ -1484,6 +1539,38 @@
4696      mark_inode_dirty (inode);
4697  }
4698
4699 +/* Unmap the buffers on 'page' that hold the tail at file offset 'pos',
4700 +** plus every buffer after it.  Buffers that end at or before the tail
4701 +** are skipped: they might be dirty file data still waiting to reach
4702 +** disk.  Nothing happens when there is no page or it has no buffers.
4703 +*/
4704 +static void
4705 +unmap_buffers(struct page *page, loff_t pos) {
4706 +    struct buffer_head *first, *cur, *following ;
4707 +    unsigned long tail_off ;      /* offset of the tail inside the page */
4708 +    unsigned long end_off = 0 ;   /* page offset just past 'cur' */
4709 +
4710 +    if (page == NULL)
4711 +        return ;
4712 +    first = page->buffers ;
4713 +    if (first == NULL)
4714 +        return ;
4715 +
4716 +    tail_off = pos & (PAGE_CACHE_SIZE - 1) ;
4717 +    cur = first ;
4718 +    do {
4719 +        /* remember the successor before the buffer is unmapped */
4720 +        following = cur->b_this_page ;
4721 +
4722 +        /* a buffer whose end lies beyond the tail offset contains
4723 +        ** the tail or follows it */
4724 +        end_off += cur->b_size ;
4725 +        if (end_off > tail_off)
4726 +            reiserfs_unmap_buffer(cur) ;
4727 +        cur = following ;
4728 +    } while (cur != first) ;
4729 +}
4730 +
4731
4732  /* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
4733  int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th,
4734 @@ -1499,12 +1586,15 @@
4735         structure by using the init_tb_struct and fix_nodes functions.
4736         After that we can make tree balancing. */
4737      struct tree_balance s_cut_balance;
4738 +    struct item_head *p_le_ih;
4739 +    loff_t tail_pos = 0;
4740      int n_cut_size = 0,        /* Amount to be cut. */
4741         n_ret_value = CARRY_ON,
4742         n_removed = 0,     /* Number of the removed unformatted nodes. */
4743         n_is_inode_locked = 0;
4744      char                c_mode;            /* Mode of the balance. */
4745      int retval2 = -1;
4746 +    int quota_cut_bytes;
4747
4748
4749      init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, n_cut_size);
4750 @@ -1531,6 +1621,9 @@
4751                 /* tail has been left in the unformatted node */
4752                 return n_ret_value;
4753
4754 +           if (n_is_inode_locked) {
4755 +printk("inode locked twice\n");
4756 +           }
4757             n_is_inode_locked = 1;
4758
4759             /* removing of last unformatted node will change value we
4760 @@ -1545,6 +1638,7 @@
4761             set_cpu_key_k_type (p_s_item_key, TYPE_INDIRECT);
4762             p_s_item_key->key_length = 4;
4763             n_new_file_size -= (n_new_file_size & (p_s_sb->s_blocksize - 1));
4764 +           tail_pos = n_new_file_size;
4765             set_cpu_key_k_offset (p_s_item_key, n_new_file_size + 1);
4766             if ( search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND ){
4767                 print_block (PATH_PLAST_BUFFER (p_s_path), 3, PATH_LAST_POSITION (p_s_path) - 1, PATH_LAST_POSITION (p_s_path) + 1);
4768 @@ -1592,23 +1686,27 @@
4769      RFALSE( c_mode == M_PASTE || c_mode == M_INSERT, "illegal mode");
4770
4771      /* Calculate number of bytes that need to be cut from the item. */
4772 +    quota_cut_bytes = ( c_mode == M_DELETE ) ? ih_item_len(get_ih(p_s_path)) : -s_cut_balance.insert_size[0];
4773      if (retval2 == -1)
4774         n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode);
4775      else
4776         n_ret_value = retval2;
4777 -
4778 -    if ( c_mode == M_DELETE ) {
4779 -       struct item_head * p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
4780 -
4781 -       if ( is_direct_le_ih (p_le_ih) && (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) {
4782 -           /* we delete first part of tail which was stored in direct
4783 -               item(s) */
4784 +
4785 +
4786 +    /* For direct items, we only change the quota when deleting the last
4787 +    ** item.
4788 +    */
4789 +    p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
4790 +    if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) {
4791 +        if (c_mode == M_DELETE &&
4792 +          (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) {
4793             // FIXME: this is to keep 3.5 happy
4794             p_s_inode->u.reiserfs_i.i_first_direct_byte = U32_MAX;
4795 -           p_s_inode->i_blocks -= p_s_sb->s_blocksize / 512;
4796 +           quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE ;
4797 +        } else {
4798 +           quota_cut_bytes = 0 ;
4799         }
4800      }
4801 -
4802  #ifdef CONFIG_REISERFS_CHECK
4803      if (n_is_inode_locked) {
4804         struct item_head * le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
4805 @@ -1642,7 +1740,12 @@
4806         ** deal with it here.
4807         */
4808         p_s_inode->u.reiserfs_i.i_flags &= ~i_pack_on_close_mask;
4809 +       unmap_buffers(page, tail_pos);
4810      }
4811 +#ifdef REISERQUOTA_DEBUG
4812 +    printk(KERN_DEBUG "reiserquota cut_from_item(): freeing %u id=%u type=%c\n", quota_cut_bytes, p_s_inode->i_uid, '?');
4813 +#endif
4814 +    DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
4815      return n_ret_value;
4816  }
4817
4818 @@ -1654,8 +1757,8 @@
4819
4820      set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), DOT_OFFSET);
4821      set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_DIRENTRY);
4822 -    reiserfs_delete_solid_item (th, INODE_PKEY (inode));
4823 -
4824 +    reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode));
4825 +    reiserfs_update_sd(th, inode) ;
4826      set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), SD_OFFSET);
4827      set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_STAT_DATA);
4828  }
4829 @@ -1681,6 +1784,7 @@
4830         n_new_file_size;/* New file size. */
4831      int                   n_deleted;      /* Number of deleted or truncated bytes. */
4832      int retval;
4833 +    int jbegin_count = th->t_blocks_allocated;
4834
4835      if ( ! (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode) || S_ISLNK(p_s_inode->i_mode)) )
4836         return;
4837 @@ -1760,17 +1864,14 @@
4838         ** sure the file is consistent before ending the current trans
4839         ** and starting a new one
4840         */
4841 -        if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
4842 -         int orig_len_alloc = th->t_blocks_allocated ;
4843 +        if (journal_transaction_should_end(th, jbegin_count)) {
4844           decrement_counters_in_path(&s_search_path) ;
4845
4846           if (update_timestamps) {
4847               p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME;
4848           }
4849           reiserfs_update_sd(th, p_s_inode) ;
4850 -
4851 -         journal_end(th, p_s_inode->i_sb, orig_len_alloc) ;
4852 -         journal_begin(th, p_s_inode->i_sb, orig_len_alloc) ;
4853 +         reiserfs_restart_transaction(th, jbegin_count) ;
4854           reiserfs_update_inode_transaction(p_s_inode) ;
4855         }
4856      } while ( n_file_size > ROUND_UP (n_new_file_size) &&
4857 @@ -1822,18 +1923,37 @@
4858  int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th,
4859                               struct path         * p_s_search_path,    /* Path to the pasted item.          */
4860                               const struct cpu_key      * p_s_key,              /* Key to search for the needed item.*/
4861 +                             struct inode        * inode,              /* Inode item belongs to */
4862                               const char          * p_c_body,           /* Pointer to the bytes to paste.    */
4863                               int                   n_pasted_size)      /* Size of pasted bytes.             */
4864  {
4865      struct tree_balance s_paste_balance;
4866      int                 retval;
4867 +    int                        fs_gen;
4868 +
4869 +    fs_gen = get_generation(inode->i_sb) ;
4870 +
4871 +#ifdef REISERQUOTA_DEBUG
4872 +    printk(KERN_DEBUG "reiserquota paste_into_item(): allocating %u id=%u type=%c\n", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key)));
4873 +#endif
4874
4875 +    if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) {
4876 +       pathrelse(p_s_search_path);
4877 +       return -EDQUOT;
4878 +    }
4879      init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, n_pasted_size);
4880  #ifdef DISPLACE_NEW_PACKING_LOCALITIES
4881      s_paste_balance.key = p_s_key->on_disk_key;
4882  #endif
4883 -
4884 -    while ( (retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) == REPEAT_SEARCH ) {
4885 +
4886 +    /* DQUOT_* can schedule, must check before the fix_nodes */
4887 +    if (fs_changed(fs_gen, inode->i_sb)) {
4888 +       goto search_again;
4889 +    }
4890 +
4891 +    while ((retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) ==
4892 +REPEAT_SEARCH ) {
4893 +search_again:
4894         /* file system changed while we were in the fix_nodes */
4895         PROC_INFO_INC( th -> t_super, paste_into_item_restarted );
4896         retval = search_for_position_by_key (th->t_super, p_s_key, p_s_search_path);
4897 @@ -1862,6 +1982,10 @@
4898  error_out:
4899      /* this also releases the path */
4900      unfix_nodes(&s_paste_balance);
4901 +#ifdef REISERQUOTA_DEBUG
4902 +    printk(KERN_DEBUG "reiserquota paste_into_item(): freeing %u id=%u type=%c\n", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key)));
4903 +#endif
4904 +    DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size);
4905      return retval ;
4906  }
4907
4908 @@ -1871,23 +1995,45 @@
4909                          struct path         *  p_s_path,         /* Path to the inserteded item.         */
4910                          const struct cpu_key      * key,
4911                          struct item_head    *  p_s_ih,           /* Pointer to the item header to insert.*/
4912 +                        struct inode        * inode,
4913                          const char          *  p_c_body)         /* Pointer to the bytes to insert.      */
4914  {
4915      struct tree_balance s_ins_balance;
4916      int                 retval;
4917 +    int fs_gen = 0 ;
4918 +    int quota_bytes = 0 ;
4919
4920 +    if (inode) {      /* Do we count quotas for item? */
4921 +       fs_gen = get_generation(inode->i_sb);
4922 +       quota_bytes = ih_item_len(p_s_ih);
4923 +
4924 +       /* hack so the quota code doesn't have to guess if the file has
4925 +        ** a tail, links are always tails, so there's no guessing needed
4926 +        */
4927 +       if (!S_ISLNK (inode->i_mode) && is_direct_le_ih(p_s_ih)) {
4928 +           quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE ;
4929 +       }
4930 +#ifdef REISERQUOTA_DEBUG
4931 +       printk(KERN_DEBUG "reiserquota insert_item(): allocating %u id=%u type=%c\n", quota_bytes, inode->i_uid, head2type(p_s_ih));
4932 +#endif
4933 +       /* We can't dirty inode here. It would be immediately written but
4934 +        * appropriate stat item isn't inserted yet... */
4935 +       if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) {
4936 +           pathrelse(p_s_path);
4937 +           return -EDQUOT;
4938 +       }
4939 +    }
4940      init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, IH_SIZE + ih_item_len(p_s_ih));
4941  #ifdef DISPLACE_NEW_PACKING_LOCALITIES
4942      s_ins_balance.key = key->on_disk_key;
4943  #endif
4944 -
4945 -    /*
4946 -    if (p_c_body == 0)
4947 -      n_zeros_num = ih_item_len(p_s_ih);
4948 -    */
4949 -    //    le_key2cpu_key (&key, &(p_s_ih->ih_key));
4950 +    /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */
4951 +    if (inode && fs_changed(fs_gen, inode->i_sb)) {
4952 +       goto search_again;
4953 +    }
4954
4955      while ( (retval = fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, p_c_body)) == REPEAT_SEARCH) {
4956 +search_again:
4957         /* file system changed while we were in the fix_nodes */
4958         PROC_INFO_INC( th -> t_super, insert_item_restarted );
4959         retval = search_item (th->t_super, key, p_s_path);
4960 @@ -1902,7 +2048,7 @@
4961             goto error_out;
4962         }
4963      }
4964 -
4965 +
4966      /* make balancing after all resources will be collected at a time */
4967      if ( retval == CARRY_ON ) {
4968         do_balance (&s_ins_balance, p_s_ih, p_c_body, M_INSERT);
4969 @@ -1913,6 +2059,11 @@
4970  error_out:
4971      /* also releases the path */
4972      unfix_nodes(&s_ins_balance);
4973 +#ifdef REISERQUOTA_DEBUG
4974 +    if (inode) printk(KERN_DEBUG "reiserquota insert_item(): freeing %u id=%u type=%c\n", quota_bytes, inode->i_uid, head2type(p_s_ih));
4975 +#endif
4976 +    if (inode)  /* inode is NULL when no quota was charged (e.g. save-links): nothing to give back */
4977 +       DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes) ;
4978      return retval;
4979  }
4980
4981 diff -urN linux-2.4.22.org/fs/reiserfs/super.c linux-2.4.22/fs/reiserfs/super.c
4982 --- linux-2.4.22.org/fs/reiserfs/super.c        2003-11-21 15:08:29.000000000 +0100
4983 +++ linux-2.4.22/fs/reiserfs/super.c    2003-11-21 15:14:25.000000000 +0100
4984 @@ -13,6 +13,9 @@
4985  #include <linux/locks.h>
4986  #include <linux/init.h>
4987
4988 +EXPORT_SYMBOL(journal_begin) ;
4989 +EXPORT_SYMBOL(journal_end) ;
4990 +
4991  #define REISERFS_OLD_BLOCKSIZE 4096
4992  #define REISERFS_SUPER_MAGIC_STRING_OFFSET_NJ 20
4993
4994 @@ -50,22 +53,28 @@
4995  static int reiserfs_remount (struct super_block * s, int * flags, char * data);
4996  static int reiserfs_statfs (struct super_block * s, struct statfs * buf);
4997
4998 -static void reiserfs_write_super (struct super_block * s)
4999 +static int reiserfs_sync_fs (struct super_block * s)
5000  {
5001 +    struct reiserfs_transaction_handle th;
5002 +    lock_kernel() ;
5003 +    if (!(s->s_flags & MS_RDONLY)) {
5004 +       journal_begin(&th, s, 1);
5005 +       journal_end_sync(&th, s, 1);
5006 +       s->s_dirt = 0;
5007 +    }
5008 +    unlock_kernel() ;
5009 +    return 0;
5010 +}
5011
5012 -  int dirty = 0 ;
5013 -  lock_kernel() ;
5014 -  if (!(s->s_flags & MS_RDONLY)) {
5015 -    dirty = flush_old_commits(s, 1) ;
5016 -  }
5017 -  s->s_dirt = dirty;
5018 -  unlock_kernel() ;
5019 +static void reiserfs_write_super (struct super_block * s)
5020 +{
5021 +    reiserfs_sync_fs(s);
5022  }
5023
5024 +
5025  static void reiserfs_write_super_lockfs (struct super_block * s)
5026  {
5027
5028 -  int dirty = 0 ;
5029    struct reiserfs_transaction_handle th ;
5030    lock_kernel() ;
5031    if (!(s->s_flags & MS_RDONLY)) {
5032 @@ -75,7 +84,7 @@
5033      reiserfs_block_writes(&th) ;
5034      journal_end(&th, s, 1) ;
5035    }
5036 -  s->s_dirt = dirty;
5037 +  s->s_dirt = 0;
5038    unlock_kernel() ;
5039  }
5040
5041 @@ -100,7 +109,7 @@
5042       /* we are going to do one balancing */
5043       journal_begin (&th, s, JOURNAL_PER_BALANCE_CNT);
5044
5045 -     reiserfs_delete_solid_item (&th, key);
5046 +     reiserfs_delete_solid_item (&th, NULL, key);
5047       if (oid_free)
5048          /* removals are protected by direct items */
5049          reiserfs_release_objectid (&th, le32_to_cpu (key->k_objectid));
5050 @@ -286,8 +295,8 @@
5051      /* body of "save" link */
5052      link = INODE_PKEY (inode)->k_dir_id;
5053
5054 -    /* put "save" link inot tree */
5055 -    retval = reiserfs_insert_item (th, &path, &key, &ih, (char *)&link);
5056 +    /* put "save" link into tree, don't charge quota to anyone */
5057 +    retval = reiserfs_insert_item (th, &path, &key, &ih, NULL, (char *)&link);
5058      if (retval) {
5059         if (retval != -ENOSPC)
5060             reiserfs_warning (inode->i_sb, "vs-2120: add_save_link: insert_item returned %d\n",
5061 @@ -329,7 +338,8 @@
5062            ( inode -> u.reiserfs_i.i_flags & i_link_saved_truncate_mask ) ) ||
5063          ( !truncate &&
5064            ( inode -> u.reiserfs_i.i_flags & i_link_saved_unlink_mask ) ) )
5065 -       reiserfs_delete_solid_item (&th, &key);
5066 +       /* don't take quota bytes from anywhere */
5067 +       reiserfs_delete_solid_item (&th, NULL, &key);
5068      if (!truncate) {
5069         reiserfs_release_objectid (&th, inode->i_ino);
5070         inode -> u.reiserfs_i.i_flags &= ~i_link_saved_unlink_mask;
5071 @@ -357,6 +367,7 @@
5072    ** to do a journal_end
5073    */
5074    journal_release(&th, s) ;
5075 +  s->s_dirt = 0;
5076
5077    for (i = 0; i < SB_BMAP_NR (s); i ++)
5078      brelse (SB_AP_BITMAP (s)[i].bh);
5079 @@ -418,6 +429,7 @@
5080    put_super: reiserfs_put_super,
5081    write_super: reiserfs_write_super,
5082    write_super_lockfs: reiserfs_write_super_lockfs,
5083 +  sync_fs: reiserfs_sync_fs,
5084    unlockfs: reiserfs_unlockfs,
5085    statfs: reiserfs_statfs,
5086    remount_fs: reiserfs_remount,
5087 @@ -463,6 +475,14 @@
5088      {NULL, 0, 0}
5089  };
5090
5091 +/* possible values for -o data= ; each row pairs a mode name with that mode's own bit and then the bits of the two other modes (presumably arg_desc_t's set/clear masks -- confirm) */
5092 +static const arg_desc_t logging_mode[] = {
5093 +    {"ordered", 1<<REISERFS_DATA_ORDERED, (1<<REISERFS_DATA_LOG|1<<REISERFS_DATA_WRITEBACK)},
5094 +    {"journal", 1<<REISERFS_DATA_LOG, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_WRITEBACK)},
5095 +    {"writeback", 1<<REISERFS_DATA_WRITEBACK, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_LOG)},
5096 +    {NULL, 0}
5097 +};
5098 +
5099
5100  /* possible values for "-o block-allocator=" and bits which are to be set in
5101     s_mount_opt of reiserfs specific part of in-core super block */
5102 @@ -612,10 +632,14 @@
5103
5104                 {"block-allocator", 'a', balloc, 0, 0},
5105                 {"hash", 'h', hash, 1<<FORCE_HASH_DETECT, 0},
5106 +               {"data", 'd', logging_mode, 0, 0},
5107
5108                 {"resize", 'r', 0, 0, 0},
5109                 {"attrs", 0, 0, 1<<REISERFS_ATTRS, 0},
5110                 {"noattrs", 0, 0, 0, 1<<REISERFS_ATTRS},
5111 +               {"usrquota", 0, 0, 0, 0},
5112 +               {"grpquota", 0, 0, 0, 0},
5113 +
5114                 {NULL, 0, 0, 0, 0}
5115      };
5116
5117 @@ -672,6 +696,47 @@
5118         }
5119  }
5120
5121 +static void switch_data_mode(struct super_block *s, unsigned long mode) { /* mode: a REISERFS_DATA_* bit number, used below as (1 << mode) */
5122 +    struct reiserfs_transaction_handle th;
5123 +    int sync_all = !reiserfs_data_log(s); /* sampled before the mode bits are rewritten below */
5124 +    /* one journal_begin/journal_end_sync pair while the old mode bits are still set */
5125 +    journal_begin(&th, s, 1);
5126 +    SB_JOURNAL(s)->j_must_wait = 1;
5127 +    journal_end_sync(&th, s, 1);
5128 +    /* clear all three data-mode bits, then set only the requested one */
5129 +    s->u.reiserfs_sb.s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
5130 +                                       (1 << REISERFS_DATA_ORDERED) |
5131 +                                      (1 << REISERFS_DATA_WRITEBACK));
5132 +    s->u.reiserfs_sb.s_mount_opt |= (1 << mode);
5133 +    /* a second journal_begin/journal_end_sync pair, now with the new mode bit set */
5134 +    journal_begin(&th, s, 1);
5135 +    SB_JOURNAL(s)->j_must_wait = 1;
5136 +    journal_end_sync(&th, s, 1);
5137 +    /* when the old mode was not data=journal, also call fsync_no_super on the device */
5138 +    if (sync_all)
5139 +        fsync_no_super(s->s_dev);
5140 +}
5141 +
5142 +static void handle_data_mode(struct super_block *s, unsigned long mount_options)
5143 +{   /* only the first requested mode is acted on: journal, then ordered, then writeback */
5144 +    if (mount_options & (1 << REISERFS_DATA_LOG)) {
5145 +        if (reiserfs_data_log(s))
5146 +            return;     /* already in the requested mode */
5147 +        switch_data_mode(s, REISERFS_DATA_LOG);
5148 +        printk("reiserfs: switching to journaled data mode\n");
5149 +    } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
5150 +        if (reiserfs_data_ordered(s))
5151 +            return;
5152 +        switch_data_mode(s, REISERFS_DATA_ORDERED);
5153 +        printk("reiserfs: switching to ordered data mode\n");
5154 +    } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
5155 +        if (reiserfs_data_writeback(s))
5156 +            return;
5157 +        switch_data_mode(s, REISERFS_DATA_WRITEBACK);
5158 +        printk("reiserfs: switching to writeback data mode\n");
5159 +    }
5160 +}
5161 +
5162  static int reiserfs_remount (struct super_block * s, int * mount_flags, char * data)
5163  {
5164    struct reiserfs_super_block * rs;
5165 @@ -723,9 +788,10 @@
5166      s->s_dirt = 0;
5167    } else {
5168      /* remount read-write */
5169 -    if (!(s->s_flags & MS_RDONLY))
5170 +    if (!(s->s_flags & MS_RDONLY)) {
5171 +       handle_data_mode(s, mount_options);
5172         return 0; /* We are read-write already */
5173 -
5174 +    }
5175      s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */
5176      journal_begin(&th, s, 10) ;
5177
5178 @@ -743,9 +809,10 @@
5179    SB_JOURNAL(s)->j_must_wait = 1 ;
5180    journal_end(&th, s, 10) ;
5181
5182 -  if (!( *mount_flags & MS_RDONLY ) )
5183 +  if (!( *mount_flags & MS_RDONLY ) ) {
5184      finish_unfinished( s );
5185 -
5186 +    handle_data_mode(s, mount_options);
5187 +  }
5188    return 0;
5189  }
5190
5191 @@ -1172,9 +1239,6 @@
5192
5193      if (reiserfs_parse_options (s, (char *) data, &(s->u.reiserfs_sb.s_mount_opt), &blocks) == 0) {
5194        return NULL;
5195 -
5196 -
5197 -
5198      }
5199
5200      if (blocks) {
5201 @@ -1222,9 +1286,22 @@
5202      printk("reiserfs:warning: - it is slow mode for debugging.\n");
5203  #endif
5204
5205 -    /* fixme */
5206 -    jdev_name = NULL;
5207 +    /* make data=ordered the default */
5208 +    if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
5209 +        !reiserfs_data_writeback(s))
5210 +    {
5211 +        s->u.reiserfs_sb.s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
5212 +    }
5213 +
5214 +    if (reiserfs_data_log(s)) {
5215 +        printk("reiserfs: using journaled data mode\n");
5216 +    } else if (reiserfs_data_ordered(s)) {
5217 +        printk("reiserfs: using ordered data mode\n");
5218 +    } else {
5219 +        printk("reiserfs: using writeback data mode\n");
5220 +    }
5221
5222 +    jdev_name = NULL;
5223      if( journal_init(s, jdev_name, old_format) ) {
5224         reiserfs_warning(s, "sh-2022: reiserfs_read_super: unable to initialize journal space\n") ;
5225         goto error ;
5226 @@ -1364,16 +1441,19 @@
5227
5228  static int __init init_reiserfs_fs (void)
5229  {
5230 +        int ret;
5231         reiserfs_proc_info_global_init();
5232         reiserfs_proc_register_global( "version",
5233                                        reiserfs_global_version_in_proc );
5234 +       ret = reiserfs_journal_cache_init();
5235 +       if (ret)
5236 +           return ret;
5237          return register_filesystem(&reiserfs_fs_type);
5238  }
5239
5240  MODULE_DESCRIPTION("ReiserFS journaled filesystem");
5241  MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>");
5242  MODULE_LICENSE("GPL");
5243 -EXPORT_NO_SYMBOLS;
5244
5245  static void __exit exit_reiserfs_fs(void)
5246  {
5247 diff -urN linux-2.4.22.org/fs/reiserfs/tail_conversion.c linux-2.4.22/fs/reiserfs/tail_conversion.c
5248 --- linux-2.4.22.org/fs/reiserfs/tail_conversion.c      2003-11-21 15:08:29.000000000 +0100
5249 +++ linux-2.4.22/fs/reiserfs/tail_conversion.c  2003-11-21 15:14:25.000000000 +0100
5250 @@ -66,11 +66,11 @@
5251         set_ih_free_space (&ind_ih, 0); /* delete at nearest future */
5252          put_ih_item_len( &ind_ih, UNFM_P_SIZE );
5253         PATH_LAST_POSITION (path)++;
5254 -       n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih,
5255 +       n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih, inode,
5256                                          (char *)&unfm_ptr);
5257      } else {
5258         /* Paste into last indirect item of an object. */
5259 -       n_retval = reiserfs_paste_into_item(th, path, &end_key,
5260 +       n_retval = reiserfs_paste_into_item(th, path, &end_key, inode,
5261                                             (char *)&unfm_ptr, UNFM_P_SIZE);
5262      }
5263      if ( n_retval ) {
5264 @@ -152,39 +152,6 @@
5265    }
5266  }
5267
5268 -static void
5269 -unmap_buffers(struct page *page, loff_t pos) {
5270 -  struct buffer_head *bh ;
5271 -  struct buffer_head *head ;
5272 -  struct buffer_head *next ;
5273 -  unsigned long tail_index ;
5274 -  unsigned long cur_index ;
5275 -
5276 -  if (page) {
5277 -    if (page->buffers) {
5278 -      tail_index = pos & (PAGE_CACHE_SIZE - 1) ;
5279 -      cur_index = 0 ;
5280 -      head = page->buffers ;
5281 -      bh = head ;
5282 -      do {
5283 -       next = bh->b_this_page ;
5284 -
5285 -        /* we want to unmap the buffers that contain the tail, and
5286 -        ** all the buffers after it (since the tail must be at the
5287 -        ** end of the file).  We don't want to unmap file data
5288 -        ** before the tail, since it might be dirty and waiting to
5289 -        ** reach disk
5290 -        */
5291 -        cur_index += bh->b_size ;
5292 -        if (cur_index > tail_index) {
5293 -          reiserfs_unmap_buffer(bh) ;
5294 -        }
5295 -       bh = next ;
5296 -      } while (bh != head) ;
5297 -    }
5298 -  }
5299 -}
5300 -
5301  /* this first locks inode (neither reads nor sync are permitted),
5302     reads tail through page cache, insert direct item. When direct item
5303     inserted successfully inode is left locked. Return value is always
5304 @@ -261,7 +228,7 @@
5305      set_cpu_key_k_type (&key, TYPE_DIRECT);
5306      key.key_length = 4;
5307      /* Insert tail as new direct item in the tree */
5308 -    if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih,
5309 +    if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode,
5310                               tail ? tail : NULL) < 0 ) {
5311         /* No disk memory. So we can not convert last unformatted node
5312            to the direct item.  In this case we used to adjust
5313 @@ -274,10 +241,8 @@
5314      }
5315      kunmap(page) ;
5316
5317 -    /* this will invalidate all the buffers in the page after
5318 -    ** pos1
5319 -    */
5320 -    unmap_buffers(page, pos1) ;
5321 +    /* make sure to get the i_blocks changes from reiserfs_insert_item */
5322 +    reiserfs_update_sd(th, p_s_inode);
5323
5324      // note: we have now the same as in above direct2indirect
5325      // conversion: there are two keys which have matching first three
5326 @@ -285,7 +250,6 @@
5327
5328      /* We have inserted new direct item and must remove last
5329         unformatted node. */
5330 -    p_s_inode->i_blocks += (p_s_sb->s_blocksize / 512);
5331      *p_c_mode = M_CUT;
5332
5333      /* we store position of first direct item in the in-core inode */
5334 diff -urN linux-2.4.22.org/include/linux/fs.h linux-2.4.22/include/linux/fs.h
5335 --- linux-2.4.22.org/include/linux/fs.h 2003-11-21 15:08:34.000000000 +0100
5336 +++ linux-2.4.22/include/linux/fs.h     2003-11-21 15:14:25.000000000 +0100
5337 @@ -1222,6 +1222,8 @@
5338         return test_and_set_bit(BH_Dirty, &bh->b_state);
5339  }
5340
5341 +extern void buffer_insert_list_journal_head(struct buffer_head *bh, struct list_head *list, void *journal_head);
5342 +
5343  static inline void mark_buffer_async(struct buffer_head * bh, int on)
5344  {
5345         if (on)
5346 @@ -1508,6 +1510,7 @@
5347  /* Generic buffer handling for block filesystems.. */
5348  extern int try_to_release_page(struct page * page, int gfp_mask);
5349  extern int discard_bh_page(struct page *, unsigned long, int);
5350 +extern void discard_buffer(struct buffer_head *bh) ;
5351  #define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
5352  #define block_invalidate_page(page) discard_bh_page(page, 0, 0)
5353  extern int block_symlink(struct inode *, const char *, int);
5354 diff -urN linux-2.4.22.org/include/linux/reiserfs_fs.h linux-2.4.22/include/linux/reiserfs_fs.h
5355 --- linux-2.4.22.org/include/linux/reiserfs_fs.h        2003-11-21 15:08:34.000000000 +0100
5356 +++ linux-2.4.22/include/linux/reiserfs_fs.h    2003-11-21 15:14:25.000000000 +0100
5357 @@ -266,6 +266,7 @@
5358  #define NO_DISK_SPACE -3
5359  #define NO_BALANCING_NEEDED  (-4)
5360  #define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5)
5361 +#define QUOTA_EXCEEDED -6
5362
5363  typedef unsigned long b_blocknr_t;
5364  typedef __u32 unp_t;
5365 @@ -1329,8 +1330,7 @@
5366  #define fs_generation(s) ((s)->u.reiserfs_sb.s_generation_counter)
5367  #define get_generation(s) atomic_read (&fs_generation(s))
5368  #define FILESYSTEM_CHANGED_TB(tb)  (get_generation((tb)->tb_sb) != (tb)->fs_gen)
5369 -#define fs_changed(gen,s) (gen != get_generation (s))
5370 -
5371 +#define fs_changed(gen,s) (gen != get_generation(s))
5372
5373  /***************************************************************************/
5374  /*                  FIXATE NODES                                           */
5375 @@ -1653,6 +1653,86 @@
5376    /* 12 */ struct journal_params jh_journal;
5377  } ;
5378
5379 +/* 1 when reiserfs_data_log() holds for the inode's super block or the
5380 +** inode itself carries i_data_log in its reiserfs flags, 0 otherwise.
5381 +*/
5382 +static inline int
5383 +reiserfs_file_data_log(struct inode *inode) {
5384 +    if (reiserfs_data_log(inode->i_sb))
5385 +        return 1 ;
5386 +    return (inode->u.reiserfs_i.i_flags & i_data_log) ? 1 : 0 ;
5387 +}
5388 +
5389 +/* flags for the nested transaction handle */
5390 +#define REISERFS_PERSISTENT_HANDLE 1
5391 +#define REISERFS_ACTIVE_HANDLE 2
5392 +#define REISERFS_CLOSE_NESTED 4
5393 +#define REISERFS_DANGLING_HANDLE 8
5394 +/*
5395 +** transaction handle which is passed around for all journal calls
5396 +*/
5397 +struct reiserfs_transaction_handle {
5398 +  struct super_block *t_super ; /* super for this FS when journal_begin was
5399 +                                  called. saves calls to reiserfs_get_super
5400 +                                  also used by nested transactions to make
5401 +                                  sure they are nesting on the right FS
5402 +                                  _must_ be first in the handle
5403 +                               */
5404 +  int t_refcount;
5405 +  int t_blocks_logged ;         /* number of blocks this writer has logged */
5406 +  int t_blocks_allocated ;      /* number of blocks this writer allocated */
5407 +  unsigned long t_trans_id ;    /* sanity check, equals the current trans id */
5408 +  int t_flags ;                 /* REISERFS_*_HANDLE / REISERFS_CLOSE_NESTED bits */
5409 +  void *t_handle_save ;                /* save existing current->journal_info */
5410 +  unsigned int displace_new_blocks:1; /* unsigned so the field can hold 1; if new block allocation occurs, that
5411 +                                  block should be displaced from others */
5412 +} ;
5413 +
5414 +static inline int
5415 +reiserfs_dangling_handle(struct reiserfs_transaction_handle *th) { /* 1 iff th is non-NULL and flagged dangling */
5416 +    return th != NULL && (th->t_flags & REISERFS_DANGLING_HANDLE) != 0 ;
5417 +}
5418 +
5419 +static inline void
5420 +reiserfs_set_handle_dangling(struct reiserfs_transaction_handle *th) { /* th must be non-NULL */
5421 +    th->t_flags = th->t_flags | REISERFS_DANGLING_HANDLE ;
5422 +}
5423 +
5424 +static inline void
5425 +reiserfs_clear_handle_dangling(struct reiserfs_transaction_handle *th) { /* th must be non-NULL */
5426 +    th->t_flags = th->t_flags & ~REISERFS_DANGLING_HANDLE ;
5427 +}
5428 +
5429 +static inline int
5430 +reiserfs_persistent_handle(struct reiserfs_transaction_handle *th) { /* 1 iff th is non-NULL and flagged persistent */
5431 +    return th != NULL && (th->t_flags & REISERFS_PERSISTENT_HANDLE) != 0 ;
5432 +}
5433 +
5434 +static inline void
5435 +reiserfs_set_handle_persistent(struct reiserfs_transaction_handle *th) { /* th must be non-NULL */
5436 +    th->t_flags = th->t_flags | REISERFS_PERSISTENT_HANDLE ;
5437 +}
5438 +
5439 +static inline int
5440 +reiserfs_active_handle(struct reiserfs_transaction_handle *th) { /* 1 iff th is non-NULL and flagged active */
5441 +    return th != NULL && (th->t_flags & REISERFS_ACTIVE_HANDLE) != 0 ;
5442 +}
5443 +
5444 +static inline void
5445 +reiserfs_set_handle_active(struct reiserfs_transaction_handle *th) { /* th must be non-NULL */
5446 +    th->t_flags = th->t_flags | REISERFS_ACTIVE_HANDLE ;
5447 +}
5448 +
5449 +static inline int
5450 +reiserfs_restartable_handle(struct reiserfs_transaction_handle *th) { /* 1 iff th is non-NULL and has REISERFS_CLOSE_NESTED */
5451 +    return th != NULL && (th->t_flags & REISERFS_CLOSE_NESTED) != 0 ;
5452 +}
5453 +
5454 +static inline void
5455 +reiserfs_set_handle_restartable(struct reiserfs_transaction_handle *th) { /* th must be non-NULL */
5456 +    th->t_flags = th->t_flags | REISERFS_CLOSE_NESTED ;
5457 +}
5458 +
5459  extern task_queue reiserfs_commit_thread_tq ;
5460  extern wait_queue_head_t reiserfs_commit_thread_wait ;
5461
5462 @@ -1693,6 +1773,8 @@
5463  */
5464  #define JOURNAL_BUFFER(j,n) ((j)->j_ap_blocks[((j)->j_start + (n)) % JOURNAL_BLOCK_COUNT])
5465
5466 +int reiserfs_journal_cache_init(void);
5467 +int reiserfs_flush_old_commits(struct super_block *);
5468  void reiserfs_commit_for_inode(struct inode *) ;
5469  void reiserfs_commit_for_tail(struct inode *) ;
5470  void reiserfs_update_inode_transaction(struct inode *) ;
5471 @@ -1701,6 +1783,18 @@
5472  void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ;
5473  void reiserfs_allow_writes(struct super_block *s) ;
5474  void reiserfs_check_lock_depth(char *caller) ;
5475 +int journal_mark_dirty(struct reiserfs_transaction_handle *,
5476 +                       struct super_block *, struct buffer_head *bh) ;
5477 +
5478 +static inline int reiserfs_transaction_running(struct super_block *s) {
5479 +    struct reiserfs_transaction_handle *th = current->journal_info ;
5480 +    if (th && th->t_super == s)
5481 +        return 1 ;   /* current task's handle belongs to this super */
5482 +    if (th && th->t_super == NULL)
5483 +        BUG();       /* a handle with no super is treated as fatal */
5484 +    return 0 ;       /* no handle, or a handle on a different super */
5485 +}
5486 +
5487  void reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
5488  void reiserfs_restore_prepared_buffer(struct super_block *, struct buffer_head *bh) ;
5489  struct buffer_head  * journal_bread (struct super_block *s, int block);
5490 @@ -1716,8 +1810,14 @@
5491  int push_journal_writer(char *w) ;
5492  int pop_journal_writer(int windex) ;
5493  int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
5494 +int reiserfs_restart_transaction(struct reiserfs_transaction_handle *, int) ;
5495  int reiserfs_in_journal(struct super_block *p_s_sb, kdev_t dev, int bmap_nr, int bit_nr, int size, int searchall, unsigned int *next) ;
5496  int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ;
5497 +
5498 +/* allocates a transaction handle, and starts a new transaction on it */
5499 +struct reiserfs_transaction_handle *
5500 +reiserfs_persistent_transaction(struct super_block *p_s_sb, unsigned long) ;
5501 +
5502  struct super_block *reiserfs_get_super(kdev_t dev) ;
5503  void flush_async_commits(struct super_block *p_s_sb) ;
5504
5505 @@ -1833,11 +1933,13 @@
5506  int reiserfs_insert_item (struct reiserfs_transaction_handle *th,
5507                           struct path * path,
5508                           const struct cpu_key * key,
5509 -                         struct item_head * ih, const char * body);
5510 +                         struct item_head * ih,
5511 +                         struct inode *inode, const char * body);
5512
5513  int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th,
5514                               struct path * path,
5515                               const struct cpu_key * key,
5516 +                             struct inode *inode,
5517                               const char * body, int paste_size);
5518
5519  int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th,
5520 @@ -1854,7 +1956,7 @@
5521                           struct buffer_head  * p_s_un_bh);
5522
5523  void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th,
5524 -                                                                struct key * key);
5525 +                                struct inode *inode, struct key * key);
5526  void reiserfs_delete_object (struct reiserfs_transaction_handle *th, struct inode * p_s_inode);
5527  void reiserfs_do_truncate (struct reiserfs_transaction_handle *th,
5528                            struct  inode * p_s_inode, struct page *,
5529 @@ -1895,8 +1997,18 @@
5530                                 int i_size,
5531                                 struct dentry *dentry,
5532                                 struct inode *inode);
5533 -int reiserfs_sync_inode (struct reiserfs_transaction_handle *th, struct inode * inode);
5534 -void reiserfs_update_sd (struct reiserfs_transaction_handle *th, struct inode * inode);
5535 +
5536 +int reiserfs_sync_inode (struct reiserfs_transaction_handle *th,
5537 +                         struct inode * inode);
5538 +
5539 +void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th,
5540 +                              struct inode * inode, loff_t size);
5541 +
5542 +static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
5543 +                                      struct inode *inode)
5544 +{
5545 +    reiserfs_update_sd_size(th, inode, inode->i_size) ;
5546 +}
5547
5548  void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode );
5549  void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs );
5550 @@ -1981,7 +2093,7 @@
5551  extern struct inode_operations reiserfs_file_inode_operations;
5552  extern struct file_operations reiserfs_file_operations;
5553  extern struct address_space_operations reiserfs_address_space_operations ;
5554 -int get_new_buffer (struct reiserfs_transaction_handle *th, struct buffer_head *,
5555 +int get_new_buffer (struct reiserfs_transaction_handle *th, struct inode *, struct buffer_head *,
5556                     struct buffer_head **, struct path *);
5557
5558
5559 @@ -2095,7 +2207,7 @@
5560
5561  int reiserfs_parse_alloc_options (struct super_block *, char *);
5562  int is_reusable (struct super_block * s, unsigned long block, int bit_value);
5563 -void reiserfs_free_block (struct reiserfs_transaction_handle *th, unsigned long);
5564 +void reiserfs_free_block (struct reiserfs_transaction_handle *th, struct inode *inode, unsigned long, int);
5565  int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t * , int, int);
5566  extern inline int reiserfs_new_form_blocknrs (struct tree_balance * tb,
5567                                               b_blocknr_t *new_blocknrs, int amount_needed)
5568 diff -urN linux-2.4.22.org/include/linux/reiserfs_fs_i.h linux-2.4.22/include/linux/reiserfs_fs_i.h
5569 --- linux-2.4.22.org/include/linux/reiserfs_fs_i.h      2003-11-21 15:08:34.000000000 +0100
5570 +++ linux-2.4.22/include/linux/reiserfs_fs_i.h  2003-11-21 15:14:25.000000000 +0100
5571 @@ -6,6 +6,8 @@
5572
5573  #include <linux/list.h>
5574
5575 +struct reiserfs_journal_list;
5576 +
5577  /** bitmasks for i_flags field in reiserfs-specific part of inode */
5578  typedef enum {
5579      /** this says what format of key do all items (but stat data) of
5580 @@ -23,7 +25,9 @@
5581         truncate or unlink. Safe link is used to avoid leakage of disk
5582         space on crash with some files open, but unlinked. */
5583      i_link_saved_unlink_mask   =  0x0010,
5584 -    i_link_saved_truncate_mask =  0x0020
5585 +    i_link_saved_truncate_mask =  0x0020,
5586 +    /** are we logging data blocks for this file? */
5587 +    i_data_log                 =  0x0040,
5588  } reiserfs_inode_flags;
5589
5590
5591 @@ -52,14 +56,14 @@
5592      ** needs to be committed in order for this inode to be properly
5593      ** flushed */
5594      unsigned long i_trans_id ;
5595 -    unsigned long i_trans_index ;
5596 +    struct reiserfs_journal_list *i_jl;
5597
5598      /* direct io needs to make sure the tail is on disk to avoid
5599       * buffer alias problems.  This records the transaction last
5600       * involved in a direct->indirect conversion for this file
5601       */
5602      unsigned long i_tail_trans_id;
5603 -    unsigned long i_tail_trans_index;
5604 +    struct reiserfs_journal_list *i_tail_jl;
5605  };
5606
5607  #endif
5608 diff -urN linux-2.4.22.org/include/linux/reiserfs_fs_sb.h linux-2.4.22/include/linux/reiserfs_fs_sb.h
5609 --- linux-2.4.22.org/include/linux/reiserfs_fs_sb.h     2003-11-21 15:08:34.000000000 +0100
5610 +++ linux-2.4.22/include/linux/reiserfs_fs_sb.h 2003-11-21 15:14:25.000000000 +0100
5611 @@ -120,7 +120,6 @@
5612  #define JOURNAL_MAX_CNODE   1500 /* max cnodes to allocate. */
5613  #define JOURNAL_HASH_SIZE 8192
5614  #define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating.  Must be >= 2 */
5615 -#define JOURNAL_LIST_COUNT 64
5616
5617  /* these are bh_state bit flag offset numbers, for use in the buffer head */
5618
5619 @@ -167,20 +166,27 @@
5620    struct reiserfs_bitmap_node **bitmaps ;
5621  } ;
5622
5623 -/*
5624 -** transaction handle which is passed around for all journal calls
5625 -*/
5626 -struct reiserfs_transaction_handle {
5627 -                               /* ifdef it. -Hans */
5628 -  char *t_caller ;              /* debugging use */
5629 -  int t_blocks_logged ;         /* number of blocks this writer has logged */
5630 -  int t_blocks_allocated ;      /* number of blocks this writer allocated */
5631 -  unsigned long t_trans_id ;    /* sanity check, equals the current trans id */
5632 -  struct super_block *t_super ; /* super for this FS when journal_begin was
5633 -                                   called. saves calls to reiserfs_get_super */
5634 -  int displace_new_blocks:1;   /* if new block allocation occurres, that block
5635 -                                  should be displaced from others */
5636 -} ;
5637 +struct reiserfs_journal_list;
5638 +
5639 +/* so, we're using fsync_buffers_list to do the ordered buffer writes,
5640 + * but we don't want to have a full inode on each buffer list, it is
5641 + * a big waste of space.
5642 + *
5643 + * instead we copy the very head of the inode into a list here, a kludge
5644 + * but much smaller.
5645 + */
5646 +struct reiserfs_inode_list {
5647 +    struct list_head        i_hash;
5648 +    struct list_head        i_list;
5649 +    struct list_head        i_dentry;
5650 +    struct list_head        i_dirty_buffers;
5651 +
5652 +    /* we could be very smart and do math based on the location
5653 +     * of the inode list in the journal list struct.
5654 +     * let's do that after this works properly
5655 +     */
5656 +    struct reiserfs_journal_list *jl;
5657 +};
5658
5659  /*
5660  ** one of these for each transaction.  The most important part here is the j_realblock.
5661 @@ -190,20 +196,32 @@
5662  ** to be overwritten */
5663  struct reiserfs_journal_list {
5664    unsigned long j_start ;
5665 +  unsigned long j_state ;
5666    unsigned long j_len ;
5667    atomic_t j_nonzerolen ;
5668    atomic_t j_commit_left ;
5669 -  atomic_t j_flushing ;
5670 -  atomic_t j_commit_flushing ;
5671    atomic_t j_older_commits_done ;      /* all commits older than this on disk*/
5672 +  struct semaphore j_commit_lock ;
5673    unsigned long j_trans_id ;
5674    time_t j_timestamp ;
5675    struct reiserfs_list_bitmap *j_list_bitmap ;
5676    struct buffer_head *j_commit_bh ; /* commit buffer head */
5677    struct reiserfs_journal_cnode *j_realblock  ;
5678    struct reiserfs_journal_cnode *j_freedlist ; /* list of buffers that were freed during this trans.  free each of these on flush */
5679 -  wait_queue_head_t j_commit_wait ; /* wait for all the commit blocks to be flushed */
5680 -  wait_queue_head_t j_flush_wait ; /* wait for all the real blocks to be flushed */
5681 +
5682 +  /* time ordered list of all the active transactions */
5683 +  struct list_head j_list;
5684 +
5685 +  /* time ordered list of all transactions not touched by kreiserfsd */
5686 +  struct list_head j_working_list;
5687 +
5688 +  /* for data=ordered support */
5689 +  struct list_head j_ordered_bh_list;
5690 +
5691 +  /* sigh, the tails have slightly different rules for flushing, they
5692 +   * need their own list
5693 +   */
5694 +  struct list_head j_tail_bh_list;
5695  } ;
5696
5697  struct reiserfs_page_list  ; /* defined in reiserfs_fs.h */
5698 @@ -230,16 +248,11 @@
5699    unsigned long j_last_flush_trans_id ;    /* last fully flushed journal timestamp */
5700    struct buffer_head *j_header_bh ;
5701
5702 -  /* j_flush_pages must be flushed before the current transaction can
5703 -  ** commit
5704 -  */
5705 -  struct reiserfs_page_list *j_flush_pages ;
5706    time_t j_trans_start_time ;         /* time this transaction started */
5707 -  wait_queue_head_t j_wait ;         /* wait  journal_end to finish I/O */
5708 -  atomic_t j_wlock ;                       /* lock for j_wait */
5709 +  struct semaphore j_lock ;
5710 +  struct semaphore j_flush_sem ;
5711    wait_queue_head_t j_join_wait ;    /* wait for current transaction to finish before starting new one */
5712    atomic_t j_jlock ;                       /* lock for j_join_wait */
5713 -  int j_journal_list_index ;         /* journal list number of the current trans */
5714    int j_list_bitmap_index ;          /* number of next list bitmap to use */
5715    int j_must_wait ;                   /* no more journal begins allowed. MUST sleep on j_join_wait */
5716    int j_next_full_flush ;             /* next journal_end will flush all journal list */
5717 @@ -255,13 +268,28 @@
5718
5719    struct reiserfs_journal_cnode *j_cnode_free_list ;
5720    struct reiserfs_journal_cnode *j_cnode_free_orig ; /* orig pointer returned from vmalloc */
5721 +  struct reiserfs_journal_list *j_current_jl;
5722
5723    int j_free_bitmap_nodes ;
5724    int j_used_bitmap_nodes ;
5725 +  int j_num_lists;      /* total number of active transactions */
5726 +  int j_num_work_lists; /* number that need attention from kreiserfsd */
5727 +
5728 +  /* debugging to make sure things are flushed in order */
5729 +  int j_last_flush_id;
5730 +
5731 +  /* debugging to make sure things are committed in order */
5732 +  int j_last_commit_id;
5733 +
5734    struct list_head j_bitmap_nodes ;
5735 -  struct list_head j_dirty_buffers ;
5736 +
5737 +  /* list of all active transactions */
5738 +  struct list_head j_journal_list;
5739 +
5740 +  /* lists that haven't been touched by kreiserfsd */
5741 +  struct list_head j_working_list;
5742 +
5743    struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS] ;     /* array of bitmaps to record the deleted blocks */
5744 -  struct reiserfs_journal_list j_journal_list[JOURNAL_LIST_COUNT] ;        /* array of all the journal lists */
5745    struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE] ;         /* hash table for real buffer heads in current trans */
5746    struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for all the real buffer heads in all
5747                                                                                 the transactions */
5748 @@ -413,6 +441,7 @@
5749      reiserfs_proc_info_data_t s_proc_info_data;
5750      struct proc_dir_entry *procdir;
5751      int reserved_blocks; /* amount of blocks reserved for further allocations */
5752 +    struct list_head s_reiserfs_supers;
5753  };
5754
5755  /* Definitions of reiserfs on-disk properties: */
5756 @@ -420,11 +449,12 @@
5757  #define REISERFS_3_6 1
5758
5759  /* Mount options */
5760 -#define REISERFS_LARGETAIL 0  /* large tails will be created in a session */
5761 -#define REISERFS_SMALLTAIL 17  /* small (for files less than block size) tails will be created in a session */
5762 -#define REPLAYONLY 3 /* replay journal and return 0. Use by fsck */
5763 -#define REISERFS_NOLOG 4      /* -o nolog: turn journalling off */
5764 -#define REISERFS_CONVERT 5    /* -o conv: causes conversion of old
5765 +enum {
5766 +    REISERFS_LARGETAIL, /* large tails will be created in a session */
5767 +    REISERFS_SMALLTAIL, /* small (for files less than block size) tails will be created in a session */
5768 +    REPLAYONLY,          /* replay journal and return 0. Use by fsck */
5769 +    REISERFS_NOLOG,      /* -o nolog: turn journalling off */
5770 +    REISERFS_CONVERT,    /* -o conv: causes conversion of old
5771                                   format super block to the new
5772                                   format. If not specified - old
5773                                   partition will be dealt with in a
5774 @@ -438,27 +468,25 @@
5775  ** the existing hash on the FS, so if you have a tea hash disk, and mount
5776  ** with -o hash=rupasov, the mount will fail.
5777  */
5778 -#define FORCE_TEA_HASH 6      /* try to force tea hash on mount */
5779 -#define FORCE_RUPASOV_HASH 7  /* try to force rupasov hash on mount */
5780 -#define FORCE_R5_HASH 8       /* try to force rupasov hash on mount */
5781 -#define FORCE_HASH_DETECT 9   /* try to detect hash function on mount */
5782 +    FORCE_TEA_HASH,       /* try to force tea hash on mount */
5783 +    FORCE_RUPASOV_HASH,   /* try to force rupasov hash on mount */
5784 +    FORCE_R5_HASH,        /* try to force r5 hash on mount */
5785 +    FORCE_HASH_DETECT,    /* try to detect hash function on mount */
5786
5787
5788  /* used for testing experimental features, makes benchmarking new
5789     features with and without more convenient, should never be used by
5790     users in any code shipped to users (ideally) */
5791
5792 -#define REISERFS_NO_BORDER 11
5793 -#define REISERFS_NO_UNHASHED_RELOCATION 12
5794 -#define REISERFS_HASHED_RELOCATION 13
5795 -#define REISERFS_TEST4 14
5796 -
5797 -#define REISERFS_TEST1 11
5798 -#define REISERFS_TEST2 12
5799 -#define REISERFS_TEST3 13
5800 -#define REISERFS_TEST4 14
5801 -
5802 -#define REISERFS_ATTRS (15)
5803 +    REISERFS_NO_BORDER,
5804 +    REISERFS_NO_UNHASHED_RELOCATION,
5805 +    REISERFS_HASHED_RELOCATION,
5806 +    REISERFS_DATA_LOG,
5807 +    REISERFS_DATA_ORDERED,
5808 +    REISERFS_DATA_WRITEBACK,
5809 +    REISERFS_ATTRS,
5810 +    REISERFS_TEST4,
5811 +};
5812
5813  #define reiserfs_r5_hash(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << FORCE_R5_HASH))
5814  #define reiserfs_rupasov_hash(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << FORCE_RUPASOV_HASH))
5815 @@ -467,6 +495,9 @@
5816  #define reiserfs_no_border(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_NO_BORDER))
5817  #define reiserfs_no_unhashed_relocation(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION))
5818  #define reiserfs_hashed_relocation(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_HASHED_RELOCATION))
5819 +#define reiserfs_data_log(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_DATA_LOG))
5820 +#define reiserfs_data_ordered(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_DATA_ORDERED))
5821 +#define reiserfs_data_writeback(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
5822  #define reiserfs_test4(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_TEST4))
5823
5824  #define have_large_tails(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_LARGETAIL))
5825 @@ -480,8 +511,6 @@
5826
5827  void reiserfs_file_buffer (struct buffer_head * bh, int list);
5828  int reiserfs_is_super(struct super_block *s)  ;
5829 -int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
5830 -int flush_old_commits(struct super_block *s, int) ;
5831  int show_reiserfs_locks(void) ;
5832  int reiserfs_resize(struct super_block *, unsigned long) ;
5833
5834 @@ -492,8 +521,6 @@
5835  #define SB_BUFFER_WITH_SB(s) ((s)->u.reiserfs_sb.s_sbh)
5836  #define SB_JOURNAL(s) ((s)->u.reiserfs_sb.s_journal)
5837  #define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
5838 -#define SB_JOURNAL_LIST(s) (SB_JOURNAL(s)->j_journal_list)
5839 -#define SB_JOURNAL_LIST_INDEX(s) (SB_JOURNAL(s)->j_journal_list_index)
5840  #define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free)
5841  #define SB_AP_BITMAP(s) ((s)->u.reiserfs_sb.s_ap_bitmap)
5842
5843 diff -urN linux-2.4.22.org/kernel/ksyms.c linux-2.4.22/kernel/ksyms.c
5844 --- linux-2.4.22.org/kernel/ksyms.c     2003-11-21 15:08:31.000000000 +0100
5845 +++ linux-2.4.22/kernel/ksyms.c 2003-11-21 15:15:21.000000000 +0100
5846 @@ -182,6 +182,7 @@
5847  EXPORT_SYMBOL(end_buffer_io_async);
5848  EXPORT_SYMBOL(__mark_buffer_dirty);
5849  EXPORT_SYMBOL(__mark_inode_dirty);
5850 +EXPORT_SYMBOL(discard_buffer);      /* for FS flushpage funcs */
5851  EXPORT_SYMBOL(fd_install);
5852  EXPORT_SYMBOL(get_empty_filp);
5853  EXPORT_SYMBOL(init_private_file);
5854 diff -urN linux-2.4.22.org/mm/filemap.c linux-2.4.22/mm/filemap.c
5855 --- linux-2.4.22.org/mm/filemap.c       2003-11-21 15:08:31.000000000 +0100
5856 +++ linux-2.4.22/mm/filemap.c   2003-11-21 15:14:25.000000000 +0100
5857 @@ -3041,6 +3041,14 @@
5858         }
5859  }
5860
5861 +static void update_inode_times(struct inode *inode)
5862 +{      /* stamp ctime and mtime with the current time */
5863 +       time_t now = CURRENT_TIME;
5864 +       if (inode->i_ctime != now || inode->i_mtime != now) { /* dirty the inode only when a stamp changes */
5865 +           inode->i_ctime = inode->i_mtime = now;
5866 +           mark_inode_dirty_sync(inode);
5867 +       }
5868 +}
5869  /*
5870   * precheck_file_write():
5871   * Check the conditions on a file descriptor prior to beginning a write
5872 @@ -3302,8 +3310,7 @@
5873                 BUG();
5874
5875         remove_suid(inode);
5876 -       inode->i_ctime = inode->i_mtime = CURRENT_TIME;
5877 -       mark_inode_dirty_sync(inode);
5878 +       update_inode_times(inode);
5879
5880         written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
5881         if (written > 0) {